The campaign is run from 1. January 2018 to N January 2018.
0. Data Acquisiton
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script ThankYou_2018_Production_SQL.R
on stat1005.eqiad.wmnet, collecting the data as .tsv
and .csv
files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R
.
### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets
### --- Campaign Details:
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018.
# - The report must include any activity from the beginning of the campaign.
# - The estimated start will be 1st January 2018.
# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig
### --- Training Modules Schema:
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking
### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
### --- Directories
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate'
### --- Campaign time range
startDate <- '2018-01-02'
endDate <- '2018-01-08'
### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------
# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1
### --- loop over date range, create query, fetch, and store
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
to = as.POSIXlt(endDate, tz = "CET"),
by = 'hour')
dateRange <- dateRange[-length(dateRange)]
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
strsplit(x, split = " ", fixed = T)[[1]][1]
})
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
dateRange <- dateRange[-w]
}
dR <- list()
for (i in 1:length(dateRange)) {
dR[[i]] <- data.frame(
cetName = names(dateRange[i]),
utcYear = year(dateRange[i]),
utcMonth = month(dateRange[i]),
utcDay = mday(dateRange[i]),
utcHour = hour(dateRange[i])
)
}
dR <- rbindlist(dR)
dR <- dR %>%
group_by(cetName, utcYear, utcMonth, utcDay) %>%
summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))
### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------
# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1
# - set bannerClicksDir
setwd(bannerClicksDir)
for (i in 1:length(unique(dR$cetName))) {
wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
for (j in 1:length(wCetName)) {
# - construct HiveQL query:
y <- dR$utcYear[wCetName[j]]
m <- dR$utcMonth[wCetName[j]]
d <- dR$utcDay[wCetName[j]]
hour <- dR$utcHour[wCetName[j]]
q <- paste(
"USE wmf;
SELECT uri_path, uri_query, referer FROM webrequest
WHERE uri_host = 'de.wikipedia.org'
AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia'
AND year = ", y,
" AND month = ", m,
" AND day = ", d,
" AND (", hour, ");",
sep = "")
# - write hql
write(q, 'thankyou2018_BannerClicks.hql')
# - prepare output file:
fileName <- "thankyou2018_BannerClicks_"
fileName <- paste0(fileName,
as.character(unique(dR$cetName)[i]),
"_", j,
".tsv")
fileName <- paste0(bannerClicksDir, "/", fileName)
# - execute hql script:
hiveArgs <-
'beeline -f'
hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
}
}
### --- Wrangle this dataset:
### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'
### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
lF <- lF[grepl('Clicks', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
dS <- readLines(lF[i], n = -1)
timeStamp <- strsplit(lF[i], split = "_")[[1]][3]
bannerClicks <- sum(grepl(campaignBanner, dS, fixed = T))
dataSet[[i]] <- data.frame(timestamp = timeStamp,
bannerClicks = bannerClicks,
stringsAsFactors = F)
}
dataSet <- rbindlist(dataSet)
### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")
### --- SQL
startDate <- '2018-01-01'
### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------
# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", gsub("-", "", startDate, fixed = T), "220000));\" > ",
dailyUpdateDir, "/thankyou_2018_userRegistrations.tsv", sep = "")
system(command = qCommand, wait = TRUE)
### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------
# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ",
gsub("-", "", startDate, fixed = T), "220000));\" > ",
dailyUpdateDir, "/thankyou_2018_guidedTours.tsv", sep = "")
system(command = qCommand, wait = TRUE)
### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF,
quote = "",
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
userReg <- userReg %>%
dplyr::select(event_userId, event_isSelfMade, event_campaign) %>%
filter(event_isSelfMade == 1) %>%
filter(event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
paste(uid, collapse = ", "),
') GROUP BY rev_user;',
sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
paste('"', sqlQuery, '" > ', sep = ""),
dailyUpdateDir, '/thankyou_2018_userEdits.tsv', sep = "")
system(command = mySqlCommand,
wait = TRUE)
### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------
1B. Campaign Pageviews (Banner Clicks)
1B. 1 The data set
pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
row.names = 1,
header = T,
stringsAsFactors = F)
pageviews <- pageviews %>%
group_by(timestamp) %>%
summarise(Pageviews = sum(bannerClicks))
knitr::kable(pageviews, format = "html") %>%
kable_styling(full_width = F, position = "left")
timestamp
|
Pageviews
|
2018-01-01
|
533
|
2018-01-02
|
1188
|
2018-01-03
|
1269
|
2018-01-04
|
1148
|
2018-01-05
|
1122
|
2018-01-06
|
1148
|
2018-01-07
|
1242
|
2018-01-08
|
1150
|
2018-01-09
|
1254
|
2018-01-10
|
1284
|
2018-01-11
|
0
|
1B. 2 Chart
ggplot(pageviews, aes(x = timestamp,
y = Pageviews,
label = Pageviews)) +
geom_bar(stat = "identity",
position = "dodge",
width = .15,
fill = "white",
color = "darkblue") +
ggtitle('Thank You 2018:\nOverview of Landing Pageviews') +
geom_label(size = 3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
2. Campaign User Registrations
2. 1 The data set
userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
sep = "\t",
header = T,
row.names = 1,
stringsAsFactors = F)
EOF within quoted string
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 5, 6)
d <- substr(x, 7, 8)
part1Date <- paste(y, m, d, sep = "-")
hr <- substr(x, 9, 10)
mi <- substr(x, 11, 12)
se <- substr(x, 13, 14)
part2Date <- paste(hr, mi, se, sep = ":")
paste(part1Date, part2Date, sep = " ")
})
userReg$timestamp <- as.POSIXct(userReg$timestamp, tz = "UTC")
timeDiff <-
as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
userReg$timestamp <- as.character(userReg$timestamp + timeDiff)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 6, 7)
d <- substr(x, 9, 10)
paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
filter(event_campaign %in% 'wmde_etc2017_bt1') %>%
group_by(timestamp) %>%
summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>%
kable_styling(full_width = F, position = "left")
timestamp
|
Registrations
|
2018-01-01
|
1
|
2018-01-02
|
8
|
2018-01-03
|
15
|
2018-01-04
|
13
|
2018-01-05
|
5
|
2018-01-06
|
6
|
2018-01-07
|
12
|
2018-01-08
|
6
|
2018-01-09
|
9
|
2018-01-10
|
4
|
2. 2 Chart
ggplot(userReg, aes(x = timestamp,
y = Registrations,
label = Registrations)) +
geom_bar(stat = "identity",
position = "dodge",
width = .15,
fill = "white",
color = "darkblue") +
ggtitle('Thank You 2018:\nOverview of User Registrations') +
geom_label(size = 3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
4. Campaign Guided Tours
4. 1 The data set: Guided Tour Exit Steps
Campaign guided tours: diskutieren
and seimutig
.
tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
sep = "\t",
header = T,
row.names = 1,
stringsAsFactors = F)
gTours$timestamp <- as.character(gTours$timestamp)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 5, 6)
d <- substr(x, 7, 8)
part1Date <- paste(y, m, d, sep = "-")
hr <- substr(x, 9, 10)
mi <- substr(x, 11, 12)
se <- substr(x, 13, 14)
part2Date <- paste(hr, mi, se, sep = ":")
paste(part1Date, part2Date, sep = " ")
})
gTours$timestamp <- as.POSIXct(gTours$timestamp, tz = "UTC")
timeDiff <-
as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
gTours$timestamp <- as.character(gTours$timestamp + timeDiff)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 6, 7)
d <- substr(x, 9, 10)
paste(y, m, d, sep = "-")
})
# - anonymize event_userId
eventUserId <- setdiff(unique(gTours$event_userId), 0)
an_userId <- character(length(eventUserId))
for (i in 1:length(an_userId)) {
id <- round(runif(1, 1, 10e6))
while (id %in% an_userId) {
id <- round(runif(1, 1, 10e6))
}
an_userId[i] <- id
}
an_userId <- paste0("u_", an_userId)
gTours$an_userId <- sapply(gTours$event_userId, function(x) {
w <- which(eventUserId %in% x)
if (length(w) > 0) {
an_userId[w]
} else {
x
}
})
# - look up for the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) > 0) {
gTours <- gTours[w, ]
gTours <- gTours %>%
filter(event_userId != 0) %>%
select(timestamp, event_tour, event_step, an_userId)
knitr::kable(gTours, format = "html") %>%
kable_styling(full_width = F, position = "left")
} else {
print("There are currently no data on guided tours from this campaign.")
}
timestamp
|
event_tour
|
event_step
|
an_userId
|
2018-01-03
|
diskutieren
|
returnToTraining
|
u_638086
|
2018-01-03
|
seimutig
|
returnToTraining
|
u_638086
|
2018-01-04
|
seimutig
|
boldness
|
u_2937302
|
2018-01-04
|
seimutig
|
positionCursor
|
u_2937302
|
2018-01-04
|
diskutieren
|
returnToTraining
|
u_2937302
|
2018-01-05
|
diskutieren
|
saveReply
|
u_7828514
|
2018-01-04
|
seimutig
|
returnToTraining
|
u_2937302
|
2018-01-04
|
seimutig
|
editButtonCitation
|
u_2937302
|
2018-01-06
|
seimutig
|
editButton
|
u_6495795
|
2018-01-06
|
seimutig
|
boldness
|
u_6495795
|
2018-01-06
|
diskutieren
|
firstMessage
|
u_6495795
|
2018-01-08
|
seimutig
|
editBoldness
|
u_4296245
|
2018-01-08
|
seimutig
|
editSummary
|
u_4296245
|
2018-01-08
|
diskutieren
|
returnToTraining
|
u_4296245
|
2018-01-10
|
seimutig
|
editButton
|
u_4773138
|
2018-01-10
|
seimutig
|
editButton
|
u_4773138
|
2018-01-10
|
seimutig
|
editBoldness
|
u_4773138
|
2018-01-10
|
seimutig
|
editSummary
|
u_4773138
|
2018-01-10
|
seimutig
|
editButtonCitation
|
u_4773138
|
2018-01-10
|
seimutig
|
saveCitation
|
u_4773138
|
2018-01-10
|
seimutig
|
editButton
|
u_4773138
|
2018-01-10
|
diskutieren
|
returnToTraining
|
u_4773138
|
4. 2 Unique users in Guided Tours
Campaign guided tours: diskutieren
and seimutig
.
gTours %>%
select(event_tour) %>%
group_by(event_tour) %>%
summarise(`Unique users` = n()) %>%
knitr::kable(format = "html") %>%
kable_styling(full_width = F, position = "left")
event_tour
|
Unique users
|
diskutieren
|
6
|
seimutig
|
16
|
---
title: 'Thank You Campaign 2018: Report'
author: "Goran S. Milovanovic, Data Analyst, WMDE"
date: "January, 2018"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_float: yes
    toc_depth: 5
  html_document:
    toc: yes
    toc_depth: 5
---


**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

The campaign is run from 1. January 2018 to N January 2018.

```{r setup}
knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
```

```{r, echo = F, warning = F, message = F, results = 'hide'}
# !diagnostics off
### --- Setup
knitr::opts_chunk$set(fig.width = 15, fig.height = 8) 
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
library(ggrepel)
library(scales)
library(RColorBrewer)
library(kableExtra)
library(rmarkdown)
library(knitr)
library(DT)
library(reshape2)
```

## 0. Data Acquisiton

**NOTE:** the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script `ThankYou_2018_Production_SQL.R` on stat1005.eqiad.wmnet, collecting the data as `.tsv` and `.csv` files, copying manually, and processing locally. Run from stat1005 stat box by executing `Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R`.

```{r, echo = T, eval = F}

### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets

### --- Campaign Details: 
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018. 
# - The report must include any activity from the beginning of the campaign. 
# - The estimated start will be 1st January 2018.

# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig

### --- Training Modules Schema: 
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking

### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate' 

### --- Campaign time range
startDate <- '2018-01-02'
endDate <- '2018-01-08'

### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------

# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1

### --- loop over date range, create query, fetch, and store
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
                        to = as.POSIXlt(endDate, tz = "CET"),
                        by = 'hour')
dateRange <- dateRange[-length(dateRange)]
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
  strsplit(x, split = " ", fixed = T)[[1]][1]
})
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
  dateRange <- dateRange[-w]
}
dR <- list()
for (i in 1:length(dateRange)) {
  dR[[i]] <- data.frame(
    cetName = names(dateRange[i]),
    utcYear = year(dateRange[i]),
    utcMonth = month(dateRange[i]),
    utcDay = mday(dateRange[i]),
    utcHour = hour(dateRange[i])
  )
}
dR <- rbindlist(dR)
dR <- dR %>%
  group_by(cetName, utcYear, utcMonth, utcDay) %>%
  summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))

### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------

# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1

# - set bannerClicksDir
setwd(bannerClicksDir)

for (i in 1:length(unique(dR$cetName))) {
  
  wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
  
  for (j in 1:length(wCetName)) {
    
    # - construct HiveQL query:
    y <- dR$utcYear[wCetName[j]]
    m <- dR$utcMonth[wCetName[j]]
    d <- dR$utcDay[wCetName[j]]
    hour <- dR$utcHour[wCetName[j]]
    q <- paste(
      "USE wmf;
      SELECT uri_path, uri_query, referer FROM webrequest
      WHERE uri_host = 'de.wikipedia.org'
      AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia' 
      AND year = ", y,
      " AND month = ", m,
      " AND day = ", d,
      " AND (", hour, ");",
      sep = "")
    # - write hql
    write(q, 'thankyou2018_BannerClicks.hql')
    # - prepare output file:
    fileName <- "thankyou2018_BannerClicks_"
    fileName <- paste0(fileName,
                       as.character(unique(dR$cetName)[i]),
                       "_", j,
                       ".tsv")
    fileName <- paste0(bannerClicksDir, "/", fileName)
    # - execute hql script:
    hiveArgs <-
      'beeline -f'
    hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
                        fileName)
    # - command:
    hiveCommand <- paste(hiveArgs, hiveInput)
    system(command = hiveCommand, wait = TRUE)
    
  }
  
}

### --- Wrangle this dataset:

### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'

### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
lF <- lF[grepl('Clicks', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
  dS <- readLines(lF[i], n = -1)
  timeStamp <- strsplit(lF[i], split = "_")[[1]][3]
  bannerClicks <- sum(grepl(campaignBanner, dS, fixed = T))
  dataSet[[i]] <- data.frame(timestamp = timeStamp,
                             bannerClicks = bannerClicks,
                             stringsAsFactors = F)
}
dataSet <- rbindlist(dataSet)

### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")

### --- SQL
startDate <- '2018-01-01'

### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", gsub("-", "", startDate, fixed = T), "220000));\" > ",
            dailyUpdateDir, "/thankyou_2018_userRegistrations.tsv", sep = "")
system(command = qCommand, wait = TRUE)


### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", 
                  gsub("-", "", startDate, fixed = T), "220000));\" > ", 
                  dailyUpdateDir, "/thankyou_2018_guidedTours.tsv", sep = "")
system(command = qCommand, wait = TRUE)

### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------

# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
userReg <- userReg %>% 
  dplyr::select(event_userId, event_isSelfMade, event_campaign) %>% 
  filter(event_isSelfMade == 1) %>% 
  filter(event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                  paste(uid, collapse = ", "),
                  ') GROUP BY rev_user;',
                  sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                      paste('"', sqlQuery, '" > ', sep = ""), 
                      dailyUpdateDir, '/thankyou_2018_userEdits.tsv', sep = "")
system(command = mySqlCommand, 
       wait = TRUE)

### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------

```

## 1A. Campaign Banner Impressions

### 1A. 1 The data set
```{r echo = T, warning = 'hide', message = F}
# - report: current update
print(paste0("Current update: ", as.character(Sys.time())))
bannerImpressions <- read.csv('thankyouBannerImpressions.csv',
                              row.names = 1,
                              header = T,
                              stringsAsFactors = F) %>% 
  gather(key =  Banner,
         value = Views,
         B17WMDE_thankyou_authors:B17WMDE_thankyou_authors_mob_B) %>% 
  group_by(timeStamp, Banner) %>% 
  summarise(Views = sum(Views))
knitr::kable(bannerImpressions, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```
### 1A. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(bannerImpressions, aes(x = timeStamp,
                    y = Views,
                    color = Banner,
                    label = Views)) +
  geom_line(aes(group = Banner), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of BannerImpressions') +
  ylab("Impressions") +
  geom_text_repel(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```

## 1B. Campaign Pageviews (Banner Clicks)

### 1B. 1 The data set

```{r echo = T, warning = 'hide', message = F}
pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
                      row.names = 1,
                      header = T,
                      stringsAsFactors = F)
pageviews <- pageviews %>% 
  group_by(timestamp) %>% 
  summarise(Pageviews = sum(bannerClicks))
knitr::kable(pageviews, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

### 1B. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(pageviews, aes(x = timestamp,
                    y = Pageviews,
                    label = Pageviews)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of Landing Pageviews') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```


## 2. Campaign User Registrations

### 2. 1 The data set

```{r echo = T, warning = 'hide', message = F}
userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
                      sep = "\t",
                      header = T,
                      row.names = 1,
                      stringsAsFactors = F)
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
userReg$timestamp <- as.POSIXct(userReg$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
userReg$timestamp <- as.character(userReg$timestamp + timeDiff)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1') %>% 
  group_by(timestamp) %>% 
  summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

### 2. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(userReg, aes(x = timestamp,
                    y = Registrations,
                    label = Registrations)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of User Registrations') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```

## 3. Campaign User Edits

### 3. 1 The data set

```{r echo = T, warning = 'hide', message = F}
userEdits <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(userEdits) >= 1) {
  userEdits <- read.delim('thankyou_2018_userEdits.tsv',
                          sep = "\t",
                          header = T,
                          stringsAsFactors = F)
  # - report
  print(paste0(sum(userEdits$edits), " edits were made by the campaign registered users thus far."))
} else {
  print("There are currently no user edits from this campaign.")
}
```

### 3. 2 Chart

```{r echo = T, warning = 'hide', message = F}
```

## 4. Campaign Guided Tours

### 4. 1 The data set: Guided Tour Exit Steps

Campaign guided tours: `diskutieren` and `seimutig`.

```{r echo = T, warning = 'hide', message = F}
tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
gTours$timestamp <- as.character(gTours$timestamp)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
gTours$timestamp <- as.POSIXct(gTours$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
gTours$timestamp <- as.character(gTours$timestamp + timeDiff)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
# - anonymize event_userId
eventUserId <- setdiff(unique(gTours$event_userId), 0)
an_userId <- character(length(eventUserId))
for (i in 1:length(an_userId)) {
  id <- round(runif(1, 1, 10e6))
  while (id %in% an_userId) {
    id <- round(runif(1, 1, 10e6))
  }
  an_userId[i] <- id
}
an_userId <- paste0("u_", an_userId)
gTours$an_userId <- sapply(gTours$event_userId, function(x) {
  w <- which(eventUserId %in% x)
  if (length(w) > 0) {
    an_userId[w]
  } else {
    x
  }
})
# - look up for the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) > 0) {
  gTours <- gTours[w, ]
  gTours <- gTours %>% 
    filter(event_userId != 0) %>% 
    select(timestamp, event_tour, event_step, an_userId)
  knitr::kable(gTours, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
} else {
  print("There are currently no data on guided tours from this campaign.")
}
```
### 4. 2 Unique users in Guided Tours

Campaign guided tours: `diskutieren` and `seimutig`.

```{r echo = T, warning = 'hide', message = F}
gTours %>% 
  select(event_tour) %>% 
  group_by(event_tour) %>% 
  summarise(`Unique users` = n()) %>% 
  knitr::kable(format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```
