Feedback should be sent to goran.milovanovic_ext@wikimedia.de.

The campaign is run from 1. January 2018 to N January 2018.

# - evaluate all chunks relative to this directory (the relative file names read below resolve against it)
# - NOTE(review): absolute, machine-specific path - adjust before knitting on another machine
knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')

0. Data Acquisition

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script ThankYou_2018_Production_SQL.R on stat1005.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R.

### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets

### --- Campaign Details: 
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018. 
# - The report must include any activity from the beginning of the campaign. 
# - The estimated start will be 1st January 2018.

# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig

### --- Training Modules Schema: 
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking

### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
# - absolute paths on the stat box: raw banner impression output, raw banner click
# - output, and the directory that receives the aggregated daily update files
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate' 

### --- Campaign time range
# - both dates are interpreted as CET midnights when the hourly range is built below
# - NOTE(review): the campaign details above estimate the start as 1st January 2018,
# - while startDate is 2018-01-02 - confirm the intended range
startDate <- '2018-01-02'
endDate <- '2018-01-08'

### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------

# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1

### --- build the hourly campaign range in CET and map each hour to its UTC partition
# - hourly steps from startDate 00:00 CET; the last step (endDate 00:00 CET) is dropped
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
                        to = as.POSIXlt(endDate, tz = "CET"),
                        by = 'hour')
dateRange <- dateRange[-length(dateRange)]
# - CET calendar day of every hour = the part of the printed timestamp before the space
cetDateRange <- as.character(dateRange)
cetDateRange <- vapply(cetDateRange, function(x) {
  strsplit(x, split = " ", fixed = TRUE)[[1]][1]
}, character(1))
# - carry the CET day as the element name, then express the same instants in UTC
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
  dateRange <- dateRange[-w]
}
# - one row per hour: CET day label plus the UTC year/month/day/hour components
# - (seq_len() instead of 1:length(), so an empty range yields no rows
# -  rather than iterating over c(1, 0))
dR <- lapply(seq_len(length(dateRange)), function(i) {
  data.frame(
    cetName = names(dateRange[i]),
    utcYear = year(dateRange[i]),
    utcMonth = month(dateRange[i]),
    utcDay = mday(dateRange[i]),
    utcHour = hour(dateRange[i])
  )
})
dR <- rbindlist(dR)
# - per CET day and UTC date: collapse the hours into a "hour=H OR hour=H ..." clause
dR <- dR %>%
  group_by(cetName, utcYear, utcMonth, utcDay) %>%
  summarise(utcHour = paste0("hour=", utcHour, collapse = " OR "))

### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------

# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1

# - set bannerClicksDir
# - NOTE: the .hql script and the list.files() call further below are relative to
# - this working directory
setwd(bannerClicksDir)

# - outer loop: one iteration per CET day; inner loop: one iteration per row of dR
# - for that day; each inner iteration writes a HiveQL script, runs it through
# - beeline and redirects the output to its own .tsv file
for (i in 1:length(unique(dR$cetName))) {
  
  # - rows of dR that belong to the i-th CET day
  wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
  
  for (j in 1:length(wCetName)) {
    
    # - construct HiveQL query:
    # - y/m/d are the UTC date components of this row; hour holds the
    # - pre-built "hour=H OR hour=H ..." clause
    y <- dR$utcYear[wCetName[j]]
    m <- dR$utcMonth[wCetName[j]]
    d <- dR$utcDay[wCetName[j]]
    hour <- dR$utcHour[wCetName[j]]
    # - the query selects all requests to the landing page path; the campaign tag
    # - is not filtered here but matched later when the .tsv files are read
    q <- paste(
      "USE wmf;
      SELECT uri_path, uri_query, referer FROM webrequest
      WHERE uri_host = 'de.wikipedia.org'
      AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia' 
      AND year = ", y,
      " AND month = ", m,
      " AND day = ", d,
      " AND (", hour, ");",
      sep = "")
    # - write hql
    # - (the same script file is overwritten on every iteration)
    write(q, 'thankyou2018_BannerClicks.hql')
    # - prepare output file:
    # - thankyou2018_BannerClicks_<CET day>_<j>.tsv inside bannerClicksDir
    fileName <- "thankyou2018_BannerClicks_"
    fileName <- paste0(fileName,
                       as.character(unique(dR$cetName)[i]),
                       "_", j,
                       ".tsv")
    fileName <- paste0(bannerClicksDir, "/", fileName)
    # - execute hql script:
    hiveArgs <-
      'beeline -f'
    hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
                        fileName)
    # - command:
    # - "beeline -f thankyou2018_BannerClicks.hql > <fileName>"; wait = TRUE blocks
    # - until the command has finished
    hiveCommand <- paste(hiveArgs, hiveInput)
    system(command = hiveCommand, wait = TRUE)
    
  }
  
}

### --- Wrangle this dataset:

### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'

### --- Dataset:
# - collect the .tsv outputs of the banner click queries from the working directory
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = TRUE)]
lF <- lF[grepl('Clicks', lF, fixed = TRUE)]
# - one row per file: the third "_"-separated token of the file name (the CET day)
# - and the number of lines that contain the campaign tag
# - (lapply() over the file names, so zero matching files gives an empty data set
# -  instead of a failing 1:0 loop; the unused counter `c`, which masked base::c,
# -  is gone)
dataSet <- lapply(lF, function(fName) {
  dS <- readLines(fName, n = -1)
  data.frame(timestamp = strsplit(fName, split = "_")[[1]][3],
             bannerClicks = sum(grepl(campaignBanner, dS, fixed = TRUE)),
             stringsAsFactors = FALSE)
})
dataSet <- rbindlist(dataSet)

### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")

### --- SQL
# - startDate is reset for the SQL queries below: with the dashes stripped and
# - "220000" appended it becomes the lower bound 20180101220000 on `timestamp`
startDate <- '2018-01-01'

### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - NOTE(review): 22:00 UTC corresponds to 23:00 CET, not to CET midnight - confirm
# - that this is the intended cut-off
# - ServerSideAccountCreation_5487345
# - all de.wikipedia.org rows from the lower bound onwards are dumped to
# - thankyou_2018_userRegistrations.tsv in dailyUpdateDir
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", gsub("-", "", startDate, fixed = T), "220000));\" > ",
            dailyUpdateDir, "/thankyou_2018_userRegistrations.tsv", sep = "")
system(command = qCommand, wait = TRUE)


### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - GuidedTourExited_8690566
# - all de.wikipedia.org rows with timestamp >= <startDate without dashes>220000
# - are dumped to thankyou_2018_guidedTours.tsv in dailyUpdateDir
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", 
                  gsub("-", "", startDate, fixed = T), "220000));\" > ", 
                  dailyUpdateDir, "/thankyou_2018_guidedTours.tsv", sep = "")
system(command = qCommand, wait = TRUE)

### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------

# - get user IDs from registered:
# - (list.files() is relative to the working directory)
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = TRUE)]
userReg <- read.table(lF,
                      quote = "",
                      sep = "\t",
                      header = TRUE,
                      check.names = FALSE,
                      stringsAsFactors = FALSE)
# - keep self-made accounts that carry the campaign tag
userReg <- userReg %>%
  dplyr::select(event_userId, event_isSelfMade, event_campaign) %>%
  filter(event_isSelfMade == 1,
         event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - output file: edit counts per campaign-registered user
editsFile <- paste0(dailyUpdateDir, '/thankyou_2018_userEdits.tsv')
if (length(uid) > 0) {
  # - sql query
  sqlQuery <- paste0('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                     paste(uid, collapse = ", "),
                     ') GROUP BY rev_user;')
  mySqlCommand <- paste0('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                         '"', sqlQuery, '" > ',
                         editsFile)
  system(command = mySqlCommand,
         wait = TRUE)
} else {
  # - no campaign registrations: "IN ()" is not valid SQL, so the query is skipped
  # - and an empty file is left behind (the report checks for an empty file before
  # - reading the user edits)
  file.create(editsFile)
}

### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------

1A. Campaign Banner Impressions

1A. 1 The data set

# - report: current update
# - (prints the system time at which this chunk is evaluated)
print(paste0("Current update: ", as.character(Sys.time())))
[1] "Current update: 2018-01-25 15:51:16"
# - load the banner impressions (first column holds the row names)
bannerImpressions <- read.csv('thankyouBannerImpressions.csv',
                              row.names = 1,
                              header = TRUE,
                              stringsAsFactors = FALSE)
# - wide to long: one row per timeStamp x Banner, the counts go to Views
bannerImpressions <- gather(bannerImpressions,
                            key =  Banner,
                            value = Views,
                            B17WMDE_thankyou_authors:B17WMDE_thankyou_authors_mob_B)
# - total views per day and banner
bannerImpressions <- bannerImpressions %>%
  group_by(timeStamp, Banner) %>%
  summarise(Views = sum(Views))
# - render as a left-aligned HTML table
kable_styling(knitr::kable(bannerImpressions, format = "html"),
              full_width = FALSE,
              position = "left")
timeStamp Banner Views
2018-01-01 B17WMDE_thankyou_authors 257533
2018-01-01 B17WMDE_thankyou_authors_B 128423
2018-01-01 B17WMDE_thankyou_authors_mob_A 0
2018-01-01 B17WMDE_thankyou_authors_mob_B 0
2018-01-01 B17WMDE_thankyou_authors_pad_A 0
2018-01-01 B17WMDE_thankyou_authors_pad_B 0
2018-01-02 B17WMDE_thankyou_authors 741533
2018-01-02 B17WMDE_thankyou_authors_B 370737
2018-01-02 B17WMDE_thankyou_authors_mob_A 0
2018-01-02 B17WMDE_thankyou_authors_mob_B 3
2018-01-02 B17WMDE_thankyou_authors_pad_A 0
2018-01-02 B17WMDE_thankyou_authors_pad_B 0
2018-01-03 B17WMDE_thankyou_authors 824290
2018-01-03 B17WMDE_thankyou_authors_B 410722
2018-01-03 B17WMDE_thankyou_authors_mob_A 0
2018-01-03 B17WMDE_thankyou_authors_mob_B 0
2018-01-03 B17WMDE_thankyou_authors_pad_A 0
2018-01-03 B17WMDE_thankyou_authors_pad_B 0
2018-01-04 B17WMDE_thankyou_authors 818331
2018-01-04 B17WMDE_thankyou_authors_B 409420
2018-01-04 B17WMDE_thankyou_authors_mob_A 0
2018-01-04 B17WMDE_thankyou_authors_mob_B 0
2018-01-04 B17WMDE_thankyou_authors_pad_A 0
2018-01-04 B17WMDE_thankyou_authors_pad_B 0
2018-01-05 B17WMDE_thankyou_authors 750962
2018-01-05 B17WMDE_thankyou_authors_B 375077
2018-01-05 B17WMDE_thankyou_authors_mob_A 0
2018-01-05 B17WMDE_thankyou_authors_mob_B 2
2018-01-05 B17WMDE_thankyou_authors_pad_A 0
2018-01-05 B17WMDE_thankyou_authors_pad_B 0
2018-01-06 B17WMDE_thankyou_authors 586811
2018-01-06 B17WMDE_thankyou_authors_B 293237
2018-01-06 B17WMDE_thankyou_authors_mob_A 0
2018-01-06 B17WMDE_thankyou_authors_mob_B 0
2018-01-06 B17WMDE_thankyou_authors_pad_A 0
2018-01-06 B17WMDE_thankyou_authors_pad_B 0
2018-01-07 B17WMDE_thankyou_authors 706186
2018-01-07 B17WMDE_thankyou_authors_B 352536
2018-01-07 B17WMDE_thankyou_authors_mob_A 0
2018-01-07 B17WMDE_thankyou_authors_mob_B 0
2018-01-07 B17WMDE_thankyou_authors_pad_A 0
2018-01-07 B17WMDE_thankyou_authors_pad_B 0
2018-01-08 B17WMDE_thankyou_authors 920162
2018-01-08 B17WMDE_thankyou_authors_B 460469
2018-01-08 B17WMDE_thankyou_authors_mob_A 0
2018-01-08 B17WMDE_thankyou_authors_mob_B 0
2018-01-08 B17WMDE_thankyou_authors_pad_A 0
2018-01-08 B17WMDE_thankyou_authors_pad_B 0
2018-01-09 B17WMDE_thankyou_authors 988107
2018-01-09 B17WMDE_thankyou_authors_B 495139
2018-01-09 B17WMDE_thankyou_authors_mob_A 0
2018-01-09 B17WMDE_thankyou_authors_mob_B 0
2018-01-09 B17WMDE_thankyou_authors_pad_A 0
2018-01-09 B17WMDE_thankyou_authors_pad_B 0
2018-01-10 B17WMDE_thankyou_authors 990820
2018-01-10 B17WMDE_thankyou_authors_B 495235
2018-01-10 B17WMDE_thankyou_authors_mob_A 0
2018-01-10 B17WMDE_thankyou_authors_mob_B 0
2018-01-10 B17WMDE_thankyou_authors_pad_A 0
2018-01-10 B17WMDE_thankyou_authors_pad_B 0
2018-01-11 B17WMDE_thankyou_authors 986539
2018-01-11 B17WMDE_thankyou_authors_B 493215
2018-01-11 B17WMDE_thankyou_authors_mob_A 0
2018-01-11 B17WMDE_thankyou_authors_mob_B 1
2018-01-11 B17WMDE_thankyou_authors_pad_A 0
2018-01-11 B17WMDE_thankyou_authors_pad_B 0
2018-01-12 B17WMDE_thankyou_authors 852340
2018-01-12 B17WMDE_thankyou_authors_B 426957
2018-01-12 B17WMDE_thankyou_authors_mob_A 0
2018-01-12 B17WMDE_thankyou_authors_mob_B 0
2018-01-12 B17WMDE_thankyou_authors_pad_A 0
2018-01-12 B17WMDE_thankyou_authors_pad_B 0
2018-01-13 B17WMDE_thankyou_authors 628561
2018-01-13 B17WMDE_thankyou_authors_B 313426
2018-01-13 B17WMDE_thankyou_authors_mob_A 0
2018-01-13 B17WMDE_thankyou_authors_mob_B 0
2018-01-13 B17WMDE_thankyou_authors_pad_A 0
2018-01-13 B17WMDE_thankyou_authors_pad_B 0
2018-01-14 B17WMDE_thankyou_authors 779314
2018-01-14 B17WMDE_thankyou_authors_B 389017
2018-01-14 B17WMDE_thankyou_authors_mob_A 0
2018-01-14 B17WMDE_thankyou_authors_mob_B 0
2018-01-14 B17WMDE_thankyou_authors_pad_A 0
2018-01-14 B17WMDE_thankyou_authors_pad_B 0

1A. 2 Chart

# - daily impressions: one line per banner, hollow points, repelled value labels
ggplot(data = bannerImpressions,
       mapping = aes(x = timeStamp,
                     y = Views,
                     color = Banner,
                     label = Views)) +
  geom_line(mapping = aes(group = Banner), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  geom_text_repel(size = 3) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = 'Thank You 2018:\nOverview of BannerImpressions',
       y = "Impressions") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())

1B. Campaign Pageviews (Banner Clicks)

1B. 1 The data set

# - landing page views (banner clicks): read the update file and sum per day
pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
                      row.names = 1,
                      header = TRUE,
                      stringsAsFactors = FALSE) %>%
  group_by(timestamp) %>%
  summarise(Pageviews = sum(bannerClicks))
# - render as a left-aligned HTML table
kable_styling(knitr::kable(pageviews, format = "html"),
              full_width = FALSE,
              position = "left")
timestamp Pageviews
2018-01-01 533
2018-01-02 1188
2018-01-03 1269
2018-01-04 1148
2018-01-05 1122
2018-01-06 1148
2018-01-07 1242
2018-01-08 1150
2018-01-09 1254
2018-01-10 1347
2018-01-11 1773
2018-01-12 1214
2018-01-13 1201
2018-01-14 1492

1B. 2 Chart

# - daily landing page views: thin outlined bars with the value as a label
ggplot(data = pageviews,
       mapping = aes(x = timestamp,
                     y = Pageviews,
                     label = Pageviews)) +
  geom_bar(stat = "identity",
           position = "dodge",
           width = .15,
           fill = "white",
           color = "darkblue") +
  geom_label(size = 3) +
  labs(title = 'Thank You 2018:\nOverview of Landing Pageviews') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())

2. Campaign User Registrations

2. 1 The data set

# - load the registration log dump
userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
                      sep = "\t",
                      quote = "",
                      header = TRUE,
                      # row.names = 1,
                      stringsAsFactors = FALSE)
# - timestamps are YYYYMMDDHHMMSS values in UTC; sprintf("%.0f") spells out all
# - fourteen digits even where as.character() would switch to scientific notation
userReg$timestamp <- as.POSIXct(sprintf("%.0f", as.numeric(userReg$timestamp)),
                                format = "%Y%m%d%H%M%S",
                                tz = "UTC")
# - calendar day in Europe/Berlin: the conversion uses the offset valid at each
# - event's own time (CET or CEST), not the offset at the moment of knitting
userReg$timestamp <- format(userReg$timestamp,
                            format = "%Y-%m-%d",
                            tz = "Europe/Berlin")
# - campaign registrations per day
userReg <- userReg %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1') %>%
  group_by(timestamp) %>%
  summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>%
  kable_styling(full_width = FALSE, position = "left")
timestamp Registrations
2018-01-01 1
2018-01-02 8
2018-01-03 15
2018-01-04 13
2018-01-05 5
2018-01-06 6
2018-01-07 12
2018-01-08 6
2018-01-09 9
2018-01-10 7
2018-01-11 15
2018-01-12 9
2018-01-13 3
2018-01-14 11
2018-01-15 1
# - report: sum of the daily campaign registration counts
print(paste0("Total number of registered users: ", sum(userReg$Registrations)))
[1] "Total number of registered users: 121"

2. 2 Chart

# - daily campaign registrations: thin outlined bars with the value as a label
ggplot(data = userReg,
       mapping = aes(x = timestamp,
                     y = Registrations,
                     label = Registrations)) +
  geom_bar(stat = "identity",
           position = "dodge",
           width = .15,
           fill = "white",
           color = "darkblue") +
  geom_label(size = 3) +
  labs(title = 'Thank You 2018:\nOverview of User Registrations') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())

3. Campaign User Edits

3. 1 The data set

# - peek at the raw file first: an empty file means there is nothing to parse
userEdits <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(userEdits) < 1) {
  print("There are currently no user edits from this campaign.")
} else {
  # - at least one line present: parse the edit counts per user
  userEdits <- read.delim('thankyou_2018_userEdits.tsv',
                          sep = "\t",
                          header = TRUE,
                          stringsAsFactors = FALSE)
  # - report
  print(paste0(sum(userEdits$edits), " edits were made by the campaign registered users thus far."))
}
[1] "135 edits were made by the campaign registered users thus far."

3. 2 Chart

# - distribution of edit counts: how many users made 1, 2, 3, ... edits
# - userEdits is a data frame only when the edits file had content; otherwise it is
# - still the (empty) character vector from readLines() and `$edits` would fail,
# - so the chart is drawn only when there are rows to plot
if (is.data.frame(userEdits) && nrow(userEdits) > 0) {
  userEditsDist <- as.data.frame(table(userEdits$edits))
  colnames(userEditsDist) <- c('Number of Edits', 'Number of Users')
  editsChart <- ggplot(userEditsDist, aes(x = `Number of Edits`,
                                          y = `Number of Users`,
                                          label = `Number of Users`)) +
    geom_bar(stat = "identity",
             position = "dodge",
             width = .15,
             fill = "darkorange",
             color = "darkorange") +
    ggtitle('Thank You 2018:\nHow many users made a particular number of edits') +
    geom_label(size = 3) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 0, size = 8, hjust = 1)) +
    theme(plot.title = element_text(size = 10)) +
    theme(legend.title = element_blank()) +
    theme(panel.grid.major.x = element_blank()) +
    theme(panel.grid.minor.x = element_blank()) +
    theme(panel.background = element_blank())
  # - explicit print(): the plot is created inside a conditional block
  print(editsChart)
}

4. Campaign Guided Tours

4. 1 The data set: Guided Tour Exit Steps

Campaign guided tours: diskutieren and seimutig.

# - the two guided tours that belong to this campaign
tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
# - load the guided tour exit log dump (first column holds the row names)
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = TRUE,
                     row.names = 1,
                     stringsAsFactors = FALSE)
# - timestamps are YYYYMMDDHHMMSS values in UTC; sprintf("%.0f") spells out all
# - fourteen digits even where as.character() would switch to scientific notation
gTours$timestamp <- as.POSIXct(sprintf("%.0f", as.numeric(gTours$timestamp)),
                               format = "%Y%m%d%H%M%S",
                               tz = "UTC")
# - calendar day in Europe/Berlin: the conversion uses the offset valid at each
# - event's own time (CET or CEST), not the offset at the moment of knitting
gTours$timestamp <- format(gTours$timestamp,
                           format = "%Y-%m-%d",
                           tz = "Europe/Berlin")
# - anonymize event_userId
# - every distinct non-zero user id gets a random pseudonym "u_<number>"
eventUserId <- setdiff(unique(gTours$event_userId), 0)
# - sample.int() draws without replacement, so the numbers are distinct by
# - construction, stay integers (never printed as 1e+05) and an empty id set simply
# - yields no pseudonyms
an_userId <- paste0("u_", sample.int(10e6, length(eventUserId)))
# - map each row to its pseudonym; ids that are not in eventUserId (the zero id)
# - keep their original value, as text
idx <- match(gTours$event_userId, eventUserId)
gTours$an_userId <- ifelse(is.na(idx),
                           as.character(gTours$event_userId),
                           an_userId[idx])
# - rows that belong to one of the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) == 0) {
  print("There are currently no data on guided tours from this campaign.")
} else {
  # - keep campaign tours only, drop the zero user id, and show the exit steps
  # - with the anonymized user id
  gTours <- gTours[w, ] %>%
    filter(event_userId != 0) %>%
    select(timestamp, event_tour, event_step, an_userId)
  kable_styling(knitr::kable(gTours, format = "html"),
                full_width = FALSE,
                position = "left")
}
timestamp event_tour event_step an_userId
2018-01-03 diskutieren returnToTraining u_9719807
2018-01-03 seimutig returnToTraining u_9719807
2018-01-04 seimutig boldness u_126071
2018-01-04 seimutig positionCursor u_126071
2018-01-04 diskutieren returnToTraining u_126071
2018-01-05 diskutieren saveReply u_5424461
2018-01-04 seimutig returnToTraining u_126071
2018-01-04 seimutig editButtonCitation u_126071
2018-01-06 seimutig editButton u_514354
2018-01-06 seimutig boldness u_514354
2018-01-06 diskutieren firstMessage u_514354
2018-01-08 seimutig editBoldness u_4937020
2018-01-08 seimutig editSummary u_4937020
2018-01-08 diskutieren returnToTraining u_4937020
2018-01-10 seimutig editButton u_475981
2018-01-10 seimutig editButton u_475981
2018-01-10 seimutig editBoldness u_475981
2018-01-10 seimutig editSummary u_475981
2018-01-10 seimutig editButtonCitation u_475981
2018-01-10 seimutig saveCitation u_475981
2018-01-10 seimutig editButton u_475981
2018-01-10 diskutieren returnToTraining u_475981
2018-01-11 seimutig editBoldness u_445447
2018-01-11 seimutig boldness u_445447
2018-01-11 seimutig insertCitation u_445447
2018-01-11 seimutig anyEdit u_445447
2018-01-11 diskutieren returnToTraining u_445447
2018-01-11 seimutig anyEdit u_5703010

4. 2 Unique users in Guided Tours

Campaign guided tours: diskutieren and seimutig.

# - distinct (anonymized) users per guided tour
# - n_distinct(an_userId) counts each user once per tour; n() would count every
# - exit-step row instead
gTours %>%
  group_by(event_tour) %>%
  summarise(`Unique users` = n_distinct(an_userId)) %>%
  knitr::kable(format = "html") %>%
  kable_styling(full_width = FALSE, position = "left")
event_tour Unique users
diskutieren 7
seimutig 21

5 Training Module

# - load the Training Module export
trainData <- read.csv('wmde_training_data.csv')
# - remove users: Stefan Schneider (WMDE), Sage (Wiki Ed)
# - (guarded: with no match, trainData[-integer(0), ] would drop every row)
wR <- which(trainData$username %in% c('Stefan Schneider (WMDE)', 'Sage (Wiki Ed)'))
if (length(wR) > 0) {
  trainData <- trainData[-wR, ]
}
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = TRUE)]
userData <- read.table(lF,
                       quote = "",
                       sep = "\t",
                       header = TRUE,
                       check.names = FALSE,
                       stringsAsFactors = FALSE)
# - edit counts per user
edData <- read.delim('thankyou_2018_userEdits.tsv',
                     sep = "\t",
                     header = TRUE,
                     stringsAsFactors = FALSE)
# - which users in the Training Module data set appear in the registration log:
# - NOTE(review): userData is not yet filtered by campaign tag at this point, so
# - trainData may keep trainees who registered outside Thank You 2018 - confirm
wRegCampaign <- which(trainData$username %in% userData$event_userName)
trainData <- trainData[wRegCampaign, ]
# - a user counts as having completed training when any of their rows ends on one
# - of these final slides; computed for all rows at once, which also works for an
# - empty trainData
finalSlides <- c('relevanz-quiz-fortsetzung', 'das-wars', 'fertig')
finishers <- trainData$username[trainData$last_slide_completed %in% finalSlides]
completedTraining <- trainData$username %in% finishers
trainData$completed_training <- completedTraining
# - campaign registrations only: self-made accounts with the campaign tag
userData <- userData %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1' & event_isSelfMade) %>%
  select(event_userId, event_userName)
# - one completion flag per user name, attached to the registered users
completedTrainingData <- trainData %>%
  select(username, completed_training)
completedTrainingData <- completedTrainingData[!duplicated(completedTrainingData), ]
userData <- left_join(userData, completedTrainingData, by = c("event_userName" = "username"))
Column `event_userName`/`username` joining character vector and factor, coercing into character vector
# - placeholder column
userData$module_completion_date <- NA
# - attach the edit counts by user id
userData <- left_join(userData,
                      edData,
                      by = c('event_userId' = 'rev_user'))
# - a user has started training when a completion flag exists for them
userData$started_training <- !is.na(userData$completed_training)

5.1 Training Module Overview

How many registered users have started the Training Module?

# - users with started_training set, absolute and as a share of all registrations
nStarted <- sum(userData$started_training)
pctStarted <- round(nStarted/sum(userReg$Registrations)*100, 2)
print(paste0("Number of users who have started the Training Module is ",
             nStarted,
             " , which is ",
             pctStarted,
             "% of registered users."))
[1] "Number of users who have started the Training Module is 16 , which is 13.22% of registered users."

How many registered users have completed the Training Module?

# - users with completed_training TRUE, absolute and as a share of all registrations
nCompleted <- sum(userData$completed_training, na.rm = TRUE)
pctCompleted <- round(nCompleted/sum(userReg$Registrations)*100, 2)
print(paste0("Number of users who have completed the Training Module is ",
             nCompleted,
             " , which is ",
             pctCompleted,
             "% of registered users."))
[1] "Number of users who have completed the Training Module is 13 , which is 10.74% of registered users."

What are the completed slides per user Training Module?

# - rows per training module and last completed slide, most frequent slide first
# - within each module
slidesData <- trainData %>%
  group_by(training_module, last_slide_completed) %>%
  summarise(`N. users completed` = n()) %>%
  arrange(training_module, desc(`N. users completed`))
kable_styling(knitr::kable(slidesData, format = "html"),
              full_width = FALSE,
              position = "left")
training_module last_slide_completed N. users completed
artikel-bewerten fertig 3
editieren-basiswissen das-wars 2
editieren-basiswissen versuche-es 1
wikipedia-basiswissen relevanz-quiz-fortsetzung 14
wikipedia-basiswissen wikipedia-ist-frei 2
wikipedia-basiswissen urheberrecht-und-plagiate 1
# - counts per last completed slide, one facet per training module
ggplot(data = slidesData,
       mapping = aes(x = last_slide_completed,
                     y = `N. users completed`,
                     color = training_module,
                     label = `N. users completed`)) +
  geom_line(mapping = aes(group = 1), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  geom_text_repel(size = 3) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~training_module) +
  labs(title = 'Thank You 2018:\nOverview of Training Modules',
       y = "No. Users") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 6.5, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())

Do users start editing after taking the training modules? Is there a difference between users who completed the modules, users who started but did not complete them, and users who did not take them at all?

How many edits were made on behalf of those users who have started the Training Module vs. those who did not?

# - users and total edits, split by whether the user started the Training Module
tAn <- userData %>%
  group_by(started_training) %>%
  summarise(Users = n(),
            Edits = sum(edits, na.rm = TRUE)) %>%
  arrange(started_training, desc(Edits))
kable_styling(knitr::kable(tAn, format = "html"),
              full_width = FALSE,
              position = "left")
started_training Users Edits
FALSE 105 125
TRUE 16 10
# - edits made by users who started the Training Module
# - the group is selected through its started_training flag, not by row position:
# - with only one group present, row 2 does not exist (or row 1 is the wrong group)
startedEdits <- sum(tAn$Edits[tAn$started_training], na.rm = TRUE)
print(paste0("In total, the users who have started the Training Module made ",
             startedEdits,
             " edits, which is ",
             round(startedEdits/sum(tAn$Edits, na.rm = TRUE)*100, 2),
             "% of all edits."
               ))
[1] "In total, the users who have started the Training Module made 10 edits, which is 7.41% of all edits."
# - edits made by users who did not start the Training Module
# - the group is selected through its started_training flag, not by row position:
# - with only one group present, row 1 may be the "started" group
notStartedEdits <- sum(tAn$Edits[!tAn$started_training], na.rm = TRUE)
print(paste0("On the other hand, the users who did not take the Training Module made ",
             notStartedEdits,
             " edits, which is ",
             round(notStartedEdits/sum(tAn$Edits, na.rm = TRUE)*100, 2),
             "% of all edits."
               ))
[1] "On the other hand, the users who did not take the Training Module made 125 edits, which is 92.59% of all edits."

How many edits were made on behalf of those users who have completed the Training Module vs. those who did not?

# - users and total edits, split by completion flag (NA = no training data for the user)
tAn <- userData %>%
  group_by(completed_training) %>%
  summarise(Users = n(),
            Edits = sum(edits, na.rm = TRUE)) %>%
  arrange(completed_training, desc(Edits))
kable_styling(knitr::kable(tAn, format = "html"),
              full_width = FALSE,
              position = "left")
completed_training Users Edits
FALSE 3 0
TRUE 13 10
NA 105 125

Completion of the Training Module vs. number of edits

# - final slide for: wikipedia-basiswissen = relevanz-quiz-fortsetzung
# - final slide for: editieren-basiswissen = das-wars
# - final slide for: artikel-bewerten = fertig
# finalSlide <- c('relevanz-quiz-fortsetzung', 'das', 'fertig')
# names(finalSlide) <- c('wikipedia-basiswissen', 'editieren-basiswissen', 'relevanz-quiz-fortsetzung')
# - users and total edits per completion flag
completedTrainingEdits <- userData %>%
  group_by(completed_training) %>%
  summarise(Users = n(),
            Edits = sum(edits, na.rm = TRUE))
kable_styling(knitr::kable(completedTrainingEdits, format = "html"),
              full_width = FALSE,
              position = "left")
completed_training Users Edits
FALSE 3 0
TRUE 13 10
NA 105 125

Let’s now take a look at the number of users who (a) made any edits at all, or (b) have reached their 10th edit, in these two groups:

# - per-user flags: made at least one edit / reached the tenth edit
userData <- userData %>%
  mutate(Edits = edits > 0,
         Edits10 = edits >= 10)
# - users and number of users with ten or more edits, per completion flag
completedTrainingEdits <- userData %>%
  group_by(completed_training) %>%
  summarise(Users = n(),
            Edits10 = sum(Edits10, na.rm = TRUE))
kable_styling(knitr::kable(completedTrainingEdits, format = "html"),
              full_width = FALSE,
              position = "left")
completed_training Users Edits10
FALSE 3 0
TRUE 13 0
NA 105 2
---
title: 'Thank You Campaign 2018: Report'
author: "Goran S. Milovanovic, Data Analyst, WMDE"
date: "January, 2018"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_float: yes
    toc_depth: 5
  html_document:
    toc: yes
    toc_depth: 5
---


**Feedback** should be sent to `goran.milovanovic_ext@wikimedia.de`. 

The campaign is run from 1. January 2018 to N January 2018.

```{r setup}
# - evaluate all chunks relative to this directory (the relative file names read below resolve against it)
# - NOTE(review): absolute, machine-specific path - adjust before knitting on another machine
knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
```

```{r, echo = F, warning = F, message = F, results = 'hide'}
# !diagnostics off
### --- Setup
knitr::opts_chunk$set(fig.width = 15, fig.height = 8) 
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
library(ggrepel)
library(scales)
library(RColorBrewer)
library(kableExtra)
library(rmarkdown)
library(knitr)
library(DT)
library(reshape2)
```

## 0. Data Acquisition

**NOTE:** the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script `ThankYou_2018_Production_SQL.R` on stat1005.eqiad.wmnet, collecting the data as `.tsv` and `.csv` files, copying manually, and processing locally. Run from stat1005 stat box by executing `Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R`.

```{r, echo = T, eval = F}

### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets

### --- Campaign Details: 
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018. 
# - The report must include any activity from the beginning of the campaign. 
# - The estimated start will be 1st January 2018.

# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig

### --- Training Modules Schema: 
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking

### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
# - working folders of the campaign on the stat box, kept together in one
# - named vector and then exposed under the names the rest of the script uses
campaignDirs <- c(
  impressions = '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions',
  clicks = '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks',
  dailyUpdate = '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate'
)
bannerImpressionsDir <- campaignDirs[['impressions']]
bannerClicksDir <- campaignDirs[['clicks']]
dailyUpdateDir <- campaignDirs[['dailyUpdate']]

### --- Campaign time range
# - first and last calendar day of the hourly range built for the Hive queries
endDate <- '2018-01-08'
startDate <- '2018-01-02'

### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------

# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1

### --- loop over date range, create query, fetch, and store
# - Purpose: build dR, a table with one row per (CET date, UTC year/month/day)
# - that carries the UTC hours of that row as a ready-made "hour=.. OR hour=.."
# - condition string.
# - hourly sequence from startDate to endDate, both read as CET midnights
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
                        to = as.POSIXlt(endDate, tz = "CET"),
                        by = 'hour')
# - drop the last element of the sequence (the endDate midnight itself)
dateRange <- dateRange[-length(dateRange)]
# - CET label of every hour: the part of its printed form before the first space
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
  strsplit(x, split = " ", fixed = T)[[1]][1]
})
# - keep the CET label as the element name, then re-express the hours in UTC
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
# - hours later than endDate midnight (read as UTC) are removed
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
  dateRange <- dateRange[-w]
}
# - one single-row data.frame per hour: CET label plus UTC year/month/day/hour
# - NOTE(review): year(), month(), mday(), hour() are presumably the data.table
# - helpers (data.table is attached in this chunk) - confirm
dR <- list()
for (i in 1:length(dateRange)) {
  dR[[i]] <- data.frame(
    cetName = names(dateRange[i]),
    utcYear = year(dateRange[i]),
    utcMonth = month(dateRange[i]),
    utcDay = mday(dateRange[i]),
    utcHour = hour(dateRange[i])
  )
}
dR <- rbindlist(dR)
# - collapse the hours of each (CET date, UTC date) group into one
# - "hour=H OR hour=H ..." string
dR <- dR %>%
  group_by(cetName, utcYear, utcMonth, utcDay) %>%
  summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))

### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------

# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1

# - Fetch the landing page requests from wmf.webrequest: one beeline run per
# - row of dR (a CET day can span two UTC days, hence the inner loop).
# - Reads:  dR (cetName, utcYear, utcMonth, utcDay, utcHour), bannerClicksDir
# - Writes: thankyou2018_BannerClicks.hql in the working directory (overwritten
# -         on every run) and one thankyou2018_BannerClicks_<CET date>_<j>.tsv
# -         per query in bannerClicksDir

# - set bannerClicksDir
setwd(bannerClicksDir)

# - the set of CET days is loop-invariant: compute it once
cetDays <- unique(dR$cetName)

# - seq_along() instead of 1:length(): nothing is executed for an empty range
for (i in seq_along(cetDays)) {
  
  # - rows of dR that belong to the i-th CET day
  wCetName <- which(dR$cetName %in% cetDays[i])
  
  for (j in seq_along(wCetName)) {
    
    # - construct HiveQL query:
    y <- dR$utcYear[wCetName[j]]
    m <- dR$utcMonth[wCetName[j]]
    d <- dR$utcDay[wCetName[j]]
    # - "hour=.. OR hour=.." condition; not called `hour`, which would shadow
    # - the hour() helper used when dR was built
    hourCondition <- dR$utcHour[wCetName[j]]
    q <- paste(
      "USE wmf;
      SELECT uri_path, uri_query, referer FROM webrequest
      WHERE uri_host = 'de.wikipedia.org'
      AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia' 
      AND year = ", y,
      " AND month = ", m,
      " AND day = ", d,
      " AND (", hourCondition, ");",
      sep = "")
    # - write hql
    write(q, 'thankyou2018_BannerClicks.hql')
    # - prepare output file:
    fileName <- "thankyou2018_BannerClicks_"
    fileName <- paste0(fileName,
                       as.character(cetDays[i]),
                       "_", j,
                       ".tsv")
    fileName <- paste0(bannerClicksDir, "/", fileName)
    # - execute hql script:
    hiveArgs <-
      'beeline -f'
    hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
                        fileName)
    # - command:
    hiveCommand <- paste(hiveArgs, hiveInput)
    system(command = hiveCommand, wait = TRUE)
    
  }
  
}

### --- Wrangle this dataset:

### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'

### --- Dataset:
# - For every BannerClicks .tsv file in the current working directory, count
# - the lines that contain the campaign tag; one output row per file.
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = TRUE)]
lF <- lF[grepl('Clicks', lF, fixed = TRUE)]
# - lapply() over the file names: with no files the result is simply empty,
# - whereas a 1:length(lF) loop would try to read a non-existing file
dataSet <- lapply(lF, function(clicksFile) {
  dS <- readLines(clicksFile, n = -1)
  data.frame(
    # - third "_"-separated token of the file name
    timestamp = strsplit(clicksFile, split = "_")[[1]][3],
    # - number of lines mentioning the campaign tag
    bannerClicks = sum(grepl(campaignBanner, dS, fixed = TRUE)),
    stringsAsFactors = FALSE)
})
dataSet <- rbindlist(dataSet)

### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")

### --- SQL
startDate <- '2018-01-01'

### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------

# - NOTE: the log timestamps are UTC; the lower bound below is shifted for CE(S)T.
# - source table: log.ServerSideAccountCreation_5487345, de.wikipedia.org only
# - NOTE(review): the bound is startDate at 22:00 UTC, i.e. a two hour shift;
# - CET in January is UTC+1, so this looks one hour over-inclusive - confirm
startStamp <- gsub("-", "", startDate, fixed = TRUE)
qCommand <- paste0(
  "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ",
  startStamp,
  "220000));\" > ",
  dailyUpdateDir,
  "/thankyou_2018_userRegistrations.tsv")
# - run the query; the shell redirect stores the result as a .tsv file
system(command = qCommand, wait = TRUE)


### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------

# - NOTE: the log timestamps are UTC; the lower bound below is shifted for CE(S)T.
# - source table: log.GuidedTourExited_8690566, de.wikipedia.org only
tourStartStamp <- gsub("-", "", startDate, fixed = TRUE)
qCommand <- paste0(
  "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ",
  tourStartStamp,
  "220000));\" > ",
  dailyUpdateDir,
  "/thankyou_2018_guidedTours.tsv")
# - run the query; the shell redirect stores the result as a .tsv file
system(command = qCommand, wait = TRUE)

### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------

# - user IDs of the campaign registrations: read the registrations file found
# - in the working directory and keep self-made accounts with the campaign tag
regFile <- list.files()
regFile <- regFile[grepl('userRegistrations', regFile, fixed = TRUE)]
userReg <- read.table(regFile,
                      header = TRUE,
                      sep = "\t",
                      quote = "",
                      check.names = FALSE,
                      stringsAsFactors = FALSE)
userReg <- dplyr::select(userReg, event_userId, event_isSelfMade, event_campaign)
userReg <- filter(userReg,
                  event_isSelfMade == 1,
                  event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - sql query: number of revisions per campaign user
# - NOTE(review): with no campaign users the IN () list is empty - confirm that
# - this case cannot occur when the script is run
uidList <- paste(uid, collapse = ", ")
sqlQuery <- paste0('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                   uidList,
                   ') GROUP BY rev_user;')
# - shell command: run the query against dewiki, redirect the result to a .tsv
mySqlCommand <- paste0('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                       '"', sqlQuery, '" > ',
                       dailyUpdateDir, '/thankyou_2018_userEdits.tsv')
system(command = mySqlCommand,
       wait = TRUE)

### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------

```

## 1A. Campaign Banner Impressions

### 1A. 1 The data set
```{r echo = T, warning = 'hide', message = F}
# - report: current update
print(paste0("Current update: ", as.character(Sys.time())))
# - wide table: one column per banner, first column holds the row names
bannerImpressions <- read.csv('thankyouBannerImpressions.csv',
                              header = TRUE,
                              row.names = 1,
                              stringsAsFactors = FALSE)
# - long format: one row per (timeStamp, Banner) with its number of Views
bannerImpressions <- gather(bannerImpressions,
                            key = Banner,
                            value = Views,
                            B17WMDE_thankyou_authors:B17WMDE_thankyou_authors_mob_B)
bannerImpressions <- group_by(bannerImpressions, timeStamp, Banner)
bannerImpressions <- summarise(bannerImpressions, Views = sum(Views))
# - render as a left-aligned html table
impressionsTable <- knitr::kable(bannerImpressions, format = "html")
kable_styling(impressionsTable, full_width = FALSE, position = "left")
```
### 1A. 2 Chart

```{r echo = T, warning = 'hide', message = F}
# - line chart of the impressions: one coloured line per banner, every point
# - labelled with its number of views
impressionsAes <- aes(x = timeStamp, y = Views, color = Banner, label = Views)
ggplot(data = bannerImpressions, mapping = impressionsAes) +
  geom_line(mapping = aes(group = Banner), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  geom_text_repel(size = 3) +
  scale_y_continuous(labels = scales::comma) +
  ylab("Impressions") +
  ggtitle('Thank You 2018:\nOverview of BannerImpressions') +
  theme_minimal() +
  # - all theme adjustments in a single call
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())
```

## 1B. Campaign Pageviews (Banner Clicks)

### 1B. 1 The data set

```{r echo = T, warning = 'hide', message = F}
# - landing page views per time stamp: read the update file and sum the
# - banner clicks of all files that share a time stamp
pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
                      header = TRUE,
                      row.names = 1,
                      stringsAsFactors = FALSE) %>%
  group_by(timestamp) %>%
  summarise(Pageviews = sum(bannerClicks))
# - render as a left-aligned html table
pageviewsTable <- knitr::kable(pageviews, format = "html")
kable_styling(pageviewsTable, full_width = FALSE, position = "left")
```

### 1B. 2 Chart

```{r echo = T, warning = 'hide', message = F}
# - bar chart of the landing page views per time stamp, bars labelled with counts
pageviewsAes <- aes(x = timestamp, y = Pageviews, label = Pageviews)
ggplot(data = pageviews, mapping = pageviewsAes) +
  geom_bar(stat = "identity",
           position = "dodge",
           width = .15,
           color = "darkblue",
           fill = "white") +
  geom_label(size = 3) +
  ggtitle('Thank You 2018:\nOverview of Landing Pageviews') +
  theme_minimal() +
  # - all theme adjustments in a single call
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())
```


## 2. Campaign User Registrations

### 2. 1 The data set

```{r echo = T, warning = 'hide', message = F}
# - Daily number of campaign user registrations.
# - Reads:  thankyou_2018_userRegistrations.tsv (timestamp, event_campaign)
# - Result: userReg with one row per Europe/Berlin calendar day and the number
# -         of Registrations carrying the campaign tag; rendered as html table
userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
                      sep = "\t",
                      quote = "",
                      header = TRUE,
                      # row.names = 1,
                      stringsAsFactors = FALSE)
# - the log stores timestamps as YYYYMMDDHHMMSS in UTC: parse them in one step
userReg$timestamp <- as.POSIXct(as.character(userReg$timestamp),
                                format = "%Y%m%d%H%M%S",
                                tz = "UTC")
# - express every event as its Europe/Berlin calendar day. The UTC offset is
# - resolved per event by the time zone database; deriving one offset from
# - Sys.time() would apply the offset valid on the day the report is knitted
# - (CET vs. CEST) to all events.
userReg$timestamp <- format(userReg$timestamp,
                            format = "%Y-%m-%d",
                            tz = "Europe/Berlin")
# - campaign registrations per day
userReg <- userReg %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1') %>% 
  group_by(timestamp) %>% 
  summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>% 
  kable_styling(full_width = FALSE, position = "left")
```

```{r echo = T, warning = 'hide', message = F}
# - report: total of the daily registration counts
totalRegistered <- sum(userReg$Registrations)
print(paste0("Total number of registered users: ", totalRegistered))
```

### 2. 2 Chart

```{r echo = T, warning = 'hide', message = F}
# - bar chart of the registrations per day, bars labelled with their counts
registrationsAes <- aes(x = timestamp, y = Registrations, label = Registrations)
ggplot(data = userReg, mapping = registrationsAes) +
  geom_bar(stat = "identity",
           position = "dodge",
           width = .15,
           color = "darkblue",
           fill = "white") +
  geom_label(size = 3) +
  ggtitle('Thank You 2018:\nOverview of User Registrations') +
  theme_minimal() +
  # - all theme adjustments in a single call
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())
```

## 3. Campaign User Edits

### 3. 1 The data set

```{r echo = T, warning = 'hide', message = F}
# - the edits file can be empty: look at its raw lines before parsing it
userEdits <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(userEdits) < 1) {
  # - nothing to parse: userEdits stays an empty character vector
  print("There are currently no user edits from this campaign.")
} else {
  # - parse the file: one row per user with the number of edits
  userEdits <- read.delim('thankyou_2018_userEdits.tsv',
                          header = TRUE,
                          sep = "\t",
                          stringsAsFactors = FALSE)
  # - report
  totalEdits <- sum(userEdits$edits)
  print(paste0(totalEdits, " edits were made by the campaign registered users thus far."))
}
```

### 3. 2 Chart

```{r echo = T, warning = 'hide', message = F}
# - Distribution of edits: how many users made a particular number of edits.
# - userEdits is a data.frame only if the edits file had content; otherwise it
# - is a character vector and userEdits$edits would stop the report, so the
# - chart is drawn only when there is something to show.
if (is.data.frame(userEdits) && nrow(userEdits) > 0) {
  userEditsDist <- as.data.frame(table(userEdits$edits))
  colnames(userEditsDist) <- c('Number of Edits', 'Number of Users')
  editsChart <- ggplot(userEditsDist, aes(x = `Number of Edits`,
                                          y = `Number of Users`,
                                          label = `Number of Users`)) +
    geom_bar(stat = "identity", 
             position = "dodge", 
             width = .15, 
             fill = "darkorange", 
             color = "darkorange") +
    ggtitle('Thank You 2018:\nHow many users made a particular number of edits') +
    geom_label(size = 3) +
    theme_minimal() + 
    theme(axis.text.x = element_text(angle = 0, size = 8, hjust = 1)) +
    theme(plot.title = element_text(size = 10)) +
    theme(legend.title = element_blank()) +
    theme(panel.grid.major.x = element_blank()) +
    theme(panel.grid.minor.x = element_blank()) +
    theme(panel.background = element_blank())
  # - explicit print: the plot is created inside a conditional
  print(editsChart)
} else {
  print("There are currently no user edits from this campaign.")
}
```

## 4. Campaign Guided Tours

### 4. 1 The data set: Guided Tour Exit Steps

Campaign guided tours: `diskutieren` and `seimutig`.

```{r echo = T, warning = 'hide', message = F}
# - Exit steps of the campaign guided tours.
# - Reads:  thankyou_2018_guidedTours.tsv (timestamp, event_userId, event_tour,
# -         event_step)
# - Result: gTours restricted to the campaign tours and to logged-in users
# -         (event_userId != 0), with columns timestamp (Europe/Berlin calendar
# -         day), event_tour, event_step and an_userId (random pseudonym)
tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = TRUE,
                     row.names = 1,
                     stringsAsFactors = FALSE)
# - the log stores timestamps as YYYYMMDDHHMMSS in UTC: parse them in one step
gTours$timestamp <- as.POSIXct(as.character(gTours$timestamp),
                               format = "%Y%m%d%H%M%S",
                               tz = "UTC")
# - express every event as its Europe/Berlin calendar day; the UTC offset is
# - resolved per event instead of being derived from the time of knitting
gTours$timestamp <- format(gTours$timestamp,
                           format = "%Y-%m-%d",
                           tz = "Europe/Berlin")
# - anonymize event_userId
# - one distinct random number in 1..10e6 per non-zero user ID; sampling
# - without replacement guarantees uniqueness and also works for zero IDs
eventUserId <- setdiff(unique(gTours$event_userId), 0)
an_userId <- sprintf("u_%d", sample.int(10e6, length(eventUserId)))
# - look up the pseudonym of every row; IDs without one (0) are kept as they are
wId <- match(gTours$event_userId, eventUserId)
gTours$an_userId <- ifelse(is.na(wId),
                           as.character(gTours$event_userId),
                           an_userId[wId])
# - look up for the campaign guided tours
# - the restriction is always applied, so that later chunks never see the
# - events of other tours, even when the campaign tours have no data yet
gTours <- gTours %>% 
  filter(event_tour %in% tours) %>% 
  filter(event_userId != 0) %>% 
  select(timestamp, event_tour, event_step, an_userId)
if (nrow(gTours) > 0) {
  knitr::kable(gTours, format = "html") %>% 
    kable_styling(full_width = FALSE, position = "left")
} else {
  print("There are currently no data on guided tours from this campaign.")
}
```
### 4. 2 Unique users in Guided Tours

Campaign guided tours: `diskutieren` and `seimutig`.

```{r echo = T, warning = 'hide', message = F}
# - Number of unique users per guided tour.
# - A user can exit a tour more than once and every exit is one row of gTours:
# - count distinct (anonymized) users instead of rows.
gTours %>% 
  group_by(event_tour) %>% 
  summarise(`Unique users` = n_distinct(an_userId)) %>% 
  knitr::kable(format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

## 5. Training Module

```{r echo = T, warning = 'hide', message = F}
# - Combine the Training Module data with the registrations and the edits.
# - Reads:  wmde_training_data.csv, the userRegistrations file found in the
# -         working directory, thankyou_2018_userEdits.tsv
# - Result: trainData - training records of registered users, plus the flag
# -                     completed_training (per user)
# -         userData  - one row per self-made campaign registration with
# -                     completed_training (NA = never started the training),
# -                     started_training and edits (NA = no edits found)
trainData <- read.csv('wmde_training_data.csv')
# - remove users: Stefan Schneider (WMDE), Sage (Wiki Ed)
# - logical mask instead of a negative index: trainData[-which(...), ] returns
# - zero rows when none of the excluded users is present
excludedUsers <- c('Stefan Schneider (WMDE)', 'Sage (Wiki Ed)')
trainData <- trainData[!(trainData$username %in% excludedUsers), ]
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = TRUE)]
userData <- read.table(lF, 
                       quote = "",
                       sep = "\t",
                       header = TRUE,
                       check.names = FALSE,
                       stringsAsFactors = FALSE)
# - user edits; the file is empty when no campaign user has edited yet, in
# - which case an empty table with the same columns is used
edLines <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(edLines) >= 1) {
  edData <- read.delim('thankyou_2018_userEdits.tsv',
                       sep = "\t",
                       header = TRUE,
                       stringsAsFactors = FALSE)
} else {
  edData <- data.frame(edits = integer(0), rev_user = integer(0))
}
# - which users in the Training Module data set are found in the registrations:
# - NOTE(review): userData is not yet restricted to the campaign tag at this
# - point, so trainData may include registrations from outside the campaign -
# - confirm that this is intended
trainData <- trainData[trainData$username %in% userData$event_userName, ]
# - a user has completed the training if any of the user's records ends on
# - the final slide of a module (vectorized; also correct for zero rows)
finalSlides <- c('relevanz-quiz-fortsetzung', 'das-wars', 'fertig')
finishedUsers <- trainData$username[trainData$last_slide_completed %in% finalSlides]
trainData$completed_training <- trainData$username %in% finishedUsers
# - self-made campaign registrations only
userData <- userData %>% 
  filter(event_campaign %in% 'wmde_etc2017_bt1' & event_isSelfMade) %>% 
  select(event_userId, event_userName)
# - one completion flag per user name
completedTrainingData <- trainData %>%
  select(username, completed_training)
completedTrainingData <- completedTrainingData[!duplicated(completedTrainingData), ]
userData <- left_join(userData, completedTrainingData, by = c("event_userName" = "username"))
userData$module_completion_date <- NA
userData <- left_join(userData, edData,
                      by = c('event_userId' = 'rev_user'))
# - users without any training record have NA in completed_training
userData$started_training <- !is.na(userData$completed_training)
```


### 5.1 Training Module Overview

How many registered users have started the Training Module?

```{r echo = T, warning = 'hide', message = F}
# - report: users who started the training, absolute and relative to the
# - number of campaign registrations
nStarted <- sum(userData$started_training)
pctStarted <- round(nStarted / sum(userReg$Registrations) * 100, 2)
print(paste0("Number of users who have started the Training Module is ",
             nStarted,
             " , which is ",
             pctStarted,
             "% of registered users."))
```

How many registered users have completed the Training Module?

```{r echo = T, warning = 'hide', message = F}
# - report: users who completed the training (users without a training record
# - are NA and are left out), absolute and relative to the campaign registrations
nCompleted <- sum(userData$completed_training, na.rm = TRUE)
pctCompleted <- round(nCompleted / sum(userReg$Registrations) * 100, 2)
print(paste0("Number of users who have completed the Training Module is ",
             nCompleted,
             " , which is ",
             pctCompleted,
             "% of registered users."))
```

Which slide did users complete last in each Training Module?

```{r echo = T, warning = 'hide', message = F}
# - number of training records per (module, last completed slide), sorted by
# - module and then by descending count
slidesData <- trainData %>%
  group_by(training_module, last_slide_completed) %>%
  summarise(`N. users completed` = n()) %>%
  arrange(training_module, desc(`N. users completed`))
# - render as a left-aligned html table
slidesTable <- knitr::kable(slidesData, format = "html")
kable_styling(slidesTable, full_width = FALSE, position = "left")
```

```{r echo = T, warning = 'hide', message = F}
# - one panel per training module: count per last completed slide, points
# - labelled with the counts
slidesAes <- aes(x = last_slide_completed,
                 y = `N. users completed`,
                 color = training_module,
                 label = `N. users completed`)
ggplot(data = slidesData, mapping = slidesAes) +
  geom_line(mapping = aes(group = 1), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  geom_text_repel(size = 3) +
  facet_wrap(~training_module) +
  scale_y_continuous(labels = scales::comma) +
  ylab("No. Users") +
  ggtitle('Thank You 2018:\nOverview of Training Modules') +
  theme_minimal() +
  # - all theme adjustments in a single call
  theme(axis.text.x = element_text(angle = 90, size = 6.5, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())
```

Do users start editing after taking the training modules? Is there a difference between users who completed the modules, users who started but did not complete them, and users who did not take them at all?

How many edits were made by the users who have started the Training Module vs. those who have not?

```{r echo = T, warning = 'hide', message = F}
# - users and edits by training start: one row per value of started_training
tAn <- group_by(userData, started_training)
tAn <- summarise(tAn,
                 Users = n(),
                 Edits = sum(edits, na.rm = TRUE))
tAn <- arrange(tAn, started_training, desc(Edits))
# - render as a left-aligned html table
startedTable <- knitr::kable(tAn, format = "html")
kable_styling(startedTable, full_width = FALSE, position = "left")
```

```{r echo = T, warning = 'hide', message = F}
# - Report the edits of the users who started the Training Module and of the
# - users who did not, each with its share of all edits.
# - The rows of tAn are selected by the value of started_training: a positional
# - lookup (row 1 = FALSE, row 2 = TRUE) reports the wrong group whenever only
# - one of the two groups is present.
editsStarted <- sum(tAn$Edits[tAn$started_training %in% TRUE], na.rm = TRUE)
editsNotStarted <- sum(tAn$Edits[tAn$started_training %in% FALSE], na.rm = TRUE)
editsTotal <- sum(tAn$Edits, na.rm = TRUE)
# - share of all edits in percent; 0 when no edits were made at all
editsShare <- function(edits) {
  if (editsTotal > 0) {
    round(edits / editsTotal * 100, 2)
  } else {
    0
  }
}
print(paste0("In total, the users who have started the Training Module made ", 
             editsStarted,
             " edits, which is ",
             editsShare(editsStarted),
             "% of all edits."
               ))
print(paste0("On the other hand, the users who did not take the Training Module made ", 
             editsNotStarted,
             " edits, which is ",
             editsShare(editsNotStarted),
             "% of all edits."
               ))
```

How many edits were made by the users who have completed the Training Module vs. those who have not?

```{r echo = T, warning = 'hide', message = F}
# - users and edits by training completion: one row per value of
# - completed_training (NA = users without any training record)
tAn <- group_by(userData, completed_training)
tAn <- summarise(tAn,
                 Users = n(),
                 Edits = sum(edits, na.rm = TRUE))
tAn <- arrange(tAn, completed_training, desc(Edits))
# - render as a left-aligned html table
completedTable <- knitr::kable(tAn, format = "html")
kable_styling(completedTable, full_width = FALSE, position = "left")
```

Completion of the Training Module vs. number of edits

```{r echo = T, warning = 'hide', message = F}
# - final slides of the training modules:
# -   wikipedia-basiswissen: relevanz-quiz-fortsetzung
# -   editieren-basiswissen: das-wars
# -   artikel-bewerten:      fertig
# finalSlide <- c('relevanz-quiz-fortsetzung', 'das', 'fertig')
# names(finalSlide) <- c('wikipedia-basiswissen', 'editieren-basiswissen', 'relevanz-quiz-fortsetzung')
# - users and edits per value of completed_training
completedTrainingEdits <- group_by(userData, completed_training)
completedTrainingEdits <- summarise(completedTrainingEdits,
                                    Users = n(),
                                    Edits = sum(edits, na.rm = TRUE))
# - render as a left-aligned html table
completedEditsTable <- knitr::kable(completedTrainingEdits, format = "html")
kable_styling(completedEditsTable, full_width = FALSE, position = "left")
```

Let's now take a look at the number of users who (a) made any edits at all, or (b) have reached their 10th edit, in these groups:

```{r echo = T, warning = 'hide', message = F}
# - Number of users per value of completed_training who (a) made any edit at
# - all (Edits) and (b) reached their 10th edit (Edits10).
# - Users without edits have NA in `edits`; they count for neither (a) nor (b).
userData$Edits <- (userData$edits > 0)
userData$Edits10 <- (userData$edits >= 10)
# - both flags are reported: (a) used to be computed but left out of the table
completedTrainingEdits <- userData %>% 
  select(completed_training, Edits, Edits10) %>%
  group_by(completed_training) %>% 
  summarise(Users = n(),
            Edits = sum(Edits, na.rm = TRUE),
            Edits10 = sum(Edits10, na.rm = TRUE))
knitr::kable(completedTrainingEdits, format = "html") %>%
  kable_styling(full_width = FALSE, position = "left")
```