Feedback should be send to goran.milovanovic_ext@wikimedia.de.

The campaign is run from 1. January 2018 to N January 2018.

knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')

0. Data Acquisiton

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script ThankYou_2018_Production_SQL.R on stat1005.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R.

### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets

### --- Campaign Details: 
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018. 
# - The report must include any activity from the beginning of the campaign. 
# - The estimated start will be 1st January 2018.

# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig

### --- Training Modules Schema: 
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking

### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate' 

### --- Campaign time range
startDate <- '2018-01-02'
endDate <- '2018-01-08'

### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------

# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1

### --- loop over date range, create query, fetch, and store
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
                        to = as.POSIXlt(endDate, tz = "CET"),
                        by = 'hour')
dateRange <- dateRange[-length(dateRange)]
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
  strsplit(x, split = " ", fixed = T)[[1]][1]
})
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
  dateRange <- dateRange[-w]
}
dR <- list()
for (i in 1:length(dateRange)) {
  dR[[i]] <- data.frame(
    cetName = names(dateRange[i]),
    utcYear = year(dateRange[i]),
    utcMonth = month(dateRange[i]),
    utcDay = mday(dateRange[i]),
    utcHour = hour(dateRange[i])
  )
}
dR <- rbindlist(dR)
dR <- dR %>%
  group_by(cetName, utcYear, utcMonth, utcDay) %>%
  summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))

### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------

# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1

# - set bannerClicksDir
setwd(bannerClicksDir)

for (i in 1:length(unique(dR$cetName))) {
  
  wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
  
  for (j in 1:length(wCetName)) {
    
    # - construct HiveQL query:
    y <- dR$utcYear[wCetName[j]]
    m <- dR$utcMonth[wCetName[j]]
    d <- dR$utcDay[wCetName[j]]
    hour <- dR$utcHour[wCetName[j]]
    q <- paste(
      "USE wmf;
      SELECT uri_path, uri_query, referer FROM webrequest
      WHERE uri_host = 'de.wikipedia.org'
      AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia' 
      AND year = ", y,
      " AND month = ", m,
      " AND day = ", d,
      " AND (", hour, ");",
      sep = "")
    # - write hql
    write(q, 'thankyou2018_BannerClicks.hql')
    # - prepare output file:
    fileName <- "thankyou2018_BannerClicks_"
    fileName <- paste0(fileName,
                       as.character(unique(dR$cetName)[i]),
                       "_", j,
                       ".tsv")
    fileName <- paste0(bannerClicksDir, "/", fileName)
    # - execute hql script:
    hiveArgs <-
      'beeline -f'
    hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
                        fileName)
    # - command:
    hiveCommand <- paste(hiveArgs, hiveInput)
    system(command = hiveCommand, wait = TRUE)
    
  }
  
}

### --- Wrangle this dataset:

### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'

### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
lF <- lF[grepl('Clicks', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
  dS <- readLines(lF[i], n = -1)
  timeStamp <- strsplit(lF[i], split = "_")[[1]][3]
  bannerClicks <- sum(grepl(campaignBanner, dS, fixed = T))
  dataSet[[i]] <- data.frame(timestamp = timeStamp,
                             bannerClicks = bannerClicks,
                             stringsAsFactors = F)
}
dataSet <- rbindlist(dataSet)

### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")

### --- SQL
startDate <- '2018-01-01'

### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", gsub("-", "", startDate, fixed = T), "220000));\" > ",
            dailyUpdateDir, "/thankyou_2018_userRegistrations.tsv", sep = "")
system(command = qCommand, wait = TRUE)


### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", 
                  gsub("-", "", startDate, fixed = T), "220000));\" > ", 
                  dailyUpdateDir, "/thankyou_2018_guidedTours.tsv", sep = "")
system(command = qCommand, wait = TRUE)

### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------

# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
userReg <- userReg %>% 
  dplyr::select(event_userId, event_isSelfMade, event_campaign) %>% 
  filter(event_isSelfMade == 1) %>% 
  filter(event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                  paste(uid, collapse = ", "),
                  ') GROUP BY rev_user;',
                  sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                      paste('"', sqlQuery, '" > ', sep = ""), 
                      dailyUpdateDir, '/thankyou_2018_userEdits.tsv', sep = "")
system(command = mySqlCommand, 
       wait = TRUE)

### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------

1A. Campaign Banner Impressions

1A. 1 The data set

# - report: current update
print(paste0("Current update: ", as.character(Sys.time())))
[1] "Current update: 2018-01-17 10:12:14"
bannerImpressions <- read.csv('thankyouBannerImpressions.csv',
                              row.names = 1,
                              header = T,
                              stringsAsFactors = F) %>% 
  gather(key =  Banner,
         value = Views,
         B17WMDE_thankyou_authors:B17WMDE_thankyou_authors_mob_B) %>% 
  group_by(timeStamp, Banner) %>% 
  summarise(Views = sum(Views))
knitr::kable(bannerImpressions, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
timeStamp Banner Views
2018-01-01 B17WMDE_thankyou_authors 257533
2018-01-01 B17WMDE_thankyou_authors_B 128423
2018-01-01 B17WMDE_thankyou_authors_mob_A 0
2018-01-01 B17WMDE_thankyou_authors_mob_B 0
2018-01-01 B17WMDE_thankyou_authors_pad_A 0
2018-01-01 B17WMDE_thankyou_authors_pad_B 0
2018-01-02 B17WMDE_thankyou_authors 741533
2018-01-02 B17WMDE_thankyou_authors_B 370737
2018-01-02 B17WMDE_thankyou_authors_mob_A 0
2018-01-02 B17WMDE_thankyou_authors_mob_B 3
2018-01-02 B17WMDE_thankyou_authors_pad_A 0
2018-01-02 B17WMDE_thankyou_authors_pad_B 0
2018-01-03 B17WMDE_thankyou_authors 824290
2018-01-03 B17WMDE_thankyou_authors_B 410722
2018-01-03 B17WMDE_thankyou_authors_mob_A 0
2018-01-03 B17WMDE_thankyou_authors_mob_B 0
2018-01-03 B17WMDE_thankyou_authors_pad_A 0
2018-01-03 B17WMDE_thankyou_authors_pad_B 0
2018-01-04 B17WMDE_thankyou_authors 818331
2018-01-04 B17WMDE_thankyou_authors_B 409420
2018-01-04 B17WMDE_thankyou_authors_mob_A 0
2018-01-04 B17WMDE_thankyou_authors_mob_B 0
2018-01-04 B17WMDE_thankyou_authors_pad_A 0
2018-01-04 B17WMDE_thankyou_authors_pad_B 0
2018-01-05 B17WMDE_thankyou_authors 750962
2018-01-05 B17WMDE_thankyou_authors_B 375077
2018-01-05 B17WMDE_thankyou_authors_mob_A 0
2018-01-05 B17WMDE_thankyou_authors_mob_B 2
2018-01-05 B17WMDE_thankyou_authors_pad_A 0
2018-01-05 B17WMDE_thankyou_authors_pad_B 0
2018-01-06 B17WMDE_thankyou_authors 586811
2018-01-06 B17WMDE_thankyou_authors_B 293237
2018-01-06 B17WMDE_thankyou_authors_mob_A 0
2018-01-06 B17WMDE_thankyou_authors_mob_B 0
2018-01-06 B17WMDE_thankyou_authors_pad_A 0
2018-01-06 B17WMDE_thankyou_authors_pad_B 0
2018-01-07 B17WMDE_thankyou_authors 706186
2018-01-07 B17WMDE_thankyou_authors_B 352536
2018-01-07 B17WMDE_thankyou_authors_mob_A 0
2018-01-07 B17WMDE_thankyou_authors_mob_B 0
2018-01-07 B17WMDE_thankyou_authors_pad_A 0
2018-01-07 B17WMDE_thankyou_authors_pad_B 0
2018-01-08 B17WMDE_thankyou_authors 920162
2018-01-08 B17WMDE_thankyou_authors_B 460469
2018-01-08 B17WMDE_thankyou_authors_mob_A 0
2018-01-08 B17WMDE_thankyou_authors_mob_B 0
2018-01-08 B17WMDE_thankyou_authors_pad_A 0
2018-01-08 B17WMDE_thankyou_authors_pad_B 0
2018-01-09 B17WMDE_thankyou_authors 988107
2018-01-09 B17WMDE_thankyou_authors_B 495139
2018-01-09 B17WMDE_thankyou_authors_mob_A 0
2018-01-09 B17WMDE_thankyou_authors_mob_B 0
2018-01-09 B17WMDE_thankyou_authors_pad_A 0
2018-01-09 B17WMDE_thankyou_authors_pad_B 0
2018-01-10 B17WMDE_thankyou_authors 990820
2018-01-10 B17WMDE_thankyou_authors_B 495235
2018-01-10 B17WMDE_thankyou_authors_mob_A 0
2018-01-10 B17WMDE_thankyou_authors_mob_B 0
2018-01-10 B17WMDE_thankyou_authors_pad_A 0
2018-01-10 B17WMDE_thankyou_authors_pad_B 0
2018-01-11 B17WMDE_thankyou_authors 986539
2018-01-11 B17WMDE_thankyou_authors_B 493215
2018-01-11 B17WMDE_thankyou_authors_mob_A 0
2018-01-11 B17WMDE_thankyou_authors_mob_B 1
2018-01-11 B17WMDE_thankyou_authors_pad_A 0
2018-01-11 B17WMDE_thankyou_authors_pad_B 0
2018-01-12 B17WMDE_thankyou_authors 852340
2018-01-12 B17WMDE_thankyou_authors_B 426957
2018-01-12 B17WMDE_thankyou_authors_mob_A 0
2018-01-12 B17WMDE_thankyou_authors_mob_B 0
2018-01-12 B17WMDE_thankyou_authors_pad_A 0
2018-01-12 B17WMDE_thankyou_authors_pad_B 0
2018-01-13 B17WMDE_thankyou_authors 628561
2018-01-13 B17WMDE_thankyou_authors_B 313426
2018-01-13 B17WMDE_thankyou_authors_mob_A 0
2018-01-13 B17WMDE_thankyou_authors_mob_B 0
2018-01-13 B17WMDE_thankyou_authors_pad_A 0
2018-01-13 B17WMDE_thankyou_authors_pad_B 0
2018-01-14 B17WMDE_thankyou_authors 779314
2018-01-14 B17WMDE_thankyou_authors_B 389017
2018-01-14 B17WMDE_thankyou_authors_mob_A 0
2018-01-14 B17WMDE_thankyou_authors_mob_B 0
2018-01-14 B17WMDE_thankyou_authors_pad_A 0
2018-01-14 B17WMDE_thankyou_authors_pad_B 0

1A. 2 Chart

ggplot(bannerImpressions, aes(x = timeStamp,
                    y = Views,
                    color = Banner,
                    label = Views)) +
  geom_line(aes(group = Banner), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of BannerImpressions') +
  ylab("Impressions") +
  geom_text_repel(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())

1B. Campaign Pageviews (Banner Clicks)

1B. 1 The data set

pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
                      row.names = 1,
                      header = T,
                      stringsAsFactors = F)
pageviews <- pageviews %>% 
  group_by(timestamp) %>% 
  summarise(Pageviews = sum(bannerClicks))
knitr::kable(pageviews, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
timestamp Pageviews
2018-01-01 533
2018-01-02 1188
2018-01-03 1269
2018-01-04 1148
2018-01-05 1122
2018-01-06 1148
2018-01-07 1242
2018-01-08 1150
2018-01-09 1254
2018-01-10 1347
2018-01-11 1773
2018-01-12 1214
2018-01-13 1201
2018-01-14 1492

1B. 2 Chart

ggplot(pageviews, aes(x = timestamp,
                    y = Pageviews,
                    label = Pageviews)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of Landing Pageviews') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())

2. Campaign User Registrations

2. 1 The data set

userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
                      sep = "\t",
                      quote = "",
                      header = T,
                      # row.names = 1,
                      stringsAsFactors = F)
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
userReg$timestamp <- as.POSIXct(userReg$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
userReg$timestamp <- as.character(userReg$timestamp + timeDiff)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1') %>% 
  group_by(timestamp) %>% 
  summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
timestamp Registrations
2018-01-01 1
2018-01-02 8
2018-01-03 15
2018-01-04 13
2018-01-05 5
2018-01-06 6
2018-01-07 12
2018-01-08 6
2018-01-09 9
2018-01-10 7
2018-01-11 15
2018-01-12 9
2018-01-13 3
2018-01-14 11
2018-01-15 1

2. 2 Chart

ggplot(userReg, aes(x = timestamp,
                    y = Registrations,
                    label = Registrations)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of User Registrations') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())

3. Campaign User Edits

3. 1 The data set

userEdits <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(userEdits) >= 1) {
  userEdits <- read.delim('thankyou_2018_userEdits.tsv',
                          sep = "\t",
                          header = T,
                          stringsAsFactors = F)
  # - report
  print(paste0(sum(userEdits$edits), " edits were made by the campaign registered users thus far."))
} else {
  print("There are currently no user edits from this campaign.")
}
[1] "135 edits were made by the campaign registered users thus far."

3. 2 Chart

4. Campaign Guided Tours

4. 1 The data set: Guided Tour Exit Steps

Campaign guided tours: diskutieren and seimutig.

tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
gTours$timestamp <- as.character(gTours$timestamp)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
gTours$timestamp <- as.POSIXct(gTours$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
gTours$timestamp <- as.character(gTours$timestamp + timeDiff)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
# - anonymize event_userId
eventUserId <- setdiff(unique(gTours$event_userId), 0)
an_userId <- character(length(eventUserId))
for (i in 1:length(an_userId)) {
  id <- round(runif(1, 1, 10e6))
  while (id %in% an_userId) {
    id <- round(runif(1, 1, 10e6))
  }
  an_userId[i] <- id
}
an_userId <- paste0("u_", an_userId)
gTours$an_userId <- sapply(gTours$event_userId, function(x) {
  w <- which(eventUserId %in% x)
  if (length(w) > 0) {
    an_userId[w]
  } else {
    x
  }
})
# - look up for the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) > 0) {
  gTours <- gTours[w, ]
  gTours <- gTours %>% 
    filter(event_userId != 0) %>% 
    select(timestamp, event_tour, event_step, an_userId)
  knitr::kable(gTours, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
} else {
  print("There are currently no data on guided tours from this campaign.")
}
timestamp event_tour event_step an_userId
2018-01-03 diskutieren returnToTraining u_5167969
2018-01-03 seimutig returnToTraining u_5167969
2018-01-04 seimutig boldness u_6495795
2018-01-04 seimutig positionCursor u_6495795
2018-01-04 diskutieren returnToTraining u_6495795
2018-01-05 diskutieren saveReply u_6896279
2018-01-04 seimutig returnToTraining u_6495795
2018-01-04 seimutig editButtonCitation u_6495795
2018-01-06 seimutig editButton u_4706786
2018-01-06 seimutig boldness u_4706786
2018-01-06 diskutieren firstMessage u_4706786
2018-01-08 seimutig editBoldness u_2986526
2018-01-08 seimutig editSummary u_2986526
2018-01-08 diskutieren returnToTraining u_2986526
2018-01-10 seimutig editButton u_2672122
2018-01-10 seimutig editButton u_2672122
2018-01-10 seimutig editBoldness u_2672122
2018-01-10 seimutig editSummary u_2672122
2018-01-10 seimutig editButtonCitation u_2672122
2018-01-10 seimutig saveCitation u_2672122
2018-01-10 seimutig editButton u_2672122
2018-01-10 diskutieren returnToTraining u_2672122
2018-01-11 seimutig editBoldness u_771518
2018-01-11 seimutig boldness u_771518
2018-01-11 seimutig insertCitation u_771518
2018-01-11 seimutig anyEdit u_771518
2018-01-11 diskutieren returnToTraining u_771518
2018-01-11 seimutig anyEdit u_4425925

4. 2 Unique users in Guided Tours

Campaign guided tours: diskutieren and seimutig.

gTours %>% 
  select(event_tour) %>% 
  group_by(event_tour) %>% 
  summarise(`Unique users` = n()) %>% 
  knitr::kable(format = "html") %>% 
  kable_styling(full_width = F, position = "left")
event_tour Unique users
diskutieren 7
seimutig 21

5 Training Module

trainData <- read.csv('wmde_training_data.csv')
# - remove first two rows (test data)
trainData <- trainData[-c(1,2), ]
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userData <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
gtData <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
edData <- read.delim('thankyou_2018_userEdits.tsv',
                     sep = "\t",
                     header = T,
                     stringsAsFactors = F)

5.1 Training Module Overview

How many registered users take the Training Module?

print(paste0("Number of uses who took the Training Module is ", 
             length(trainData$username),
             " , which is ",
             round(length(trainData$username)/sum(userReg$Registrations)*100, 2),
             "% of registered users."
               ))
[1] "Number of uses who took the Training Module is 67 , which is 55.37% of registered users."

What are the last completed slides per user Training Module?

slidesData <- trainData %>% 
  select(training_module, last_slide_completed) %>% 
  group_by(training_module, last_slide_completed) %>% 
  summarise(Count = n()) %>% 
  arrange(training_module, desc(Count))
knitr::kable(slidesData, format = "html") %>%
  kable_styling(full_width = F, position = "left")
training_module last_slide_completed Count
artikel-bewerten fertig 6
artikel-bewerten artikel-qualitat-bewerten 1
artikel-bewerten artikel-qualitat-quiz 1
editieren-basiswissen das-wars 7
editieren-basiswissen video-wiki-code 2
editieren-basiswissen visual-editor-buttons 2
editieren-basiswissen beobachtungsliste-video 1
editieren-basiswissen edit-vs-edit-quelle 1
editieren-basiswissen versuche-es 1
wikipedia-basiswissen relevanz-quiz-fortsetzung 35
wikipedia-basiswissen richtlinien-der-wikipedia 2
wikipedia-basiswissen urheberrecht-und-plagiate 2
wikipedia-basiswissen wikipedia-ist-frei 2
wikipedia-basiswissen nachweisbarkeit-quiz 1
wikipedia-basiswissen prinzipien-ruckblick 1
wikipedia-basiswissen relevanz 1
wikipedia-basiswissen sei-respektvoll 1
ggplot(slidesData, aes(x = last_slide_completed,
                       y = Count,
                       color = training_module,
                       label = Count)) +
  geom_line(aes(group = 1), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of Training Modules') +
  ylab("No. Users") +
  geom_text_repel(size = 3) +
  facet_wrap(~training_module) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 6.5, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())

Do users start editing after training modules? Is there a difference in users with completed and not-completed or not at all having taken the modules?

userData <- userData %>% 
  filter(event_campaign %in% 'wmde_etc2017_bt1' & event_isSelfMade) %>% 
  select(event_userId, event_userName)
userData <- left_join(userData, trainData, by = c("event_userName" = "username"))
Column `event_userName`/`username` joining character vector and factor, coercing into character vector
userData$module_completion_date <- NA
userData <- left_join(userData, 
                      gtData[, c('event_userId', 'event_tour', 'event_step')],
                      by = c('event_userId'))
userData <- left_join(userData, edData,
                      by = c('event_userId' = 'rev_user'))

How many edits were made on behalf of those users who have started the Training Module vs. those who did not?

tAn <- userData %>% 
  select(training_module, edits) %>% 
  group_by(training_module) %>% 
  summarise(Users = n(), Edits = sum(edits, na.rm = T)) %>% 
  arrange(training_module, desc(Edits))
knitr::kable(tAn, format = "html") %>%
  kable_styling(full_width = F, position = "left")
training_module Users Edits
artikel-bewerten 7 36
editieren-basiswissen 6 36
wikipedia-basiswissen 20 38
NA 111 187
print(paste0("In total, the users who took the Training Module made ", 
             sum(tAn$Edits[1:3]),
             " edits, which is ",
             round(sum(tAn$Edits[1:3])/sum(tAn$Edits)*100, 2),
             "% of all edits."
               ))
[1] "In total, the users who took the Training Module made 110 edits, which is 37.04% of all edits."
print(paste0("On the other hand, the users who did not take the Training Module made ", 
             tAn$Edits[4],
             " edits, which is ",
             round(tAn$Edits[4]/sum(tAn$Edits)*100, 2),
             "% of all edits."
               ))
[1] "On the other hand, the users who did not take the Training Module made 187 edits, which is 62.96% of all edits."
---
title: 'Thank You Campaign 2018: Report'
author: "Goran S. Milovanovic, Data Analyst, WMDE"
date: "January, 2018"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_float: yes
    toc_depth: 5
  html_document:
    toc: yes
    toc_depth: 5
---


**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

The campaign is run from 1. January 2018 to N January 2018.

```{r setup}
knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
```

```{r, echo = F, warning = F, message = F, results = 'hide'}
# !diagnostics off
### --- Setup
knitr::opts_chunk$set(fig.width = 15, fig.height = 8) 
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
library(ggrepel)
library(scales)
library(RColorBrewer)
library(kableExtra)
library(rmarkdown)
library(knitr)
library(DT)
library(reshape2)
```

## 0. Data Acquisiton

**NOTE:** the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script `ThankYou_2018_Production_SQL.R` on stat1005.eqiad.wmnet, collecting the data as `.tsv` and `.csv` files, copying manually, and processing locally. Run from stat1005 stat box by executing `Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R`.

```{r, echo = T, eval = F}

### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets

### --- Campaign Details: 
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018. 
# - The report must include any activity from the beginning of the campaign. 
# - The estimated start will be 1st January 2018.

# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig

### --- Training Modules Schema: 
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking

### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate' 

### --- Campaign time range
startDate <- '2018-01-02'
endDate <- '2018-01-08'

### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------

# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1

### --- loop over date range, create query, fetch, and store
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
                        to = as.POSIXlt(endDate, tz = "CET"),
                        by = 'hour')
dateRange <- dateRange[-length(dateRange)]
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
  strsplit(x, split = " ", fixed = T)[[1]][1]
})
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
  dateRange <- dateRange[-w]
}
dR <- list()
for (i in 1:length(dateRange)) {
  dR[[i]] <- data.frame(
    cetName = names(dateRange[i]),
    utcYear = year(dateRange[i]),
    utcMonth = month(dateRange[i]),
    utcDay = mday(dateRange[i]),
    utcHour = hour(dateRange[i])
  )
}
dR <- rbindlist(dR)
dR <- dR %>%
  group_by(cetName, utcYear, utcMonth, utcDay) %>%
  summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))

### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------

# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1

# - set bannerClicksDir
setwd(bannerClicksDir)

for (i in 1:length(unique(dR$cetName))) {
  
  wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
  
  for (j in 1:length(wCetName)) {
    
    # - construct HiveQL query:
    y <- dR$utcYear[wCetName[j]]
    m <- dR$utcMonth[wCetName[j]]
    d <- dR$utcDay[wCetName[j]]
    hour <- dR$utcHour[wCetName[j]]
    q <- paste(
      "USE wmf;
      SELECT uri_path, uri_query, referer FROM webrequest
      WHERE uri_host = 'de.wikipedia.org'
      AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia' 
      AND year = ", y,
      " AND month = ", m,
      " AND day = ", d,
      " AND (", hour, ");",
      sep = "")
    # - write hql
    write(q, 'thankyou2018_BannerClicks.hql')
    # - prepare output file:
    fileName <- "thankyou2018_BannerClicks_"
    fileName <- paste0(fileName,
                       as.character(unique(dR$cetName)[i]),
                       "_", j,
                       ".tsv")
    fileName <- paste0(bannerClicksDir, "/", fileName)
    # - execute hql script:
    hiveArgs <-
      'beeline -f'
    hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
                        fileName)
    # - command:
    hiveCommand <- paste(hiveArgs, hiveInput)
    system(command = hiveCommand, wait = TRUE)
    
  }
  
}

### --- Wrangle this dataset:

### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'

### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
lF <- lF[grepl('Clicks', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
  dS <- readLines(lF[i], n = -1)
  timeStamp <- strsplit(lF[i], split = "_")[[1]][3]
  bannerClicks <- sum(grepl(campaignBanner, dS, fixed = T))
  dataSet[[i]] <- data.frame(timestamp = timeStamp,
                             bannerClicks = bannerClicks,
                             stringsAsFactors = F)
}
dataSet <- rbindlist(dataSet)

### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")

### --- SQL
startDate <- '2018-01-01'

### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", gsub("-", "", startDate, fixed = T), "220000));\" > ",
            dailyUpdateDir, "/thankyou_2018_userRegistrations.tsv", sep = "")
system(command = qCommand, wait = TRUE)


### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", 
                  gsub("-", "", startDate, fixed = T), "220000));\" > ", 
                  dailyUpdateDir, "/thankyou_2018_guidedTours.tsv", sep = "")
system(command = qCommand, wait = TRUE)

### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------

# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
userReg <- userReg %>% 
  dplyr::select(event_userId, event_isSelfMade, event_campaign) %>% 
  filter(event_isSelfMade == 1) %>% 
  filter(event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                  paste(uid, collapse = ", "),
                  ') GROUP BY rev_user;',
                  sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                      paste('"', sqlQuery, '" > ', sep = ""), 
                      dailyUpdateDir, '/thankyou_2018_userEdits.tsv', sep = "")
system(command = mySqlCommand, 
       wait = TRUE)

### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------

```

## 1A. Campaign Banner Impressions

### 1A. 1 The data set
```{r echo = T, warning = 'hide', message = F}
# - report: current update
print(paste0("Current update: ", as.character(Sys.time())))
bannerImpressions <- read.csv('thankyouBannerImpressions.csv',
                              row.names = 1,
                              header = T,
                              stringsAsFactors = F) %>% 
  gather(key =  Banner,
         value = Views,
         B17WMDE_thankyou_authors:B17WMDE_thankyou_authors_mob_B) %>% 
  group_by(timeStamp, Banner) %>% 
  summarise(Views = sum(Views))
knitr::kable(bannerImpressions, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```
### 1A. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(bannerImpressions, aes(x = timeStamp,
                    y = Views,
                    color = Banner,
                    label = Views)) +
  geom_line(aes(group = Banner), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of BannerImpressions') +
  ylab("Impressions") +
  geom_text_repel(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```

## 1B. Campaign Pageviews (Banner Clicks)

### 1B. 1 The data set

```{r echo = T, warning = 'hide', message = F}
pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
                      row.names = 1,
                      header = T,
                      stringsAsFactors = F)
pageviews <- pageviews %>% 
  group_by(timestamp) %>% 
  summarise(Pageviews = sum(bannerClicks))
knitr::kable(pageviews, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

### 1B. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(pageviews, aes(x = timestamp,
                    y = Pageviews,
                    label = Pageviews)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of Landing Pageviews') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```


## 2. Campaign User Registrations

### 2. 1 The data set

```{r echo = T, warning = 'hide', message = F}
userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
                      sep = "\t",
                      quote = "",
                      header = T,
                      # row.names = 1,
                      stringsAsFactors = F)
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
userReg$timestamp <- as.POSIXct(userReg$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
userReg$timestamp <- as.character(userReg$timestamp + timeDiff)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1') %>% 
  group_by(timestamp) %>% 
  summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

### 2. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(userReg, aes(x = timestamp,
                    y = Registrations,
                    label = Registrations)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of User Registrations') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```

## 3. Campaign User Edits

### 3. 1 The data set

```{r echo = T, warning = 'hide', message = F}
userEdits <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(userEdits) >= 1) {
  userEdits <- read.delim('thankyou_2018_userEdits.tsv',
                          sep = "\t",
                          header = T,
                          stringsAsFactors = F)
  # - report
  print(paste0(sum(userEdits$edits), " edits were made by the campaign registered users thus far."))
} else {
  print("There are currently no user edits from this campaign.")
}
```

### 3. 2 Chart

```{r echo = T, warning = 'hide', message = F}
```

## 4. Campaign Guided Tours

### 4. 1 The data set: Guided Tour Exit Steps

Campaign guided tours: `diskutieren` and `seimutig`.

```{r echo = T, warning = 'hide', message = F}
tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
gTours$timestamp <- as.character(gTours$timestamp)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
gTours$timestamp <- as.POSIXct(gTours$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
gTours$timestamp <- as.character(gTours$timestamp + timeDiff)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
# - anonymize event_userId
eventUserId <- setdiff(unique(gTours$event_userId), 0)
an_userId <- character(length(eventUserId))
for (i in 1:length(an_userId)) {
  id <- round(runif(1, 1, 10e6))
  while (id %in% an_userId) {
    id <- round(runif(1, 1, 10e6))
  }
  an_userId[i] <- id
}
an_userId <- paste0("u_", an_userId)
gTours$an_userId <- sapply(gTours$event_userId, function(x) {
  w <- which(eventUserId %in% x)
  if (length(w) > 0) {
    an_userId[w]
  } else {
    x
  }
})
# - look up for the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) > 0) {
  gTours <- gTours[w, ]
  gTours <- gTours %>% 
    filter(event_userId != 0) %>% 
    select(timestamp, event_tour, event_step, an_userId)
  knitr::kable(gTours, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
} else {
  print("There are currently no data on guided tours from this campaign.")
}
```
### 4. 2 Unique users in Guided Tours

Campaign guided tours: `diskutieren` and `seimutig`.

```{r echo = T, warning = 'hide', message = F}
gTours %>% 
  select(event_tour) %>% 
  group_by(event_tour) %>% 
  summarise(`Unique users` = n()) %>% 
  knitr::kable(format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

## 5 Training Module

```{r echo = T, warning = 'hide', message = F}
trainData <- read.csv('wmde_training_data.csv')
# - remove first two rows (test data)
trainData <- trainData[-c(1,2), ]
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userData <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
gtData <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
edData <- read.delim('thankyou_2018_userEdits.tsv',
                     sep = "\t",
                     header = T,
                     stringsAsFactors = F)
```


### 5.1 Training Module Overview

How many registered users take the Training Module?

```{r echo = T, warning = 'hide', message = F}
print(paste0("Number of uses who took the Training Module is ", 
             length(trainData$username),
             " , which is ",
             round(length(trainData$username)/sum(userReg$Registrations)*100, 2),
             "% of registered users."
               ))
```

What are the last completed slides per user Training Module?

```{r echo = T, warning = 'hide', message = F}
slidesData <- trainData %>% 
  select(training_module, last_slide_completed) %>% 
  group_by(training_module, last_slide_completed) %>% 
  summarise(Count = n()) %>% 
  arrange(training_module, desc(Count))
knitr::kable(slidesData, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```

```{r echo = T, warning = 'hide', message = F}
ggplot(slidesData, aes(x = last_slide_completed,
                       y = Count,
                       color = training_module,
                       label = Count)) +
  geom_line(aes(group = 1), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of Training Modules') +
  ylab("No. Users") +
  geom_text_repel(size = 3) +
  facet_wrap(~training_module) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 6.5, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```

Do users start editing after training modules? Is there a difference in users with completed and not-completed or not at all having taken the modules?

```{r echo = T, warning = 'hide', message = F}
userData <- userData %>% 
  filter(event_campaign %in% 'wmde_etc2017_bt1' & event_isSelfMade) %>% 
  select(event_userId, event_userName)
userData <- left_join(userData, trainData, by = c("event_userName" = "username"))
userData$module_completion_date <- NA
userData <- left_join(userData, 
                      gtData[, c('event_userId', 'event_tour', 'event_step')],
                      by = c('event_userId'))
userData <- left_join(userData, edData,
                      by = c('event_userId' = 'rev_user'))
```

How many edits were made on behalf of those users who have started the Training Module vs. those who did not?

```{r echo = T, warning = 'hide', message = F}
tAn <- userData %>% 
  select(training_module, edits) %>% 
  group_by(training_module) %>% 
  summarise(Users = n(), Edits = sum(edits, na.rm = T)) %>% 
  arrange(training_module, desc(Edits))
knitr::kable(tAn, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```

```{r echo = T, warning = 'hide', message = F}
print(paste0("In total, the users who took the Training Module made ", 
             sum(tAn$Edits[1:3]),
             " edits, which is ",
             round(sum(tAn$Edits[1:3])/sum(tAn$Edits)*100, 2),
             "% of all edits."
               ))
print(paste0("On the other hand, the users who did not take the Training Module made ", 
             tAn$Edits[4],
             " edits, which is ",
             round(tAn$Edits[4]/sum(tAn$Edits)*100, 2),
             "% of all edits."
               ))
```



