Feedback should be send to goran.milovanovic_ext@wikimedia.de.

The campaign is run from 1. January 2018 to N January 2018.

knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')

0. Data Acquisiton

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script ThankYou_2018_Production_SQL.R on stat1005.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R.

### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets

### --- Campaign Details: 
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018. 
# - The report must include any activity from the beginning of the campaign. 
# - The estimated start will be 1st January 2018.

# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig

### --- Training Modules Schema: 
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking

### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate' 

### --- Campaign time range
startDate <- '2018-01-02'
endDate <- '2018-01-08'

### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------

# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1

### --- loop over date range, create query, fetch, and store
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
                        to = as.POSIXlt(endDate, tz = "CET"),
                        by = 'hour')
dateRange <- dateRange[-length(dateRange)]
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
  strsplit(x, split = " ", fixed = T)[[1]][1]
})
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
  dateRange <- dateRange[-w]
}
dR <- list()
for (i in 1:length(dateRange)) {
  dR[[i]] <- data.frame(
    cetName = names(dateRange[i]),
    utcYear = year(dateRange[i]),
    utcMonth = month(dateRange[i]),
    utcDay = mday(dateRange[i]),
    utcHour = hour(dateRange[i])
  )
}
dR <- rbindlist(dR)
dR <- dR %>%
  group_by(cetName, utcYear, utcMonth, utcDay) %>%
  summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))

### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------

# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1

# - set bannerClicksDir
setwd(bannerClicksDir)

for (i in 1:length(unique(dR$cetName))) {
  
  wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
  
  for (j in 1:length(wCetName)) {
    
    # - construct HiveQL query:
    y <- dR$utcYear[wCetName[j]]
    m <- dR$utcMonth[wCetName[j]]
    d <- dR$utcDay[wCetName[j]]
    hour <- dR$utcHour[wCetName[j]]
    q <- paste(
      "USE wmf;
      SELECT uri_path, uri_query, referer FROM webrequest
      WHERE uri_host = 'de.wikipedia.org'
      AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia' 
      AND year = ", y,
      " AND month = ", m,
      " AND day = ", d,
      " AND (", hour, ");",
      sep = "")
    # - write hql
    write(q, 'thankyou2018_BannerClicks.hql')
    # - prepare output file:
    fileName <- "thankyou2018_BannerClicks_"
    fileName <- paste0(fileName,
                       as.character(unique(dR$cetName)[i]),
                       "_", j,
                       ".tsv")
    fileName <- paste0(bannerClicksDir, "/", fileName)
    # - execute hql script:
    hiveArgs <-
      'beeline -f'
    hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
                        fileName)
    # - command:
    hiveCommand <- paste(hiveArgs, hiveInput)
    system(command = hiveCommand, wait = TRUE)
    
  }
  
}

### --- Wrangle this dataset:

### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'

### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
lF <- lF[grepl('Clicks', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
  dS <- readLines(lF[i], n = -1)
  timeStamp <- strsplit(lF[i], split = "_")[[1]][3]
  bannerClicks <- sum(grepl(campaignBanner, dS, fixed = T))
  dataSet[[i]] <- data.frame(timestamp = timeStamp,
                             bannerClicks = bannerClicks,
                             stringsAsFactors = F)
}
dataSet <- rbindlist(dataSet)

### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")

### --- SQL
startDate <- '2018-01-01'

### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", gsub("-", "", startDate, fixed = T), "220000));\" > ",
            dailyUpdateDir, "/thankyou_2018_userRegistrations.tsv", sep = "")
system(command = qCommand, wait = TRUE)


### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", 
                  gsub("-", "", startDate, fixed = T), "220000));\" > ", 
                  dailyUpdateDir, "/thankyou_2018_guidedTours.tsv", sep = "")
system(command = qCommand, wait = TRUE)

### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------

# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
userReg <- userReg %>% 
  dplyr::select(event_userId, event_isSelfMade, event_campaign) %>% 
  filter(event_isSelfMade == 1) %>% 
  filter(event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                  paste(uid, collapse = ", "),
                  ') GROUP BY rev_user;',
                  sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                      paste('"', sqlQuery, '" > ', sep = ""), 
                      dailyUpdateDir, '/thankyou_2018_userEdits.tsv', sep = "")
system(command = mySqlCommand, 
       wait = TRUE)

### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------

1A. Campaign Banner Impressions

1A. 1 The data set

# - report: current update
print(paste0("Current update: ", as.character(Sys.time())))
[1] "Current update: 2018-01-17 15:02:42"
bannerImpressions <- read.csv('thankyouBannerImpressions.csv',
                              row.names = 1,
                              header = T,
                              stringsAsFactors = F) %>% 
  gather(key =  Banner,
         value = Views,
         B17WMDE_thankyou_authors:B17WMDE_thankyou_authors_mob_B) %>% 
  group_by(timeStamp, Banner) %>% 
  summarise(Views = sum(Views))
knitr::kable(bannerImpressions, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
timeStamp Banner Views
2018-01-01 B17WMDE_thankyou_authors 257533
2018-01-01 B17WMDE_thankyou_authors_B 128423
2018-01-01 B17WMDE_thankyou_authors_mob_A 0
2018-01-01 B17WMDE_thankyou_authors_mob_B 0
2018-01-01 B17WMDE_thankyou_authors_pad_A 0
2018-01-01 B17WMDE_thankyou_authors_pad_B 0
2018-01-02 B17WMDE_thankyou_authors 741533
2018-01-02 B17WMDE_thankyou_authors_B 370737
2018-01-02 B17WMDE_thankyou_authors_mob_A 0
2018-01-02 B17WMDE_thankyou_authors_mob_B 3
2018-01-02 B17WMDE_thankyou_authors_pad_A 0
2018-01-02 B17WMDE_thankyou_authors_pad_B 0
2018-01-03 B17WMDE_thankyou_authors 824290
2018-01-03 B17WMDE_thankyou_authors_B 410722
2018-01-03 B17WMDE_thankyou_authors_mob_A 0
2018-01-03 B17WMDE_thankyou_authors_mob_B 0
2018-01-03 B17WMDE_thankyou_authors_pad_A 0
2018-01-03 B17WMDE_thankyou_authors_pad_B 0
2018-01-04 B17WMDE_thankyou_authors 818331
2018-01-04 B17WMDE_thankyou_authors_B 409420
2018-01-04 B17WMDE_thankyou_authors_mob_A 0
2018-01-04 B17WMDE_thankyou_authors_mob_B 0
2018-01-04 B17WMDE_thankyou_authors_pad_A 0
2018-01-04 B17WMDE_thankyou_authors_pad_B 0
2018-01-05 B17WMDE_thankyou_authors 750962
2018-01-05 B17WMDE_thankyou_authors_B 375077
2018-01-05 B17WMDE_thankyou_authors_mob_A 0
2018-01-05 B17WMDE_thankyou_authors_mob_B 2
2018-01-05 B17WMDE_thankyou_authors_pad_A 0
2018-01-05 B17WMDE_thankyou_authors_pad_B 0
2018-01-06 B17WMDE_thankyou_authors 586811
2018-01-06 B17WMDE_thankyou_authors_B 293237
2018-01-06 B17WMDE_thankyou_authors_mob_A 0
2018-01-06 B17WMDE_thankyou_authors_mob_B 0
2018-01-06 B17WMDE_thankyou_authors_pad_A 0
2018-01-06 B17WMDE_thankyou_authors_pad_B 0
2018-01-07 B17WMDE_thankyou_authors 706186
2018-01-07 B17WMDE_thankyou_authors_B 352536
2018-01-07 B17WMDE_thankyou_authors_mob_A 0
2018-01-07 B17WMDE_thankyou_authors_mob_B 0
2018-01-07 B17WMDE_thankyou_authors_pad_A 0
2018-01-07 B17WMDE_thankyou_authors_pad_B 0
2018-01-08 B17WMDE_thankyou_authors 920162
2018-01-08 B17WMDE_thankyou_authors_B 460469
2018-01-08 B17WMDE_thankyou_authors_mob_A 0
2018-01-08 B17WMDE_thankyou_authors_mob_B 0
2018-01-08 B17WMDE_thankyou_authors_pad_A 0
2018-01-08 B17WMDE_thankyou_authors_pad_B 0
2018-01-09 B17WMDE_thankyou_authors 988107
2018-01-09 B17WMDE_thankyou_authors_B 495139
2018-01-09 B17WMDE_thankyou_authors_mob_A 0
2018-01-09 B17WMDE_thankyou_authors_mob_B 0
2018-01-09 B17WMDE_thankyou_authors_pad_A 0
2018-01-09 B17WMDE_thankyou_authors_pad_B 0
2018-01-10 B17WMDE_thankyou_authors 990820
2018-01-10 B17WMDE_thankyou_authors_B 495235
2018-01-10 B17WMDE_thankyou_authors_mob_A 0
2018-01-10 B17WMDE_thankyou_authors_mob_B 0
2018-01-10 B17WMDE_thankyou_authors_pad_A 0
2018-01-10 B17WMDE_thankyou_authors_pad_B 0
2018-01-11 B17WMDE_thankyou_authors 986539
2018-01-11 B17WMDE_thankyou_authors_B 493215
2018-01-11 B17WMDE_thankyou_authors_mob_A 0
2018-01-11 B17WMDE_thankyou_authors_mob_B 1
2018-01-11 B17WMDE_thankyou_authors_pad_A 0
2018-01-11 B17WMDE_thankyou_authors_pad_B 0
2018-01-12 B17WMDE_thankyou_authors 852340
2018-01-12 B17WMDE_thankyou_authors_B 426957
2018-01-12 B17WMDE_thankyou_authors_mob_A 0
2018-01-12 B17WMDE_thankyou_authors_mob_B 0
2018-01-12 B17WMDE_thankyou_authors_pad_A 0
2018-01-12 B17WMDE_thankyou_authors_pad_B 0
2018-01-13 B17WMDE_thankyou_authors 628561
2018-01-13 B17WMDE_thankyou_authors_B 313426
2018-01-13 B17WMDE_thankyou_authors_mob_A 0
2018-01-13 B17WMDE_thankyou_authors_mob_B 0
2018-01-13 B17WMDE_thankyou_authors_pad_A 0
2018-01-13 B17WMDE_thankyou_authors_pad_B 0
2018-01-14 B17WMDE_thankyou_authors 779314
2018-01-14 B17WMDE_thankyou_authors_B 389017
2018-01-14 B17WMDE_thankyou_authors_mob_A 0
2018-01-14 B17WMDE_thankyou_authors_mob_B 0
2018-01-14 B17WMDE_thankyou_authors_pad_A 0
2018-01-14 B17WMDE_thankyou_authors_pad_B 0

1A. 2 Chart

ggplot(bannerImpressions, aes(x = timeStamp,
                    y = Views,
                    color = Banner,
                    label = Views)) +
  geom_line(aes(group = Banner), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of BannerImpressions') +
  ylab("Impressions") +
  geom_text_repel(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())

1B. Campaign Pageviews (Banner Clicks)

1B. 1 The data set

pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
                      row.names = 1,
                      header = T,
                      stringsAsFactors = F)
pageviews <- pageviews %>% 
  group_by(timestamp) %>% 
  summarise(Pageviews = sum(bannerClicks))
knitr::kable(pageviews, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
timestamp Pageviews
2018-01-01 533
2018-01-02 1188
2018-01-03 1269
2018-01-04 1148
2018-01-05 1122
2018-01-06 1148
2018-01-07 1242
2018-01-08 1150
2018-01-09 1254
2018-01-10 1347
2018-01-11 1773
2018-01-12 1214
2018-01-13 1201
2018-01-14 1492

1B. 2 Chart

ggplot(pageviews, aes(x = timestamp,
                    y = Pageviews,
                    label = Pageviews)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of Landing Pageviews') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())

2. Campaign User Registrations

2. 1 The data set

userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
                      sep = "\t",
                      quote = "",
                      header = T,
                      # row.names = 1,
                      stringsAsFactors = F)
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
userReg$timestamp <- as.POSIXct(userReg$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
userReg$timestamp <- as.character(userReg$timestamp + timeDiff)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1') %>% 
  group_by(timestamp) %>% 
  summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
timestamp Registrations
2018-01-01 1
2018-01-02 8
2018-01-03 15
2018-01-04 13
2018-01-05 5
2018-01-06 6
2018-01-07 12
2018-01-08 6
2018-01-09 9
2018-01-10 7
2018-01-11 15
2018-01-12 9
2018-01-13 3
2018-01-14 11
2018-01-15 1

2. 2 Chart

ggplot(userReg, aes(x = timestamp,
                    y = Registrations,
                    label = Registrations)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of User Registrations') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())

3. Campaign User Edits

3. 1 The data set

userEdits <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(userEdits) >= 1) {
  userEdits <- read.delim('thankyou_2018_userEdits.tsv',
                          sep = "\t",
                          header = T,
                          stringsAsFactors = F)
  # - report
  print(paste0(sum(userEdits$edits), " edits were made by the campaign registered users thus far."))
} else {
  print("There are currently no user edits from this campaign.")
}
[1] "135 edits were made by the campaign registered users thus far."

3. 2 Chart

4. Campaign Guided Tours

4. 1 The data set: Guided Tour Exit Steps

Campaign guided tours: diskutieren and seimutig.

tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
gTours$timestamp <- as.character(gTours$timestamp)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
gTours$timestamp <- as.POSIXct(gTours$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
gTours$timestamp <- as.character(gTours$timestamp + timeDiff)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
# - anonymize event_userId
eventUserId <- setdiff(unique(gTours$event_userId), 0)
an_userId <- character(length(eventUserId))
for (i in 1:length(an_userId)) {
  id <- round(runif(1, 1, 10e6))
  while (id %in% an_userId) {
    id <- round(runif(1, 1, 10e6))
  }
  an_userId[i] <- id
}
an_userId <- paste0("u_", an_userId)
gTours$an_userId <- sapply(gTours$event_userId, function(x) {
  w <- which(eventUserId %in% x)
  if (length(w) > 0) {
    an_userId[w]
  } else {
    x
  }
})
# - look up for the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) > 0) {
  gTours <- gTours[w, ]
  gTours <- gTours %>% 
    filter(event_userId != 0) %>% 
    select(timestamp, event_tour, event_step, an_userId)
  knitr::kable(gTours, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
} else {
  print("There are currently no data on guided tours from this campaign.")
}
timestamp event_tour event_step an_userId
2018-01-03 diskutieren returnToTraining u_5167969
2018-01-03 seimutig returnToTraining u_5167969
2018-01-04 seimutig boldness u_6495795
2018-01-04 seimutig positionCursor u_6495795
2018-01-04 diskutieren returnToTraining u_6495795
2018-01-05 diskutieren saveReply u_6896279
2018-01-04 seimutig returnToTraining u_6495795
2018-01-04 seimutig editButtonCitation u_6495795
2018-01-06 seimutig editButton u_4706786
2018-01-06 seimutig boldness u_4706786
2018-01-06 diskutieren firstMessage u_4706786
2018-01-08 seimutig editBoldness u_2986526
2018-01-08 seimutig editSummary u_2986526
2018-01-08 diskutieren returnToTraining u_2986526
2018-01-10 seimutig editButton u_2672122
2018-01-10 seimutig editButton u_2672122
2018-01-10 seimutig editBoldness u_2672122
2018-01-10 seimutig editSummary u_2672122
2018-01-10 seimutig editButtonCitation u_2672122
2018-01-10 seimutig saveCitation u_2672122
2018-01-10 seimutig editButton u_2672122
2018-01-10 diskutieren returnToTraining u_2672122
2018-01-11 seimutig editBoldness u_771518
2018-01-11 seimutig boldness u_771518
2018-01-11 seimutig insertCitation u_771518
2018-01-11 seimutig anyEdit u_771518
2018-01-11 diskutieren returnToTraining u_771518
2018-01-11 seimutig anyEdit u_4425925

4. 2 Unique users in Guided Tours

Campaign guided tours: diskutieren and seimutig.

gTours %>% 
  select(event_tour) %>% 
  group_by(event_tour) %>% 
  summarise(`Unique users` = n()) %>% 
  knitr::kable(format = "html") %>% 
  kable_styling(full_width = F, position = "left")
event_tour Unique users
diskutieren 7
seimutig 21

5 Training Module

trainData <- read.csv('wmde_training_data.csv')
# - remove first two rows (test data)
trainData <- trainData[-c(1,2), ]
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userData <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
gtData <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
edData <- read.delim('thankyou_2018_userEdits.tsv',
                     sep = "\t",
                     header = T,
                     stringsAsFactors = F)

5.1 Training Module Overview

How many registered users take the Training Module?

print(paste0("Number of uses who took the Training Module is ", 
             length(trainData$username),
             " , which is ",
             round(length(trainData$username)/sum(userReg$Registrations)*100, 2),
             "% of registered users."
               ))
[1] "Number of uses who took the Training Module is 67 , which is 55.37% of registered users."

What are the last completed slides per user Training Module?

slidesData <- trainData %>% 
  select(training_module, last_slide_completed) %>% 
  group_by(training_module, last_slide_completed) %>% 
  summarise(Count = n()) %>% 
  arrange(training_module, desc(Count))
knitr::kable(slidesData, format = "html") %>%
  kable_styling(full_width = F, position = "left")
training_module last_slide_completed Count
artikel-bewerten fertig 6
artikel-bewerten artikel-qualitat-bewerten 1
artikel-bewerten artikel-qualitat-quiz 1
editieren-basiswissen das-wars 7
editieren-basiswissen video-wiki-code 2
editieren-basiswissen visual-editor-buttons 2
editieren-basiswissen beobachtungsliste-video 1
editieren-basiswissen edit-vs-edit-quelle 1
editieren-basiswissen versuche-es 1
wikipedia-basiswissen relevanz-quiz-fortsetzung 35
wikipedia-basiswissen richtlinien-der-wikipedia 2
wikipedia-basiswissen urheberrecht-und-plagiate 2
wikipedia-basiswissen wikipedia-ist-frei 2
wikipedia-basiswissen nachweisbarkeit-quiz 1
wikipedia-basiswissen prinzipien-ruckblick 1
wikipedia-basiswissen relevanz 1
wikipedia-basiswissen sei-respektvoll 1
ggplot(slidesData, aes(x = last_slide_completed,
                       y = Count,
                       color = training_module,
                       label = Count)) +
  geom_line(aes(group = 1), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of Training Modules') +
  ylab("No. Users") +
  geom_text_repel(size = 3) +
  facet_wrap(~training_module) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 6.5, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())

Do users start editing after training modules? Is there a difference in users with completed and not-completed or not at all having taken the modules?

userData <- userData %>% 
  filter(event_campaign %in% 'wmde_etc2017_bt1' & event_isSelfMade) %>% 
  select(event_userId, event_userName)
userData <- left_join(userData, trainData, by = c("event_userName" = "username"))
Column `event_userName`/`username` joining character vector and factor, coercing into character vector
userData$module_completion_date <- NA
userData <- left_join(userData, 
                      gtData[, c('event_userId', 'event_tour', 'event_step')],
                      by = c('event_userId'))
userData <- left_join(userData, edData,
                      by = c('event_userId' = 'rev_user'))

How many edits were made on behalf of those users who have started the Training Module vs. those who did not?

tAn <- userData %>% 
  select(training_module, edits) %>% 
  group_by(training_module) %>% 
  summarise(Users = n(), Edits = sum(edits, na.rm = T)) %>% 
  arrange(training_module, desc(Edits))
knitr::kable(tAn, format = "html") %>%
  kable_styling(full_width = F, position = "left")
training_module Users Edits
artikel-bewerten 7 36
editieren-basiswissen 6 36
wikipedia-basiswissen 20 38
NA 111 187
print(paste0("In total, the users who took the Training Module made ", 
             sum(tAn$Edits[1:3]),
             " edits, which is ",
             round(sum(tAn$Edits[1:3])/sum(tAn$Edits)*100, 2),
             "% of all edits."
               ))
[1] "In total, the users who took the Training Module made 110 edits, which is 37.04% of all edits."
print(paste0("On the other hand, the users who did not take the Training Module made ", 
             tAn$Edits[4],
             " edits, which is ",
             round(tAn$Edits[4]/sum(tAn$Edits)*100, 2),
             "% of all edits."
               ))
[1] "On the other hand, the users who did not take the Training Module made 187 edits, which is 62.96% of all edits."

Completion of the Training Module vs. number of edits

# - final slide for: wikipedia-basiswissen = relevanz-quiz-fortsetzung
# - final slide for: editieren-basiswissen = das-wars
# - final slide for: artikel-bewerten = fertig
finalSlide <- c('relevanz-quiz-fortsetzung', 'das', 'fertig')
names(finalSlide) <- c('wikipedia-basiswissen', 'editieren-basiswissen', 'relevanz-quiz-fortsetzung')
userData$completed_Training <- ifelse(userData$last_slide_completed %in% finalSlide, T, F)
completedTrainingEdits <- userData %>% 
  select(completed_Training, edits) %>% 
  group_by(completed_Training) %>% 
  summarise(Users = n(), Edits = sum(edits, na.rm = T))
knitr::kable(completedTrainingEdits, format = "html") %>%
  kable_styling(full_width = F, position = "left")
completed_Training Users Edits
FALSE 120 223
TRUE 24 74

We will express this result as the number of edits per use ratio for two groups: those who did complete the Training Module and those who did not. In other words, we divide the Edits column with the Users column, separately for TRUE and FALSE in the completed_Training column. This approach provides the following insight: in the group of users that did not complete the Training Module we have obtained 223/120 = 1.86 edits per user, while in the group of those who have completed the Training Module we find 74/24 = 3.08 edits per user. The ratio of these two ratios (i.e. 3.08 divided by 1.86) is 1.66 in favor of the group who has completed the Training Module, and we conclude that they have provided about 66% more edits per user compared to the group that has not completed the module. Let’s now take a look at the number of users who (a) made any edits at all, or(b) have reached their 10th edit, in these two groups:

userData$Edits <- (userData$edits > 0)
userData$Edits10 <- (userData$edits >= 10)
completedTrainingEdits <- userData %>% 
  select(completed_Training, Edits, Edits10) %>%
  group_by(completed_Training) %>% 
  summarise(Users = n(), Edits = sum(Edits, na.rm = T), Edits10 = sum(Edits10, na.rm = T))
knitr::kable(completedTrainingEdits, format = "html") %>%
  kable_styling(full_width = F, position = "left")
completed_Training Users Edits Edits10
FALSE 120 36 6
TRUE 24 13 0

We can see that 13 of users who have completed the Training Module have made at least one edit, 13/24 = .54, or about 54%. None of them, however, have reached their 10th edit. On the other hand, 36/120 = .3 or 30% of those who did not complete the Training Module have made at least one edit, while 6 of them have reached their 10th edit. This probably tells us that an intrinsic (aka “internal”) locus of motivation to contribute still works better than the (external) locus of motivation that we can help develop through our training modules - which does not present an unexpected result.

Detailed data for the editieren-basiswissen Training Module:

editModData <- userData %>% 
  select(training_module, last_slide_completed, edits) %>% 
  filter(training_module %in% 'editieren-basiswissen') %>% 
  group_by(last_slide_completed) %>% 
  summarise(edits = sum(edits, na.rm = T)) %>% 
  arrange(desc(edits))
knitr::kable(editModData, format = "html") %>%
  kable_styling(full_width = F, position = "left")
last_slide_completed edits
das-wars 36

There are six users only who took the editieren-basiswissen Training Module, and because we know that das-wars is the last slide in the editieren-basiswissen Training Module, we also know that only those who have completed this Training Module have made any edits!

---
title: 'Thank You Campaign 2018: Report'
author: "Goran S. Milovanovic, Data Analyst, WMDE"
date: "January, 2018"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_float: yes
    toc_depth: 5
  html_document:
    toc: yes
    toc_depth: 5
---


**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

The campaign is run from 1. January 2018 to N January 2018.

```{r setup}
knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
```

```{r, echo = F, warning = F, message = F, results = 'hide'}
# !diagnostics off
### --- Setup
knitr::opts_chunk$set(fig.width = 15, fig.height = 8) 
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
library(ggrepel)
library(scales)
library(RColorBrewer)
library(kableExtra)
library(rmarkdown)
library(knitr)
library(DT)
library(reshape2)
```

## 0. Data Acquisiton

**NOTE:** the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script `ThankYou_2018_Production_SQL.R` on stat1005.eqiad.wmnet, collecting the data as `.tsv` and `.csv` files, copying manually, and processing locally. Run from stat1005 stat box by executing `Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R`.

```{r, echo = T, eval = F}

### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets

### --- Campaign Details: 
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018. 
# - The report must include any activity from the beginning of the campaign. 
# - The estimated start will be 1st January 2018.

# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig

### --- Training Modules Schema: 
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking

### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate' 

### --- Campaign time range
startDate <- '2018-01-02'
endDate <- '2018-01-08'

### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------

# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1

### --- loop over date range, create query, fetch, and store
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
                        to = as.POSIXlt(endDate, tz = "CET"),
                        by = 'hour')
dateRange <- dateRange[-length(dateRange)]
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
  strsplit(x, split = " ", fixed = T)[[1]][1]
})
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
  dateRange <- dateRange[-w]
}
dR <- list()
for (i in 1:length(dateRange)) {
  dR[[i]] <- data.frame(
    cetName = names(dateRange[i]),
    utcYear = year(dateRange[i]),
    utcMonth = month(dateRange[i]),
    utcDay = mday(dateRange[i]),
    utcHour = hour(dateRange[i])
  )
}
dR <- rbindlist(dR)
dR <- dR %>%
  group_by(cetName, utcYear, utcMonth, utcDay) %>%
  summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))

### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------

# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1

# - set bannerClicksDir
setwd(bannerClicksDir)

for (i in 1:length(unique(dR$cetName))) {
  
  wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
  
  for (j in 1:length(wCetName)) {
    
    # - construct HiveQL query:
    y <- dR$utcYear[wCetName[j]]
    m <- dR$utcMonth[wCetName[j]]
    d <- dR$utcDay[wCetName[j]]
    hour <- dR$utcHour[wCetName[j]]
    q <- paste(
      "USE wmf;
      SELECT uri_path, uri_query, referer FROM webrequest
      WHERE uri_host = 'de.wikipedia.org'
      AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia' 
      AND year = ", y,
      " AND month = ", m,
      " AND day = ", d,
      " AND (", hour, ");",
      sep = "")
    # - write hql
    write(q, 'thankyou2018_BannerClicks.hql')
    # - prepare output file:
    fileName <- "thankyou2018_BannerClicks_"
    fileName <- paste0(fileName,
                       as.character(unique(dR$cetName)[i]),
                       "_", j,
                       ".tsv")
    fileName <- paste0(bannerClicksDir, "/", fileName)
    # - execute hql script:
    hiveArgs <-
      'beeline -f'
    hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
                        fileName)
    # - command:
    hiveCommand <- paste(hiveArgs, hiveInput)
    system(command = hiveCommand, wait = TRUE)
    
  }
  
}

### --- Wrangle this dataset:

### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'

### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
lF <- lF[grepl('Clicks', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
  dS <- readLines(lF[i], n = -1)
  timeStamp <- strsplit(lF[i], split = "_")[[1]][3]
  bannerClicks <- sum(grepl(campaignBanner, dS, fixed = T))
  dataSet[[i]] <- data.frame(timestamp = timeStamp,
                             bannerClicks = bannerClicks,
                             stringsAsFactors = F)
}
dataSet <- rbindlist(dataSet)

### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")

### --- SQL
startDate <- '2018-01-01'

### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", gsub("-", "", startDate, fixed = T), "220000));\" > ",
            dailyUpdateDir, "/thankyou_2018_userRegistrations.tsv", sep = "")
system(command = qCommand, wait = TRUE)


### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced. 
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", 
                  gsub("-", "", startDate, fixed = T), "220000));\" > ", 
                  dailyUpdateDir, "/thankyou_2018_guidedTours.tsv", sep = "")
system(command = qCommand, wait = TRUE)

### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------

# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
userReg <- userReg %>% 
  dplyr::select(event_userId, event_isSelfMade, event_campaign) %>% 
  filter(event_isSelfMade == 1) %>% 
  filter(event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                  paste(uid, collapse = ", "),
                  ') GROUP BY rev_user;',
                  sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                      paste('"', sqlQuery, '" > ', sep = ""), 
                      dailyUpdateDir, '/thankyou_2018_userEdits.tsv', sep = "")
system(command = mySqlCommand, 
       wait = TRUE)

### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------

```

## 1A. Campaign Banner Impressions

### 1A. 1 The data set
```{r echo = T, warning = 'hide', message = F}
# - report: current update
print(paste0("Current update: ", as.character(Sys.time())))
bannerImpressions <- read.csv('thankyouBannerImpressions.csv',
                              row.names = 1,
                              header = T,
                              stringsAsFactors = F) %>% 
  gather(key =  Banner,
         value = Views,
         B17WMDE_thankyou_authors:B17WMDE_thankyou_authors_mob_B) %>% 
  group_by(timeStamp, Banner) %>% 
  summarise(Views = sum(Views))
knitr::kable(bannerImpressions, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```
### 1A. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(bannerImpressions, aes(x = timeStamp,
                    y = Views,
                    color = Banner,
                    label = Views)) +
  geom_line(aes(group = Banner), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of BannerImpressions') +
  ylab("Impressions") +
  geom_text_repel(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```

## 1B. Campaign Pageviews (Banner Clicks)

### 1B. 1 The data set

```{r echo = T, warning = 'hide', message = F}
pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
                      row.names = 1,
                      header = T,
                      stringsAsFactors = F)
pageviews <- pageviews %>% 
  group_by(timestamp) %>% 
  summarise(Pageviews = sum(bannerClicks))
knitr::kable(pageviews, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

### 1B. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(pageviews, aes(x = timestamp,
                    y = Pageviews,
                    label = Pageviews)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of Landing Pageviews') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```


## 2. Campaign User Registrations

### 2. 1 The data set

```{r echo = T, warning = 'hide', message = F}
userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
                      sep = "\t",
                      quote = "",
                      header = T,
                      # row.names = 1,
                      stringsAsFactors = F)
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
userReg$timestamp <- as.POSIXct(userReg$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
userReg$timestamp <- as.character(userReg$timestamp + timeDiff)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1') %>% 
  group_by(timestamp) %>% 
  summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

### 2. 2 Chart

```{r echo = T, warning = 'hide', message = F}
ggplot(userReg, aes(x = timestamp,
                    y = Registrations,
                    label = Registrations)) +
  geom_bar(stat = "identity", 
           position = "dodge", 
           width = .15, 
           fill = "white", 
           color = "darkblue") +
  ggtitle('Thank You 2018:\nOverview of User Registrations') +
  geom_label(size = 3) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```

## 3. Campaign User Edits

### 3. 1 The data set

```{r echo = T, warning = 'hide', message = F}
userEdits <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(userEdits) >= 1) {
  userEdits <- read.delim('thankyou_2018_userEdits.tsv',
                          sep = "\t",
                          header = T,
                          stringsAsFactors = F)
  # - report
  print(paste0(sum(userEdits$edits), " edits were made by the campaign registered users thus far."))
} else {
  print("There are currently no user edits from this campaign.")
}
```

### 3. 2 Chart

```{r echo = T, warning = 'hide', message = F}
```

## 4. Campaign Guided Tours

### 4. 1 The data set: Guided Tour Exit Steps

Campaign guided tours: `diskutieren` and `seimutig`.

```{r echo = T, warning = 'hide', message = F}
tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
gTours$timestamp <- as.character(gTours$timestamp)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 5, 6)
  d <- substr(x, 7, 8)
  part1Date <- paste(y, m, d, sep = "-")
  hr <- substr(x, 9, 10)
  mi <- substr(x, 11, 12)
  se <- substr(x, 13, 14)
  part2Date <- paste(hr, mi, se, sep = ":")
  paste(part1Date, part2Date, sep = " ")
})
gTours$timestamp <- as.POSIXct(gTours$timestamp, tz = "UTC")
timeDiff <- 
  as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
gTours$timestamp <- as.character(gTours$timestamp + timeDiff)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
  y <- substr(x, 1, 4)
  m <- substr(x, 6, 7)
  d <- substr(x, 9, 10) 
  paste(y, m, d, sep = "-")
})
# - anonymize event_userId
eventUserId <- setdiff(unique(gTours$event_userId), 0)
an_userId <- character(length(eventUserId))
for (i in 1:length(an_userId)) {
  id <- round(runif(1, 1, 10e6))
  while (id %in% an_userId) {
    id <- round(runif(1, 1, 10e6))
  }
  an_userId[i] <- id
}
an_userId <- paste0("u_", an_userId)
gTours$an_userId <- sapply(gTours$event_userId, function(x) {
  w <- which(eventUserId %in% x)
  if (length(w) > 0) {
    an_userId[w]
  } else {
    x
  }
})
# - look up for the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) > 0) {
  gTours <- gTours[w, ]
  gTours <- gTours %>% 
    filter(event_userId != 0) %>% 
    select(timestamp, event_tour, event_step, an_userId)
  knitr::kable(gTours, format = "html") %>% 
  kable_styling(full_width = F, position = "left")
} else {
  print("There are currently no data on guided tours from this campaign.")
}
```
### 4. 2 Unique users in Guided Tours

Campaign guided tours: `diskutieren` and `seimutig`.

```{r echo = T, warning = 'hide', message = F}
gTours %>% 
  select(event_tour) %>% 
  group_by(event_tour) %>% 
  summarise(`Unique users` = n()) %>% 
  knitr::kable(format = "html") %>% 
  kable_styling(full_width = F, position = "left")
```

## 5 Training Module

```{r echo = T, warning = 'hide', message = F}
trainData <- read.csv('wmde_training_data.csv')
# - remove first two rows (test data)
trainData <- trainData[-c(1,2), ]
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userData <- read.table(lF, 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
gtData <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = T,
                     row.names = 1,
                     stringsAsFactors = F)
edData <- read.delim('thankyou_2018_userEdits.tsv',
                     sep = "\t",
                     header = T,
                     stringsAsFactors = F)
```


### 5.1 Training Module Overview

How many registered users take the Training Module?

```{r echo = T, warning = 'hide', message = F}
print(paste0("Number of uses who took the Training Module is ", 
             length(trainData$username),
             " , which is ",
             round(length(trainData$username)/sum(userReg$Registrations)*100, 2),
             "% of registered users."
               ))
```

What are the last completed slides per user Training Module?

```{r echo = T, warning = 'hide', message = F}
slidesData <- trainData %>% 
  select(training_module, last_slide_completed) %>% 
  group_by(training_module, last_slide_completed) %>% 
  summarise(Count = n()) %>% 
  arrange(training_module, desc(Count))
knitr::kable(slidesData, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```

```{r echo = T, warning = 'hide', message = F}
ggplot(slidesData, aes(x = last_slide_completed,
                       y = Count,
                       color = training_module,
                       label = Count)) +
  geom_line(aes(group = 1), size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = scales::comma) +
  ggtitle('Thank You 2018:\nOverview of Training Modules') +
  ylab("No. Users") +
  geom_text_repel(size = 3) +
  facet_wrap(~training_module) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 6.5, hjust = 1)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(panel.grid.major.x = element_blank()) +
  theme(panel.grid.minor.x = element_blank()) +
  theme(panel.background = element_blank())
```

Do users start editing after training modules? Is there a difference in users with completed and not-completed or not at all having taken the modules?

```{r echo = T, warning = 'hide', message = F}
userData <- userData %>% 
  filter(event_campaign %in% 'wmde_etc2017_bt1' & event_isSelfMade) %>% 
  select(event_userId, event_userName)
userData <- left_join(userData, trainData, by = c("event_userName" = "username"))
userData$module_completion_date <- NA
userData <- left_join(userData, 
                      gtData[, c('event_userId', 'event_tour', 'event_step')],
                      by = c('event_userId'))
userData <- left_join(userData, edData,
                      by = c('event_userId' = 'rev_user'))
```

How many edits were made on behalf of those users who have started the Training Module vs. those who did not?

```{r echo = T, warning = 'hide', message = F}
tAn <- userData %>% 
  select(training_module, edits) %>% 
  group_by(training_module) %>% 
  summarise(Users = n(), Edits = sum(edits, na.rm = T)) %>% 
  arrange(training_module, desc(Edits))
knitr::kable(tAn, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```

```{r echo = T, warning = 'hide', message = F}
print(paste0("In total, the users who took the Training Module made ", 
             sum(tAn$Edits[1:3]),
             " edits, which is ",
             round(sum(tAn$Edits[1:3])/sum(tAn$Edits)*100, 2),
             "% of all edits."
               ))
print(paste0("On the other hand, the users who did not take the Training Module made ", 
             tAn$Edits[4],
             " edits, which is ",
             round(tAn$Edits[4]/sum(tAn$Edits)*100, 2),
             "% of all edits."
               ))
```

Completion of the Training Module vs. number of edits

```{r echo = T, warning = 'hide', message = F}
# - final slide for: wikipedia-basiswissen = relevanz-quiz-fortsetzung
# - final slide for: editieren-basiswissen = das-wars
# - final slide for: artikel-bewerten = fertig
finalSlide <- c('relevanz-quiz-fortsetzung', 'das', 'fertig')
names(finalSlide) <- c('wikipedia-basiswissen', 'editieren-basiswissen', 'relevanz-quiz-fortsetzung')
userData$completed_Training <- ifelse(userData$last_slide_completed %in% finalSlide, T, F)
completedTrainingEdits <- userData %>% 
  select(completed_Training, edits) %>% 
  group_by(completed_Training) %>% 
  summarise(Users = n(), Edits = sum(edits, na.rm = T))
knitr::kable(completedTrainingEdits, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```

We will express this result as the number of edits per use ratio for two groups: those who did complete the Training Module and those who did not. In other words, we divide the Edits column with the Users column, separately for `TRUE` and `FALSE` in the `completed_Training` column. This approach provides the following insight: in the group of users that did not complete the Training Module we have obtained `223/120 = 1.86` edits per user, while in the group of those who have completed the Training Module we find `74/24` = 3.08 edits per user. The ratio of these two ratios (i.e. `3.08 divided by 1.86`) is 1.66 in favor of the group who has completed the Training Module, and we conclude that they have provided about 66% more edits per user compared to the group that has not completed the module. Let's now take a look at the number of users who (a) made any edits at all, or(b) have reached their 10th edit, in these two groups:

```{r echo = T, warning = 'hide', message = F}
userData$Edits <- (userData$edits > 0)
userData$Edits10 <- (userData$edits >= 10)
completedTrainingEdits <- userData %>% 
  select(completed_Training, Edits, Edits10) %>%
  group_by(completed_Training) %>% 
  summarise(Users = n(), Edits = sum(Edits, na.rm = T), Edits10 = sum(Edits10, na.rm = T))
knitr::kable(completedTrainingEdits, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```

We can see that 13 of users who have completed the Training Module have made at least one edit, `13/24 = .54`, or about 54%. None of them, however, have reached their 10th edit. On the other hand, `36/120 = .3` or 30% of those who did not complete the Training Module have made at least one edit, while 6 of them have reached their 10th edit. This probably tells us that an intrinsic (aka "internal") locus of motivation to contribute still works better than the (external) locus of motivation that we can help develop through our training modules - which does not present an unexpected result.

Detailed data for the `editieren-basiswissen` Training Module:

```{r echo = T, warning = 'hide', message = F}
editModData <- userData %>% 
  select(training_module, last_slide_completed, edits) %>% 
  filter(training_module %in% 'editieren-basiswissen') %>% 
  group_by(last_slide_completed) %>% 
  summarise(edits = sum(edits, na.rm = T)) %>% 
  arrange(desc(edits))
knitr::kable(editModData, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```

There are six users only who took the `editieren-basiswissen` Training Module, and because we know that `das-wars` is the last slide in the `editieren-basiswissen` Training Module, we also know that only those who have completed this Training Module have made any edits!
