Feedback should be send to goran.milovanovic_ext@wikimedia.de.

The campaign is run from 2018/08/01 to 2018/08/08.

CURRENT UPDATE: Complete dataset as of 2018/08/09.

0. Data Acquisiton

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script SummerBC2018_DailyUpdate_PRODUCTION.R on stat1005.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally.

0.1 Daily Update

# - wle_2018_de

### --- WMDE Spring Banner Campaign 2018
### --- Daily Update

rm(list = ls())

### --- Libraries
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
outDir <- '/home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/'
setwd(outDir)

### --- General Info:
# First Banner (https://de.wikipedia.org/wiki/Wikipedia:Hauptseite?banner=B18WMDE_authors_02_180801_1&force=1)
# Registration Name: Tesssssttttt
# Created: Mo, 30.07.2018 around 15:42
# 
# Second Banner (https://de.wikipedia.org/wiki/Wikipedia:Hauptseite?banner=B18WMDE_authors_02_180801_2&force=1)
# Registration name: Tessssttttt
# Created: Mo, 30.07.2018 around 15:45
# 
# Third Banner (https://de.wikipedia.org/wiki/Wikipedia:Hauptseite?banner=B18WMDE_authors_02_180801_3&force=1)
# Registration name: Tttteesssstt
# Created: Mo, 30.07.2018 around 15:45

### --- banner names (/beacon/impression):
# - B18WMDE_authors_02_180801_1
# - B18WMDE_authors_02_180801_2
# - B18WMDE_authors_02_180801_3

### --- campaign tags
# - WMDE_neweditors_summer2018_1
# - WMDE_neweditors_summer2018_2
# - WMDE_neweditors_summer2018_3

### --- landingpage including the campaign tag and anchor
# - Banner 1: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Neue_Ehrenamtliche/FAQ?campaign=WMDE_neweditors_summer2018_1#Wer-darf-schreiben
# - Banner 2: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Neue_Ehrenamtliche/FAQ?campaign=WMDE_neweditors_summer2018_2#Wikipedia-Buero
# - Banner 3: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Neue_Ehrenamtliche/FAQ?campaign=WMDE_neweditors_summer2018_3#Schutz-vor-Missbrauch
# - Second landing page: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia

# - Update on (Reminder: stat1005 time is UTC): today minus one day
today <- strsplit(
  strsplit(
    as.character(as.POSIXct(Sys.time()) - 24*60*60),
    split = " ",
    fixed = TRUE)[[1]][1],
  split = "-",
  fixed = TRUE)[[1]]
today <- as.list(today)
names(today) <- c('year', 'month', 'day')

# - construct wmf.webrequest year/month/day/hour condition
# - for today:
dateCondition <- paste0("((year = ", today$year,
                        " AND month = ", today$month,
                        " AND day = ", as.integer(today$day) - 1,
                        " AND (hour = 22 OR hour = 23))",
                        " OR ",
                        "(year = ", today$year,
                        " AND month = ", today$month,
                        " AND day = ", today$day,
                        " AND (hour >= 0 OR hour < 22)))"
                        )

### --- Collect Banner Impressions for Update: HiveQL
q <- paste(
  "USE wmf;
  SELECT uri_query FROM webrequest
  WHERE uri_host = 'de.wikipedia.org'
  AND uri_path = '/beacon/impression'
  AND ",
  dateCondition,
  " AND ((uri_query LIKE '%B18WMDE_authors_02_180801_1%') OR
    (uri_query LIKE '%B18WMDE_authors_02_180801_2%') OR
    (uri_query LIKE '%B18WMDE_authors_02_180801_3%'));",
  sep = "")
# - write hql
write(q, 'summerBC2018_BannerImpressionsDaily.hql')
# - prepare output file:
fileName <- 'sbc2018_BannerImpressionsDaily.tsv'
# - execute hql script:
hiveArgs <- 'beeline --verbose=true -f'
hiveInput <- paste0('summerBC2018_BannerImpressionsDaily.hql > ', fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)

### --- Collect Pageviews == Banner clicks for Update: HiveQL
# https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia
q <- paste(
  "USE wmf;
  SELECT uri_path, uri_query, referer FROM webrequest
  WHERE uri_host = 'de.wikipedia.org'
  AND ((uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/Neue_Ehrenamtliche/FAQ' AND uri_query LIKE '%campaign=WMDE_neweditors_summer2018%')
  OR (uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia')) 
  AND ", dateCondition,
  " ;",
  sep = "")
# - write hql
write(q, 'summerBC2018_PageViewsDaily.hql')
# - prepare output file:
fileName <- 'sbc2018_PageViewsDaily.tsv'
# - execute hql script:
hiveArgs <- 'beeline --verbose=true -f'
hiveInput <- paste0('summerBC2018_PageViewsDaily.hql > ', fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)

### --- Collect Use Registrations for Update: SQL
# - ServerSideAccountCreation_17719237 schema
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_17719237 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20180729000000) and ((event_campaign like '%WMDE_neweditors_summer2018_1%') or (event_campaign like '%WMDE_neweditors_summer2018_2%') or (event_campaign like '%WMDE_neweditors_summer2018_3%')));\" > /home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/summerBC2018_userRegistrations.tsv"
system(command = qCommand, wait = TRUE)

### --- Collect Banner Events for Update: SQL
# - WMDEBannerEvents_18193948 schema
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.WMDEBannerEvents_18193948 where timestamp >= 20180729000000;\" > /home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/summerBC2018_bannerEvents.tsv"
system(command = qCommand, wait = TRUE)


### --- Prepare for repoting

# - Banner Impressions
bImpData <- readLines('sbc2018_BannerImpressionsDaily.tsv', n = -1)
bImpData <- bImpData[9:(length(bImpData) - 2)]
bImpData <- bImpData[-1]
bImpData <- data.frame(dat = bImpData, stringsAsFactors = F)
bannerA <- length(which(grepl("banner=B18WMDE_authors_02_180801_1", bImpData$dat, fixed = T)))
bannerB <- length(which(grepl("banner=B18WMDE_authors_02_180801_2", bImpData$dat, fixed = T)))
bannerC <- length(which(grepl("banner=B18WMDE_authors_02_180801_3", bImpData$dat, fixed = T)))
DailyBannerImpressions <- data.frame(
  Banner = c('B18WMDE_authors_1', 'B18WMDE_authors_2', 'B18WMDE_authors_3'),
  Count = c(bannerA, bannerB, bannerC),
  CampaignDay = rep(paste0(today, collapse = "-", sep = ""), 3)
)
DailyBannerImpressions$Count <- DailyBannerImpressions$Count/.01
# - DailyBannerImpressions
lF <- list.files()
if ('DailyBannerImpressions.csv' %in% lF) {
  dPS <- read.csv('DailyBannerImpressions.csv',
                  header = T,
                  row.names = 1)
  DailyBannerImpressions <- rbind(dPS, DailyBannerImpressions)
  write.csv(DailyBannerImpressions, 'DailyBannerImpressions.csv')
} else {
  write.csv(DailyBannerImpressions, 'DailyBannerImpressions.csv')
}

# - Pageviews
pageViews <- readLines('sbc2018_PageViewsDaily.tsv', n = -1)
pageViews <- pageViews[8:(length(pageViews) - 2)]
header = pageViews[1]
pageViews <- pageViews[-1]
header <- unlist(strsplit(header, "\t")[[1]])
pageViews <- data.frame(dat = pageViews, stringsAsFactors = F)
pageViews <- separate(pageViews,
                      dat,
                      into = c('uri_path', 'uri_query', 'referer'),
                      sep = "\t")
dailyPageviewsSet <- pageViews %>% 
  filter(grepl('campaign=WMDE_neweditors_summer2018', uri_query) | 
           grepl('/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia', uri_path)) %>%
  group_by(uri_query, uri_path) %>%
  summarise(Count = n())
dailyPageviewsSet$CampaignDay <- paste(unlist(today), collapse = "-", sep = "")
# - DailyPageviews
lF <- list.files()
if ('DailyPageviews.csv' %in% lF) {
  dPS <- read.csv('DailyPageviews.csv',
                  header = T,
                  row.names = 1)
  dailyPageviewsSet <- rbind(dPS, as.data.frame(dailyPageviewsSet))
  write.csv(dailyPageviewsSet, 'DailyPageviews.csv')
} else {
  write.csv(dailyPageviewsSet, 'DailyPageviews.csv')
}

# - User registrations
userRegistrations <- read.delim('summerBC2018_userRegistrations.tsv', 
                                sep = "\t",
                                header = T,
                                quote = "", 
                                stringsAsFactors = F)
userRegistrations <- userRegistrations[grepl('is_bot": false', userRegistrations$userAgent, fixed = T), ]
userRegistrations$timestamp <- as.character(userRegistrations$timestamp)
userRegistrations$timestamp <- as.POSIXct(userRegistrations$timestamp, tz = "UTC", format = "%Y%m%d%H%M%S") 
# - CEST correction on POSIXct class, +2hours:
userRegistrations$timestamp <- userRegistrations$timestamp + 2*60*60
# - wrangle
userRegistrations$CampaignDay <- sapply(userRegistrations$timestamp, function(x) {
  paste(c(
    substr(x, 1,4),
    substr(x, 6,7),
    substr(x, 9,10)
    ),
    collapse = "-",
    sep = "")
})
userRegistrations <- userRegistrations[which(userRegistrations$CampaignDay %in% paste(today, collapse = "-", sep = "")), ]
dailyUserRegSet <- userRegistrations %>%
  select(event_campaign) %>%
  group_by(event_campaign) %>%
  summarise(Count = n())
dailyUserRegSet$CampaignDay <- paste(unlist(today), collapse = "-", sep = "")
# - Daily User Registrations
lF <- list.files()
if ('dailyUserRegistrations.csv' %in% lF) {
  dPS <- read.csv('dailyUserRegistrations.csv', 
                  header = T, 
                  row.names = 1)
  dailyUserRegSet <- rbind(dPS, dailyUserRegSet)
  write.csv(dailyUserRegSet, 'dailyUserRegistrations.csv')
} else {
  write.csv(dailyUserRegSet, 'dailyUserRegistrations.csv')
}

0.1 Post-Campaign Analytics

NOTE: Training Module data are obtained directly from the application maintainer. The following code retrieves all user registrations and edits.

### --- User registrations

### --- Libraries
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

# - ServerSideAccountCreation_5487345
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_17719237 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20180729000000) and ((event_campaign like '%WMDE_neweditors_summer2018_1%') or (event_campaign like '%WMDE_neweditors_summer2018_2%') or (event_campaign like '%WMDE_neweditors_summer2018_3%')));\" > /home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/summerBC2018_userRegistrations_FULL.tsv"
system(command = qCommand, wait = TRUE)

### --- User edits
# - get user IDs from registered:
userReg <- read.table('/home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/summerBC2018_userRegistrations_FULL.tsv', 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
userReg <- userReg %>% 
  dplyr::select(event_userId, event_isSelfMade) %>% 
  filter(event_isSelfMade == 1)
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                  paste(uid, collapse = ", "),
                  ') GROUP BY rev_user;',
                  sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                      paste('"', sqlQuery, '" > ', sep = ""),
                      '/home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/sBC2018_userEdits_FULL.tsv', sep = "")
system(command = mySqlCommand, 
       wait = TRUE)

1. Campaign Banners and Pages

This section presents all data and statistics on the campaign banners and pages.

1.4 User Registrations

Chart 1. 4. Daily user registrations per banner

dataSet <- read.delim('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/summerBC2018_userRegistrations_FULL.tsv',
                    header = T,
                    check.names = F,
                    sep = "\t",
                    quote = "",
                    stringsAsFactors = F)
dataSet <- dataSet[grepl('is_bot": false', dataSet$userAgent, fixed = T), ]
dataSet$timestamp <- as.character(dataSet$timestamp)
dataSet$timestamp <- as.POSIXct(dataSet$timestamp, tz = "UTC", format = "%Y%m%d%H%M%S") 
# - CEST correction on POSIXct class, +2hours:
dataSet$timestamp <- dataSet$timestamp + 2*60*60
# - Campaign day:
dataSet$CampaignDay <- sapply(dataSet$timestamp, function(x) {
  paste(c(
    substr(x, 1,4),
    substr(x, 6,7),
    substr(x, 9,10)
    ),
    collapse = "-",
    sep = "")
})
dataSet <- filter(dataSet,
                  CampaignDay %in% c("2018-08-01", 
                                     "2018-08-02",
                                     "2018-08-03",
                                     "2018-08-04",
                                     "2018-08-05",
                                     "2018-08-06",
                                     "2018-08-07",
                                     "2018-08-08"))
userReg <- dataSet %>% 
  select(event_campaign, CampaignDay) %>% 
  group_by(CampaignDay, event_campaign) %>% 
  summarise(Count = n())
colnames(userReg)[2] <- "Banner"
# - Visualize w. {ggplot2}
ggplot(userReg, aes(x = CampaignDay,
                    y = Count,
                    group = Banner,
                    color = Banner,
                    fill = Banner,
                    label = Count)) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  scale_y_continuous(labels = comma) +
  ggtitle('Summer Banner Campaign 2018: User Registrations') +
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank())

Table 1. 4. Daily user registrations per banner

### --- Full Dataset (Table Report)
datatable(userReg)

Table 1. 4. 1 Banner Conversion Rates

### --- Banner Conversion Rates
totalRegs <- userReg[, 2:3] %>% 
  select(Banner, Count) %>%
  group_by(Banner) %>%
  summarise(totalReg = sum(Count))
totalRegs$Banner[grepl("WMDE_neweditors_summer2018_1", totalRegs$Banner)] <- 'B18WMDE_authors_1'
totalRegs$Banner[grepl("WMDE_neweditors_summer2018_2", totalRegs$Banner)] <- 'B18WMDE_authors_2'
totalRegs$Banner[grepl("WMDE_neweditors_summer2018_3", totalRegs$Banner)] <- 'B18WMDE_authors_3'
totalClicks$uri_query[grepl("WMDE_neweditors_summer2018_1", totalClicks$uri_query)] <- 'B18WMDE_authors_1'
totalClicks$uri_query[grepl("WMDE_neweditors_summer2018_2", totalClicks$uri_query)] <- 'B18WMDE_authors_2'
totalClicks$uri_query[grepl("WMDE_neweditors_summer2018_3", totalClicks$uri_query)] <- 'B18WMDE_authors_3'
totalClicks <- totalClicks %>% 
  group_by(uri_query) %>% 
  summarise(totalClicks = sum(totalClicks))
totalRegs <- left_join(totalRegs, totalClicks,
                       by = c("Banner" = "uri_query"))
totalRegs$`Registrations/Clicks` <- round(totalRegs$totalReg/totalRegs$totalClicks, 5)
datatable(totalRegs)

1.5 User Edits

Chart 1. 5. Edits of registered users per banner and campaign day of registration

NOTE: the Campaign Day here refers to the day when the user has registered, not the day in which the respective edits were made.

dataSet2 <- read.delim('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/sBC2018_userEdits_FULL.tsv',
                    header = T,
                    check.names = F,
                    sep = "\t",
                    quote = "",
                    stringsAsFactors = F)
dataSet <- left_join(dataSet, dataSet2, by = c("event_userId" = "rev_user"))
dataSet$edits[is.na(dataSet$edits)] <- 0
userEds <- dataSet %>% 
  select(event_campaign, CampaignDay, edits) %>% 
  group_by(CampaignDay, event_campaign) %>% 
  summarise(Edits = sum(edits))
colnames(userEds)[2] <- "Banner"
# - Visualize w. {ggplot2}
ggplot(userEds, aes(x = CampaignDay,
                    y = Edits,
                    group = Banner,
                    color = Banner,
                    fill = Banner,
                    label = Edits)) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  scale_y_continuous(labels = comma) +
  ggtitle('Summer Banner Campaign 2018: User Edits') +
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank())

Table 1. 5. Edits of registered users per banner and campaign day of registration

### --- Full Dataset (Table Report)
datatable(userEds)

Table 1. 5. 1 Edits of registered users per banner and campaign day of registration

Total number of user edits per banner:

totalEdits <- userEds %>% 
  group_by(Banner) %>% 
  summarise(Edits = sum(Edits))
datatable(totalEdits)

Table 1. 5. 2 Users reaching their 10th edit per banner

edit10 <- dataSet %>% 
  filter(edits >= 10) %>% 
  select(event_campaign) %>% 
  group_by(event_campaign) %>%
  summarise(`edit10th` = n())
datatable(edit10)

Table 1. 5. 3 Full user edits distribution

### --- Full Dataset (Table Report)
pltEdits <- as.tbl(dataSet) %>% 
  dplyr::group_by(edits) %>% 
  count()
colnames(pltEdits) <- c('Edits', 'Num.users')
knitr::kable(pltEdits, format = "html") %>%
  kable_styling(full_width = F, position = "left")
Edits Num.users
0 78
1 11
2 9
3 5
4 3
5 2
8 1
9 1
48 1

Table 1. 5. 4 Users edit categories

edits0 <- pltEdits$`Num.users`[pltEdits$Edits == 0]
edits <- sum(pltEdits$`Num.users`[pltEdits$Edits > 0])
edits1 <- sum(pltEdits$`Num.users`[pltEdits$Edits == 1])
edits2_4 <- sum(pltEdits$`Num.users`[pltEdits$Edits >= 2 & pltEdits$Edits <= 4])
edits5_10 <- sum(pltEdits$`Num.users`[pltEdits$Edits >= 5 & pltEdits$Edits <= 10])
edits10 <- sum(pltEdits$`Num.users`[pltEdits$Edits > 10])
editClasses <- data.frame(`No edits` = edits0,
                          `Edited` = edits,
                          `1 edit` = edits1,
                          `2 - 4 edits` = edits2_4,
                          `5 - 10 edits` = edits5_10, 
                          `> 10 edits` = edits10,
                          check.names = F,
                          stringsAsFactors = F)
knitr::kable(editClasses, format = "html") %>%
  kable_styling(full_width = F, position = "left")
No edits Edited 1 edit 2 - 4 edits 5 - 10 edits > 10 edits
78 33 11 17 4 1

2 Training Modules

trainMod <- read.csv('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/wmde_training_data_2018-08.csv',
                    header = T,
                    check.names = F,
                    stringsAsFactors = F)
# - remove users Stefan Schneider (WMDE) and Sage (Wiki Ed)
trainMod <- trainMod %>% 
  filter(!(username %in% c('Stefan Schneider (WMDE)', 'Sage (Wiki Ed)')))
# - data types
trainMod$module_completion_date <- as.character(trainMod$module_completion_date)
trainMod$module_completion_date <- as.POSIXct(trainMod$module_completion_date, tz = "UTC", format = "%Y-%m-%d %H:%M:%S") 
# - CEST correction on POSIXct class, +2hours:
trainMod$module_completion_date <- trainMod$module_completion_date + 2*60*60
trainMod$started_at <- as.character(trainMod$started_at)
trainMod$started_at <- as.POSIXct(trainMod$started_at, tz = "UTC", format = "%Y-%m-%d %H:%M:%S") 
# - CEST correction on POSIXct class, +2hours:
trainMod$started_at <- trainMod$started_at + 2*60*60
trainMod$last_slide_completed_at <- as.character(trainMod$last_slide_completed_at)
trainMod$last_slide_completed_at <- as.POSIXct(trainMod$last_slide_completed_at, tz = "UTC", format = "%Y-%m-%d %H:%M:%S") 
# - CEST correction on POSIXct class, +2hours:
trainMod$last_slide_completed_at <- trainMod$last_slide_completed_at + 2*60*60
# - filter for campaign days only
trainMod <- filter(trainMod, 
                   started_at >= "2018-08-01 00:00:00" & started_at < "2018-08-09 00:00:00")
# - filter campaign training modules only
# trainMod <- filter(trainMod,
                   # training_module %in% c('diskutieren-basiswissen', 'editieren-basiswissen'))
# - left_join w. dataSet (user registrations + user edits)
# - to produce trainMod that has user registrations + edits
trainMod <- left_join(trainMod, dataSet,
                      by = c("username" = "event_userName"))
# - filter those who did not register with the campaign:
trainMod <- filter(trainMod, !is.na(CampaignDay))
# - trainMod aggregates 1: number of training modules started and completed per registered user
modulesStarted <- as.data.frame(table(trainMod$username))
colnames(modulesStarted) <- c('username', 'modulesStarted')
modulesCompleted <- filter(trainMod, !is.na(module_completion_date))
modulesCompleted <- as.data.frame(table(modulesCompleted$username))
colnames(modulesCompleted) <- c('username', 'modulesCompleted')
modulesData <- left_join(modulesStarted, modulesCompleted, by = 'username')
Column `username` joining factors with different levels, coercing to character vector
modulesData$modulesCompleted[is.na(modulesData$modulesCompleted)] <- 0
# - left_join dataSet w. modulesData
dataSet <- left_join(dataSet, modulesData, 
                     by = c("event_userName" = "username"))
dataSet$modulesStarted[is.na(dataSet$modulesStarted)] <- 0
dataSet$modulesCompleted[is.na(dataSet$modulesCompleted)] <- 0
# - produce aggregates for reporting on training modules from dataSet
# - agg1: tM_started
tM_started <- dataSet %>% 
  select(modulesStarted, edits) %>% 
  group_by(modulesStarted) %>% 
  summarise(sumEdits = sum(edits), users = n())
tM_started$editsPerUser <- round(tM_started$sumEdits/tM_started$users, 2)
# - agg1: tM_completed
tM_completed <- dataSet %>% 
  select(modulesCompleted, edits) %>% 
  group_by(modulesCompleted) %>% 
  summarise(sumEdits = sum(edits), users = n())
tM_completed$editsPerUser <- round(tM_completed$sumEdits/tM_completed$users, 2)
# - agg3: tm_started_perModule
tm_started_perModule <- trainMod %>% 
  select(training_module) %>% 
  group_by(training_module) %>% 
  summarise(Count = n())
# - agg4: tm_completed_perModule
tm_completed_perModule <- trainMod %>%
  filter(!is.na(module_completion_date)) %>%
  select(training_module) %>%
  group_by(training_module) %>% 
  summarise(Count = n())
# - agg5(1-2): exit slides per module
exitSlides1 <- data.frame(t(table(
  trainMod$training_module[trainMod$training_module %in% "diskutieren-basiswissen"], 
  trainMod$last_slide_completed[trainMod$training_module %in% "diskutieren-basiswissen"])))
colnames(exitSlides1) <- c('Slide', 'Module', 'Count')
exitSlides2 <- data.frame(t(table(
  trainMod$training_module[trainMod$training_module %in% "editieren-basiswissen"], 
  trainMod$last_slide_completed[trainMod$training_module %in% "editieren-basiswissen"])))
colnames(exitSlides2) <- c('Slide', 'Module', 'Count')
# - agg6(1-2): time spent in training
trainMod$Completed <- !is.na(trainMod$module_completion_date)
timeInTraining <- trainMod %>% 
  select(Completed, training_module, started_at, last_slide_completed_at) %>% 
  mutate(timeSpent = as.numeric((last_slide_completed_at - started_at)/60)) %>%
  select(Completed, training_module, timeSpent) %>%
  group_by(training_module, Completed) %>%
  summarise(`Time (mins)` = mean(round(timeSpent, 2)))
# - agg7: distribution of time spent in training
timeInTrainingDist <- trainMod %>%
  select(Completed, training_module, started_at, last_slide_completed_at) %>%
  mutate(timeSpent = as.numeric((last_slide_completed_at - started_at)/60))

Table 2. 1 Number of users who have started a particular number of training modules (modulesStarted), the total number of edits that they have made (sumEdits), number of users who took a particular number of training modules (users), and the edits per user ratio.

datatable(tM_started)

Table 2. 2 Number of users who have completed a particular number of training modules (modulesStarted), the total number of edits that they have made (sumEdits), number of users who took a particular number of training modules (users), and the edits per user ratio.

datatable(tM_completed)

Table 2. 3 Number of users who have started a particular training module.

datatable(tm_started_perModule)

Table 2. 4 Number of users who have completed a particular training module.

datatable(tm_completed_perModule)

Table 2. 5a Exit slides per training module and the number of users who have exited at a particular slide.

datatable(exitSlides1)

Table 2. 5b Exit slides per training module and the number of users who have exited at a particular slide.

datatable(exitSlides2)

Table 2. 6 Average Time spent in training per training module.

datatable(timeInTraining)

Chart 2. 7 The distribution of time spent in training per training module.

ggplot(data = timeInTrainingDist, 
       aes(x = Completed, y = timeSpent, color = Completed)) +
  geom_point() +
  geom_jitter() +
  facet_wrap(~training_module) + 
  theme_bw()

---
title: 'Summer Banner Campaign 2018: Report'
author: "Goran S. Milovanovic, Data Scientist, WMDE"
date: "August 09, 2018"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_float: yes
    toc_depth: 5
  html_document:
    toc: yes
    toc_depth: 5
---


**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

The campaign is run from 2018/08/01 to 2018/08/08.

**CURRENT UPDATE:** Complete dataset as of 2018/08/09.

```{r, echo = F, warning = 'hide', message = F, results = 'hide'}
# !diagnostics off
### --- Setup
knitr::opts_chunk$set(fig.width = 15, fig.height = 8) 
rm(list = ls())
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
library(ggrepel)
library(scales)
library(RColorBrewer)
library(kableExtra)
library(rmarkdown)
library(knitr)
library(DT)
library(reshape2)
```

## 0. Data Acquisiton

**NOTE:** the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script `SummerBC2018_DailyUpdate_PRODUCTION.R` on stat1005.eqiad.wmnet, collecting the data as `.tsv` and `.csv` files, copying manually, and processing locally. 

### 0.1 Daily Update

```{r, echo = T, eval = F}

# - wle_2018_de

### --- WMDE Spring Banner Campaign 2018
### --- Daily Update

rm(list = ls())

### --- Libraries
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
outDir <- '/home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/'
setwd(outDir)

### --- General Info:
# First Banner (https://de.wikipedia.org/wiki/Wikipedia:Hauptseite?banner=B18WMDE_authors_02_180801_1&force=1)
# Registration Name: Tesssssttttt
# Created: Mo, 30.07.2018 around 15:42
# 
# Second Banner (https://de.wikipedia.org/wiki/Wikipedia:Hauptseite?banner=B18WMDE_authors_02_180801_2&force=1)
# Registration name: Tessssttttt
# Created: Mo, 30.07.2018 around 15:45
# 
# Third Banner (https://de.wikipedia.org/wiki/Wikipedia:Hauptseite?banner=B18WMDE_authors_02_180801_3&force=1)
# Registration name: Tttteesssstt
# Created: Mo, 30.07.2018 around 15:45

### --- banner names (/beacon/impression):
# - B18WMDE_authors_02_180801_1
# - B18WMDE_authors_02_180801_2
# - B18WMDE_authors_02_180801_3

### --- campaign tags
# - WMDE_neweditors_summer2018_1
# - WMDE_neweditors_summer2018_2
# - WMDE_neweditors_summer2018_3

### --- landingpage including the campaign tag and anchor
# - Banner 1: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Neue_Ehrenamtliche/FAQ?campaign=WMDE_neweditors_summer2018_1#Wer-darf-schreiben
# - Banner 2: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Neue_Ehrenamtliche/FAQ?campaign=WMDE_neweditors_summer2018_2#Wikipedia-Buero
# - Banner 3: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Neue_Ehrenamtliche/FAQ?campaign=WMDE_neweditors_summer2018_3#Schutz-vor-Missbrauch
# - Second landing page: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia

# - Update on (Reminder: stat1005 time is UTC): today minus one day
today <- strsplit(
  strsplit(
    as.character(as.POSIXct(Sys.time()) - 24*60*60),
    split = " ",
    fixed = TRUE)[[1]][1],
  split = "-",
  fixed = TRUE)[[1]]
today <- as.list(today)
names(today) <- c('year', 'month', 'day')

# - construct wmf.webrequest year/month/day/hour condition
# - for today:
dateCondition <- paste0("((year = ", today$year,
                        " AND month = ", today$month,
                        " AND day = ", as.integer(today$day) - 1,
                        " AND (hour = 22 OR hour = 23))",
                        " OR ",
                        "(year = ", today$year,
                        " AND month = ", today$month,
                        " AND day = ", today$day,
                        " AND (hour >= 0 OR hour < 22)))"
                        )

### --- Collect Banner Impressions for Update: HiveQL
q <- paste(
  "USE wmf;
  SELECT uri_query FROM webrequest
  WHERE uri_host = 'de.wikipedia.org'
  AND uri_path = '/beacon/impression'
  AND ",
  dateCondition,
  " AND ((uri_query LIKE '%B18WMDE_authors_02_180801_1%') OR
    (uri_query LIKE '%B18WMDE_authors_02_180801_2%') OR
    (uri_query LIKE '%B18WMDE_authors_02_180801_3%'));",
  sep = "")
# - write hql
write(q, 'summerBC2018_BannerImpressionsDaily.hql')
# - prepare output file:
fileName <- 'sbc2018_BannerImpressionsDaily.tsv'
# - execute hql script:
hiveArgs <- 'beeline --verbose=true -f'
hiveInput <- paste0('summerBC2018_BannerImpressionsDaily.hql > ', fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)

### --- Collect Pageviews == Banner clicks for Update: HiveQL
# https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia
q <- paste(
  "USE wmf;
  SELECT uri_path, uri_query, referer FROM webrequest
  WHERE uri_host = 'de.wikipedia.org'
  AND ((uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/Neue_Ehrenamtliche/FAQ' AND uri_query LIKE '%campaign=WMDE_neweditors_summer2018%')
  OR (uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia')) 
  AND ", dateCondition,
  " ;",
  sep = "")
# - write hql
write(q, 'summerBC2018_PageViewsDaily.hql')
# - prepare output file:
fileName <- 'sbc2018_PageViewsDaily.tsv'
# - execute hql script:
hiveArgs <- 'beeline --verbose=true -f'
hiveInput <- paste0('summerBC2018_PageViewsDaily.hql > ', fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)

### --- Collect Use Registrations for Update: SQL
# - ServerSideAccountCreation_17719237 schema
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_17719237 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20180729000000) and ((event_campaign like '%WMDE_neweditors_summer2018_1%') or (event_campaign like '%WMDE_neweditors_summer2018_2%') or (event_campaign like '%WMDE_neweditors_summer2018_3%')));\" > /home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/summerBC2018_userRegistrations.tsv"
system(command = qCommand, wait = TRUE)

### --- Collect Banner Events for Update: SQL
# - WMDEBannerEvents_18193948 schema
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.WMDEBannerEvents_18193948 where timestamp >= 20180729000000;\" > /home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/summerBC2018_bannerEvents.tsv"
system(command = qCommand, wait = TRUE)


### --- Prepare for repoting

# - Banner Impressions
bImpData <- readLines('sbc2018_BannerImpressionsDaily.tsv', n = -1)
bImpData <- bImpData[9:(length(bImpData) - 2)]
bImpData <- bImpData[-1]
bImpData <- data.frame(dat = bImpData, stringsAsFactors = F)
bannerA <- length(which(grepl("banner=B18WMDE_authors_02_180801_1", bImpData$dat, fixed = T)))
bannerB <- length(which(grepl("banner=B18WMDE_authors_02_180801_2", bImpData$dat, fixed = T)))
bannerC <- length(which(grepl("banner=B18WMDE_authors_02_180801_3", bImpData$dat, fixed = T)))
DailyBannerImpressions <- data.frame(
  Banner = c('B18WMDE_authors_1', 'B18WMDE_authors_2', 'B18WMDE_authors_3'),
  Count = c(bannerA, bannerB, bannerC),
  CampaignDay = rep(paste0(today, collapse = "-", sep = ""), 3)
)
DailyBannerImpressions$Count <- DailyBannerImpressions$Count/.01
# - DailyBannerImpressions
lF <- list.files()
if ('DailyBannerImpressions.csv' %in% lF) {
  dPS <- read.csv('DailyBannerImpressions.csv',
                  header = T,
                  row.names = 1)
  DailyBannerImpressions <- rbind(dPS, DailyBannerImpressions)
  write.csv(DailyBannerImpressions, 'DailyBannerImpressions.csv')
} else {
  write.csv(DailyBannerImpressions, 'DailyBannerImpressions.csv')
}

# - Pageviews
pageViews <- readLines('sbc2018_PageViewsDaily.tsv', n = -1)
pageViews <- pageViews[8:(length(pageViews) - 2)]
header = pageViews[1]
pageViews <- pageViews[-1]
header <- unlist(strsplit(header, "\t")[[1]])
pageViews <- data.frame(dat = pageViews, stringsAsFactors = F)
pageViews <- separate(pageViews,
                      dat,
                      into = c('uri_path', 'uri_query', 'referer'),
                      sep = "\t")
dailyPageviewsSet <- pageViews %>% 
  filter(grepl('campaign=WMDE_neweditors_summer2018', uri_query) | 
           grepl('/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia', uri_path)) %>%
  group_by(uri_query, uri_path) %>%
  summarise(Count = n())
dailyPageviewsSet$CampaignDay <- paste(unlist(today), collapse = "-", sep = "")
# - DailyPageviews
lF <- list.files()
if ('DailyPageviews.csv' %in% lF) {
  dPS <- read.csv('DailyPageviews.csv',
                  header = T,
                  row.names = 1)
  dailyPageviewsSet <- rbind(dPS, as.data.frame(dailyPageviewsSet))
  write.csv(dailyPageviewsSet, 'DailyPageviews.csv')
} else {
  write.csv(dailyPageviewsSet, 'DailyPageviews.csv')
}

# - User registrations
userRegistrations <- read.delim('summerBC2018_userRegistrations.tsv', 
                                sep = "\t",
                                header = T,
                                quote = "", 
                                stringsAsFactors = F)
userRegistrations <- userRegistrations[grepl('is_bot": false', userRegistrations$userAgent, fixed = T), ]
userRegistrations$timestamp <- as.character(userRegistrations$timestamp)
userRegistrations$timestamp <- as.POSIXct(userRegistrations$timestamp, tz = "UTC", format = "%Y%m%d%H%M%S") 
# - CEST correction on POSIXct class, +2hours:
userRegistrations$timestamp <- userRegistrations$timestamp + 2*60*60
# - wrangle
userRegistrations$CampaignDay <- sapply(userRegistrations$timestamp, function(x) {
  paste(c(
    substr(x, 1,4),
    substr(x, 6,7),
    substr(x, 9,10)
    ),
    collapse = "-",
    sep = "")
})
userRegistrations <- userRegistrations[which(userRegistrations$CampaignDay %in% paste(today, collapse = "-", sep = "")), ]
dailyUserRegSet <- userRegistrations %>%
  select(event_campaign) %>%
  group_by(event_campaign) %>%
  summarise(Count = n())
dailyUserRegSet$CampaignDay <- paste(unlist(today), collapse = "-", sep = "")
# - Daily User Registrations
lF <- list.files()
if ('dailyUserRegistrations.csv' %in% lF) {
  dPS <- read.csv('dailyUserRegistrations.csv', 
                  header = T, 
                  row.names = 1)
  dailyUserRegSet <- rbind(dPS, dailyUserRegSet)
  write.csv(dailyUserRegSet, 'dailyUserRegistrations.csv')
} else {
  write.csv(dailyUserRegSet, 'dailyUserRegistrations.csv')
}
```

### 0.1 Post-Campaign Analytics

**NOTE:** Training Module data are obtained directly from the application maintainer. The following code retrieves all user registrations and edits.

```{r, echo = T, eval = F}
### --- User registrations

### --- Libraries
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

# - ServerSideAccountCreation_5487345
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_17719237 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20180729000000) and ((event_campaign like '%WMDE_neweditors_summer2018_1%') or (event_campaign like '%WMDE_neweditors_summer2018_2%') or (event_campaign like '%WMDE_neweditors_summer2018_3%')));\" > /home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/summerBC2018_userRegistrations_FULL.tsv"
system(command = qCommand, wait = TRUE)

### --- User edits
# - get user IDs from registered:
userReg <- read.table('/home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/summerBC2018_userRegistrations_FULL.tsv', 
                      quote = "",
                      sep = "\t",
                      header = T,
                      check.names = F,
                      stringsAsFactors = F)
userReg <- userReg %>% 
  dplyr::select(event_userId, event_isSelfMade) %>% 
  filter(event_isSelfMade == 1)
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                  paste(uid, collapse = ", "),
                  ') GROUP BY rev_user;',
                  sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                      paste('"', sqlQuery, '" > ', sep = ""),
                      '/home/goransm/RScripts/NewEditors/2018_SummerBannerCampaign/sBC2018_userEdits_FULL.tsv', sep = "")
system(command = mySqlCommand, 
       wait = TRUE)
```

## 1. Campaign Banners and Pages

This section presents all data and statistics on the campaign banners and pages.

### 1.1 Banner Impressions

**Chart 1. 1. Daily Banner Impressions **

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
dataSet <- read.csv('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/DailyBannerImpressions.csv',
                    header = T,
                    row.names = 1,
                    check.names = F,
                    stringsAsFactors = F)

# - banner impression rate == .01
# - dataSet$Count <- dataSet$Count * (1/.01)

# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = CampaignDay,
                    y = Count,
                    group = Banner,
                    color = Banner,
                    fill = Banner,
                    label = Count,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  scale_y_continuous(labels = comma) +
  ggtitle('Summer Banner Campaign 2018: Banner Impressions') +
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank())
```

**Table 1. 1. Daily Banner Impressions **

```{r echo = T, warning = 'hide', message = F}
### --- Full Dataset (Table Report)
datatable(dataSet)
```

**Table 1. 1. 1 Total Banner Impressions per Banner **

```{r echo = T, warning = 'hide', message = F}
### --- Summary Stats for Banner Impressions
totalImpressions <- dataSet %>% 
  group_by(Banner) %>% 
  summarise(totalImpressions = sum(Count))
datatable(totalImpressions)
```

### 1.2 Banner Clicks (Pageviews)

**Chart 1. 2. Daily Banner Clicks (Landing Page Views) **

The `uri_query` field defines the source of the respective pageview.

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
dataSet <- read.csv(paste('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/DailyPageviews.csv'),
                    header = T,
                    check.names = F,
                    row.names = 1,
                    stringsAsFactors = F)
dataSet$uri_path <- gsub("/wiki/Wikipedia:Wikimedia_Deutschland/", "", dataSet$uri_path)
dataSet$uri_query <- gsub("?campaign=", "", dataSet$uri_query, fixed = T)
dataSet <- dataSet %>% 
  filter(!(grepl("WMDE_2018_sprbt1|WMDE_2018_sprbt2|wmde_etc2017_bt1", uri_query)))
dataSet$uri_query[which(dataSet$uri_query == "")] <- "No referer"
dataSet$uri_query[which(grepl("^WMDE_neweditors_summer2018_1", dataSet$uri_query))] <- "WMDE_neweditors_summer2018_1"
dataSet$uri_query[which(grepl("^WMDE_neweditors_summer2018_2", dataSet$uri_query))] <- "WMDE_neweditors_summer2018_2"
dataSet$uri_query[which(grepl("^WMDE_neweditors_summer2018_3", dataSet$uri_query))] <- "WMDE_neweditors_summer2018_3"
dataSet <- dataSet %>% 
  group_by(uri_query, uri_path, CampaignDay) %>% 
  summarise(Count = sum(Count))
# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = CampaignDay,
                    y = Count,
                    group = uri_query,
                    color = uri_query,
                    fill = uri_query,
                    label = Count)) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  scale_y_continuous(labels = comma) + 
  facet_wrap(~ uri_path, scales = "free") + 
  ggtitle('Summer Banner Campaign 2018: Banner Cliks (Pageviews)') +
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.position =  'right') + 
  theme(legend.text =  element_text(size = 7))
```


**Table 1. 2. Daily Banner Clicks (Landing Page Views) **

```{r echo = T, warning = 'hide', message = F}
### --- Full Dataset (Table Report)
dataSet <- read.csv(paste('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/DailyPageviews.csv'),
                    header = T,
                    check.names = F,
                    row.names = 1,
                    stringsAsFactors = F)
dataSet$uri_path <- gsub("/wiki/Wikipedia:Wikimedia_Deutschland/", "", dataSet$uri_path)
dataSet$uri_query <- gsub("?campaign=", "", dataSet$uri_query, fixed = T)
dataSet <- dataSet %>% 
  filter(!(grepl("WMDE_2018_sprbt1|WMDE_2018_sprbt2|wmde_etc2017_bt1", uri_query)))
dataSet$uri_query[which(dataSet$uri_query == "")] <- "No referer"
dataSet$uri_query[which(grepl("^WMDE_neweditors_summer2018_1", dataSet$uri_query))] <- "WMDE_neweditors_summer2018_1"
dataSet$uri_query[which(grepl("^WMDE_neweditors_summer2018_2", dataSet$uri_query))] <- "WMDE_neweditors_summer2018_2"
dataSet$uri_query[which(grepl("^WMDE_neweditors_summer2018_3", dataSet$uri_query))] <- "WMDE_neweditors_summer2018_3"
dataSet <- dataSet %>% 
  group_by(uri_query, uri_path, CampaignDay) %>% 
  summarise(Count = sum(Count))
datatable(dataSet)
```

**Table 1. 2. 1 Total Banner Clicks/Pageviews per Banner/Source **

```{r echo = T, warning = 'hide', message = F}
### --- Summary Stats for Banner Impressions
totalClicks <- dataSet %>% 
  group_by(uri_query) %>% 
  summarise(totalClicks = sum(Count)) %>% 
  arrange(desc(totalClicks))
datatable(totalClicks)
```

### 1.3 Banner Clicks/Impressions Clicks (Pageviews)

**Chart 1. 3. The Clicks/Impressions ratio per banner**

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
impressions <- read.csv('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/DailyBannerImpressions.csv',
                    header = T,
                    row.names = 1,
                    check.names = F,
                    stringsAsFactors = F)
# - adjust for banner impressions sample rate
clicks <- read.csv(paste('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/DailyPageviews.csv'),
                    header = T,
                    check.names = F,
                    row.names = 1,
                    stringsAsFactors = F)
clicks$uri_query[which(clicks$uri_query == "")] <- "No referer"
clicks$uri_query <- gsub("?campaign=", "", clicks$uri_query, fixed = T)
clicks <- clicks %>% 
  filter(!(grepl("WMDE_2018_sprbt1|WMDE_2018_sprbt2|wmde_etc2017_bt1", uri_query)))
clicks$uri_query[grepl("^WMDE_neweditors_summer2018_1", clicks$uri_query)] <- 'B18WMDE_authors_1'
clicks$uri_query[grepl("^WMDE_neweditors_summer2018_2", clicks$uri_query)] <- 'B18WMDE_authors_2'
clicks$uri_query[grepl("^WMDE_neweditors_summer2018_3", clicks$uri_query)] <- 'B18WMDE_authors_3'
clicks <- clicks %>% 
  group_by(uri_query, CampaignDay) %>% 
  summarise(Count = sum(Count))
# - join impressions and clicks:  
dataSet <- left_join(impressions, clicks, 
                     by = c("Banner" = "uri_query", "CampaignDay"))
dataSet$`Clicks/Impressions` <- round(dataSet$Count.y/dataSet$Count.x, 5)
# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = CampaignDay,
                    y = `Clicks/Impressions`,
                    group = Banner,
                    color = Banner,
                    fill = Banner,
                    label = `Clicks/Impressions`)) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  scale_y_continuous(labels = comma) +
  ggtitle('Summer Banner Campaign 2018: Banner Clicks/Impressions') +
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank())
```

**Table 1. 3. The Clicks/Impressions ratio per banner**

```{r echo = T, warning = 'hide', message = F}
### --- Full Dataset (Table Report)
datatable(dataSet[, c(1, 3, 5)])
```

**Table 1. 3. 1 Total Clicks/Impressions ratio per banner**

```{r echo = T, warning = 'hide', message = F}
### --- Full Dataset (Table Report)
totalImpClicks <- dataSet %>% 
  select(Banner, Count.x, Count.y) %>% 
  group_by(Banner) %>% 
  summarise(Impressions = sum(Count.x), Clicks = sum(Count.y))
totalImpClicks$`Clicks/Impressions` <- round(totalImpClicks$Clicks/totalImpClicks$Impressions, 5)
datatable(totalImpClicks)
```

### 1.4 User Registrations

**Chart 1. 4. Daily user registrations per banner**

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
dataSet <- read.delim('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/summerBC2018_userRegistrations_FULL.tsv',
                    header = T,
                    check.names = F,
                    sep = "\t",
                    quote = "",
                    stringsAsFactors = F)
dataSet <- dataSet[grepl('is_bot": false', dataSet$userAgent, fixed = T), ]
dataSet$timestamp <- as.character(dataSet$timestamp)
dataSet$timestamp <- as.POSIXct(dataSet$timestamp, tz = "UTC", format = "%Y%m%d%H%M%S") 
# - CEST correction on POSIXct class, +2hours:
dataSet$timestamp <- dataSet$timestamp + 2*60*60
# - Campaign day:
dataSet$CampaignDay <- sapply(dataSet$timestamp, function(x) {
  paste(c(
    substr(x, 1,4),
    substr(x, 6,7),
    substr(x, 9,10)
    ),
    collapse = "-",
    sep = "")
})
dataSet <- filter(dataSet,
                  CampaignDay %in% c("2018-08-01", 
                                     "2018-08-02",
                                     "2018-08-03",
                                     "2018-08-04",
                                     "2018-08-05",
                                     "2018-08-06",
                                     "2018-08-07",
                                     "2018-08-08"))
userReg <- dataSet %>% 
  select(event_campaign, CampaignDay) %>% 
  group_by(CampaignDay, event_campaign) %>% 
  summarise(Count = n())
colnames(userReg)[2] <- "Banner"

# - Visualize w. {ggplot2}
ggplot(userReg, aes(x = CampaignDay,
                    y = Count,
                    group = Banner,
                    color = Banner,
                    fill = Banner,
                    label = Count)) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  scale_y_continuous(labels = comma) +
  ggtitle('Summer Banner Campaign 2018: User Registrations') +
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank())
```

**Table 1. 4. Daily user registrations per banner**

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
datatable(userReg)
```

**Table 1. 4. 1 Banner Conversion Rates**

```{r echo = T, eval = T, warning = 'hide', message = F}
### --- Banner Conversion Rates
totalRegs <- userReg[, 2:3] %>% 
  select(Banner, Count) %>%
  group_by(Banner) %>%
  summarise(totalReg = sum(Count))
totalRegs$Banner[grepl("WMDE_neweditors_summer2018_1", totalRegs$Banner)] <- 'B18WMDE_authors_1'
totalRegs$Banner[grepl("WMDE_neweditors_summer2018_2", totalRegs$Banner)] <- 'B18WMDE_authors_2'
totalRegs$Banner[grepl("WMDE_neweditors_summer2018_3", totalRegs$Banner)] <- 'B18WMDE_authors_3'
totalClicks$uri_query[grepl("WMDE_neweditors_summer2018_1", totalClicks$uri_query)] <- 'B18WMDE_authors_1'
totalClicks$uri_query[grepl("WMDE_neweditors_summer2018_2", totalClicks$uri_query)] <- 'B18WMDE_authors_2'
totalClicks$uri_query[grepl("WMDE_neweditors_summer2018_3", totalClicks$uri_query)] <- 'B18WMDE_authors_3'
totalClicks <- totalClicks %>% 
  group_by(uri_query) %>% 
  summarise(totalClicks = sum(totalClicks))
totalRegs <- left_join(totalRegs, totalClicks,
                       by = c("Banner" = "uri_query"))
totalRegs$`Registrations/Clicks` <- round(totalRegs$totalReg/totalRegs$totalClicks, 5)
datatable(totalRegs)
```

### 1.5 User Edits

**Chart 1. 5. Edits of registered users per banner and campaign day of registration**

**NOTE:** the Campaign Day here refers to the day when the user has registered, not the day in which the respective edits were made.

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
dataSet2 <- read.delim('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/sBC2018_userEdits_FULL.tsv',
                    header = T,
                    check.names = F,
                    sep = "\t",
                    quote = "",
                    stringsAsFactors = F)
dataSet <- left_join(dataSet, dataSet2, by = c("event_userId" = "rev_user"))
dataSet$edits[is.na(dataSet$edits)] <- 0

userEds <- dataSet %>% 
  select(event_campaign, CampaignDay, edits) %>% 
  group_by(CampaignDay, event_campaign) %>% 
  summarise(Edits = sum(edits))
colnames(userEds)[2] <- "Banner"

# - Visualize w. {ggplot2}
ggplot(userEds, aes(x = CampaignDay,
                    y = Edits,
                    group = Banner,
                    color = Banner,
                    fill = Banner,
                    label = Edits)) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  scale_y_continuous(labels = comma) +
  ggtitle('Summer Banner Campaign 2018: User Edits') +
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank())
```

**Table 1. 5. Edits of registered users per banner and campaign day of registration**

```{r echo = T, warning = 'hide', message = F}
### --- Full Dataset (Table Report)
datatable(userEds)
```


**Table 1. 5. 1 Edits of registered users per banner and campaign day of registration**

Total number of user edits per banner:

```{r echo = T, warning = 'hide', message = F}
totalEdits <- userEds %>% 
  group_by(Banner) %>% 
  summarise(Edits = sum(Edits))
datatable(totalEdits)
```

**Table 1. 5. 2 Users reaching their 10th edit per banner**

```{r echo = T, eval = T, warning = 'hide', message = F}
edit10 <- dataSet %>% 
  filter(edits >= 10) %>% 
  select(event_campaign) %>% 
  group_by(event_campaign) %>%
  summarise(`edit10th` = n())
datatable(edit10)
```

**Table 1. 5. 3 Full user edits distribution**

```{r echo = T, eval = T, warning = 'hide', message = F}
### --- Full Dataset (Table Report)
pltEdits <- as.tbl(dataSet) %>% 
  dplyr::group_by(edits) %>% 
  count()
colnames(pltEdits) <- c('Edits', 'Num.users')
knitr::kable(pltEdits, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```


**Table 1. 5. 4 Users edit categories**

```{r echo = T, eval = T, warning = 'hide', message = F}
edits0 <- pltEdits$`Num.users`[pltEdits$Edits == 0]
edits <- sum(pltEdits$`Num.users`[pltEdits$Edits > 0])
edits1 <- sum(pltEdits$`Num.users`[pltEdits$Edits == 1])
edits2_4 <- sum(pltEdits$`Num.users`[pltEdits$Edits >= 2 & pltEdits$Edits <= 4])
edits5_10 <- sum(pltEdits$`Num.users`[pltEdits$Edits >= 5 & pltEdits$Edits <= 10])
edits10 <- sum(pltEdits$`Num.users`[pltEdits$Edits > 10])
editClasses <- data.frame(`No edits` = edits0,
                          `Edited` = edits,
                          `1 edit` = edits1,
                          `2 - 4 edits` = edits2_4,
                          `5 - 10 edits` = edits5_10, 
                          `> 10 edits` = edits10,
                          check.names = F,
                          stringsAsFactors = F)
knitr::kable(editClasses, format = "html") %>%
  kable_styling(full_width = F, position = "left")
```

## 2 Training Modules

```{r echo = T, eval = T, warning = 'hide', message = F}
trainMod <- read.csv('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_SummerBannerCampaign/_data/wmde_training_data_2018-08.csv',
                    header = T,
                    check.names = F,
                    stringsAsFactors = F)
# - remove users Stefan Schneider (WMDE) and Sage (Wiki Ed)
trainMod <- trainMod %>% 
  filter(!(username %in% c('Stefan Schneider (WMDE)', 'Sage (Wiki Ed)')))
# - data types
trainMod$module_completion_date <- as.character(trainMod$module_completion_date)
trainMod$module_completion_date <- as.POSIXct(trainMod$module_completion_date, tz = "UTC", format = "%Y-%m-%d %H:%M:%S") 
# - CEST correction on POSIXct class, +2hours:
trainMod$module_completion_date <- trainMod$module_completion_date + 2*60*60

trainMod$started_at <- as.character(trainMod$started_at)
trainMod$started_at <- as.POSIXct(trainMod$started_at, tz = "UTC", format = "%Y-%m-%d %H:%M:%S") 
# - CEST correction on POSIXct class, +2hours:
trainMod$started_at <- trainMod$started_at + 2*60*60

trainMod$last_slide_completed_at <- as.character(trainMod$last_slide_completed_at)
trainMod$last_slide_completed_at <- as.POSIXct(trainMod$last_slide_completed_at, tz = "UTC", format = "%Y-%m-%d %H:%M:%S") 
# - CEST correction on POSIXct class, +2hours:
trainMod$last_slide_completed_at <- trainMod$last_slide_completed_at + 2*60*60

# - filter for campaign days only
trainMod <- filter(trainMod, 
                   started_at >= "2018-08-01 00:00:00" & started_at < "2018-08-09 00:00:00")

# - filter campaign training modules only
# trainMod <- filter(trainMod,
                   # training_module %in% c('diskutieren-basiswissen', 'editieren-basiswissen'))


# - left_join w. dataSet (user registrations + user edits)
# - to produce trainMod that has user registrations + edits
trainMod <- left_join(trainMod, dataSet,
                      by = c("username" = "event_userName"))
# - filter those who did not register with the campaign:
trainMod <- filter(trainMod, !is.na(CampaignDay))

# - trainMod aggregates 1: number of training modules started and completed per registered user
modulesStarted <- as.data.frame(table(trainMod$username))
colnames(modulesStarted) <- c('username', 'modulesStarted')
modulesCompleted <- filter(trainMod, !is.na(module_completion_date))
modulesCompleted <- as.data.frame(table(modulesCompleted$username))
colnames(modulesCompleted) <- c('username', 'modulesCompleted')
modulesData <- left_join(modulesStarted, modulesCompleted, by = 'username')
modulesData$modulesCompleted[is.na(modulesData$modulesCompleted)] <- 0

# - left_join dataSet w. modulesData
dataSet <- left_join(dataSet, modulesData, 
                     by = c("event_userName" = "username"))
dataSet$modulesStarted[is.na(dataSet$modulesStarted)] <- 0
dataSet$modulesCompleted[is.na(dataSet$modulesCompleted)] <- 0

# - produce aggregates for reporting on training modules from dataSet
# - agg1: tM_started
tM_started <- dataSet %>% 
  select(modulesStarted, edits) %>% 
  group_by(modulesStarted) %>% 
  summarise(sumEdits = sum(edits), users = n())
tM_started$editsPerUser <- round(tM_started$sumEdits/tM_started$users, 2)

# - agg1: tM_completed
tM_completed <- dataSet %>% 
  select(modulesCompleted, edits) %>% 
  group_by(modulesCompleted) %>% 
  summarise(sumEdits = sum(edits), users = n())
tM_completed$editsPerUser <- round(tM_completed$sumEdits/tM_completed$users, 2)

# - agg3: tm_started_perModule
tm_started_perModule <- trainMod %>% 
  select(training_module) %>% 
  group_by(training_module) %>% 
  summarise(Count = n())

# - agg4: tm_completed_perModule
tm_completed_perModule <- trainMod %>%
  filter(!is.na(module_completion_date)) %>%
  select(training_module) %>%
  group_by(training_module) %>% 
  summarise(Count = n())

# - agg5(1-2): exit slides per module
exitSlides1 <- data.frame(t(table(
  trainMod$training_module[trainMod$training_module %in% "diskutieren-basiswissen"], 
  trainMod$last_slide_completed[trainMod$training_module %in% "diskutieren-basiswissen"])))
colnames(exitSlides1) <- c('Slide', 'Module', 'Count')
exitSlides2 <- data.frame(t(table(
  trainMod$training_module[trainMod$training_module %in% "editieren-basiswissen"], 
  trainMod$last_slide_completed[trainMod$training_module %in% "editieren-basiswissen"])))
colnames(exitSlides2) <- c('Slide', 'Module', 'Count')

# - agg6(1-2): time spent in training
trainMod$Completed <- !is.na(trainMod$module_completion_date)
timeInTraining <- trainMod %>% 
  select(Completed, training_module, started_at, last_slide_completed_at) %>% 
  mutate(timeSpent = as.numeric((last_slide_completed_at - started_at)/60)) %>%
  select(Completed, training_module, timeSpent) %>%
  group_by(training_module, Completed) %>%
  summarise(`Time (mins)` = mean(round(timeSpent, 2)))

# - agg7: distribution of time spent in training
timeInTrainingDist <- trainMod %>%
  select(Completed, training_module, started_at, last_slide_completed_at) %>%
  mutate(timeSpent = as.numeric((last_slide_completed_at - started_at)/60))
```


**Table 2. 1 Number of users who have started a particular number of training modules (modulesStarted), the total number of edits that they have made (sumEdits), number of users who took a particular number of training modules (users), and the edits per user ratio. **

```{r echo = T, warning = 'hide', message = F}
datatable(tM_started)
```

**Table 2. 2 Number of users who have completed a particular number of training modules (modulesStarted), the total number of edits that they have made (sumEdits), number of users who took a particular number of training modules (users), and the edits per user ratio. **


```{r echo = T, warning = 'hide', message = F}
datatable(tM_completed)
```

**Table 2. 3 Number of users who have started a particular training module. **

```{r echo = T, warning = 'hide', message = F}
datatable(tm_started_perModule)
```

**Table 2. 4 Number of users who have completed a particular training module. **

```{r echo = T, warning = 'hide', message = F}
datatable(tm_completed_perModule)
```

**Table 2. 5a Exit slides per training module and the number of users who have exited at a particular slide. **

```{r echo = T, warning = 'hide', message = F}
datatable(exitSlides1)
```
**Table 2. 5b Exit slides per training module and the number of users who have exited at a particular slide. **

```{r echo = T, warning = 'hide', message = F}
datatable(exitSlides2)
```
**Table 2. 6 Average Time spent in training per training module. **

```{r echo = T, warning = 'hide', message = F}
datatable(timeInTraining)
```
**Chart 2. 7 The distribution of time spent in training per training module. **

```{r echo = T, warning = 'hide', message = F}
ggplot(data = timeInTrainingDist, 
       aes(x = Completed, y = timeSpent, color = Completed)) +
  geom_point() +
  geom_jitter() +
  facet_wrap(~training_module) + 
  theme_bw()
```

