Feedback should be send to goran.milovanovic_ext@wikimedia.de.

The campaign was run from 2020/05/14 to 2020/06/30.

CURRENT UPDATE: Complete dataset as of 2020/06/30.

0. Data Acquisiton

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script 2020_EmailCampaignWikipediaChallenge_PRODUCTION.R on stat1004.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally. A daily crontab job was run from 2020/05/14 to 2020/06/30 to collect the data for daily reporting. The data used in this report are aggregates of the daily datasets, sanitized and anonymized.

0.1 Daily Update

### --- WMDE 2020_EmailCampaignWikipediaChallenge_PRODUCTION.R
### --- Campaign start: 2020/05/14
### --- Campaign end: 2020/06/30
### --- run from: stat1004
### --- path: /home/goransm/Analytics/NewEditors/Campaigns/2020_EmailCampaignWikipediaChallenge/

### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)

### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/Campaigns/2020_EmailCampaignWikipediaChallenge/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
### --- campaign specifics
campaignName <- 'Email_Campaign_Wikipedia_Challenge2020'

### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
  strsplit(as.character(cetDay), 
           split = " ", 
           fixed = T)[[1]][1]
) - 1

cetDay <- "2020-06-30"

### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------

# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  # - WHERE condition: create uri_host_condition
  if (length(uri_host) > 1) {
    uri_host_condition <- paste0("(",
                                  paste(
                                    paste0("uri_host = '", uri_host, "'"),
                                    collapse = " OR ", sep = " "),
                                  ")"
    )
  } else {
    uri_host_condition = paste0("uri_host = '", uri_host, "'")
  }
  
  # - WHERE condition: create uri_path_condition
  if (length(uri_path) > 1) {
    uri_path_condition <- paste0("(",
                                 paste(
                                   paste0("uri_path = '", uri_path, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_path_condition = paste0("uri_path = '", uri_path, "'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "USE wmf;
    SELECT uri_host, uri_path, uri_query, referer FROM webrequest
    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
    )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
}

# - set params to wmde_collect_pageviews
# - for the 2020_EmailCampaignWikipediaChallenge
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org',
              'wikidata.org', 'm.wikidata.org',
              'commons.wikimedia.org', 'commons.m.wikimedia.org',
              'de.wikivoyage.org', 'de.m.wikivoyage.org',
              'de.wiktionary.org', 'de.m.wiktionary.org')
uri_path  <- c(
  '/wiki/Langer_Grund-Kohlberg?tour=einfuhrung',
  '/wiki/Spezial:Benutzerkonto_anlegen',
  '/wiki/Wikidata:Introduction/de',
  '/wiki/Spezial:Anmelden',
  '/wiki/Special:MyPage/Artikelwerkstatt',
  '/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training',
  '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur',
  '/w/index.php?title=Q16943273&tour=wbitems&data=ok',
  '/w/index.php?title=Special:UploadWizard&uselang=de',
  '/wiki/Hauptseite',
  '/wiki/Wiktionary:Hauptseite',
  '/wiki/Commons:Willkommen')
# uri_query <- paste0('WMDE_2020_challenge_', 1:30)
queryFile <- 'emailFollowUp2019_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")

# - collect Pageviews data
wmde_collect_pageviews(uri_host,
                       uri_path,
                       cetDay,
                       queryFile,
                       fileName,
                       dataDir)

### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
                                   dataDir, 
                                   uri_query_filter,
                                   page_filter,
                                   cetDay = cetDay,
                                   campaignName = campaignName) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  
  # - load
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("uri_host", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 2):(length(pageviewsData) - 2)]
  pageviewsData <- data.frame(dat = pageviewsData, 
                              stringsAsFactors = F)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  # - apply page_filter
  pageviewsData$page <- paste0(pageviewsData$uri_host, 
                               pageviewsData$uri_path,
                               pageviewsData$uri_query)
  wFilter <- sapply(pageviewsData$page, function(x) {
    sapply(page_filter, function(y) {
      grepl(y, x)
    })
  })
  wFilter <- colSums(wFilter)
  wFilter <- which(wFilter > 0)
  pageviewsData <- pageviewsData[wFilter, ]
  # - apply uri_query_filter
  # - NOTE: looking in both: uri_query, referer 
  w_uri_query <- which(grepl(uri_query_filter, pageviewsData$page))
  
  if (length(w_uri_query) > 0) {
    
    # - filter for w_uri_query
    pageviewsData <- pageviewsData[w_uri_query, ] 
    
    # - aggregate:
    pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
    pageviewsData$uri_host <- NULL
    pageviewsData$page <- NULL
    pageviewsData$referer <- NULL
    pageviewsData <- pageviewsData %>% 
      dplyr::select(uri_query, uri_path) %>% 
      dplyr::group_by(uri_query, uri_path) %>% 
      dplyr::summarise(pageviews = n())
    colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
    
    # - add cetDay, campaignName
    pageviewsData$date <- cetDay
    pageviewsData$campaign <- campaignName
    
    # - store:
    write.csv(pageviewsData, 
              paste0(analyticsDir, 
                     "pageviewsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    
  }
  
}

# - set params to wmde_process_pageviews
# - for the WMDE 2020_EmailCampaignWikipediaChallenge
uri_query_filter <- 'WMDE_2020_challenge_'
page_filter <- c('de.wikipedia.org/wiki/Langer_Grund-Kohlberg?tour=einfuhrung', 
                 'de.wikipedia.org/wiki/Spezial:Benutzerkonto_anlegen', 
                 'wikidata.org/wiki/Wikidata:Introduction/de', 
                 'de.wikipedia.org/wiki/Spezial:Anmelden', 
                 'de.wikipedia.org/wiki/Spezial:Anmelden', 
                 'de.wikipedia.org/wiki/Special:MyPage/Artikelwerkstatt', 
                 'de.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training', 
                 'de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur', 
                 'www.wikidata.org/w/index.php?title=Q16943273&tour=wbitems&data=ok', 
                 'commons.wikimedia.org/w/index.php?title=Special:UploadWizard&uselang=de', 
                 'commons.wikimedia.org/wiki/Commons:Willkommen',
                 'de.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training', 
                 'de.wikivoyage.org/wiki/Hauptseite', 
                 'de.wiktionary.org/wiki/Wiktionary:Hauptseite', 
                 'de.m.wikipedia.org/wiki/Langer_Grund-Kohlberg?tour=einfuhrung', 
                 'de.m.wikipedia.org/wiki/Spezial:Benutzerkonto_anlegen', 
                 'm.wikidata.org/wiki/Wikidata:Introduction/de', 
                 'de.m.wikipedia.org/wiki/Spezial:Anmelden', 
                 'de.m.wikipedia.org/wiki/Spezial:Anmelden', 
                 'de.m.wikipedia.org/wiki/Special:MyPage/Artikelwerkstatt', 
                 'de.m.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training', 
                 'de.m.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur', 
                 'www.wikidata.org/w/index.php?title=Q16943273&tour=wbitems&data=ok', 
                 'commons.m.wikimedia.org/w/index.php?title=Special:UploadWizard&uselang=de',
                 'commons.m.wikimedia.org/wiki/Commons:Willkommen',
                 'de.m.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training', 
                 'de.m.wikivoyage.org/wiki/Hauptseite', 
                 'de.m.wiktionary.org/wiki/Wiktionary:Hauptseite')

# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
                       dataDir = dataDir,
                       uri_query_filter = uri_query_filter, 
                       page_filter = page_filter,
                       cetDay = cetDay,
                       campaignName = campaignName)

### ----------------------------------------------------------
### --- User Registrations
### --- HiveQL query: event.ServerSideAccountCreation table
### ----------------------------------------------------------
hiveQL <-
  "SELECT year, month, day, hour,  event.campaign, event.userId, event.userName, webhost, wiki
      FROM event.serversideaccountcreation
      WHERE
        year = 2020
        AND (month = 5 OR month = 6)
        AND (event.campaign LIKE '%WMDE_2020_challenge_%');"
# - write hql
write(hiveQL, paste0(dataDir, 'user_registrations.hql'))
### --- output filename
filename <- paste0(dataDir, 'user_registrations.tsv')
### --- execute hql script:
hiveArgs <- 'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata /usr/local/bin/beeline --silent --incremental=true --verbose=false -f'
hiveInput <- paste(paste0(dataDir, 'user_registrations.hql'),
                   " > ",
                   filename,
                   sep = "")
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)

### ----------------------------------------------------------
### --- User Edits
### --- User Edits via revision_actor_temp
### ----------------------------------------------------------

startTimestamp <- '20200514000000'
# - get user ids
userRegistrations <- fread(paste0(dataDir,
                                  'user_registrations.tsv'))
dim(userRegistrations)
# - clean up from test accounts
userRegistrations <- filter(userRegistrations, 
                            !grepl("Test E-Mail Campaign 2020", 
                                   userRegistrations$username))
dim(userRegistrations)
head(userRegistrations)
rev_user <- userRegistrations$userid
# - iterate over rev_user
for (i in 1:length(rev_user)) {
  # - SQL query
  sqlQuery <- paste("\"SELECT actor.actor_id, 
                        actor.actor_user, 
                        actor.actor_name, 
                        revision_actor_temp.revactor_timestamp 
                    FROM actor 
                    LEFT JOIN revision_actor_temp ON (actor.actor_id = revision_actor_temp.revactor_actor) 
                    WHERE (revision_actor_temp.revactor_timestamp >= 20200511000000 
                      AND actor.actor_user = ", rev_user[i], ");\"");
  ### --- output filename
  filename <- paste(dataDir,'userEdits', "_", i, ".tsv", sep = "")
  ### --- execute sql script:
  sqlLogInPre <- paste0('/usr/local/bin/analytics-mysql dewiki -e ')
  sqlInput <- paste(sqlQuery,
                    " > ",
                    filename,
                    sep = "")
  # - command:
  sqlCommand <- paste(sqlLogInPre, sqlInput)
  system(command = sqlCommand, wait = TRUE)
  # - report
  print(paste0("DONE: user ", i, "."))
}
### --- END run SQL scripts
# - load user edits:
lF <- list.files(dataDir)
lF <- lF[grepl("^userEdits_", lF)]
userEdits <- lapply(paste0(dataDir, lF), fread)
userEdits <- rbindlist(userEdits)
# - store user edits:
write.csv(userEdits, paste0(analyticsDir, 'userEdits.csv'))

1. Campaign Pageviews

This section presents all data and statistics on the campaign pages.

The following chunk loads and then re-structures the dataset a bit.

lF <- list.files('_analytics')
lF <- lF[grepl("^pageviewsAggregated", lF)]
dataSet <- lapply(paste0("_analytics/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - expand grid to account for missing observations per day
dS <- expand.grid(unique(dataSet$Tag), 
                  unique(dataSet$Page), 
                  unique(dataSet$date), 
                  stringsAsFactors = F)
colnames(dS) <- c('Tag', 'Page', 'date')
dS <- dS %>% 
  left_join(dataSet, 
            by = c("Tag", "Page", "date"))
dataSet <- dS; rm(dS)
dataSet$Pageviews[is.na(dataSet$Pageviews)] <- 0
dataSet$Tag <- gsub("&.+$", "", dataSet$Tag)
dataSet$Tag <- gsub("\\?campaign=", "", dataSet$Tag)
dataSet$date <- sapply(dataSet$date, function(x) {
  d <- strsplit(x, split = "-")[[1]]
  if (nchar(d[3]) == 1) {
    d[3] <- paste0("0", d[3])
  }
  return(paste(d, collapse = "-"))
})

1.1 Pageviews Overview

Chart 1.1.1 Daily Pageviews, aggregated across the campaign channels.

pFrame <- dataSet %>% 
  select(date, Page, Pageviews) %>% 
  group_by(date, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  facet_wrap(~Page, ncol = 2) + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") + 
  theme_minimal() + 
  # geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

Table 1.1.1 Pageviews totals

tFrame <- pFrame %>% 
  select(Page, Pageviews) %>% 
  group_by(Page) %>% 
  summarise(totalPageviews = sum(Pageviews)) %>% 
  arrange(desc(totalPageviews))
datatable(tFrame)

1.2 Pageviews: Campaign Channels

Chart 1.2.1 Pageviews, by channels

pFrame <- dataSet %>% 
  select(date, Tag, Pageviews) %>% 
  group_by(date, Tag) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  facet_wrap(~Tag, ncol = 2) + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") + 
  theme_minimal() + 
  # geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

Chart 1.2.2 Total Pageviews, by channels

pFrame <- pFrame %>% 
  select(Tag, Pageviews) %>% 
  group_by(Tag) %>% 
  summarise(Pageviews = sum(Pageviews)) %>% 
  arrange(desc(Pageviews))
pFrame$Tag <- factor(pFrame$Tag,
                     levels = pFrame$Tag,
                     ordered = T)
ggplot(pFrame, aes(x = Tag,
                   y = Pageviews,
                   label = Pageviews,
                    )) + 
  geom_bar(stat = "identity", color = "blue", fill = "white") + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

Table 2.2.1 Total pageviews, by channels and by pages.

tFrame <- dataSet %>% 
  select(Tag, Page, Pageviews) %>% 
  group_by(Tag, Page) %>% 
  summarise(totalPageviews = sum(Pageviews)) %>% 
  arrange(desc(totalPageviews))
datatable(tFrame)

2. User Registrations

This section presents all data and statistics on the user registrations.

2.1 User Registrations Overview

lF <- list.files("_analytics")
lF <- lF[grepl("^user_registrations", lF)]
dataSet <- fread(paste0("_analytics/", lF))
dataSet <- filter(dataSet, 
                  !grepl("^Test E-Mail Campaign", dataSet$username))
dataSet$date <- paste(dataSet$year, 
                      ifelse(nchar(dataSet$month) == 1, paste0("0", dataSet$month), dataSet$month),
                      ifelse(nchar(dataSet$day) == 1, paste0("0", dataSet$day), dataSet$day),
                      sep = "-")
regUsers <- select(dataSet,
                   userid,
                   username, 
                   campaign)
# - expand grid to account for missing observations per day
dateSpan <- seq(from = as.Date("2020-05-14"), 
                to = as.Date("2020-06-30"), 
                by = "day")
dateSpan <- as.character(dateSpan)
dS <- expand.grid(dateSpan, 
                  unique(dataSet$campaign), 
                  stringsAsFactors = F)
colnames(dS) <- c('date', 'campaign')
dS <- dS %>% 
  left_join(select(dataSet, date, campaign, userid),
            by = c("date", "campaign"))
dS$userid <- ifelse(is.na(dS$userid), 0, 1)
pFrame <- dS %>% 
  select(date, userid) %>% 
  group_by(date) %>% 
  summarise(registrations = sum(userid)) %>% 
  arrange(date)
ggplot(pFrame, aes(x = date,
                   y = registrations,
                   label = registrations,
                    )) + 
  geom_path(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Registrations") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

2.2 User Registrations per Campaign Channel

pFrame <- dS %>% 
  group_by(date, campaign) %>% 
  summarise(registrations = sum(userid)) %>% 
  arrange(date)
ggplot(pFrame, aes(x = date,
                   y = registrations,
                   label = registrations,
                    )) + 
  geom_path(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  facet_wrap(~campaign, ncol = 2) + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Registrations") + 
  theme_minimal() + 
  # geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

2.3 Total User Registrations per Campaign Channel

pFrame <- dS %>% 
  select(campaign, userid) %>% 
  group_by( campaign) %>% 
  summarise(registrations = sum(userid)) %>% 
  arrange(desc(registrations))
pFrame$campaign <- factor(pFrame$campaign, 
                          levels = pFrame$campaign, 
                          ordered = T)
ggplot(pFrame, aes(x = campaign,
                   y = registrations,
                   label = registrations,
                    )) + 
  geom_bar(stat = "identity", color = "blue", fill = "white") + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Registrations") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

3. User Edits

This section presents all data and statistics on the user edits.

3.1 User Edits Overview

The following chunk loads the dataset of user interactions with campaign channels and then re-structures the dataset a bit.

lF <- list.files("_analytics")
lF <- lF[grepl("^userEdits", lF)]
editSet <- fread(paste0("_analytics/", lF))
editSet <- select(editSet,
                  actor_name, 
                  revactor_timestamp)                 
colnames(editSet) <- c('username', 'rev_timestamp')
dataSet <- left_join(editSet,
                     select(regUsers, 
                            username,
                            campaign),
                     by = "username")
dataSet$year <- substr(dataSet$rev_timestamp, 1, 4)
dataSet$month <- substr(dataSet$rev_timestamp, 5, 6)
dataSet$day <- substr(dataSet$rev_timestamp, 7, 8)
dataSet$rev_timestamp <- NULL
dataSet$date <- paste(dataSet$year,
                       dataSet$month,
                       dataSet$day,
                       sep = "-")
dataSet <- select(dataSet,
                  date,
                  username,
                  campaign)
dateSpan <- seq(from = as.Date("2020-05-14"), 
                to = as.Date(max(dataSet$date)), 
                by = "day")
dateSpan <- as.character(dateSpan)
dS <- expand.grid(dateSpan, 
                  unique(dataSet$campaign),
                  stringsAsFactors = F)
colnames(dS) <- c('date', 'campaign')
dS <- dS %>% 
  left_join(select(dataSet, date, campaign, username),
            by = c("date", "campaign"))
dS$username <- ifelse(is.na(dS$username), 0, 1)
pFrame <- dS %>%
  select(date, username) %>% 
  group_by(date) %>% 
  summarise(edits = sum(username)) %>% 
  arrange(date)
ggplot(pFrame, aes(x = date,
                   y = edits,
                   label = edits,
                    )) + 
  geom_path(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

3.2 User Edits by Campaign Channels

pFrame <- dS %>% 
  group_by(date, campaign) %>% 
  summarise(edits = sum(username)) %>% 
  arrange(date)
ggplot(pFrame, aes(x = date,
                   y = edits,
                   label = edits,
                    )) + 
  geom_path(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  facet_wrap(~campaign, ncol = 2) + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  # geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 5.5)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

3.3 Total User Edits by Campaign Channels

pFrame <- dS %>% 
  select(campaign, username) %>% 
  group_by( campaign) %>% 
  summarise(edits = sum(username)) %>% 
  arrange(desc(edits))
pFrame$campaign <- factor(pFrame$campaign, 
                          levels = pFrame$campaign, 
                          ordered = T)
ggplot(pFrame, aes(x = campaign,
                   y = edits,
                   label = edits,
                    )) + 
  geom_bar(stat = "identity", color = "blue", fill = "white") + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

3.4 Edit Classes

3.4.1 Edit Classes: all users

userClass <- dataSet %>% 
  select(username) %>% 
  group_by(username) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)

3.4.2A Edit Classes per Campaign Channel: WMDE_2020_challenge_1

userClass <- dataSet %>% 
  filter(campaign == "WMDE_2020_challenge_1") %>% 
  select(username) %>% 
  group_by(username) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)

3.4.2B Edit Classes per Campaign Channel: WMDE_2020_challenge_4

userClass <- dataSet %>% 
  filter(campaign == "WMDE_2020_challenge_4") %>% 
  select(username) %>% 
  group_by(username) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)

3.4.2C Edit Classes per Campaign Channel: WMDE_2020_challenge_5

userClass <- dataSet %>% 
  filter(campaign == "WMDE_2020_challenge_5") %>% 
  select(username) %>% 
  group_by(username) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
---
title: '2020 WMDE Email Campaign Wikipedia Challenge 2020'
author: "Goran S. Milovanovic, Data Scientist, WMDE"
date: "July 7, 2020"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_float: yes
    toc_depth: 5
  html_document:
    toc: yes
    toc_depth: 5
---

**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

The campaign was run from 2020/05/14 to 2020/06/30.

**CURRENT UPDATE:** Complete dataset as of 2020/06/30.

```{r, echo = F, warning = 'hide', message = F, results = 'hide'}
# !diagnostics off
### --- Setup
library(kableExtra)
library(rmarkdown)
library(knitr)
library(tidyverse)
library(data.table)
library(reshape2)
library(DT)
library(ggrepel)
library(scales)
library(RColorBrewer)
```

## 0. Data Acquisiton

**NOTE:** the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script `2020_EmailCampaignWikipediaChallenge_PRODUCTION.R` on stat1004.eqiad.wmnet, collecting the data as `.tsv` and `.csv` files, copying manually, and processing locally. A daily crontab job was run from `2020/05/14` to `2020/06/30` to collect the data for daily reporting. The data used in this report are aggregates of the daily datasets, sanitized and anonymized.   

### 0.1 Daily Update

```{r, echo = T, eval = F}

### --- WMDE 2020_EmailCampaignWikipediaChallenge_PRODUCTION.R
### --- Campaign start: 2020/05/14
### --- Campaign end: 2020/06/30
### --- run from: stat1004
### --- path: /home/goransm/Analytics/NewEditors/Campaigns/2020_EmailCampaignWikipediaChallenge/

### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)

### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/Campaigns/2020_EmailCampaignWikipediaChallenge/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
### --- campaign specifics
campaignName <- 'Email_Campaign_Wikipedia_Challenge2020'

### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
  strsplit(as.character(cetDay), 
           split = " ", 
           fixed = T)[[1]][1]
) - 1

cetDay <- "2020-06-30"

### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------

# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  # - WHERE condition: create uri_host_condition
  if (length(uri_host) > 1) {
    uri_host_condition <- paste0("(",
                                  paste(
                                    paste0("uri_host = '", uri_host, "'"),
                                    collapse = " OR ", sep = " "),
                                  ")"
    )
  } else {
    uri_host_condition = paste0("uri_host = '", uri_host, "'")
  }
  
  # - WHERE condition: create uri_path_condition
  if (length(uri_path) > 1) {
    uri_path_condition <- paste0("(",
                                 paste(
                                   paste0("uri_path = '", uri_path, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_path_condition = paste0("uri_path = '", uri_path, "'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "USE wmf;
    SELECT uri_host, uri_path, uri_query, referer FROM webrequest
    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
    )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
}

# - set params to wmde_collect_pageviews
# - for the 2020_EmailCampaignWikipediaChallenge
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org',
              'wikidata.org', 'm.wikidata.org',
              'commons.wikimedia.org', 'commons.m.wikimedia.org',
              'de.wikivoyage.org', 'de.m.wikivoyage.org',
              'de.wiktionary.org', 'de.m.wiktionary.org')
uri_path  <- c(
  '/wiki/Langer_Grund-Kohlberg?tour=einfuhrung',
  '/wiki/Spezial:Benutzerkonto_anlegen',
  '/wiki/Wikidata:Introduction/de',
  '/wiki/Spezial:Anmelden',
  '/wiki/Special:MyPage/Artikelwerkstatt',
  '/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training',
  '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur',
  '/w/index.php?title=Q16943273&tour=wbitems&data=ok',
  '/w/index.php?title=Special:UploadWizard&uselang=de',
  '/wiki/Hauptseite',
  '/wiki/Wiktionary:Hauptseite',
  '/wiki/Commons:Willkommen')
# uri_query <- paste0('WMDE_2020_challenge_', 1:30)
queryFile <- 'emailFollowUp2019_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")

# - collect Pageviews data
wmde_collect_pageviews(uri_host,
                       uri_path,
                       cetDay,
                       queryFile,
                       fileName,
                       dataDir)

### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
                                   dataDir, 
                                   uri_query_filter,
                                   page_filter,
                                   cetDay = cetDay,
                                   campaignName = campaignName) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  
  # - load
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("uri_host", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 2):(length(pageviewsData) - 2)]
  pageviewsData <- data.frame(dat = pageviewsData, 
                              stringsAsFactors = F)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  # - apply page_filter
  pageviewsData$page <- paste0(pageviewsData$uri_host, 
                               pageviewsData$uri_path,
                               pageviewsData$uri_query)
  wFilter <- sapply(pageviewsData$page, function(x) {
    sapply(page_filter, function(y) {
      grepl(y, x)
    })
  })
  wFilter <- colSums(wFilter)
  wFilter <- which(wFilter > 0)
  pageviewsData <- pageviewsData[wFilter, ]
  # - apply uri_query_filter
  # - NOTE: looking in both: uri_query, referer 
  w_uri_query <- which(grepl(uri_query_filter, pageviewsData$page))
  
  if (length(w_uri_query) > 0) {
    
    # - filter for w_uri_query
    pageviewsData <- pageviewsData[w_uri_query, ] 
    
    # - aggregate:
    pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
    pageviewsData$uri_host <- NULL
    pageviewsData$page <- NULL
    pageviewsData$referer <- NULL
    pageviewsData <- pageviewsData %>% 
      dplyr::select(uri_query, uri_path) %>% 
      dplyr::group_by(uri_query, uri_path) %>% 
      dplyr::summarise(pageviews = n())
    colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
    
    # - add cetDay, campaignName
    pageviewsData$date <- cetDay
    pageviewsData$campaign <- campaignName
    
    # - store:
    write.csv(pageviewsData, 
              paste0(analyticsDir, 
                     "pageviewsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    
  }
  
}

# - set params to wmde_process_pageviews
# - for the WMDE 2020_EmailCampaignWikipediaChallenge
uri_query_filter <- 'WMDE_2020_challenge_'
page_filter <- c('de.wikipedia.org/wiki/Langer_Grund-Kohlberg?tour=einfuhrung', 
                 'de.wikipedia.org/wiki/Spezial:Benutzerkonto_anlegen', 
                 'wikidata.org/wiki/Wikidata:Introduction/de', 
                 'de.wikipedia.org/wiki/Spezial:Anmelden', 
                 'de.wikipedia.org/wiki/Spezial:Anmelden', 
                 'de.wikipedia.org/wiki/Special:MyPage/Artikelwerkstatt', 
                 'de.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training', 
                 'de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur', 
                 'www.wikidata.org/w/index.php?title=Q16943273&tour=wbitems&data=ok', 
                 'commons.wikimedia.org/w/index.php?title=Special:UploadWizard&uselang=de', 
                 'commons.wikimedia.org/wiki/Commons:Willkommen',
                 'de.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training', 
                 'de.wikivoyage.org/wiki/Hauptseite', 
                 'de.wiktionary.org/wiki/Wiktionary:Hauptseite', 
                 'de.m.wikipedia.org/wiki/Langer_Grund-Kohlberg?tour=einfuhrung', 
                 'de.m.wikipedia.org/wiki/Spezial:Benutzerkonto_anlegen', 
                 'm.wikidata.org/wiki/Wikidata:Introduction/de', 
                 'de.m.wikipedia.org/wiki/Spezial:Anmelden', 
                 'de.m.wikipedia.org/wiki/Spezial:Anmelden', 
                 'de.m.wikipedia.org/wiki/Special:MyPage/Artikelwerkstatt', 
                 'de.m.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training', 
                 'de.m.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur', 
                 'www.wikidata.org/w/index.php?title=Q16943273&tour=wbitems&data=ok', 
                 'commons.m.wikimedia.org/w/index.php?title=Special:UploadWizard&uselang=de',
                 'commons.m.wikimedia.org/wiki/Commons:Willkommen',
                 'de.m.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training', 
                 'de.m.wikivoyage.org/wiki/Hauptseite', 
                 'de.m.wiktionary.org/wiki/Wiktionary:Hauptseite')

# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
                       dataDir = dataDir,
                       uri_query_filter = uri_query_filter, 
                       page_filter = page_filter,
                       cetDay = cetDay,
                       campaignName = campaignName)

### ----------------------------------------------------------
### --- User Registrations
### --- HiveQL query: event.ServerSideAccountCreation table
### ----------------------------------------------------------
hiveQL <-
  "SELECT year, month, day, hour,  event.campaign, event.userId, event.userName, webhost, wiki
      FROM event.serversideaccountcreation
      WHERE
        year = 2020
        AND (month = 5 OR month = 6)
        AND (event.campaign LIKE '%WMDE_2020_challenge_%');"
# - write hql
write(hiveQL, paste0(dataDir, 'user_registrations.hql'))
### --- output filename
filename <- paste0(dataDir, 'user_registrations.tsv')
### --- execute hql script:
hiveArgs <- 'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata /usr/local/bin/beeline --silent --incremental=true --verbose=false -f'
hiveInput <- paste(paste0(dataDir, 'user_registrations.hql'),
                   " > ",
                   filename,
                   sep = "")
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)

### ----------------------------------------------------------
### --- User Edits
### --- User Edits via revision_actor_temp
### ----------------------------------------------------------

startTimestamp <- '20200514000000'
# - get user ids
userRegistrations <- fread(paste0(dataDir,
                                  'user_registrations.tsv'))
dim(userRegistrations)
# - clean up from test accounts
userRegistrations <- filter(userRegistrations, 
                            !grepl("Test E-Mail Campaign 2020", 
                                   userRegistrations$username))
dim(userRegistrations)
head(userRegistrations)
rev_user <- userRegistrations$userid
# - iterate over rev_user
for (i in 1:length(rev_user)) {
  # - SQL query
  sqlQuery <- paste("\"SELECT actor.actor_id, 
                        actor.actor_user, 
                        actor.actor_name, 
                        revision_actor_temp.revactor_timestamp 
                    FROM actor 
                    LEFT JOIN revision_actor_temp ON (actor.actor_id = revision_actor_temp.revactor_actor) 
                    WHERE (revision_actor_temp.revactor_timestamp >= 20200511000000 
                      AND actor.actor_user = ", rev_user[i], ");\"");
  ### --- output filename
  filename <- paste(dataDir,'userEdits', "_", i, ".tsv", sep = "")
  ### --- execute sql script:
  sqlLogInPre <- paste0('/usr/local/bin/analytics-mysql dewiki -e ')
  sqlInput <- paste(sqlQuery,
                    " > ",
                    filename,
                    sep = "")
  # - command:
  sqlCommand <- paste(sqlLogInPre, sqlInput)
  system(command = sqlCommand, wait = TRUE)
  # - report
  print(paste0("DONE: user ", i, "."))
}
### --- END run SQL scripts
# - load user edits:
lF <- list.files(dataDir)
lF <- lF[grepl("^userEdits_", lF)]
userEdits <- lapply(paste0(dataDir, lF), fread)
userEdits <- rbindlist(userEdits)
# - store user edits:
write.csv(userEdits, paste0(analyticsDir, 'userEdits.csv'))

```

## 1. Campaign Pageviews

This section presents all data and statistics on the campaign pages.

The following chunk loads and then re-structures the dataset a bit.

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
lF <- list.files('_analytics')
lF <- lF[grepl("^pageviewsAggregated", lF)]
dataSet <- lapply(paste0("_analytics/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - expand grid to account for missing observations per day
dS <- expand.grid(unique(dataSet$Tag), 
                  unique(dataSet$Page), 
                  unique(dataSet$date), 
                  stringsAsFactors = F)
colnames(dS) <- c('Tag', 'Page', 'date')
dS <- dS %>% 
  left_join(dataSet, 
            by = c("Tag", "Page", "date"))
dataSet <- dS; rm(dS)
dataSet$Pageviews[is.na(dataSet$Pageviews)] <- 0
dataSet$Tag <- gsub("&.+$", "", dataSet$Tag)
dataSet$Tag <- gsub("\\?campaign=", "", dataSet$Tag)
dataSet$date <- sapply(dataSet$date, function(x) {
  d <- strsplit(x, split = "-")[[1]]
  if (nchar(d[3]) == 1) {
    d[3] <- paste0("0", d[3])
  }
  return(paste(d, collapse = "-"))
})
```

### 1.1 Pageviews Overview

**Chart 1.1.1** Daily Pageviews, aggregated across the campaign channels.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 10, fig.height = 10}
pFrame <- dataSet %>% 
  select(date, Page, Pageviews) %>% 
  group_by(date, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  facet_wrap(~Page, ncol = 2) + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") + 
  theme_minimal() + 
  # geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

**Table 1.1.1** Pageviews totals

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
tFrame <- pFrame %>% 
  select(Page, Pageviews) %>% 
  group_by(Page) %>% 
  summarise(totalPageviews = sum(Pageviews)) %>% 
  arrange(desc(totalPageviews))
datatable(tFrame)
```

### 1.2 Pageviews: Campaign Channels

**Chart 1.2.1** Pageviews, by **channels**

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 10, fig.height = 10}
pFrame <- dataSet %>% 
  select(date, Tag, Pageviews) %>% 
  group_by(date, Tag) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  facet_wrap(~Tag, ncol = 2) + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") + 
  theme_minimal() + 
  # geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

**Chart 1.2.2** Total Pageviews, by **channels**

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 5}
pFrame <- pFrame %>% 
  select(Tag, Pageviews) %>% 
  group_by(Tag) %>% 
  summarise(Pageviews = sum(Pageviews)) %>% 
  arrange(desc(Pageviews))
pFrame$Tag <- factor(pFrame$Tag,
                     levels = pFrame$Tag,
                     ordered = T)
ggplot(pFrame, aes(x = Tag,
                   y = Pageviews,
                   label = Pageviews,
                    )) + 
  geom_bar(stat = "identity", color = "blue", fill = "white") + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

**Table 2.2.1** Total pageviews, by **channels** and by **pages**.

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
tFrame <- dataSet %>% 
  select(Tag, Page, Pageviews) %>% 
  group_by(Tag, Page) %>% 
  summarise(totalPageviews = sum(Pageviews)) %>% 
  arrange(desc(totalPageviews))
datatable(tFrame)
```

## 2. User Registrations

This section presents all data and statistics on the user registrations.

### 2.1 User Registrations Overview

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
lF <- list.files("_analytics")
lF <- lF[grepl("^user_registrations", lF)]
dataSet <- fread(paste0("_analytics/", lF))
dataSet <- filter(dataSet, 
                  !grepl("^Test E-Mail Campaign", dataSet$username))
dataSet$date <- paste(dataSet$year, 
                      ifelse(nchar(dataSet$month) == 1, paste0("0", dataSet$month), dataSet$month),
                      ifelse(nchar(dataSet$day) == 1, paste0("0", dataSet$day), dataSet$day),
                      sep = "-")
regUsers <- select(dataSet,
                   userid,
                   username, 
                   campaign)
# - expand grid to account for missing observations per day
dateSpan <- seq(from = as.Date("2020-05-14"), 
                to = as.Date("2020-06-30"), 
                by = "day")
dateSpan <- as.character(dateSpan)
dS <- expand.grid(dateSpan, 
                  unique(dataSet$campaign), 
                  stringsAsFactors = F)
colnames(dS) <- c('date', 'campaign')
dS <- dS %>% 
  left_join(select(dataSet, date, campaign, userid),
            by = c("date", "campaign"))
dS$userid <- ifelse(is.na(dS$userid), 0, 1)
pFrame <- dS %>% 
  select(date, userid) %>% 
  group_by(date) %>% 
  summarise(registrations = sum(userid)) %>% 
  arrange(date)
ggplot(pFrame, aes(x = date,
                   y = registrations,
                   label = registrations,
                    )) + 
  geom_path(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Registrations") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 2.2 User Registrations per Campaign Channel

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 10, fig.height = 8}
pFrame <- dS %>% 
  group_by(date, campaign) %>% 
  summarise(registrations = sum(userid)) %>% 
  arrange(date)
ggplot(pFrame, aes(x = date,
                   y = registrations,
                   label = registrations,
                    )) + 
  geom_path(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  facet_wrap(~campaign, ncol = 2) + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Registrations") + 
  theme_minimal() + 
  # geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 2.3 Total User Registrations per Campaign Channel

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 5}
pFrame <- dS %>% 
  select(campaign, userid) %>% 
  group_by( campaign) %>% 
  summarise(registrations = sum(userid)) %>% 
  arrange(desc(registrations))
pFrame$campaign <- factor(pFrame$campaign, 
                          levels = pFrame$campaign, 
                          ordered = T)
ggplot(pFrame, aes(x = campaign,
                   y = registrations,
                   label = registrations,
                    )) + 
  geom_bar(stat = "identity", color = "blue", fill = "white") + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Registrations") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

## 3. User Edits

This section presents all data and statistics on the user edits.

### 3.1 User Edits Overview

The following chunk loads the dataset of user interactions with campaign channels and then re-structures the dataset a bit.

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
lF <- list.files("_analytics")
lF <- lF[grepl("^userEdits", lF)]
editSet <- fread(paste0("_analytics/", lF))
editSet <- select(editSet,
                  actor_name, 
                  revactor_timestamp)                 
colnames(editSet) <- c('username', 'rev_timestamp')
dataSet <- left_join(editSet,
                     select(regUsers, 
                            username,
                            campaign),
                     by = "username")
dataSet$year <- substr(dataSet$rev_timestamp, 1, 4)
dataSet$month <- substr(dataSet$rev_timestamp, 5, 6)
dataSet$day <- substr(dataSet$rev_timestamp, 7, 8)
dataSet$rev_timestamp <- NULL
dataSet$date <- paste(dataSet$year,
                       dataSet$month,
                       dataSet$day,
                       sep = "-")
dataSet <- select(dataSet,
                  date,
                  username,
                  campaign)
dateSpan <- seq(from = as.Date("2020-05-14"), 
                to = as.Date(max(dataSet$date)), 
                by = "day")
dateSpan <- as.character(dateSpan)
dS <- expand.grid(dateSpan, 
                  unique(dataSet$campaign),
                  stringsAsFactors = F)
colnames(dS) <- c('date', 'campaign')
dS <- dS %>% 
  left_join(select(dataSet, date, campaign, username),
            by = c("date", "campaign"))
dS$username <- ifelse(is.na(dS$username), 0, 1)
pFrame <- dS %>%
  select(date, username) %>% 
  group_by(date) %>% 
  summarise(edits = sum(username)) %>% 
  arrange(date)
ggplot(pFrame, aes(x = date,
                   y = edits,
                   label = edits,
                    )) + 
  geom_path(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 3.2 User Edits by Campaign Channels

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 10, fig.height = 8}
pFrame <- dS %>% 
  group_by(date, campaign) %>% 
  summarise(edits = sum(username)) %>% 
  arrange(date)
ggplot(pFrame, aes(x = date,
                   y = edits,
                   label = edits,
                    )) + 
  geom_path(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  facet_wrap(~campaign, ncol = 2) + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  # geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 5.5)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 3.3 Total User Edits by Campaign Channels

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 5}
pFrame <- dS %>% 
  select(campaign, username) %>% 
  group_by( campaign) %>% 
  summarise(edits = sum(username)) %>% 
  arrange(desc(edits))
pFrame$campaign <- factor(pFrame$campaign, 
                          levels = pFrame$campaign, 
                          ordered = T)
ggplot(pFrame, aes(x = campaign,
                   y = edits,
                   label = edits,
                    )) + 
  geom_bar(stat = "identity", color = "blue", fill = "white") + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 3.4 Edit Classes

#### 3.4.1 Edit Classes: all users

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
userClass <- dataSet %>% 
  select(username) %>% 
  group_by(username) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```

#### 3.4.2A Edit Classes per Campaign Channel: `WMDE_2020_challenge_1`

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
userClass <- dataSet %>% 
  filter(campaign == "WMDE_2020_challenge_1") %>% 
  select(username) %>% 
  group_by(username) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```

#### 3.4.2B Edit Classes per Campaign Channel: `WMDE_2020_challenge_4`

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
userClass <- dataSet %>% 
  filter(campaign == "WMDE_2020_challenge_4") %>% 
  select(username) %>% 
  group_by(username) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```

#### 3.4.2C Edit Classes per Campaign Channel: `WMDE_2020_challenge_5`

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
userClass <- dataSet %>% 
  filter(campaign == "WMDE_2020_challenge_5") %>% 
  select(username) %>% 
  group_by(username) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```