Feedback should be sent to goran.milovanovic_ext@wikimedia.de.

The campaign is run from 1. January 2018 to N January 2018.

# - evaluate all chunks relative to the local _dailyUpdate directory, so that the
# - data files can be read by bare file name further down in this report
knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')

0. Data Acquisition

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script ThankYou_2018_Production_SQL.R on stat1005.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R.

### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets

### --- Campaign Details: 
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018. 
# - The report must include any activity from the beginning of the campaign. 
# - The estimated start will be 1st January 2018.

# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig

### --- Training Modules Schema: 
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking

### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)

### --- Directories
# - absolute paths used by this script:
# - bannerClicksDir receives the .hql script and the fetched banner click .tsv files,
# - dailyUpdateDir receives the aggregated .csv and the SQL extracts (.tsv)
# - NOTE(review): bannerImpressionsDir is not used in the visible part of the script
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate' 

### --- Campaign time range
# - 'YYYY-MM-DD' strings; they are interpreted as CET when the hourly
# - range for the webrequest queries is built
startDate <- '2018-01-02'
endDate <- '2018-01-08'

### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------

# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1

### --- build the hourly campaign range in CET and map every hour onto the
### --- UTC partition (year, month, day, hour) that the webrequest queries use
# - hourly steps from startDate 00:00 CET up to, but excluding, endDate 00:00 CET
dateRange <- seq(from = as.POSIXct(startDate, tz = "CET"),
                 to = as.POSIXct(endDate, tz = "CET"),
                 by = "hour")
dateRange <- dateRange[-length(dateRange)]
# - CET calendar day of each hour; used downstream to name the output files
cetDateRange <- format(dateRange, format = "%Y-%m-%d", tz = "CET")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
keep <- dateRange <= as.POSIXct(endCampaign)
cetDateRange <- cetDateRange[keep]
# - the same instants expressed in UTC
dateRange <- as.POSIXlt(dateRange[keep], tz = "UTC")
# - one row per hour, built in a single vectorized call; this also yields a
# - valid zero-row table when the range is empty
dR <- data.frame(
  cetName = cetDateRange,
  utcYear = year(dateRange),
  utcMonth = month(dateRange),
  utcDay = mday(dateRange),
  utcHour = hour(dateRange),
  stringsAsFactors = FALSE
)
# - collapse the hours of each (CET day, UTC day) pair into one HiveQL
# - filter of the form "hour=22 OR hour=23"
dR <- dR %>%
  group_by(cetName, utcYear, utcMonth, utcDay) %>%
  summarise(utcHour = paste0("hour=", utcHour, collapse = " OR ")) %>%
  ungroup()

### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------

# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1

# - set bannerClicksDir
# - NOTE: the working directory is changed on purpose: the .hql script is
# - written here and the wrangling step below lists the .tsv files from here
setwd(bannerClicksDir)

# - CET days to fetch; each day spans one or more UTC day partitions
cetDays <- as.character(unique(dR$cetName))

for (i in seq_along(cetDays)) {
  
  # - rows of dR (UTC day partitions) that belong to this CET day
  wCetName <- which(dR$cetName %in% cetDays[i])
  
  for (j in seq_along(wCetName)) {
    
    # - construct HiveQL query:
    y <- dR$utcYear[wCetName[j]]
    m <- dR$utcMonth[wCetName[j]]
    d <- dR$utcDay[wCetName[j]]
    # - named hourFilter so that data.table::hour() is not masked
    hourFilter <- dR$utcHour[wCetName[j]]
    q <- paste0(
      "USE wmf;
      SELECT uri_path, uri_query, referer FROM webrequest
      WHERE uri_host = 'de.wikipedia.org'
      AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia' 
      AND year = ", y,
      " AND month = ", m,
      " AND day = ", d,
      " AND (", hourFilter, ");")
    # - write hql
    write(q, 'thankyou2018_BannerClicks.hql')
    # - prepare output file: <prefix><CET day>_<partition index>.tsv
    fileName <- file.path(bannerClicksDir,
                          paste0("thankyou2018_BannerClicks_",
                                 cetDays[i],
                                 "_", j,
                                 ".tsv"))
    # - execute hql script, redirecting the result to the (shell-quoted) output file:
    hiveCommand <- paste('beeline -f',
                         'thankyou2018_BannerClicks.hql',
                         '>',
                         shQuote(fileName))
    exitStatus <- system(command = hiveCommand, wait = TRUE)
    # - do not stop the whole run on one failed partition, but make it visible
    if (exitStatus != 0) {
      warning("beeline exited with status ", exitStatus,
              " while writing ", fileName, call. = FALSE)
    }
    
  }
  
}

### --- Wrangle this dataset:

### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'

### --- Dataset:
# - one row per fetched .tsv file in the working directory: the CET day taken
# - from the file name (third "_"-separated field) and the number of lines
# - in the file that contain the campaign tag
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = TRUE)]
lF <- lF[grepl('Clicks', lF, fixed = TRUE)]
dataSet <- lapply(lF, function(f) {
  dS <- readLines(f, n = -1)
  data.frame(timestamp = strsplit(f, split = "_", fixed = TRUE)[[1]][3],
             bannerClicks = sum(grepl(campaignBanner, dS, fixed = TRUE)),
             stringsAsFactors = FALSE)
})
if (length(dataSet) > 0) {
  dataSet <- rbindlist(dataSet)
} else {
  # - nothing was fetched: keep the expected columns so that the update
  # - file is still written with a proper header
  dataSet <- data.frame(timestamp = character(0),
                        bannerClicks = integer(0),
                        stringsAsFactors = FALSE)
}

### --- store BannerClicksPageViews_Update.csv
# - NOTE: the working directory stays at dailyUpdateDir for the steps that follow
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")

### --- SQL
# - the SQL extracts below are taken from this day onwards
startDate <- '2018-01-01'

### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - source table: log.ServerSideAccountCreation_5487345
# - the lower timestamp bound is startDate (dashes removed) followed by 220000
qCommand <- paste0(
  "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ",
  gsub("-", "", startDate, fixed = TRUE),
  "220000));\" > ",
  dailyUpdateDir,
  "/thankyou_2018_userRegistrations.tsv"
)
# - run the query; the shell redirection stores the result in dailyUpdateDir
system(qCommand, wait = TRUE)


### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------

# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - source table: log.GuidedTourExited_8690566
# - the lower timestamp bound is startDate (dashes removed) followed by 220000
qCommand <- paste0(
  "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ",
  gsub("-", "", startDate, fixed = TRUE),
  "220000));\" > ",
  dailyUpdateDir,
  "/thankyou_2018_guidedTours.tsv"
)
# - run the query; the shell redirection stores the result in dailyUpdateDir
system(qCommand, wait = TRUE)

### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------

# - get user IDs from registered:
# - the registrations extract is read by its full path; matching file names
# - in the working directory by pattern could return more than one file
userReg <- read.table(file.path(dailyUpdateDir, "thankyou_2018_userRegistrations.tsv"),
                      quote = "",
                      sep = "\t",
                      header = TRUE,
                      check.names = FALSE,
                      stringsAsFactors = FALSE)
# - keep self-made accounts that were registered through the campaign
userReg <- userReg %>% 
  dplyr::select(event_userId, event_isSelfMade, event_campaign) %>% 
  filter(event_isSelfMade == 1) %>% 
  filter(event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- unique(userReg$event_userId[!is.na(userReg$event_userId)])
# - output file with the number of edits per campaign user
editsFile <- file.path(dailyUpdateDir, 'thankyou_2018_userEdits.tsv')
if (length(uid) > 0) {
  # - sql query
  sqlQuery <- paste0('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
                     paste(uid, collapse = ", "),
                     ') GROUP BY rev_user;')
  mySqlCommand <- paste0('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
                         '"', sqlQuery, '" > ',
                         editsFile)
  system(command = mySqlCommand, 
         wait = TRUE)
} else {
  # - no campaign registrations yet: "IN ()" would be invalid SQL, so skip the
  # - query and leave an empty result file, which the report reads as "no edits"
  invisible(file.create(editsFile))
}

### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------

1. Campaign Pageviews (Banner Clicks)

1. 1 The data set

# - daily landing page views: read the update file and total the
# - banner click counts per day
pageviews <- read.csv('thankyou2018_BannerClicksPageViews_Update.csv',
                      header = TRUE,
                      row.names = 1,
                      stringsAsFactors = FALSE) %>% 
  group_by(timestamp) %>% 
  summarise(Pageviews = sum(bannerClicks))
# - render as a left-aligned HTML table
kable_styling(knitr::kable(pageviews, format = "html"),
              full_width = FALSE,
              position = "left")
timestamp Pageviews
2018-01-01 533
2018-01-02 1188
2018-01-03 251
2018-01-04 0
2018-01-05 0
2018-01-06 0
2018-01-07 0

1. 2 Chart

# - bar chart of the daily landing pageviews; every bar is labelled with its count
ggplot(pageviews,
       aes(x = timestamp, y = Pageviews, label = Pageviews)) +
  geom_bar(stat = "identity",
           position = "dodge",
           width = .15,
           fill = "white",
           color = "darkblue") +
  geom_label(size = 3) +
  ggtitle('Thank You 2018:\nOverview of Landing Pageviews') +
  theme_minimal() +
  # - all theme adjustments in one call: vertical day labels, small title,
  # - no legend title, no vertical grid lines, no panel background
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())

2. Campaign User Registrations

2. 1 The data set

# - campaign registrations per day (Europe/Berlin calendar days)
userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
                      sep = "\t",
                      header = TRUE,
                      row.names = 1,
                      stringsAsFactors = FALSE)
# - timestamps are UTC in YYYYMMDDHHMMSS form; sprintf() keeps all 14 digits,
# - whereas as.character() on a numeric column may switch to scientific notation
userReg$timestamp <- as.POSIXct(sprintf("%.0f", as.numeric(userReg$timestamp)),
                                format = "%Y%m%d%H%M%S",
                                tz = "UTC")
# - calendar day in Europe/Berlin, using the UTC offset that applied at the
# - time of each event (not the offset at the time the report is rendered)
userReg$timestamp <- format(userReg$timestamp,
                            format = "%Y-%m-%d",
                            tz = "Europe/Berlin")
# - count the registrations that carry the campaign tag, per day
userReg <- userReg %>%
  filter(event_campaign %in% 'wmde_etc2017_bt1') %>% 
  group_by(timestamp) %>% 
  summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>% 
  kable_styling(full_width = FALSE, position = "left")
timestamp Registrations
2018-01-01 1
2018-01-02 8
2018-01-03 3

2. 2 Chart

# - bar chart of the daily campaign registrations; every bar is labelled with its count
ggplot(userReg,
       aes(x = timestamp, y = Registrations, label = Registrations)) +
  geom_bar(stat = "identity",
           position = "dodge",
           width = .15,
           fill = "white",
           color = "darkblue") +
  geom_label(size = 3) +
  ggtitle('Thank You 2018:\nOverview of User Registrations') +
  theme_minimal() +
  # - all theme adjustments in one call: vertical day labels, small title,
  # - no legend title, no vertical grid lines, no panel background
  theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.background = element_blank())

3. Campaign User Edits

3. 1 The data set

# - the edits extract is empty (zero lines) when the query returned no rows,
# - so check the raw line count before parsing it as a table
userEdits <- readLines('thankyou_2018_userEdits.tsv')
if (length(userEdits) < 1) {
  print("There are currently no user edits from this campaign.")
} else {
  userEdits <- read.delim('thankyou_2018_userEdits.tsv',
                          sep = "\t",
                          header = TRUE,
                          stringsAsFactors = FALSE)
  # - report the total number of edits over all campaign users
  print(paste0(sum(userEdits$edits), " edits were made by the campaign registered users thus far."))
}
[1] "4 edits were made by the campaign registered users thus far."

3. 2 Chart

4. Campaign Guided Tours

4. 1 The data set

Campaign guided tours: diskutieren and seimutig.

# - guided tours that belong to this campaign
tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
                     sep = "\t",
                     header = TRUE,
                     row.names = 1,
                     stringsAsFactors = FALSE)
# - timestamps are UTC in YYYYMMDDHHMMSS form; sprintf() keeps all 14 digits,
# - whereas as.character() on a numeric column may switch to scientific notation
gTours$timestamp <- as.POSIXct(sprintf("%.0f", as.numeric(gTours$timestamp)),
                               format = "%Y%m%d%H%M%S",
                               tz = "UTC")
# - calendar day in Europe/Berlin, using the UTC offset that applied at the
# - time of each event (not the offset at the time the report is rendered)
gTours$timestamp <- format(gTours$timestamp,
                           format = "%Y-%m-%d",
                           tz = "Europe/Berlin")
# - look up for the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) > 0) {
  gTours <- gTours[w, ]
  gTours <- gTours %>% 
    dplyr::select(timestamp, event_step, event_userId)
  # - columns are selected by name; the unquoted `timestamp` used previously
  # - referred to a non-existent object and stopped the report
  knitr::kable(gTours[, c("timestamp", "event_step")], format = "html") %>% 
    kable_styling(full_width = FALSE, position = "left")
} else {
  print("There are currently no data on guided tours from this campaign.")
}
[1] "There are currently no data on guided tours from this campaign."

4. 2 Chart

