Feedback should be sent to goran.milovanovic_ext@wikimedia.de.

The campaign ran from 2018/10/19 to 2018/10/28.

CURRENT UPDATE: Complete dataset as of 2018/10/29.

0. Data Acquisition

The data acquisition and aggregation R code.

0.1 Daily Update

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script 2018_ABC_Production.R on stat1005.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally.

### --- Data Acquisition for the Autumn Banner Campaign 2018
### --- run from stat1005
### --- /home/goransm/RScripts/NewEditors/2018_AutumnBannerCampaign/_data

### --- to data directory
dataDir <- '/home/goransm/RScripts/NewEditors/2018_AutumnBannerCampaign/_data'
setwd(dataDir)

### --- determine cetDay
library(lubridate)
# - start from the current instant (auto-printed for the run log)
cetDay <- Sys.time()
cetDay
# - express that instant in Berlin local time
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete its data acquisition)
# - keep only the date part (the text before the first space), then step back
cetDay <- ymd(sub(" .*$", "", as.character(cetDay))) - 1

### --- Collect Banner Impression Data

# - function: wmde_collect_banner_impressions
# - wmde_collect_banner_impressions: compose a HiveQL query against
# - wmf.webrequest for one Europe/Berlin calendar day, write it to queryFile
# - and run it through beeline with stdout redirected to fileName.
# - Arguments:
# -   uri_host, uri_path: character vectors; several values are OR-ed (exact match)
# -   uri_query: character vector; several values are OR-ed as LIKE '%value%'
# -   cetDay: the day in Europe/Berlin local time, expected format YYYY-MM-DD
# -   queryFile: name of the .hql file to write (relative to dataDir)
# -   fileName: name of the file receiving the beeline output (relative to dataDir)
# -   dataDir: directory to work in
# - Returns: the exit status reported by system() (0 means success);
# - a warning is raised when the status is non-zero.
# - Side effect: the working directory is left at dataDir.
wmde_collect_banner_impressions <- function(uri_host, 
                                            uri_path, 
                                            uri_query, 
                                            cetDay,
                                            queryFile,
                                            fileName,
                                            dataDir) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - helper: OR several clauses together; parenthesise only when needed
  or_clauses <- function(clauses) {
    if (length(clauses) > 1) {
      paste0("(", paste(clauses, collapse = " OR "), ")")
    } else {
      clauses
    }
  }
  
  # - WHERE condition: create datetime_condition
  # - every local (Berlin) hour of cetDay, later expressed as UTC partitions
  cet_hours <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  # - format() each component explicitly instead of parsing as.character():
  # - since R 4.3 as.character(<POSIXct>) omits the time for midnight values,
  # - which used to turn the 00:00 UTC partition into "hour = NA"
  utc_part <- function(fmt) {
    as.integer(format(cet_hours, fmt, tz = "UTC"))
  }
  datetimeCondition <- paste0(
    "(",
    "year = ", utc_part("%Y"), " AND ",
    "month = ", utc_part("%m"), " AND ",
    "day = ", utc_part("%d"), " AND ",
    "hour = ", utc_part("%H"),
    ")",
    collapse = " OR "
  )
  
  # - WHERE conditions: uri_path, uri_host, uri_query
  uri_path_condition <- or_clauses(paste0("uri_path = '", uri_path, "'"))
  uri_host_condition <- or_clauses(paste0("uri_host = '", uri_host, "'"))
  uri_query_condition <- or_clauses(
    paste0("uri_query LIKE '%", uri_query, "%'")
  )
  
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;\n    SELECT uri_query FROM webrequest\n    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    uri_query_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  
  # - write hql
  write(hiveQuery, queryFile)
  
  # - execute hql script; file names are shell-quoted so that spaces or
  # - shell metacharacters in them cannot alter the command
  hiveCommand <- paste(
    '/usr/local/bin/beeline -f',
    shQuote(queryFile),
    '>',
    shQuote(fileName)
  )
  status <- system(command = hiveCommand, wait = TRUE)
  if (status != 0) {
    warning("beeline exited with status ", status,
            "; ", fileName, " may be incomplete",
            call. = FALSE)
  }
  status
}

# - set params to wmde_collect_banner_impressions
# - for the Autumn Banner Campaign 2018
# - hosts, beacon path and campaign tags that identify the banner impressions
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- '/beacon/impression'
uri_query <- c(
  'WMDE_neweditors_autumn_2018_lpn',
  'B18WMDE_neweditors_autumn_2018_lp'
)
# - query script, per-day output file and working directory
queryFile <- 'abc2018_BannerImpressions.hql'
fileName <- paste0("bannerImpressions_", cetDay, ".tsv")
dataDir <- '/home/goransm/RScripts/NewEditors/2018_AutumnBannerCampaign/_data'

# - collect Banner Impression data
wmde_collect_banner_impressions(
  uri_host = uri_host,
  uri_path = uri_path,
  uri_query = uri_query,
  cetDay = cetDay,
  queryFile = queryFile,
  fileName = fileName,
  dataDir = dataDir
)

### --- Wrangle Banner Impression Data

# - function: wmde_process_banner_impressions
# - wmde_process_banner_impressions: turn the raw beeline output for one day
# - into per-banner impression counts and store them as
# - bannerImpressionsAggregated_<suffix>.csv, where <suffix> is the part of
# - fileName between the first "_" and the following ".".
# - Arguments:
# -   fileName: raw output file (relative to dataDir)
# -   dataDir: directory to work in (becomes the working directory)
# -   cetDay: value written to the `date` column
# -   campaignName: value written to the `campaign` column
# - Only records with result=show are counted; each counts as
# - 1/recordImpressionSampleRate impressions.
# - Records lacking a banner or a numeric sample rate are dropped with a warning.
wmde_process_banner_impressions <- function(fileName,
                                            dataDir, 
                                            cetDay, 
                                            campaignName) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  
  # - load
  bannerData <- read.delim(fileName, 
                           stringsAsFactors = FALSE)
  colnames(bannerData) <- 'uri_query'
  
  # - clean: data rows start after the "uri_query" header row; the last two
  # - rows are skipped as well
  # - NOTE(review): presumably those are beeline table/footer lines - confirm
  wStart <- which(bannerData$uri_query == "uri_query")
  if (length(wStart) == 0) {
    stop("No 'uri_query' header row found in ", fileName, call. = FALSE)
  }
  firstRow <- wStart[1] + 1
  lastRow <- nrow(bannerData) - 2
  if (lastRow >= firstRow) {
    bannerData <- bannerData$uri_query[firstRow:lastRow]
  } else {
    # - avoid the descending sequence that firstRow:lastRow would produce
    bannerData <- character(0)
  }
  
  # - split each query string into its key=value fields
  bannerData <- strsplit(bannerData, split = "&", fixed = TRUE)
  
  # - helper: value of the first `field=` entry per record, NA when absent;
  # - vapply keeps the result a character vector even for ragged records
  extract_field <- function(field) {
    prefix <- paste0("^", field, "=")
    vapply(bannerData, function(x) {
      hit <- grep(prefix, x, value = TRUE)
      if (length(hit) == 0) {
        NA_character_
      } else {
        sub(prefix, "", hit[1])
      }
    }, character(1))
  }
  
  # - compose table:
  bannerObservations <- data.frame(
    banner = extract_field("banner"),
    recordImpressionSampleRate = as.numeric(
      extract_field("recordImpressionSampleRate")
    ),
    result = extract_field("result"),
    stringsAsFactors = FALSE
  )
  
  # - filter for result=show (records without a result are dropped too)
  bannerObservations <- dplyr::filter(bannerObservations,
                                      result == "show")
  
  # - drop records that cannot be attributed or weighted
  incomplete <- is.na(bannerObservations$banner) |
    is.na(bannerObservations$recordImpressionSampleRate)
  if (any(incomplete)) {
    warning(sum(incomplete),
            " 'show' record(s) without banner or sample rate were dropped",
            call. = FALSE)
    bannerObservations <- bannerObservations[!incomplete, , drop = FALSE]
  }
  
  # - correction for recordImpressionSampleRate:
  # - one sampled record stands for 1/rate impressions
  bannerObservations$recordImpressionSampleRate <- 
    1 / bannerObservations$recordImpressionSampleRate
  
  # - aggregate:
  bannerObservations <- bannerObservations %>% 
    dplyr::select(banner, recordImpressionSampleRate) %>% 
    dplyr::group_by(banner) %>% 
    dplyr::summarise(impressions = sum(recordImpressionSampleRate))
  
  # - add cetDay, campaignName (rep() keeps this valid for zero rows)
  bannerObservations$date <- rep(cetDay, nrow(bannerObservations))
  bannerObservations$campaign <- rep(campaignName, nrow(bannerObservations))
  
  # - store:
  fileSuffix <- strsplit(
    strsplit(fileName, split = "_", fixed = TRUE)[[1]][2],
    split = ".", 
    fixed = TRUE)[[1]][1]
  write.csv(bannerObservations, 
            paste0("bannerImpressionsAggregated_", fileSuffix, ".csv"))
  
}

# - wrangle Banner Impression data
# - campaign label stored alongside every aggregated record
campaignName <- "2018_AuBC"
# - arguments in order: fileName, dataDir, cetDay, campaignName
wmde_process_banner_impressions(fileName, dataDir, cetDay, campaignName)

### --- Collect Pageviews

# - function: wmde_collect_pageviews
# - wmde_collect_pageviews: compose a HiveQL query that selects uri_path,
# - uri_query and referer from wmf.webrequest for one Europe/Berlin calendar
# - day, write it to queryFile and run it through beeline with stdout
# - redirected to fileName.
# - Arguments:
# -   uri_host, uri_path: character vectors; several values are OR-ed (exact match)
# -   cetDay: the day in Europe/Berlin local time, expected format YYYY-MM-DD
# -   queryFile: name of the .hql file to write (relative to dataDir)
# -   fileName: name of the file receiving the beeline output (relative to dataDir)
# -   dataDir: directory to work in
# - Returns: the exit status reported by system() (0 means success);
# - a warning is raised when the status is non-zero.
# - Side effect: the working directory is left at dataDir.
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - helper: OR several clauses together; parenthesise only when needed
  or_clauses <- function(clauses) {
    if (length(clauses) > 1) {
      paste0("(", paste(clauses, collapse = " OR "), ")")
    } else {
      clauses
    }
  }
  
  # - WHERE condition: create datetime_condition
  # - every local (Berlin) hour of cetDay, later expressed as UTC partitions
  cet_hours <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  # - format() each component explicitly instead of parsing as.character():
  # - since R 4.3 as.character(<POSIXct>) omits the time for midnight values,
  # - which used to turn the 00:00 UTC partition into "hour = NA"
  utc_part <- function(fmt) {
    as.integer(format(cet_hours, fmt, tz = "UTC"))
  }
  datetimeCondition <- paste0(
    "(",
    "year = ", utc_part("%Y"), " AND ",
    "month = ", utc_part("%m"), " AND ",
    "day = ", utc_part("%d"), " AND ",
    "hour = ", utc_part("%H"),
    ")",
    collapse = " OR "
  )
  
  # - WHERE conditions: uri_path, uri_host
  uri_path_condition <- or_clauses(paste0("uri_path = '", uri_path, "'"))
  uri_host_condition <- or_clauses(paste0("uri_host = '", uri_host, "'"))
  
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;\n    SELECT uri_path, uri_query, referer FROM webrequest\n    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  
  # - write hql
  write(hiveQuery, queryFile)
  
  # - execute hql script; file names are shell-quoted so that spaces or
  # - shell metacharacters in them cannot alter the command
  hiveCommand <- paste(
    '/usr/local/bin/beeline -f',
    shQuote(queryFile),
    '>',
    shQuote(fileName)
  )
  status <- system(command = hiveCommand, wait = TRUE)
  if (status != 0) {
    warning("beeline exited with status ", status,
            "; ", fileName, " may be incomplete",
            call. = FALSE)
  }
  status
}

# - set params to wmde_collect_pageviews
# - for the Autumn Banner Campaign 2018
# - desktop and mobile hosts of the German Wikipedia
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
# - campaign landing pages whose pageviews are collected
uri_path <- c('/wiki/Wikipedia:Wikipedia_vor_Ort_2018',
              '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Berlin',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Hannover',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Köln',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Stuttgart',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Ulm',
              '/wiki/Wikipedia:Aktionstag_Wikipedia_2018/München',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Augsburg',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Wien',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Linz',
              '/wiki/Wikipedia:Wikipedia_vor_Ort_2018/Zürich')
# - query script, per-day output file and working directory
queryFile <- 'abc2018_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")
dataDir <- '/home/goransm/RScripts/NewEditors/2018_AutumnBannerCampaign/_data'

# - collect Pageviews data
wmde_collect_pageviews(
  uri_host = uri_host,
  uri_path = uri_path,
  cetDay = cetDay,
  queryFile = queryFile,
  fileName = fileName,
  dataDir = dataDir
)

### --- Wrangle Pageviews
# - function: wmde_process_pageviews
# - wmde_process_pageviews: turn the raw beeline pageview output for one day
# - into pageview counts per campaign tag and page, stored as
# - pageviewsAggregated_<suffix>.csv, where <suffix> is the part of fileName
# - between the first "_" and the following ".".
# - Arguments:
# -   fileName: raw output file (relative to dataDir)
# -   dataDir: directory to work in (becomes the working directory)
# -   uri_query_filter: regular expression; a row is kept when it matches
# -     the row's uri_query or its referer
# -   cetDay: value written to the `date` column (required)
# -   campaignName: value written to the `campaign` column (required)
# - NOTE: cetDay and campaignName used to carry the self-referential defaults
# - `cetDay = cetDay` / `campaignName = campaignName`. R cannot evaluate such
# - defaults ("promise already under evaluation"), so omitting the arguments
# - never worked; they are now plainly required.
wmde_process_pageviews <- function(fileName,
                                   dataDir, 
                                   uri_query_filter, 
                                   cetDay,
                                   campaignName) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)

  # - load: data rows start two lines below the "uri_path" header line;
  # - the last two lines are skipped as well
  # - NOTE(review): presumably beeline table/footer lines - confirm
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("^uri_path", pageviewsData))
  if (length(wStart) == 0) {
    stop("No 'uri_path' header line found in ", fileName, call. = FALSE)
  }
  firstRow <- wStart[1] + 2
  lastRow <- length(pageviewsData) - 2
  if (lastRow >= firstRow) {
    pageviewsData <- pageviewsData[firstRow:lastRow]
  } else {
    # - avoid the descending sequence that firstRow:lastRow would produce
    pageviewsData <- character(0)
  }
  pageviewsData <- data.frame(dat = pageviewsData, 
                              stringsAsFactors = FALSE)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  
  # - apply uri_query_filter
  # - NOTE: Autumn 2018, looking in both: uri_query, referer
  inQuery <- grepl(uri_query_filter, pageviewsData$uri_query)
  inReferer <- grepl(uri_query_filter, pageviewsData$referer)
  pageviewsData <- pageviewsData[inQuery | inReferer, , drop = FALSE]
  
  # - referers that do not carry the campaign tag are of no use: blank them
  # - (a logical mask also works when no rows are left)
  inReferer <- grepl(uri_query_filter, pageviewsData$referer)
  pageviewsData$referer[!inReferer] <- ''
  
  # - when there is no uri_query, use the query from the referer field if present there
  pageviewsData$referer <- str_extract(pageviewsData$referer, "\\?campaign=.*$")
  pageviewsData$referer[is.na(pageviewsData$referer)] <- ""
  pageviewsData$referer <- gsub("?campaign=", "", pageviewsData$referer, 
                                fixed = TRUE)
  pageviewsData$uri_query <- gsub("?campaign=", "", pageviewsData$uri_query, 
                                  fixed = TRUE)
  # - a missing (NA) uri_query is treated like an empty one
  noQuery <- is.na(pageviewsData$uri_query) | pageviewsData$uri_query == ""
  pageviewsData$uri_query[noQuery] <- pageviewsData$referer[noQuery]
  pageviewsData <- dplyr::filter(pageviewsData, 
                                 uri_query != "")
  pageviewsData$referer <- NULL
  # - clean up a bit: drop everything from the first "/" in the tag
  pageviewsData$uri_query <- gsub("/.*$", "", pageviewsData$uri_query)
  
  # - aggregate:
  pageviewsData <- pageviewsData %>% 
    dplyr::group_by(uri_query, uri_path) %>% 
    dplyr::summarise(pageviews = n()) %>% 
    dplyr::ungroup()
  colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
  
  # - add cetDay, campaignName (rep() keeps this valid for zero rows)
  pageviewsData$date <- rep(cetDay, nrow(pageviewsData))
  pageviewsData$campaign <- rep(campaignName, nrow(pageviewsData))

  # - store:
  fileSuffix <- strsplit(
    strsplit(fileName, split = "_", fixed = TRUE)[[1]][2],
    split = ".", 
    fixed = TRUE)[[1]][1]
  write.csv(pageviewsData, 
            paste0("pageviewsAggregated_", fileSuffix, ".csv"))
  
}

# - set params to wmde_process_pageviews
# - for the Autumn Banner Campaign 2018
# - campaign tag looked for in uri_query and referer
uri_query_filter <- 'WMDE_neweditors_autumn_2018'

# - wrangle pageviews
# - arguments in order: fileName, dataDir, uri_query_filter, cetDay, campaignName
wmde_process_pageviews(fileName,
                       dataDir,
                       uri_query_filter,
                       cetDay,
                       campaignName)


### --- Collect User Registrations

# - function: wmde_collect_registrations
# - wmde_collect_registrations: query the log.<logSchema> table for the
# - account creations of one Europe/Berlin calendar day, drop rows flagged
# - as bots in userAgent, keep the user/campaign fields and store them as
# - <fileName without extension>.csv in dataDir. The intermediate file
# - fileName is removed afterwards.
# - Arguments:
# -   logSchema: table name inside the `log` database
# -   web_host: value matched exactly against webHost
# -   event_campaign: character vector; several values are OR-ed as LIKE '%value%'
# -   cetDay: the day in Europe/Berlin local time, expected format YYYY-MM-DD
# -   dataDir: directory to work in (becomes the working directory)
# -   fileName: name of the intermediate .tsv file
# -   campaignName: value written to the `campaign` column
# - The time window is start < timestamp <= stop, both bounds in UTC.
# - Returns: the result of file.remove() on the intermediate file.
wmde_collect_registrations <- function(logSchema, 
                                       web_host,
                                       event_campaign, 
                                       cetDay,
                                       dataDir, 
                                       fileName, 
                                       campaignName) {
  
  # - libraries: loaded before first use (previously stringr was called
  # - before being attached and only worked if an earlier step had loaded it)
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  
  # - WHERE condition: create start_timestamp, stop_timestamp
  # - local midnight to local midnight, rendered as UTC YYYYMMDDHHMMSS;
  # - format() gives all 14 digits regardless of the R version
  day_start <- as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin")
  day_stop <- as.POSIXct(paste0(cetDay, " 24:00"), tz = "Europe/Berlin")
  start_timestamp <- format(day_start, "%Y%m%d%H%M%S", tz = "UTC")
  stop_timestamp <- format(day_stop, "%Y%m%d%H%M%S", tz = "UTC")
  
  # - WHERE condition: create event_campaign_condition
  event_campaign_condition <- 
    paste0("event_campaign LIKE '%", event_campaign, "%'")
  if (length(event_campaign_condition) > 1) {
    event_campaign_condition <- paste0(
      "(",
      paste(event_campaign_condition, collapse = " OR "),
      ")"
    )
  }
  
  # - compose SQL query:
  sqlParams <- 'mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e'
  query <- paste0(
    "\"SELECT * FROM ", 
    paste0("log.", logSchema), 
    " WHERE ((webHost = '", 
    web_host, 
    "') AND (timestamp > ", 
    start_timestamp, 
    ") AND (timestamp <= ", 
    stop_timestamp, 
    ") AND (", 
    event_campaign_condition,
    "));\"")
  # - quote the output path so that spaces in it do not break the redirect
  sqlOutput <- paste0("> ", shQuote(paste0(dataDir, "/", fileName)))
    
  # - run command
  qCommand <- paste(sqlParams, query, sqlOutput, sep = " ")
  status <- system(command = qCommand, wait = TRUE)
  if (status != 0) {
    stop("mysql exited with status ", status, call. = FALSE)
  }
  
  # - to dataDir
  setwd(dataDir)
  
  # - load; an empty output file is taken to mean "no registrations"
  if (file.size(fileName) == 0) {
    userReg <- data.frame(event_userId = integer(0),
                          event_userName = character(0),
                          event_isSelfMade = integer(0),
                          event_campaign = character(0),
                          timestamp = character(0),
                          stringsAsFactors = FALSE)
  } else {
    userReg <- fread(fileName, sep = "\t")
    
    # - filter bots
    isBot <- grepl("\"is_bot\": true", userReg$userAgent)
    userReg <- userReg[!isBot, ]
    
    # - select fields
    userReg <- userReg %>% 
      dplyr::select(event_userId, 
                    event_userName, 
                    event_isSelfMade, 
                    event_campaign, 
                    timestamp)
  }
  
  # - add cetDay, campaignName (rep() keeps this valid for zero rows)
  userReg$date <- rep(cetDay, nrow(userReg))
  userReg$campaign <- rep(campaignName, nrow(userReg))
  
  # - store:
  write.csv(userReg, 
            paste0(
              strsplit(fileName, split = ".", fixed = TRUE)[[1]][1],
            ".csv")
  )
  
  # - remove temp .tsv file
  file.remove(fileName)
  
}

# - set params for: wmde_collect_registrations
# - EventLogging table, host and campaign tag for the account creations
logSchema <- 'ServerSideAccountCreation_17719237'
web_host <- 'de.wikipedia.org'
event_campaign <- 'WMDE_neweditors_autumn_2018'
# - intermediate per-day output file
fileName <- paste0("userRegistrations_", cetDay, ".tsv")

# - collect user registrations
# - arguments in order: logSchema, web_host, event_campaign, cetDay,
# - dataDir, fileName, campaignName
wmde_collect_registrations(logSchema,
                           web_host,
                           event_campaign,
                           cetDay,
                           dataDir,
                           fileName,
                           campaignName)

### --- Wrangle User Registrations
# - function: wmde_process_registrations
# - wmde_process_registrations: count the registrations per event_campaign
# - in fileName and store the counts, together with cetDay and campaignName,
# - as userRegistrationsAggreagted_<cetDay>.csv in dataDir.
wmde_process_registrations <- function(fileName,
                                       dataDir, 
                                       cetDay, 
                                       campaignName) {
  
  # - work inside the data directory
  setwd(dataDir)
  
  # - libraries
  library(dplyr)
  library(data.table)

  # - read the per-user registration records
  registrations <- fread(fileName)
  
  # - aggregate: one row per campaign tag with its number of registrations
  byCampaign <- dplyr::group_by(
    dplyr::select(registrations, event_campaign),
    event_campaign
  )
  userReg <- dplyr::summarise(byCampaign, Registrations = n())

  # - label the counts with day and campaign
  userReg$date <- cetDay
  userReg$campaign <- campaignName
  
  # - store (the spelling of the file prefix is what the report step looks for)
  outFile <- paste0('userRegistrationsAggreagted_', cetDay, ".csv")
  write.csv(userReg, outFile)
  
}

# - set params for: wmde_process_registrations
# - the per-user file written by the collection step
fileName <- paste0("userRegistrations_", cetDay, ".csv")

# - wrangle user registrations:
wmde_process_registrations(fileName = fileName,
                           dataDir = dataDir,
                           cetDay = cetDay,
                           campaignName = campaignName)

### --- Collect Newsletter registrations - for 2018_AuBC excl.
# - ServerSideAccountCreation_17719237 schema
# - client invocation, quoted SQL statement and output redirect;
# - paste() joins the three parts with single spaces
qCommand <- paste(
  "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e",
  "\"select * from log.ServerSideAccountCreation_17719237 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20181011000000) and (event_campaign like '%WMDE_neweditors_autumn_2018_lpn%'));\"",
  "> /home/goransm/RScripts/NewEditors/2018_AutumnBannerCampaign/_data/Newsletter_userRegistrations.tsv"
)
system(command = qCommand, wait = TRUE)
### --- Wrangle Newsletter registrations - for 2018_AuBC excl.
# - function: wmde_process_registrations_general
# - wmde_process_registrations_general: count the registrations per
# - event_campaign in fileName and store the counts, together with cetDay
# - and campaignName, in outFileName (relative to dataDir).
wmde_process_registrations_general <- function(fileName,
                                       dataDir, 
                                       cetDay, 
                                       campaignName, 
                                       outFileName) {
  
  # - work inside the data directory
  setwd(dataDir)
  
  # - libraries
  library(dplyr)
  library(data.table)
  
  # - read the registration records
  rawRegistrations <- fread(fileName)
  
  # - aggregate: number of registrations per campaign tag
  campaignColumn <- dplyr::select(rawRegistrations, event_campaign)
  groupedByTag <- dplyr::group_by(campaignColumn, event_campaign)
  userReg <- dplyr::summarise(groupedByTag, Registrations = n())
  
  # - label the counts with day and campaign
  userReg$date <- cetDay
  userReg$campaign <- campaignName
  
  # - store:
  write.csv(userReg, outFileName)
  
}

# - set params for: wmde_process_registrations_general
# - wrangle user registrations:
# - input written by the newsletter query, and the aggregated output file
fileName <- 'Newsletter_userRegistrations.tsv'
outFileName <- 'all_Newsletter_userRegistrations.csv'
# - arguments in order: fileName, dataDir, cetDay, campaignName, outFileName
wmde_process_registrations_general(fileName,
                                   dataDir,
                                   cetDay,
                                   campaignName,
                                   outFileName)

0.2 Data Aggregation

NOTE: Not run from this report; the data were already pre-processed and aggregated by the following R script before being submitted to analytical procedures.

### --- Report Generation for the Autumn Banner Campaign 2018
### --- run locally

### --- to data directory
# - local copy of the aggregated daily files (note the trailing slash)
dataDir <- '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_AutumnBannerCampaign/_data/'
# - report files are written here; it becomes the working directory
analyticsDir <- '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2018_AutumnBannerCampaign/_analytics/'
setwd(analyticsDir)

### --- Report Banner Impression Data

# - function: wmde_report_banner_impressions
# - wmde_report_banner_impressions: merge all daily
# - bannerImpressionsAggregated_*.csv/.tsv files found in dataDir and
# - total the impressions per banner and per day.
# - Arguments:
# -   dataDir: directory holding the aggregated files (a trailing slash is optional)
# - Returns: a list with
# -   bannerImpressionsReport: the merged daily records
# -   perBannerTotals: totalImpressions per banner
# -   perDayTotals: totalImpressions per date
# - Stops with an error when no matching file exists.
wmde_report_banner_impressions <- function(dataDir) {
  
  # - Setup
  library(data.table)
  library(dplyr)

  # - list files: keep the aggregated banner impression .csv/.tsv files
  lF <- list.files(dataDir)
  lF <- lF[grepl("bannerImpressionsAggregated_", lF, fixed = TRUE) &
             grepl("csv$|tsv$", lF)]
  if (length(lF) == 0) {
    stop("No bannerImpressionsAggregated_ .csv/.tsv files found in: ",
         dataDir, call. = FALSE)
  }
  
  # - load files and merge; file.path() supplies the separator, so dataDir
  # - works with or without a trailing slash
  filePaths <- file.path(sub("/+$", "", dataDir), lF)
  bannerData <- rbindlist(lapply(filePaths, fread))
  # - drop the row-name column if present
  # - NOTE(review): presumably added by write.csv upstream - confirm
  bannerData$V1 <- NULL
  
  # - aggregates
  perBannerTotals <- bannerData %>% 
    dplyr::select(banner, impressions) %>% 
    dplyr::group_by(banner) %>% 
    dplyr::summarise(totalImpressions = sum(impressions))
  perDayTotals <- bannerData %>% 
    dplyr::select(date, impressions) %>% 
    dplyr::group_by(date) %>% 
    dplyr::summarise(totalImpressions = sum(impressions))
  
  # - output
  list(bannerImpressionsReport = bannerData, 
       perBannerTotals = perBannerTotals, 
       perDayTotals = perDayTotals)
    
}

# - Report banner impressions
# - build the report once, then unpack and store each table
bannerImpressionsData <- wmde_report_banner_impressions(dataDir)

# - merged daily records
bannerImpressionsFile <- bannerImpressionsData[["bannerImpressionsReport"]]
write.csv(bannerImpressionsFile, "bannerImpressionsFile.csv")

# - totals per banner
bannerTotals <- bannerImpressionsData[["perBannerTotals"]]
write.csv(bannerTotals, "bannerTotals.csv")

# - totals per day
bannerDayTotals <- bannerImpressionsData[["perDayTotals"]]
write.csv(bannerDayTotals, "bannerDayTotals.csv")


### --- Report Pageviews Data

# - function: wmde_report_pageviews
# - wmde_report_pageviews: merge all daily pageviewsAggregated_*.csv/.tsv
# - files found in dataDir and total the pageviews along every combination
# - of day, tag and page used in the report.
# - Arguments:
# -   dataDir: directory holding the aggregated files (a trailing slash is optional)
# - Returns: a list with
# -   pageviewsDataReport: the merged daily records
# -   perDayTotals, perTagTotals, perPageTotals: totalPageviews per date/Tag/Page
# -   perPageDayTotals, perTagDayTotals, perTagPageTotals: two-way totals
# - Stops with an error when no matching file exists.
wmde_report_pageviews <- function(dataDir) {
  
  # - Setup
  library(data.table)
  library(dplyr)
  
  # - list files: keep the aggregated pageviews .csv/.tsv files
  lF <- list.files(dataDir)
  lF <- lF[grepl("pageviewsAggregated_", lF, fixed = TRUE) &
             grepl("csv$|tsv$", lF)]
  if (length(lF) == 0) {
    stop("No pageviewsAggregated_ .csv/.tsv files found in: ",
         dataDir, call. = FALSE)
  }
  
  # - load files and merge; file.path() supplies the separator, so dataDir
  # - works with or without a trailing slash
  filePaths <- file.path(sub("/+$", "", dataDir), lF)
  pageviewsData <- rbindlist(lapply(filePaths, fread))
  # - drop the row-name column if present
  # - NOTE(review): presumably added by write.csv upstream - confirm
  pageviewsData$V1 <- NULL
  
  # - aggregates
  perDayTotals <- pageviewsData %>% 
    dplyr::select(date, Pageviews) %>% 
    dplyr::group_by(date) %>% 
    dplyr::summarise(totalPageviews = sum(Pageviews))
  perTagTotals <- pageviewsData %>% 
    dplyr::select(Tag, Pageviews) %>% 
    dplyr::group_by(Tag) %>% 
    dplyr::summarise(totalPageviews = sum(Pageviews))
  perPageTotals <- pageviewsData %>% 
    dplyr::select(Page, Pageviews) %>% 
    dplyr::group_by(Page) %>% 
    dplyr::summarise(totalPageviews = sum(Pageviews))
  perPageDayTotals <- pageviewsData %>% 
    dplyr::select(Page, date, Pageviews) %>%
    dplyr::group_by(Page, date) %>% 
    dplyr::summarise(totalPageviews = sum(Pageviews))
  perTagDayTotals <- pageviewsData %>% 
    dplyr::select(Tag, date, Pageviews) %>%
    dplyr::group_by(Tag, date) %>% 
    dplyr::summarise(totalPageviews = sum(Pageviews))
  perTagPageTotals <- pageviewsData %>% 
    dplyr::select(Tag, Page, Pageviews) %>%
    dplyr::group_by(Tag, Page) %>% 
    dplyr::summarise(totalPageviews = sum(Pageviews))
    
  # - output
  list(pageviewsDataReport = pageviewsData, 
       perDayTotals = perDayTotals, 
       perTagTotals = perTagTotals, 
       perPageTotals = perPageTotals, 
       perPageDayTotals = perPageDayTotals, 
       perTagDayTotals = perTagDayTotals,
       perTagPageTotals = perTagPageTotals)
  
}

# - Report pageviews:
# - build the report once, then unpack and store each table
pageviewsData <- wmde_report_pageviews(dataDir)

# - merged daily records
pageviewsReportFile <- pageviewsData[["pageviewsDataReport"]]
write.csv(pageviewsReportFile, "pageviewsReportFile.csv")

# - one-way totals: per day, per tag, per page
pageviews_perDayTotals <- pageviewsData[["perDayTotals"]]
write.csv(pageviews_perDayTotals, "pageviews_perDayTotals.csv")
pageviews_perTagTotals <- pageviewsData[["perTagTotals"]]
write.csv(pageviews_perTagTotals, "pageviews_perTagTotals.csv")
pageviews_perPageTotals <- pageviewsData[["perPageTotals"]]
write.csv(pageviews_perPageTotals, "pageviews_perPageTotals.csv")

# - two-way totals: page x day, tag x day, tag x page
pageviews_perPageDayTotals <- pageviewsData[["perPageDayTotals"]]
write.csv(pageviews_perPageDayTotals, "pageviews_perPageDayTotals.csv")
pageviews_perTagDayTotals <- pageviewsData[["perTagDayTotals"]]
write.csv(pageviews_perTagDayTotals, "pageviews_perTagDayTotals.csv")
pageviews_perTagPageTotals <- pageviewsData[["perTagPageTotals"]]
write.csv(pageviews_perTagPageTotals, "perTagPageTotals.csv")

### --- Report User Registrations

# - function: wmde_report_registrations
# - wmde_report_registrations: merge all daily
# - userRegistrationsAggreagted_*.csv/.tsv files found in dataDir and total
# - the registrations per day, per campaign tag, and per tag and day.
# - Arguments:
# -   dataDir: directory holding the aggregated files (a trailing slash is optional)
# - Returns: a list with
# -   registrationsDataReport: the merged daily records
# -   perDayTotals: totalRegistrations per date
# -   perTagTotals: totalRegistrations per event_campaign
# -   perTagDayTotals: totalRegistrations per event_campaign and date
# - Stops with an error when no matching file exists.
wmde_report_registrations <- function(dataDir) {
  
  # - Setup
  library(data.table)
  library(dplyr)
  
  # - list files: keep the aggregated user registration .csv/.tsv files
  # - (the prefix is spelled the way the aggregation step writes it)
  lF <- list.files(dataDir)
  lF <- lF[grepl("userRegistrationsAggreagted_", lF, fixed = TRUE) &
             grepl("csv$|tsv$", lF)]
  if (length(lF) == 0) {
    stop("No userRegistrationsAggreagted_ .csv/.tsv files found in: ",
         dataDir, call. = FALSE)
  }
  
  # - load files and merge; file.path() supplies the separator, so dataDir
  # - works with or without a trailing slash
  filePaths <- file.path(sub("/+$", "", dataDir), lF)
  registrationData <- rbindlist(lapply(filePaths, fread))
  # - drop the row-name column if present
  # - NOTE(review): presumably added by write.csv upstream - confirm
  registrationData$V1 <- NULL
  
  # - aggregates
  perDayTotals <- registrationData %>% 
    dplyr::select(date, Registrations) %>% 
    dplyr::group_by(date) %>% 
    dplyr::summarise(totalRegistrations = sum(Registrations))
  perTagTotals <- registrationData %>% 
    dplyr::select(event_campaign, Registrations) %>% 
    dplyr::group_by(event_campaign) %>% 
    dplyr::summarise(totalRegistrations = sum(Registrations))
  perTagDayTotals <- registrationData %>% 
    dplyr::select(event_campaign, date, Registrations) %>% 
    dplyr::group_by(event_campaign, date) %>% 
    dplyr::summarise(totalRegistrations = sum(Registrations))
  
  # - output
  list(registrationsDataReport = registrationData, 
       perDayTotals = perDayTotals, 
       perTagTotals = perTagTotals, 
       perTagDayTotals = perTagDayTotals)
  
}

# - Report upon user registrations
# - build the report once, then unpack and store each table
userRegData <- wmde_report_registrations(dataDir)

# - merged daily records
userRegistrationsReportFile <- userRegData[["registrationsDataReport"]]
write.csv(userRegistrationsReportFile, "userRegistrationsReportFile.csv")

# - totals per day
userRegistrations_perDayTotals <- userRegData[["perDayTotals"]]
write.csv(userRegistrations_perDayTotals, "userRegistrations_perDayTotals.csv")

# - totals per campaign tag
userRegistrations_perTagTotals <- userRegData[["perTagTotals"]]
write.csv(userRegistrations_perTagTotals, "userRegistrations_perTagTotals.csv")

# - totals per campaign tag and day
userRegistrations_perTagDayTotals <- userRegData[["perTagDayTotals"]]
write.csv(userRegistrations_perTagDayTotals, "userRegistrations_perTagDayTotals.csv")

1. Campaign Banners and Pages

This section presents all data and statistics on the campaign banners and pages.

1.2 Pageviews

1.2.1 Pageviews Overview

Chart 1.2.1. Pageviews Overview. Log scaling of the pageviews is necessary; the numbers reported in the data point labels are exact.

# - Chart 1.2.1: daily pageviews per campaign page.
# - The y position is log10(totalPageviews); the point labels show the raw counts.
dataSet <- read.csv(
  '_analytics/pageviews_perPageDayTotals.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - shorten the page names by dropping the common wiki prefixes
dataSet$Page <- gsub("/wiki/Wikipedia:Wikipedia_|/wiki/Wikipedia:Wikimedia_", "", dataSet$Page)
# - Visualize w. {ggplot2}
# - the y scale is added once only: a second scale_y_continuous() merely
# - replaced the first one and made ggplot2 emit a "Scale for 'y' is already
# - present" message into the report
ggplot(dataSet, aes(x = date,
                    y = log10(totalPageviews),
                    group = Page,
                    color = Page,
                    fill = Page,
                    label = totalPageviews)) + 
  geom_path(size = .25) + 
  geom_point(size = 1.5) +
  # - smaller white point on top gives the markers a hollow look
  geom_point(size = 1, color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('Autumn Banner Campaign 2018: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")

1.2.1 Pageviews Overview: Table

Table 1.2.1. Pageviews Overview

### --- Full Dataset (Table Report)
# - interactive table, largest pageview counts first
datatable(arrange(dataSet, desc(totalPageviews)))

1.2.2 Pageviews Overview: totals per Page

Chart 1.2.2. Pageviews Overview: totals per Page

# - Chart 1.2.2: total pageviews per campaign page, bars ordered from the
# - most to the least viewed page.
# - The bar height is log(totalPageviews), i.e. the natural logarithm; the
# - labels show the raw counts.
# - NOTE(review): chart 1.2.1 uses log10() - confirm the natural log is intended here
dataSet <- read.csv(
  '_analytics/pageviews_perPageTotals.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - shorten the page names by dropping the common wiki prefixes
dataSet$Page <- gsub("/wiki/Wikipedia:Wikipedia_|/wiki/Wikipedia:Wikimedia_", "", dataSet$Page)
# - factor levels in descending order of pageviews fix the bar order
dataSet$Page <- factor(dataSet$Page, 
                       levels = dataSet$Page[order(-dataSet$totalPageviews)])
# - the y scale is added once only: a second scale_y_continuous() merely
# - replaced the first one and made ggplot2 emit a "Scale for 'y' is already
# - present" message into the report
ggplot(dataSet, aes(x = Page, 
                    y = log(totalPageviews), 
                    color = Page,
                    fill = Page,
                    label = totalPageviews)) + 
  geom_bar(width = .5, stat = "identity") + 
  scale_y_continuous(labels = comma) +
  ggtitle('Autumn Banner Campaign 2018: Banner Total Pageviews') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, color = "white", show.legend = FALSE) + 
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  # - x tick labels are hidden (the legend identifies the pages); the earlier
  # - angle/size setting for them had no effect and was removed
  theme(axis.text.x = element_blank())

1.2.3 Pageviews Overview: totals per day

Chart 1.2.3. Pageviews Overview: totals per day

# - Chart 1.2.3: total pageviews per day (all pages together)
dataSet <- read.csv('_analytics/pageviews_perDayTotals.csv',
                    header = TRUE,
                    row.names = 1,
                    check.names = FALSE,
                    stringsAsFactors = FALSE)
# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = date,
                    y = totalPageviews,
                    label = totalPageviews)) +
  # - group = 1: connect all days with a single path
  geom_path(size = .25, group = 1, color = "darkblue") +
  geom_point(size = 1.5, color = "darkblue") +
  # - smaller white point on top: draws each data point as a ring
  geom_point(size = 1, color = "white") +
  # - the y scale is added once only: adding it twice makes ggplot2
  # - replace the first one and emit a "Scale for 'y' is already present" message
  scale_y_continuous(labels = comma) +
  ggtitle('Autumn Banner Campaign 2018: Pageviews') +
  theme_minimal() +
  geom_label_repel(size = 3.5, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90, size = 8),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        legend.position = "right")
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.

1.2.3 Pageviews Overview: totals per Tag/Page

Chart 1.2.3. Pageviews Overview: totals per Tag/Page

# - Chart 1.2.3: total pageviews per campaign tag and page
dataSet <- read.csv('_analytics/perTagPageTotals.csv',
                    header = TRUE,
                    row.names = 1,
                    check.names = FALSE,
                    stringsAsFactors = FALSE)
campaignTags <- c('WMDE_neweditors_autumn_2018_lp1',
                  'WMDE_neweditors_autumn_2018_lp1m',
                  'WMDE_neweditors_autumn_2018_lpn')
# - keep only the rows whose Tag contains one of the campaign tags
dataSet <- filter(dataSet,
                  grepl(paste(campaignTags, collapse = "|"), Tag))
# - shorten the tags; 'lp1m' must be relabelled before 'lp1',
# - because the 'lp1' campaign tag is a prefix of the 'lp1m' campaign tag
dataSet$Tag[grepl(campaignTags[2], dataSet$Tag, fixed = TRUE)] <- "lp1m"
dataSet$Tag[grepl(campaignTags[1], dataSet$Tag, fixed = TRUE)] <- "lp1"
# - NOTE: the short label is 'lpn' (was 'lp1n'), matching the campaign tag
# - and the label used in the user registrations section
dataSet$Tag[grepl(campaignTags[3], dataSet$Tag, fixed = TRUE)] <- "lpn"
# - shorten the page names: drop the common URL path prefix
dataSet$Page <- gsub("/wiki/Wikipedia:Wikipedia_|/wiki/Wikipedia:Wikimedia_", "", dataSet$Page)
# - aggregate: total pageviews per (Tag, Page)
dataSet <- dataSet %>%
  group_by(Tag, Page) %>%
  summarise(totalPageviews = sum(totalPageviews)) %>%
  ungroup()
# - Visualize w. {ggplot2}: point size encodes the total pageviews
dataSet %>%
  ggplot(aes(x = Tag,
             y = Page,
             color = Tag,
             label = totalPageviews)) +
  ggtitle('Autumn Banner Campaign 2018: Pageviews per Tag') +
  geom_point(aes(size = totalPageviews), shape = 19) +
  geom_text_repel(size = 3, nudge_x = .3, show.legend = FALSE) +
  xlab("Tags") + ylab("Pages") +
  theme_bw() +
  theme(panel.background = element_rect(fill = "white"))

1.2.4 Pageviews per Tag, daily totals

Chart 1.2.4. Pageviews per Tag, daily totals. Log scaling of the pageviews is necessary; the numbers reported in the data point labels are exact.

# - Chart 1.2.4: daily pageviews per campaign tag
# - (y axis: log10 of the pageviews; data point labels: exact counts)
dataSet <- read.csv('_analytics/pageviews_perTagDayTotals.csv',
                    header = TRUE,
                    row.names = 1,
                    check.names = FALSE,
                    stringsAsFactors = FALSE)
campaignTags <- c('WMDE_neweditors_autumn_2018_lp1',
                  'WMDE_neweditors_autumn_2018_lp1m',
                  'WMDE_neweditors_autumn_2018_lpn')
# - keep only the rows whose Tag contains one of the campaign tags
dataSet <- filter(dataSet,
                  grepl(paste(campaignTags, collapse = "|"), Tag))
# - shorten the tags; 'lp1m' must be relabelled before 'lp1',
# - because the 'lp1' campaign tag is a prefix of the 'lp1m' campaign tag
dataSet$Tag[grepl(campaignTags[2], dataSet$Tag, fixed = TRUE)] <- "lp1m"
dataSet$Tag[grepl(campaignTags[1], dataSet$Tag, fixed = TRUE)] <- "lp1"
# - NOTE: the short label is 'lpn' (was 'lp1n'), matching the campaign tag
# - and the label used in the user registrations section
dataSet$Tag[grepl(campaignTags[3], dataSet$Tag, fixed = TRUE)] <- "lpn"
# - aggregate: total pageviews per (Tag, date)
dataSet <- dataSet %>%
  group_by(Tag, date) %>%
  summarise(totalPageviews = sum(totalPageviews)) %>%
  ungroup()
# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = date,
                    y = log10(totalPageviews),
                    group = Tag,
                    color = Tag,
                    fill = Tag,
                    label = totalPageviews)) +
  geom_path(size = .25) +
  geom_point(size = 1.5) +
  # - smaller white point on top: draws each data point as a ring
  geom_point(size = 1, color = "white") +
  # - the y scale is added once only: adding it twice makes ggplot2
  # - replace the first one and emit a "Scale for 'y' is already present" message
  scale_y_continuous(labels = comma) +
  ggtitle('Autumn Banner Campaign 2018: Pageviews per Tag, daily totals') +
  theme_minimal() +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90, size = 8),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        legend.position = "top")
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.

2. User Registrations

All data on user registrations are presented in this section.

2.1 Registrations per tag and day

Chart 2.1 Registrations per tag and day. Please note: points with no data labels signify 0 user registrations.

# - Chart 2.1: user registrations per campaign tag and day
# - Standard registrations
dataSet <- read.csv('_analytics/userRegistrationsReportFile.csv',
                    header = TRUE,
                    row.names = 1,
                    check.names = FALSE,
                    stringsAsFactors = FALSE)
# - drop the 'lpn' rows here: the 'lpn' registrations
# - are taken from the Newsletter file below
dataSet <- dataSet %>%
  filter(!(event_campaign %in% 'WMDE_neweditors_autumn_2018_lpn'))
dataSet$campaign <- NULL
# - Newsletter registrations
dataSet2 <- read.delim('_analytics/Newsletter_userRegistrations.tsv',
                       sep = "\t",
                       header = TRUE,
                       row.names = 1,
                       check.names = FALSE,
                       stringsAsFactors = FALSE)
# - count the registrations per (event_campaign, date);
# - date is built from the first eight characters of the timestamp
# - (YYYYMMDD... -> YYYY-MM-DD), vectorized over the whole column
dataSet2 <- dataSet2 %>%
  select(event_campaign, timestamp) %>%
  mutate(timestamp = as.character(timestamp),
         date = paste(substr(timestamp, 1, 4),
                      substr(timestamp, 5, 6),
                      substr(timestamp, 7, 8),
                      sep = "-")) %>%
  group_by(event_campaign, date) %>%
  summarise(Registrations = n()) %>%
  ungroup() %>%
  # - column order: event_campaign, Registrations, date
  # - NOTE(review): this is assumed to be the column order of dataSet,
  # - as implied by the original positional reordering - confirm
  select(event_campaign, Registrations, date)
# - combine standard and newsletter registrations
dataSet <- rbindlist(list(dataSet, dataSet2))
# - shorten the tags: drop the common campaign prefix
dataSet$event_campaign <- gsub("WMDE_neweditors_autumn_2018_", "", dataSet$event_campaign)
# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = date,
                    y = Registrations,
                    group = event_campaign,
                    color = event_campaign,
                    fill = event_campaign,
                    label = Registrations)) +
  geom_path(size = .25) +
  geom_point(size = 1.5) +
  # - smaller white point on top: draws each data point as a ring
  geom_point(size = 1, color = "white") +
  # - the y scale is added once only: adding it twice makes ggplot2
  # - replace the first one and emit a "Scale for 'y' is already present" message
  scale_y_continuous(labels = comma) +
  ggtitle('Autumn Banner Campaign 2018: Registrations per Tag') +
  theme_minimal() +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90, size = 8),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        legend.position = "top")
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.

### --- Full Dataset (Table Report)
# - the first column is renamed to Tag in place
names(dataSet)[1] <- "Tag"
# - rows sorted by Tag, then date, then Registrations (largest first)
dataSet %>%
  arrange(Tag, date, desc(Registrations)) %>%
  datatable()

2.2 Total registrations per tag

Chart 2.2 Total registrations per tag.

# - Chart 2.2: total user registrations per campaign tag
# - aggregate: sum of Registrations per Tag (overwrites dataSet)
dataSet <- dataSet %>%
  group_by(Tag) %>%
  summarise(totalRegistrations = sum(Registrations))
# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = Tag,
                    y = totalRegistrations,
                    color = Tag,
                    fill = Tag,
                    label = totalRegistrations)) +
  geom_bar(width = .5, stat = "identity") +
  # - the y scale is added once only: adding it twice makes ggplot2
  # - replace the first one and emit a "Scale for 'y' is already present" message
  scale_y_continuous(labels = comma) +
  ggtitle('Autumn Banner Campaign 2018: Total Registrations per Tag') +
  theme_minimal() +
  geom_label_repel(size = 3.5, color = "white", show.legend = FALSE) +
  ylab("Registrations") +
  theme(axis.text.x = element_text(angle = 90, size = 8),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        legend.position = "right")
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.

3. User Edits

All data on user edits are presented in this section.

3.1 User edits: distribution

# - Table 3.1: number of users per edit class
dataSet <- read.csv('_analytics/allUserEdits_2018_AuBC.csv',
                    header = TRUE,
                    row.names = 1,
                    check.names = FALSE,
                    stringsAsFactors = FALSE)
# - Edit classes: 0-1 | 2-4 | 5-9 | 10-49 | >= 50
# - editBoundaries: inclusive (lower, upper) bounds of the closed classes;
# - everything from the last upper bound + 1 (i.e. 50) on is the open class
editBoundaries <- list(
  c(0, 1),
  c(2, 4),
  c(5, 9),
  c(10, 49)
)
classLower <- vapply(editBoundaries, function(b) b[1], numeric(1))
classUpper <- vapply(editBoundaries, function(b) b[2], numeric(1))
# - class labels, in ascending order of the classes
classLabels <- c(paste0("(", classLower, " - ", classUpper, ")"), ">= 50")
# - vectorized classification w. cut(): left-closed intervals
# - [0, 2), [2, 5), [5, 10), [10, 50), [50, Inf);
# - NA or negative edit counts result in an NA class (and are thus
# - not counted by table() below) instead of an error or a wrong class
dataSet$editClass <- as.character(
  cut(dataSet$edits,
      breaks = c(classLower, max(classUpper) + 1, Inf),
      labels = classLabels,
      right = FALSE)
)
# - number of users per edit class (only the classes present in the data)
editClass <- as.data.frame(table(dataSet$editClass),
                           stringsAsFactors = FALSE)
colnames(editClass) <- c('Edit Class', 'Num.Users')
# - present the classes in ascending order
editClass <- arrange(editClass, match(`Edit Class`, classLabels))
datatable(editClass)
