Feedback should be send to goran.milovanovic_ext@wikimedia.de.

The campaign was run from 2020/05/14 to 2020/05/27.

CURRENT UPDATE: Complete dataset as of 2020/05/27.

0. Data Acquisiton

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script 2020_OccasionalEditors_PRODUCTION.R on stat1004.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally. A daily crontab job was run from 2020/05/14 to 2020/05/27 to collect the data for daily reporting. The data used in this report are aggregates of the daily datasets, sanitized and anonymized.

0.1 Daily Update

### --- WMDE 2020_OccasionalEditors_PRODUCTION.R
# - Campaign start: 2020/05/14
# - Campaign end: 2020/05/27
# - run from: stat1004
# - path: 

### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)

### --- dir structure
campaignPath <- paste0(getwd(), "/")
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
### --- campaign specifics
campaignName <- 'OccasionalEditors2020'

### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
  strsplit(as.character(cetDay), 
           split = " ", 
           fixed = T)[[1]][1]
) - 1

### ----------------------------------------------------------
### --- Banner Impressions
### ----------------------------------------------------------

# - function: wmde_collect_banner_impressions
wmde_collect_banner_impressions <- function(uri_host, 
                                            uri_path, 
                                            uri_query, 
                                            cetDay,
                                            queryFile,
                                            fileName,
                                            dataDir) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  # - WHERE condition: create uri_path_condition
  if (length(uri_path) > 1) {
    uri_path_condition <- paste0("(",
                                 paste(
                                   paste0("uri_path = '", uri_path, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_path_condition = paste0("uri_path = '", uri_path, "'")
  }
  
  # - WHERE condition: create uri_host_condition
  if (length(uri_host) > 1) {
    uri_host_condition <- paste0("(",
                                 paste(
                                   paste0("uri_host = '", uri_host, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_host_condition = paste0("uri_host = '", uri_host, "'")
  }
  
  # - WHERE condition: create uri_query_condition
  if (length(uri_query) > 1) {
    uri_query_condition <- paste0("(",
                                  paste(
                                    paste0("uri_query LIKE '%", uri_query, "%'"),
                                    collapse = " OR ", sep = " "),
                                  ")"
    )
  } else {
    uri_query_condition = paste0("uri_query LIKE '%", uri_query, "%'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;
    SELECT uri_query FROM webrequest
    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    uri_query_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
    )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  hiveArgs <- 'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata /usr/local/bin/beeline --silent --incremental=true --verbose=false -f'
  hiveInput <- paste0(queryFile, ' > ', fileName)
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  return(
    system(command = hiveCommand, wait = TRUE)
  )
}

# - set params to wmde_collect_banner_impressions
# - for the OccasionalEditors2020
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path  <- '/beacon/impression'
uri_query <- c('WMDE_oceditors_spring_2020_')
queryFile <- 'OccasionalEditors2020_BannerImpressions.hql'
fileName <- paste0(dataDir, "bannerImpressions_", cetDay, ".tsv")

# - collect Banner Impression data
wmde_collect_banner_impressions(uri_host,
                                uri_path,
                                uri_query,
                                cetDay,
                                queryFile,
                                fileName,
                                dataDir)

# - function: wmde_process_banner_impressions
wmde_process_banner_impressions <- function(fileName,
                                            dataDir, 
                                            cetDay, 
                                            campaignName, 
                                            uri_query) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  
  # - load
  bannerData <- tryCatch({
    as.data.frame(fread(fileName))
    },
    error = function(condition) {
      return(FALSE)
  })
  # - process
  if (class(bannerData) == 'logical') {
    return(FALSE) 
  } else {
    # - clean
    bannerData <- dplyr::filter(bannerData,
                                uri_query != "")
    # - split
    bannerData <- tidyr::separate(bannerData, 
                                  col = uri_query, 
                                  into = c('country', 
                                           'region',
                                           'anonymous',
                                           'project',
                                           'db',
                                           'uselang', 
                                           'device', 
                                           'debug', 
                                           'randomcampaign',
                                           'randombanner',
                                           'recordImpressionSampleRate',
                                           'impressionEventSampleRate',
                                           'campaignStatuses',
                                           'status',
                                           'statusCode',
                                           'campaign',
                                           'campaignCategory',
                                           'campaignCategoryUsesLegacy',
                                           'bucket',
                                           'banner',
                                           'bannerCategory', 
                                           'result'),
                                           sep = "&") %>% 
      dplyr::select(banner, device, recordImpressionSampleRate, result)
    # - filter for uri_query
    bannerData <- bannerData[grepl(uri_query, bannerData$banner), ]
    # - clean relevant fields
    # - banner:
    bannerData$banner <- gsub("^banner=", "", bannerData$banner)
    # - recordImpressionSampleRate:
    bannerData$recordImpressionSampleRate <- as.numeric(
      gsub("^recordImpressionSampleRate=", "", bannerData$recordImpressionSampleRate)
    )
    # - device:
    bannerData$device <- gsub("^device=", "", bannerData$device)
    # - result:
    bannerData$result <- gsub("^result=", "", bannerData$result)
    # - filter for result=show
    bannerData <- dplyr::filter(bannerData,
                                result == "show")
    # - correction for recordImpressionSampleRate
    bannerData$recordImpressionSampleRate <- 
      1/bannerData$recordImpressionSampleRate
    
    # - aggregate:
    bannerData <- bannerData %>% 
      dplyr::select(banner, device, recordImpressionSampleRate) %>% 
      dplyr::group_by(banner, device) %>% 
      dplyr::summarise(impressions = sum(recordImpressionSampleRate))
    
    # - add cetDay, me
    bannerData$date <- cetDay
    bannerData$campaign <- campaignName
    
    # - store:
    write.csv(bannerData, 
              paste0(analyticsDir, "bannerImpressions",
                     cetDay,
                     ".csv"
              )
    )
    
    # - return
    return(TRUE)
  }
}

# - wrangle Banner Impression data
campaignName <- "OccasionalEditors2020"
uri_query <- c('WMDE_oceditors_spring_2020_')
bannerProcess <- wmde_process_banner_impressions(fileName = fileName,
                                                 dataDir = dataDir,
                                                 cetDay = cetDay,
                                                 campaignName = campaignName,
                                                 uri_query = uri_query)



### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------

# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  # - WHERE condition: create uri_host_condition
  if (length(uri_host) > 1) {
    uri_host_condition <- paste0("(",
                                  paste(
                                    paste0("uri_host = '", uri_host, "'"),
                                    collapse = " OR ", sep = " "),
                                  ")"
    )
  } else {
    uri_host_condition = paste0("uri_host = '", uri_host, "'")
  }
  
  # - WHERE condition: create uri_path_condition
  if (length(uri_path) > 1) {
    uri_path_condition <- paste0("(",
                                 paste(
                                   paste0("uri_path = '", uri_path, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_path_condition = paste0("uri_path = '", uri_path, "'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "USE wmf;
    SELECT uri_host, uri_path, uri_query, referer FROM webrequest
    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
    )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
}

# - set params to wmde_collect_pageviews
# - for the 2020_EmailCampaignWikipediaChallenge
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path  <- c(
  '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
  '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur',
  '/wiki/Wikipedia:Mentorenprogramm')
# uri_query <- paste0('WMDE_2020_challenge_', 1:30)
queryFile <- 'OccasionalEditors2020_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")

# - collect Pageviews data
wmde_collect_pageviews(uri_host,
                       uri_path,
                       cetDay,
                       queryFile,
                       fileName,
                       dataDir)

### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
                                   dataDir, 
                                   uri_query_filter,
                                   cetDay = cetDay,
                                   campaignName = campaignName) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  
  # - load
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("uri_host", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 2):(length(pageviewsData) - 2)]
  pageviewsData <- data.frame(dat = pageviewsData, 
                              stringsAsFactors = F)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  
  # - apply uri_query_filter
  # - NOTE: looking in both: uri_query, referer 
  # - NOTE: hack for the 2020_OccasionalEditors Campaign
  # - include pageviews for 
  # - '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur',
  # - '/wiki/Wikipedia:Mentorenprogramm'
  # - on dewiki where grepl(w_uri_query, referer)
  w_uri_query_referer <- which(grepl(uri_query_filter, pageviewsData$referer) & 
                                 !grepl(uri_query_filter, pageviewsData$uri_query))
  if (length(w_uri_query_referer) > 0) {
    refererTags <- strsplit(pageviewsData$referer[w_uri_query_referer], 
                            split = "?",
                            fixed = T)
    refererTags <- sapply(refererTags, function(x) {x[2]})
    refererTags <- paste0("?", refererTags)
    pageviewsData$uri_query[w_uri_query_referer] <- refererTags 
  }
  w_uri_query <- which(grepl(uri_query_filter, pageviewsData$uri_query))
  
  if (length(w_uri_query) > 0) {
    
    # - filter for w_uri_query
    pageviewsData <- pageviewsData[w_uri_query, ] 
    
    # - aggregate:
    pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
    pageviewsData$uri_host <- NULL
    pageviewsData$referer <- NULL
    pageviewsData <- pageviewsData %>% 
      dplyr::select(uri_query, uri_path) %>% 
      dplyr::group_by(uri_query, uri_path) %>% 
      dplyr::summarise(pageviews = n())
    colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
    
    # - add cetDay, campaignName
    pageviewsData$date <- cetDay
    pageviewsData$campaign <- campaignName
    
    # - store:
    write.csv(pageviewsData, 
              paste0(analyticsDir, 
                     "pageviewsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    
  }
  
}

# - set params to wmde_process_pageviews
# - for the WMDE 2020_EmailCampaignWikipediaChallenge
uri_query_filter <- 'WMDE_oceditors_spring_2020_'

# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
                       dataDir = dataDir,
                       uri_query_filter = uri_query_filter, 
                       cetDay = cetDay,
                       campaignName = campaignName)

### ----------------------------------------------------------
### --- Banner Actions
### --- via event.WMDEBannerActions
### ----------------------------------------------------------

### ___ NOTE:
# - Suffix explanation:
# - ctrl is the banner that dynamically displays text depending on the target group
# - var is the banner that shows the same text for both target groups
# - ipad/mobile does not represent the actual device type, but the display mode of the banner (small desktop screens may be reported as ipad)
# - cs/nt indicates, which target group the user belongs to
# - Kai Nissen in https://phabricator.wikimedia.org/T251535#6132468

# - select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID 
# - from event.wmdebannerinteractions where year=2020 and month=5 and (day=11 or day=12 or day=13);

# - function: wmde_collect_pageviews
wmde_banner_actions <- function(uri_query_filter,
                                cetDay,
                                queryFile,
                                fileName,
                                analyticsDir,
                                campaignName) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  

  # - WHERE condition: create eventBannerName_condition
  if (length(uri_query) > 1) {
    eventBannerName_condition <- paste0("(",
                                        paste(
                                          paste0("event.bannerName LIKE '%", uri_query_filter, "%'"),
                                          collapse = " OR ", sep = " "),
                                        ")"
    )
  } else {
    eventBannerName_condition = paste0("event.bannerName LIKE '%", uri_query_filter, "%'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID from event.wmdebannerinteractions 
    WHERE (",
    eventBannerName_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
    )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
  
  # - Wrangle Banner Interactions
  # - load
  bannerData <- tryCatch({
    as.data.frame(fread(paste0(dataDir, fileName)))
  },
  error = function(condition) {
    return(FALSE)
  })
  
  # - process
  if (class(bannerData) == 'logical') {
    return(FALSE) 
  } else { 
    # - bannerSeen
    bannerSeen <- bannerData %>% 
      dplyr::select(bannername, userid)
    bannerSeen <- bannerSeen[!duplicated(bannerSeen), ]
    bannerSeen <- bannerSeen %>% 
      dplyr::select(bannername) %>% 
      dplyr::group_by(bannername) %>% 
      dplyr::summarise(seen_by = n())
    # - bannerClosed
    bannerClosed <- bannerData %>% 
      dplyr::filter(banneraction == "banner-closed") %>% 
      dplyr::select(bannername, bannerimpressions) %>% 
      dplyr::group_by(bannername) %>%
      dplyr::summarise(closed_by = n(), mean_close_imp = round(mean(bannerimpressions), 2))
    # - bannerClicked
    bannerClicked <- bannerData %>% 
      dplyr::filter(banneraction == "banner-clicked") %>% 
      dplyr::select(bannername, bannerimpressions) %>% 
      dplyr::group_by(bannername) %>%
      dplyr::summarise(clicked_by = n(), mean_click_imp = round(mean(bannerimpressions), 2))
    # - whoClicked
    whoClicked <- bannerData %>% 
      dplyr::filter(banneraction == "banner-clicked")
    whoClicked <- data.frame(userid = unique(whoClicked$userid))
    # - store:
    write.csv(whoClicked, 
              paste0(analyticsDir, 
                     "whoClicked_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    # - join
    bannerData <- bannerSeen %>% 
      dplyr::left_join(bannerClosed, 'bannername') %>% 
      dplyr::left_join(bannerClicked, 'bannername')
    bannerData$close_rate <- round(bannerData$closed_by/bannerData$seen_by, 2)
    bannerData$click_rate <- round(bannerData$clicked_by/bannerData$seen_by, 2)
    # - date, campaign
    bannerData$day <- cetDay
    bannerData$campaign <- campaignName
    
    # - store:
    write.csv(bannerData, 
              paste0(analyticsDir, 
                     "bannerInteractionsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
  }

}

# - set params for wmde_banner_actions()
queryFile <- paste0(campaignName, "_bannerInteractions.hql")
fileName <- paste0("bannerInteractions_", cetDay, ".tsv")
uri_query_filter <- 'WMDE_oceditors_spring_2020_'
banner_status <- wmde_banner_actions(uri_query_filter = uri_query_filter,
                                     cetDay = cetDay,
                                     queryFile = queryFile,
                                     fileName = fileName,
                                     analyticsDir = analyticsDir, 
                                     campaignName = campaignName)

1. Campaign Banners

This section presents all data and statistics on the campaign banners. The following chunk loads and then re-structures the dataset a bit:

lF <- list.files('_analytics')
lF <- lF[grepl("^bannerInteractions", lF)]
dataSet <- lapply(paste0("_analytics/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - NAs to zero
dataSet[which(is.na(dataSet), arr.ind = T)] <- 0
# - banner codes
dataSet$var_ctrl <- sapply(dataSet$bannername, function(x) {
  if (grepl("var", x)) {return("var")} else {return("ctrl")}
})
dataSet$device <- sapply(dataSet$bannername, function(x) {
  if (grepl("mobile", x)) {
    return("mobile")
  } else if (grepl("ipad", x)) {
      return("ipad")
  } else {
      return("desktop")
  }
})
dataSet$cs_nt <- sapply(dataSet$bannername, function(x) {
  if (grepl("cs", x)) {return("cs")} else {return("nt")}
})

2. Campaign Pageviews

This section presents all data and statistics on the campaign pages.

The following chunk loads and then re-structures the dataset a bit. NOTE. The 3all campaign tag was observed only once, on 2020/05/14, and is removed from analysis.

lF <- list.files('_analytics')
lF <- lF[grepl("^pageviewsAggregated", lF)]
dataSet <- lapply(paste0("_analytics/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - expand grid to account for missing observations per day
dS <- expand.grid(unique(dataSet$Tag), 
                  unique(dataSet$Page), 
                  unique(dataSet$date), 
                  stringsAsFactors = F)
colnames(dS) <- c('Tag', 'Page', 'date')
dS <- dS %>% 
  left_join(dataSet, 
            by = c("Tag", "Page", "date"))
dataSet <- dS; rm(dS)
dataSet$Pageviews[is.na(dataSet$Pageviews)] <- 0
# - banner codes
dataSet$var_ctrl <- sapply(dataSet$Tag, function(x) {
  if (grepl("var", x)) {return("var")} else {return("ctrl")}
})
dataSet$device <- sapply(dataSet$Tag, function(x) {
  if (grepl("mobile", x)) {
    return("mobile")
  } else if (grepl("ipad", x)) {
      return("ipad")
  } else {
      return("desktop")
  }
})
dataSet$cs_nt <- sapply(dataSet$Tag, function(x) {
  if (grepl("cs", x)) {return("cs")} else {return("nt")}
})
dataSet$Tag <- gsub("\\?campaign=WMDE_oceditors_spring_2020_", "", dataSet$Tag)
dataSet$Page <- gsub("de\\.wikipedia\\.org/wiki/Wikipedia:|de\\.m\\.wikipedia\\.org/wiki/Wikipedia:", "", dataSet$Page)
# - remove 3all tag:
dataSet <- filter(dataSet, Tag != "3all")

2.1 Pageviews Overview

Chart 2.1.1 Daily Pageviews, aggregated across the campaign banners.

pFrame <- dataSet %>% 
  select(date, Page, Pageviews) %>% 
  group_by(date, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

2.2 Pageviews: Devices

Reminder. From the Campaign Tracking Concept:

ipad/mobile does not represent the actual device type, but the display mode of the banner (small desktop screens may be reported as ipad)

Chart 2.2.1 Pageviews, by devices, aggregated across the campaign banners.

pFrame <- dataSet %>% 
  select(date, device, Page, Pageviews) %>% 
  group_by(date, device, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~device, nrow = 3, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

2.3 Pageviews: Var vs. Ctrl

Reminder. From the Campaign Tracking Concept:

ctrl is the banner that dynamically displays text depending on the target group

var is the banner that shows the same text for both target groups

Chart 1.3.1 Pageviews, by var/ctrl, aggregated across the campaign banners.

pFrame <- dataSet %>% 
  select(date, var_ctrl, Page, Pageviews) %>% 
  group_by(date, var_ctrl, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~var_ctrl, nrow = 3, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

2.4 Pageviews: nt vs. cs

Reminder. From the Campaign Tracking Concept:

cs/nt indicates, which target group the user belongs to (cs = community support, nt = new tasks)

Chart 1.4.1 Pageviews, by nt/cs, aggregated across the campaign banners.

pFrame <- dataSet %>% 
  select(date, cs_nt, Page, Pageviews) %>% 
  group_by(date, cs_nt, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~cs_nt, nrow = 3, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

2.5 Pageviews: Full Dataset

Table 2.5.1 Pageviews dataset

### --- Full Dataset (Table Report)
datatable(dataSet, 
          options = list(pageLength = 30)
          )

3. User Edits

This section presents all data and statistics on the user edits.

3.1 The Assignment of Users to Campaign Banners

The following chunk loads the dataset of user interactions with campaign banners and then re-structures the dataset a bit.

lF <- list.files("_data")
lF <- lF[grepl("^bannerInteractions", lF)]
dataSet <- lapply(paste0("_data/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet <- filter(dataSet, banneraction == "banner-clicked")
dataSet <- select(dataSet, bannername, userid, dt)
dataSet <- arrange(dataSet, userid, dt)
dataSet <- dataSet[!duplicated(dataSet$userid), ]
dataSet$dt <- NULL
dataSet$bannername <- gsub("WMDE_oceditors_spring_2020_", "", dataSet$bannername)
dataSet$bannername <- gsub("mobile_|ipad_", "", dataSet$bannername)
# - banner codes
dataSet$var_ctrl <- sapply(dataSet$bannername, function(x) {
  if (grepl("var", x)) {return("var")} else {return("ctrl")}
})
dataSet$cs_nt <- sapply(dataSet$bannername, function(x) {
  if (grepl("cs", x)) {return("cs")} else {return("nt")}
})
users <- select(dataSet, userid, cs_nt)
rm(dataSet)

Because the assingment of users to cs (community support) vs. nt (new tasks) was unique, we will focus on this variable in the campaign design in the analysis of user edits.

userEdits <- read.csv("_analytics/userEdits.csv", 
                      header = T,
                      row.names = 1,
                      check.names = F,
                      stringsAsFactors = F)
userEdits$revactor_timestamp <- as.character(userEdits$revactor_timestamp)
userEdits <- left_join(userEdits,
                       users,
                       by = c("actor_user" = "userid"))
userEdits$actor_id <- NULL
rm(users)
userEdits$date <- paste0(
  substr(userEdits$revactor_timestamp, 1, 4),
  "-",
  substr(userEdits$revactor_timestamp, 5, 6),
  "-",
  substr(userEdits$revactor_timestamp, 7, 8)
)

3.2 Daily user edits since the campaign onset

pFrame <- userEdits %>% 
  select(date) %>% 
  group_by(date) %>% 
  summarise(Edits = n())
pFrame <- arrange(pFrame, date)
campaignDays <- c("2020-05-14", "2020-05-15", "2020-05-16", "2020-05-17", "2020-05-18", "2020-05-19", 
                  "2020-05-20", "2020-05-21", "2020-05-22", "2020-05-23", "2020-05-24", "2020-05-25", 
                  "2020-05-26", "2020-05-27")
pFrame$campaign <- ifelse(pFrame$date %in% campaignDays, 
                          "Campaign", 
                          "After Campaign")
ggplot(pFrame, aes(x = date,
                   y = Edits,
                   group = campaign,
                   color = campaign,
                   fill = campaign,
                   label = Edits,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

3.3 Daily user edits since the campaign onset by cs/nt

pFrame <- userEdits %>% 
  select(date, cs_nt) %>% 
  group_by(date, cs_nt) %>% 
  summarise(Edits = n())
pFrame <- arrange(pFrame, date)
campaignDays <- c("2020-05-14", "2020-05-15", "2020-05-16", "2020-05-17", "2020-05-18", "2020-05-19", 
                  "2020-05-20", "2020-05-21", "2020-05-22", "2020-05-23", "2020-05-24", "2020-05-25", 
                  "2020-05-26", "2020-05-27")
pFrame$campaign <- ifelse(pFrame$date %in% campaignDays, 
                          "Campaign", 
                          "After Campaign")
ggplot(pFrame, aes(x = date,
                   y = Edits,
                   group = campaign,
                   color = campaign,
                   fill = campaign,
                   label = Edits,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  facet_wrap(~cs_nt, ncol = 1, scales = "free") + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")

3.4 Edit Classes

3.4.1 Edit Classes: all users

userClass <- userEdits %>% 
  select(actor_user) %>% 
  group_by(actor_user) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)

3.4.2 Edit Classes: cs (community support) users

userClass <- userEdits %>% 
  filter(cs_nt == "cs") %>% 
  select(actor_user) %>% 
  group_by(actor_user) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)

3.4.2 Edit Classes: nt (new tasks) users

userClass <- userEdits %>% 
  filter(cs_nt == "nt") %>% 
  select(actor_user) %>% 
  group_by(actor_user) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
---
title: '2020 WMDE Occasional Editors Banner Campaign'
author: "Goran S. Milovanovic, Data Scientist, WMDE"
date: "June 12, 2020"
output:
  html_notebook:
    code_folding: hide
    theme: simplex
    toc: yes
    toc_float: yes
    toc_depth: 5
  html_document:
    toc: yes
    toc_depth: 5
---

**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

The campaign was run from 2020/05/14 to 2020/05/27.

**CURRENT UPDATE:** Complete dataset as of 2020/05/27.

```{r, echo = F, warning = 'hide', message = F, results = 'hide'}
# !diagnostics off
### --- Setup
library(kableExtra)
library(rmarkdown)
library(knitr)
library(tidyverse)
library(data.table)
library(reshape2)
library(DT)
library(ggrepel)
library(scales)
library(RColorBrewer)
```

## 0. Data Acquisiton

**NOTE:** the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script `2020_OccasionalEditors_PRODUCTION.R` on stat1004.eqiad.wmnet, collecting the data as `.tsv` and `.csv` files, copying manually, and processing locally. A daily crontab job was run from `2020/05/14` to `2020/05/27` to collect the data for daily reporting. The data used in this report are aggregates of the daily datasets, sanitized and anonymized.   

### 0.1 Daily Update

```{r, echo = T, eval = F}
### --- WMDE 2020_OccasionalEditors_PRODUCTION.R
# - Campaign start: 2020/05/14
# - Campaign end: 2020/05/27
# - run from: stat1004
# - path: 

### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)

### --- dir structure
campaignPath <- paste0(getwd(), "/")
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
### --- campaign specifics
campaignName <- 'OccasionalEditors2020'

### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
  strsplit(as.character(cetDay), 
           split = " ", 
           fixed = T)[[1]][1]
) - 1

### ----------------------------------------------------------
### --- Banner Impressions
### ----------------------------------------------------------

# - function: wmde_collect_banner_impressions
wmde_collect_banner_impressions <- function(uri_host, 
                                            uri_path, 
                                            uri_query, 
                                            cetDay,
                                            queryFile,
                                            fileName,
                                            dataDir) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  # - WHERE condition: create uri_path_condition
  if (length(uri_path) > 1) {
    uri_path_condition <- paste0("(",
                                 paste(
                                   paste0("uri_path = '", uri_path, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_path_condition = paste0("uri_path = '", uri_path, "'")
  }
  
  # - WHERE condition: create uri_host_condition
  if (length(uri_host) > 1) {
    uri_host_condition <- paste0("(",
                                 paste(
                                   paste0("uri_host = '", uri_host, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_host_condition = paste0("uri_host = '", uri_host, "'")
  }
  
  # - WHERE condition: create uri_query_condition
  if (length(uri_query) > 1) {
    uri_query_condition <- paste0("(",
                                  paste(
                                    paste0("uri_query LIKE '%", uri_query, "%'"),
                                    collapse = " OR ", sep = " "),
                                  ")"
    )
  } else {
    uri_query_condition = paste0("uri_query LIKE '%", uri_query, "%'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;
    SELECT uri_query FROM webrequest
    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    uri_query_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
    )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  hiveArgs <- 'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata /usr/local/bin/beeline --silent --incremental=true --verbose=false -f'
  hiveInput <- paste0(queryFile, ' > ', fileName)
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  return(
    system(command = hiveCommand, wait = TRUE)
  )
}

# - set params to wmde_collect_banner_impressions
# - for the OccasionalEditors2020
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path  <- '/beacon/impression'
uri_query <- c('WMDE_oceditors_spring_2020_')
queryFile <- 'OccasionalEditors2020_BannerImpressions.hql'
fileName <- paste0(dataDir, "bannerImpressions_", cetDay, ".tsv")

# - collect Banner Impression data
wmde_collect_banner_impressions(uri_host,
                                uri_path,
                                uri_query,
                                cetDay,
                                queryFile,
                                fileName,
                                dataDir)

# - function: wmde_process_banner_impressions
wmde_process_banner_impressions <- function(fileName,
                                            dataDir, 
                                            cetDay, 
                                            campaignName, 
                                            uri_query) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  
  # - load
  bannerData <- tryCatch({
    as.data.frame(fread(fileName))
    },
    error = function(condition) {
      return(FALSE)
  })
  # - process
  if (class(bannerData) == 'logical') {
    return(FALSE) 
  } else {
    # - clean
    bannerData <- dplyr::filter(bannerData,
                                uri_query != "")
    # - split
    bannerData <- tidyr::separate(bannerData, 
                                  col = uri_query, 
                                  into = c('country', 
                                           'region',
                                           'anonymous',
                                           'project',
                                           'db',
                                           'uselang', 
                                           'device', 
                                           'debug', 
                                           'randomcampaign',
                                           'randombanner',
                                           'recordImpressionSampleRate',
                                           'impressionEventSampleRate',
                                           'campaignStatuses',
                                           'status',
                                           'statusCode',
                                           'campaign',
                                           'campaignCategory',
                                           'campaignCategoryUsesLegacy',
                                           'bucket',
                                           'banner',
                                           'bannerCategory', 
                                           'result'),
                                           sep = "&") %>% 
      dplyr::select(banner, device, recordImpressionSampleRate, result)
    # - filter for uri_query
    bannerData <- bannerData[grepl(uri_query, bannerData$banner), ]
    # - clean relevant fields
    # - banner:
    bannerData$banner <- gsub("^banner=", "", bannerData$banner)
    # - recordImpressionSampleRate:
    bannerData$recordImpressionSampleRate <- as.numeric(
      gsub("^recordImpressionSampleRate=", "", bannerData$recordImpressionSampleRate)
    )
    # - device:
    bannerData$device <- gsub("^device=", "", bannerData$device)
    # - result:
    bannerData$result <- gsub("^result=", "", bannerData$result)
    # - filter for result=show
    bannerData <- dplyr::filter(bannerData,
                                result == "show")
    # - correction for recordImpressionSampleRate
    bannerData$recordImpressionSampleRate <- 
      1/bannerData$recordImpressionSampleRate
    
    # - aggregate:
    bannerData <- bannerData %>% 
      dplyr::select(banner, device, recordImpressionSampleRate) %>% 
      dplyr::group_by(banner, device) %>% 
      dplyr::summarise(impressions = sum(recordImpressionSampleRate))
    
    # - add cetDay, me
    bannerData$date <- cetDay
    bannerData$campaign <- campaignName
    
    # - store:
    write.csv(bannerData, 
              paste0(analyticsDir, "bannerImpressions",
                     cetDay,
                     ".csv"
              )
    )
    
    # - return
    return(TRUE)
  }
}

# - wrangle Banner Impression data
campaignName <- "OccasionalEditors2020"
uri_query <- c('WMDE_oceditors_spring_2020_')
bannerProcess <- wmde_process_banner_impressions(fileName = fileName,
                                                 dataDir = dataDir,
                                                 cetDay = cetDay,
                                                 campaignName = campaignName,
                                                 uri_query = uri_query)



### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------

# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  # - WHERE condition: create uri_host_condition
  if (length(uri_host) > 1) {
    uri_host_condition <- paste0("(",
                                  paste(
                                    paste0("uri_host = '", uri_host, "'"),
                                    collapse = " OR ", sep = " "),
                                  ")"
    )
  } else {
    uri_host_condition = paste0("uri_host = '", uri_host, "'")
  }
  
  # - WHERE condition: create uri_path_condition
  if (length(uri_path) > 1) {
    uri_path_condition <- paste0("(",
                                 paste(
                                   paste0("uri_path = '", uri_path, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_path_condition = paste0("uri_path = '", uri_path, "'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "USE wmf;
    SELECT uri_host, uri_path, uri_query, referer FROM webrequest
    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
    )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
}

# - set params to wmde_collect_pageviews
# - for the 2020_EmailCampaignWikipediaChallenge
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path  <- c(
  '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
  '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur',
  '/wiki/Wikipedia:Mentorenprogramm')
# uri_query <- paste0('WMDE_2020_challenge_', 1:30)
queryFile <- 'OccasionalEditors2020_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")

# - collect Pageviews data
wmde_collect_pageviews(uri_host,
                       uri_path,
                       cetDay,
                       queryFile,
                       fileName,
                       dataDir)

### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
                                   dataDir, 
                                   uri_query_filter,
                                   cetDay = cetDay,
                                   campaignName = campaignName) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  
  # - load
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("uri_host", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 2):(length(pageviewsData) - 2)]
  pageviewsData <- data.frame(dat = pageviewsData, 
                              stringsAsFactors = F)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  
  # - apply uri_query_filter
  # - NOTE: looking in both: uri_query, referer 
  # - NOTE: hack for the 2020_OccasionalEditors Campaign
  # - include pageviews for 
  # - '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur',
  # - '/wiki/Wikipedia:Mentorenprogramm'
  # - on dewiki where grepl(w_uri_query, referer)
  w_uri_query_referer <- which(grepl(uri_query_filter, pageviewsData$referer) & 
                                 !grepl(uri_query_filter, pageviewsData$uri_query))
  if (length(w_uri_query_referer) > 0) {
    refererTags <- strsplit(pageviewsData$referer[w_uri_query_referer], 
                            split = "?",
                            fixed = T)
    refererTags <- sapply(refererTags, function(x) {x[2]})
    refererTags <- paste0("?", refererTags)
    pageviewsData$uri_query[w_uri_query_referer] <- refererTags 
  }
  w_uri_query <- which(grepl(uri_query_filter, pageviewsData$uri_query))
  
  if (length(w_uri_query) > 0) {
    
    # - filter for w_uri_query
    pageviewsData <- pageviewsData[w_uri_query, ] 
    
    # - aggregate:
    pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
    pageviewsData$uri_host <- NULL
    pageviewsData$referer <- NULL
    pageviewsData <- pageviewsData %>% 
      dplyr::select(uri_query, uri_path) %>% 
      dplyr::group_by(uri_query, uri_path) %>% 
      dplyr::summarise(pageviews = n())
    colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
    
    # - add cetDay, campaignName
    pageviewsData$date <- cetDay
    pageviewsData$campaign <- campaignName
    
    # - store:
    write.csv(pageviewsData, 
              paste0(analyticsDir, 
                     "pageviewsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    
  }
  
}

# - set params to wmde_process_pageviews
# - for the WMDE 2020_EmailCampaignWikipediaChallenge
uri_query_filter <- 'WMDE_oceditors_spring_2020_'

# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
                       dataDir = dataDir,
                       uri_query_filter = uri_query_filter, 
                       cetDay = cetDay,
                       campaignName = campaignName)

### ----------------------------------------------------------
### --- Banner Actions
### --- via event.WMDEBannerActions
### ----------------------------------------------------------

### ___ NOTE:
# - Suffix explanation:
# - ctrl is the banner that dynamically displays text depending on the target group
# - var is the banner that shows the same text for both target groups
# - ipad/mobile does not represent the actual device type, but the display mode of the banner (small desktop screens may be reported as ipad)
# - cs/nt indicates, which target group the user belongs to
# - Kai Nissen in https://phabricator.wikimedia.org/T251535#6132468

# - select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID 
# - from event.wmdebannerinteractions where year=2020 and month=5 and (day=11 or day=12 or day=13);

# - function: wmde_collect_pageviews
wmde_banner_actions <- function(uri_query_filter,
                                cetDay,
                                queryFile,
                                fileName,
                                analyticsDir,
                                campaignName) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  

  # - WHERE condition: create eventBannerName_condition
  if (length(uri_query) > 1) {
    eventBannerName_condition <- paste0("(",
                                        paste(
                                          paste0("event.bannerName LIKE '%", uri_query_filter, "%'"),
                                          collapse = " OR ", sep = " "),
                                        ")"
    )
  } else {
    eventBannerName_condition = paste0("event.bannerName LIKE '%", uri_query_filter, "%'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID from event.wmdebannerinteractions 
    WHERE (",
    eventBannerName_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
    )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
  
  # - Wrangle Banner Interactions
  # - load
  bannerData <- tryCatch({
    as.data.frame(fread(paste0(dataDir, fileName)))
  },
  error = function(condition) {
    return(FALSE)
  })
  
  # - process
  if (class(bannerData) == 'logical') {
    return(FALSE) 
  } else { 
    # - bannerSeen
    bannerSeen <- bannerData %>% 
      dplyr::select(bannername, userid)
    bannerSeen <- bannerSeen[!duplicated(bannerSeen), ]
    bannerSeen <- bannerSeen %>% 
      dplyr::select(bannername) %>% 
      dplyr::group_by(bannername) %>% 
      dplyr::summarise(seen_by = n())
    # - bannerClosed
    bannerClosed <- bannerData %>% 
      dplyr::filter(banneraction == "banner-closed") %>% 
      dplyr::select(bannername, bannerimpressions) %>% 
      dplyr::group_by(bannername) %>%
      dplyr::summarise(closed_by = n(), mean_close_imp = round(mean(bannerimpressions), 2))
    # - bannerClicked
    bannerClicked <- bannerData %>% 
      dplyr::filter(banneraction == "banner-clicked") %>% 
      dplyr::select(bannername, bannerimpressions) %>% 
      dplyr::group_by(bannername) %>%
      dplyr::summarise(clicked_by = n(), mean_click_imp = round(mean(bannerimpressions), 2))
    # - whoClicked
    whoClicked <- bannerData %>% 
      dplyr::filter(banneraction == "banner-clicked")
    whoClicked <- data.frame(userid = unique(whoClicked$userid))
    # - store:
    write.csv(whoClicked, 
              paste0(analyticsDir, 
                     "whoClicked_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    # - join
    bannerData <- bannerSeen %>% 
      dplyr::left_join(bannerClosed, 'bannername') %>% 
      dplyr::left_join(bannerClicked, 'bannername')
    bannerData$close_rate <- round(bannerData$closed_by/bannerData$seen_by, 2)
    bannerData$click_rate <- round(bannerData$clicked_by/bannerData$seen_by, 2)
    # - date, campaign
    bannerData$day <- cetDay
    bannerData$campaign <- campaignName
    
    # - store:
    write.csv(bannerData, 
              paste0(analyticsDir, 
                     "bannerInteractionsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
  }

}

# - set params for wmde_banner_actions()
queryFile <- paste0(campaignName, "_bannerInteractions.hql")
fileName <- paste0("bannerInteractions_", cetDay, ".tsv")
uri_query_filter <- 'WMDE_oceditors_spring_2020_'
banner_status <- wmde_banner_actions(uri_query_filter = uri_query_filter,
                                     cetDay = cetDay,
                                     queryFile = queryFile,
                                     fileName = fileName,
                                     analyticsDir = analyticsDir, 
                                     campaignName = campaignName)
```

## 1. Campaign Banners

This section presents all data and statistics on the campaign banners.
The following chunk loads and then re-structures the dataset a bit:

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
lF <- list.files('_analytics')
lF <- lF[grepl("^bannerInteractions", lF)]
dataSet <- lapply(paste0("_analytics/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - NAs to zero
dataSet[which(is.na(dataSet), arr.ind = T)] <- 0
# - banner codes
dataSet$var_ctrl <- sapply(dataSet$bannername, function(x) {
  if (grepl("var", x)) {return("var")} else {return("ctrl")}
})
dataSet$device <- sapply(dataSet$bannername, function(x) {
  if (grepl("mobile", x)) {
    return("mobile")
  } else if (grepl("ipad", x)) {
      return("ipad")
  } else {
      return("desktop")
  }
})
dataSet$cs_nt <- sapply(dataSet$bannername, function(x) {
  if (grepl("cs", x)) {return("cs")} else {return("nt")}
})
```

### 1.1 Banner Actions Overview

**Chart 1.1.1** Daily Banner Impressions, Clicks, and Closures, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 5}
pFrame <- dataSet %>% 
  select(day, seen_by, closed_by, clicked_by) %>% 
  group_by(day) %>% 
  summarise(seen_by = sum(seen_by), 
            closed_by = sum(closed_by), 
            clicked_by = sum(clicked_by))
pFrame$close_rate = round(pFrame$closed_by/pFrame$seen_by, 2)
pFrame$click_rate = round(pFrame$clicked_by/pFrame$seen_by, 2)
pFrame <- arrange(pFrame, day)
# - Visualize w. {ggplot2}: seen_by, clicked_by, closed_by, daily:
pF <- pFrame %>% 
  select(day, seen_by, closed_by, clicked_by) %>% 
  pivot_longer(c('seen_by', 'closed_by', 'clicked_by'), 
               names_to = "Action", 
               values_to = "Value")
ggplot(pF, aes(x = day,
               y = Value,
               group = Action,
               color = Action,
               fill = Action,
               label = Value,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

**Chart 1.1.2** Daily Banner Click and Close rates, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 5}
# - Visualize w. {ggplot2}: close_rate, click_rate, daily:
pF <- pFrame %>% 
  select(day, click_rate, close_rate) %>% 
  pivot_longer(c('click_rate', 'close_rate'), 
               names_to = "Action", 
               values_to = "Value")
ggplot(pF, aes(x = day,
               y = Value,
               group = Action,
               color = Action,
               fill = Action,
               label = Value,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) + 
  ylim(c(0, 1)) + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

**Chart 1.1.3** Daily: How Many Impressions to Action (Click, Close), aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 5}
pFrame <- dataSet %>% 
  select(day, mean_click_imp, mean_close_imp) %>% 
  group_by(day) %>% 
  summarise(mean_click_imp = round(mean(mean_click_imp), 2), 
            mean_close_imp = round(mean(mean_close_imp), 2)) %>% 
  arrange(day)
colnames(pFrame) <- c('day', 'click', 'close')
# - Visualize w. {ggplot2}: click, close, daily:
pF <- pFrame %>% 
  pivot_longer(c('click', 'close'), 
               names_to = "Action", 
               values_to = "Average Impressions")
ggplot(pF, aes(x = day,
               y = `Average Impressions`,
               group = Action,
               color = Action,
               fill = Action,
               label = `Average Impressions`,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Avg. Impressions before Action") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

### 1.2 Banner Actions: Devices

**Reminder.** From the [Campaign Tracking Concept](https://docs.google.com/document/d/17k7gqDhrxCDpSTtkgY__4BO0Qy5GK-ibDd1cotaSZ-E/edit#):

> ipad/mobile does not represent the actual device type, but the display mode of the banner (small desktop screens may be reported as ipad)


**Chart 1.2.1** Daily Banner Click and Close rates, by **devices**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 10}
pFrame <- dataSet %>% 
  select(day, seen_by, closed_by, clicked_by, device) %>% 
  group_by(day, device) %>% 
  summarise(seen_by = sum(seen_by), 
            closed_by = sum(closed_by), 
            clicked_by = sum(clicked_by))
pFrame$close_rate = round(pFrame$closed_by/pFrame$seen_by, 2)
pFrame$click_rate = round(pFrame$clicked_by/pFrame$seen_by, 2)
pFrame <- arrange(pFrame, day)
# - Visualize w. {ggplot2}: seen_by, clicked_by, closed_by, daily:
pF <- pFrame %>% 
  select(day, device, seen_by, closed_by, clicked_by) %>% 
  pivot_longer(c('seen_by', 'closed_by', 'clicked_by'), 
               names_to = "Action", 
               values_to = "Value")
ggplot(pF, aes(x = day,
               y = Value,
               group = Action,
               color = Action,
               fill = Action,
               label = Value,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~device, nrow = 3, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

**Chart 1.2.2** Daily: How Mean Impressions to Action (Click, Close), by **devices**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 10}
pFrame <- dataSet %>% 
  select(day, device, mean_click_imp, mean_close_imp) %>% 
  group_by(day, device) %>% 
  summarise(mean_click_imp = round(mean(mean_click_imp), 2), 
            mean_close_imp = round(mean(mean_close_imp), 2)) %>% 
  arrange(day)
colnames(pFrame) <- c('day', 'device', 'click', 'close')
# - Visualize w. {ggplot2}: click, close, daily:
pF <- pFrame %>% 
  pivot_longer(c('click', 'close'), 
               names_to = "Action", 
               values_to = "Average Impressions")
ggplot(pF, aes(x = day,
               y = `Average Impressions`,
               group = Action,
               color = Action,
               fill = Action,
               label = `Average Impressions`,
                    )) + 
  facet_wrap(~device, nrow = 3, scales = "free") +
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Avg. Impressions before Action") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

### 1.3 Banner Actions: Var vs. Ctrl

**Reminder.** From the [Campaign Tracking Concept](https://docs.google.com/document/d/17k7gqDhrxCDpSTtkgY__4BO0Qy5GK-ibDd1cotaSZ-E/edit#):

> ctrl is the banner that dynamically displays text depending on the target group

> var is the banner that shows the same text for both target groups

**Chart 1.3.1** Daily Banner Click and Close rates, by **var/ctrl**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- dataSet %>% 
  select(day, seen_by, closed_by, clicked_by, var_ctrl) %>% 
  group_by(day, var_ctrl) %>% 
  summarise(seen_by = sum(seen_by), 
            closed_by = sum(closed_by), 
            clicked_by = sum(clicked_by))
pFrame$close_rate = round(pFrame$closed_by/pFrame$seen_by, 2)
pFrame$click_rate = round(pFrame$clicked_by/pFrame$seen_by, 2)
pFrame <- arrange(pFrame, day)
# - Visualize w. {ggplot2}: seen_by, clicked_by, closed_by, daily:
pF <- pFrame %>% 
  select(day, var_ctrl, seen_by, closed_by, clicked_by) %>% 
  pivot_longer(c('seen_by', 'closed_by', 'clicked_by'), 
               names_to = "Action", 
               values_to = "Value")
ggplot(pF, aes(x = day,
               y = Value,
               group = Action,
               color = Action,
               fill = Action,
               label = Value,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~var_ctrl, nrow = 2, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

**Chart 1.3.2** Daily: How Mean Impressions to Action (Click, Close), by **var/ctrl**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- dataSet %>% 
  select(day, var_ctrl, mean_click_imp, mean_close_imp) %>% 
  group_by(day, var_ctrl) %>% 
  summarise(mean_click_imp = round(mean(mean_click_imp), 2), 
            mean_close_imp = round(mean(mean_close_imp), 2)) %>% 
  arrange(day)
colnames(pFrame) <- c('day', 'var_ctrl', 'click', 'close')
# - Visualize w. {ggplot2}: click, close, daily:
pF <- pFrame %>% 
  pivot_longer(c('click', 'close'), 
               names_to = "Action", 
               values_to = "Average Impressions")
ggplot(pF, aes(x = day,
               y = `Average Impressions`,
               group = Action,
               color = Action,
               fill = Action,
               label = `Average Impressions`,
                    )) + 
  facet_wrap(~var_ctrl, nrow = 2, scales = "free") +
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Avg. Impressions before Action") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

### 1.4 Banner Actions: nt vs. cs

**Reminder.** From the [Campaign Tracking Concept](https://docs.google.com/document/d/17k7gqDhrxCDpSTtkgY__4BO0Qy5GK-ibDd1cotaSZ-E/edit#):

> cs/nt indicates, which target group the user belongs to (cs = community support, nt = new tasks)

**Chart 1.4.1** Daily Banner Click and Close rates, by **nt/cs**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- dataSet %>% 
  select(day, seen_by, closed_by, clicked_by, cs_nt) %>% 
  group_by(day, cs_nt) %>% 
  summarise(seen_by = sum(seen_by), 
            closed_by = sum(closed_by), 
            clicked_by = sum(clicked_by))
pFrame$close_rate = round(pFrame$closed_by/pFrame$seen_by, 2)
pFrame$click_rate = round(pFrame$clicked_by/pFrame$seen_by, 2)
pFrame <- arrange(pFrame, day)
# - Visualize w. {ggplot2}: seen_by, clicked_by, closed_by, daily:
pF <- pFrame %>% 
  select(day, cs_nt, seen_by, closed_by, clicked_by) %>% 
  pivot_longer(c('seen_by', 'closed_by', 'clicked_by'), 
               names_to = "Action", 
               values_to = "Value")
ggplot(pF, aes(x = day,
               y = Value,
               group = Action,
               color = Action,
               fill = Action,
               label = Value,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~cs_nt, nrow = 2, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

**Chart 1.4.2** Daily: How Mean Impressions to Action (Click, Close), by **nt/cs**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- dataSet %>% 
  select(day, cs_nt, mean_click_imp, mean_close_imp) %>% 
  group_by(day, cs_nt) %>% 
  summarise(mean_click_imp = round(mean(mean_click_imp), 2), 
            mean_close_imp = round(mean(mean_close_imp), 2)) %>% 
  arrange(day)
colnames(pFrame) <- c('day', 'cs_nt', 'click', 'close')
# - Visualize w. {ggplot2}: click, close, daily:
pF <- pFrame %>% 
  pivot_longer(c('click', 'close'), 
               names_to = "Action", 
               values_to = "Average Impressions")
ggplot(pF, aes(x = day,
               y = `Average Impressions`,
               group = Action,
               color = Action,
               fill = Action,
               label = `Average Impressions`,
                    )) + 
  facet_wrap(~cs_nt, nrow = 2, scales = "free") +
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Avg. Impressions before Action") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

### 1.5 Banner Actions: Full Dataset

**Table 1.5.1** Daily Banner Impressions: seen by, closed by, clicked by

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
dS <- dataSet
dS$bannername <- gsub("WMDE_oceditors_spring_2020_", "", dS$bannername)
dS <- dS[,
         c('bannername', 'device', 'var_ctrl', 'cs_nt',
           'seen_by', 'closed_by', 'clicked_by')]
datatable(dS, 
          options = list(pageLength = 30)
          )
```

**Table 1.5.2** Daily Banner Actions: mean impressions before action

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
dS <- dataSet
dS$bannername <- gsub("WMDE_oceditors_spring_2020_", "", dS$bannername)
dS <- dS[,
         c('bannername', 'device', 'var_ctrl', 'cs_nt',
           'mean_click_imp', 'mean_close_imp')]
datatable(dS, 
          options = list(pageLength = 30)
          )
```

## 2. Campaign Pageviews

This section presents all data and statistics on the campaign pages.

The following chunk loads and then re-structures the dataset a bit.
**NOTE.** The `3all` campaign tag was observed only once, on `2020/05/14`, and is removed from analysis.

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
lF <- list.files('_analytics')
lF <- lF[grepl("^pageviewsAggregated", lF)]
dataSet <- lapply(paste0("_analytics/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - expand grid to account for missing observations per day
dS <- expand.grid(unique(dataSet$Tag), 
                  unique(dataSet$Page), 
                  unique(dataSet$date), 
                  stringsAsFactors = F)
colnames(dS) <- c('Tag', 'Page', 'date')
dS <- dS %>% 
  left_join(dataSet, 
            by = c("Tag", "Page", "date"))
dataSet <- dS; rm(dS)
dataSet$Pageviews[is.na(dataSet$Pageviews)] <- 0
# - banner codes
dataSet$var_ctrl <- sapply(dataSet$Tag, function(x) {
  if (grepl("var", x)) {return("var")} else {return("ctrl")}
})
dataSet$device <- sapply(dataSet$Tag, function(x) {
  if (grepl("mobile", x)) {
    return("mobile")
  } else if (grepl("ipad", x)) {
      return("ipad")
  } else {
      return("desktop")
  }
})
dataSet$cs_nt <- sapply(dataSet$Tag, function(x) {
  if (grepl("cs", x)) {return("cs")} else {return("nt")}
})
dataSet$Tag <- gsub("\\?campaign=WMDE_oceditors_spring_2020_", "", dataSet$Tag)
dataSet$Page <- gsub("de\\.wikipedia\\.org/wiki/Wikipedia:|de\\.m\\.wikipedia\\.org/wiki/Wikipedia:", "", dataSet$Page)
# - remove 3all tag:
dataSet <- filter(dataSet, Tag != "3all")
```

### 2.1 Pageviews Overview

**Chart 2.1.1** Daily Pageviews, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 5}
pFrame <- dataSet %>% 
  select(date, Page, Pageviews) %>% 
  group_by(date, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") + 
  theme_minimal() + 
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 2.2 Pageviews: Devices

**Reminder.** From the [Campaign Tracking Concept](https://docs.google.com/document/d/17k7gqDhrxCDpSTtkgY__4BO0Qy5GK-ibDd1cotaSZ-E/edit#):

> ipad/mobile does not represent the actual device type, but the display mode of the banner (small desktop screens may be reported as ipad)


**Chart 2.2.1** Pageviews, by **devices**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 10}
pFrame <- dataSet %>% 
  select(date, device, Page, Pageviews) %>% 
  group_by(date, device, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~device, nrow = 3, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```


### 2.3 Pageviews: Var vs. Ctrl

**Reminder.** From the [Campaign Tracking Concept](https://docs.google.com/document/d/17k7gqDhrxCDpSTtkgY__4BO0Qy5GK-ibDd1cotaSZ-E/edit#):

> ctrl is the banner that dynamically displays text depending on the target group

> var is the banner that shows the same text for both target groups

**Chart 1.3.1** Pageviews, by **var/ctrl**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- dataSet %>% 
  select(date, var_ctrl, Page, Pageviews) %>% 
  group_by(date, var_ctrl, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~var_ctrl, nrow = 3, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```


### 2.4 Pageviews: nt vs. cs

**Reminder.** From the [Campaign Tracking Concept](https://docs.google.com/document/d/17k7gqDhrxCDpSTtkgY__4BO0Qy5GK-ibDd1cotaSZ-E/edit#):

> cs/nt indicates, which target group the user belongs to (cs = community support, nt = new tasks)

**Chart 1.4.1** Pageviews, by **nt/cs**, aggregated across the campaign banners.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- dataSet %>% 
  select(date, cs_nt, Page, Pageviews) %>% 
  group_by(date, cs_nt, Page) %>% 
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  facet_wrap(~cs_nt, nrow = 3, scales = "free") + 
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Actions") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 2.5 Pageviews: Full Dataset

**Table 2.5.1** Pageviews dataset

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
datatable(dataSet, 
          options = list(pageLength = 30)
          )
```

## 3. User Edits

This section presents all data and statistics on the user edits.

### 3.1 The Assignment of Users to Campaign Banners

The following chunk loads the dataset of user interactions with campaign banners and then re-structures the dataset a bit.

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
lF <- list.files("_data")
lF <- lF[grepl("^bannerInteractions", lF)]
dataSet <- lapply(paste0("_data/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet <- filter(dataSet, banneraction == "banner-clicked")
dataSet <- select(dataSet, bannername, userid, dt)
dataSet <- arrange(dataSet, userid, dt)
dataSet <- dataSet[!duplicated(dataSet$userid), ]
dataSet$dt <- NULL
dataSet$bannername <- gsub("WMDE_oceditors_spring_2020_", "", dataSet$bannername)
dataSet$bannername <- gsub("mobile_|ipad_", "", dataSet$bannername)
# - banner codes
dataSet$var_ctrl <- sapply(dataSet$bannername, function(x) {
  if (grepl("var", x)) {return("var")} else {return("ctrl")}
})
dataSet$cs_nt <- sapply(dataSet$bannername, function(x) {
  if (grepl("cs", x)) {return("cs")} else {return("nt")}
})
users <- select(dataSet, userid, cs_nt)
rm(dataSet)
```

Because the assingment of users to **cs (community support)** vs. **nt (new tasks)** was unique, we will focus on this variable in the campaign design in the analysis of user edits.

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
userEdits <- read.csv("_analytics/userEdits.csv", 
                      header = T,
                      row.names = 1,
                      check.names = F,
                      stringsAsFactors = F)
userEdits$revactor_timestamp <- as.character(userEdits$revactor_timestamp)
userEdits <- left_join(userEdits,
                       users,
                       by = c("actor_user" = "userid"))
userEdits$actor_id <- NULL
rm(users)
userEdits$date <- paste0(
  substr(userEdits$revactor_timestamp, 1, 4),
  "-",
  substr(userEdits$revactor_timestamp, 5, 6),
  "-",
  substr(userEdits$revactor_timestamp, 7, 8)
)
```

### 3.2 Daily user edits since the campaign onset

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- userEdits %>% 
  select(date) %>% 
  group_by(date) %>% 
  summarise(Edits = n())
pFrame <- arrange(pFrame, date)
campaignDays <- c("2020-05-14", "2020-05-15", "2020-05-16", "2020-05-17", "2020-05-18", "2020-05-19", 
                  "2020-05-20", "2020-05-21", "2020-05-22", "2020-05-23", "2020-05-24", "2020-05-25", 
                  "2020-05-26", "2020-05-27")
pFrame$campaign <- ifelse(pFrame$date %in% campaignDays, 
                          "Campaign", 
                          "After Campaign")
ggplot(pFrame, aes(x = date,
                   y = Edits,
                   group = campaign,
                   color = campaign,
                   fill = campaign,
                   label = Edits,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 3.3 Daily user edits since the campaign onset by cs/nt

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- userEdits %>% 
  select(date, cs_nt) %>% 
  group_by(date, cs_nt) %>% 
  summarise(Edits = n())
pFrame <- arrange(pFrame, date)
campaignDays <- c("2020-05-14", "2020-05-15", "2020-05-16", "2020-05-17", "2020-05-18", "2020-05-19", 
                  "2020-05-20", "2020-05-21", "2020-05-22", "2020-05-23", "2020-05-24", "2020-05-25", 
                  "2020-05-26", "2020-05-27")
pFrame$campaign <- ifelse(pFrame$date %in% campaignDays, 
                          "Campaign", 
                          "After Campaign")
ggplot(pFrame, aes(x = date,
                   y = Edits,
                   group = campaign,
                   color = campaign,
                   fill = campaign,
                   label = Edits,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  facet_wrap(~cs_nt, ncol = 1, scales = "free") + 
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right")
```

### 3.4 Edit Classes

#### 3.4.1 Edit Classes: all users

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
userClass <- userEdits %>% 
  select(actor_user) %>% 
  group_by(actor_user) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```

#### 3.4.2 Edit Classes: cs (community support) users

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
userClass <- userEdits %>% 
  filter(cs_nt == "cs") %>% 
  select(actor_user) %>% 
  group_by(actor_user) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```

#### 3.4.2 Edit Classes: nt (new tasks) users

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
userClass <- userEdits %>% 
  filter(cs_nt == "nt") %>% 
  select(actor_user) %>% 
  group_by(actor_user) %>% 
  summarise(edits = n())
editBoundaries <- list(
  c(0, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return("> 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})
editClass <- as.data.frame(table(userClass$editClass), 
                           stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```
