Feedback should be send to goran.milovanovic_ext@wikimedia.de.

The campaign is run from 2021/10/18 to 2021/10/31.

CURRENT UPDATE: Complete dataset as of 2021/10/31.

0. Data Acquisiton

NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running an R script on stat1007.eqiad.wmnet, collecting the data as .tsv and .csv files, copying manually, and processing locally.

0.1 Daily Update

### ----------------------------------------------------------------
### --- WMDE 2021 Ocassional Editors
### --- https://phabricator.wikimedia.org/T291635
### ----------------------------------------------------------------

### ----------------------------------------------------------------
### --- Campaign Description and Parameters
### ----------------------------------------------------------------

# - Landing Pages:
# - Landing Page 1: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement
# - Landing Page 2a: https://tools.wmflabs.org/mitmachen/
# - Landing Page 2b: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia
# - Landing Page 2c: https://de.wikipedia.org/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote
# - Campaign tags:
# - WMDE_oceditors_fall_2021

# - Start of the banner campaign: October 12th
# - End of the banner campaign: October 26 th
# - Tracking test: October 5 - 7
# - Preliminary report for tracking: beginning of November
# - Track editing behavior four weeks after end of campaign: November 23rd
# - final report for tracking: beginning of December

### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)

### --- campaign specifics
campaignName <- 'OccasionalEditors_2021'

### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/2021_OccasionalEditors/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
publicDir <- '/srv/published/datasets/wmde-analytics-engineering/NewEditors/campaigns/2021_OccasionalEditors/'

### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
  strsplit(as.character(cetDay), 
           split = " ", 
           fixed = T)[[1]][1]
) - 1

### ----------------------------------------------------------
### --- Banner Interactions:
### --- via event.WMDEBannerInteractions
### --- https://meta.wikimedia.org/wiki/Schema:WMDEBannerInteractions
### ----------------------------------------------------------

# - select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID 
# - from event.wmdebannerinteractions where year=2020 and month=5 and (day=11 or day=12 or day=13);

# - function: wmde_banner_actions
wmde_banner_actions <- function(uri_query_filter,
                                cetDay,
                                queryFile,
                                fileName,
                                analyticsDir,
                                campaignName) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  
  # - WHERE condition: create eventBannerName_condition
  if (length(uri_query_filter) > 1) {
    eventBannerName_condition <- paste0("(",
                                        paste(
                                          paste0("event.bannerName LIKE '%", uri_query_filter, "%'"),
                                          collapse = " OR ", sep = " "),
                                        ")"
    )
  } else {
    eventBannerName_condition = paste0("event.bannerName LIKE '%", uri_query_filter, "%'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID from event.wmdebannerinteractions 
    WHERE (",
    eventBannerName_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  
  # - write hql
  write(hiveQuery, paste0(dataDir, queryFile))
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
  
  # - Wrangle Banner Interactions
  # - load
  bannerData <- tryCatch({
    as.data.frame(fread(paste0(dataDir, fileName)))
  },
  error = function(condition) {
    return(FALSE)
  })
  
  # - process
  if (class(bannerData) == 'logical') {
    
    return(FALSE) 
    
  } else { 
    
    # - whoClicked
    whoClicked <- bannerData %>% 
      dplyr::filter(banneraction == "banner-clicked")
    whoClicked <- data.frame(userid = unique(whoClicked$userid))
    
    # - store:
    write.csv(whoClicked, 
              paste0(analyticsDir, 
                     "whoClicked_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    
    # - process bannerData
    bannerData <- t(table(bannerData$banneraction,
                          bannerData$bannername))
    bannerData <- as.data.frame(bannerData)
    colnames(bannerData) <- c('banner', 'action', 'count')
    bannerData$day <- cetDay
    bannerData$campaign <- campaignName
    
    # - store:
    write.csv(bannerData, 
              paste0(analyticsDir, 
                     "bannerInteractionsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              ))
  }
  
}

# - set params for wmde_banner_actions()
queryFile <- paste0(campaignName, "_bannerInteractions.hql")
fileName <- paste0("bannerInteractions_", cetDay, ".tsv")
uri_query_filter <- 'WMDE_oceditors_fall_2021'
banner_status <- wmde_banner_actions(uri_query_filter = uri_query_filter,
                                     cetDay = cetDay,
                                     queryFile = queryFile,
                                     fileName = fileName,
                                     analyticsDir = analyticsDir, 
                                     campaignName = campaignName)

### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------

# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  # - WHERE condition: create uri_host_condition
  if (length(uri_host) > 1) {
    uri_host_condition <- paste0("(",
                                 paste(
                                   paste0("uri_host = '", uri_host, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_host_condition = paste0("uri_host = '", uri_host, "'")
  }
  
  # - WHERE condition: create uri_path_condition
  if (length(uri_path) > 1) {
    uri_path_condition <- paste0("(",
                                 paste(
                                   paste0("uri_path = '", uri_path, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_path_condition = paste0("uri_path = '", uri_path, "'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "USE wmf;
    SELECT uri_host, uri_path, uri_query, referer FROM webrequest
    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
}

# - set params to wmde_collect_pageviews
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path  <- c('/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
               '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia', 
               '/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote')
queryFile <- 'WMDE_oceditors_fall_2021_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")

# - collect Pageviews data
wmde_collect_pageviews(uri_host,
                       uri_path,
                       cetDay,
                       queryFile,
                       fileName,
                       dataDir)

### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
                                   dataDir, 
                                   uri_query_filter,
                                   page_filter,
                                   cetDay = cetDay,
                                   campaignName = campaignName) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  
  # - load
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("uri_host", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 1):(length(pageviewsData) - 1)]
  pageviewsData <- data.frame(dat = pageviewsData, 
                              stringsAsFactors = F)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  # - apply page_filter
  wFilter1 <- unique(unname(unlist(sapply(page_filter, function (x) {
    which(grepl(x, pageviewsData$uri_path))
  }))))
  pageviewsData <- pageviewsData[wFilter1, ]
  # - apply uri_query_filter
  w_uri_query_1 <- which(grepl(uri_query_filter, pageviewsData$uri_query))
  w_uri_query_2 <- which(grepl(uri_query_filter, pageviewsData$referer))
  w_uri_query <- unique(c(w_uri_query_1, w_uri_query_2))

  if (length(w_uri_query) > 0) {
    
    # - filter for w_uri_query
    pageviewsData <- pageviewsData[w_uri_query, ] 
    
    # - copy uri_query from referer where uri_query is empty:
    wCuriq <- which(pageviewsData$uri_query == "")
    curiq <- stringr::str_extract(pageviewsData$referer[wCuriq], 
                                  paste0(uri_query_filter, ".+$"))
    pageviewsData$uri_query[wCuriq] <- curiq
    # - remove ?campaign= from pageviewsData$uri_query
    pageviewsData$uri_query <- gsub("?campaign=", 
                                    "",
                                    pageviewsData$uri_query, 
                                    fixed = T)
    
    # - aggregate:
    pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
    pageviewsData$uri_host <- NULL
    pageviewsData$page <- NULL
    pageviewsData$referer <- NULL
    pageviewsData <- pageviewsData %>% 
      dplyr::select(uri_query, uri_path) %>% 
      dplyr::group_by(uri_query, uri_path) %>% 
      dplyr::summarise(pageviews = n())
    colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
    
    # - add cetDay, campaignName
    pageviewsData$date <- cetDay
    pageviewsData$campaign <- campaignName
    
    # - store:
    write.csv(pageviewsData, 
              paste0(analyticsDir, 
                     "pageviewsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    
  }
  
}

# - set params to wmde_process_pageviews
uri_query_filter <- 'WMDE_oceditors_fall_2021'
page_filter <- c('/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
                 '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia', 
                 '/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote')

# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
                       dataDir = dataDir,
                       uri_query_filter = uri_query_filter, 
                       page_filter = page_filter,
                       cetDay = cetDay,
                       campaignName = campaignName)


### ---------------------------------------------------------------------------
### --- Compose and copy to publicDir
### ---------------------------------------------------------------------------

lF <- list.files(analyticsDir)
lF <- lF[grepl("Aggregated", lF)]
lFp <- lF[grepl("pageviews", lF)]
lFp <- lapply(paste0(analyticsDir, lFp),
              data.table::fread)
lFp <- data.table::rbindlist(lFp)
write.csv(lFp, 
          paste0(publicDir, 
                 "2021_OccasionalEditors_Pageviews.csv"))
lFb <- lF[grepl("banner", lF)]
lFb <- lapply(paste0(analyticsDir, lFb),
              data.table::fread)
lFb <- data.table::rbindlist(lFb)
write.csv(lFb, 
          paste0(publicDir, 
                 "2021_OccasionalEditors_Banners.csv"))


### ----------------------------------------------------------
### --- User Edits
### ----------------------------------------------------------

1. Campaign Banners

This section presents all data and statistics on the campaign banners.

2. Pageviews

2.1 Pageviews Overview

Chart 2.1.1. Pageviews Overview.

dataSet <- read.csv(
  '_reporting/2021_OccasionalEditors_Pageviews.csv',
                    header = T,
                    row.names = 1,
                    check.names = F,
                    stringsAsFactors = F)
dataSet$V1 <- NULL
dataSet <- dataSet %>% 
  dplyr::filter(!(grepl("2021-11", date)))
dataSet$Tag <- sapply(dataSet$Tag, function(x) {
  if (grepl("ipad$", x)) {
    "ipad"
  } else if (grepl("mobile$", x)) {
    "mobile"
  } else {
    "desktop"
  }
})
dataSet$Page <- sapply(dataSet$Page, function(x) {
  if (grepl("F%C3%B6rderangebote", x)) {
    return("Förderung/Förderangebote")
  } else if (grepl("DeinEngagement", x)) {
    return("DeinEngagement ")
  } else {
    return("LerneWikipedia")
  }
})
dataSet <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::group_by(date, Tag, Page) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = date,
                    y = log(Pageviews),
                    group = Page,
                    color = Page,
                    fill = Page,
                    label = Pageviews,
                    )) + 
  geom_path(size = .25) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  ylab("log(Pageviews)") + 
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  facet_wrap(~Tag, ncol = 2) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(strip.text =  element_text(size = 15)) +
  theme(legend.position = "top")

Table 2.2.1. Pageviews Overview

### --- Full Dataset (Table Report)
datatable(dataSet %>% arrange(desc(Pageviews)))

2.2 Pageviews Daily

pFrame <- dataSet %>% 
  dplyr::select(date, Pageviews) %>% 
  dplyr::group_by(date) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))
Adding missing grouping variables: `Tag`
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = date,
                    y = Pageviews,
                    label = Pageviews,
                    )) + 
  geom_line(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")

2.3 Pageviews per Tag

pFrame <- dataSet %>% 
  dplyr::select(Tag, Pageviews) %>% 
  dplyr::group_by(Tag) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))
Adding missing grouping variables: `date`
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
                   y = Pageviews,
                   label = Pageviews)) +
  geom_bar(stat = "identity", fill = "deepskyblue") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")

2.3.1 Pageviews per Tag: pageviews per device - Landing Page 1 (DeinEngagement) only

pFrame <- dataSet %>% 
  ungroup() %>% 
  dplyr::filter(Page == "DeinEngagement ") %>% 
  dplyr::select(Tag, Pageviews) %>% 
  dplyr::group_by(Tag) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
                   y = Pageviews,
                   label = Pageviews)) +
  geom_bar(stat = "identity", fill = "deepskyblue") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")

2.3.2 Pageviews per Tag: pageviews per device - Landing Pages 2a and 2b

pFrame <- dataSet %>% 
  ungroup() %>% 
  dplyr::filter(Page != "DeinEngagement ") %>% 
  dplyr::select(Tag, Pageviews) %>% 
  dplyr::group_by(Tag) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
                   y = Pageviews,
                   label = Pageviews)) +
  geom_bar(stat = "identity", fill = "deepskyblue") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")

2.4 Pageviews per Page

pFrame <- dataSet %>% 
  dplyr::select(Page, Pageviews) %>% 
  dplyr::group_by(Page) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))
Adding missing grouping variables: `date`, `Tag`
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Page,
                   y = Pageviews,
                   label = Pageviews)) +
  geom_bar(stat = "identity", fill = "darkorange") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")

3. Editing Behaviour

dataSet <- read.csv(
  '_analytics/userEdits_FINAL_ANON.csv',
                    header = T,
                    row.names = 1,
                    check.names = F,
                    stringsAsFactors = F)
dataSet$banner <- sapply(dataSet$banner, function(x) {
  if (grepl("ipad$", x)) {
    "ipad"
  } else if (grepl("mobile$", x)) {
    "mobile"
  } else {
    "desktop"
  }
})

3.1 Edits per tag

Edits per tag during the campaign:

3.2 Daily edits per tag

Daily edits per tag during the campaign:

3.4 Daily edits (total)

Daily edits (total) during the campaign:

3.4 Edit classes

The following overview of edit classes includes the 0/1 edits class split and

editorFrame <- expand.grid(anon_userid = unique(dataSet$anon_userid),
                           campaignDay = unique(dataSet$campaignDay))
editorFrame <- dplyr::filter(editorFrame, !is.na(campaignDay))
ds <- dataSet %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision)
editorFrame <- editorFrame %>% 
  dplyr::left_join(ds, 
                   by = c("anon_userid", "campaignDay"))
editorFrame$ts_revision <- ifelse(is.na(editorFrame$ts_revision), 
                                  0, 
                                  1)
editorFrame <- editorFrame %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision) %>% 
  dplyr::group_by(anon_userid, campaignDay) %>% 
  dplyr::summarise(edits = sum(ts_revision))

editorFrameTotal <- editorFrame %>% 
  dplyr::select(anon_userid, edits) %>% 
  dplyr::group_by(anon_userid) %>% 
  dplyr::summarise(edits = sum(edits))

editBoundaries <- list(
  c(0, 0),
  c(1, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

editorFrameTotal$editClass <- sapply(editorFrameTotal$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return(">= 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})

editClass <- as.data.frame(table(editorFrameTotal$editClass), 
                           stringsAsFactors = F)
editClass[editClass == "(0 - 0)"] <- "0"
editClass[editClass == "(1 - 1)"] <- "1"
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)

3.5 Daily user edits before, during, and after campaign

pFrame <- dataSet %>% 
  dplyr::filter(!is.na(ts_revision)) %>% 
  dplyr::select(ts_revision, campaignDay) %>% 
  dplyr::group_by(ts_revision, campaignDay) %>% 
  dplyr::summarise(Edits = n())
colnames(pFrame) <- c("date", "campaign", "Edits")
ggplot(pFrame, aes(x = date,
                   y = Edits,
                   group = campaign,
                   color = campaign,
                   fill = campaign,
                   label = Edits,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  ggtitle('2021 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")

tFrame <- pFrame %>% 
  dplyr::select(campaign, Edits) %>% 
  dplyr::group_by(campaign) %>% 
  dplyr::summarise(totalEdits = sum(Edits),
                   avgEdits = round(mean(Edits), 2))
Adding missing grouping variables: `date`
DT::datatable(tFrame)

Edit Classes: Before Campaign

editorFrame <- expand.grid(anon_userid = unique(dataSet$anon_userid),
                           campaignDay = unique(dataSet$campaignDay))
editorFrame <- dplyr::filter(editorFrame, !is.na(campaignDay))
ds <- dataSet %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision)
editorFrame <- editorFrame %>% 
  dplyr::left_join(ds, 
                   by = c("anon_userid", "campaignDay"))
editorFrame$ts_revision <- ifelse(is.na(editorFrame$ts_revision), 
                                  0, 
                                  1)
editorFrame <- editorFrame %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision) %>% 
  dplyr::group_by(anon_userid, campaignDay) %>% 
  dplyr::summarise(edits = sum(ts_revision)) %>% 
  dplyr::filter(campaignDay == "beforeCampaign")

editBoundaries <- list(
  c(0, 0),
  c(1, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

editorFrame$editClass <- sapply(editorFrame$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return(">= 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})

editClass <- as.data.frame(table(editorFrame$editClass), 
                           stringsAsFactors = F)
editClass[editClass == "(0 - 0)"] <- "0"
editClass[editClass == "(1 - 1)"] <- "1"
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)

Edit Classes: During Campaign

editorFrame <- expand.grid(anon_userid = unique(dataSet$anon_userid),
                           campaignDay = unique(dataSet$campaignDay))
editorFrame <- dplyr::filter(editorFrame, !is.na(campaignDay))
ds <- dataSet %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision)
editorFrame <- editorFrame %>% 
  dplyr::left_join(ds, 
                   by = c("anon_userid", "campaignDay"))
editorFrame$ts_revision <- ifelse(is.na(editorFrame$ts_revision), 
                                  0, 
                                  1)
editorFrame <- editorFrame %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision) %>% 
  dplyr::group_by(anon_userid, campaignDay) %>% 
  dplyr::summarise(edits = sum(ts_revision)) %>% 
  dplyr::filter(campaignDay == "Campaign")

editBoundaries <- list(
  c(0, 0),
  c(1, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

editorFrame$editClass <- sapply(editorFrame$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return(">= 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})

editClass <- as.data.frame(table(editorFrame$editClass), 
                           stringsAsFactors = F)
editClass[editClass == "(0 - 0)"] <- "0"
editClass[editClass == "(1 - 1)"] <- "1"
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)

Edit Classes: After Campaign

editorFrame <- expand.grid(anon_userid = unique(dataSet$anon_userid),
                           campaignDay = unique(dataSet$campaignDay))
editorFrame <- dplyr::filter(editorFrame, !is.na(campaignDay))
ds <- dataSet %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision)
editorFrame <- editorFrame %>% 
  dplyr::left_join(ds, 
                   by = c("anon_userid", "campaignDay"))
editorFrame$ts_revision <- ifelse(is.na(editorFrame$ts_revision), 
                                  0, 
                                  1)
editorFrame <- editorFrame %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision) %>% 
  dplyr::group_by(anon_userid, campaignDay) %>% 
  dplyr::summarise(edits = sum(ts_revision)) %>% 
  dplyr::filter(campaignDay == "afterCampaign")

editBoundaries <- list(
  c(0, 0),
  c(1, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

editorFrame$editClass <- sapply(editorFrame$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return(">= 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})

editClass <- as.data.frame(table(editorFrame$editClass), 
                           stringsAsFactors = F)
editClass[editClass == "(0 - 0)"] <- "0"
editClass[editClass == "(1 - 1)"] <- "1"
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
dewikiEdits <- read.csv(paste0(getwd(),"/_reporting/", "avg_edits_dewiki.csv"),
                      header = TRUE, 
                      check.names = FALSE,
                      row.names = 1,
                      stringsAsFactors = FALSE)
dewikiEdits$dateType <- as.character(dewikiEdits$revactor_timestamp)
dewikiEdits$dateType <- substr(dewikiEdits$dateType, 1, 8)
dewikiEdits$dateType <- sapply(dewikiEdits$dateType, function(x) {
  d <- paste0(
      substr(x, 1, 4), 
      "-",
      substr(x, 5, 6),
      "-",
      substr(x, 7, 8)
    )
  return(d)
})
dewikiEdits <- dplyr::arrange(dewikiEdits, dateType) %>% 
  dplyr::select(user_id, dateType)

campaignDays <- c("2021-10-18", "2021-10-19", "2021-10-20", "2021-10-21", "2021-10-22", "2021-10-23",
                  "2021-10-24", "2021-10-25", "2021-10-26", "2021-10-27", "2021-10-28", "2021-10-29",
                  "2021-10-30", "2021-10-31")

dewikiEdits <- dewikiEdits %>% 
  select(dateType) %>% 
  group_by(dateType) %>% 
  summarise(Edits = n())
dewikiEdits <- arrange(dewikiEdits, dateType)
dewikiEdits$campaign <- sapply(dewikiEdits$dateType, function(x) {
  if (x < "2021-10-18") {
    return("Before Campaign")
  } else if (x > "2021-10-31") {
    return("After Campaign") 
  } else {
    return("Campaign")
  }
})

campaignBefore <- mean(pFrame$Edits[pFrame$campaign == "beforeCampaign"])
campaign <- mean(pFrame$Edits[pFrame$campaign == "Campaign"])
campaignAfter <- mean(pFrame$Edits[pFrame$campaign == "afterCampaign"])

dewikiBefore <- mean(dewikiEdits$Edits[dewikiEdits$campaign == "Before Campaign"])
dewiki <- mean(dewikiEdits$Edits[dewikiEdits$campaign == "Campaign"])
dewikiAfter <- mean(dewikiEdits$Edits[dewikiEdits$campaign == "After Campaign"])

In comparison to the overall edits in dewiki, in the same time span as in the chart above:

  • before the campaign, the campaign users made 99.89 edits on the average daily, while in dewiki we find 1234.18 edits;
  • during the campaign, the campaign users made 192.64 edits on the average daily, while in dewiki we find 1256.5 edits;
  • and after the campaign, the campaign users made 159.11 edits on the average daily, while in dewiki we find 1350.32 edits;
---
title: 'Occasional Editors 2021: Interim Report'
author: "Goran S. Milovanovic, Data Scientist, WMDE"
date: "November 3, 2021"
output:
  html_notebook:
    code_folding: hide
    theme: cosmo
    toc: yes
    toc_float: yes
    toc_depth: 5
  html_document:
    toc: yes
    toc_depth: 5
---

**Feedback** should be send to `goran.milovanovic_ext@wikimedia.de`. 

The campaign is run from 2021/10/18 to 2021/10/31.

**CURRENT UPDATE:** Complete dataset as of 2021/10/31.

```{r, echo = F, warning = 'hide', message = F, results = 'hide'}
# !diagnostics off
### --- Setup
options(dplyr.summarise.inform = FALSE)
knitr::opts_chunk$set(fig.width = 15, fig.height = 8) 
rm(list = ls())
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
library(ggplot2)
library(ggrepel)
library(scales)
library(RColorBrewer)
library(kableExtra)
library(rmarkdown)
library(knitr)
library(DT)
library(reshape2)
library(purrr)
```

## 0. Data Acquisiton

**NOTE:** the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running an R script on `stat1007.eqiad.wmnet`, collecting the data as `.tsv` and `.csv` files, copying manually, and processing locally. 

### 0.1 Daily Update

```{r, echo = T, eval = F}

### ----------------------------------------------------------------
### --- WMDE 2021 Ocassional Editors
### --- https://phabricator.wikimedia.org/T291635
### ----------------------------------------------------------------

### ----------------------------------------------------------------
### --- Campaign Description and Parameters
### ----------------------------------------------------------------

# - Landing Pages:
# - Landing Page 1: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement
# - Landing Page 2a: https://tools.wmflabs.org/mitmachen/
# - Landing Page 2b: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia
# - Landing Page 2c: https://de.wikipedia.org/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote
# - Campaign tags:
# - WMDE_oceditors_fall_2021

# - Start of the banner campaign: October 12th
# - End of the banner campaign: October 26 th
# - Tracking test: October 5 - 7
# - Preliminary report for tracking: beginning of November
# - Track editing behavior four weeks after end of campaign: November 23rd
# - final report for tracking: beginning of December

### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)

### --- campaign specifics
campaignName <- 'OccasionalEditors_2021'

### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/2021_OccasionalEditors/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
publicDir <- '/srv/published/datasets/wmde-analytics-engineering/NewEditors/campaigns/2021_OccasionalEditors/'

### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
  strsplit(as.character(cetDay), 
           split = " ", 
           fixed = T)[[1]][1]
) - 1

### ----------------------------------------------------------
### --- Banner Interactions:
### --- via event.WMDEBannerInteractions
### --- https://meta.wikimedia.org/wiki/Schema:WMDEBannerInteractions
### ----------------------------------------------------------

# - select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID 
# - from event.wmdebannerinteractions where year=2020 and month=5 and (day=11 or day=12 or day=13);

# - function: wmde_banner_actions
wmde_banner_actions <- function(uri_query_filter,
                                cetDay,
                                queryFile,
                                fileName,
                                analyticsDir,
                                campaignName) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  
  # - WHERE condition: create eventBannerName_condition
  if (length(uri_query_filter) > 1) {
    eventBannerName_condition <- paste0("(",
                                        paste(
                                          paste0("event.bannerName LIKE '%", uri_query_filter, "%'"),
                                          collapse = " OR ", sep = " "),
                                        ")"
    )
  } else {
    eventBannerName_condition = paste0("event.bannerName LIKE '%", uri_query_filter, "%'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID from event.wmdebannerinteractions 
    WHERE (",
    eventBannerName_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  
  # - write hql
  write(hiveQuery, paste0(dataDir, queryFile))
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
  
  # - Wrangle Banner Interactions
  # - load
  bannerData <- tryCatch({
    as.data.frame(fread(paste0(dataDir, fileName)))
  },
  error = function(condition) {
    return(FALSE)
  })
  
  # - process
  if (class(bannerData) == 'logical') {
    
    return(FALSE) 
    
  } else { 
    
    # - whoClicked
    whoClicked <- bannerData %>% 
      dplyr::filter(banneraction == "banner-clicked")
    whoClicked <- data.frame(userid = unique(whoClicked$userid))
    
    # - store:
    write.csv(whoClicked, 
              paste0(analyticsDir, 
                     "whoClicked_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    
    # - process bannerData
    bannerData <- t(table(bannerData$banneraction,
                          bannerData$bannername))
    bannerData <- as.data.frame(bannerData)
    colnames(bannerData) <- c('banner', 'action', 'count')
    bannerData$day <- cetDay
    bannerData$campaign <- campaignName
    
    # - store:
    write.csv(bannerData, 
              paste0(analyticsDir, 
                     "bannerInteractionsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              ))
  }
  
}

# - set params for wmde_banner_actions()
queryFile <- paste0(campaignName, "_bannerInteractions.hql")
fileName <- paste0("bannerInteractions_", cetDay, ".tsv")
uri_query_filter <- 'WMDE_oceditors_fall_2021'
banner_status <- wmde_banner_actions(uri_query_filter = uri_query_filter,
                                     cetDay = cetDay,
                                     queryFile = queryFile,
                                     fileName = fileName,
                                     analyticsDir = analyticsDir, 
                                     campaignName = campaignName)

### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------

# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  
  # - WHERE condition: create datetime_condition
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  ) 
  attr(cet_condition, "tzone") <- "UTC"
  cet_condition <- as.character(cet_condition)
  cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
  cet_years <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][1]
    })
  cet_months <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][2]
    })
  cet_months <- gsub("^0", "", cet_months)
  cet_days <- sapply(
    strsplit(cet_condition, split = " ", fixed = T), function(x) {
      strsplit(x, split = "-")[[1]][3]
    })
  cet_days <- gsub("^0", "", cet_days)
  cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T), 
                      function(x) {
                        x[2]
                      })
  cet_hours <- gsub("^0", "", cet_hours)
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ", 
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", 
                             datetimeCondition, 
                             ")",
                             collapse = " OR ", 
                             sep = "")
  
  # - WHERE condition: create uri_host_condition
  if (length(uri_host) > 1) {
    uri_host_condition <- paste0("(",
                                 paste(
                                   paste0("uri_host = '", uri_host, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_host_condition = paste0("uri_host = '", uri_host, "'")
  }
  
  # - WHERE condition: create uri_path_condition
  if (length(uri_path) > 1) {
    uri_path_condition <- paste0("(",
                                 paste(
                                   paste0("uri_path = '", uri_path, "'"),
                                   collapse = " OR ", sep = " "),
                                 ")"
    )
  } else {
    uri_path_condition = paste0("uri_path = '", uri_path, "'")
  }
  
  # - compose HiveQL query
  hiveQuery <- paste0( 
    "USE wmf;
    SELECT uri_host, uri_path, uri_query, referer FROM webrequest
    WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script:
  kerberosPrefix <- 
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'), 
         wait = T)
  # - Run query
  query <- system(command = paste(kerberosPrefix, 
                                  '/usr/local/bin/beeline --incremental=true --silent -f "',
                                  paste0(dataDir, queryFile),
                                  '" > ', dataDir, fileName,
                                  sep = ""),
                  wait = TRUE)
}

# - set params to wmde_collect_pageviews
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path  <- c('/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
               '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia', 
               '/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote')
queryFile <- 'WMDE_oceditors_fall_2021_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")

# - collect Pageviews data
wmde_collect_pageviews(uri_host,
                       uri_path,
                       cetDay,
                       queryFile,
                       fileName,
                       dataDir)

### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
                                   dataDir, 
                                   uri_query_filter,
                                   page_filter,
                                   cetDay = cetDay,
                                   campaignName = campaignName) {
  
  # - to dataDir
  setwd(dataDir)
  
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  
  # - load
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("uri_host", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 1):(length(pageviewsData) - 1)]
  pageviewsData <- data.frame(dat = pageviewsData, 
                              stringsAsFactors = F)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  # - apply page_filter
  wFilter1 <- unique(unname(unlist(sapply(page_filter, function (x) {
    which(grepl(x, pageviewsData$uri_path))
  }))))
  pageviewsData <- pageviewsData[wFilter1, ]
  # - apply uri_query_filter
  w_uri_query_1 <- which(grepl(uri_query_filter, pageviewsData$uri_query))
  w_uri_query_2 <- which(grepl(uri_query_filter, pageviewsData$referer))
  w_uri_query <- unique(c(w_uri_query_1, w_uri_query_2))

  if (length(w_uri_query) > 0) {
    
    # - filter for w_uri_query
    pageviewsData <- pageviewsData[w_uri_query, ] 
    
    # - copy uri_query from referer where uri_query is empty:
    wCuriq <- which(pageviewsData$uri_query == "")
    curiq <- stringr::str_extract(pageviewsData$referer[wCuriq], 
                                  paste0(uri_query_filter, ".+$"))
    pageviewsData$uri_query[wCuriq] <- curiq
    # - remove ?campaign= from pageviewsData$uri_query
    pageviewsData$uri_query <- gsub("?campaign=", 
                                    "",
                                    pageviewsData$uri_query, 
                                    fixed = T)
    
    # - aggregate:
    pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
    pageviewsData$uri_host <- NULL
    pageviewsData$page <- NULL
    pageviewsData$referer <- NULL
    pageviewsData <- pageviewsData %>% 
      dplyr::select(uri_query, uri_path) %>% 
      dplyr::group_by(uri_query, uri_path) %>% 
      dplyr::summarise(pageviews = n())
    colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
    
    # - add cetDay, campaignName
    pageviewsData$date <- cetDay
    pageviewsData$campaign <- campaignName
    
    # - store:
    write.csv(pageviewsData, 
              paste0(analyticsDir, 
                     "pageviewsAggregated_",
                     strsplit(
                       strsplit(fileName, split = "_", fixed = T)[[1]][2],
                       split = ".", 
                       fixed = T)[[1]][1],
                     ".csv"
              )
    )
    
  }
  
}

# - set params to wmde_process_pageviews
uri_query_filter <- 'WMDE_oceditors_fall_2021'
page_filter <- c('/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
                 '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia', 
                 '/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote')

# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
                       dataDir = dataDir,
                       uri_query_filter = uri_query_filter, 
                       page_filter = page_filter,
                       cetDay = cetDay,
                       campaignName = campaignName)


### ---------------------------------------------------------------------------
### --- Compose and copy to publicDir
### ---------------------------------------------------------------------------

lF <- list.files(analyticsDir)
lF <- lF[grepl("Aggregated", lF)]
lFp <- lF[grepl("pageviews", lF)]
lFp <- lapply(paste0(analyticsDir, lFp),
              data.table::fread)
lFp <- data.table::rbindlist(lFp)
write.csv(lFp, 
          paste0(publicDir, 
                 "2021_OccasionalEditors_Pageviews.csv"))
lFb <- lF[grepl("banner", lF)]
lFb <- lapply(paste0(analyticsDir, lFb),
              data.table::fread)
lFb <- data.table::rbindlist(lFb)
write.csv(lFb, 
          paste0(publicDir, 
                 "2021_OccasionalEditors_Banners.csv"))


### ----------------------------------------------------------
### --- User Edits
### ----------------------------------------------------------





```


## 1. Campaign Banners

This section presents all data and statistics on the campaign banners.

### 1.1 Banner Impressions
#### 1.1.1 Banner Impressions Overview

**Chart 1.1.1** Daily Banner Impressions

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - wrangle dataSet
dataSet <- read.csv(
  '_reporting/2021_OccasionalEditors_Banners.csv',
                    header = T,
                    row.names = 1,
                    check.names = F,
                    stringsAsFactors = F)
dataSet$V1 <- NULL
dataSet <- dataSet %>% 
  dplyr::filter(!grepl("2021-11", dataSet$day))
dataSet$banner <- gsub("_ctrl", "", dataSet$banner, fixed = T)
dataSet$banner <- sapply(dataSet$banner, function(x) {
  if (grepl("ipad$", x)) {
    "ipad"
  } else if (grepl("mobile$", x)) {
    "mobile"
  } else {
    "desktop"
  }
})

# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-seen") %>% 
  dplyr::select(-action)

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = day,
                   y = count,
                   group = banner,
                   color = banner,
                   fill = banner,
                   label = count)) + 
  geom_line(size = .5) + 
  geom_point(size = 1.5) + 
  ggrepel::geom_text_repel(size = 4, show.legend = FALSE) +
  ggtitle('Occasional Editors 2021: Daily Banner Impressions') +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

#### 1.1.1 Banner Impressions Overview: Table

**Table 1.1.1.** Daily Banner Impressions

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
datatable(pFrame)
```

#### 1.1.2 Total Banner Impressions

**Chart 1.1.2.** Total Banner Impressions

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-seen") %>%
  dplyr::select(-action, - day) %>% 
  dplyr::group_by(banner) %>% 
  dplyr::summarise(totalImpressions = sum(count))

ggplot(pFrame, aes(x = banner, 
                    y = totalImpressions, 
                    color = banner, 
                    fill = banner, 
                    label = totalImpressions)) + 
  geom_bar(width = .5, stat = "identity") + 
  ggtitle('Occasional Editors 2021: Banner Impressions') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, color = "white", show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  theme(axis.text.x = element_blank())
```

#### 1.1.3 Banner Impressions per Day

**Chart 1.1.3.** Banner Impressions per Day

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-seen") %>%
  dplyr::select(-action, -banner) %>% 
  dplyr::group_by(day) %>% 
  dplyr::summarise(totalImpressions = sum(count))


ggplot(pFrame, aes(x = day, 
                    y = totalImpressions, 
                    label = totalImpressions)) +
  geom_path(size = .25, group = 1, color = "darkblue") +
  geom_point(size = 1.5, color = "darkblue") + 
  geom_point(size = 1, color = "white") + 
  ggtitle('Occasional Editors 2021: Banner Impressions') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  theme(axis.text.x = element_text(angle = 90))
```

### 1.2 Banner Clicks
#### 1.2.1 Banner Clicks Overview

**Chart 1.2.1** Daily Banner Clicks

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-clicked") %>% 
  dplyr::select(-action)

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = day,
                   y = count,
                   group = banner,
                   color = banner,
                   fill = banner,
                   label = count
                    )) + 
  geom_line(size = .5) + 
  geom_point(size = 1.5) +
  ggrepel::geom_text_repel(size = 4) +
  ggtitle('Occasional Editors 2021: Daily Banner Clicks') +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

#### 1.2.1 Banner Clicks Overview: Table

**Table 1.2.1.** Daily Banner Clicks

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
datatable(pFrame)
```

#### 1.2.2 Total Banner Clicks

**Chart 1.2.2.** Total Banner Clicks

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-clicked") %>%
  dplyr::select(-action, - day) %>% 
  dplyr::group_by(banner) %>% 
  dplyr::summarise(totalClicks = sum(count))

ggplot(pFrame, aes(x = banner, 
                    y = totalClicks, 
                    color = banner, 
                    fill = banner, 
                    label = totalClicks)) + 
  geom_bar(width = .5, stat = "identity") + 
  ggtitle('Occasional Editors 2021: Banner Clicks') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, color = "white", show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  theme(axis.text.x = element_blank())
```

#### 1.2.3 Banner Clicks per Day

**Chart 1.2.3.** Banner Clicks per Day

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-clicked") %>%
  dplyr::select(-action, -banner) %>% 
  dplyr::group_by(day) %>% 
  dplyr::summarise(totalClicks = sum(count))


ggplot(pFrame, aes(x = day, 
                    y = totalClicks, 
                    label = totalClicks)) +
  geom_path(size = .25, group = 1, color = "darkblue") +
  geom_point(size = 1.5, color = "darkblue") + 
  geom_point(size = 1, color = "white") + 
  ggtitle('Occasional Editors 2021: Banner Clicks') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  theme(axis.text.x = element_text(angle = 90))
```

### 1.3 Banner Close
#### 1.3.1 Banner Close Overview

**Chart 1.3.1** Daily Banner Close

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-closed") %>% 
  dplyr::select(-action)

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = day,
                   y = count,
                   group = banner,
                   color = banner,
                   fill = banner,
                   label = count)) + 
  geom_line(size = .5) + 
  geom_point(size = 1.5) +
  ggtitle('Occasional Editors 2021: Daily Banner Close') +
  ggrepel::geom_text_repel(size = 4) +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

#### 1.3.1 Banner Close Overview: Table

**Table 1.3.1.** Daily Banner Close

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
datatable(pFrame)
```

#### 1.3.2 Total Banner Close

**Chart 1.3.2.** Total Banner Close

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-closed") %>%
  dplyr::select(-action, - day) %>% 
  dplyr::group_by(banner) %>% 
  dplyr::summarise(totalClose = sum(count))

ggplot(pFrame, aes(x = banner, 
                    y = totalClose, 
                    color = banner, 
                    fill = banner, 
                    label = totalClose)) + 
  geom_bar(width = .5, stat = "identity") + 
  ggtitle('Occasional Editors 2021: Banner Close') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, color = "white", show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  theme(axis.text.x = element_blank())
```

#### 1.3.3 Banner Close per Day

**Chart 1.3.3.** Banner Close per Day

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action == "banner-closed") %>%
  dplyr::select(-action, -banner) %>% 
  dplyr::group_by(day) %>% 
  dplyr::summarise(totalClose = sum(count))


ggplot(pFrame, aes(x = day, 
                    y = totalClose, 
                    label = totalClose)) +
  geom_path(size = .25, group = 1, color = "darkblue") +
  geom_point(size = 1.5, color = "darkblue") + 
  geom_point(size = 1, color = "white") + 
  ggtitle('Occasional Editors 2021: Banner Close') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  theme(axis.text.x = element_text(angle = 90))
```

### 1.4 Banner Click Rate
#### 1.4.1 Banner Click Rate Overview

**Chart 1.4.1** Daily Banner Close

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
pFrame <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::filter(action != "banner-closed") %>% 
  tidyr::pivot_wider(names_from = "action", 
                     values_from = "count",
                     values_fill = 0)
pFrame$click_rate <- pFrame$`banner-clicked`/pFrame$`banner-seen`
pFrame <- pFrame %>% 
  dplyr::select(banner, day, click_rate)

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = day,
                   y = click_rate,
                   group = banner,
                   color = banner,
                   fill = banner, 
                   label = round(click_rate, 2))) + 
  geom_line(size = .5) + 
  geom_point(size = 1.5) +
  ggrepel::geom_text_repel(size = 4) +
  ggtitle('Occasional Editors 2021: Daily Banner Click Rate') +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

#### 1.4.1 Banner Close Overview: Table

**Table 1.4.1.** Daily Banner Close

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
datatable(pFrame)
```

#### 1.4.2 Average Banner Click Rate

**Chart 1.4.2.** Average Banner Click Rate

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
ppFrame <- pFrame %>% 
  dplyr::select(banner, click_rate) %>% 
  dplyr::group_by(banner) %>% 
  dplyr::summarise(avg_click_rate = mean(click_rate))

ggplot(ppFrame, aes(x = banner, 
                    y = avg_click_rate, 
                    color = banner, 
                    fill = banner, 
                    label = round(avg_click_rate, 2))) + 
  geom_bar(width = .5, stat = "identity") + 
  ggtitle('Occasional Editors 2021: Average Banner Click Rate') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, color = "white", show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  theme(axis.text.x = element_blank())
```

#### 1.4.3 Average Banner Click Rate per Day

**Chart 1.3.3.** Average Banner Click Rate per Day

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
# - pFrame
ppFrame <- pFrame %>% 
  dplyr::select(day, click_rate) %>% 
  dplyr::group_by(day) %>% 
  dplyr::summarise(avg_click_rate = mean(click_rate))
  
ggplot(ppFrame, aes(x = day, 
                    y = avg_click_rate, 
                    label = round(avg_click_rate, 2))) +
  geom_path(size = .25, group = 1, color = "darkblue") +
  geom_point(size = 1.5, color = "darkblue") + 
  geom_point(size = 1, color = "white") + 
  ggtitle('Occasional Editors 2021: Average Banner Click Rate') +
  theme_minimal() + 
  geom_label_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "right") + 
  theme(axis.text.x = element_text(angle = 90))
```

## 2. Pageviews

### 2.1 Pageviews Overview

**Chart 2.1.1.** Pageviews Overview. 

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
dataSet <- read.csv(
  '_reporting/2021_OccasionalEditors_Pageviews.csv',
                    header = T,
                    row.names = 1,
                    check.names = F,
                    stringsAsFactors = F)
dataSet$V1 <- NULL
dataSet <- dataSet %>% 
  dplyr::filter(!(grepl("2021-11", date)))
dataSet$Tag <- sapply(dataSet$Tag, function(x) {
  if (grepl("ipad$", x)) {
    "ipad"
  } else if (grepl("mobile$", x)) {
    "mobile"
  } else {
    "desktop"
  }
})
dataSet$Page <- sapply(dataSet$Page, function(x) {
  if (grepl("F%C3%B6rderangebote", x)) {
    return("Förderung/Förderangebote")
  } else if (grepl("DeinEngagement", x)) {
    return("DeinEngagement ")
  } else {
    return("LerneWikipedia")
  }
})
dataSet <- dataSet %>% 
  dplyr::select(-campaign) %>% 
  dplyr::group_by(date, Tag, Page) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = date,
                    y = log(Pageviews),
                    group = Page,
                    color = Page,
                    fill = Page,
                    label = Pageviews,
                    )) + 
  geom_path(size = .25) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  ylab("log(Pageviews)") + 
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  facet_wrap(~Tag, ncol = 2) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(strip.text =  element_text(size = 15)) +
  theme(legend.position = "top")
```


**Table 2.2.1.** Pageviews Overview

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
### --- Full Dataset (Table Report)
datatable(dataSet %>% arrange(desc(Pageviews)))
```

### 2.2 Pageviews Daily

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
pFrame <- dataSet %>% 
  dplyr::select(date, Pageviews) %>% 
  dplyr::group_by(date) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = date,
                    y = Pageviews,
                    label = Pageviews,
                    )) + 
  geom_line(size = .25, group = 1) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")
```

### 2.3 Pageviews per Tag

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
pFrame <- dataSet %>% 
  dplyr::select(Tag, Pageviews) %>% 
  dplyr::group_by(Tag) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
                   y = Pageviews,
                   label = Pageviews)) +
  geom_bar(stat = "identity", fill = "deepskyblue") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")
```

#### 2.3.1 Pageviews per Tag: pageviews per device - Landing Page 1 (DeinEngagement) only

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
pFrame <- dataSet %>% 
  ungroup() %>% 
  dplyr::filter(Page == "DeinEngagement ") %>% 
  dplyr::select(Tag, Pageviews) %>% 
  dplyr::group_by(Tag) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
                   y = Pageviews,
                   label = Pageviews)) +
  geom_bar(stat = "identity", fill = "deepskyblue") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")
```

#### 2.3.2 Pageviews per Tag: pageviews per device - Landing Pages 2a and 2b

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
pFrame <- dataSet %>% 
  ungroup() %>% 
  dplyr::filter(Page != "DeinEngagement ") %>% 
  dplyr::select(Tag, Pageviews) %>% 
  dplyr::group_by(Tag) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
                   y = Pageviews,
                   label = Pageviews)) +
  geom_bar(stat = "identity", fill = "deepskyblue") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")
```

### 2.4 Pageviews per Page

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
pFrame <- dataSet %>% 
  dplyr::select(Page, Pageviews) %>% 
  dplyr::group_by(Page) %>% 
  dplyr::summarise(Pageviews = sum(Pageviews))

# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Page,
                   y = Pageviews,
                   label = Pageviews)) +
  geom_bar(stat = "identity", fill = "darkorange") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Pageviews') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")
```


## 3. Editing Behaviour

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
dataSet <- read.csv(
  '_analytics/userEdits_FINAL_ANON.csv',
                    header = T,
                    row.names = 1,
                    check.names = F,
                    stringsAsFactors = F)
dataSet$banner <- sapply(dataSet$banner, function(x) {
  if (grepl("ipad$", x)) {
    "ipad"
  } else if (grepl("mobile$", x)) {
    "mobile"
  } else {
    "desktop"
  }
})
```

### 3.1 Edits per tag

Edits per tag **during the campaign**:

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
pFrame <- dataSet %>% 
  dplyr::filter(!is.na(ts_revision)) %>% 
  dplyr::filter(campaignDay == "Campaign") %>%
  dplyr::select(banner) %>% 
  dplyr::group_by(banner) %>% 
  dplyr::summarise(edits = n())
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = banner,
                   y = edits,
                   label = edits)) +
  geom_bar(stat = "identity", fill = "darkRed") +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Edits') +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")
```


### 3.2 Daily edits per tag

Daily edits per tag **during the campaign**:

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
pFrame <- dataSet %>% 
  dplyr::filter(!is.na(ts_revision)) %>%
  dplyr::filter(campaignDay == "Campaign") %>%
  dplyr::select(banner, ts_revision)
pFrame <- pFrame %>%
  dplyr::group_by(banner, ts_revision) %>%
  dplyr::summarise(edits = n())
pFrame$ts_revision <- as.factor(pFrame$ts_revision)
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = ts_revision,
                   y = log(edits),
                   label = edits, 
                   group = banner, 
                   color = banner,
                   fill = banner)) +
  geom_point(size = 1.5) + 
  geom_point(size = 1, color = "white") + 
  geom_line(size = .25) +
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Edits') +
  xlab("Day") + ylab("Log(edits)") +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")
```

### 3.4 Daily edits (total)

Daily edits (total) **during the campaign**:

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
pFrame <- pFrame %>%
  dplyr::select(ts_revision, edits) %>% 
  dplyr::group_by(ts_revision) %>%
  dplyr::summarise(edits = sum(edits))
pFrame$ts_revision <- as.factor(pFrame$ts_revision)
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = ts_revision,
                   y = log(edits),
                   label = edits)) +
  geom_line(size = .25, group = 1, color = "darkred") +
  geom_point(size = 1.5, color = "darkred") + 
  geom_point(size = 1, color = "white") + 
  scale_y_continuous(labels = comma) +
  ggtitle('Occasional Editors 2021: Edits') +
  xlab("Day") + ylab("Log(edits)") +
  theme_minimal() + 
  geom_text_repel(size = 6, show.legend = FALSE) + 
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  theme(plot.title = element_text(size = 15)) +
  theme(legend.title = element_blank()) + 
  theme(legend.text = element_text(size = 15)) +
  theme(legend.position = "top")
```

### 3.4 Edit classes

The following overview of edit classes includes the `0/1` edits class split and 

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
editorFrame <- expand.grid(anon_userid = unique(dataSet$anon_userid),
                           campaignDay = unique(dataSet$campaignDay))
editorFrame <- dplyr::filter(editorFrame, !is.na(campaignDay))
ds <- dataSet %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision)
editorFrame <- editorFrame %>% 
  dplyr::left_join(ds, 
                   by = c("anon_userid", "campaignDay"))
editorFrame$ts_revision <- ifelse(is.na(editorFrame$ts_revision), 
                                  0, 
                                  1)
editorFrame <- editorFrame %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision) %>% 
  dplyr::group_by(anon_userid, campaignDay) %>% 
  dplyr::summarise(edits = sum(ts_revision))

editorFrameTotal <- editorFrame %>% 
  dplyr::select(anon_userid, edits) %>% 
  dplyr::group_by(anon_userid) %>% 
  dplyr::summarise(edits = sum(edits))

editBoundaries <- list(
  c(0, 0),
  c(1, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

editorFrameTotal$editClass <- sapply(editorFrameTotal$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return(">= 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})

editClass <- as.data.frame(table(editorFrameTotal$editClass), 
                           stringsAsFactors = F)
editClass[editClass == "(0 - 0)"] <- "0"
editClass[editClass == "(1 - 1)"] <- "1"
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```

### 3.5 Daily user edits before, during, and after campaign

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
pFrame <- dataSet %>% 
  dplyr::filter(!is.na(ts_revision)) %>% 
  dplyr::select(ts_revision, campaignDay) %>% 
  dplyr::group_by(ts_revision, campaignDay) %>% 
  dplyr::summarise(Edits = n())
colnames(pFrame) <- c("date", "campaign", "Edits")
ggplot(pFrame, aes(x = date,
                   y = Edits,
                   group = campaign,
                   color = campaign,
                   fill = campaign,
                   label = Edits,
                    )) + 
  geom_path(size = .5) + 
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) + 
  scale_y_continuous(labels = comma) +
  ggtitle('2021 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) + 
  theme(legend.position = "top")
```

```{r echo = T, eval = T, warning = 'hide', message = FALSE}
tFrame <- pFrame %>% 
  dplyr::select(campaign, Edits) %>% 
  dplyr::group_by(campaign) %>% 
  dplyr::summarise(totalEdits = sum(Edits),
                   avgEdits = round(mean(Edits), 2))
DT::datatable(tFrame)
```

**Edit Classes: Before Campaign**

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
editorFrame <- expand.grid(anon_userid = unique(dataSet$anon_userid),
                           campaignDay = unique(dataSet$campaignDay))
editorFrame <- dplyr::filter(editorFrame, !is.na(campaignDay))
ds <- dataSet %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision)
editorFrame <- editorFrame %>% 
  dplyr::left_join(ds, 
                   by = c("anon_userid", "campaignDay"))
editorFrame$ts_revision <- ifelse(is.na(editorFrame$ts_revision), 
                                  0, 
                                  1)
editorFrame <- editorFrame %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision) %>% 
  dplyr::group_by(anon_userid, campaignDay) %>% 
  dplyr::summarise(edits = sum(ts_revision)) %>% 
  dplyr::filter(campaignDay == "beforeCampaign")

editBoundaries <- list(
  c(0, 0),
  c(1, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

editorFrame$editClass <- sapply(editorFrame$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return(">= 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})

editClass <- as.data.frame(table(editorFrame$editClass), 
                           stringsAsFactors = F)
editClass[editClass == "(0 - 0)"] <- "0"
editClass[editClass == "(1 - 1)"] <- "1"
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```

**Edit Classes: During Campaign**

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
editorFrame <- expand.grid(anon_userid = unique(dataSet$anon_userid),
                           campaignDay = unique(dataSet$campaignDay))
editorFrame <- dplyr::filter(editorFrame, !is.na(campaignDay))
ds <- dataSet %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision)
editorFrame <- editorFrame %>% 
  dplyr::left_join(ds, 
                   by = c("anon_userid", "campaignDay"))
editorFrame$ts_revision <- ifelse(is.na(editorFrame$ts_revision), 
                                  0, 
                                  1)
editorFrame <- editorFrame %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision) %>% 
  dplyr::group_by(anon_userid, campaignDay) %>% 
  dplyr::summarise(edits = sum(ts_revision)) %>% 
  dplyr::filter(campaignDay == "Campaign")

editBoundaries <- list(
  c(0, 0),
  c(1, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

editorFrame$editClass <- sapply(editorFrame$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return(">= 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})

editClass <- as.data.frame(table(editorFrame$editClass), 
                           stringsAsFactors = F)
editClass[editClass == "(0 - 0)"] <- "0"
editClass[editClass == "(1 - 1)"] <- "1"
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```

**Edit Classes: After Campaign**

```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.height  = 10}
editorFrame <- expand.grid(anon_userid = unique(dataSet$anon_userid),
                           campaignDay = unique(dataSet$campaignDay))
editorFrame <- dplyr::filter(editorFrame, !is.na(campaignDay))
ds <- dataSet %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision)
editorFrame <- editorFrame %>% 
  dplyr::left_join(ds, 
                   by = c("anon_userid", "campaignDay"))
editorFrame$ts_revision <- ifelse(is.na(editorFrame$ts_revision), 
                                  0, 
                                  1)
editorFrame <- editorFrame %>% 
  dplyr::select(anon_userid, campaignDay, ts_revision) %>% 
  dplyr::group_by(anon_userid, campaignDay) %>% 
  dplyr::summarise(edits = sum(ts_revision)) %>% 
  dplyr::filter(campaignDay == "afterCampaign")

editBoundaries <- list(
  c(0, 0),
  c(1, 1), 
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

editorFrame$editClass <- sapply(editorFrame$edits, function(x) {
  wEC <- sapply(editBoundaries, function(y) {
    x >= y[1] & x <= y[2]
  })
  if (sum(wEC) == 0) {
    return(">= 100")
  } else {
    return(paste0("(",
                  editBoundaries[[which(wEC)]][1],
                  " - ",
                  editBoundaries[[which(wEC)]][2], 
                  ")"
                  )
    )
  }
})

editClass <- as.data.frame(table(editorFrame$editClass), 
                           stringsAsFactors = F)
editClass[editClass == "(0 - 0)"] <- "0"
editClass[editClass == "(1 - 1)"] <- "1"
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
  lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
```


```{r echo = T, eval = T, warning = 'hide', message = FALSE, fig.width = 8, fig.height = 6.5}
dewikiEdits <- read.csv(paste0(getwd(),"/_reporting/", "avg_edits_dewiki.csv"),
                      header = TRUE, 
                      check.names = FALSE,
                      row.names = 1,
                      stringsAsFactors = FALSE)
dewikiEdits$dateType <- as.character(dewikiEdits$revactor_timestamp)
dewikiEdits$dateType <- substr(dewikiEdits$dateType, 1, 8)
dewikiEdits$dateType <- sapply(dewikiEdits$dateType, function(x) {
  d <- paste0(
      substr(x, 1, 4), 
      "-",
      substr(x, 5, 6),
      "-",
      substr(x, 7, 8)
    )
  return(d)
})
dewikiEdits <- dplyr::arrange(dewikiEdits, dateType) %>% 
  dplyr::select(user_id, dateType)

campaignDays <- c("2021-10-18", "2021-10-19", "2021-10-20", "2021-10-21", "2021-10-22", "2021-10-23",
                  "2021-10-24", "2021-10-25", "2021-10-26", "2021-10-27", "2021-10-28", "2021-10-29",
                  "2021-10-30", "2021-10-31")

dewikiEdits <- dewikiEdits %>% 
  select(dateType) %>% 
  group_by(dateType) %>% 
  summarise(Edits = n())
dewikiEdits <- arrange(dewikiEdits, dateType)
dewikiEdits$campaign <- sapply(dewikiEdits$dateType, function(x) {
  if (x < "2021-10-18") {
    return("Before Campaign")
  } else if (x > "2021-10-31") {
    return("After Campaign") 
  } else {
    return("Campaign")
  }
})

campaignBefore <- mean(pFrame$Edits[pFrame$campaign == "beforeCampaign"])
campaign <- mean(pFrame$Edits[pFrame$campaign == "Campaign"])
campaignAfter <- mean(pFrame$Edits[pFrame$campaign == "afterCampaign"])

dewikiBefore <- mean(dewikiEdits$Edits[dewikiEdits$campaign == "Before Campaign"])
dewiki <- mean(dewikiEdits$Edits[dewikiEdits$campaign == "Campaign"])
dewikiAfter <- mean(dewikiEdits$Edits[dewikiEdits$campaign == "After Campaign"])
```

In comparison to the overall edits in `dewiki`, in the same time span as in the chart above:

- before the campaign, the campaign users made `r round(campaignBefore,2)` edits on the average daily, while in `dewiki` we find `r round(dewikiBefore, 2)` edits;
- during the campaign, the campaign users made `r round(campaign, 2)` edits on the average daily, while in `dewiki` we find `r round(dewiki, 2)` edits;
- and after the campaign, the campaign users made `r round(campaignAfter, 2)` edits on the average daily, while in `dewiki` we find `r round(dewikiAfter, 2)` edits;
