Feedback should be sent to goran.milovanovic_ext@wikimedia.de
.
The campaign was run from 2020/05/14 to 2020/05/28.
CURRENT UPDATE: Complete dataset as of 2020/05/28.
NOTE: This is a re-work of the existing report for the 2020 WMDE Occasional Editors Banner Campaign: all banner click data from event.wmdebannerinteractions
are replaced by the respective pageviews (see: Phab T249617).
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script 2020_OccasionalEditors_PRODUCTION.R
on stat1004.eqiad.wmnet, collecting the data as .tsv
and .csv
files, copying manually, and processing locally. A daily crontab job was run from 2020/05/14
to 2020/05/28
to collect the data for daily reporting. The data used in this report are aggregates of the daily datasets, sanitized and anonymized.
### --- WMDE 2020_OccasionalEditors_PRODUCTION.R
# - Campaign start: 2020/05/14
# - Campaign end: 2020/05/27
# - run from: stat1004
# - path:
### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)
### --- dir structure
# - every directory is derived from the directory the script is started in
campaignPath <- paste0(getwd(), "/")
analyticsDir <- paste0(campaignPath, "_analytics/")
dataDir <- paste0(campaignPath, "_data/")
### --- campaign specifics
campaignName <- 'OccasionalEditors2020'
### --- determine cetDay
# - current timestamp; printed before the time zone is switched
cetDay <- Sys.time()
cetDay
# - express the timestamp in Berlin time
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete its data acquisition)
# - keep only the date part of the timestamp, then step back one day
cetDay <- unlist(strsplit(as.character(cetDay), split = " ", fixed = TRUE))[1]
cetDay <- ymd(cetDay) - 1
### ----------------------------------------------------------
### --- Banner Impressions
### ----------------------------------------------------------
# - function: wmde_collect_banner_impressions
wmde_collect_banner_impressions <- function(uri_host,
                                            uri_path,
                                            uri_query,
                                            cetDay,
                                            queryFile,
                                            fileName,
                                            dataDir) {
  # - Collect the raw banner impression requests for one CET day:
  # - compose a HiveQL query on wmf.webrequest, write it to queryFile,
  # - and run it via beeline with the output redirected to fileName.
  # - Arguments:
  # -   uri_host, uri_path: character vectors, matched exactly; several
  # -     values are combined with OR
  # -   uri_query: character vector, matched with LIKE '%value%'; several
  # -     values are combined with OR
  # -   cetDay: day to collect, expected format: YYYY-MM-DD
  # -   queryFile: file that receives the HiveQL script (written after the
  # -     setwd() below, so a relative name lands in dataDir)
  # -   fileName: file that receives the beeline output
  # -   dataDir: directory that is set as the working directory
  # - Value: the status returned by system()

  # - to dataDir
  # - NOTE: the working directory is deliberately left changed, as before;
  # - later steps of the script may resolve relative paths against it
  setwd(dataDir)

  # - helper: wrap several alternatives into "(a OR b ...)";
  # - a single alternative is returned without parentheses
  or_group <- function(terms) {
    if (length(terms) > 1) {
      paste0("(", paste(terms, collapse = " OR "), ")")
    } else {
      terms
    }
  }

  # - WHERE condition: create datetime_condition
  # - the 24 hours of the CET day, each translated to its UTC partition
  cet_hours <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  # - format() with an explicit pattern is used instead of parsing
  # - as.character(): the latter can omit the time part for a midnight
  # - value (R >= 4.3), which would produce "hour = NA" in the query
  datetimeCondition <- paste0(
    "(",
    "year = ", as.integer(format(cet_hours, "%Y", tz = "UTC")), " AND ",
    "month = ", as.integer(format(cet_hours, "%m", tz = "UTC")), " AND ",
    "day = ", as.integer(format(cet_hours, "%d", tz = "UTC")), " AND ",
    "hour = ", as.integer(format(cet_hours, "%H", tz = "UTC")),
    ")",
    collapse = " OR "
  )

  # - WHERE conditions: uri_path, uri_host, uri_query
  uri_path_condition <- or_group(paste0("uri_path = '", uri_path, "'"))
  uri_host_condition <- or_group(paste0("uri_host = '", uri_host, "'"))
  uri_query_condition <- or_group(paste0("uri_query LIKE '%", uri_query, "%'"))

  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;\n",
    "SELECT uri_query FROM webrequest\n",
    "WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    uri_query_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )

  # - write hql
  write(hiveQuery, queryFile)

  # - execute hql script:
  hiveArgs <- 'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata /usr/local/bin/beeline --silent --incremental=true --verbose=false -f'
  hiveInput <- paste0(queryFile, ' > ', fileName)
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  system(command = hiveCommand, wait = TRUE)
}
# - parameters for wmde_collect_banner_impressions
# - (OccasionalEditors2020 campaign)
uri_path <- '/beacon/impression'
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_query <- c('WMDE_oceditors_spring_2020_')
# - raw output goes to the data directory, one file per day
fileName <- paste0(dataDir, "bannerImpressions_", cetDay, ".tsv")
queryFile <- 'OccasionalEditors2020_BannerImpressions.hql'
# - collect Banner Impression data
wmde_collect_banner_impressions(uri_host = uri_host,
                                uri_path = uri_path,
                                uri_query = uri_query,
                                cetDay = cetDay,
                                queryFile = queryFile,
                                fileName = fileName,
                                dataDir = dataDir)
# - function: wmde_process_banner_impressions
wmde_process_banner_impressions <- function(fileName,
                                            dataDir,
                                            cetDay,
                                            campaignName,
                                            uri_query) {
  # - Aggregate one day of raw banner impression data.
  # - Reads the file written by the collection step, splits uri_query into
  # - its "&"-separated fields, keeps result=show rows of the campaign
  # - banners, weights each row by 1/recordImpressionSampleRate and sums
  # - the weights per banner and device.
  # - Arguments:
  # -   fileName: file with the raw data (one uri_query column)
  # -   dataDir: directory that is set as the working directory
  # -   cetDay: day the data belong to; stored in the date column
  # -   campaignName: stored in the campaign column
  # -   uri_query: pattern(s) a banner name has to match; several
  # -     patterns are combined with OR
  # - Value: TRUE when the aggregate was written, FALSE when the raw data
  # -   could not be used
  # - NOTE: the result is written to analyticsDir, which is not an argument
  # - but is looked up in the calling environment

  # - to dataDir
  setwd(dataDir)

  # - libraries
  library(stringr)
  library(dplyr)

  # - load; a failed read is reported and signalled by FALSE
  bannerData <- tryCatch(
    as.data.frame(fread(fileName)),
    error = function(condition) {
      warning("Could not read ", fileName, ": ",
              conditionMessage(condition), call. = FALSE)
      FALSE
    }
  )
  if (is.logical(bannerData)) {
    return(FALSE)
  }
  # - without a uri_query column there is nothing to split
  if (!("uri_query" %in% colnames(bannerData))) {
    warning("No uri_query column found in ", fileName, call. = FALSE)
    return(FALSE)
  }

  # - clean
  bannerData <- dplyr::filter(bannerData, .data$uri_query != "")

  # - split: the fields are taken by position, so the order matters
  bannerData <- tidyr::separate(bannerData,
                                col = uri_query,
                                into = c('country',
                                         'region',
                                         'anonymous',
                                         'project',
                                         'db',
                                         'uselang',
                                         'device',
                                         'debug',
                                         'randomcampaign',
                                         'randombanner',
                                         'recordImpressionSampleRate',
                                         'impressionEventSampleRate',
                                         'campaignStatuses',
                                         'status',
                                         'statusCode',
                                         'campaign',
                                         'campaignCategory',
                                         'campaignCategoryUsesLegacy',
                                         'bucket',
                                         'banner',
                                         'bannerCategory',
                                         'result'),
                                sep = "&") %>%
    dplyr::select(banner, device, recordImpressionSampleRate, result)

  # - filter for uri_query (several patterns: any of them may match)
  bannerPattern <- paste(uri_query, collapse = "|")
  bannerData <- bannerData[grepl(bannerPattern, bannerData$banner), ]

  # - clean relevant fields: drop the "key=" prefix of each value
  bannerData$banner <- gsub("^banner=", "", bannerData$banner)
  bannerData$recordImpressionSampleRate <- as.numeric(
    gsub("^recordImpressionSampleRate=", "",
         bannerData$recordImpressionSampleRate)
  )
  bannerData$device <- gsub("^device=", "", bannerData$device)
  bannerData$result <- gsub("^result=", "", bannerData$result)

  # - filter for result=show
  bannerData <- dplyr::filter(bannerData, result == "show")

  # - correction for recordImpressionSampleRate:
  # - each recorded row counts as 1/rate impressions
  bannerData$recordImpressionSampleRate <-
    1 / bannerData$recordImpressionSampleRate

  # - aggregate:
  bannerData <- bannerData %>%
    dplyr::select(banner, device, recordImpressionSampleRate) %>%
    dplyr::group_by(banner, device) %>%
    dplyr::summarise(impressions = sum(recordImpressionSampleRate)) %>%
    dplyr::ungroup()

  # - add cetDay, campaignName
  bannerData$date <- cetDay
  bannerData$campaign <- campaignName

  # - store:
  write.csv(bannerData,
            paste0(analyticsDir, "bannerImpressions", cetDay, ".csv"))

  # - return
  TRUE
}
# - wrangle Banner Impression data
# - campaign name and banner tag are set again right before use
uri_query <- c('WMDE_oceditors_spring_2020_')
campaignName <- "OccasionalEditors2020"
# - arguments in order: fileName, dataDir, cetDay, campaignName, uri_query
bannerProcess <- wmde_process_banner_impressions(fileName,
                                                 dataDir,
                                                 cetDay,
                                                 campaignName,
                                                 uri_query)
### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------
# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  # - Collect the raw requests to the campaign pages for one CET day:
  # - compose a HiveQL query on wmf.webrequest, write it to queryFile,
  # - and run it via beeline with the output redirected to
  # - paste0(dataDir, fileName).
  # - Arguments:
  # -   uri_host, uri_path: character vectors, matched exactly; several
  # -     values are combined with OR
  # -   cetDay: day to collect, expected format: YYYY-MM-DD
  # -   queryFile: name of the HiveQL script inside dataDir
  # -   fileName: name of the output file inside dataDir
  # -   dataDir: data directory (with trailing slash); also set as the
  # -     working directory
  # - Value: the status returned by system() for the query, invisibly

  # - to dataDir
  # - NOTE: the working directory is deliberately left changed, as before;
  # - later steps of the script may resolve relative paths against it
  setwd(dataDir)

  # - helper: wrap several alternatives into "(a OR b ...)";
  # - a single alternative is returned without parentheses
  or_group <- function(terms) {
    if (length(terms) > 1) {
      paste0("(", paste(terms, collapse = " OR "), ")")
    } else {
      terms
    }
  }

  # - WHERE condition: create datetime_condition
  # - the 24 hours of the CET day, each translated to its UTC partition
  cet_hours <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  # - format() with an explicit pattern is used instead of parsing
  # - as.character(): the latter can omit the time part for a midnight
  # - value (R >= 4.3), which would produce "hour = NA" in the query
  datetimeCondition <- paste0(
    "(",
    "year = ", as.integer(format(cet_hours, "%Y", tz = "UTC")), " AND ",
    "month = ", as.integer(format(cet_hours, "%m", tz = "UTC")), " AND ",
    "day = ", as.integer(format(cet_hours, "%d", tz = "UTC")), " AND ",
    "hour = ", as.integer(format(cet_hours, "%H", tz = "UTC")),
    ")",
    collapse = " OR "
  )

  # - WHERE conditions: uri_host, uri_path
  uri_host_condition <- or_group(paste0("uri_host = '", uri_host, "'"))
  uri_path_condition <- or_group(paste0("uri_path = '", uri_path, "'"))

  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;\n",
    "SELECT uri_host, uri_path, uri_query, referer FROM webrequest\n",
    "WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )

  # - write hql (relative to the working directory, i.e. dataDir)
  write(hiveQuery, queryFile)

  # - execute hql script:
  kerberosPrefix <-
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'),
         wait = TRUE)
  # - Run query
  hiveCommand <- paste0(
    kerberosPrefix,
    '/usr/local/bin/beeline --incremental=true --silent -f "',
    dataDir, queryFile,
    '" > ', dataDir, fileName
  )
  invisible(system(command = hiveCommand, wait = TRUE))
}
# - parameters for wmde_collect_pageviews
# - (OccasionalEditors2020 campaign pages on dewiki, desktop and mobile)
uri_path <- c(
  '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
  '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur',
  '/wiki/Wikipedia:Mentorenprogramm')
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
# - disabled setting, apparently a leftover from another campaign:
# uri_query <- paste0('WMDE_2020_challenge_', 1:30)
# - file names are relative to dataDir here
fileName <- paste0("pageviews_", cetDay, ".tsv")
queryFile <- 'OccasionalEditors2020_Pageviews.hql'
# - collect Pageviews data
wmde_collect_pageviews(uri_host = uri_host,
                       uri_path = uri_path,
                       cetDay = cetDay,
                       queryFile = queryFile,
                       fileName = fileName,
                       dataDir = dataDir)
### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
                                   dataDir,
                                   uri_query_filter,
                                   cetDay = cetDay,
                                   campaignName = campaignName) {
  # - Aggregate one day of raw pageview data per campaign tag and page.
  # - Arguments:
  # -   fileName: raw data file, read relative to dataDir; the part between
  # -     "_" and "." is reused in the name of the output file
  # -   dataDir: directory that is set as the working directory
  # -   uri_query_filter: pattern identifying the campaign tag; searched in
  # -     uri_query and in referer
  # -   cetDay: day the data belong to; stored in the date column
  # -   campaignName: stored in the campaign column
  # - Value: NULL, invisibly; the aggregate is written to
  # -   pageviewsAggregated_<day>.csv in analyticsDir when matching
  # -   pageviews are found
  # - NOTE: the defaults of cetDay and campaignName refer to themselves,
  # - so both arguments have to be supplied by the caller
  # - NOTE: analyticsDir is not an argument but is looked up in the
  # - calling environment

  # - to dataDir
  setwd(dataDir)

  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)

  # - load
  pageviewsData <- readLines(fileName)
  # - the data rows start two lines below the header line and
  # - the last two lines of the file are not data
  wStart <- which(grepl("uri_host", pageviewsData))
  if (length(wStart) == 0) {
    warning("No header line found in ", fileName, call. = FALSE)
    return(invisible(NULL))
  }
  firstRow <- wStart[1] + 2
  lastRow <- length(pageviewsData) - 2
  if (lastRow < firstRow) {
    # - header only: no pageviews were collected
    return(invisible(NULL))
  }
  pageviewsData <- pageviewsData[firstRow:lastRow]
  pageviewsData <- data.frame(dat = pageviewsData,
                              stringsAsFactors = FALSE)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
                            sep = "\t")

  # - apply uri_query_filter
  # - NOTE: looking in both: uri_query, referer
  # - NOTE: hack for the 2020_OccasionalEditors Campaign
  # - include pageviews for
  # - '/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur',
  # - '/wiki/Wikipedia:Mentorenprogramm'
  # - on dewiki where grepl(w_uri_query, referer)
  # - i.e. where only the referer carries the tag, the query string of
  # - the referer replaces uri_query
  w_uri_query_referer <- which(
    grepl(uri_query_filter, pageviewsData$referer) &
      !grepl(uri_query_filter, pageviewsData$uri_query)
  )
  if (length(w_uri_query_referer) > 0) {
    refererTags <- strsplit(pageviewsData$referer[w_uri_query_referer],
                            split = "?",
                            fixed = TRUE)
    refererTags <- vapply(refererTags, function(x) x[2], character(1))
    pageviewsData$uri_query[w_uri_query_referer] <- paste0("?", refererTags)
  }

  # - filter for the campaign tag
  w_uri_query <- which(grepl(uri_query_filter, pageviewsData$uri_query))
  if (length(w_uri_query) == 0) {
    return(invisible(NULL))
  }
  pageviewsData <- pageviewsData[w_uri_query, ]

  # - aggregate: page = host + path, counted per tag
  pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
  pageviewsData$uri_host <- NULL
  pageviewsData$referer <- NULL
  pageviewsData <- pageviewsData %>%
    dplyr::select(uri_query, uri_path) %>%
    dplyr::group_by(uri_query, uri_path) %>%
    dplyr::summarise(pageviews = n()) %>%
    dplyr::ungroup()
  colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')

  # - add cetDay, campaignName
  pageviewsData$date <- cetDay
  pageviewsData$campaign <- campaignName

  # - store: the day is taken from fileName ("<prefix>_<day>.<ext>")
  fileDay <- strsplit(fileName, split = "_", fixed = TRUE)[[1]][2]
  fileDay <- strsplit(fileDay, split = ".", fixed = TRUE)[[1]][1]
  write.csv(pageviewsData,
            paste0(analyticsDir, "pageviewsAggregated_", fileDay, ".csv"))
  invisible(NULL)
}
# - parameters for wmde_process_pageviews
# - (OccasionalEditors2020 campaign tag)
uri_query_filter <- 'WMDE_oceditors_spring_2020_'
# - wrangle pageviews
# - arguments in order: fileName, dataDir, uri_query_filter, cetDay,
# - campaignName
wmde_process_pageviews(fileName,
                       dataDir,
                       uri_query_filter,
                       cetDay,
                       campaignName)
### ----------------------------------------------------------
### --- Banner Actions
### --- via event.WMDEBannerActions
### ----------------------------------------------------------
### ___ NOTE:
# - Suffix explanation:
# - ctrl is the banner that dynamically displays text depending on the target group
# - var is the banner that shows the same text for both target groups
# - ipad/mobile does not represent the actual device type, but the display mode of the banner (small desktop screens may be reported as ipad)
# - cs/nt indicates, which target group the user belongs to
# - Kai Nissen in https://phabricator.wikimedia.org/T251535#6132468
# - select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID
# - from event.wmdebannerinteractions where year=2020 and month=5 and (day=11 or day=12 or day=13);
# - function: wmde_banner_actions
wmde_banner_actions <- function(uri_query_filter,
                                cetDay,
                                queryFile,
                                fileName,
                                analyticsDir,
                                campaignName) {
  # - Collect and aggregate one CET day of banner interaction events
  # - from event.wmdebannerinteractions.
  # - Arguments:
  # -   uri_query_filter: character vector; banner names are matched with
  # -     LIKE '%value%', several values are combined with OR
  # -   cetDay: day to collect, expected format: YYYY-MM-DD
  # -   queryFile: name of the HiveQL script inside dataDir
  # -   fileName: name of the raw output file inside dataDir; the part
  # -     between "_" and "." is reused in the names of the result files
  # -   analyticsDir: directory that receives the result files
  # -   campaignName: stored in the campaign column
  # - Value: FALSE when the raw data cannot be read, NULL otherwise
  # - Writes whoClicked_<day>.csv and bannerInteractionsAggregated_<day>.csv
  # - NOTE: dataDir is not an argument but is looked up in the calling
  # - environment

  # - WHERE condition: create datetime_condition
  # - the 24 hours of the CET day, each translated to its UTC partition
  cet_hours <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  # - format() with an explicit pattern is used instead of parsing
  # - as.character(): the latter can omit the time part for a midnight
  # - value (R >= 4.3), which would produce "hour = NA" in the query
  datetimeCondition <- paste0(
    "(",
    "year = ", as.integer(format(cet_hours, "%Y", tz = "UTC")), " AND ",
    "month = ", as.integer(format(cet_hours, "%m", tz = "UTC")), " AND ",
    "day = ", as.integer(format(cet_hours, "%d", tz = "UTC")), " AND ",
    "hour = ", as.integer(format(cet_hours, "%H", tz = "UTC")),
    ")",
    collapse = " OR "
  )

  # - WHERE condition: create eventBannerName_condition
  # - (decided on uri_query_filter itself, not on a global uri_query)
  eventBannerName_condition <-
    paste0("event.bannerName LIKE '%", uri_query_filter, "%'")
  if (length(uri_query_filter) > 1) {
    eventBannerName_condition <- paste0(
      "(",
      paste(eventBannerName_condition, collapse = " OR "),
      ")"
    )
  }

  # - compose HiveQL query
  hiveQuery <- paste0(
    "select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID from event.wmdebannerinteractions\n",
    "WHERE (",
    eventBannerName_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )

  # - write hql to the location beeline reads it from below
  queryPath <- paste0(dataDir, queryFile)
  write(hiveQuery, queryPath)

  # - execute hql script:
  kerberosPrefix <-
    'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
  # - Kerberos init
  system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'),
         wait = TRUE)
  # - Run query
  rawPath <- paste0(dataDir, fileName)
  query <- system(command = paste0(
    kerberosPrefix,
    '/usr/local/bin/beeline --incremental=true --silent -f "',
    queryPath,
    '" > ', rawPath),
    wait = TRUE)

  # - Wrangle Banner Interactions
  # - load; a failed read is reported and signalled by FALSE
  bannerData <- tryCatch(
    as.data.frame(fread(rawPath)),
    error = function(condition) {
      warning("Could not read ", rawPath, ": ",
              conditionMessage(condition), call. = FALSE)
      FALSE
    }
  )
  if (is.logical(bannerData)) {
    return(FALSE)
  }

  # - the day is taken from fileName ("<prefix>_<day>.<ext>")
  fileDay <- strsplit(fileName, split = "_", fixed = TRUE)[[1]][2]
  fileDay <- strsplit(fileDay, split = ".", fixed = TRUE)[[1]][1]

  # - bannerSeen: number of distinct users per banner
  bannerSeen <- bannerData %>%
    dplyr::select(bannername, userid)
  bannerSeen <- bannerSeen[!duplicated(bannerSeen), ]
  bannerSeen <- bannerSeen %>%
    dplyr::select(bannername) %>%
    dplyr::group_by(bannername) %>%
    dplyr::summarise(seen_by = n())

  # - bannerClosed: close events and mean impressions at close
  bannerClosed <- bannerData %>%
    dplyr::filter(banneraction == "banner-closed") %>%
    dplyr::select(bannername, bannerimpressions) %>%
    dplyr::group_by(bannername) %>%
    dplyr::summarise(closed_by = n(),
                     mean_close_imp = round(mean(bannerimpressions), 2))

  # - bannerClicked: click events and mean impressions at click
  bannerClicked <- bannerData %>%
    dplyr::filter(banneraction == "banner-clicked") %>%
    dplyr::select(bannername, bannerimpressions) %>%
    dplyr::group_by(bannername) %>%
    dplyr::summarise(clicked_by = n(),
                     mean_click_imp = round(mean(bannerimpressions), 2))

  # - whoClicked: distinct users with at least one click
  whoClicked <- bannerData %>%
    dplyr::filter(banneraction == "banner-clicked")
  whoClicked <- data.frame(userid = unique(whoClicked$userid))
  # - store:
  write.csv(whoClicked,
            paste0(analyticsDir, "whoClicked_", fileDay, ".csv"))

  # - join
  bannerData <- bannerSeen %>%
    dplyr::left_join(bannerClosed, by = 'bannername') %>%
    dplyr::left_join(bannerClicked, by = 'bannername')
  bannerData$close_rate <- round(bannerData$closed_by / bannerData$seen_by, 2)
  bannerData$click_rate <- round(bannerData$clicked_by / bannerData$seen_by, 2)

  # - date, campaign
  bannerData$day <- cetDay
  bannerData$campaign <- campaignName

  # - store:
  write.csv(bannerData,
            paste0(analyticsDir,
                   "bannerInteractionsAggregated_", fileDay, ".csv"))
}
# - parameters for wmde_banner_actions()
uri_query_filter <- 'WMDE_oceditors_spring_2020_'
fileName <- paste0("bannerInteractions_", cetDay, ".tsv")
queryFile <- paste0(campaignName, "_bannerInteractions.hql")
# - arguments in order: uri_query_filter, cetDay, queryFile, fileName,
# - analyticsDir, campaignName
banner_status <- wmde_banner_actions(uri_query_filter,
                                     cetDay,
                                     queryFile,
                                     fileName,
                                     analyticsDir,
                                     campaignName)
This section presents all data and statistics on the campaign pages.
The following chunk loads and then re-structures the dataset a bit. NOTE: the 3all campaign tag was observed only once, on 2020/05/14, and is removed from the analysis.
# - load the daily pageview aggregates and bind them into one dataset
lF <- list.files('_analytics')
lF <- lF[grepl("^pageviewsAggregated", lF)]
if (length(lF) == 0) {
  stop("No pageviewsAggregated files found in _analytics", call. = FALSE)
}
dataSet <- lapply(paste0("_analytics/", lF), fread)
dataSet <- rbindlist(dataSet)
# - drop the row-name column written by write.csv and the campaign name
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - expand grid to account for missing observations per day:
# - every Tag x Page x date combination gets a row, absent ones count 0
dS <- expand.grid(unique(dataSet$Tag),
                  unique(dataSet$Page),
                  unique(dataSet$date),
                  stringsAsFactors = FALSE)
colnames(dS) <- c('Tag', 'Page', 'date')
dS <- dS %>%
  left_join(dataSet,
            by = c("Tag", "Page", "date"))
dataSet <- dS; rm(dS)
dataSet$Pageviews[is.na(dataSet$Pageviews)] <- 0
# - banner codes, read from the campaign tag
# - var/ctrl: "var" if the tag says so, "ctrl" otherwise
dataSet$var_ctrl <- ifelse(grepl("var", dataSet$Tag), "var", "ctrl")
# - display mode: "mobile" is checked first, then "ipad", else "desktop"
dataSet$device <- ifelse(grepl("mobile", dataSet$Tag), "mobile",
                         ifelse(grepl("ipad", dataSet$Tag), "ipad", "desktop"))
# - target group: "cs" if the tag says so, "nt" otherwise
dataSet$cs_nt <- ifelse(grepl("cs", dataSet$Tag), "cs", "nt")
# - shorten the tag and the page name for display
dataSet$Tag <- gsub("\\?campaign=WMDE_oceditors_spring_2020_", "", dataSet$Tag)
dataSet$Page <- gsub("de\\.wikipedia\\.org/wiki/Wikipedia:|de\\.m\\.wikipedia\\.org/wiki/Wikipedia:", "", dataSet$Page)
# - remove 3all tag:
dataSet <- dplyr::filter(dataSet, Tag != "3all")
Chart 2.1.1 Daily Pageviews, aggregated across the campaign banners.
# - daily pageviews per campaign page, summed over all banners
pFrame <- dataSet %>%
  select(date, Page, Pageviews) %>%
  group_by(date, Page) %>%
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
# - one line per page; the smaller white point is drawn on top of the
# - coloured one
# - the y scale is set once only: a second scale_y_continuous() would
# - just replace the first one
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews)) +
  geom_path(size = .5) +
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "right")
Table 2.1.1 Pageviews totals
# - campaign totals per page, computed from the daily chart data
tFrame <- select(pFrame, Page, Pageviews)
tFrame <- group_by(tFrame, Page)
tFrame <- summarise(tFrame, totalPageviews = sum(Pageviews))
datatable(tFrame)
Reminder. From the Campaign Tracking Concept:
ipad/mobile does not represent the actual device type, but the display mode of the banner (small desktop screens may be reported as ipad)
Chart 2.2.1 Pageviews, by devices, aggregated across the campaign banners.
# - daily pageviews per campaign page and banner display mode
pFrame <- dataSet %>%
  select(date, device, Page, Pageviews) %>%
  group_by(date, device, Page) %>%
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
# - one panel per display mode, each with its own axes
# - the y axis shows Pageviews and is labelled accordingly
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews)) +
  geom_path(size = .5) +
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  facet_wrap(~device, nrow = 3, scales = "free") +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "right")
Table 2.2.1 Total pageviews, by devices, aggregated across the campaign banners.
# - campaign totals per display mode and page
tFrame <- select(pFrame, device, Page, Pageviews)
tFrame <- group_by(tFrame, device, Page)
tFrame <- summarise(tFrame, totalPageviews = sum(Pageviews))
datatable(tFrame)
Reminder. From the Campaign Tracking Concept:
ctrl is the banner that dynamically displays text depending on the target group
var is the banner that shows the same text for both target groups
Chart 2.3.1 Pageviews, by var/ctrl, aggregated across the campaign banners.
# - daily pageviews per campaign page and banner version (var/ctrl)
pFrame <- dataSet %>%
  select(date, var_ctrl, Page, Pageviews) %>%
  group_by(date, var_ctrl, Page) %>%
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
# - one panel per banner version, each with its own axes
# - the y axis shows Pageviews and is labelled accordingly
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews)) +
  geom_path(size = .5) +
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  facet_wrap(~var_ctrl, nrow = 3, scales = "free") +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "right")
Table 2.3.1 Total pageviews, by var/ctrl, aggregated across the campaign banners.
# - campaign totals per banner version (var/ctrl) and page
tFrame <- select(pFrame, var_ctrl, Page, Pageviews)
tFrame <- group_by(tFrame, var_ctrl, Page)
tFrame <- summarise(tFrame, totalPageviews = sum(Pageviews))
datatable(tFrame)
Reminder. From the Campaign Tracking Concept:
cs/nt indicates, which target group the user belongs to (cs = community support, nt = new tasks)
Chart 2.4.1 Pageviews, by nt/cs, aggregated across the campaign banners.
# - daily pageviews per campaign page and target group (cs/nt)
pFrame <- dataSet %>%
  select(date, cs_nt, Page, Pageviews) %>%
  group_by(date, cs_nt, Page) %>%
  summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
# - one panel per target group, each with its own axes
# - the y axis shows Pageviews and is labelled accordingly
ggplot(pFrame, aes(x = date,
                   y = Pageviews,
                   group = Page,
                   color = Page,
                   fill = Page,
                   label = Pageviews)) +
  geom_path(size = .5) +
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  facet_wrap(~cs_nt, nrow = 3, scales = "free") +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Pageviews") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "right")
Table 2.4.1 Total pageviews, by nt/cs, aggregated across the campaign banners.
# - campaign totals per target group (cs/nt) and page
tFrame <- select(pFrame, cs_nt, Page, Pageviews)
tFrame <- group_by(tFrame, cs_nt, Page)
tFrame <- summarise(tFrame, totalPageviews = sum(Pageviews))
datatable(tFrame)
Chart 2.5.1 Pageviews, by nt/cs vs. ctrl/var.
# - total pageviews per banner version (var/ctrl) and target group (cs/nt)
pFrame <- select(dataSet, var_ctrl, cs_nt, Pageviews)
pFrame <- group_by(pFrame, var_ctrl, cs_nt)
pFrame <- summarise(pFrame, pageviews = sum(Pageviews))
# - banner version on the x axis, one line per target group
ggplot(pFrame,
       aes(x = var_ctrl, y = pageviews,
           group = cs_nt, color = cs_nt, fill = cs_nt,
           label = pageviews)) +
  geom_path(size = .5) +
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  labs(title = '2020 WMDE Occasional Editors Banner Campaign',
       x = "Banner",
       y = "Pageviews") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        legend.position = "right")
Chart 2.5.2 Pageviews, by nt/cs vs. ctrl/var, across the campaign pages.
# - total pageviews per banner version, target group, and campaign page
pFrame <- select(dataSet, var_ctrl, cs_nt, Page, Pageviews)
pFrame <- group_by(pFrame, var_ctrl, cs_nt, Page)
pFrame <- summarise(pFrame, pageviews = sum(Pageviews))
# - banner version on the x axis, one line per target group,
# - one panel per campaign page
ggplot(pFrame,
       aes(x = var_ctrl, y = pageviews,
           group = cs_nt, color = cs_nt, fill = cs_nt,
           label = pageviews)) +
  geom_path(size = .5) +
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  facet_wrap(~Page, nrow = 3, scales = "free") +
  labs(title = '2020 WMDE Occasional Editors Banner Campaign',
       x = "Banner",
       y = "Pageviews") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 8),
        plot.title = element_text(size = 10),
        legend.title = element_blank(),
        legend.position = "right")
Table 2.6.1 Pageviews dataset
### --- Full Dataset (Table Report)
# - the complete pageviews dataset, 30 rows per page
datatable(dataSet, options = list(pageLength = 30))
This section presents all data and statistics on the user edits.
Chart 3.2.1 daily user edits
# - NOTE(review): campaignDays ends on 2020-05-27, while the
# - classification below counts 2020-05-28 as a campaign day - confirm
campaignDays <- c("2020-05-14", "2020-05-15", "2020-05-16", "2020-05-17", "2020-05-18", "2020-05-19",
                  "2020-05-20", "2020-05-21", "2020-05-22", "2020-05-23", "2020-05-24", "2020-05-25",
                  "2020-05-26", "2020-05-27")
# - number of user edits per day
pFrame <- userEdits %>%
  select(dateType) %>%
  group_by(dateType) %>%
  summarise(Edits = n())
pFrame <- arrange(pFrame, dateType)
# - campaign phase of each day, decided for the whole column at once
pFrame$campaign <- ifelse(
  pFrame$dateType < "2020-05-14", "Before Campaign",
  ifelse(pFrame$dateType > "2020-05-28", "After Campaign", "Campaign")
)
pFrame$date <- as.character(pFrame$dateType)
# - one line per campaign phase
ggplot(pFrame, aes(x = date,
                   y = Edits,
                   group = campaign,
                   color = campaign,
                   fill = campaign,
                   label = Edits)) +
  geom_path(size = .5) +
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "top")
Table 3.2.1 Mean user edits per day: before campaign, during the campaign, and after the campaign
# - total and mean daily edits per campaign phase
tFrame <- select(pFrame, campaign, Edits)
tFrame <- group_by(tFrame, campaign)
tFrame <- summarise(tFrame,
                    totalEdits = sum(Edits),
                    meanEdits = round(mean(Edits), 2))
datatable(tFrame)
Chart 3.3.1 Mean user edits per day, per cs/nt
: before campaign, during the campaign, and after the campaign
# - number of user edits per day and target group (cs/nt)
pFrame <- userEdits %>%
  select(date, cs_nt) %>%
  group_by(date, cs_nt) %>%
  summarise(Edits = n())
pFrame$dateType <- as.Date(pFrame$date)
# - campaign phase of each day, decided for the whole column at once
pFrame$campaign <- ifelse(
  pFrame$dateType < "2020-05-14", "Before Campaign",
  ifelse(pFrame$dateType > "2020-05-28", "After Campaign", "Campaign")
)
# - one panel per target group, one line per campaign phase
ggplot(pFrame, aes(x = date,
                   y = Edits,
                   group = campaign,
                   color = campaign,
                   fill = campaign,
                   label = Edits)) +
  geom_path(size = .5) +
  geom_point(size = 1.5) +
  geom_point(size = 1, fill = "white", color = "white") +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  facet_wrap(~cs_nt, ncol = 1, scales = "free") +
  scale_y_continuous(labels = comma) +
  ggtitle('2020 WMDE Occasional Editors Banner Campaign') +
  xlab("Date") + ylab("Edits") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, size = 6)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "top")
Table 3.3.1 Mean user edits per day, per cs/nt
: before campaign, during the campaign, and after the campaign
# - total and mean daily edits per campaign phase and target group
tFrame <- select(pFrame, campaign, cs_nt, Edits)
tFrame <- group_by(tFrame, campaign, cs_nt)
tFrame <- summarise(tFrame,
                    totalEdits = sum(Edits),
                    meanEdits = round(mean(Edits), 2))
datatable(tFrame)
### --- Edit classes: number of users per class of edit counts
# - The five tables below share the same classification and table
# - layout, so both live in helper functions.

# - edit class boundaries: c(lower, upper), both inclusive
editBoundaries <- list(
  c(0, 1),
  c(2, 4),
  c(5, 9),
  c(10, 20),
  c(21, 50),
  c(51, 100)
)

# - helper: label each edit count with its class, "(lower - upper)";
# - counts that fall into no class are labelled "> 100"
wmde_edit_class_label <- function(edits, boundaries = editBoundaries) {
  lower <- vapply(boundaries, function(b) b[1], numeric(1))
  upper <- vapply(boundaries, function(b) b[2], numeric(1))
  vapply(edits, function(nEdits) {
    hit <- which(nEdits >= lower & nEdits <= upper)
    if (length(hit) == 0) {
      "> 100"
    } else {
      paste0("(", lower[hit], " - ", upper[hit], ")")
    }
  }, character(1), USE.NAMES = FALSE)
}

# - helper: turn a frequency table into a data.frame with the given
# - column names, sorted by the first number found in the edit class
wmde_edit_class_table <- function(classCounts, columnNames) {
  editClass <- as.data.frame(classCounts, stringsAsFactors = FALSE)
  colnames(editClass) <- columnNames
  editClass$order <- as.numeric(
    str_extract(editClass$`Edit Class`, '[[:digit:]]+')
  )
  editClass <- arrange(editClass, order)
  editClass$order <- NULL
  editClass
}

# - helper: campaign phase of each date, decided for the whole vector
wmde_campaign_phase <- function(dates) {
  ifelse(dates < "2020-05-14", "Before Campaign",
         ifelse(dates > "2020-05-28", "After Campaign", "Campaign"))
}

### --- all users
userClass <- userEdits %>%
  select(actor_user) %>%
  group_by(actor_user) %>%
  summarise(edits = n())
userClass$editClass <- wmde_edit_class_label(userClass$edits)
editClass <- wmde_edit_class_table(table(userClass$editClass),
                                   c('Edit Class', 'Num.Users'))
datatable(editClass)

### --- target group: cs
userClass <- userEdits %>%
  filter(cs_nt == "cs") %>%
  select(actor_user) %>%
  group_by(actor_user) %>%
  summarise(edits = n())
userClass$editClass <- wmde_edit_class_label(userClass$edits)
editClass <- wmde_edit_class_table(table(userClass$editClass),
                                   c('Edit Class', 'Num.Users'))
datatable(editClass)

### --- target group: nt
userClass <- userEdits %>%
  filter(cs_nt == "nt") %>%
  select(actor_user) %>%
  group_by(actor_user) %>%
  summarise(edits = n())
userClass$editClass <- wmde_edit_class_label(userClass$edits)
editClass <- wmde_edit_class_table(table(userClass$editClass),
                                   c('Edit Class', 'Num.Users'))
datatable(editClass)

### --- per campaign phase
userEdits$campaign <- wmde_campaign_phase(userEdits$dateType)
userClass <- userEdits %>%
  select(campaign, actor_user) %>%
  group_by(campaign, actor_user) %>%
  summarise(edits = n())
userClass$editClass <- wmde_edit_class_label(userClass$edits)
# - every user gets a row in every phase; phases without edits count 0
# - (no factors here, so the join keys keep the type they have in
# - userClass)
userClassFull <- expand.grid(actor_user = unique(userClass$actor_user),
                             campaign = unique(userClass$campaign),
                             stringsAsFactors = FALSE)
userClass <- left_join(userClassFull,
                       userClass,
                       by = c('actor_user', 'campaign'))
userClass$edits[is.na(userClass$edits)] <- 0
# - rows that exist only because of the expansion (no edits in the phase)
noEdits <- is.na(userClass$editClass)

# - table 1: users without edits are counted in the '(0 - 1)' class
userClass$editClass[noEdits] <- '(0 - 1)'
editClass <- wmde_edit_class_table(
  table(userClass$editClass, userClass$campaign),
  c('Edit Class', 'Campaign', 'Num.Users')
)
datatable(editClass)

# - table 2: users without edits ('0') are separated from users with
# - exactly one edit ('1')
userClass$editClass[userClass$editClass == '(0 - 1)'] <- '1'
userClass$editClass[noEdits] <- '0'
editClass <- wmde_edit_class_table(
  table(userClass$editClass, userClass$campaign),
  c('Edit Class', 'Campaign', 'Num.Users')
)
datatable(editClass)