Feedback should be send to goran.milovanovic_ext@wikimedia.de
.
The campaign was run from 2020/10/01 to 2020/10/11.
CURRENT UPDATE: Dataset as of 2020/10/20.
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script 2020_WikipediaChallenge_October_2020_Campaign_PRODUCTION.R
on stat1007.eqiad.wmnet, collecting the data as .tsv
and .csv
files, copying manually, and processing locally. A daily crontab job was run from 2020/10/01
to 2020/10/11
to collect the data for daily reporting. User registrations are then collected until November 11th 2020, and user edits are collected until the end of the year. The data used in this report are aggregates of the daily datasets, sanitized and anonymized.
DATA: (1) The event.wmdebannerinteractions
schema is used for banner clicks/closing (see: Phab); (2) the wmf.webrequest
table is a source of banner impressions data; (3) pageviews, user registrations, and user edits are collected by following the standard procedure.
### --- WMDE_Wikipedia_Challenge_October_2020_Campaign.PRODUCTION.R
### --- https://phabricator.wikimedia.org/T262534
### ----------------------------------------------------------------
### --- Campaign Description and Parameters
### ----------------------------------------------------------------
### --- Tracking Part 1: banner campaign
# - Please provide daily reports during the banner campaign (October 1st - 11th).
### --- Tracking Part 2: emailing campaign Wikipedia challenge
# - Please provide a tracking report für the Wikipedia challenge email campaign
# - as already implemented see https://phabricator.wikimedia.org/T251524 from October to December.
### ---Timeline for the whole project
# - Start of the banner campaign: 1. October 2020
# - End of the banner campaign: 11. October 2020
# - Tracking test: End of September
# - Last User signing up: 11. October
# - Preliminary report for tracking part 1: sometime until end of October
# - Last User finishing mailing campaign: November 11th
# - Track editing behavior six weeks after end of campaign: December 23
# - final report for tracking part 1 and 2: beginning of January
### --- Banner Campaign information
# - There will be one campaign with 6 different banners (including mobile versions)
# - which target readers in wikipedia. Banner campaign goal are registrations
# - for the emailing campaign.
# - Find all details in this document:
# - https://docs.google.com/document/d/1kTlLdsx9_rD-k8Xcp2yFvb9ftHDxkMn6-MJ1-5ihtS8/edit
### --- Campaign Tags and Landingpages
# - Landing Page 1: https://www.wikimedia.de/wikipedia-challenge
# - Banner 1
# - ?campaign=WMDE_challengebnr_fall2020_ctrl
# - https://www.wikimedia.de/wikipedia-challenge?campaign=WMDE_challengebnr_fall2020_ctrl
# - ?campaign=WMDE_challengebnr_fall2020_ctrl_ipad
# - https://www.wikimedia.de/wikipedia-challenge?campaign=WMDE_challengebnr_fall2020_ctrl_ipad
# - ?campaign=WMDE_challengebnr_fall2020_ctrl_mobile
# - https://www.wikimedia.de/wikipedia-challenge?campaign=WMDE_challengebnr_fall2020_ctrl_mobile
# - Banner 2
# - ?campaign=WMDE_challengebnr_fall2020_var
# - https://www.wikimedia.de/wikipedia-challenge?campaign=WMDE_challengebnr_fall2020_var
# - ?campaign=WMDE_challengebnr_fall2020_var_ipad
# - https://www.wikimedia.de/wikipedia-challenge?campaign=WMDE_challengebnr_fall2020_var_ipad
# - ?campaign=WMDE_challengebnr_fall2020_var_mobile
# - https://www.wikimedia.de/wikipedia-challenge?campaign=WMDE_challengebnr_fall2020_var_mobile
# - target groups
# - Banners will be targeted to non-logged in users only.
# - The a/b test scope is comparing two different claims in the banner text
# - Tracking is dependent on eventlogging.
# - The required event scheme will be used for closing clicks only.
# - https://phabricator.wikimedia.org/T250791
### --- daily reporting during campaign
# - The following information should be included in the daily reporting:
# - impressions per banner
# - clicks per banner/ page views per landing page
# - closing rate of banners
### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)
### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/Campaigns/2020_Wikipedia_Challenge_October_2020_Campaign/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
### --- campaign specifics
campaignName <- 'Wikipedia_Challenge_October_2020'
### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
strsplit(as.character(cetDay),
split = " ",
fixed = T)[[1]][1]
) - 1
### ----------------------------------------------------------
### --- Banner Impressions
### ----------------------------------------------------------
# - function: wmde_collect_banner_impressions
wmde_collect_banner_impressions <- function(uri_host,
uri_path,
uri_query,
cetDay,
queryFile,
fileName,
dataDir) {
# - NOTE:
# - expected format for cetDay is: YYYY-MM-DD
# - to dataDir
setwd(dataDir)
# - libraries
library(stringr)
# - WHERE condition: create datetime_condition
cet_condition <- seq(
from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
by = "hour"
)
attr(cet_condition, "tzone") <- "UTC"
cet_condition <- as.character(cet_condition)
cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
cet_years <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][1]
})
cet_months <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][2]
})
cet_months <- gsub("^0", "", cet_months)
cet_days <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][3]
})
cet_days <- gsub("^0", "", cet_days)
cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T),
function(x) {
x[2]
})
cet_hours <- gsub("^0", "", cet_hours)
datetimeCondition <- paste0(
"year = ", cet_years, " AND ",
"month = ", cet_months, " AND ",
"day = ", cet_days, " AND ",
"hour = ", cet_hours
)
datetimeCondition <- paste("(",
datetimeCondition,
")",
collapse = " OR ",
sep = "")
# - WHERE condition: create uri_path_condition
if (length(uri_path) > 1) {
uri_path_condition <- paste0("(",
paste(
paste0("uri_path = '", uri_path, "'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_path_condition = paste0("uri_path = '", uri_path, "'")
}
# - WHERE condition: create uri_host_condition
if (length(uri_host) > 1) {
uri_host_condition <- paste0("(",
paste(
paste0("uri_host = '", uri_host, "'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_host_condition = paste0("uri_host = '", uri_host, "'")
}
# - WHERE condition: create uri_query_condition
if (length(uri_query) > 1) {
uri_query_condition <- paste0("(",
paste(
paste0("uri_query LIKE '%", uri_query, "%'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_query_condition = paste0("uri_query LIKE '%", uri_query, "%'")
}
# - compose HiveQL query
hiveQuery <- paste0(
"USE wmf;
SELECT uri_query FROM webrequest
WHERE (",
uri_host_condition, " AND ",
uri_path_condition, " AND ",
uri_query_condition, " AND ",
"(", datetimeCondition, ")",
");"
)
# - write hql
write(hiveQuery, queryFile)
# - execute hql script + Kerberos Auth
hiveArgs <- 'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata /usr/local/bin/beeline -f'
hiveInput <- paste0(queryFile, ' > ', fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
return(
system(command = hiveCommand, wait = TRUE)
)
}
# - set params to wmde_collect_banner_impressions
# - for the Autumn Banner Campaign 2019
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- '/beacon/impression'
uri_query <- c('WMDE_challengebnr_fall2020_ctrl', 'WMDE_challengebnr_fall2020_ctrl_ipad',
'WMDE_challengebnr_fall2020_ctrl_mobile','WMDE_challengebnr_fall2020_var',
'WMDE_challengebnr_fall2020_var_ipad','WMDE_challengebnr_fall2020_var_mobile')
queryFile <- 'wikipedia_challenge_Oct2020_BannerImpressions.hql'
fileName <- paste0(dataDir, "bannerImpressions_", cetDay, ".tsv")
# - collect Banner Impression data
wmde_collect_banner_impressions(uri_host,
uri_path,
uri_query,
cetDay,
queryFile,
fileName,
dataDir)
# - function: wmde_process_banner_impressions
wmde_process_banner_impressions <- function(fileName,
dataDir,
cetDay,
campaignName) {
# - to dataDir
setwd(dataDir)
# - libraries
library(stringr)
library(dplyr)
# - load
bannerData <- read.delim(fileName,
stringsAsFactors = F)
colnames(bannerData) <- 'uri_query'
# - clean
wStart <- which(bannerData$uri_query == "uri_query")
bannerData <- bannerData[(wStart + 1):(dim(bannerData)[1] - 2), ]
# - split
bannerData <- strsplit(bannerData, split = "&", fixed = T)
# - extract relevant fields
# - banner:
banner <- sapply(bannerData, function(x) {
x[which(grepl("^banner=", x))]
})
banner <- gsub("^banner=", "", banner)
# - recordImpressionSampleRate:
recordImpressionSampleRate <- sapply(bannerData, function(x) {
x[which(grepl("^recordImpressionSampleRate=", x))]
})
recordImpressionSampleRate <- as.numeric(
gsub("^recordImpressionSampleRate=", "", recordImpressionSampleRate)
)
# - result:
result <- sapply(bannerData, function(x) {
x[which(grepl("^result=", x))]
})
result <- gsub("^result=", "", result)
# - compose table:
bannerObservations <- data.frame(banner = banner,
recordImpressionSampleRate = recordImpressionSampleRate,
result = result,
stringsAsFactors = F)
# - filter for result=show
bannerObservations <- dplyr::filter(bannerObservations,
result == "show")
# - correction for recordImpressionSampleRate
bannerObservations$recordImpressionSampleRate <-
1/bannerObservations$recordImpressionSampleRate
# - aggregate:
bannerObservations <- bannerObservations %>%
dplyr::select(banner, recordImpressionSampleRate) %>%
dplyr::group_by(banner) %>%
dplyr::summarise(impressions = sum(recordImpressionSampleRate))
# - add cetDay, me
bannerObservations$date <- cetDay
bannerObservations$campaign <- campaignName
# - store:
write.csv(bannerObservations,
paste0("bannerImpressionsAggregated_",
cetDay,
".csv"
)
)
}
# - wrangle Banner Impression data
wmde_process_banner_impressions(fileName = fileName,
dataDir = dataDir,
cetDay = cetDay,
campaignName = campaignName)
### ----------------------------------------------------------
### --- Banner Interactions:
### --- via event.WMDEBannerInteractions
### --- https://meta.wikimedia.org/wiki/Schema:WMDEBannerInteractions
### ----------------------------------------------------------
# - select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID
# - from event.wmdebannerinteractions where year=2020 and month=5 and (day=11 or day=12 or day=13);
# - function: wmde_collect_pageviews
wmde_banner_actions <- function(uri_query_filter,
cetDay,
queryFile,
fileName,
analyticsDir,
campaignName) {
# - NOTE:
# - expected format for cetDay is: YYYY-MM-DD
# - WHERE condition: create datetime_condition
cet_condition <- seq(
from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
by = "hour"
)
attr(cet_condition, "tzone") <- "UTC"
cet_condition <- as.character(cet_condition)
cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
cet_years <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][1]
})
cet_months <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][2]
})
cet_months <- gsub("^0", "", cet_months)
cet_days <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][3]
})
cet_days <- gsub("^0", "", cet_days)
cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T),
function(x) {
x[2]
})
cet_hours <- gsub("^0", "", cet_hours)
datetimeCondition <- paste0(
"year = ", cet_years, " AND ",
"month = ", cet_months, " AND ",
"day = ", cet_days, " AND ",
"hour = ", cet_hours
)
datetimeCondition <- paste("(",
datetimeCondition,
")",
collapse = " OR ",
sep = "")
# - WHERE condition: create eventBannerName_condition
if (length(uri_query) > 1) {
eventBannerName_condition <- paste0("(",
paste(
paste0("event.bannerName LIKE '%", uri_query_filter, "%'"),
collapse = " OR ", sep = " "),
")"
)
} else {
eventBannerName_condition = paste0("event.bannerName LIKE '%", uri_query_filter, "%'")
}
# - compose HiveQL query
hiveQuery <- paste0(
"select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID from event.wmdebannerinteractions
WHERE (",
eventBannerName_condition, " AND ",
"(", datetimeCondition, ")",
");"
)
# - write hql
write(hiveQuery, queryFile)
# - execute hql script:
kerberosPrefix <-
'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
# - Kerberos init
system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'),
wait = T)
# - Run query
query <- system(command = paste(kerberosPrefix,
'/usr/local/bin/beeline --incremental=true --silent -f "',
paste0(dataDir, queryFile),
'" > ', dataDir, fileName,
sep = ""),
wait = TRUE)
# - Wrangle Banner Interactions
# - load
bannerData <- tryCatch({
as.data.frame(fread(paste0(dataDir, fileName)))
},
error = function(condition) {
return(FALSE)
})
# - process
if (class(bannerData) == 'logical') {
return(FALSE)
} else {
# - bannerSeen
bannerSeen <- bannerData %>%
dplyr::select(bannername, userid)
bannerSeen <- bannerSeen[!duplicated(bannerSeen), ]
bannerSeen <- bannerSeen %>%
dplyr::select(bannername) %>%
dplyr::group_by(bannername) %>%
dplyr::summarise(seen_by = n())
# - bannerClosed
bannerClosed <- bannerData %>%
dplyr::filter(banneraction == "banner-closed") %>%
dplyr::select(bannername, bannerimpressions) %>%
dplyr::group_by(bannername) %>%
dplyr::summarise(closed_by = n(), mean_close_imp = round(mean(bannerimpressions), 2))
# - bannerClicked
bannerClicked <- bannerData %>%
dplyr::filter(banneraction == "banner-clicked") %>%
dplyr::select(bannername, bannerimpressions) %>%
dplyr::group_by(bannername) %>%
dplyr::summarise(clicked_by = n(), mean_click_imp = round(mean(bannerimpressions), 2))
# - whoClicked
whoClicked <- bannerData %>%
dplyr::filter(banneraction == "banner-clicked")
whoClicked <- data.frame(userid = unique(whoClicked$userid))
# - store:
write.csv(whoClicked,
paste0(analyticsDir,
"whoClicked_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
)
)
# - join
bannerData <- bannerSeen %>%
dplyr::left_join(bannerClosed, 'bannername') %>%
dplyr::left_join(bannerClicked, 'bannername')
bannerData$close_rate <- round(bannerData$closed_by/bannerData$seen_by, 2)
bannerData$click_rate <- round(bannerData$clicked_by/bannerData$seen_by, 2)
# - date, campaign
bannerData$day <- cetDay
bannerData$campaign <- campaignName
# - store:
write.csv(bannerData,
paste0(analyticsDir,
"bannerInteractionsAggregated_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
)
)
}
}
# - set params for wmde_banner_actions()
queryFile <- paste0(campaignName, "_bannerInteractions.hql")
fileName <- paste0("bannerInteractions_", cetDay, ".tsv")
uri_query_filter <- 'WMDE_challengebnr_fall2020_'
banner_status <- wmde_banner_actions(uri_query_filter = uri_query_filter,
cetDay = cetDay,
queryFile = queryFile,
fileName = fileName,
analyticsDir = analyticsDir,
campaignName = campaignName)
### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------
# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
uri_path,
cetDay,
queryFile,
fileName,
dataDir) {
# - NOTE:
# - expected format for cetDay is: YYYY-MM-DD
# - to dataDir
setwd(dataDir)
# - libraries
library(stringr)
# - WHERE condition: create datetime_condition
cet_condition <- seq(
from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
by = "hour"
)
attr(cet_condition, "tzone") <- "UTC"
cet_condition <- as.character(cet_condition)
cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
cet_years <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][1]
})
cet_months <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][2]
})
cet_months <- gsub("^0", "", cet_months)
cet_days <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][3]
})
cet_days <- gsub("^0", "", cet_days)
cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T),
function(x) {
x[2]
})
cet_hours <- gsub("^0", "", cet_hours)
datetimeCondition <- paste0(
"year = ", cet_years, " AND ",
"month = ", cet_months, " AND ",
"day = ", cet_days, " AND ",
"hour = ", cet_hours
)
datetimeCondition <- paste("(",
datetimeCondition,
")",
collapse = " OR ",
sep = "")
# - WHERE condition: create uri_host_condition
if (length(uri_host) > 1) {
uri_host_condition <- paste0("(",
paste(
paste0("uri_host = '", uri_host, "'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_host_condition = paste0("uri_host = '", uri_host, "'")
}
# - WHERE condition: create uri_path_condition
if (length(uri_path) > 1) {
uri_path_condition <- paste0("(",
paste(
paste0("uri_path = '", uri_path, "'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_path_condition = paste0("uri_path = '", uri_path, "'")
}
# - compose HiveQL query
hiveQuery <- paste0(
"USE wmf;
SELECT uri_host, uri_path, uri_query, referer FROM webrequest
WHERE (",
uri_host_condition, " AND ",
uri_path_condition, " AND ",
"(", datetimeCondition, ")",
");"
)
# - write hql
write(hiveQuery, queryFile)
# - execute hql script:
kerberosPrefix <-
'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
# - Kerberos init
system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'),
wait = T)
# - Run query
query <- system(command = paste(kerberosPrefix,
'/usr/local/bin/beeline --incremental=true --silent -f "',
paste0(dataDir, queryFile),
'" > ', dataDir, fileName,
sep = ""),
wait = TRUE)
}
# - set params to wmde_collect_pageviews
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- c(
'/wiki/(274301)_Wikipedia?tour=einfuhrung',
'/wiki/Spezial:Benutzerkonto_anlegen',
'/wiki/Spezial:Anmelden',
'/wiki/Special:MyPage/Artikelwerkstatt',
'/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training',
'/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur'
)
# uri_query <- paste0('WMDE_2020_challenge_', 1:30)
queryFile <- 'wikimedia_challenge_Oct2020_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")
# - collect Pageviews data
wmde_collect_pageviews(uri_host,
uri_path,
cetDay,
queryFile,
fileName,
dataDir)
### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
dataDir,
uri_query_filter,
page_filter,
cetDay = cetDay,
campaignName = campaignName) {
# - to dataDir
setwd(dataDir)
# - libraries
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
# - load
pageviewsData <- readLines(fileName)
wStart <- which(grepl("uri_host", pageviewsData))
pageviewsData <- pageviewsData[(wStart + 2):(length(pageviewsData) - 2)]
pageviewsData <- data.frame(dat = pageviewsData,
stringsAsFactors = F)
pageviewsData <- separate(pageviewsData,
dat,
into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
sep = "\t")
# - apply page_filter
pageviewsData$page <- paste0(pageviewsData$uri_host,
pageviewsData$uri_path,
pageviewsData$uri_query)
wFilter <- sapply(pageviewsData$page, function(x) {
sapply(page_filter, function(y) {
grepl(y, x)
})
})
wFilter <- colSums(wFilter)
wFilter <- which(wFilter > 0)
pageviewsData <- pageviewsData[wFilter, ]
# - apply uri_query_filter
# - NOTE: looking in both: uri_query, referer
w_uri_query <- which(grepl(uri_query_filter, pageviewsData$page))
if (length(w_uri_query) > 0) {
# - filter for w_uri_query
pageviewsData <- pageviewsData[w_uri_query, ]
# - aggregate:
pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
pageviewsData$uri_host <- NULL
pageviewsData$page <- NULL
pageviewsData$referer <- NULL
pageviewsData <- pageviewsData %>%
dplyr::select(uri_query, uri_path) %>%
dplyr::group_by(uri_query, uri_path) %>%
dplyr::summarise(pageviews = n())
colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
# - add cetDay, campaignName
pageviewsData$date <- cetDay
pageviewsData$campaign <- campaignName
# - store:
write.csv(pageviewsData,
paste0(analyticsDir,
"pageviewsAggregated_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
)
)
}
}
# - set params to wmde_process_pageviews
# - for the WMDE 2020_EmailCampaignWikipediaChallenge
uri_query_filter <- 'WMDE_2020_challenge_'
page_filter <- c('de.wikipedia.org/wiki/(274301)_Wikipedia?tour=einfuhrung',
'de.wikipedia.org/wiki/Spezial:Benutzerkonto_anlegen',
'de.wikipedia.org/wiki/Spezial:Anmelden',
'de.wikipedia.org/wiki/Special:MyPage/Artikelwerkstatt',
'de.wikipedia.org/wiki/Benutzer:Trainingskonto_(WMDE)/Wikipedia_Training',
'de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement/Literatur')
# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
dataDir = dataDir,
uri_query_filter = uri_query_filter,
page_filter = page_filter,
cetDay = cetDay,
campaignName = campaignName)
### ----------------------------------------------------------
### --- User Registrations
### --- HiveQL query: event.ServerSideAccountCreation table
### ----------------------------------------------------------
hiveQL <-
"SELECT year, month, day, hour, event.campaign, event.userId, event.userName, webhost, wiki
FROM event.serversideaccountcreation
WHERE
year = 2020
AND (month = 10 OR month = 11)
AND (event.campaign LIKE '%WMDE_2020_challenge_%');"
# - write hql
write(hiveQL, paste0(dataDir, 'user_registrations.hql'))
### --- output filename
filename <- paste0(dataDir, 'user_registrations.tsv')
### --- execute hql script:
hiveArgs <- 'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata /usr/local/bin/beeline --silent --incremental=true --verbose=false -f'
hiveInput <- paste(paste0(dataDir, 'user_registrations.hql'),
" > ",
filename,
sep = "")
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
### ----------------------------------------------------------
### --- User Edits
### --- User Edits via revision_actor_temp
### ----------------------------------------------------------
startTimestamp <- '20201001000000'
# - get user ids
userRegistrations <- fread(paste0(dataDir,
'user_registrations.tsv'))
dim(userRegistrations)
# - clean up from test accounts
userRegistrations <- filter(userRegistrations,
!grepl("^test|^Test",
userRegistrations$username))
dim(userRegistrations)
head(userRegistrations)
rev_user <- userRegistrations$userid
# - iterate over rev_user
for (i in 1:length(rev_user)) {
# - SQL query
sqlQuery <- paste("\"SELECT actor.actor_id,
actor.actor_user,
actor.actor_name,
revision_actor_temp.revactor_timestamp
FROM actor
LEFT JOIN revision_actor_temp ON (actor.actor_id = revision_actor_temp.revactor_actor)
WHERE (revision_actor_temp.revactor_timestamp >= 20201001000000
AND actor.actor_user = ", rev_user[i], ");\"");
### --- output filename
filename <- paste(dataDir,'userEdits', "_", i, ".tsv", sep = "")
### --- execute sql script:
sqlLogInPre <- paste0('/usr/local/bin/analytics-mysql dewiki -e ')
sqlInput <- paste(sqlQuery,
" > ",
filename,
sep = "")
# - command:
sqlCommand <- paste(sqlLogInPre, sqlInput)
system(command = sqlCommand, wait = TRUE)
# - report
print(paste0("DONE: user ", i, "."))
}
### --- END run SQL scripts
# - load user edits:
lF <- list.files(dataDir)
lF <- lF[grepl("^userEdits_", lF)]
userEdits <- lapply(paste0(dataDir, lF), fread)
userEdits <- rbindlist(userEdits)
# - store user edits:
write.csv(userEdits, paste0(analyticsDir, 'userEdits.csv'))
This section presents all data and statistics on the campaign pages.
The following chunk loads and then re-structures the dataset a bit.
lF <- list.files('_dailyReporting')
lF <- lF[grepl("^pageviewsAggregated", lF)]
dataSet <- lapply(paste0("_dailyReporting/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - expand grid to account for missing observations per day
dS <- expand.grid(unique(dataSet$Tag),
unique(dataSet$Page),
unique(dataSet$date),
stringsAsFactors = F)
colnames(dS) <- c('Tag', 'Page', 'date')
dS <- dS %>%
left_join(dataSet,
by = c("Tag", "Page", "date"))
dataSet <- dS; rm(dS)
dataSet$Pageviews[is.na(dataSet$Pageviews)] <- 0
dataSet$Tag <- gsub("&.+$", "", dataSet$Tag)
dataSet$Tag <- gsub("\\?campaign=", "", dataSet$Tag)
dataSet$date <- sapply(dataSet$date, function(x) {
d <- strsplit(x, split = "-")[[1]]
if (nchar(d[3]) == 1) {
d[3] <- paste0("0", d[3])
}
return(paste(d, collapse = "-"))
})
Chart 1.1.1 Daily Pageviews, aggregated across the campaign channels.
pFrame <- dataSet %>%
select(date, Page, Pageviews) %>%
group_by(date, Page) %>%
summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
y = Pageviews,
label = Pageviews,
)) +
geom_path(size = .5, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
scale_y_continuous(labels = comma) +
facet_wrap(~Page, ncol = 2) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Pageviews") +
theme_minimal() +
# geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
Table 1.1.1 Pageviews totals
tFrame <- pFrame %>%
select(Page, Pageviews) %>%
group_by(Page) %>%
summarise(totalPageviews = sum(Pageviews)) %>%
arrange(desc(totalPageviews))
datatable(tFrame)
Chart 1.2.1 Pageviews, by channels
pFrame <- dataSet %>%
select(date, Tag, Pageviews) %>%
group_by(date, Tag) %>%
summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
y = Pageviews,
label = Pageviews,
)) +
geom_path(size = .5, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
scale_y_continuous(labels = comma) +
facet_wrap(~Tag, ncol = 2) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Pageviews") +
theme_minimal() +
# geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 6)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
Chart 1.2.2 Total Pageviews, by channels
pFrame <- pFrame %>%
select(Tag, Pageviews) %>%
group_by(Tag) %>%
summarise(Pageviews = sum(Pageviews)) %>%
arrange(desc(Pageviews))
pFrame$Tag <- factor(pFrame$Tag,
levels = pFrame$Tag,
ordered = T)
ggplot(pFrame, aes(x = Tag,
y = Pageviews,
label = Pageviews,
)) +
geom_bar(stat = "identity", color = "blue", fill = "white") +
scale_y_continuous(labels = comma) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Pageviews") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
Table 2.2.1 Total pageviews, by channels and by pages.
tFrame <- dataSet %>%
select(Tag, Page, Pageviews) %>%
group_by(Tag, Page) %>%
summarise(totalPageviews = sum(Pageviews)) %>%
arrange(desc(totalPageviews))
datatable(tFrame)
This section presents all data and statistics on the user registrations.
lF <- list.files("_dailyReporting")
lF <- lF[grepl("^user_registrations", lF)]
dataSet <- fread(paste0("_dailyReporting/", lF))
dataSet <- filter(dataSet,
!grepl("^test|^Test", dataSet$username))
dataSet$date <- paste(dataSet$year,
ifelse(nchar(dataSet$month) == 1, paste0("0", dataSet$month), dataSet$month),
ifelse(nchar(dataSet$day) == 1, paste0("0", dataSet$day), dataSet$day),
sep = "-")
regUsers <- select(dataSet,
userid,
username,
campaign)
# - expand grid to account for missing observations per day
dateSpan <- seq(from = as.Date("2020-10-01"),
to = as.Date("2020-11-11"),
by = "day")
dateSpan <- as.character(dateSpan)
dS <- expand.grid(dateSpan,
unique(dataSet$campaign),
stringsAsFactors = F)
colnames(dS) <- c('date', 'campaign')
dS <- dS %>%
left_join(select(dataSet, date, campaign, userid),
by = c("date", "campaign"))
dS$userid <- ifelse(is.na(dS$userid), 0, 1)
pFrame <- dS %>%
select(date, userid) %>%
group_by(date) %>%
summarise(registrations = sum(userid)) %>%
arrange(date)
ggplot(pFrame, aes(x = date,
y = registrations,
label = registrations,
)) +
geom_path(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
scale_y_continuous(labels = comma) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Registrations") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
pFrame <- dS %>%
group_by(date, campaign) %>%
summarise(registrations = sum(userid)) %>%
arrange(date)
ggplot(pFrame, aes(x = date,
y = registrations,
label = registrations,
)) +
geom_path(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
facet_wrap(~campaign, ncol = 2) +
scale_y_continuous(labels = comma) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Registrations") +
theme_minimal() +
# geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 6)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
pFrame <- dS %>%
select(campaign, userid) %>%
group_by( campaign) %>%
summarise(registrations = sum(userid)) %>%
arrange(desc(registrations))
pFrame$campaign <- factor(pFrame$campaign,
levels = pFrame$campaign,
ordered = T)
ggplot(pFrame, aes(x = campaign,
y = registrations,
label = registrations,
)) +
geom_bar(stat = "identity", color = "blue", fill = "white") +
scale_y_continuous(labels = comma) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Registrations") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
This section presents all data and statistics on the user edits.
The following chunk loads the dataset of user interactions with campaign channels and then re-structures the dataset a bit.
lF <- list.files("_dailyReporting")
lF <- lF[grepl("^userEdits", lF)]
editSet <- fread(paste0("_dailyReporting/", lF))
editSet <- select(editSet,
actor_name,
revactor_timestamp)
colnames(editSet) <- c('username', 'rev_timestamp')
dataSet <- left_join(editSet,
select(regUsers,
username,
campaign),
by = "username")
dataSet$year <- substr(dataSet$rev_timestamp, 1, 4)
dataSet$month <- substr(dataSet$rev_timestamp, 5, 6)
dataSet$day <- substr(dataSet$rev_timestamp, 7, 8)
dataSet$rev_timestamp <- NULL
dataSet$date <- paste(dataSet$year,
dataSet$month,
dataSet$day,
sep = "-")
dataSet <- select(dataSet,
date,
username,
campaign)
dateSpan <- seq(from = as.Date("2020-10-01"),
to = as.Date(max(dataSet$date)),
by = "day")
dateSpan <- as.character(dateSpan)
dS <- expand.grid(dateSpan,
unique(dataSet$campaign),
stringsAsFactors = F)
colnames(dS) <- c('date', 'campaign')
dS <- dS %>%
left_join(select(dataSet, date, campaign, username),
by = c("date", "campaign"))
dS$username <- ifelse(is.na(dS$username), 0, 1)
pFrame <- dS %>%
select(date, username) %>%
group_by(date) %>%
summarise(edits = sum(username)) %>%
arrange(date)
ggplot(pFrame, aes(x = date,
y = edits,
label = edits,
)) +
geom_path(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
scale_y_continuous(labels = comma) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Edits") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
pFrame <- dS %>%
group_by(date, campaign) %>%
summarise(edits = sum(username)) %>%
arrange(date)
ggplot(pFrame, aes(x = date,
y = edits,
label = edits,
)) +
geom_path(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
facet_wrap(~campaign, ncol = 2) +
scale_y_continuous(labels = comma) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Edits") +
theme_minimal() +
# geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 5.5)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
pFrame <- dS %>%
select(campaign, username) %>%
group_by( campaign) %>%
summarise(edits = sum(username)) %>%
arrange(desc(edits))
pFrame$campaign <- factor(pFrame$campaign,
levels = pFrame$campaign,
ordered = T)
ggplot(pFrame, aes(x = campaign,
y = edits,
label = edits,
)) +
geom_bar(stat = "identity", color = "blue", fill = "white") +
scale_y_continuous(labels = comma) +
ggtitle('Wikipedia Challenge October 2020') +
xlab("Date") + ylab("Edits") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
userClass <- dataSet %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
WMDE_2020_challenge_1
userClass <- dataSet %>%
filter(campaign == "WMDE_2020_challenge_1") %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
WMDE_2020_challenge_3
userClass <- dataSet %>%
filter(campaign == "WMDE_2020_challenge_3") %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
WMDE_2020_challenge_4
userClass <- dataSet %>%
filter(campaign == "WMDE_2020_challenge_4") %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
WMDE_2020_challenge_5
userClass <- dataSet %>%
filter(campaign == "WMDE_2020_challenge_5") %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
WMDE_2020_challenge_6
userClass <- dataSet %>%
filter(campaign == "WMDE_2020_challenge_6") %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
WMDE_2020_challenge_12
userClass <- dataSet %>%
filter(campaign == "WMDE_2020_challenge_12") %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
WMDE_2020_challenge_14
userClass <- dataSet %>%
filter(campaign == "WMDE_2020_challenge_14") %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)