Feedback should be send to goran.milovanovic_ext@wikimedia.de
.
The campaign was run from 2021/03/08 to 2021/03/21.
CURRENT UPDATE: Dataset as of 2021/03/21; Report updated w. user edits data on 2021/03/29.
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script 2021_Actionweek_PRODUCTION.R
on stat1007.eqiad.wmnet
Analytics Client, collecting the data as .tsv
and .csv
files, copying manually, and processing locally. A daily crontab job was run from 2021/03/08
to 2021/03/21
to collect the data for daily reporting. User edits are then collected until 2021/05/02. The data used in this report are aggregates of the daily datasets, sanitized and anonymized.
DATA: (1) The event.wmdebannerinteractions
schema is used for banner clicks/closing (see: Phab); (2) the wmf.webrequest
table is a source of banner impressions data; (3) pageviews, user registrations, and user edits are collected by following the standard procedure.
### ----------------------------------------------------------------
### --- WMDE 2021_Actionweek
### --- https://phabricator.wikimedia.org/T274563
### ----------------------------------------------------------------
### ----------------------------------------------------------------
### --- Campaign Description and Parameters
### ----------------------------------------------------------------
### --- Tracking Part 1: banner campaign
### --- Please provide daily reports during the banner campaign (08th of March to 21th of March).
# - Tracking Part 2: Landingpage links to Wikipedia
# - Please provide a tracking report for the Links from the Landingpage of the campaign
# - until 8 weeks after the end of the campaign.
### --- Timeline for the whole project
# - Start of the banner campaign: 08. March 2020
# - End of the banner campaign: 21. March 2020
# - Tracking test: first week of March
# - Last User signing up: 21.March 2020
# - Preliminary report for tracking part 1: sometime till end of March
# - Track editing behavior eight weeks after end of campaign: 02. May 2020
# - final report for tracking part 1 and 2: sometime till end of May
### --- Tracking Part 1: banner campaign
# - Campaign Tag used for Links
# -------- ?campaign=WMDE_aw21
# -------- this is only for pageviews, this tag is picked-up at our WMDE landing page
# - the Banners can carry one of the two following tags:
# -------- ?banner=WMDE_aw2021_var
# -------- ?banner=WMDE_aw2021_ctrl
### --- Timeline for the whole project
# - Start of the banner campaign: 1. October 2020
# - End of the banner campaign: 11. October 2020
# - Tracking test: End of September
# - Last User signing up: 11. October
# - Preliminary report for tracking part 1: sometime until end of October
# - Last User finishing mailing campaign: November 11th
# - Track editing behavior six weeks after end of campaign: December 23
# - final report for tracking part 1 and 2: beginning of January
### --- Daily Tracking
# - daily reporting during campaign
# - The following information should be included in the daily reporting:
# --- impressions per banner
# --- clicks per banner/ page views per landing page
# --- closing rate of banners
# --- If possible with the distinction whether the person was logged in or not.
### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)
### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/Campaigns/2021_Actionweek/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
### --- campaign specifics
campaignName <- 'Actionweek_2021'
### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
strsplit(as.character(cetDay),
split = " ",
fixed = T)[[1]][1]
) - 1
### ----------------------------------------------------------
### --- Banner Interactions:
### --- via event.WMDEBannerInteractions
### --- https://meta.wikimedia.org/wiki/Schema:WMDEBannerInteractions
### ----------------------------------------------------------
# - select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID
# - from event.wmdebannerinteractions where year=2020 and month=5 and (day=11 or day=12 or day=13);
# - function: wmde_banner_actions
wmde_banner_actions <- function(uri_query_filter,
cetDay,
queryFile,
fileName,
analyticsDir,
campaignName) {
# - NOTE:
# - expected format for cetDay is: YYYY-MM-DD
# - WHERE condition: create datetime_condition
cet_condition <- seq(
from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
by = "hour"
)
attr(cet_condition, "tzone") <- "UTC"
cet_condition <- as.character(cet_condition)
cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
cet_years <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][1]
})
cet_months <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][2]
})
cet_months <- gsub("^0", "", cet_months)
cet_days <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][3]
})
cet_days <- gsub("^0", "", cet_days)
cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T),
function(x) {
x[2]
})
cet_hours <- gsub("^0", "", cet_hours)
datetimeCondition <- paste0(
"year = ", cet_years, " AND ",
"month = ", cet_months, " AND ",
"day = ", cet_days, " AND ",
"hour = ", cet_hours
)
datetimeCondition <- paste("(",
datetimeCondition,
")",
collapse = " OR ",
sep = "")
# - WHERE condition: create eventBannerName_condition
if (length(uri_query_filter) > 1) {
eventBannerName_condition <- paste0("(",
paste(
paste0("event.bannerName LIKE '%", uri_query_filter, "%'"),
collapse = " OR ", sep = " "),
")"
)
} else {
eventBannerName_condition = paste0("event.bannerName LIKE '%", uri_query_filter, "%'")
}
# - compose HiveQL query
hiveQuery <- paste0(
"select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID from event.wmdebannerinteractions
WHERE (",
eventBannerName_condition, " AND ",
"(", datetimeCondition, ")",
");"
)
# - write hql
write(hiveQuery, paste0(dataDir, queryFile))
# - execute hql script:
kerberosPrefix <-
'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
# - Kerberos init
system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'),
wait = T)
# - Run query
query <- system(command = paste(kerberosPrefix,
'/usr/local/bin/beeline --incremental=true --silent -f "',
paste0(dataDir, queryFile),
'" > ', dataDir, fileName,
sep = ""),
wait = TRUE)
# - Wrangle Banner Interactions
# - load
bannerData <- tryCatch({
as.data.frame(fread(paste0(dataDir, fileName)))
},
error = function(condition) {
return(FALSE)
})
# - process
if (class(bannerData) == 'logical') {
return(FALSE)
} else {
# - bannerSeen
bannerSeen <- bannerData %>%
dplyr::select(bannername, userid)
# - bannerSeen <- bannerSeen[!duplicated(bannerSeen), ]
bannerSeen <- bannerSeen %>%
dplyr::select(bannername) %>%
dplyr::group_by(bannername) %>%
dplyr::summarise(seen_by = n())
# - bannerClosed
bannerClosed <- bannerData %>%
dplyr::filter(banneraction == "banner-closed") %>%
dplyr::select(bannername, bannerimpressions) %>%
dplyr::group_by(bannername) %>%
dplyr::summarise(closed_by = n(),
mean_close_imp = round(mean(bannerimpressions), 2))
# - bannerClicked
bannerClicked <- bannerData %>%
dplyr::filter(banneraction == "banner-clicked") %>%
dplyr::select(bannername, bannerimpressions) %>%
dplyr::group_by(bannername) %>%
dplyr::summarise(clicked_by = n(),
mean_click_imp = round(mean(bannerimpressions), 2))
# - whoClicked
whoClicked <- bannerData %>%
dplyr::filter(banneraction == "banner-clicked")
whoClicked <- data.frame(userid = unique(whoClicked$userid))
# - store:
write.csv(whoClicked,
paste0(analyticsDir,
"whoClicked_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
)
)
# - join
bannerData <- bannerSeen %>%
dplyr::left_join(bannerClosed, 'bannername') %>%
dplyr::left_join(bannerClicked, 'bannername')
bannerData$close_rate <- round(bannerData$closed_by/bannerData$seen_by, 2)
bannerData$click_rate <- round(bannerData$clicked_by/bannerData$seen_by, 2)
# - date, campaign
bannerData$day <- cetDay
bannerData$campaign <- campaignName
# - store:
write.csv(bannerData,
paste0(analyticsDir,
"bannerInteractionsAggregated_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
)
)
}
}
# - set params for wmde_banner_actions()
queryFile <- paste0(campaignName, "_bannerInteractions.hql")
fileName <- paste0("bannerInteractions_", cetDay, ".tsv")
uri_query_filter <- '_wp20_aw_'
banner_status <- wmde_banner_actions(uri_query_filter = uri_query_filter,
cetDay = cetDay,
queryFile = queryFile,
fileName = fileName,
analyticsDir = analyticsDir,
campaignName = campaignName)
### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------
# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
uri_path,
cetDay,
queryFile,
fileName,
dataDir) {
# - NOTE:
# - expected format for cetDay is: YYYY-MM-DD
# - to dataDir
setwd(dataDir)
# - libraries
library(stringr)
# - WHERE condition: create datetime_condition
cet_condition <- seq(
from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
by = "hour"
)
attr(cet_condition, "tzone") <- "UTC"
cet_condition <- as.character(cet_condition)
cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
cet_years <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][1]
})
cet_months <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][2]
})
cet_months <- gsub("^0", "", cet_months)
cet_days <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][3]
})
cet_days <- gsub("^0", "", cet_days)
cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T),
function(x) {
x[2]
})
cet_hours <- gsub("^0", "", cet_hours)
datetimeCondition <- paste0(
"year = ", cet_years, " AND ",
"month = ", cet_months, " AND ",
"day = ", cet_days, " AND ",
"hour = ", cet_hours
)
datetimeCondition <- paste("(",
datetimeCondition,
")",
collapse = " OR ",
sep = "")
# - WHERE condition: create uri_host_condition
if (length(uri_host) > 1) {
uri_host_condition <- paste0("(",
paste(
paste0("uri_host = '", uri_host, "'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_host_condition = paste0("uri_host = '", uri_host, "'")
}
# - WHERE condition: create uri_path_condition
if (length(uri_path) > 1) {
uri_path_condition <- paste0("(",
paste(
paste0("uri_path = '", uri_path, "'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_path_condition = paste0("uri_path = '", uri_path, "'")
}
# - compose HiveQL query
hiveQuery <- paste0(
"USE wmf;
SELECT uri_host, uri_path, uri_query, referer FROM webrequest
WHERE (",
uri_host_condition, " AND ",
uri_path_condition, " AND ",
"(", datetimeCondition, ")",
");"
)
# - write hql
write(hiveQuery, queryFile)
# - execute hql script:
kerberosPrefix <-
'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
# - Kerberos init
system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'),
wait = T)
# - Run query
query <- system(command = paste(kerberosPrefix,
'/usr/local/bin/beeline --incremental=true --silent -f "',
paste0(dataDir, queryFile),
'" > ', dataDir, fileName,
sep = ""),
wait = TRUE)
}
# - set params to wmde_collect_pageviews
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- c('/wiki/Wikipedia:Edit-a-thon/Wirtschaftsuniversit%C3%A4t_Wien_zum_Internationalen_Frauentag_2021',
'/wiki/Wikipedia:GLAM',
'/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia',
'/wiki/Wikipedia:WikiProjekt_Frauen/Frauen_in_Rot',
'/wiki/Wikipedia:Redaktionen',
'/wiki/Wikipedia:Stuttgart#Workshop_Frauen_in_der_Politik',
'/wiki/Wikipedia:WomenEdit',
'/wiki/Wikipedia:FemNetz2021',
'/wiki/Wikipedia:60_Minuten',
'/wiki/Wikipedia%3AWho_writes_his_tory%3F',
'/wiki/Wikipedia:Stuttgart',
'/wiki/Benutzer:Ykskwadrat',
'/wiki/Benutzer:Matthias_S%C3%BC%C3%9Fen',
'/wiki/Benutzer:Cirdan',
'/wiki/Benutzer:Mellebga',
'/wiki/Benutzer:DerHexer',
'/wiki/Benutzer:Nexo20')
queryFile <- 'Actionweek2021_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")
# - collect Pageviews data
wmde_collect_pageviews(uri_host,
uri_path,
cetDay,
queryFile,
fileName,
dataDir)
### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
dataDir,
uri_query_filter,
page_filter,
cetDay = cetDay,
campaignName = campaignName) {
# - to dataDir
setwd(dataDir)
# - libraries
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
# - load
pageviewsData <- readLines(fileName)
wStart <- which(grepl("uri_host", pageviewsData))
pageviewsData <- pageviewsData[(wStart + 1):(length(pageviewsData) - 1)]
pageviewsData <- data.frame(dat = pageviewsData,
stringsAsFactors = F)
pageviewsData <- separate(pageviewsData,
dat,
into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
sep = "\t")
# - apply page_filter
pageviewsData$page <- paste0(pageviewsData$uri_host,
pageviewsData$uri_path,
pageviewsData$uri_query)
wFilter <- sapply(pageviewsData$page, function(x) {
sapply(page_filter, function(y) {
grepl(y, x)
})
})
wFilter <- colSums(wFilter)
wFilter <- which(wFilter > 0)
pageviewsData <- pageviewsData[wFilter, ]
# - apply uri_query_filter
# - NOTE: looking in both: uri_query, referer
w_uri_query <- which(grepl(uri_query_filter, pageviewsData$page))
if (length(w_uri_query) > 0) {
# - filter for w_uri_query
pageviewsData <- pageviewsData[w_uri_query, ]
# - aggregate:
pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
pageviewsData$uri_host <- NULL
pageviewsData$page <- NULL
pageviewsData$referer <- NULL
pageviewsData <- pageviewsData %>%
dplyr::select(uri_query, uri_path) %>%
dplyr::group_by(uri_query, uri_path) %>%
dplyr::summarise(pageviews = n())
colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
# - add cetDay, campaignName
pageviewsData$date <- cetDay
pageviewsData$campaign <- campaignName
# - store:
write.csv(pageviewsData,
paste0(analyticsDir,
"pageviewsAggregated_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
)
)
}
}
# - set params to wmde_process_pageviews
# - for the WMDE 2020_EmailCampaignWikipediaChallenge
uri_query_filter <- 'WMDE_aw21'
page_filter <- c('de.wikipedia.org/wiki/Wikipedia:Edit-a-thon/Wirtschaftsuniversit%C3%A4t_Wien_zum_Internationalen_Frauentag_2021',
'de.wikipedia.org/wiki/Wikipedia:GLAM',
'de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia',
'de.wikipedia.org/wiki/Wikipedia:WikiProjekt_Frauen/Frauen_in_Rot',
'de.wikipedia.org/wiki/Wikipedia:Redaktionen',
'de.wikipedia.org/wiki/Wikipedia:Stuttgart#Workshop_Frauen_in_der_Politik',
'de.wikipedia.org/wiki/Wikipedia:WomenEdit',
'de.wikipedia.org/wiki/Wikipedia:FemNetz2021',
'de.wikipedia.org/wiki/Wikipedia:60_Minuten',
'de.wikipedia.org/wiki/Wikipedia%3AWho_writes_his_tory%3F',
'de.wikipedia.org/wiki/Wikipedia:Stuttgart',
'de.wikipedia.org/wiki/Benutzer:Ykskwadrat',
'de.wikipedia.org/wiki/Benutzer:Matthias_S%C3%BC%C3%9Fen',
'de.wikipedia.org/wiki/Benutzer:Cirdan',
'de.wikipedia.org/wiki/Benutzer:Mellebga',
'de.wikipedia.org/wiki/Benutzer:DerHexer',
'de.wikipedia.org/wiki/Benutzer:Nexo20')
# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
dataDir = dataDir,
uri_query_filter = uri_query_filter,
page_filter = page_filter,
cetDay = cetDay,
campaignName = campaignName)
### ----------------------------------------------------------
### --- User Registrations
### --- HiveQL query: event.ServerSideAccountCreation table
### ----------------------------------------------------------
hiveQL <-
"SELECT year, month, day, hour, event.campaign, event.userId, event.userName, webhost, wiki
FROM event.serversideaccountcreation
WHERE
YEAR = 2021
AND month = 3
AND event.campaign LIKE '%WMDE_aw21%'
AND wiki = 'dewiki';"
# - write hql
write(hiveQL, paste0(dataDir, 'user_registrations.hql'))
### --- output filename
filename <- paste0(analyticsDir, 'user_registrations.tsv')
### --- execute hql script:
hiveArgs <- 'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata /usr/local/bin/beeline --silent --incremental=true --verbose=false -f'
hiveInput <- paste(paste0(dataDir, 'user_registrations.hql'),
" > ",
filename,
sep = "")
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
### ----------------------------------------------------------
### --- User Edits
### --- User Edits via revision_actor_temp
### ----------------------------------------------------------
library(bit64)
startTimestamp <- '20210308000000'
# - get user ids
userRegistrations <- fread(paste0(analyticsDir,
'user_registrations.tsv'))
dim(userRegistrations)
# - clean up from test accounts
userRegistrations <- filter(userRegistrations,
!grepl("^test|^Test",
userRegistrations$username))
dim(userRegistrations)
head(userRegistrations)
rev_user <- userRegistrations$userid
# - iterate over rev_user
for (i in 1:length(rev_user)) {
# - SQL query
sqlQuery <- paste("\"SELECT actor.actor_id,
actor.actor_user,
actor.actor_name,
revision_actor_temp.revactor_timestamp
FROM actor
LEFT JOIN revision_actor_temp ON (actor.actor_id = revision_actor_temp.revactor_actor)
WHERE (revision_actor_temp.revactor_timestamp >= 20210308000000 AND actor.actor_user = ", rev_user[i], ");\"");
### --- output filename
filename <- paste(dataDir,'userEdits', "_", i, ".tsv", sep = "")
### --- execute sql script:
sqlLogInPre <- paste0('/usr/local/bin/analytics-mysql dewiki -e ')
sqlInput <- paste(sqlQuery,
" > ",
filename,
sep = "")
# - command:
sqlCommand <- paste(sqlLogInPre, sqlInput)
system(command = sqlCommand, wait = TRUE)
# - report
print(paste0("DONE: user ", i, "."))
}
### --- END run SQL scripts
# - load user edits:
lF <- list.files(dataDir)
lF <- lF[grepl("^userEdits_", lF)]
userEdits <- lapply(paste0(dataDir, lF), fread)
edited <- sapply(userEdits, function(x){dim(x)[1] > 0})
userEdits <- userEdits[edited]
userEdits <- rbindlist(userEdits)
# - store user edits:
write.csv(userEdits, paste0(analyticsDir, 'userEdits.csv'))
This section presents all data and statistics on the campaign pages.
The following chunk loads and then re-structures the dataset a bit.
lF <- list.files('_analytics')
lF <- lF[grepl("^pageviewsAggregated", lF)]
dataSet <- lapply(paste0("_analytics/", lF), fread)
dataSet <- rbindlist(dataSet)
dataSet$V1 <- NULL
dataSet$campaign <- NULL
# - expand grid to account for missing observations per day
dS <- expand.grid(unique(dataSet$Tag),
unique(dataSet$Page),
unique(dataSet$date),
stringsAsFactors = F)
colnames(dS) <- c('Tag', 'Page', 'date')
dS <- dS %>%
left_join(dataSet,
by = c("Tag", "Page", "date"))
dataSet <- dS; rm(dS)
dataSet$Pageviews[is.na(dataSet$Pageviews)] <- 0
dataSet$Tag <- gsub("&.+$", "", dataSet$Tag)
dataSet$Tag <- gsub("\\?campaign=", "", dataSet$Tag)
dataSet$date <- sapply(dataSet$date, function(x) {
d <- strsplit(x, split = "-")[[1]]
if (nchar(d[3]) == 1) {
d[3] <- paste0("0", d[3])
}
return(paste(d, collapse = "-"))
})
dataSet <- dataSet %>%
filter(!(grepl("^%", dataSet$Tag)))
Chart 1.1.1 Daily Pageviews, aggregated across the campaign channels.
pFrame <- dataSet %>%
select(date, Page, Pageviews) %>%
group_by(date, Page) %>%
summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
y = Pageviews,
label = Pageviews,
)) +
geom_path(size = .5, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
scale_y_continuous(labels = comma) +
facet_wrap(~Page, ncol = 2) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Pageviews") +
theme_minimal() +
# geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
Table 1.1.1 Pageviews totals
tFrame <- pFrame %>%
select(Page, Pageviews) %>%
group_by(Page) %>%
summarise(totalPageviews = sum(Pageviews)) %>%
arrange(desc(totalPageviews))
datatable(tFrame)
Chart 1.2.1 Pageviews, by channels
pFrame <- dataSet %>%
select(date, Tag, Pageviews) %>%
group_by(date, Tag) %>%
summarise(Pageviews = sum(Pageviews))
pFrame <- arrange(pFrame, date)
ggplot(pFrame, aes(x = date,
y = Pageviews,
label = Pageviews,
)) +
geom_path(size = .5, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
scale_y_continuous(labels = comma) +
facet_wrap(~Tag, ncol = 2) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Pageviews") +
theme_minimal() +
# geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 6)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
Chart 1.2.2 Total Pageviews, by channels
pFrame <- pFrame %>%
select(Tag, Pageviews) %>%
group_by(Tag) %>%
summarise(Pageviews = sum(Pageviews)) %>%
arrange(desc(Pageviews))
pFrame$Tag <- factor(pFrame$Tag,
levels = pFrame$Tag,
ordered = T)
ggplot(pFrame, aes(x = Tag,
y = Pageviews,
label = Pageviews,
)) +
geom_bar(stat = "identity", color = "blue", fill = "white") +
scale_y_continuous(labels = comma) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Pageviews") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
Table 2.2.1 Total pageviews, by channels and by pages.
tFrame <- dataSet %>%
select(Tag, Page, Pageviews) %>%
group_by(Tag, Page) %>%
summarise(totalPageviews = sum(Pageviews)) %>%
arrange(desc(totalPageviews))
datatable(tFrame)
This section presents all data and statistics on the user registrations.
lF <- list.files("_analytics")
lF <- lF[grepl("^user_registrations", lF)]
dataSet <- fread(paste0("_analytics/", lF))
dataSet <- filter(dataSet,
!grepl("^test|^Test", dataSet$username))
dataSet$date <- paste(dataSet$year,
ifelse(nchar(dataSet$month) == 1, paste0("0", dataSet$month), dataSet$month),
ifelse(nchar(dataSet$day) == 1, paste0("0", dataSet$day), dataSet$day),
sep = "-")
regUsers <- select(dataSet,
userid,
username,
campaign)
# - total number of registered users
# - total_registered <- dim(regUsers)[1]
# - expand grid to account for missing observations per day
dateSpan <- seq(from = as.Date("2021-03-08"),
to = as.Date("2021-03-21"),
by = "day")
dateSpan <- as.character(dateSpan)
dS <- expand.grid(dateSpan,
unique(dataSet$campaign),
stringsAsFactors = F)
colnames(dS) <- c('date', 'campaign')
dS <- dS %>%
left_join(select(dataSet, date, campaign, userid),
by = c("date", "campaign"))
dS$userid <- ifelse(is.na(dS$userid), 0, 1)
total_registered <- sum(dS$userid)
pFrame <- dS %>%
select(date, userid) %>%
group_by(date) %>%
summarise(registrations = sum(userid)) %>%
arrange(date)
ggplot(pFrame, aes(x = date,
y = registrations,
label = registrations,
)) +
geom_path(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
scale_y_continuous(labels = comma) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Registrations") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 7)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
The total number of users registered in this campaign is 17.
pFrame <- dS %>%
group_by(date, campaign) %>%
summarise(registrations = sum(userid)) %>%
arrange(date)
ggplot(pFrame, aes(x = date,
y = registrations,
label = registrations,
)) +
geom_path(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
facet_wrap(~campaign, ncol = 2) +
scale_y_continuous(labels = comma) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Registrations") +
theme_minimal() +
# geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 6)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
pFrame <- dS %>%
select(campaign, userid) %>%
group_by( campaign) %>%
summarise(registrations = sum(userid)) %>%
arrange(desc(registrations))
pFrame$campaign <- factor(pFrame$campaign,
levels = pFrame$campaign,
ordered = T)
ggplot(pFrame, aes(x = campaign,
y = registrations,
label = registrations,
)) +
geom_bar(stat = "identity", color = "blue", fill = "white") +
scale_y_continuous(labels = comma) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Registrations") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
This section presents all data and statistics on the user edits.
The following chunk loads the dataset of user interactions with campaign channels and then re-structures the dataset a bit.
lF <- list.files("_analytics/")
lF <- lF[grepl("^userEdits", lF)]
editSet <- fread(paste0("_analytics/", lF))
editSet <- select(editSet,
actor_name,
revactor_timestamp)
colnames(editSet) <- c('username', 'rev_timestamp')
dataSet <- left_join(editSet,
select(regUsers,
username,
campaign),
by = "username")
dataSet <- dataSet[complete.cases(dataSet), ]
dataSet$year <- substr(dataSet$rev_timestamp, 1, 4)
dataSet$month <- substr(dataSet$rev_timestamp, 5, 6)
dataSet$day <- substr(dataSet$rev_timestamp, 7, 8)
dataSet$rev_timestamp <- NULL
dataSet$date <- paste(dataSet$year,
dataSet$month,
dataSet$day,
sep = "-")
dataSet <- select(dataSet,
date,
username,
campaign)
dateSpan <- seq(from = as.Date("2021-03-08"),
to = as.Date(max(dataSet$date)),
by = "day")
dateSpan <- as.character(dateSpan)
dS <- expand.grid(dateSpan,
unique(dataSet$campaign),
stringsAsFactors = F)
colnames(dS) <- c('date', 'campaign')
dS <- dS %>%
left_join(select(dataSet, date, campaign, username),
by = c("date", "campaign"))
dS$username <- ifelse(is.na(dS$username), 0, 1)
pFrame <- dS %>%
select(date, username) %>%
group_by(date) %>%
summarise(edits = sum(username)) %>%
arrange(date)
ggplot(pFrame, aes(x = date,
y = edits,
label = edits,
)) +
geom_path(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
scale_y_continuous(labels = comma) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Edits") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 7)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
pFrame <- dS %>%
group_by(date, campaign) %>%
summarise(edits = sum(username)) %>%
arrange(date)
ggplot(pFrame, aes(x = date,
y = edits,
label = edits,
)) +
geom_path(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
facet_wrap(~campaign, ncol = 2) +
scale_y_continuous(labels = comma) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Edits") +
theme_minimal() +
# geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 5.5)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
pFrame <- dS %>%
select(campaign, username) %>%
group_by( campaign) %>%
summarise(edits = sum(username)) %>%
arrange(desc(edits))
pFrame$campaign <- factor(pFrame$campaign,
levels = pFrame$campaign,
ordered = T)
ggplot(pFrame, aes(x = campaign,
y = edits,
label = edits,
)) +
geom_bar(stat = "identity", color = "blue", fill = "white") +
scale_y_continuous(labels = comma) +
ggtitle('2021 Actionweek') +
xlab("Date") + ylab("Edits") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")
userClass <- dataSet %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100),
c(101, 1000000)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
editClass$`Edit Class`[editClass$`Edit Class` == '(0 - 1)'] <- '(1)'
editClass$`Edit Class`[editClass$`Edit Class` == '(101 - 1e+06)'] <- '(> 100)'
datatable(editClass)
edClasses <- lapply(unique(dataSet$campaign), function(x) {
userClass <- dataSet %>%
filter(campaign == x) %>%
select(username) %>%
group_by(username) %>%
summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100),
c(101, 1000000)
)
userClass$editClass <- sapply(userClass$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return("> 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
userClass$editClass[userClass$editClass == '(0 - 1)'] <- '(1)'
userClass$editClass[userClass$editClass == '(101 - 1e+06)'] <- '(> 100)'
editClass <- as.data.frame(table(userClass$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
editClass$campaign <- x
return(editClass)
})
edClasses <- rbindlist(edClasses)
edClasses$campaign <- sapply(edClasses$campaign, function(x) {
d <- strsplit(x, "_")[[1]]
if (nchar(tail(d, 1)) == 1) {
d[length(d)] <- paste0("0", d[length(d)])
}
d <- paste(d, collapse = "_")
return(d)
})
edClasses <- arrange(edClasses, campaign)
edClasses$`Edit Class`[edClasses$`Edit Class` == '(0 - 1)'] <- '(1)'
edClasses$`Edit Class` <- factor(edClasses$`Edit Class`,
levels = c('(1)',
'(2 - 4)',
'(5 - 9)',
'(10 - 20)',
'(21 - 50)',
'(51 - 100)',
'(> 100)'),
ordered = T
)
ggplot(edClasses,
aes(x = `Edit Class`,
y = `Num.Users`,
label = `Num.Users`)) +
geom_bar(stat = "identity", color = "black", fill = "white") +
facet_wrap(~campaign) +
ggtitle('2021 Actionweek') +
xlab("Edit Class") + ylab("Num.users") +
theme_minimal() +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "right")