Feedback should be send to goran.milovanovic_ext@wikimedia.de
.
The campaign is run from 2021/10/18 to 2021/10/31.
CURRENT UPDATE: Complete dataset as of 2021/10/31.
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running an R script on stat1007.eqiad.wmnet
, collecting the data as .tsv
and .csv
files, copying manually, and processing locally.
### ----------------------------------------------------------------
### --- WMDE 2021 Ocassional Editors
### --- https://phabricator.wikimedia.org/T291635
### ----------------------------------------------------------------
### ----------------------------------------------------------------
### --- Campaign Description and Parameters
### ----------------------------------------------------------------
# - Landing Pages:
# - Landing Page 1: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement
# - Landing Page 2a: https://tools.wmflabs.org/mitmachen/
# - Landing Page 2b: https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia
# - Landing Page 2c: https://de.wikipedia.org/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote
# - Campaign tags:
# - WMDE_oceditors_fall_2021
# - Start of the banner campaign: October 12th
# - End of the banner campaign: October 26 th
# - Tracking test: October 5 - 7
# - Preliminary report for tracking: beginning of November
# - Track editing behavior four weeks after end of campaign: November 23rd
# - final report for tracking: beginning of December
### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)
### --- campaign specifics
campaignName <- 'OccasionalEditors_2021'
### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/2021_OccasionalEditors/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
publicDir <- '/srv/published/datasets/wmde-analytics-engineering/NewEditors/campaigns/2021_OccasionalEditors/'
### --- determine cetDay
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete is data acquisition)
cetDay <- ymd(
strsplit(as.character(cetDay),
split = " ",
fixed = T)[[1]][1]
) - 1
### ----------------------------------------------------------
### --- Banner Interactions:
### --- via event.WMDEBannerInteractions
### --- https://meta.wikimedia.org/wiki/Schema:WMDEBannerInteractions
### ----------------------------------------------------------
# - select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID
# - from event.wmdebannerinteractions where year=2020 and month=5 and (day=11 or day=12 or day=13);
# - function: wmde_banner_actions
wmde_banner_actions <- function(uri_query_filter,
cetDay,
queryFile,
fileName,
analyticsDir,
campaignName) {
# - NOTE:
# - expected format for cetDay is: YYYY-MM-DD
# - WHERE condition: create datetime_condition
cet_condition <- seq(
from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
by = "hour"
)
attr(cet_condition, "tzone") <- "UTC"
cet_condition <- as.character(cet_condition)
cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
cet_years <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][1]
})
cet_months <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][2]
})
cet_months <- gsub("^0", "", cet_months)
cet_days <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][3]
})
cet_days <- gsub("^0", "", cet_days)
cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T),
function(x) {
x[2]
})
cet_hours <- gsub("^0", "", cet_hours)
datetimeCondition <- paste0(
"year = ", cet_years, " AND ",
"month = ", cet_months, " AND ",
"day = ", cet_days, " AND ",
"hour = ", cet_hours
)
datetimeCondition <- paste("(",
datetimeCondition,
")",
collapse = " OR ",
sep = "")
# - WHERE condition: create eventBannerName_condition
if (length(uri_query_filter) > 1) {
eventBannerName_condition <- paste0("(",
paste(
paste0("event.bannerName LIKE '%", uri_query_filter, "%'"),
collapse = " OR ", sep = " "),
")"
)
} else {
eventBannerName_condition = paste0("event.bannerName LIKE '%", uri_query_filter, "%'")
}
# - compose HiveQL query
hiveQuery <- paste0(
"select dt, event.bannerName, event.bannerAction, event.bannerImpressions, event.userID from event.wmdebannerinteractions
WHERE (",
eventBannerName_condition, " AND ",
"(", datetimeCondition, ")",
");"
)
# - write hql
write(hiveQuery, paste0(dataDir, queryFile))
# - execute hql script:
kerberosPrefix <-
'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
# - Kerberos init
system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'),
wait = T)
# - Run query
query <- system(command = paste(kerberosPrefix,
'/usr/local/bin/beeline --incremental=true --silent -f "',
paste0(dataDir, queryFile),
'" > ', dataDir, fileName,
sep = ""),
wait = TRUE)
# - Wrangle Banner Interactions
# - load
bannerData <- tryCatch({
as.data.frame(fread(paste0(dataDir, fileName)))
},
error = function(condition) {
return(FALSE)
})
# - process
if (class(bannerData) == 'logical') {
return(FALSE)
} else {
# - whoClicked
whoClicked <- bannerData %>%
dplyr::filter(banneraction == "banner-clicked")
whoClicked <- data.frame(userid = unique(whoClicked$userid))
# - store:
write.csv(whoClicked,
paste0(analyticsDir,
"whoClicked_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
)
)
# - process bannerData
bannerData <- t(table(bannerData$banneraction,
bannerData$bannername))
bannerData <- as.data.frame(bannerData)
colnames(bannerData) <- c('banner', 'action', 'count')
bannerData$day <- cetDay
bannerData$campaign <- campaignName
# - store:
write.csv(bannerData,
paste0(analyticsDir,
"bannerInteractionsAggregated_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
))
}
}
# - set params for wmde_banner_actions()
queryFile <- paste0(campaignName, "_bannerInteractions.hql")
fileName <- paste0("bannerInteractions_", cetDay, ".tsv")
uri_query_filter <- 'WMDE_oceditors_fall_2021'
banner_status <- wmde_banner_actions(uri_query_filter = uri_query_filter,
cetDay = cetDay,
queryFile = queryFile,
fileName = fileName,
analyticsDir = analyticsDir,
campaignName = campaignName)
### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------
# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
uri_path,
cetDay,
queryFile,
fileName,
dataDir) {
# - NOTE:
# - expected format for cetDay is: YYYY-MM-DD
# - to dataDir
setwd(dataDir)
# - libraries
library(stringr)
# - WHERE condition: create datetime_condition
cet_condition <- seq(
from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
to = as.POSIXct(paste0(cetDay," 23:00"), tz = "Europe/Berlin"),
by = "hour"
)
attr(cet_condition, "tzone") <- "UTC"
cet_condition <- as.character(cet_condition)
cet_condition <- unlist(str_extract_all(cet_condition, "^([[:digit:]]|\\s|-)*"))
cet_years <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][1]
})
cet_months <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][2]
})
cet_months <- gsub("^0", "", cet_months)
cet_days <- sapply(
strsplit(cet_condition, split = " ", fixed = T), function(x) {
strsplit(x, split = "-")[[1]][3]
})
cet_days <- gsub("^0", "", cet_days)
cet_hours <- sapply(strsplit(cet_condition, split = " ", fixed = T),
function(x) {
x[2]
})
cet_hours <- gsub("^0", "", cet_hours)
datetimeCondition <- paste0(
"year = ", cet_years, " AND ",
"month = ", cet_months, " AND ",
"day = ", cet_days, " AND ",
"hour = ", cet_hours
)
datetimeCondition <- paste("(",
datetimeCondition,
")",
collapse = " OR ",
sep = "")
# - WHERE condition: create uri_host_condition
if (length(uri_host) > 1) {
uri_host_condition <- paste0("(",
paste(
paste0("uri_host = '", uri_host, "'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_host_condition = paste0("uri_host = '", uri_host, "'")
}
# - WHERE condition: create uri_path_condition
if (length(uri_path) > 1) {
uri_path_condition <- paste0("(",
paste(
paste0("uri_path = '", uri_path, "'"),
collapse = " OR ", sep = " "),
")"
)
} else {
uri_path_condition = paste0("uri_path = '", uri_path, "'")
}
# - compose HiveQL query
hiveQuery <- paste0(
"USE wmf;
SELECT uri_host, uri_path, uri_query, referer FROM webrequest
WHERE (",
uri_host_condition, " AND ",
uri_path_condition, " AND ",
"(", datetimeCondition, ")",
");"
)
# - write hql
write(hiveQuery, queryFile)
# - execute hql script:
kerberosPrefix <-
'sudo -u analytics-privatedata kerberos-run-command analytics-privatedata '
# - Kerberos init
system(command = paste0(kerberosPrefix, ' hdfs dfs -ls'),
wait = T)
# - Run query
query <- system(command = paste(kerberosPrefix,
'/usr/local/bin/beeline --incremental=true --silent -f "',
paste0(dataDir, queryFile),
'" > ', dataDir, fileName,
sep = ""),
wait = TRUE)
}
# - set params to wmde_collect_pageviews
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- c('/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
'/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia',
'/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote')
queryFile <- 'WMDE_oceditors_fall_2021_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")
# - collect Pageviews data
wmde_collect_pageviews(uri_host,
uri_path,
cetDay,
queryFile,
fileName,
dataDir)
### --- Wrangle Pageviews
# - function: wmde_process_pageviews
wmde_process_pageviews <- function(fileName,
dataDir,
uri_query_filter,
page_filter,
cetDay = cetDay,
campaignName = campaignName) {
# - to dataDir
setwd(dataDir)
# - libraries
library(stringr)
library(dplyr)
library(tidyr)
library(data.table)
# - load
pageviewsData <- readLines(fileName)
wStart <- which(grepl("uri_host", pageviewsData))
pageviewsData <- pageviewsData[(wStart + 1):(length(pageviewsData) - 1)]
pageviewsData <- data.frame(dat = pageviewsData,
stringsAsFactors = F)
pageviewsData <- separate(pageviewsData,
dat,
into = c('uri_host', 'uri_path', 'uri_query', 'referer'),
sep = "\t")
# - apply page_filter
wFilter1 <- unique(unname(unlist(sapply(page_filter, function (x) {
which(grepl(x, pageviewsData$uri_path))
}))))
pageviewsData <- pageviewsData[wFilter1, ]
# - apply uri_query_filter
w_uri_query_1 <- which(grepl(uri_query_filter, pageviewsData$uri_query))
w_uri_query_2 <- which(grepl(uri_query_filter, pageviewsData$referer))
w_uri_query <- unique(c(w_uri_query_1, w_uri_query_2))
if (length(w_uri_query) > 0) {
# - filter for w_uri_query
pageviewsData <- pageviewsData[w_uri_query, ]
# - copy uri_query from referer where uri_query is empty:
wCuriq <- which(pageviewsData$uri_query == "")
curiq <- stringr::str_extract(pageviewsData$referer[wCuriq],
paste0(uri_query_filter, ".+$"))
pageviewsData$uri_query[wCuriq] <- curiq
# - remove ?campaign= from pageviewsData$uri_query
pageviewsData$uri_query <- gsub("?campaign=",
"",
pageviewsData$uri_query,
fixed = T)
# - aggregate:
pageviewsData$uri_path <- paste0(pageviewsData$uri_host, pageviewsData$uri_path)
pageviewsData$uri_host <- NULL
pageviewsData$page <- NULL
pageviewsData$referer <- NULL
pageviewsData <- pageviewsData %>%
dplyr::select(uri_query, uri_path) %>%
dplyr::group_by(uri_query, uri_path) %>%
dplyr::summarise(pageviews = n())
colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
# - add cetDay, campaignName
pageviewsData$date <- cetDay
pageviewsData$campaign <- campaignName
# - store:
write.csv(pageviewsData,
paste0(analyticsDir,
"pageviewsAggregated_",
strsplit(
strsplit(fileName, split = "_", fixed = T)[[1]][2],
split = ".",
fixed = T)[[1]][1],
".csv"
)
)
}
}
# - set params to wmde_process_pageviews
uri_query_filter <- 'WMDE_oceditors_fall_2021'
page_filter <- c('/wiki/Wikipedia:Wikimedia_Deutschland/DeinEngagement',
'/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia',
'/wiki/Wikipedia:F%C3%B6rderung/F%C3%B6rderangebote')
# - wrangle pageviews
wmde_process_pageviews(fileName = fileName,
dataDir = dataDir,
uri_query_filter = uri_query_filter,
page_filter = page_filter,
cetDay = cetDay,
campaignName = campaignName)
### ---------------------------------------------------------------------------
### --- Compose and copy to publicDir
### ---------------------------------------------------------------------------
lF <- list.files(analyticsDir)
lF <- lF[grepl("Aggregated", lF)]
lFp <- lF[grepl("pageviews", lF)]
lFp <- lapply(paste0(analyticsDir, lFp),
data.table::fread)
lFp <- data.table::rbindlist(lFp)
write.csv(lFp,
paste0(publicDir,
"2021_OccasionalEditors_Pageviews.csv"))
lFb <- lF[grepl("banner", lF)]
lFb <- lapply(paste0(analyticsDir, lFb),
data.table::fread)
lFb <- data.table::rbindlist(lFb)
write.csv(lFb,
paste0(publicDir,
"2021_OccasionalEditors_Banners.csv"))
Chart 2.1.1. Pageviews Overview.
dataSet <- read.csv(
'_reporting/2021_OccasionalEditors_Pageviews.csv',
header = T,
row.names = 1,
check.names = F,
stringsAsFactors = F)
dataSet$V1 <- NULL
dataSet <- dataSet %>%
dplyr::filter(!(grepl("2021-11", date)))
dataSet$Tag <- sapply(dataSet$Tag, function(x) {
if (grepl("ipad$", x)) {
"ipad"
} else if (grepl("mobile$", x)) {
"mobile"
} else {
"desktop"
}
})
dataSet$Page <- sapply(dataSet$Page, function(x) {
if (grepl("F%C3%B6rderangebote", x)) {
return("Förderung/Förderangebote")
} else if (grepl("DeinEngagement", x)) {
return("DeinEngagement ")
} else {
return("LerneWikipedia")
}
})
dataSet <- dataSet %>%
dplyr::select(-campaign) %>%
dplyr::group_by(date, Tag, Page) %>%
dplyr::summarise(Pageviews = sum(Pageviews))
# - Visualize w. {ggplot2}
ggplot(dataSet, aes(x = date,
y = log(Pageviews),
group = Page,
color = Page,
fill = Page,
label = Pageviews,
)) +
geom_path(size = .25) +
geom_point(size = 1.5) +
geom_point(size = 1, color = "white") +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Pageviews') +
ylab("log(Pageviews)") +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
facet_wrap(~Tag, ncol = 2) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(strip.text = element_text(size = 15)) +
theme(legend.position = "top")
Table 2.2.1. Pageviews Overview
### --- Full Dataset (Table Report)
datatable(dataSet %>% arrange(desc(Pageviews)))
pFrame <- dataSet %>%
dplyr::select(date, Pageviews) %>%
dplyr::group_by(date) %>%
dplyr::summarise(Pageviews = sum(Pageviews))
Adding missing grouping variables: `Tag`
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = date,
y = Pageviews,
label = Pageviews,
)) +
geom_line(size = .25, group = 1) +
geom_point(size = 1.5) +
geom_point(size = 1, color = "white") +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Pageviews') +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(legend.position = "top")
pFrame <- dataSet %>%
dplyr::select(Tag, Pageviews) %>%
dplyr::group_by(Tag) %>%
dplyr::summarise(Pageviews = sum(Pageviews))
Adding missing grouping variables: `date`
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
y = Pageviews,
label = Pageviews)) +
geom_bar(stat = "identity", fill = "deepskyblue") +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Pageviews') +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(legend.position = "top")
pFrame <- dataSet %>%
ungroup() %>%
dplyr::filter(Page == "DeinEngagement ") %>%
dplyr::select(Tag, Pageviews) %>%
dplyr::group_by(Tag) %>%
dplyr::summarise(Pageviews = sum(Pageviews))
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
y = Pageviews,
label = Pageviews)) +
geom_bar(stat = "identity", fill = "deepskyblue") +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Pageviews') +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(legend.position = "top")
pFrame <- dataSet %>%
ungroup() %>%
dplyr::filter(Page != "DeinEngagement ") %>%
dplyr::select(Tag, Pageviews) %>%
dplyr::group_by(Tag) %>%
dplyr::summarise(Pageviews = sum(Pageviews))
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Tag,
y = Pageviews,
label = Pageviews)) +
geom_bar(stat = "identity", fill = "deepskyblue") +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Pageviews') +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(legend.position = "top")
pFrame <- dataSet %>%
dplyr::select(Page, Pageviews) %>%
dplyr::group_by(Page) %>%
dplyr::summarise(Pageviews = sum(Pageviews))
Adding missing grouping variables: `date`, `Tag`
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = Page,
y = Pageviews,
label = Pageviews)) +
geom_bar(stat = "identity", fill = "darkorange") +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Pageviews') +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(legend.position = "top")
dataSet <- read.csv(
'_reporting/userEdits_Complete.csv',
header = T,
row.names = 1,
check.names = F,
stringsAsFactors = F)
dataSet$ts_revision <- as.character(dataSet$ts_revision)
dataSet$banner <- sapply(dataSet$banner, function(x) {
if (grepl("ipad$", x)) {
"ipad"
} else if (grepl("mobile$", x)) {
"mobile"
} else {
"desktop"
}
})
pFrame <- dataSet %>%
dplyr::filter(!is.na(ts_revision)) %>%
dplyr::select(banner) %>%
dplyr::group_by(banner) %>%
dplyr::summarise(edits = n())
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = banner,
y = edits,
label = edits)) +
geom_bar(stat = "identity", fill = "darkRed") +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Edits') +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(legend.position = "top")
pFrame <- dataSet %>%
dplyr::filter(!is.na(ts_revision)) %>%
dplyr::select(banner, ts_revision)
pFrame$ts_revision <- substr(pFrame$ts_revision, 5, 8)
pFrame$ts_revision <- sapply(pFrame$ts_revision, function(x) {
first <- substr(x, 1, 2)
second <- substr(x, 3, 4)
return(
paste0(first, "-", second)
)
})
pFrame <- pFrame %>%
dplyr::group_by(banner, ts_revision) %>%
dplyr::summarise(edits = n())
pFrame$ts_revision <- as.factor(pFrame$ts_revision)
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = ts_revision,
y = log(edits),
label = edits,
group = banner,
color = banner,
fill = banner)) +
geom_point(size = 1.5) +
geom_point(size = 1, color = "white") +
geom_line(size = .25) +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Edits') +
xlab("Day") + ylab("Log(edits)") +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(legend.position = "top")
pFrame <- pFrame %>%
dplyr::select(ts_revision, edits) %>%
dplyr::group_by(ts_revision) %>%
dplyr::summarise(edits = sum(edits))
Adding missing grouping variables: `banner`
pFrame$ts_revision <- as.factor(pFrame$ts_revision)
# - Visualize w. {ggplot2}
ggplot(pFrame, aes(x = ts_revision,
y = log(edits),
label = edits)) +
geom_line(size = .25, group = 1, color = "darkred") +
geom_point(size = 1.5, color = "darkred") +
geom_point(size = 1, color = "white") +
scale_y_continuous(labels = comma) +
ggtitle('Occasional Editors 2021: Edits') +
xlab("Day") + ylab("Log(edits)") +
theme_minimal() +
geom_text_repel(size = 6, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90, size = 12)) +
theme(plot.title = element_text(size = 15)) +
theme(legend.title = element_blank()) +
theme(legend.text = element_text(size = 15)) +
theme(legend.position = "top")
pFrame <- dataSet %>%
dplyr::filter(!is.na(ts_revision)) %>%
dplyr::select(userid) %>%
dplyr::group_by(userid) %>%
dplyr::summarise(edits = n())
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 20),
c(21, 50),
c(51, 100)
)
pFrame$editClass <- sapply(pFrame$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return(">= 100")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
editClass <- as.data.frame(table(pFrame$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
userEdits <- read.csv(paste0(getwd(),"/_analytics/", "userEdits_History.csv"),
header = TRUE,
check.names = FALSE,
row.names = 1,
stringsAsFactors = FALSE)
userEdits$dateType <- as.character(userEdits$revactor_timestamp)
userEdits$dateType <- substr(userEdits$dateType, 1, 8)
userEdits$dateType <- sapply(userEdits$dateType, function(x) {
d <- paste0(
substr(x, 1, 4),
"-",
substr(x, 5, 6),
"-",
substr(x, 7, 8)
)
return(d)
})
userEdits <- dplyr::arrange(userEdits, dateType) %>%
select(actor_user, dateType)
campaignDays <- c("2021-10-18", "2021-10-19", "2021-10-20", "2021-10-21", "2021-10-22", "2021-10-23",
"2021-10-24", "2021-10-25", "2021-10-26", "2021-10-27", "2021-10-28", "2021-10-29",
"2021-10-30", "2021-10-31")
pFrame <- userEdits %>%
select(dateType) %>%
group_by(dateType) %>%
summarise(Edits = n())
pFrame <- arrange(pFrame, dateType)
pFrame$campaign <- sapply(pFrame$dateType, function(x) {
if (x < "2021-10-18") {
return("Before Campaign")
} else if (x > "2021-10-31") {
return("After Campaign")
} else {
return("Campaign")
}
})
pFrame$date <- as.character(pFrame$dateType)
ggplot(pFrame, aes(x = date,
y = Edits,
group = campaign,
color = campaign,
fill = campaign,
label = Edits,
)) +
geom_path(size = .5) +
geom_point(size = 1.5) +
geom_point(size = 1, fill = "white", color = "white") +
geom_text_repel(size = 3.5, show.legend = FALSE) +
scale_y_continuous(labels = comma) +
ggtitle('2021 WMDE Occasional Editors Banner Campaign') +
xlab("Date") + ylab("Edits") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 6)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(legend.position = "top")