Feedback should be sent to goran.milovanovic_ext@wikimedia.de.
The campaign is run from 2019/01/02 to 2019/01/16.
CURRENT UPDATE: Complete dataset as of 2019/01/16.
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running an R script on stat1007.eqiad.wmnet, collecting the data as .tsv and .csv files, copying them manually, and processing locally.
### --- Data Acquisition for the Thank You 2019 Campaign
### --- run from stat1007
### --- /home/goransm/Analytics/NewEditors/Campaigns/2019_ThankYou
### --- to data directory
dataDir <- '/home/goransm/Analytics/NewEditors/Campaigns/2019_ThankYou/data/'
setwd(dataDir)
### --- determine cetDay: the (CET) day whose data we acquire
library(lubridate)
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete its data acquisition)
# - NOTE: as.Date() with an explicit tz replaces the previous
# - strsplit(as.character(cetDay)) extraction, which silently dropped the
# - time part when run at exactly midnight.
cetDay <- as.Date(cetDay, tz = "Europe/Berlin") - 1
### -----------------------------------------------------------------------
### --- Collect Banner Impression Data
### -----------------------------------------------------------------------
# - function: wmde_collect_banner_impressions
wmde_collect_banner_impressions <- function(uri_host,
                                            uri_path,
                                            uri_query,
                                            cetDay,
                                            queryFile,
                                            fileName,
                                            dataDir) {
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  # - Composes a HiveQL query over wmf.webrequest selecting uri_query for
  # - banner-impression requests on the given CET day (translated to UTC
  # - partition predicates), writes it to queryFile and executes it via
  # - beeline, dumping the result to dataDir/fileName.
  # - Returns the exit status of the beeline call.
  # - to dataDir
  setwd(dataDir)
  # - WHERE condition: hourly sequence spanning the CET day
  cet_hours_seq <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  # - extract the UTC partition components directly with format();
  # - NOTE: the previous as.character()-based extraction dropped the time
  # - part at exactly midnight UTC, which produced an "hour = NA"
  # - predicate for one hour of every day.
  cet_years <- as.integer(format(cet_hours_seq, "%Y", tz = "UTC"))
  cet_months <- as.integer(format(cet_hours_seq, "%m", tz = "UTC"))
  cet_days <- as.integer(format(cet_hours_seq, "%d", tz = "UTC"))
  cet_hours <- as.integer(format(cet_hours_seq, "%H", tz = "UTC"))
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ",
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", datetimeCondition, ")",
                             collapse = " OR ", sep = "")
  # - helper: "<column> = '<v>'" clauses joined by OR
  # - (parenthesized when more than one value is given)
  eq_condition <- function(column, values) {
    clauses <- paste0(column, " = '", values, "'")
    if (length(values) > 1) {
      paste0("(", paste(clauses, collapse = " OR "), ")")
    } else {
      clauses
    }
  }
  # - helper: like eq_condition, but with LIKE '%<v>%' matching
  like_condition <- function(column, values) {
    clauses <- paste0(column, " LIKE '%", values, "%'")
    if (length(values) > 1) {
      paste0("(", paste(clauses, collapse = " OR "), ")")
    } else {
      clauses
    }
  }
  uri_path_condition <- eq_condition("uri_path", uri_path)
  uri_host_condition <- eq_condition("uri_host", uri_host)
  uri_query_condition <- like_condition("uri_query", uri_query)
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;
SELECT uri_query FROM webrequest
WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    uri_query_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script via beeline, redirecting output to the data file
  hiveArgs <- '/usr/local/bin/beeline -f'
  hiveInput <- paste0(queryFile, ' > ', paste0(dataDir, fileName))
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  return(
    system(command = hiveCommand, wait = TRUE))
}
# - parameters for wmde_collect_banner_impressions
# - (Thank You 2019 campaign)
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- '/beacon/impression'
uri_query <- 'WMDE_2019_thx'
queryFile <- 'thankyou2019_BannerImpressions.hql'
fileName <- paste0("bannerImpressions_", cetDay, ".tsv")
# - collect Banner Impression data
wmde_collect_banner_impressions(uri_host = uri_host,
                                uri_path = uri_path,
                                uri_query = uri_query,
                                cetDay = cetDay,
                                queryFile = queryFile,
                                fileName = fileName,
                                dataDir = dataDir)
### -----------------------------------------------------------------------
### --- Wrangle Banner Impression Data
### -----------------------------------------------------------------------
# - function: wmde_process_banner_impressions
# - Parses the raw banner-impression .tsv produced by
# - wmde_collect_banner_impressions(), keeps "show" results, corrects for
# - the impression sampling rate, aggregates impressions per banner and
# - stores the daily aggregate as bannerImpressionsAggregated_<date>.csv.
wmde_process_banner_impressions <- function(fileName,
                                            dataDir,
                                            cetDay,
                                            campaignName) {
  # - to dataDir
  setwd(dataDir)
  # - libraries
  library(stringr)
  library(dplyr)
  # - load the raw beeline output as a single uri_query column
  bannerData <- read.delim(fileName,
                           stringsAsFactors = FALSE,
                           sep = "\t")
  colnames(bannerData) <- 'uri_query'
  # - keep only rows that carry campaign information
  wInfo <- which(grepl("campaign=", bannerData$uri_query))
  bannerData <- bannerData[wInfo, ]
  # - split each query string into its key=value components
  bannerData <- strsplit(bannerData, split = "&", fixed = TRUE)
  # - extract relevant fields
  # - NOTE: sapply() is kept on purpose - a missing key yields
  # - character(0) for that element, which vapply() would reject
  # - banner:
  banner <- sapply(bannerData, function(x) {
    x[which(grepl("^banner=", x))]
  })
  banner <- gsub("^banner=", "", banner)
  # - recordImpressionSampleRate:
  recordImpressionSampleRate <- sapply(bannerData, function(x) {
    x[which(grepl("^recordImpressionSampleRate=", x))]
  })
  recordImpressionSampleRate <- as.numeric(
    gsub("^recordImpressionSampleRate=", "", recordImpressionSampleRate)
  )
  # - result:
  result <- sapply(bannerData, function(x) {
    x[which(grepl("^result=", x))]
  })
  result <- gsub("^result=", "", result)
  # - compose table:
  bannerObservations <- data.frame(banner = banner,
                                   recordImpressionSampleRate = recordImpressionSampleRate,
                                   result = result,
                                   stringsAsFactors = FALSE)
  # - keep genuine banner displays only
  bannerObservations <- dplyr::filter(bannerObservations,
                                      result == "show")
  # - each logged impression represents 1/sampleRate actual impressions
  bannerObservations$recordImpressionSampleRate <-
    1/bannerObservations$recordImpressionSampleRate
  # - aggregate impressions per banner
  bannerObservations <- bannerObservations %>%
    dplyr::select(banner, recordImpressionSampleRate) %>%
    dplyr::group_by(banner) %>%
    dplyr::summarise(impressions = sum(recordImpressionSampleRate))
  # - add cetDay, campaignName
  bannerObservations$date <- cetDay
  bannerObservations$campaign <- campaignName
  # - store: the output date is recovered from the input file name
  # - (bannerImpressions_<date>.tsv -> bannerImpressionsAggregated_<date>.csv)
  write.csv(bannerObservations,
            paste0("bannerImpressionsAggregated_",
                   strsplit(
                     strsplit(fileName, split = "_", fixed = TRUE)[[1]][2],
                     split = ".",
                     fixed = TRUE)[[1]][1],
                   ".csv"
            )
  )
}
# - wrangle Banner Impression data
campaignName <- "thankyou2019"
# - aggregates today's raw banner impressions into
# - bannerImpressionsAggregated_<cetDay>.csv (written into dataDir)
wmde_process_banner_impressions(fileName = fileName,
dataDir = dataDir,
cetDay = cetDay,
campaignName = campaignName)
### -----------------------------------------------------------------------
### --- Collect Pageviews
### -----------------------------------------------------------------------
# - function: wmde_collect_pageviews
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  # - Composes a HiveQL query over wmf.webrequest selecting uri_path,
  # - uri_query and referer for the given CET day (translated to UTC
  # - partition predicates), writes it to queryFile and executes it via
  # - beeline, dumping the result to fileName (under dataDir, the cwd).
  # - Returns the exit status of the beeline call.
  # - to dataDir
  setwd(dataDir)
  # - WHERE condition: hourly sequence spanning the CET day
  cet_hours_seq <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  # - extract the UTC partition components directly with format();
  # - NOTE: the previous as.character()-based extraction dropped the time
  # - part at exactly midnight UTC, which produced an "hour = NA"
  # - predicate for one hour of every day.
  cet_years <- as.integer(format(cet_hours_seq, "%Y", tz = "UTC"))
  cet_months <- as.integer(format(cet_hours_seq, "%m", tz = "UTC"))
  cet_days <- as.integer(format(cet_hours_seq, "%d", tz = "UTC"))
  cet_hours <- as.integer(format(cet_hours_seq, "%H", tz = "UTC"))
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ",
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(", datetimeCondition, ")",
                             collapse = " OR ", sep = "")
  # - helper: "<column> = '<v>'" clauses joined by OR
  # - (parenthesized when more than one value is given)
  eq_condition <- function(column, values) {
    clauses <- paste0(column, " = '", values, "'")
    if (length(values) > 1) {
      paste0("(", paste(clauses, collapse = " OR "), ")")
    } else {
      clauses
    }
  }
  uri_path_condition <- eq_condition("uri_path", uri_path)
  uri_host_condition <- eq_condition("uri_host", uri_host)
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;
SELECT uri_path, uri_query, referer FROM webrequest
WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script via beeline, redirecting output to fileName
  # - (relative to dataDir, the current working directory)
  hiveArgs <- '/usr/local/bin/beeline -f'
  hiveInput <- paste0(queryFile, ' > ', fileName)
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  return(
    system(command = hiveCommand, wait = TRUE))
}
# - set params to wmde_collect_pageviews
# - for the Thank You 2019 campaign
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- c(
'/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia')
queryFile <- 'thankyou2019_Pageviews.hql'
# - fileName is reused below by wmde_process_pageviews()
fileName <- paste0("pageviews_", cetDay, ".tsv")
# - collect Pageviews data
wmde_collect_pageviews(uri_host,
uri_path,
cetDay,
queryFile,
fileName,
dataDir)
### -----------------------------------------------------------------------
### --- Wrangle Pageviews
### -----------------------------------------------------------------------
# - function: wmde_process_pageviews
# - Parses the raw pageviews .tsv produced by wmde_collect_pageviews(),
# - keeps requests matching uri_query_filter (in uri_query or referer),
# - recovers the campaign tag, aggregates pageviews per (tag, page) and
# - stores the daily aggregate as pageviewsAggregated_<date>.csv.
# - NOTE: the previous signature declared self-referential defaults
# - (cetDay = cetDay, campaignName = campaignName) which error with
# - "promise already under evaluation" when relied upon; both arguments
# - are effectively required, so the bogus defaults were removed.
wmde_process_pageviews <- function(fileName,
                                   dataDir,
                                   uri_query_filter,
                                   cetDay,
                                   campaignName) {
  # - to dataDir
  setwd(dataDir)
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  # - load; strip beeline's decoration around the payload
  # - (assumes a header line starting with "uri_path" followed by one
  # - separator line, and two trailing footer lines - TODO confirm
  # - against the beeline output format in use)
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("^uri_path", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 2):(length(pageviewsData) - 2)]
  pageviewsData <- data.frame(dat = pageviewsData,
                              stringsAsFactors = FALSE)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  # - apply uri_query_filter
  # - NOTE: Autumn 2018, looking in both: uri_query, referer
  w_uri_query <- which(grepl(uri_query_filter, pageviewsData$uri_query))
  w_uri_query_referer <- which(grepl(uri_query_filter, pageviewsData$referer))
  w_uri_query <- unique(c(w_uri_query, w_uri_query_referer))
  pageviewsData <- pageviewsData[w_uri_query, ]
  # - blank out referers that do not carry the campaign tag
  w_uri_query_referer <- which(grepl(uri_query_filter, pageviewsData$referer))
  w_uri_query_referer_delete <- setdiff(seq_len(nrow(pageviewsData)),
                                        w_uri_query_referer)
  pageviewsData$referer[w_uri_query_referer_delete] <- ''
  # - when there is no uri_query, use the query from the referer field if present there
  pageviewsData$referer <- str_extract(pageviewsData$referer, "\\?campaign=.*$")
  pageviewsData$referer[is.na(pageviewsData$referer)] <- ""
  pageviewsData$referer <- gsub("?campaign=", "", pageviewsData$referer, fixed = TRUE)
  pageviewsData$uri_query <- gsub("?campaign=", "", pageviewsData$uri_query, fixed = TRUE)
  pageviewsData$uri_query[pageviewsData$uri_query == ""] <-
    pageviewsData$referer[pageviewsData$uri_query == ""]
  pageviewsData <- dplyr::filter(pageviewsData,
                                 uri_query != "")
  pageviewsData$referer <- NULL
  # - clean up: drop anything after a "/" in the recovered tag
  pageviewsData$uri_query <- gsub("/.*$", "", pageviewsData$uri_query)
  # - aggregate pageviews per (tag, page)
  pageviewsData <- pageviewsData %>%
    dplyr::group_by(uri_query, uri_path) %>%
    dplyr::summarise(pageviews = n())
  colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
  # - add cetDay, campaignName
  pageviewsData$date <- cetDay
  pageviewsData$campaign <- campaignName
  # - store: the output date is recovered from the input file name
  write.csv(pageviewsData,
            paste0("pageviewsAggregated_",
                   strsplit(
                     strsplit(fileName, split = "_", fixed = TRUE)[[1]][2],
                     split = ".",
                     fixed = TRUE)[[1]][1],
                   ".csv"
            )
  )
}
# - set params to wmde_process_pageviews
# - for the Thank You 2019 campaign
uri_query_filter <- 'WMDE_2019_thx'
# - wrangle pageviews (fileName still points at today's pageviews .tsv)
wmde_process_pageviews(fileName = fileName,
dataDir = dataDir,
uri_query_filter = uri_query_filter,
cetDay = cetDay,
campaignName = campaignName)
### -----------------------------------------------------------------------
### --- Collect User Registrations
### -----------------------------------------------------------------------
# - function: wmde_collect_registrations
wmde_collect_registrations <- function(logSchema,
                                       web_host,
                                       event_campaign,
                                       cetDay,
                                       dataDir,
                                       fileName,
                                       campaignName) {
  # - Pulls one CET day of account-creation events (log.<logSchema>) from
  # - the analytics MariaDB replica, filters bot registrations, keeps the
  # - relevant fields and stores them as a .csv in dataDir.
  # - expected format for cetDay is: YYYY-MM-DD
  # - WHERE condition: (start_timestamp, stop_timestamp] as UTC MediaWiki
  # - timestamps (YYYYMMDDHHMMSS) spanning the CET day.
  # - NOTE: the previous code built the stop boundary from
  # - paste0(cetDay, " 24:00"); as.POSIXct() cannot parse "24:00" as a
  # - time and silently fell back to midnight of the *same* day, and the
  # - digit extraction also dropped "000000" at midnight. Use midnight of
  # - the following day and format() instead.
  cet_start <- as.POSIXct(paste0(cetDay, " 00:00"), tz = "Europe/Berlin")
  cet_stop <- as.POSIXct(paste0(as.Date(cetDay) + 1, " 00:00"),
                         tz = "Europe/Berlin")
  start_timestamp <- format(cet_start, "%Y%m%d%H%M%S", tz = "UTC")
  stop_timestamp <- format(cet_stop, "%Y%m%d%H%M%S", tz = "UTC")
  # - WHERE condition: create event_campaign_condition
  if (length(event_campaign) > 1) {
    event_campaign_condition <- paste0(
      "(",
      paste(paste0("event_campaign LIKE '%", event_campaign, "%'"),
            collapse = " OR "),
      ")"
    )
  } else {
    event_campaign_condition <- paste0("event_campaign LIKE '%",
                                       event_campaign, "%'")
  }
  # - compose SQL query:
  sqlParams <- 'mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e'
  query <- paste0(
    "\"SELECT * FROM ",
    paste0("log.", logSchema),
    " WHERE ((webHost = '",
    web_host,
    "') AND (timestamp > ",
    start_timestamp,
    ") AND (timestamp <= ",
    stop_timestamp,
    ") AND (",
    event_campaign_condition,
    "));\"")
  sqlOutput <- paste0("> ", paste0(dataDir, "/", fileName))
  # - run command
  qCommand <- paste(sqlParams, query, sqlOutput, sep = " ")
  system(command = qCommand, wait = TRUE)
  # - to dataDir
  setwd(dataDir)
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  # - load
  userReg <- fread(fileName, sep = "\t")
  # - filter bots (the EventLogging userAgent JSON carries an is_bot flag)
  wBot <- which(grepl("\"is_bot\": true", userReg$userAgent))
  if (length(wBot) > 0) {
    userReg <- userReg[-wBot, ]
  }
  # - select fields
  userReg <- userReg %>%
    dplyr::select(event_userId,
                  event_userName,
                  event_isSelfMade,
                  event_campaign,
                  timestamp)
  # - add cetDay, campaignName
  userReg$date <- cetDay
  userReg$campaign <- campaignName
  # - store as .csv under the same base name as the raw .tsv
  write.csv(userReg,
            paste0(
              strsplit(fileName, split = ".", fixed = TRUE)[[1]][1],
              ".csv")
  )
  # - remove temp .tsv file
  file.remove(fileName)
}
# - set params for: wmde_collect_registrations
# - EventLogging table for account creations (schema revision 17719237)
logSchema <- 'ServerSideAccountCreation_17719237'
web_host <- 'de.wikipedia.org'
event_campaign <- 'WMDE_2019_thx'
# - raw dump name; wmde_collect_registrations() writes a .csv with the
# - same base name and removes this .tsv afterwards
fileName <- paste0("userRegistrations_", cetDay, ".tsv")
# - collect user registrations
wmde_collect_registrations(logSchema = logSchema,
web_host = web_host,
event_campaign = event_campaign,
cetDay = cetDay,
dataDir = dataDir,
fileName = fileName,
campaignName = campaignName)
### -----------------------------------------------------------------------
### --- Wrangle User Registrations
### -----------------------------------------------------------------------
# - function: wmde_process_registrations
# - Counts registrations per campaign tag in the daily registrations file
# - and stores the aggregate as userRegistrationsAggreagted_<cetDay>.csv.
wmde_process_registrations <- function(fileName,
                                       dataDir,
                                       cetDay,
                                       campaignName) {
  # - work from the data directory
  setwd(dataDir)
  # - libraries
  library(dplyr)
  library(data.table)
  # - load the daily registrations file
  registrations <- fread(fileName)
  # - registrations per campaign tag
  registrations <- registrations %>%
    dplyr::group_by(event_campaign) %>%
    dplyr::summarise(Registrations = n())
  # - stamp with day and campaign
  registrations$date <- cetDay
  registrations$campaign <- campaignName
  # - store
  # - NOTE: the "Aggreagted" misspelling is kept on purpose - the report
  # - stage greps for exactly this prefix when merging the daily files
  write.csv(registrations,
            paste0('userRegistrationsAggreagted_', cetDay, ".csv")
  )
}
# - set params for: wmde_process_registrations
# - NOTE: reads back the .csv written by wmde_collect_registrations()
fileName <- paste0("userRegistrations_", cetDay, ".csv")
# - wrangle user registrations:
wmde_process_registrations(fileName,
dataDir,
cetDay,
campaignName)
### --- Collect Newsletter registrations - for 2018_AuBC excl.
# - ServerSideAccountCreation_17719237 schema
# - NOTE(review): this query is hard-coded to the Autumn 2018 campaign
# - (timestamp 20181011000000, tag WMDE_neweditors_autumn_2018_lpn, and
# - a 2018_AutumnBannerCampaign output path) - presumably a leftover
# - reused for the newsletter tag; confirm it belongs in this pipeline
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_17719237 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20181011000000) and (event_campaign like '%WMDE_neweditors_autumn_2018_lpn%'));\" > /home/goransm/RScripts/NewEditors/2018_AutumnBannerCampaign/_data/Newsletter_userRegistrations.tsv"
system(command = qCommand, wait = TRUE)
### --- Wrangle Newsletter registrations - for 2018_AuBC excl.
# - function: wmde_process_registrations_general
# - Generalized variant of wmde_process_registrations(): identical
# - aggregation, but the caller chooses the output file name.
wmde_process_registrations_general <- function(fileName,
                                               dataDir,
                                               cetDay,
                                               campaignName,
                                               outFileName) {
  # - work from the data directory
  setwd(dataDir)
  # - libraries
  library(dplyr)
  library(data.table)
  # - load the registrations file
  registrations <- fread(fileName)
  # - registrations per campaign tag
  registrations <- registrations %>%
    dplyr::group_by(event_campaign) %>%
    dplyr::summarise(Registrations = n())
  # - stamp with day and campaign
  registrations$date <- cetDay
  registrations$campaign <- campaignName
  # - store under the caller-provided name
  write.csv(registrations,
            outFileName
  )
}
NOTE: Not run from this report; the data were already pre-processed and aggregated by the following R
script before being submitted to analytical procedures:
### --- Report Generation for the Thank You 2019 campaign
### --- run locally
### --- to data directory
# - local copies of the daily aggregates and of the report outputs
dataDir <-
'/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_data/'
analyticsDir <-
'/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_analytics/'
# - report tables are written to analyticsDir (the working directory)
setwd(analyticsDir)
### --- Report Banner Impression Data
# - function: wmde_report_banner_impressions
wmde_report_banner_impressions <- function(dataDir) {
  # - Loads all daily aggregated banner-impression .csv/.tsv files from
  # - dataDir, merges them, and returns the merged data plus per-banner
  # - and per-day impression totals.
  library(data.table)
  library(dplyr)
  # - list aggregated banner impression files
  lF <- list.files(dataDir)
  lF <- lF[grepl("bannerImpressionsAggregated_", lF, fixed = TRUE)]
  lF <- lF[grepl("csv$|tsv$", lF)]
  # - load and merge; lapply() + rbindlist() also behaves correctly for
  # - an empty file set (the previous 1:length(lF) loop failed on it)
  bannerData <- rbindlist(lapply(lF, function(f) fread(paste0(dataDir, f))))
  # - drop the unnamed row-index column produced by write.csv
  bannerData$V1 <- NULL
  # - aggregates
  perBannerTotals <- bannerData %>%
    select(banner, impressions) %>%
    group_by(banner) %>%
    summarise(totalImpressions = sum(impressions))
  perDayTotals <- bannerData %>%
    select(date, impressions) %>%
    group_by(date) %>%
    summarise(totalImpressions = sum(impressions))
  # - output
  return(
    list(bannerImpressionsReport = bannerData,
         perBannerTotals = perBannerTotals,
         perDayTotals = perDayTotals)
  )
}
# - Report banner impressions
# - merge all daily aggregates and write the report tables into
# - analyticsDir (the current working directory)
bannerImpressionsData <- wmde_report_banner_impressions(dataDir)
bannerImpressionsFile <- bannerImpressionsData$bannerImpressionsReport
write.csv(bannerImpressionsFile, "bannerImpressionsFile.csv")
bannerTotals <- bannerImpressionsData$perBannerTotals
write.csv(bannerTotals, "bannerTotals.csv")
bannerDayTotals <- bannerImpressionsData$perDayTotals
write.csv(bannerDayTotals, "bannerDayTotals.csv")
### --- Report Pageviews Data
# - function: wmde_report_pageviews
wmde_report_pageviews <- function(dataDir) {
  # - Loads all daily aggregated pageviews .csv/.tsv files from dataDir,
  # - merges them, and returns the merged data plus totals per day, tag,
  # - page and their pairwise combinations.
  library(data.table)
  library(dplyr)
  # - list aggregated pageviews files
  lF <- list.files(dataDir)
  lF <- lF[grepl("pageviewsAggregated_", lF, fixed = TRUE)]
  lF <- lF[grepl("csv$|tsv$", lF)]
  # - load and merge; lapply() + rbindlist() also behaves correctly for
  # - an empty file set (the previous 1:length(lF) loop failed on it)
  pageviewsData <- rbindlist(lapply(lF, function(f) fread(paste0(dataDir, f))))
  # - drop the unnamed row-index column produced by write.csv
  pageviewsData$V1 <- NULL
  # - aggregates
  perDayTotals <- pageviewsData %>%
    select(date, Pageviews) %>%
    group_by(date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagTotals <- pageviewsData %>%
    select(Tag, Pageviews) %>%
    group_by(Tag) %>%
    summarise(totalPageviews = sum(Pageviews))
  perPageTotals <- pageviewsData %>%
    select(Page, Pageviews) %>%
    group_by(Page) %>%
    summarise(totalPageviews = sum(Pageviews))
  perPageDayTotals <- pageviewsData %>%
    select(Page, date, Pageviews) %>%
    group_by(Page, date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagDayTotals <- pageviewsData %>%
    select(Tag, date, Pageviews) %>%
    group_by(Tag, date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagPageTotals <- pageviewsData %>%
    select(Tag, Page, Pageviews) %>%
    group_by(Tag, Page) %>%
    summarise(totalPageviews = sum(Pageviews))
  # - output
  return(
    list(pageviewsDataReport = pageviewsData,
         perDayTotals = perDayTotals,
         perTagTotals = perTagTotals,
         perPageTotals = perPageTotals,
         perPageDayTotals = perPageDayTotals,
         perTagDayTotals = perTagDayTotals,
         perTagPageTotals = perTagPageTotals)
  )
}
# - Report pageviews:
# - merge all daily aggregates and write the report tables into
# - analyticsDir (the current working directory)
pageviewsData <- wmde_report_pageviews(dataDir)
pageviewsReportFile <- pageviewsData$pageviewsDataReport
write.csv(pageviewsReportFile, "pageviewsReportFile.csv")
pageviews_perDayTotals <- pageviewsData$perDayTotals
write.csv(pageviews_perDayTotals, "pageviews_perDayTotals.csv")
pageviews_perTagTotals <- pageviewsData$perTagTotals
write.csv(pageviews_perTagTotals, "pageviews_perTagTotals.csv")
pageviews_perPageTotals <- pageviewsData$perPageTotals
write.csv(pageviews_perPageTotals, "pageviews_perPageTotals.csv")
pageviews_perPageDayTotals <- pageviewsData$perPageDayTotals
write.csv(pageviews_perPageDayTotals, "pageviews_perPageDayTotals.csv")
pageviews_perTagDayTotals <- pageviewsData$perTagDayTotals
write.csv(pageviews_perTagDayTotals, "pageviews_perTagDayTotals.csv")
pageviews_perTagPageTotals <- pageviewsData$perTagPageTotals
# - NOTE(review): this output name lacks the "pageviews_" prefix used by
# - all the other report files - confirm whether anything downstream
# - expects exactly "perTagPageTotals.csv" before renaming
write.csv(pageviews_perTagPageTotals, "perTagPageTotals.csv")
### --- Report User Registrations
# - function: wmde_report_registrations
wmde_report_registrations <- function(dataDir) {
  # - Loads all daily aggregated user-registration .csv/.tsv files from
  # - dataDir, merges them, and returns the merged data plus totals per
  # - day, per campaign tag, and per (tag, day).
  library(data.table)
  library(dplyr)
  # - list aggregated user registration files
  # - (the "Aggreagted" misspelling matches the writer's output names)
  lF <- list.files(dataDir)
  lF <- lF[grepl("userRegistrationsAggreagted_", lF, fixed = TRUE)]
  lF <- lF[grepl("csv$|tsv$", lF)]
  # - load and merge; lapply() + rbindlist() also behaves correctly for
  # - an empty file set (the previous 1:length(lF) loop failed on it)
  registrationData <- rbindlist(lapply(lF,
                                       function(f) fread(paste0(dataDir, f))))
  # - drop the unnamed row-index column produced by write.csv
  registrationData$V1 <- NULL
  # - aggregates
  perDayTotals <- registrationData %>%
    select(date, Registrations) %>%
    group_by(date) %>%
    summarise(totalRegistrations = sum(Registrations))
  perTagTotals <- registrationData %>%
    select(event_campaign, Registrations) %>%
    group_by(event_campaign) %>%
    summarise(totalRegistrations = sum(Registrations))
  perTagDayTotals <- registrationData %>%
    select(event_campaign, date, Registrations) %>%
    group_by(event_campaign, date) %>%
    summarise(totalRegistrations = sum(Registrations))
  # - output
  return(
    list(registrationsDataReport = registrationData,
         perDayTotals = perDayTotals,
         perTagTotals = perTagTotals,
         perTagDayTotals = perTagDayTotals)
  )
}
# - Report upon user registrations
# - merge all daily aggregates and write the report tables into
# - analyticsDir (the current working directory)
userRegData <- wmde_report_registrations(dataDir)
userRegistrationsReportFile <- userRegData$registrationsDataReport
write.csv(userRegistrationsReportFile, "userRegistrationsReportFile.csv")
userRegistrations_perDayTotals <- userRegData$perDayTotals
write.csv(userRegistrations_perDayTotals, "userRegistrations_perDayTotals.csv")
userRegistrations_perTagTotals <- userRegData$perTagTotals
write.csv(userRegistrations_perTagTotals, "userRegistrations_perTagTotals.csv")
userRegistrations_perTagDayTotals <- userRegData$perTagDayTotals
write.csv(userRegistrations_perTagDayTotals, "userRegistrations_perTagDayTotals.csv")
All data on user registrations are presented in this section.
Chart 2.1. Registrations per tag and day. Please note: points with no data labels signify 0 user registrations.
# - Standard registrations
dataSet <- read.csv(
  '_analytics/userRegistrationsReportFile.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - exclude the newsletter campaign tag (tracked separately)
dataSet <- dataSet %>%
  filter(!(event_campaign %in% 'WMDE_neweditors_autumn_2018_lpn'))
dataSet$campaign <- NULL
# - chart: registrations per tag and day
# - (fixed: "2919" title typo, a trailing comma inside aes(), and a
# - duplicated scale_y_continuous() layer)
ggplot(dataSet, aes(x = date,
                    y = Registrations,
                    group = event_campaign,
                    color = event_campaign,
                    fill = event_campaign,
                    label = Registrations)) +
  geom_path(size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('Thank You 2019: Registrations per Tag') +
  theme_minimal() +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "top")
### --- Full Dataset (Table Report)
colnames(dataSet)[1] <- 'Tag'
datatable(dataSet %>% arrange(Tag, date, desc(Registrations)))
This data acquisition chunk is not reproducible from this Report; it runs in production on stat1007:
### ------------------------------------------------------
### --- Collect all user edits per day from dewiki.revision
### --- from 2. January 2019, w. event_campaign
### --- update 1: one week after the end: 23rd January
### --- update 2: two weeks after the end: 30th January
### ------------------------------------------------------
localDir <- '/home/goransm/Analytics/NewEditors/Campaigns/2019_ThankYou/data/'
startTimestamp <- '20190102000000' # - campaign begins
# - SQL query: every revision (user id + day) since the campaign start
sqlQuery <- paste0("\"SELECT rev_user, SUBSTR(rev_timestamp, 1, 8)
FROM dewiki.revision WHERE rev_timestamp >= ", startTimestamp, ";\"")
### --- output filename
filename <- paste0(localDir, 'ReferenceEdits_dewiki_20190123', ".tsv")
### --- execute sql script:
sqlArgs <- 'mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-store.eqiad.wmnet -A -e'
sqlInput <- paste0(sqlQuery, " > ", filename)
# - command:
sqlCommand <- paste(sqlArgs, sqlInput)
system(command = sqlCommand, wait = TRUE)
This analysis is scheduled for January 24th and 30th (see Phab).
Wrangle the dataset; visualize campaign edits per day.
# - load the full reference-edits dump (collected on stat1007, copied
# - locally) and the full registration dataset
allEdits <-
fread(
'/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_analytics/ReferenceEdits_dewiki_20190131.tsv'
)
userRegs <- read.csv(
'/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_analytics/fullRegistrationDataset.csv',
header = T,
row.names = 1,
check.names = F,
stringsAsFactors = F)
# - keep only campaign registered users:
userEdits <- allEdits[allEdits$rev_user %in% unique(userRegs$event_userId), ]
# - free the large intermediate objects
rm(userRegs); rm(allEdits); gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 993551 53.1 1770749 94.6 1770749 94.6
Vcells 2671840 20.4 9089513 69.4 9089499 69.4
# - wrangle timestamp
colnames(userEdits) <- c('user_id', 'date')
# - reformat YYYYMMDD -> YYYY-MM-DD; substr()/paste() are vectorized, so
# - the previous per-element sapply() loop is unnecessary
userEdits$date <- paste(substr(userEdits$date, 1, 4),
                        substr(userEdits$date, 5, 6),
                        substr(userEdits$date, 7, 8),
                        sep = "-")
# - aggregate per day
userEditsDay <- userEdits %>%
  group_by(date) %>%
  summarise(edits = n()) %>%
  arrange(date)
# - impute zero edits at missing dates: join the per-day counts onto the
# - full calendar range and fill the gaps with 0
editDateRange <- as.POSIXct(userEditsDay$date)
editDateRange <- seq(min(editDateRange), max(editDateRange), by = "day")
dateFrame <- data.frame(date = as.character(editDateRange),
                        stringsAsFactors = FALSE)
userEditsDay <- left_join(dateFrame, userEditsDay, by = "date")
userEditsDay$edits[is.na(userEditsDay$edits)] <- 0
# - visualize daily edits
# - Visualize w. {ggplot2}
ggplot(userEditsDay, aes(x = date,
                         y = edits,
                         label = edits)) +
  geom_path(size = .5, color = 'blue', group = 1) +
  geom_point(size = 1.5, color = 'blue') +
  geom_point(size = 1.5, color = 'white') +
  geom_text_repel(size = 3.5) +
  # - "2919" -> "2019" typo fixed in the chart title
  ggtitle('Thank You 2019: Edits per day') +
  theme_minimal() +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "top")
# - Edit | 1 | 2-4 | 5-9 | 10-49 | >50
# - inclusive per-user edit-count classes; counts matching no boundary
# - pair (i.e. 50 or more) are labelled ">= 50" below
editBoundaries <- list(
c(0, 1),
c(2, 4),
c(5, 9),
c(10, 49)
)
# - total edits per user over the whole observation window
userEdits_user <- userEdits %>%
group_by(user_id) %>%
summarise(edits = n())
# - map each user's edit count onto its class label, e.g. "(2 - 4)"
# - NOTE(review): the first class renders as "(0 - 1)" although any user
# - present in userEdits has at least one edit - confirm intended labels
userEdits_user$editClass <- sapply(userEdits_user$edits, function(x) {
wEC <- sapply(editBoundaries, function(y) {
x >= y[1] & x <= y[2]
})
if (sum(wEC) == 0) {
return(">= 50")
} else {
return(paste0("(",
editBoundaries[[which(wEC)]][1],
" - ",
editBoundaries[[which(wEC)]][2],
")"
)
)
}
})
# - frequency table of users per edit class
editClass <- as.data.frame(table(userEdits_user$editClass),
stringsAsFactors = F)
colnames(editClass) <- c('Edit Class', 'Num.Users')
# - order classes by their numeric lower bound (">= 50" parses as 50)
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(x) {
lower <- str_extract(x, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
Chart 3.2.1
Q. Also it would be interesting to know when the new users edit: is there a possibility to calculate in which time interval new users are active, e.g. directly after registration, or some days later etc.? On the x-axis would be time after registration in days, with 0 being point of registration and on the y-axis number of edits. The diagram would then show average edits per user x-days after registration. Could this work? Phab
Note. The chart presents the mean number of edits calculated from the total number of edits per day divided by the number of active users (i.e. by how many users edited) on that day.
# - load user registrations per day, per user
userRegistrationsDaily <- fread("_analytics/fullRegistrationDataset.csv")
userRegistrationsDaily <- select(userRegistrationsDaily,
                                 event_userId, date)
# - one row per (registered user, edit day)
editsSinceReg <- userRegistrationsDaily
editsSinceReg <- left_join(editsSinceReg,
                           userEdits,
                           by = c("event_userId" = "user_id")) %>%
  arrange(event_userId)
colnames(editsSinceReg) <- c("user_id", "registration", "edit")
editsSinceReg <- filter(editsSinceReg, !is.na(edit))
editsSinceReg$registration <- as.POSIXct(editsSinceReg$registration)
editsSinceReg$edit <- as.POSIXct(editsSinceReg$edit)
# - days between edit and registration
# - NOTE: difftime() with explicit units replaces the previous bare
# - subtraction divided by 60^2*24 - bare POSIXct subtraction picks its
# - units automatically (secs/mins/hours/days), so dividing by 86400 was
# - only correct when the difference happened to be expressed in seconds
editsSinceReg$diff <- as.numeric(difftime(editsSinceReg$edit,
                                          editsSinceReg$registration,
                                          units = "days"))
# - mean edits per active user, per day offset since registration
avgEditsSinceReg <- editsSinceReg %>%
  select(diff, user_id) %>%
  group_by(diff, user_id) %>%
  summarise(edits = n()) %>%
  select(diff, edits) %>%
  group_by(diff) %>%
  summarise(meanEdits = mean(edits))
# - fill missing day offsets with 0
avgEditsFrame <- data.frame(diff = seq(min(avgEditsSinceReg$diff),
                                       max(avgEditsSinceReg$diff)))
avgEditsFrame <- left_join(avgEditsFrame, avgEditsSinceReg, by = "diff")
avgEditsFrame$meanEdits[is.na(avgEditsFrame$meanEdits)] <- 0
avgEditsFrame$meanEdits <- round(avgEditsFrame$meanEdits, 2)
ggplot(avgEditsFrame, aes(x = diff,
                          y = meanEdits,
                          label = meanEdits)) +
  geom_path(color = "blue", size = .25) +
  geom_point(color = "blue", size = 1.5) +
  geom_point(color = "white", size = 1) +
  ggtitle('Thank You 2019: Average number of edits N days after registration') +
  xlab("Days after registration") + ylab("Mean edits per user") +
  theme_minimal() +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 0, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "right")
Chart 3.2.2
Note. The chart presents the mean number of edits calculated from the total number of edits per day divided by the number of users registered until that day (i.e. by how many campaign-registered users could have edited on that day); see Phab.
# - load user registrations per day, per user
userRegistrationsDaily <- fread("_analytics/fullRegistrationDataset.csv")
userRegistrationsDaily <- select(userRegistrationsDaily,
                                 event_userId, date)
# - cumulative number of campaign registrations up to each day
cumulativeRegistrations <- userRegistrationsDaily %>%
  select(date) %>%
  group_by(date) %>%
  summarise(registrations = n())
cumulativeRegistrations$registrations <-
  cumsum(cumulativeRegistrations$registrations)
# - total edits per day
userEditsPerDay <- userEdits %>%
  select(date) %>%
  group_by(date) %>%
  summarise(edits = n())
userEditsPerDay <- left_join(userEditsPerDay, cumulativeRegistrations,
                             by = "date")
# - days with edits but no registration row get the final cumulative
# - total
# - NOTE(review): this assumes such days fall after the last registration
# - day - confirm registrations cannot be missing mid-range
userEditsPerDay$registrations[is.na(userEditsPerDay$registrations)] <-
  max(userEditsPerDay$registrations, na.rm = TRUE)
userEditsPerDay$meanEdits <- round(userEditsPerDay$edits /
                                     userEditsPerDay$registrations, 3)
# - fill missing calendar dates with 0 mean edits
dateRangeFrame <- data.frame(date =
                               as.character(
                                 seq(min(as.POSIXct(userEditsPerDay$date)),
                                     max(as.POSIXct(userEditsPerDay$date)),
                                     by = "day")),
                             stringsAsFactors = FALSE)
userEditsPerDay <- left_join(dateRangeFrame, userEditsPerDay, by = "date")
userEditsPerDay$meanEdits[is.na(userEditsPerDay$meanEdits)] <- 0
# - NOTE(review): the x axis here is the calendar date, not days after
# - registration - the title/xlab look copied from the previous chart
ggplot(userEditsPerDay, aes(x = date,
                            y = meanEdits,
                            label = meanEdits)) +
  geom_path(color = "blue", size = .25, group = 1) +
  geom_point(color = "blue", size = 1.5) +
  geom_point(color = "white", size = 1) +
  ggtitle('Thank You 2019: Average number of edits N days after registration') +
  xlab("Days after registration") + ylab("Mean edits per user") +
  theme_minimal() +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "right")
Chart 3.2.3
Note. The chart presents the mean number of edits calculated from the total number of edits per day divided by the number of active users (i.e. by how many users have edited) until that day; see Phab.
# - Chart 3.2.3: total edits per day divided by the cumulative number of
# - distinct users who have edited up to (and including) that day.
# - (FIX: previous header comment wrongly described a registrations load.)
userEditsRev <- userEdits %>%
  arrange(date)
# - FIX: the original accumulated the editor set with a loop that
# - re-unlisted the entire history on every iteration (O(n^2));
# - marking each user's chronologically first edit and cumulatively
# - summing per day is equivalent and linear.
firstAppearance <- !duplicated(userEditsRev$user_id)
editorsDaily <- data.frame(date = userEditsRev$date,
                           newEditors = firstAppearance,
                           stringsAsFactors = FALSE) %>%
  group_by(date) %>%
  summarise(newEditors = sum(newEditors))
editorsDaily$editors <- cumsum(editorsDaily$newEditors)
editorsDaily$newEditors <- NULL
# - total edits per day
userEditsPerDay <- userEdits %>%
  select(date) %>%
  group_by(date) %>%
  summarise(edits = n())
userEditsPerDay <- left_join(userEditsPerDay, editorsDaily,
                             by = "date")
userEditsPerDay$meanEdits <- round(userEditsPerDay$edits/userEditsPerDay$editors, 3)
# - complete the date range so days w/o edits appear as 0
dateRangeFrame <- data.frame(date =
                               as.character(
                                 seq(min(as.POSIXct(userEditsPerDay$date)),
                                     max(as.POSIXct(userEditsPerDay$date)),
                                     by = "day")),
                             stringsAsFactors = FALSE)
userEditsPerDay <- left_join(dateRangeFrame, userEditsPerDay, by = "date")
userEditsPerDay$meanEdits[is.na(userEditsPerDay$meanEdits)] <- 0
ggplot(userEditsPerDay, aes(x = date,
                            y = meanEdits,
                            label = meanEdits)) +
  geom_path(color = "blue", size = .25, group = 1) +
  geom_point(color = "blue", size = 1.5) +
  geom_point(color = "white", size = 1) +
  # - FIX: title read "2919" and was copy-pasted from chart 3.2.1; this
  # - chart shows per-day means over active editors (see Note)
  ggtitle('Thank You 2019: Average number of edits per active editor, per day') +
  xlab("Date") + ylab("Mean edits per user") +
  theme_minimal() +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "right")
The dataset is obtained directly from the maintainer.
# - training modules data, as delivered by the maintainer
trainModules <- fread(
  '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_analytics/wmde_training_data_2019-01.csv')
# - get user registrations w. user names for joins
locDir <-
  '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_data/'
# - collect all userRegistrations_* batch files and stack them
lF <- grep("userRegistrations_", list.files(locDir), value = TRUE)
userReg <- rbindlist(
  lapply(lF, function(f) fread(paste0(locDir, f)))
)
# - drop the row-name artifact column from the .csv exports
userReg$V1 <- NULL
# - map training-module usernames onto event_userId via registrations
trainModules <- left_join(trainModules,
                          select(userReg, event_userId, event_userName),
                          by = c("username" = "event_userName"))
# - keep only trainees matched to a campaign registration
trainModules <- filter(trainModules, !is.na(event_userId))
trainModules$username <- NULL
49 new editors have started at least one training module; 32 have completed at least one training module and 19 did not.
The following table 4.2.1 gives an overview of how many users have completed a particular training module.
# - Table 4.2.1: number of editors who COMPLETED each training module.
tmUsers <- trainModules %>%
  select(event_userId, training_module, module_completion_date) %>%
  # - FIX: ifelse(x == '', F, T) replaced by the direct (and equivalent)
  # - logical test; an empty completion date means "not completed"
  mutate(module_completion_date = module_completion_date != '') %>%
  group_by(event_userId, training_module, module_completion_date) %>%
  summarise(n = n()) %>%
  filter(module_completion_date == TRUE) %>%
  select(training_module) %>%
  group_by(training_module) %>%
  summarise(editors = n())
datatable(tmUsers)
Table 4.2.2 gives an overview of how many users have started a particular training module.
# - Table 4.2.2: number of editors who STARTED each training module
# - (trainModules holds one row per started module).
tmUsers <- trainModules %>%
  group_by(training_module) %>%
  summarise(editors = n())
datatable(tmUsers)
Table 4.2.3 The last slide from a training module completed by new editors; the editors
column shows the number of editors who have abandoned (or completed) their training at the respective slide.
# - Table 4.2.3: per (module, last completed slide) pair, the number of
# - editors whose training stopped — or finished — on that slide.
tmLastSlide <- summarise(
  group_by(trainModules, training_module, last_slide_completed),
  editors = n()
)
datatable(tmLastSlide)
Table 4.2.4 User edits: training modules completed vs not completed.
# - Table 4.2.4: total number of edits, split by the editing user's
# - training outcome (Completed / Started / No training).
tmEdits <- left_join(userEdits, trainModules, by = c('user_id' = 'event_userId'))
tmEdits <- tmEdits %>%
  select(training_module, module_completion_date)
# - FIX: the element-wise sapply() classifiers replaced by equivalent
# - vectorized logic. NA arises for users w/o any training record (no
# - join match); '' marks a started-but-unfinished module.
tmEdits$completed <- !is.na(tmEdits$module_completion_date) &
  tmEdits$module_completion_date != ''
tmEdits$started <- !is.na(tmEdits$training_module) &
  tmEdits$training_module != ''
tmEdits$module_completion_date <- NULL
tmEdits$training_module <- NULL
# - outcome precedence: Completed > Started > No training
tmEdits$training_outcome <- ifelse(tmEdits$completed, 'Completed', NA)
tmEdits$training_outcome <- ifelse(tmEdits$started & is.na(tmEdits$training_outcome),
                                   'Started', tmEdits$training_outcome)
tmEdits$training_outcome[is.na(tmEdits$training_outcome)] <- 'No training'
tmEdits <- tmEdits %>%
  group_by(training_outcome) %>%
  summarise(edits = n())
datatable(tmEdits)
Table 4.2.5 User edits: average number of edits per user, training modules completed vs not completed.
# - Table 4.2.5: mean number of edits per user, split by training outcome.
tmEdits <- left_join(userEdits, trainModules, by = c('user_id' = 'event_userId'))
tmEdits <- tmEdits %>%
  select(user_id, training_module, module_completion_date)
# - FIX: the element-wise sapply() classifiers replaced by equivalent
# - vectorized logic (NA = no training record, '' = unfinished module)
tmEdits$completed <- !is.na(tmEdits$module_completion_date) &
  tmEdits$module_completion_date != ''
tmEdits$started <- !is.na(tmEdits$training_module) &
  tmEdits$training_module != ''
tmEdits$module_completion_date <- NULL
tmEdits$training_module <- NULL
# - outcome precedence: Completed > Started > No training
tmEdits$training_outcome <- ifelse(tmEdits$completed, 'Completed', NA)
tmEdits$training_outcome <- ifelse(tmEdits$started & is.na(tmEdits$training_outcome),
                                   'Started', tmEdits$training_outcome)
tmEdits$training_outcome[is.na(tmEdits$training_outcome)] <- 'No training'
# - edits per (user, outcome), then the mean across users per outcome
tmEdits <- tmEdits %>%
  group_by(user_id, training_outcome) %>%
  summarise(edits = n()) %>%
  group_by(training_outcome) %>%
  summarise(`mean(Edits)` = round(mean(edits), 2))
datatable(tmEdits)
Do you see in the tables from Ragesoss how many people completed or began the modules in general — regardless of whether they registered or not? Question behind the question: I would like to know how many people potentially just did the module, but didn’t register beforehand. (see: Phab)
# - training modules data (reloaded: the joined copy above kept only
# - usernames matched to a campaign registration)
trainModules <- fread(
  '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_analytics/wmde_training_data_2019-01.csv')
# - distinct usernames, minus the two maintainers (Stefan and Sage)
sum(!duplicated(trainModules$username)) - 2
[1] 387
387 people took the training modules irrespective of registration.