Feedback should be sent to goran.milovanovic_ext@wikimedia.de
.
The campaign is run from 2019/01/02 to 2019/01/16.
CURRENT UPDATE: Complete dataset as of 2019/01/16.
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by an R script run on stat1007.eqiad.wmnet, which stores them as .tsv and .csv files; these files are then copied manually and processed locally.
### --- Data Acquisition for the Thank You 2019 Campaign
### --- run from stat1007
### --- /home/goransm/Analytics/NewEditors/Campaigns/2019_ThankYou
### --- to data directory
dataDir <- '/home/goransm/Analytics/NewEditors/Campaigns/2019_ThankYou/data/'
setwd(dataDir)
### --- determine cetDay
library(lubridate)
# - current system time, re-expressed in the campaign's timezone (CET)
cetDay <- Sys.time()
cetDay
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete its data acquisition)
# - FIX: as.Date() with an explicit tz replaces the former
# - as.character()/strsplit() round-trip; the character representation of a
# - POSIXct can drop the time part (e.g. exactly at midnight), making the
# - string-splitting approach fragile. as.Date(, tz =) is direct and robust.
cetDay <- as.Date(cetDay, tz = "Europe/Berlin") - 1
### -----------------------------------------------------------------------
### --- Collect Banner Impression Data
### -----------------------------------------------------------------------
# - function: wmde_collect_banner_impressions
# - Composes a HiveQL query against wmf.webrequest for all banner-impression
# - beacon requests on one CET day (expressed as 24 hourly UTC partitions),
# - writes the query to queryFile, and runs it through beeline, redirecting
# - the result set to dataDir/fileName.
# - Parameters:
# -   uri_host   character vector of hosts (OR-ed in the WHERE clause)
# -   uri_path   character vector of paths (OR-ed)
# -   uri_query  character vector of substrings matched via LIKE (OR-ed)
# -   cetDay     day to collect, expected format: YYYY-MM-DD (CET)
# -   queryFile  file the .hql query is written to (relative to dataDir)
# -   fileName   output .tsv file name
# -   dataDir    working/data directory
# - Returns: the exit status of the beeline call (0 on success).
wmde_collect_banner_impressions <- function(uri_host,
                                            uri_path,
                                            uri_query,
                                            cetDay,
                                            queryFile,
                                            fileName,
                                            dataDir) {
  # - to dataDir
  setwd(dataDir)
  # - libraries (stringr kept attached: later code relies on it being loaded)
  library(stringr)
  # - WHERE condition: the 24 hours of the CET day, shifted to UTC
  # - (wmf.webrequest is partitioned by UTC year/month/day/hour)
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  attr(cet_condition, "tzone") <- "UTC"
  # - FIX: extract the partition fields with explicit format() calls.
  # - The former as.character()/strsplit() pipeline can drop the time part
  # - for the one timestamp that falls on UTC midnight (character formatting
  # - of POSIXct omits an all-zero time), which yielded "hour = NA" in the
  # - query. as.integer() also strips leading zeros, as gsub("^0", "") did.
  cet_years <- as.integer(format(cet_condition, "%Y"))
  cet_months <- as.integer(format(cet_condition, "%m"))
  cet_days <- as.integer(format(cet_condition, "%d"))
  cet_hours <- as.integer(format(cet_condition, "%H"))
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ",
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(",
                             datetimeCondition,
                             ")",
                             collapse = " OR ",
                             sep = "")
  # - local helper: OR-joined condition over a vector of values;
  # - parenthesized only when more than one value is given (as before)
  sqlOrCondition <- function(field, values, like = FALSE) {
    term <- if (like) {
      paste0(field, " LIKE '%", values, "%'")
    } else {
      paste0(field, " = '", values, "'")
    }
    if (length(values) > 1) {
      paste0("(", paste(term, collapse = " OR "), ")")
    } else {
      term
    }
  }
  uri_path_condition <- sqlOrCondition("uri_path", uri_path)
  uri_host_condition <- sqlOrCondition("uri_host", uri_host)
  uri_query_condition <- sqlOrCondition("uri_query", uri_query, like = TRUE)
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;
SELECT uri_query FROM webrequest
WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    uri_query_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script: beeline reads queryFile, stdout goes to the tsv
  hiveArgs <- '/usr/local/bin/beeline -f'
  hiveInput <- paste0(queryFile, ' > ', paste0(dataDir, fileName))
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  return(
    system(command = hiveCommand, wait = TRUE))
}
# - parameters for wmde_collect_banner_impressions
# - (Thank You 2019 campaign)
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- '/beacon/impression'
uri_query <- 'WMDE_2019_thx'
queryFile <- 'thankyou2019_BannerImpressions.hql'
fileName <- paste0("bannerImpressions_", cetDay, ".tsv")
# - run the banner impression collection (named arguments for clarity)
wmde_collect_banner_impressions(uri_host = uri_host,
                                uri_path = uri_path,
                                uri_query = uri_query,
                                cetDay = cetDay,
                                queryFile = queryFile,
                                fileName = fileName,
                                dataDir = dataDir)
### -----------------------------------------------------------------------
### --- Wrangle Banner Impression Data
### -----------------------------------------------------------------------
# - function: wmde_process_banner_impressions
# - Parses the raw banner-impression .tsv produced by
# - wmde_collect_banner_impressions(), keeps only records with result=show,
# - corrects counts for the impression sampling rate (each logged hit
# - stands for 1/recordImpressionSampleRate real impressions), aggregates
# - impressions per banner, and writes
# - bannerImpressionsAggregated_<date>.csv to dataDir.
wmde_process_banner_impressions <- function(fileName,
                                            dataDir,
                                            cetDay,
                                            campaignName) {
  # - to dataDir
  setwd(dataDir)
  # - libraries
  library(stringr)
  library(dplyr)
  # - load raw beeline output (single uri_query column)
  bannerData <- read.delim(fileName,
                           stringsAsFactors = FALSE,
                           sep = "\t")
  colnames(bannerData) <- 'uri_query'
  # - keep only genuine impression records (must carry a campaign= field)
  wInfo <- which(grepl("campaign=", bannerData$uri_query))
  bannerData <- bannerData[wInfo, ]
  # - split each query string into its key=value fields
  bannerData <- strsplit(bannerData, split = "&", fixed = TRUE)
  # - local helper: first value of a key=value field, NA when absent.
  # - FIX: the former bare sapply() extraction returns character(0) for a
  # - record missing the field, which collapses the result into a list and
  # - breaks the data.frame() construction below.
  extractField <- function(queries, key) {
    pattern <- paste0("^", key, "=")
    sapply(queries, function(x) {
      hit <- x[grepl(pattern, x)]
      if (length(hit) == 0) NA_character_ else gsub(pattern, "", hit[1])
    }, USE.NAMES = FALSE)
  }
  banner <- extractField(bannerData, "banner")
  recordImpressionSampleRate <- as.numeric(
    extractField(bannerData, "recordImpressionSampleRate")
  )
  result <- extractField(bannerData, "result")
  # - compose table:
  bannerObservations <- data.frame(banner = banner,
                                   recordImpressionSampleRate = recordImpressionSampleRate,
                                   result = result,
                                   stringsAsFactors = FALSE)
  # - keep only impressions that were actually shown
  bannerObservations <- dplyr::filter(bannerObservations,
                                      result == "show")
  # - sampling correction
  bannerObservations$recordImpressionSampleRate <-
    1/bannerObservations$recordImpressionSampleRate
  # - aggregate impressions per banner
  bannerObservations <- bannerObservations %>%
    dplyr::select(banner, recordImpressionSampleRate) %>%
    dplyr::group_by(banner) %>%
    dplyr::summarise(impressions = sum(recordImpressionSampleRate))
  # - add cetDay, campaignName
  bannerObservations$date <- cetDay
  bannerObservations$campaign <- campaignName
  # - store: output name carries the date taken from the input file name
  write.csv(bannerObservations,
            paste0("bannerImpressionsAggregated_",
                   strsplit(
                     strsplit(fileName, split = "_", fixed = TRUE)[[1]][2],
                     split = ".",
                     fixed = TRUE)[[1]][1],
                   ".csv"
            )
  )
}
# - wrangle Banner Impression data
# - campaignName labels every aggregated row and is reused by all
# - subsequent processing steps in this script
campaignName <- "thankyou2019"
# - reads bannerImpressions_<cetDay>.tsv from dataDir and writes
# - bannerImpressionsAggregated_<cetDay>.csv next to it
wmde_process_banner_impressions(fileName = fileName,
dataDir = dataDir,
cetDay = cetDay,
campaignName = campaignName)
### -----------------------------------------------------------------------
### --- Collect Pageviews
### -----------------------------------------------------------------------
# - function: wmde_collect_pageviews
# - Composes a HiveQL query against wmf.webrequest for all requests to the
# - given hosts/paths on one CET day (24 hourly UTC partitions), writes the
# - query to queryFile, and runs it through beeline, redirecting the result
# - set (uri_path, uri_query, referer) to dataDir/fileName.
# - Parameters mirror wmde_collect_banner_impressions (no uri_query filter
# - here; filtering happens later in wmde_process_pageviews).
# - Returns: the exit status of the beeline call (0 on success).
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  # - NOTE: expected format for cetDay is: YYYY-MM-DD
  # - to dataDir
  setwd(dataDir)
  # - libraries (stringr kept attached: later code relies on it being loaded)
  library(stringr)
  # - WHERE condition: the 24 hours of the CET day, shifted to UTC
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  attr(cet_condition, "tzone") <- "UTC"
  # - FIX: explicit format() calls instead of as.character()/strsplit();
  # - the old pipeline can drop the time part for the timestamp that falls
  # - on UTC midnight, producing "hour = NA" in the query. as.integer()
  # - strips leading zeros, as gsub("^0", "") did before.
  cet_years <- as.integer(format(cet_condition, "%Y"))
  cet_months <- as.integer(format(cet_condition, "%m"))
  cet_days <- as.integer(format(cet_condition, "%d"))
  cet_hours <- as.integer(format(cet_condition, "%H"))
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ",
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(",
                             datetimeCondition,
                             ")",
                             collapse = " OR ",
                             sep = "")
  # - local helper: OR-joined equality condition over a vector of values
  sqlOrCondition <- function(field, values) {
    term <- paste0(field, " = '", values, "'")
    if (length(values) > 1) {
      paste0("(", paste(term, collapse = " OR "), ")")
    } else {
      term
    }
  }
  uri_path_condition <- sqlOrCondition("uri_path", uri_path)
  uri_host_condition <- sqlOrCondition("uri_host", uri_host)
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;
SELECT uri_path, uri_query, referer FROM webrequest
WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script; output path made explicit (dataDir prefix) for
  # - consistency with wmde_collect_banner_impressions — the working
  # - directory is dataDir at this point, so the target file is unchanged
  hiveArgs <- '/usr/local/bin/beeline -f'
  hiveInput <- paste0(queryFile, ' > ', paste0(dataDir, fileName))
  # - command:
  hiveCommand <- paste(hiveArgs, hiveInput)
  return(
    system(command = hiveCommand, wait = TRUE))
}
# - parameters for wmde_collect_pageviews
# - (Thank You 2019 campaign)
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia'
queryFile <- 'thankyou2019_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")
# - run the pageviews collection (named arguments for clarity)
wmde_collect_pageviews(uri_host = uri_host,
                       uri_path = uri_path,
                       cetDay = cetDay,
                       queryFile = queryFile,
                       fileName = fileName,
                       dataDir = dataDir)
### -----------------------------------------------------------------------
### --- Wrangle Pageviews
### -----------------------------------------------------------------------
# - function: wmde_process_pageviews
# - Parses the raw pageviews .tsv produced by wmde_collect_pageviews(),
# - keeps rows carrying uri_query_filter (in uri_query or referer),
# - recovers the campaign tag from the referer when uri_query is empty,
# - aggregates pageviews per (Tag, Page), and writes
# - pageviewsAggregated_<date>.csv to dataDir.
# - FIX: cetDay and campaignName formerly defaulted to themselves
# - (cetDay = cetDay), which raises "promise already under evaluation"
# - whenever the default would actually be used; they are now plain
# - required arguments (all in-file callers already supply them).
wmde_process_pageviews <- function(fileName,
                                   dataDir,
                                   uri_query_filter,
                                   cetDay,
                                   campaignName) {
  # - to dataDir
  setwd(dataDir)
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  # - load raw beeline output; drop the header decoration and trailing
  # - summary lines (assumes beeline's standard console framing:
  # - header line + separator, two footer lines — TODO confirm)
  pageviewsData <- readLines(fileName)
  wStart <- which(grepl("^uri_path", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 2):(length(pageviewsData) - 2)]
  pageviewsData <- data.frame(dat = pageviewsData,
                              stringsAsFactors = FALSE)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  # - apply uri_query_filter
  # - NOTE: Autumn 2018, looking in both: uri_query, referer
  w_uri_query <- which(grepl(uri_query_filter, pageviewsData$uri_query))
  w_uri_query_referer <- which(grepl(uri_query_filter, pageviewsData$referer))
  w_uri_query <- unique(c(w_uri_query, w_uri_query_referer))
  pageviewsData <- pageviewsData[w_uri_query, ]
  # - blank out referers that do not carry the campaign tag
  # - (seq_len() replaces 1:dim(...)[1], which mis-iterates on 0 rows)
  w_uri_query_referer <- which(grepl(uri_query_filter, pageviewsData$referer))
  w_uri_query_referer_delete <- setdiff(seq_len(nrow(pageviewsData)),
                                        w_uri_query_referer)
  pageviewsData$referer[w_uri_query_referer_delete] <- ''
  # - when there is no uri_query, use the query from the referer if present
  pageviewsData$referer <- str_extract(pageviewsData$referer, "\\?campaign=.*$")
  pageviewsData$referer[is.na(pageviewsData$referer)] <- ""
  pageviewsData$referer <- gsub("?campaign=", "", pageviewsData$referer, fixed = TRUE)
  pageviewsData$uri_query <- gsub("?campaign=", "", pageviewsData$uri_query, fixed = TRUE)
  pageviewsData$uri_query[pageviewsData$uri_query == ""] <-
    pageviewsData$referer[pageviewsData$uri_query == ""]
  pageviewsData <- dplyr::filter(pageviewsData,
                                 uri_query != "")
  pageviewsData$referer <- NULL
  # - strip anything following a slash from the campaign tag
  pageviewsData$uri_query <- gsub("/.*$", "", pageviewsData$uri_query)
  # - aggregate pageviews per (tag, page)
  pageviewsData <- pageviewsData %>%
    dplyr::group_by(uri_query, uri_path) %>%
    dplyr::summarise(pageviews = n())
  colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
  # - add cetDay, campaignName
  pageviewsData$date <- cetDay
  pageviewsData$campaign <- campaignName
  # - store: output name carries the date taken from the input file name
  write.csv(pageviewsData,
            paste0("pageviewsAggregated_",
                   strsplit(
                     strsplit(fileName, split = "_", fixed = TRUE)[[1]][2],
                     split = ".",
                     fixed = TRUE)[[1]][1],
                   ".csv"
            )
  )
}
# - set params to wmde_process_pageviews
# - for the Thank You 2019 campaign
# - the campaign tag looked for in uri_query / referer
uri_query_filter <- 'WMDE_2019_thx'
# - wrangle pageviews: reads pageviews_<cetDay>.tsv from dataDir and
# - writes pageviewsAggregated_<cetDay>.csv next to it
wmde_process_pageviews(fileName = fileName,
dataDir = dataDir,
uri_query_filter = uri_query_filter,
cetDay = cetDay,
campaignName = campaignName)
### -----------------------------------------------------------------------
### --- Collect User Registrations
### -----------------------------------------------------------------------
# - function: wmde_collect_registrations
# - Pulls account-creation events for one CET day (expressed as a UTC
# - timestamp interval) from the EventLogging MariaDB log database,
# - filters out bot-flagged user agents, keeps the relevant fields, and
# - stores the result as a .csv in dataDir (the temp .tsv is removed).
# - FIX: all library() calls are now made up-front; str_extract_all() /
# - format work was formerly performed before library(stringr), which only
# - worked because earlier functions in this script happened to attach it.
wmde_collect_registrations <- function(logSchema,
                                       web_host,
                                       event_campaign,
                                       cetDay,
                                       dataDir,
                                       fileName,
                                       campaignName) {
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  # - WHERE condition: UTC start/stop timestamps spanning the CET day
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay," 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay," 24:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  attr(cet_condition, "tzone") <- "UTC"
  # - MediaWiki timestamp format: YYYYMMDDHHMMSS (digits only).
  # - FIX: explicit format() replaces digit-scraping from as.character(),
  # - which silently yields a truncated 8-digit stamp for a timestamp whose
  # - character form omits an all-zero time component.
  start_timestamp <- format(cet_condition[1], "%Y%m%d%H%M%S")
  stop_timestamp <- format(tail(cet_condition, 1), "%Y%m%d%H%M%S")
  # - WHERE condition: create event_campaign_condition
  if (length(event_campaign) > 1) {
    event_campaign_condition <- paste0("(",
                                       paste(
                                         paste0("event_campaign LIKE '%", event_campaign, "%'"),
                                         collapse = " OR "),
                                       ")"
    )
  } else {
    event_campaign_condition <- paste0("event_campaign LIKE '%", event_campaign, "%'")
  }
  # - compose SQL query:
  sqlParams <- 'mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e'
  query <- paste0(
    "\"SELECT * FROM ",
    paste0("log.", logSchema),
    " WHERE ((webHost = '",
    web_host,
    "') AND (timestamp > ",
    start_timestamp,
    ") AND (timestamp <= ",
    stop_timestamp,
    ") AND (",
    event_campaign_condition,
    "));\"")
  # - NOTE(review): dataDir already ends in "/" at the call site, so this
  # - produces a double slash — harmless on Linux, kept for compatibility
  sqlOutput <- paste0("> ", paste0(dataDir, "/", fileName))
  # - run command
  qCommand <- paste(sqlParams, query, sqlOutput, sep = " ")
  system(command = qCommand, wait = TRUE)
  # - to dataDir
  setwd(dataDir)
  # - load the raw result
  userReg <- fread(fileName, sep = "\t")
  # - filter out bot-flagged user agents
  wBot <- which(grepl("\"is_bot\": true", userReg$userAgent))
  if (length(wBot) > 0) {
    userReg <- userReg[-wBot, ]
  }
  # - select fields
  userReg <- userReg %>%
    dplyr::select(event_userId,
                  event_userName,
                  event_isSelfMade,
                  event_campaign,
                  timestamp)
  # - add cetDay, campaignName
  userReg$date <- cetDay
  userReg$campaign <- campaignName
  # - store as .csv under the same basename as the temp .tsv
  write.csv(userReg,
            paste0(
              strsplit(fileName, split = ".", fixed = TRUE)[[1]][1],
              ".csv")
  )
  # - remove temp .tsv file
  file.remove(fileName)
}
# - set params for: wmde_collect_registrations
# - EventLogging schema (table) holding account-creation events
logSchema <- 'ServerSideAccountCreation_17719237'
web_host <- 'de.wikipedia.org'
# - campaign tag matched with LIKE against event_campaign
event_campaign <- 'WMDE_2019_thx'
# - temp .tsv name; the function stores the final output as .csv and
# - removes this file
fileName <- paste0("userRegistrations_", cetDay, ".tsv")
# - collect user registrations
wmde_collect_registrations(logSchema = logSchema,
web_host = web_host,
event_campaign = event_campaign,
cetDay = cetDay,
dataDir = dataDir,
fileName = fileName,
campaignName = campaignName)
### -----------------------------------------------------------------------
### --- Wrangle User Registrations
### -----------------------------------------------------------------------
# - function: wmde_process_registrations
# - Aggregates the collected user registrations per campaign tag and
# - writes userRegistrationsAggreagted_<cetDay>.csv to dataDir.
# - NOTE(review): the "Aggreagted" misspelling in the output file name is
# - load-bearing — wmde_report_registrations() greps for exactly this
# - pattern — so it must not be "fixed" in isolation.
wmde_process_registrations <- function(fileName,
                                       dataDir,
                                       cetDay,
                                       campaignName) {
  # - work inside the data directory
  setwd(dataDir)
  # - libraries
  library(dplyr)
  library(data.table)
  # - load the raw registration data
  registrations <- fread(fileName)
  # - aggregate: number of registrations per campaign tag
  registrations <- registrations %>%
    dplyr::group_by(event_campaign) %>%
    dplyr::summarise(Registrations = n())
  # - attach the reporting date and the campaign label
  registrations$date <- cetDay
  registrations$campaign <- campaignName
  # - persist the aggregate (name pattern matched by the report step)
  write.csv(registrations,
            paste0('userRegistrationsAggreagted_', cetDay, ".csv"))
}
# - parameters for wmde_process_registrations: the collection step stored
# - the registrations as a .csv under this name
fileName <- paste0("userRegistrations_", cetDay, ".csv")
# - wrangle user registrations (named arguments for clarity)
wmde_process_registrations(fileName = fileName,
                           dataDir = dataDir,
                           cetDay = cetDay,
                           campaignName = campaignName)
### --- Collect Newsletter registrations - for 2018_AuBC excl.
# - ServerSideAccountCreation_17719237 schema
# - One-off mysql pull of all registrations tagged with the Autumn 2018
# - newsletter campaign (WMDE_neweditors_autumn_2018_lpn), from the
# - campaign start (timestamp >= 20181011000000) onwards, written into the
# - 2018 Autumn campaign's data directory.
# - NOTE(review): this targets the *2018* campaign and a 2018 output path —
# - confirm it is intentionally retained in the 2019 Thank You script.
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_17719237 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20181011000000) and (event_campaign like '%WMDE_neweditors_autumn_2018_lpn%'));\" > /home/goransm/RScripts/NewEditors/2018_AutumnBannerCampaign/_data/Newsletter_userRegistrations.tsv"
system(command = qCommand, wait = TRUE)
### --- Wrangle Newsletter registrations - for 2018_AuBC excl.
# - function: wmde_process_registrations_general
# - Generalized variant of wmde_process_registrations(): aggregates
# - registrations per campaign tag from fileName and writes the result to
# - an explicitly supplied outFileName (instead of a derived name).
wmde_process_registrations_general <- function(fileName,
                                               dataDir,
                                               cetDay,
                                               campaignName,
                                               outFileName) {
  # - work inside the data directory
  setwd(dataDir)
  # - libraries
  library(dplyr)
  library(data.table)
  # - load the raw registration data
  registrations <- fread(fileName)
  # - aggregate: number of registrations per campaign tag
  registrations <- registrations %>%
    dplyr::group_by(event_campaign) %>%
    dplyr::summarise(Registrations = n())
  # - attach the reporting date and the campaign label
  registrations$date <- cetDay
  registrations$campaign <- campaignName
  # - persist the aggregate under the caller-chosen name
  write.csv(registrations, outFileName)
}
NOTE: Not run from this report; the data were already pre-processed and aggregated by the following R
script before being submitted to analytical procedures:
### --- Report Generation for the Thank You 2019 campaign
### --- run locally
### --- data directory (daily aggregates) and analytics output directory
dataDir <-
  '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_data/'
analyticsDir <-
  '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/2019_ThankYou/_analytics/'
setwd(analyticsDir)
### --- Report Banner Impression Data
# - function: wmde_report_banner_impressions
# - Loads every bannerImpressionsAggregated_*.csv/.tsv file in dataDir,
# - binds them into one table, and returns a list with the full dataset
# - plus per-banner and per-day impression totals.
# - FIX: the former load loop assigned NULL into list elements for
# - non-tabular files — `x[[i]] <- NULL` *deletes* the element and shifts
# - the indices of every later iteration — and iterated over 1:length(lF),
# - which wrongly runs on c(1, 0) when no files match. Files are now
# - filtered up-front and loaded with lapply().
wmde_report_banner_impressions <- function(dataDir) {
  # - Setup
  library(data.table)
  library(dplyr)
  # - aggregated banner impression files only, tabular extensions only
  lF <- list.files(dataDir)
  lF <- lF[grepl("bannerImpressionsAggregated_", lF, fixed = TRUE)]
  lF <- lF[grepl("csv$|tsv$", lF)]
  # - load and merge
  bannerData <- rbindlist(lapply(paste0(dataDir, lF), fread))
  # - drop the row-name column written by write.csv
  bannerData$V1 <- NULL
  # - aggregates
  perBannerTotals <- bannerData %>%
    select(banner, impressions) %>%
    group_by(banner) %>%
    summarise(totalImpressions = sum(impressions))
  perDayTotals <- bannerData %>%
    select(date, impressions) %>%
    group_by(date) %>%
    summarise(totalImpressions = sum(impressions))
  # - output
  return(
    list(bannerImpressionsReport = bannerData,
         perBannerTotals = perBannerTotals,
         perDayTotals = perDayTotals)
  )
}
# - Report banner impressions: write the full dataset and both aggregates
# - into the current (analytics) directory
bannerImpressionsData <- wmde_report_banner_impressions(dataDir)
# - full merged dataset
bannerImpressionsFile <- bannerImpressionsData$bannerImpressionsReport
write.csv(bannerImpressionsFile, "bannerImpressionsFile.csv")
# - totals per banner
bannerTotals <- bannerImpressionsData$perBannerTotals
write.csv(bannerTotals, "bannerTotals.csv")
# - totals per day
bannerDayTotals <- bannerImpressionsData$perDayTotals
write.csv(bannerDayTotals, "bannerDayTotals.csv")
### --- Report Pageviews Data
# - function: wmde_report_pageviews
# - Loads every pageviewsAggregated_*.csv/.tsv file in dataDir, binds them
# - into one table, and returns the full dataset plus pageview totals per
# - day, tag, page, and their pairwise combinations.
# - FIX: same load-loop defects as in wmde_report_banner_impressions —
# - `x[[i]] <- NULL` deletes list elements (shifting later indices) and
# - 1:length(lF) mis-iterates on an empty file list; files are now
# - filtered up-front and loaded with lapply().
wmde_report_pageviews <- function(dataDir) {
  # - Setup
  library(data.table)
  library(dplyr)
  # - aggregated pageview files only, tabular extensions only
  lF <- list.files(dataDir)
  lF <- lF[grepl("pageviewsAggregated_", lF, fixed = TRUE)]
  lF <- lF[grepl("csv$|tsv$", lF)]
  # - load and merge
  pageviewsData <- rbindlist(lapply(paste0(dataDir, lF), fread))
  # - drop the row-name column written by write.csv
  pageviewsData$V1 <- NULL
  # - aggregates
  perDayTotals <- pageviewsData %>%
    select(date, Pageviews) %>%
    group_by(date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagTotals <- pageviewsData %>%
    select(Tag, Pageviews) %>%
    group_by(Tag) %>%
    summarise(totalPageviews = sum(Pageviews))
  perPageTotals <- pageviewsData %>%
    select(Page, Pageviews) %>%
    group_by(Page) %>%
    summarise(totalPageviews = sum(Pageviews))
  perPageDayTotals <- pageviewsData %>%
    select(Page, date, Pageviews) %>%
    group_by(Page, date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagDayTotals <- pageviewsData %>%
    select(Tag, date, Pageviews) %>%
    group_by(Tag, date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagPageTotals <- pageviewsData %>%
    select(Tag, Page, Pageviews) %>%
    group_by(Tag, Page) %>%
    summarise(totalPageviews = sum(Pageviews))
  # - output
  return(
    list(pageviewsDataReport = pageviewsData,
         perDayTotals = perDayTotals,
         perTagTotals = perTagTotals,
         perPageTotals = perPageTotals,
         perPageDayTotals = perPageDayTotals,
         perTagDayTotals = perTagDayTotals,
         perTagPageTotals = perTagPageTotals)
  )
}
# - Report pageviews: write the full dataset and all aggregates into the
# - current (analytics) directory
pageviewsData <- wmde_report_pageviews(dataDir)
pageviewsReportFile <- pageviewsData$pageviewsDataReport
write.csv(pageviewsReportFile, "pageviewsReportFile.csv")
pageviews_perDayTotals <- pageviewsData$perDayTotals
write.csv(pageviews_perDayTotals, "pageviews_perDayTotals.csv")
pageviews_perTagTotals <- pageviewsData$perTagTotals
write.csv(pageviews_perTagTotals, "pageviews_perTagTotals.csv")
pageviews_perPageTotals <- pageviewsData$perPageTotals
write.csv(pageviews_perPageTotals, "pageviews_perPageTotals.csv")
pageviews_perPageDayTotals <- pageviewsData$perPageDayTotals
write.csv(pageviews_perPageDayTotals, "pageviews_perPageDayTotals.csv")
pageviews_perTagDayTotals <- pageviewsData$perTagDayTotals
write.csv(pageviews_perTagDayTotals, "pageviews_perTagDayTotals.csv")
pageviews_perTagPageTotals <- pageviewsData$perTagPageTotals
# - NOTE(review): unlike the other outputs this file name has no
# - "pageviews_" prefix — kept as-is since downstream steps may expect it
write.csv(pageviews_perTagPageTotals, "perTagPageTotals.csv")
### --- Report User Registrations
# - function: wmde_report_registrations
# - Loads every userRegistrationsAggreagted_*.csv/.tsv file in dataDir
# - (the misspelled prefix matches what wmde_process_registrations writes),
# - binds them, and returns the full dataset plus registration totals per
# - day, per campaign tag, and per (tag, day).
# - FIX: same load-loop defects as the other report functions —
# - `x[[i]] <- NULL` deletes list elements (shifting later indices) and
# - 1:length(lF) mis-iterates on an empty file list.
wmde_report_registrations <- function(dataDir) {
  # - Setup
  library(data.table)
  library(dplyr)
  # - aggregated user registration files only, tabular extensions only
  lF <- list.files(dataDir)
  lF <- lF[grepl("userRegistrationsAggreagted_", lF, fixed = TRUE)]
  lF <- lF[grepl("csv$|tsv$", lF)]
  # - load and merge
  registrationData <- rbindlist(lapply(paste0(dataDir, lF), fread))
  # - drop the row-name column written by write.csv
  registrationData$V1 <- NULL
  # - aggregates
  perDayTotals <- registrationData %>%
    select(date, Registrations) %>%
    group_by(date) %>%
    summarise(totalRegistrations = sum(Registrations))
  perTagTotals <- registrationData %>%
    select(event_campaign, Registrations) %>%
    group_by(event_campaign) %>%
    summarise(totalRegistrations = sum(Registrations))
  perTagDayTotals <- registrationData %>%
    select(event_campaign, date, Registrations) %>%
    group_by(event_campaign, date) %>%
    summarise(totalRegistrations = sum(Registrations))
  # - output
  return(
    list(registrationsDataReport = registrationData,
         perDayTotals = perDayTotals,
         perTagTotals = perTagTotals,
         perTagDayTotals = perTagDayTotals)
  )
}
# - Report upon user registrations: write the full dataset and all
# - aggregates into the current (analytics) directory
userRegData <- wmde_report_registrations(dataDir)
userRegistrationsReportFile <- userRegData$registrationsDataReport
write.csv(userRegistrationsReportFile, "userRegistrationsReportFile.csv")
userRegistrations_perDayTotals <- userRegData$perDayTotals
write.csv(userRegistrations_perDayTotals, "userRegistrations_perDayTotals.csv")
userRegistrations_perTagTotals <- userRegData$perTagTotals
write.csv(userRegistrations_perTagTotals, "userRegistrations_perTagTotals.csv")
userRegistrations_perTagDayTotals <- userRegData$perTagDayTotals
write.csv(userRegistrations_perTagDayTotals, "userRegistrations_perTagDayTotals.csv")
All data on user registrations are presented in this section.
Chart 2.1. Registrations per tag and day. Please note: points with no data labels signify 0 user registrations.
# - Standard registrations: reload the full registrations report
dataSet <- read.csv(
  '_analytics/userRegistrationsReportFile.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - drop the Autumn-2018 newsletter campaign rows and the redundant
# - campaign label column before charting
dataSet <- dataSet %>%
  filter(!(event_campaign %in% 'WMDE_neweditors_autumn_2018_lpn'))
dataSet$campaign <- NULL
# - Chart: registrations per campaign tag and day.
# - FIX: removed the trailing comma inside aes() (relies on aes() silently
# - ignoring empty arguments), dropped the duplicated
# - scale_y_continuous(labels = comma) layer (the second silently replaced
# - the first), and corrected the campaign year in the title (2919 -> 2019).
ggplot(dataSet, aes(x = date,
                    y = Registrations,
                    group = event_campaign,
                    color = event_campaign,
                    fill = event_campaign,
                    label = Registrations)) +
  geom_path(size = .25) +
  geom_point(size = 1.5) +
  geom_point(size = 1, color = "white") +
  scale_y_continuous(labels = comma) +
  ggtitle('Thank You 2019: Registrations per Tag') +
  theme_minimal() +
  geom_text_repel(size = 3.5, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "top")
### --- Full Dataset (Table Report)
# - rename the first column (event_campaign) to Tag for display
names(dataSet)[1] <- 'Tag'
# - interactive table, sorted by tag, date, then descending registrations
sortedSet <- arrange(dataSet, Tag, date, desc(Registrations))
datatable(sortedSet)
This analysis is scheduled for January 21st and 30th (see the corresponding Phabricator task).
# - load per-user edit counts collected for the campaign
dataSet <- read.csv(
  '_analytics/allUserEdits_ThankYou2019.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - closed edit-count classes; anything above the last upper bound falls
# - into the open ">= 50" class downstream
editBoundaries <- list(c(0, 1),
                       c(2, 4),
                       c(5, 9),
                       c(10, 49))
# - classify each user's edit count into its bounded class label
# - "(lo - hi)"; counts outside every boundary pair get the open ">= 50"
dataSet$editClass <- sapply(dataSet$edits, function(editCount) {
  inClass <- sapply(editBoundaries, function(bounds) {
    editCount >= bounds[1] && editCount <= bounds[2]
  })
  if (!any(inClass)) {
    ">= 50"
  } else {
    bounds <- editBoundaries[[which(inClass)]]
    paste0("(", bounds[1], " - ", bounds[2], ")")
  }
})
# - tabulate users per edit class and order the classes by their numeric
# - lower bound (the first run of digits in the label)
editClass <- as.data.frame(table(dataSet$editClass),
                           stringsAsFactors = FALSE)
colnames(editClass) <- c('Edit Class', 'Num.Users')
# - temporary sort key; removed again before display
editClass$order <- as.numeric(sapply(editClass$`Edit Class`, function(label) {
  str_extract(label, '[[:digit:]]+')
}))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)