Feedback should be sent to goran.milovanovic_ext@wikimedia.de
.
The campaign is run from 2019/11/01 to 2019/11/09.
CURRENT UPDATE: Complete dataset as of 2019/11/09.
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script ABC_2019_PRODUCTION.R
from the stat1004
WMF statistics server, collecting the data as .tsv
and .csv
files, aggregating on the server with ABC_2019_Aggregation.R
, copying manually and processing locally.
# !diagnostics off
### --- script: ABC_2019_PRODUCTION.R
### --- Data Acquisition for the Autumn Banner Campaign 2019
### --- run from stat1004
### --- /home/goransm/Analytics/NewEditors/Campaigns/2019AuBC/
### --- libraries
library(tidyverse)
library(data.table)
library(lubridate)
### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/Campaigns/2019AuBC/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
### --- determine cetDay: the Berlin-time calendar day to acquire data for
cetDay <- Sys.time()
attr(cetDay, "tzone") <- "Europe/Berlin"
# - one day behind for crontab
# - (i.e. waiting for wmf.webrequest to complete its data acquisition)
# - FIX: use format() to take the date part; unlike as.character() +
# - strsplit(), format() is robust at exact midnight (when as.character()
# - drops the time component). Stray debug auto-print of cetDay removed.
cetDay <- ymd(format(cetDay, "%Y-%m-%d")) - 1
### ----------------------------------------------------------
### --- Banner Impressions
### ----------------------------------------------------------
# - function: wmde_collect_banner_impressions
# - Collects raw banner-impression webrequest rows for one Berlin-time day.
# - Composes a HiveQL query over wmf.webrequest (host/path/query conditions
# - plus an hourly UTC partition condition covering the given CET/CEST day),
# - writes it to queryFile, and executes it via beeline, redirecting output
# - to fileName. Returns the exit status of the beeline call (0 on success).
# - Params:
# -   uri_host, uri_path : character vectors; OR-ed equality conditions
# -   uri_query          : character vector; OR-ed LIKE '%...%' conditions
# -   cetDay             : "YYYY-MM-DD" (Berlin-time day)
# -   queryFile, fileName: .hql file to write / .tsv file to produce
# -   dataDir            : working directory for the above (relative paths)
wmde_collect_banner_impressions <- function(uri_host,
                                            uri_path,
                                            uri_query,
                                            cetDay,
                                            queryFile,
                                            fileName,
                                            dataDir) {
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  # - to dataDir
  setwd(dataDir)
  # - WHERE condition: the 24 Berlin-time hours of cetDay, expressed in UTC
  # - (the webrequest partition columns year/month/day/hour are UTC)
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  attr(cet_condition, "tzone") <- "UTC"
  # - FIX: extract datetime parts with format() instead of
  # - strsplit(as.character(...)): as.character() drops the time component
  # - at exactly midnight UTC (Berlin 01:00 CET), which previously yielded
  # - "hour = NA" conditions in the query. as.integer() also strips the
  # - leading zeros, as the Hive partition columns are numeric.
  cet_years <- as.integer(format(cet_condition, "%Y"))
  cet_months <- as.integer(format(cet_condition, "%m"))
  cet_days <- as.integer(format(cet_condition, "%d"))
  cet_hours <- as.integer(format(cet_condition, "%H"))
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ",
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(",
                             datetimeCondition,
                             ")",
                             collapse = " OR ",
                             sep = "")
  # - helper: OR-ed SQL condition over a character vector of values
  sql_or_condition <- function(field, values, like = FALSE) {
    if (like) {
      conditions <- paste0(field, " LIKE '%", values, "%'")
    } else {
      conditions <- paste0(field, " = '", values, "'")
    }
    if (length(values) > 1) {
      paste0("(", paste(conditions, collapse = " OR "), ")")
    } else {
      conditions
    }
  }
  # - WHERE conditions for uri_path, uri_host, uri_query
  uri_path_condition <- sql_or_condition("uri_path", uri_path)
  uri_host_condition <- sql_or_condition("uri_host", uri_host)
  uri_query_condition <- sql_or_condition("uri_query", uri_query, like = TRUE)
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;
SELECT uri_query FROM webrequest
WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    uri_query_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script via beeline, redirecting output to fileName
  hiveArgs <- '/usr/local/bin/beeline -f'
  hiveInput <- paste0(queryFile, ' > ', fileName)
  hiveCommand <- paste(hiveArgs, hiveInput)
  return(
    system(command = hiveCommand, wait = TRUE)
  )
}
# - Autumn Banner Campaign 2019:
# - parameters for wmde_collect_banner_impressions
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
uri_path <- '/beacon/impression'
uri_query <- 'WMDE_neweditors_autumn_2019'
queryFile <- 'abc2019_BannerImpressions.hql'
fileName <- paste0(dataDir, "bannerImpressions_", cetDay, ".tsv")
# - run the Banner Impressions collection
wmde_collect_banner_impressions(uri_host = uri_host,
                                uri_path = uri_path,
                                uri_query = uri_query,
                                cetDay = cetDay,
                                queryFile = queryFile,
                                fileName = fileName,
                                dataDir = dataDir)
# - function: wmde_process_banner_impressions
# - Wrangles one day's raw banner-impression dump (a one-column .tsv of
# - uri_query strings produced by wmde_collect_banner_impressions) into
# - per-banner impression counts, corrected for the client-side sampling
# - rate, and stores the result as bannerImpressionsAggregated_<cetDay>.csv.
wmde_process_banner_impressions <- function(fileName,
                                            dataDir,
                                            cetDay,
                                            campaignName) {
  # - to dataDir
  setwd(dataDir)
  # - libraries
  library(stringr)
  library(dplyr)
  # - load raw beeline output
  bannerData <- read.delim(fileName,
                           stringsAsFactors = FALSE)
  colnames(bannerData) <- 'uri_query'
  # - clean: drop everything up to the repeated "uri_query" header row and
  # - the two trailing beeline summary rows
  wStart <- which(bannerData$uri_query == "uri_query")
  bannerData <- bannerData[(wStart + 1):(dim(bannerData)[1] - 2), ]
  # - split each uri_query into its &-separated key=value fields
  bannerData <- strsplit(bannerData, split = "&", fixed = TRUE)
  # - helper: extract the value of one key from each split query
  # - (replaces three copies of the same sapply/grepl/gsub pattern)
  extract_query_field <- function(queries, key) {
    keyPattern <- paste0("^", key, "=")
    values <- sapply(queries, function(x) {
      x[which(grepl(keyPattern, x))]
    })
    gsub(keyPattern, "", values)
  }
  # - extract relevant fields
  banner <- extract_query_field(bannerData, "banner")
  recordImpressionSampleRate <- as.numeric(
    extract_query_field(bannerData, "recordImpressionSampleRate")
  )
  result <- extract_query_field(bannerData, "result")
  # - compose table:
  bannerObservations <- data.frame(banner = banner,
                                   recordImpressionSampleRate = recordImpressionSampleRate,
                                   result = result,
                                   stringsAsFactors = FALSE)
  # - keep only impressions that were actually shown
  bannerObservations <- dplyr::filter(bannerObservations,
                                      result == "show")
  # - correction for recordImpressionSampleRate: each sampled record stands
  # - for 1/rate real impressions
  bannerObservations$recordImpressionSampleRate <-
    1/bannerObservations$recordImpressionSampleRate
  # - aggregate: estimated impressions per banner
  bannerObservations <- bannerObservations %>%
    dplyr::select(banner, recordImpressionSampleRate) %>%
    dplyr::group_by(banner) %>%
    dplyr::summarise(impressions = sum(recordImpressionSampleRate))
  # - add cetDay, campaignName
  bannerObservations$date <- cetDay
  bannerObservations$campaign <- campaignName
  # - store (row names intentionally kept; the aggregation script drops the
  # - resulting V1 column)
  write.csv(bannerObservations,
            paste0("bannerImpressionsAggregated_",
                   cetDay,
                   ".csv"
            )
  )
}
# - process today's Banner Impression data
campaignName <- "2019_AuBC"
wmde_process_banner_impressions(fileName,
                                dataDir,
                                cetDay,
                                campaignName)
### ----------------------------------------------------------
### --- Pageviews
### ----------------------------------------------------------
# - function: wmde_collect_pageviews
# - Collects raw pageview webrequest rows (uri_path, uri_query, referer)
# - for one Berlin-time day. Composes a HiveQL query over wmf.webrequest,
# - writes it to queryFile, and executes it via beeline, redirecting output
# - to fileName. Returns the exit status of the beeline call (0 on success).
# - Params mirror wmde_collect_banner_impressions, without uri_query.
wmde_collect_pageviews <- function(uri_host,
                                   uri_path,
                                   cetDay,
                                   queryFile,
                                   fileName,
                                   dataDir) {
  # - NOTE:
  # - expected format for cetDay is: YYYY-MM-DD
  # - to dataDir
  setwd(dataDir)
  # - WHERE condition: the 24 Berlin-time hours of cetDay, expressed in UTC
  # - (the webrequest partition columns year/month/day/hour are UTC)
  cet_condition <- seq(
    from = as.POSIXct(paste0(cetDay, " 0:00"), tz = "Europe/Berlin"),
    to = as.POSIXct(paste0(cetDay, " 23:00"), tz = "Europe/Berlin"),
    by = "hour"
  )
  attr(cet_condition, "tzone") <- "UTC"
  # - FIX: extract datetime parts with format() instead of
  # - strsplit(as.character(...)): as.character() drops the time component
  # - at exactly midnight UTC (Berlin 01:00 CET), which previously yielded
  # - "hour = NA" conditions in the query. as.integer() also strips the
  # - leading zeros, as the Hive partition columns are numeric.
  cet_years <- as.integer(format(cet_condition, "%Y"))
  cet_months <- as.integer(format(cet_condition, "%m"))
  cet_days <- as.integer(format(cet_condition, "%d"))
  cet_hours <- as.integer(format(cet_condition, "%H"))
  datetimeCondition <- paste0(
    "year = ", cet_years, " AND ",
    "month = ", cet_months, " AND ",
    "day = ", cet_days, " AND ",
    "hour = ", cet_hours
  )
  datetimeCondition <- paste("(",
                             datetimeCondition,
                             ")",
                             collapse = " OR ",
                             sep = "")
  # - helper: OR-ed SQL equality condition over a character vector
  sql_or_condition <- function(field, values) {
    conditions <- paste0(field, " = '", values, "'")
    if (length(values) > 1) {
      paste0("(", paste(conditions, collapse = " OR "), ")")
    } else {
      conditions
    }
  }
  # - WHERE conditions for uri_path and uri_host
  uri_path_condition <- sql_or_condition("uri_path", uri_path)
  uri_host_condition <- sql_or_condition("uri_host", uri_host)
  # - compose HiveQL query
  hiveQuery <- paste0(
    "USE wmf;
SELECT uri_path, uri_query, referer FROM webrequest
WHERE (",
    uri_host_condition, " AND ",
    uri_path_condition, " AND ",
    "(", datetimeCondition, ")",
    ");"
  )
  # - write hql
  write(hiveQuery, queryFile)
  # - execute hql script via beeline, redirecting output to fileName
  hiveArgs <- '/usr/local/bin/beeline -f'
  hiveInput <- paste0(queryFile, ' > ', fileName)
  hiveCommand <- paste(hiveArgs, hiveInput)
  return(
    system(command = hiveCommand, wait = TRUE)
  )
}
# - Autumn Banner Campaign 2019: parameters for wmde_collect_pageviews
uri_host <- c('de.wikipedia.org', 'de.m.wikipedia.org')
# - the campaign landing pages (desktop paths; URL-encoded)
uri_path <- c(
  '/wiki/Wikipedia:Wikipedia_vor_Ort',
  '/wiki/Wikipedia:Kontor_Hamburg/Aktionstag_2019',
  '/wiki/Wikipedia:WikiWedding/Wikipedia_vor_Ort_2019',
  '/wiki/Wikipedia:WikiB%C3%A4r/Aktionstag_Wikipedia_2019',
  '/wiki/Wikipedia:Ruhrgebiet/Wikipedia_vor_Ort_2019',
  '/wiki/Wikipedia:K%C3%B6ln/Aktionstag_Wikipedia_2019',
  '/wiki/Wikipedia:Hannover/Aktionstag_Wikipedia_2019',
  '/wiki/Wikipedia:Frankfurt/Wikipedia_vor_Ort_2019',
  '/wiki/Wikipedia:Augsburg/Aktionstag_Wikipedia_2019',
  '/wiki/Wikipedia:WikiMUC/2019-11-10_Wikipedia_vor_Ort',
  '/wiki/Wikipedia:Freiburg_im_Breisgau/Wikipedia_vor_Ort_2019',
  '/wiki/Wikipedia:L%C3%B6rrach/Aktionstag_Wikipedia_2019',
  '/wiki/Wikipedia:Bodensee/Wikipedia_vor_Ort_2019',
  '/wiki/Wikipedia:Ober%C3%B6sterreich/Wikipedia_vor_Ort_2019',
  '/wiki/Wikipedia:Wien/Wikipedia_vor_Ort_2019',
  '/wiki/Wikipedia:Nieder%C3%B6sterreich/Aktionstag_Wikipedia_vor_Ort_2019',
  '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia')
queryFile <- 'abc2019_Pageviews.hql'
fileName <- paste0("pageviews_", cetDay, ".tsv")
# - run the Pageviews collection
wmde_collect_pageviews(uri_host = uri_host,
                       uri_path = uri_path,
                       cetDay = cetDay,
                       queryFile = queryFile,
                       fileName = fileName,
                       dataDir = dataDir)
### --- Wrangle Pageviews
# - function: wmde_process_pageviews
# - Wrangles one day's raw pageviews .tsv: keeps only requests carrying the
# - campaign tag (in uri_query or referer), recovers the tag from the
# - referer when uri_query is empty, aggregates to pageviews per
# - (tag, page), and stores the result as pageviewsAggregated_<day>.csv.
# - FIX: the original defaults `cetDay = cetDay` and
# - `campaignName = campaignName` were self-referential and would raise
# - "promise already under evaluation" if ever relied upon; removed
# - (all callers pass these arguments explicitly).
wmde_process_pageviews <- function(fileName,
                                   dataDir,
                                   uri_query_filter,
                                   cetDay,
                                   campaignName) {
  # - to dataDir
  setwd(dataDir)
  # - libraries
  library(stringr)
  library(dplyr)
  library(tidyr)
  library(data.table)
  # - load raw beeline output
  pageviewsData <- readLines(fileName)
  # - drop everything up to the header line and the two trailing
  # - beeline summary rows
  wStart <- which(grepl("^uri_path", pageviewsData))
  pageviewsData <- pageviewsData[(wStart + 2):(length(pageviewsData) - 2)]
  pageviewsData <- data.frame(dat = pageviewsData,
                              stringsAsFactors = FALSE)
  pageviewsData <- separate(pageviewsData,
                            dat,
                            into = c('uri_path', 'uri_query', 'referer'),
                            sep = "\t")
  # - apply uri_query_filter
  # - NOTE: Autumn 2019, looking in both: uri_query, referer
  w_uri_query <- which(grepl(uri_query_filter, pageviewsData$uri_query))
  w_uri_query_referer <- which(grepl(uri_query_filter, pageviewsData$referer))
  w_uri_query <- unique(c(w_uri_query, w_uri_query_referer))
  pageviewsData <- pageviewsData[w_uri_query, ]
  # - blank out referers that do not carry the campaign tag
  # - (seq_len() instead of 1:dim()[1], which breaks on zero rows)
  w_uri_query_referer <- which(grepl(uri_query_filter, pageviewsData$referer))
  w_uri_query_referer_delete <- setdiff(seq_len(nrow(pageviewsData)),
                                        w_uri_query_referer)
  pageviewsData$referer[w_uri_query_referer_delete] <- ''
  # - when there is no uri_query, use the query from the referer field
  # - if present there
  pageviewsData$referer <- str_extract(pageviewsData$referer, "\\?campaign=.*$")
  pageviewsData$referer[is.na(pageviewsData$referer)] <- ""
  pageviewsData$referer <- gsub("?campaign=", "", pageviewsData$referer, fixed = TRUE)
  pageviewsData$uri_query <- gsub("?campaign=", "", pageviewsData$uri_query, fixed = TRUE)
  pageviewsData$uri_query[pageviewsData$uri_query == ""] <-
    pageviewsData$referer[pageviewsData$uri_query == ""]
  pageviewsData <- dplyr::filter(pageviewsData,
                                 uri_query != "")
  pageviewsData$referer <- NULL
  # - clean up: strip anything following a slash in the campaign tag
  pageviewsData$uri_query <- gsub("/.*$", "", pageviewsData$uri_query)
  # - aggregate: pageviews per (tag, page)
  pageviewsData <- pageviewsData %>%
    dplyr::group_by(uri_query, uri_path) %>%
    dplyr::summarise(pageviews = n()) %>%
    dplyr::ungroup()
  colnames(pageviewsData) <- c('Tag', 'Page', 'Pageviews')
  # - add cetDay, campaignName
  pageviewsData$date <- cetDay
  pageviewsData$campaign <- campaignName
  # - store; the output date is recovered from fileName
  # - NOTE(review): this assumes fileName looks like "pageviews_<day>.tsv"
  # - with exactly one "_" and no "_" in the directory part — confirm if
  # - a path-prefixed fileName is ever passed in.
  write.csv(pageviewsData,
            paste0("pageviewsAggregated_",
                   strsplit(
                     strsplit(fileName, split = "_", fixed = TRUE)[[1]][2],
                     split = ".",
                     fixed = TRUE)[[1]][1],
                   ".csv"
            )
  )
}
# - Autumn Banner Campaign 2019: campaign tag used to filter pageviews
uri_query_filter <- 'WMDE_neweditors_autumn_2019'
# - process the pageviews data
wmde_process_pageviews(fileName,
                       dataDir,
                       uri_query_filter,
                       cetDay,
                       campaignName)
### ----------------------------------------------------------
### --- User Registrations
### --- HiveQL query: event.ServerSideAccountCreation table
### ----------------------------------------------------------
# - all account creations tagged with the campaign, from 2019-11-01 on
hiveQL <-
"SELECT year, month, day, hour, event.campaign, event.userId, event.userName
FROM event.serversideaccountcreation
WHERE
year = 2019
AND month >= 11
AND day >= 1
AND (event.campaign LIKE '%WMDE_neweditors_autumn_2019%');"
# - write hql
queryPath <- paste0(dataDir, 'user_registrations.hql')
write(hiveQL, queryPath)
### --- output filename
filename <- paste0(dataDir, 'user_registrations.tsv')
### --- compose and execute the beeline command:
hiveArgs <- '/usr/local/bin/beeline --silent --incremental=true --verbose=false -f'
hiveInput <- paste0(queryPath, " > ", filename)
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
NOTE: Not run from this report; the data were already pre-processed and aggregated by the following R
script before being submitted to analytical procedures:
# !diagnostics off
### --- script: ABC_2019_Aggregation.R
### --- Data Aggregation for the Autumn Banner Campaign 2019
### --- run from stat1004
### --- /home/goransm/Analytics/NewEditors/Campaigns/2019AuBC/
### --- dir structure
# - same layout as ABC_2019_PRODUCTION.R: daily raw/aggregated dumps live
# - in _data/, final report files are written to _analytics/
campaignPath <- '/home/goransm/Analytics/NewEditors/Campaigns/2019AuBC/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
### --- Report Banner Impression Data
# - function: wmde_report_banner_impressions
# - Loads all daily bannerImpressionsAggregated_*.csv files from dataDir,
# - merges them, and returns a list with the merged data plus per-banner
# - and per-day impression totals.
wmde_report_banner_impressions <- function(dataDir) {
  # - Setup
  library(data.table)
  library(dplyr)
  # - list aggregated banner impression files
  lF <- list.files(dataDir)
  lF <- lF[grepl("bannerImpressionsAggregated_", lF, fixed = TRUE)]
  # - keep only .csv/.tsv files before loading
  # - FIX: the previous index loop assigned NULL into the list, which
  # - DELETES the element and shifts the indices of all later assignments;
  # - it also used 1:length(lF), which misbehaves on an empty file list.
  lF <- lF[grepl("csv$|tsv$", lF)]
  bannerData <- rbindlist(lapply(paste0(dataDir, lF), fread))
  # - drop the row-names column written by write.csv()
  bannerData$V1 <- NULL
  # - aggregates
  perBannerTotals <- bannerData %>%
    select(banner, impressions) %>%
    group_by(banner) %>%
    summarise(totalImpressions = sum(impressions))
  perDayTotals <- bannerData %>%
    select(date, impressions) %>%
    group_by(date) %>%
    summarise(totalImpressions = sum(impressions))
  # - output
  return(
    list(bannerImpressionsReport = bannerData,
         perBannerTotals = perBannerTotals,
         perDayTotals = perDayTotals)
  )
}
# - Produce the banner impressions report and unpack its components
bannerImpressionsData <- wmde_report_banner_impressions(dataDir)
bannerImpressionsFile <- bannerImpressionsData$bannerImpressionsReport
bannerTotals <- bannerImpressionsData$perBannerTotals
bannerDayTotals <- bannerImpressionsData$perDayTotals
# - store all components under analyticsDir
write.csv(bannerImpressionsFile, paste0(analyticsDir, "bannerImpressionsFile.csv"))
write.csv(bannerTotals, paste0(analyticsDir, "bannerTotals.csv"))
write.csv(bannerDayTotals, paste0(analyticsDir, "bannerDayTotals.csv"))
### --- Report Pageviews Data
# - function: wmde_report_pageviews
# - Loads all daily pageviewsAggregated_*.csv files from dataDir, merges
# - them, and returns the merged data plus pageview totals per day, tag,
# - page, and their pairwise combinations.
wmde_report_pageviews <- function(dataDir) {
  # - Setup
  library(data.table)
  library(dplyr)
  # - list aggregated pageview files
  lF <- list.files(dataDir)
  lF <- lF[grepl("pageviewsAggregated_", lF, fixed = TRUE)]
  # - keep only .csv/.tsv files before loading
  # - FIX: the previous index loop assigned NULL into the list, which
  # - DELETES the element and shifts the indices of all later assignments;
  # - it also used 1:length(lF), which misbehaves on an empty file list.
  lF <- lF[grepl("csv$|tsv$", lF)]
  pageviewsData <- rbindlist(lapply(paste0(dataDir, lF), fread))
  # - drop the row-names column written by write.csv()
  pageviewsData$V1 <- NULL
  # - aggregates
  perDayTotals <- pageviewsData %>%
    select(date, Pageviews) %>%
    group_by(date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagTotals <- pageviewsData %>%
    select(Tag, Pageviews) %>%
    group_by(Tag) %>%
    summarise(totalPageviews = sum(Pageviews))
  perPageTotals <- pageviewsData %>%
    select(Page, Pageviews) %>%
    group_by(Page) %>%
    summarise(totalPageviews = sum(Pageviews))
  perPageDayTotals <- pageviewsData %>%
    select(Page, date, Pageviews) %>%
    group_by(Page, date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagDayTotals <- pageviewsData %>%
    select(Tag, date, Pageviews) %>%
    group_by(Tag, date) %>%
    summarise(totalPageviews = sum(Pageviews))
  perTagPageTotals <- pageviewsData %>%
    select(Tag, Page, Pageviews) %>%
    group_by(Tag, Page) %>%
    summarise(totalPageviews = sum(Pageviews))
  # - output
  return(
    list(pageviewsDataReport = pageviewsData,
         perDayTotals = perDayTotals,
         perTagTotals = perTagTotals,
         perPageTotals = perPageTotals,
         perPageDayTotals = perPageDayTotals,
         perTagDayTotals = perTagDayTotals,
         perTagPageTotals = perTagPageTotals)
  )
}
# - Produce the pageviews report and unpack its components
pageviewsData <- wmde_report_pageviews(dataDir)
pageviewsReportFile <- pageviewsData$pageviewsDataReport
pageviews_perDayTotals <- pageviewsData$perDayTotals
pageviews_perTagTotals <- pageviewsData$perTagTotals
pageviews_perPageTotals <- pageviewsData$perPageTotals
pageviews_perPageDayTotals <- pageviewsData$perPageDayTotals
pageviews_perTagDayTotals <- pageviewsData$perTagDayTotals
pageviews_perTagPageTotals <- pageviewsData$perTagPageTotals
# - store all components under analyticsDir
write.csv(pageviewsReportFile, paste0(analyticsDir, "pageviewsReportFile.csv"))
write.csv(pageviews_perDayTotals, paste0(analyticsDir, "pageviews_perDayTotals.csv"))
write.csv(pageviews_perTagTotals, paste0(analyticsDir, "pageviews_perTagTotals.csv"))
write.csv(pageviews_perPageTotals, paste0(analyticsDir, "pageviews_perPageTotals.csv"))
write.csv(pageviews_perPageDayTotals, paste0(analyticsDir, "pageviews_perPageDayTotals.csv"))
write.csv(pageviews_perTagDayTotals, paste0(analyticsDir, "pageviews_perTagDayTotals.csv"))
# - NOTE(review): this last file lacks the "pageviews_" prefix the others
# - use; kept as-is since downstream readers may depend on the existing
# - name — confirm before renaming.
write.csv(pageviews_perTagPageTotals, paste0(analyticsDir, "perTagPageTotals.csv"))
### --- Report User Registrations
# - function: wmde_report_registrations_hive
# - Loads the user_registrations.tsv Hive export, builds a YYYY-MM-DD date
# - column, and returns the data plus registration totals per day, per
# - campaign tag, and per tag/day, along with the non-aggregated dataset.
wmde_report_registrations_hive <- function(dataDir, campaign) {
  # - Setup
  library(data.table)
  library(dplyr)
  # - list files:
  lF <- list.files(dataDir)
  # - filter user registration data
  lF <- lF[grepl("user_registrations.tsv", lF, fixed = TRUE)]
  # - load file and wrangle columns
  registrationData <- fread(paste0(dataDir, lF), header = TRUE)
  # - columns 5 and 6 are event.campaign and event.userId in the export
  colnames(registrationData)[5] <- 'event_campaign'
  colnames(registrationData)[6] <- 'event_userId'
  registrationData$campaign <- campaign
  # - FIX: zero-pad month as well as day — the original padded only the
  # - day, which would yield e.g. "2019-1-05" for months < 10; output is
  # - identical for the November 2019 campaign.
  registrationData$month <- sprintf("%02d", as.integer(registrationData$month))
  registrationData$day <- sprintf("%02d", as.integer(registrationData$day))
  registrationData$date <- paste(registrationData$year,
                                 registrationData$month,
                                 registrationData$day,
                                 sep = "-")
  # - aggregates
  perDayTotals <- registrationData %>%
    select(date) %>%
    group_by(date) %>%
    summarise(totalRegistrations = n())
  perTagTotals <- registrationData %>%
    select(event_campaign) %>%
    group_by(event_campaign) %>%
    summarise(totalRegistrations = n())
  perTagDayTotals <- registrationData %>%
    select(event_campaign, date) %>%
    group_by(event_campaign, date) %>%
    summarise(totalRegistrations = n())
  # - full registration dataset: non-aggregated registrations
  fullRegData <- registrationData %>%
    select(event_userId,
           event_campaign,
           date,
           campaign)
  # - output
  return(
    list(registrationsDataReport = registrationData,
         perDayTotals = perDayTotals,
         perTagTotals = perTagTotals,
         perTagDayTotals = perTagDayTotals,
         fullRegistrationDataset = fullRegData)
  )
}
# - Produce the user registrations report and unpack its components
campaign <- '2019_AuBC'
userRegData <- wmde_report_registrations_hive(dataDir, campaign)
userRegistrationsReportFile <- userRegData$registrationsDataReport
userRegistrations_perDayTotals <- userRegData$perDayTotals
userRegistrations_perTagTotals <- userRegData$perTagTotals
userRegistrations_perTagDayTotals <- userRegData$perTagDayTotals
fullUserRegistrations <- userRegData$fullRegistrationDataset
# - store all components under analyticsDir
write.csv(userRegistrationsReportFile, paste0(analyticsDir, "userRegistrationsReportFile.csv"))
write.csv(userRegistrations_perDayTotals, paste0(analyticsDir, "userRegistrations_perDayTotals.csv"))
write.csv(userRegistrations_perTagTotals, paste0(analyticsDir, "userRegistrations_perTagTotals.csv"))
write.csv(userRegistrations_perTagDayTotals, paste0(analyticsDir, "userRegistrations_perTagDayTotals.csv"))
write.csv(fullUserRegistrations, paste0(analyticsDir, "fullRegistrationDataset.csv"))
library(data.table)
# !diagnostics off
### --- script: ABC_2019_UserEdits.R
### --- User Edits Acquisition for the Autumn Banner Campaign 2019
### --- run from stat1004
### --- /home/goransm/Analytics/NewEditors/Campaigns/2019AuBC/
### -----------------------------------------------------------------------
### --- User Edits
### -----------------------------------------------------------------------
### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/Campaigns/2019AuBC/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
# - campaign start, MediaWiki timestamp format (YYYYMMDDHHMMSS)
startTimestamp <- '20191101000000'
# - get user ids
userRegistrations <- read.csv(paste0(analyticsDir, 'userRegistrationsReportFile.csv'))
rev_user <- userRegistrations$event_userId
# - NOTE(review): the Hive export's 7th column holds event.userName, but it
# - is never renamed to `username` in the aggregation script; if that column
# - name differs, this is NULL and rev_user_text becomes all-NA — verify
# - against userRegistrationsReportFile.csv before relying on it.
rev_user_text <- as.character(userRegistrations$username)
# - iterate over rev_user
# - For each campaign-registered user, pull all of their dewiki revisions
# - since the campaign start and dump them to userEdits_<userId>.tsv.
for (i in 1:length(rev_user)) {
# - check username
# - escape single quotes so the username can be embedded in the SQL string
# - literal below (with fixed = T the replacement is taken literally,
# - inserting a backslash before the quote: \')
if (grepl("'", rev_user_text[i], fixed = T)) {
rev_user_text[i] <- gsub("'", "\\'", rev_user_text[i], fixed = T)
}
# - SQL query
# - NOTE(review): if rev_user_text[i] is NA (see the setup note above),
# - this query matches the literal username 'NA' — confirm the source column.
sqlQuery <- paste("\"SELECT rev_user, rev_timestamp FROM dewiki.revision WHERE (rev_timestamp >= ",
startTimestamp, " AND rev_user_text = '", rev_user_text[i], "');\"", sep = "")
### --- output filename
filename <- paste(dataDir,'userEdits', "_", rev_user[i], ".tsv", sep = "")
### --- execute sql script:
sqlLogInPre <- paste0('/usr/local/bin/analytics-mysql dewiki -e ')
sqlInput <- paste(sqlQuery,
" > ",
filename,
sep = "")
# - command:
sqlCommand <- paste(sqlLogInPre, sqlInput)
system(command = sqlCommand, wait = TRUE)
# - report progress per user
print(paste0("DONE: user ", i, "."))
}
### --- END run SQL scripts
# - collect the per-user .tsv dumps and merge them into one table
editFiles <- list.files(dataDir)
editFiles <- editFiles[grepl("^userEdits_", editFiles)]
userEdits <- rbindlist(lapply(paste0(dataDir, editFiles), fread))
# - persist the merged user edits
write.csv(userEdits, paste0(analyticsDir, 'userEdits.csv'))
### --- Check user edits
# - select all non-anonymous user edits on dewiki where timestamp > 20191101000000
# - SQL query
sqlQuery <- paste("\"SELECT rev_actor FROM dewiki.revision WHERE (rev_timestamp >= ",
                  startTimestamp, ");\"", sep = "")
### --- output filename
filename <- paste0(dataDir, 'userEditsALL.tsv')
### --- execute sql script:
sqlLogInPre <- paste0('/usr/local/bin/analytics-mysql dewiki -e ')
sqlInput <- paste(sqlQuery,
                  " > ",
                  filename,
                  sep = "")
# - command:
sqlCommand <- paste(sqlLogInPre, sqlInput)
system(command = sqlCommand, wait = TRUE)
# - load userEdits
userEdits <- fread(paste0(dataDir, 'userEditsALL.tsv'))
# - tabulate edit counts per actor
# - FIX: the query selects rev_actor, so the loaded column is rev_actor,
# - not rev_user; tabulating userEdits$rev_user (a NULL column) silently
# - produced an empty table.
tEdits <- table(as.numeric(userEdits$rev_actor))
library(data.table)
# !diagnostics off
### --- script: ABC_2019_UserEdits_wmfMediaWikiHistory.R
### --- User Edits Acquisition for the Autumn Banner Campaign 2019
### --- run from stat1004
### --- /home/goransm/Analytics/NewEditors/Campaigns/2019AuBC/
### -----------------------------------------------------------------------
### --- User Edits
### -----------------------------------------------------------------------
### --- dir structure
campaignPath <- '/home/goransm/Analytics/NewEditors/Campaigns/2019AuBC/'
dataDir <- paste0(campaignPath, "_data/")
analyticsDir <- paste0(campaignPath, "_analytics/")
# - campaign start, MediaWiki timestamp format (YYYYMMDDHHMMSS)
startTimestamp <- '20191101000000'
# - get user ids
userRegistrations <- read.csv(paste0(analyticsDir,
'userRegistrationsReportFile.csv'))
rev_user <- userRegistrations$event_userId
# - NOTE(review): rev_user_text is never used in this script's SQL loop
# - (it filters on actor_user = rev_user[i]); also, a `username` column
# - may not exist in the exported report — verify before use.
rev_user_text <- as.character(userRegistrations$username)
# - iterate over rev_user
# - For each campaign-registered user, pull all of their post-campaign-start
# - revisions via the actor table joined to revision_actor_temp, dumping
# - them to userEdits_<i>.tsv.
for (i in seq_along(rev_user)) {
  # - SQL query
  # - FIX: use the startTimestamp variable defined above instead of a
  # - hard-coded 20191101000000 literal (same value, single source of
  # - truth); stray trailing ';' after the assignment removed.
  sqlQuery <- paste0("\"SELECT actor.actor_id, actor.actor_user, actor.actor_name, ",
                     "revision_actor_temp.revactor_timestamp FROM actor ",
                     "LEFT JOIN revision_actor_temp ",
                     "ON (actor.actor_id = revision_actor_temp.revactor_actor) ",
                     "WHERE (revision_actor_temp.revactor_timestamp >= ",
                     startTimestamp,
                     " AND actor.actor_user = ", rev_user[i], ");\"")
  ### --- output filename
  filename <- paste(dataDir, 'userEdits', "_", i, ".tsv", sep = "")
  ### --- execute sql script:
  sqlLogInPre <- paste0('/usr/local/bin/analytics-mysql dewiki -e ')
  sqlInput <- paste(sqlQuery,
                    " > ",
                    filename,
                    sep = "")
  # - command:
  sqlCommand <- paste(sqlLogInPre, sqlInput)
  system(command = sqlCommand, wait = TRUE)
  # - report progress per user
  print(paste0("DONE: user ", i, "."))
}
### --- END run SQL scripts
# - gather all per-user edit dumps and merge them into one table
dumpFiles <- list.files(dataDir)
dumpFiles <- dumpFiles[grepl("^userEdits_", dumpFiles)]
userEdits <- rbindlist(lapply(paste0(dataDir, dumpFiles), fread))
# - persist the merged user edits
write.csv(userEdits, paste0(analyticsDir, 'userEdits.csv'))
### --- Check user edits
# - select all non-anonymous user edits on dewiki where timestamp > 20191101000000
# - SQL query
sqlQuery <- paste("\"SELECT revactor_actor, revactor_timestamp FROM revision_actor_temp WHERE (revactor_timestamp >= ",
startTimestamp, ");\"", sep = "")
### --- output filename
filename <- paste0(dataDir, 'userEditsALL.tsv')
### --- execute sql script:
sqlLogInPre <- paste0('/usr/local/bin/analytics-mysql dewiki -e ')
sqlInput <- paste(sqlQuery,
" > ",
filename,
sep = "")
# - command:
sqlCommand <- paste(sqlLogInPre, sqlInput)
system(command = sqlCommand, wait = TRUE)
# - load userEdits
userEdits <- fread(paste0(dataDir, 'userEditsALL.tsv'))
# - tabulate edit counts per actor id
tEdits <- table(as.numeric(userEdits$revactor_actor))
# - campaign registered users?
# - NOTE(review): names(tEdits) are actor ids (revactor_actor), while
# - rev_user holds event.userId values — these are different id spaces in
# - MediaWiki, so this %in% match likely undercounts; verify the intended
# - join key (actor_user would match rev_user).
wCU <- which(rev_user %in% names(tEdits))
rev_user[wCU]
All data on user registrations are presented in this section.
Chart 2.1. Registrations per tag and day. Please note: points with no data labels signify 0 user registrations.
### --- Full Dataset (Table Report)
# - render the non-aggregated registrations, ordered by tag, day, and
# - descending registration count
dataSet %>%
  dplyr::arrange(event_campaign, date, desc(Registrations)) %>%
  datatable()
Chart 2.2. Total registrations per tag.
# - total registrations per campaign tag
dataSet <- dataSet %>%
  group_by(event_campaign) %>%
  summarise(totalRegistrations = sum(Registrations))
# - FIX: scale_y_continuous(labels = comma) was specified twice; the second
# - call silently replaced the first ("Scale for 'y' is already present").
ggplot(dataSet, aes(x = event_campaign,
                    y = totalRegistrations,
                    color = event_campaign,
                    fill = event_campaign,
                    label = totalRegistrations)) +
  geom_bar(width = .5, stat = "identity") +
  scale_y_continuous(labels = comma) +
  ggtitle('Autumn Banner Campaign 2019: Total Registrations per Tag') +
  theme_minimal() +
  geom_label_repel(size = 3.5, color = "white", show.legend = FALSE) +
  ylab("Registrations") +
  theme(axis.text.x = element_text(angle = 90, size = 8)) +
  theme(plot.title = element_text(size = 10)) +
  theme(legend.title = element_blank()) +
  theme(legend.position = "right")
All data on user edits are presented in this section.
# - load the merged per-user edits produced by the acquisition scripts
userEdits <- read.csv(
  '_analytics/userEdits.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - edits per registered user
dataSet <- userEdits %>%
  dplyr::select(actor_user) %>%
  dplyr::group_by(actor_user) %>%
  dplyr::summarise(edits = n())
# - Edit classes: 0-1 | 2-4 | 5-9 | 10-49 | >= 50
# - FIX/idiom: classify with base cut() instead of the nested-sapply
# - boundary search; breaks are right-closed, so the intervals are
# - (-Inf,1], (1,4], (4,9], (9,49], (49,Inf] — identical classes and
# - labels to the original for integer edit counts.
editClassLabels <- c("(0 - 1)", "(2 - 4)", "(5 - 9)", "(10 - 49)", ">= 50")
dataSet$editClass <- as.character(
  cut(dataSet$edits,
      breaks = c(-Inf, 1, 4, 9, 49, Inf),
      labels = editClassLabels)
)
# - tabulate users per edit class
editClass <- as.data.frame(table(dataSet$editClass),
                           stringsAsFactors = FALSE)
colnames(editClass) <- c('Edit Class', 'Num.Users')
# - order the classes by their numeric lower bound
# - (str_extract is vectorized; the sapply wrapper was unnecessary)
editClass$order <- as.numeric(str_extract(editClass$`Edit Class`, '[[:digit:]]+'))
editClass <- arrange(editClass, order)
editClass$order <- NULL
datatable(editClass)
All data on training modules are presented in this section.
# - load training-module data and the campaign registrations
tModules <- read.csv('_analytics/wmde_training_data_2019-11.csv',
                     header = TRUE,
                     check.names = FALSE,
                     stringsAsFactors = FALSE)
userReg <- read.csv(
  '_analytics/userRegistrationsReportFile.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - keep only training users who registered through the campaign
# - FIX/idiom: filter on the bare column name; tModules$username inside
# - filter() bypasses dplyr's data masking.
# - NOTE(review): assumes userRegistrationsReportFile.csv carries a
# - `username` column — confirm against the Hive export's column names.
tModules <- tModules %>%
  dplyr::filter(username %in% userReg$username)
# - users per training module
tModules_Overview <- tModules %>%
  dplyr::select(training_module) %>%
  dplyr::group_by(training_module) %>%
  dplyr::summarise(num_users = n())
datatable(tModules_Overview)
The following table presents the statistics on completed training modules.
# - load training-module data and the campaign registrations
tModules <- read.csv('_analytics/wmde_training_data_2019-11.csv',
                     header = TRUE,
                     check.names = FALSE,
                     stringsAsFactors = FALSE)
userReg <- read.csv(
  '_analytics/userRegistrationsReportFile.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - keep campaign-registered users with a COMPLETED module (non-empty
# - completion date); the two consecutive filters are merged into one
# - FIX/idiom: filter on bare column names; tModules$username inside
# - filter() bypasses dplyr's data masking.
tModules <- tModules %>%
  dplyr::filter(username %in% userReg$username,
                nchar(module_completion_date) > 0)
# - users per completed training module
tModules_Overview <- tModules %>%
  dplyr::select(training_module) %>%
  dplyr::group_by(training_module) %>%
  dplyr::summarise(num_users = n())
datatable(tModules_Overview)
The following table presents the statistics on completed training modules and the respective users’ edits.
# - load training-module data and the campaign registrations
tModules <- read.csv('_analytics/wmde_training_data_2019-11.csv',
                     header = TRUE,
                     check.names = FALSE,
                     stringsAsFactors = FALSE)
userReg <- read.csv(
  '_analytics/userRegistrationsReportFile.csv',
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE)
# - keep campaign-registered users with a completed module; the two
# - consecutive filters are merged into one
# - FIX/idiom: filter on bare column names; tModules$username inside
# - filter() bypasses dplyr's data masking.
tModules <- tModules %>%
  dplyr::filter(username %in% userReg$username,
                nchar(module_completion_date) > 0)
# - join the users' edits and count edits per completed training module
# - NOTE(review): assumes userEdits here still holds the raw
# - wmfMediaWikiHistory dump (actor_name, revactor_timestamp columns),
# - not the aggregated dataSet computed earlier — confirm chunk order.
tModules_Edits <- tModules %>%
  dplyr::left_join(userEdits,
                   by = c("username" = "actor_name")) %>%
  dplyr::filter(!is.na(revactor_timestamp)) %>%
  dplyr::group_by(training_module) %>%
  dplyr::summarise(edits = n())
datatable(tModules_Edits)