Feedback should be send to goran.milovanovic_ext@wikimedia.de
.
The campaign is run from 1. January 2018 to N January 2018.
knitr::opts_knit$set(root.dir = '/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script ThankYou_2018_Production_SQL.R
on stat1005.eqiad.wmnet, collecting the data as .tsv
and .csv
files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou_2018_Production_SQL.R
.
### --- from stat1005: Thank You 2018 Banner Campaign
### --- production script: fetch the campaign data sets
### --- Campaign Details:
# - estimated start: 1st January 2018 (+/- 2 days)
# - estimated duration: 6 to 10 days
# - Reporting should start on 2nd January 2018.
# - The report must include any activity from the beginning of the campaign.
# - The estimated start will be 1st January 2018.
# - Guided Tour names
# - (The training modules include 2 new guided tours):
# - ?tour=diskutieren
# - ?tour=seimutig
### --- Training Modules Schema:
### --- https://meta.wikimedia.org/wiki/User:Stefan_Schneider_(WMDE)/dashboard_libraries/wikipedia-kurse.json
### --- the slug field is relevant for tracking
### --- Setup
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
### --- Directories
bannerImpressionsDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerImpressions'
bannerClicksDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/BannerClicks'
dailyUpdateDir <- '/home/goransm/RScripts/WMDE_Campaigns/ThankYou2018/ThankYou2018_DailyUpdate'
### --- Campaign time range
startDate <- '2018-01-02'
endDate <- '2018-01-08'
### ------------------------------------------------------------
### --- S1. Banner Impression Data
### ------------------------------------------------------------
# - campaign tag
# - Name: bt1, ?campaign=wmde_etc2017_bt1
### --- loop over date range, create query, fetch, and store
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
to = as.POSIXlt(endDate, tz = "CET"),
by = 'hour')
dateRange <- dateRange[-length(dateRange)]
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
strsplit(x, split = " ", fixed = T)[[1]][1]
})
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to the campaign end:
endCampaign <- as.POSIXlt(endDate, tz = "UTC")
w <- which(dateRange > endCampaign)
if (length(w) > 0) {
dateRange <- dateRange[-w]
}
dR <- list()
for (i in 1:length(dateRange)) {
dR[[i]] <- data.frame(
cetName = names(dateRange[i]),
utcYear = year(dateRange[i]),
utcMonth = month(dateRange[i]),
utcDay = mday(dateRange[i]),
utcHour = hour(dateRange[i])
)
}
dR <- rbindlist(dR)
dR <- dR %>%
group_by(cetName, utcYear, utcMonth, utcDay) %>%
summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))
### ------------------------------------------------------------
### --- S2. Banner Landing Page Data
### ------------------------------------------------------------
# - landing page link including the appropriate campaign tag
# - Link:https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia?campaign=wmde_etc2017_bt1
# - set bannerClicksDir
setwd(bannerClicksDir)
for (i in 1:length(unique(dR$cetName))) {
wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
for (j in 1:length(wCetName)) {
# - construct HiveQL query:
y <- dR$utcYear[wCetName[j]]
m <- dR$utcMonth[wCetName[j]]
d <- dR$utcDay[wCetName[j]]
hour <- dR$utcHour[wCetName[j]]
q <- paste(
"USE wmf;
SELECT uri_path, uri_query, referer FROM webrequest
WHERE uri_host = 'de.wikipedia.org'
AND uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/LerneWikipedia'
AND year = ", y,
" AND month = ", m,
" AND day = ", d,
" AND (", hour, ");",
sep = "")
# - write hql
write(q, 'thankyou2018_BannerClicks.hql')
# - prepare output file:
fileName <- "thankyou2018_BannerClicks_"
fileName <- paste0(fileName,
as.character(unique(dR$cetName)[i]),
"_", j,
".tsv")
fileName <- paste0(bannerClicksDir, "/", fileName)
# - execute hql script:
hiveArgs <-
'beeline -f'
hiveInput <- paste0('thankyou2018_BannerClicks.hql > ',
fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
}
}
### --- Wrangle this dataset:
### --- Banner tags:
campaignBanner <- 'wmde_etc2017_bt1'
### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
lF <- lF[grepl('Clicks', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
dS <- readLines(lF[i], n = -1)
timeStamp <- strsplit(lF[i], split = "_")[[1]][3]
bannerClicks <- sum(grepl(campaignBanner, dS, fixed = T))
dataSet[[i]] <- data.frame(timestamp = timeStamp,
bannerClicks = bannerClicks,
stringsAsFactors = F)
}
dataSet <- rbindlist(dataSet)
### --- store BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "thankyou2018_BannerClicksPageViews_Update.csv")
### --- SQL
startDate <- '2018-01-01'
### ------------------------------------------------------------
### --- S3. User Registration Data
### ------------------------------------------------------------
# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= ", gsub("-", "", startDate, fixed = T), "220000));\" > ",
dailyUpdateDir, "/thankyou_2018_userRegistrations.tsv", sep = "")
system(command = qCommand, wait = TRUE)
### ------------------------------------------------------------
### --- S4. Guided Tours Data
### ------------------------------------------------------------
# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - ServerSideAccountCreation_5487345
qCommand <- paste("mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-slave.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= ",
gsub("-", "", startDate, fixed = T), "220000));\" > ",
dailyUpdateDir, "/thankyou_2018_guidedTours.tsv", sep = "")
system(command = qCommand, wait = TRUE)
### ------------------------------------------------------------
### --- S5. User Edit Data
### ------------------------------------------------------------
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF,
quote = "",
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
userReg <- userReg %>%
dplyr::select(event_userId, event_isSelfMade, event_campaign) %>%
filter(event_isSelfMade == 1) %>%
filter(event_campaign %in% "wmde_etc2017_bt1")
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
paste(uid, collapse = ", "),
') GROUP BY rev_user;',
sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
paste('"', sqlQuery, '" > ', sep = ""),
dailyUpdateDir, '/thankyou_2018_userEdits.tsv', sep = "")
system(command = mySqlCommand,
wait = TRUE)
### ------------------------------------------------------------
### --- S6. Training Module Data
### ------------------------------------------------------------
userReg <- read.delim('thankyou_2018_userRegistrations.tsv',
sep = "\t",
quote = "",
header = T,
# row.names = 1,
stringsAsFactors = F)
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 5, 6)
d <- substr(x, 7, 8)
part1Date <- paste(y, m, d, sep = "-")
hr <- substr(x, 9, 10)
mi <- substr(x, 11, 12)
se <- substr(x, 13, 14)
part2Date <- paste(hr, mi, se, sep = ":")
paste(part1Date, part2Date, sep = " ")
})
userReg$timestamp <- as.POSIXct(userReg$timestamp, tz = "UTC")
timeDiff <-
as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
userReg$timestamp <- as.character(userReg$timestamp + timeDiff)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 6, 7)
d <- substr(x, 9, 10)
paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
filter(event_campaign %in% 'wmde_etc2017_bt1') %>%
group_by(timestamp) %>%
summarise(Registrations = n())
knitr::kable(userReg, format = "html") %>%
kable_styling(full_width = F, position = "left")
timestamp | Registrations |
---|---|
2018-01-01 | 1 |
2018-01-02 | 8 |
2018-01-03 | 15 |
2018-01-04 | 13 |
2018-01-05 | 5 |
2018-01-06 | 6 |
2018-01-07 | 12 |
2018-01-08 | 6 |
2018-01-09 | 9 |
2018-01-10 | 7 |
2018-01-11 | 15 |
2018-01-12 | 9 |
2018-01-13 | 3 |
2018-01-14 | 11 |
2018-01-15 | 1 |
ggplot(userReg, aes(x = timestamp,
y = Registrations,
label = Registrations)) +
geom_bar(stat = "identity",
position = "dodge",
width = .15,
fill = "white",
color = "darkblue") +
ggtitle('Thank You 2018:\nOverview of User Registrations') +
geom_label(size = 3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
userEdits <- readLines('thankyou_2018_userEdits.tsv', n = -1)
if (length(userEdits) >= 1) {
userEdits <- read.delim('thankyou_2018_userEdits.tsv',
sep = "\t",
header = T,
stringsAsFactors = F)
# - report
print(paste0(sum(userEdits$edits), " edits were made by the campaign registered users thus far."))
} else {
print("There are currently no user edits from this campaign.")
}
[1] "135 edits were made by the campaign registered users thus far."
Campaign guided tours: diskutieren
and seimutig
.
tours <- c('diskutieren', 'seimutig')
# setwd('/home/goransm/Work/___DataKolektiv/Projects/WikimediaDEU/_WMDE_Projects/_misc/NewEditors_Team/Thank_You_Campaign_2018/_dailyUpdate/')
gTours <- read.delim('thankyou_2018_guidedTours.tsv',
sep = "\t",
header = T,
row.names = 1,
stringsAsFactors = F)
gTours$timestamp <- as.character(gTours$timestamp)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 5, 6)
d <- substr(x, 7, 8)
part1Date <- paste(y, m, d, sep = "-")
hr <- substr(x, 9, 10)
mi <- substr(x, 11, 12)
se <- substr(x, 13, 14)
part2Date <- paste(hr, mi, se, sep = ":")
paste(part1Date, part2Date, sep = " ")
})
gTours$timestamp <- as.POSIXct(gTours$timestamp, tz = "UTC")
timeDiff <-
as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
gTours$timestamp <- as.character(gTours$timestamp + timeDiff)
gTours$timestamp <- sapply(gTours$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 6, 7)
d <- substr(x, 9, 10)
paste(y, m, d, sep = "-")
})
# - anonymize event_userId
eventUserId <- setdiff(unique(gTours$event_userId), 0)
an_userId <- character(length(eventUserId))
for (i in 1:length(an_userId)) {
id <- round(runif(1, 1, 10e6))
while (id %in% an_userId) {
id <- round(runif(1, 1, 10e6))
}
an_userId[i] <- id
}
an_userId <- paste0("u_", an_userId)
gTours$an_userId <- sapply(gTours$event_userId, function(x) {
w <- which(eventUserId %in% x)
if (length(w) > 0) {
an_userId[w]
} else {
x
}
})
# - look up for the campaign guided tours
w <- which(gTours$event_tour %in% tours)
if (length(w) > 0) {
gTours <- gTours[w, ]
gTours <- gTours %>%
filter(event_userId != 0) %>%
select(timestamp, event_tour, event_step, an_userId)
knitr::kable(gTours, format = "html") %>%
kable_styling(full_width = F, position = "left")
} else {
print("There are currently no data on guided tours from this campaign.")
}
timestamp | event_tour | event_step | an_userId |
---|---|---|---|
2018-01-03 | diskutieren | returnToTraining | u_5167969 |
2018-01-03 | seimutig | returnToTraining | u_5167969 |
2018-01-04 | seimutig | boldness | u_6495795 |
2018-01-04 | seimutig | positionCursor | u_6495795 |
2018-01-04 | diskutieren | returnToTraining | u_6495795 |
2018-01-05 | diskutieren | saveReply | u_6896279 |
2018-01-04 | seimutig | returnToTraining | u_6495795 |
2018-01-04 | seimutig | editButtonCitation | u_6495795 |
2018-01-06 | seimutig | editButton | u_4706786 |
2018-01-06 | seimutig | boldness | u_4706786 |
2018-01-06 | diskutieren | firstMessage | u_4706786 |
2018-01-08 | seimutig | editBoldness | u_2986526 |
2018-01-08 | seimutig | editSummary | u_2986526 |
2018-01-08 | diskutieren | returnToTraining | u_2986526 |
2018-01-10 | seimutig | editButton | u_2672122 |
2018-01-10 | seimutig | editButton | u_2672122 |
2018-01-10 | seimutig | editBoldness | u_2672122 |
2018-01-10 | seimutig | editSummary | u_2672122 |
2018-01-10 | seimutig | editButtonCitation | u_2672122 |
2018-01-10 | seimutig | saveCitation | u_2672122 |
2018-01-10 | seimutig | editButton | u_2672122 |
2018-01-10 | diskutieren | returnToTraining | u_2672122 |
2018-01-11 | seimutig | editBoldness | u_771518 |
2018-01-11 | seimutig | boldness | u_771518 |
2018-01-11 | seimutig | insertCitation | u_771518 |
2018-01-11 | seimutig | anyEdit | u_771518 |
2018-01-11 | diskutieren | returnToTraining | u_771518 |
2018-01-11 | seimutig | anyEdit | u_4425925 |
Campaign guided tours: diskutieren
and seimutig
.
gTours %>%
select(event_tour) %>%
group_by(event_tour) %>%
summarise(`Unique users` = n()) %>%
knitr::kable(format = "html") %>%
kable_styling(full_width = F, position = "left")
event_tour | Unique users |
---|---|
diskutieren | 7 |
seimutig | 21 |
trainData <- read.csv('wmde_training_data.csv')
# - remove first two rows (test data)
trainData <- trainData[-c(1,2), ]
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userData <- read.table(lF,
quote = "",
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
gtData <- read.delim('thankyou_2018_guidedTours.tsv',
sep = "\t",
header = T,
row.names = 1,
stringsAsFactors = F)
edData <- read.delim('thankyou_2018_userEdits.tsv',
sep = "\t",
header = T,
stringsAsFactors = F)
How many registered users take the Training Module?
print(paste0("Number of uses who took the Training Module is ",
length(trainData$username),
" , which is ",
round(length(trainData$username)/sum(userReg$Registrations)*100, 2),
"% of registered users."
))
[1] "Number of uses who took the Training Module is 67 , which is 55.37% of registered users."
What are the last completed slides per user Training Module?
slidesData <- trainData %>%
select(training_module, last_slide_completed) %>%
group_by(training_module, last_slide_completed) %>%
summarise(Count = n()) %>%
arrange(training_module, desc(Count))
knitr::kable(slidesData, format = "html") %>%
kable_styling(full_width = F, position = "left")
training_module | last_slide_completed | Count |
---|---|---|
artikel-bewerten | fertig | 6 |
artikel-bewerten | artikel-qualitat-bewerten | 1 |
artikel-bewerten | artikel-qualitat-quiz | 1 |
editieren-basiswissen | das-wars | 7 |
editieren-basiswissen | video-wiki-code | 2 |
editieren-basiswissen | visual-editor-buttons | 2 |
editieren-basiswissen | beobachtungsliste-video | 1 |
editieren-basiswissen | edit-vs-edit-quelle | 1 |
editieren-basiswissen | versuche-es | 1 |
wikipedia-basiswissen | relevanz-quiz-fortsetzung | 35 |
wikipedia-basiswissen | richtlinien-der-wikipedia | 2 |
wikipedia-basiswissen | urheberrecht-und-plagiate | 2 |
wikipedia-basiswissen | wikipedia-ist-frei | 2 |
wikipedia-basiswissen | nachweisbarkeit-quiz | 1 |
wikipedia-basiswissen | prinzipien-ruckblick | 1 |
wikipedia-basiswissen | relevanz | 1 |
wikipedia-basiswissen | sei-respektvoll | 1 |
ggplot(slidesData, aes(x = last_slide_completed,
y = Count,
color = training_module,
label = Count)) +
geom_line(aes(group = 1), size = .25) +
geom_point(size = 1.5) +
geom_point(size = 1, color = "white") +
scale_y_continuous(labels = scales::comma) +
ggtitle('Thank You 2018:\nOverview of Training Modules') +
ylab("No. Users") +
geom_text_repel(size = 3) +
facet_wrap(~training_module) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 6.5, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
Do users start editing after training modules? Is there a difference in users with completed and not-completed or not at all having taken the modules?
userData <- userData %>%
filter(event_campaign %in% 'wmde_etc2017_bt1' & event_isSelfMade) %>%
select(event_userId, event_userName)
userData <- left_join(userData, trainData, by = c("event_userName" = "username"))
Column `event_userName`/`username` joining character vector and factor, coercing into character vector
userData$module_completion_date <- NA
userData <- left_join(userData,
gtData[, c('event_userId', 'event_tour', 'event_step')],
by = c('event_userId'))
userData <- left_join(userData, edData,
by = c('event_userId' = 'rev_user'))
How many edits were made on behalf of those users who have started the Training Module vs. those who did not?
tAn <- userData %>%
select(training_module, edits) %>%
group_by(training_module) %>%
summarise(Users = n(), Edits = sum(edits, na.rm = T)) %>%
arrange(training_module, desc(Edits))
knitr::kable(tAn, format = "html") %>%
kable_styling(full_width = F, position = "left")
training_module | Users | Edits |
---|---|---|
artikel-bewerten | 7 | 36 |
editieren-basiswissen | 6 | 36 |
wikipedia-basiswissen | 20 | 38 |
NA | 111 | 187 |
print(paste0("In total, the users who took the Training Module made ",
sum(tAn$Edits[1:3]),
" edits, which is ",
round(sum(tAn$Edits[1:3])/sum(tAn$Edits)*100, 2),
"% of all edits."
))
[1] "In total, the users who took the Training Module made 110 edits, which is 37.04% of all edits."
print(paste0("On the other hand, the users who did not take the Training Module made ",
tAn$Edits[4],
" edits, which is ",
round(tAn$Edits[4]/sum(tAn$Edits)*100, 2),
"% of all edits."
))
[1] "On the other hand, the users who did not take the Training Module made 187 edits, which is 62.96% of all edits."