NOTE: Feedback should be send to goran.milovanovic_ext@wikimedia.de
.
NOTE: The campaign will be run from … to … of October 2017.
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
data.table 1.10.4
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
Attaching package: ‘reshape2’
The following objects are masked from ‘package:data.table’:
dcast, melt
The following object is masked from ‘package:tidyr’:
smiths
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Loading required package: bsts
Loading required package: BoomSpikeSlab
Loading required package: Boom
Loading required package: MASS
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Attaching package: ‘Boom’
The following object is masked from ‘package:stats’:
rWishart
Loading required package: xts
Attaching package: ‘xts’
The following objects are masked from ‘package:data.table’:
first, last
The following objects are masked from ‘package:dplyr’:
first, last
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script abc2017_PROD_OverallDailyUpdate.R
on stat1005.eqiad.wmnet, collecting the data as .tsv
and .csv
files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_OverallDailyUpdate.R
.
### --- Script: abc2017_PROD_OverallDailyUpdate.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_OverallDailyUpdate.R
### --- The script collects and wrangles all datasets
### --- for the WMDE Autumn Banner Campaign 2017.
### --- Goran S. Milovanovic, Data Analyst, WMDE
### --- September 26, 2017.
### -----------------------------------------------------------------------------
### 0. Setup
### -----------------------------------------------------------------------------
rm(list = ls())
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
startDate <- '2017-09-20'
endDate <- '2017-09-26'
bannerImpressiomnsDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017BannerImpressions/'
bannerClicksDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017BannerClicksLandingPages/'
userRegDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017UserReg/'
dailyUpdateDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017_DailyUpdate/'
### -----------------------------------------------------------------------------
### 1. Banner Impressions
### -----------------------------------------------------------------------------
### --- Campaign Banner Tags:
# - (1) ?campaign=wmde_abc2017_bt1 - banner for Specific Task 1;
# - (2) ?campaign=wmde_abc2017_bt2 - banner for Specific Task 2;
# - (3) ?campaign=wmde_abc2017_bt3 - banner for Specific Task 3;
# - (4) ?campaign=wmde_abc2017_gib_lp - banner for the General Invitation
# - which leads to the Landing Page upon click;
# - (5) ?campaign=wmde_abc2017_gib_rg - banner for the General Invitation
# which leads directly to Registration upon click.
### --- from wmf.webrequest, run the following script on stat 1005
### --- to collect banner impressions
### --- Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_BannerImpressions.R
### --- HiveQL for everything with ?campaign like "wmde_abc2017"
### --- and then look for the desired tags.
### --- loop over date range, create query, fetch, and store
# dateRange <- seq(from = as.Date(startDate),
# to = as.Date(endDate),
# by = 'day')
# dateRange <- as.character(dateRange)
# # - set outDir
# outDir <- bannerImpressiomnsDir
# # - store query dir:
# qDir <- bannerImpressiomnsDir
# setwd(qDir)
# # - set HiveQL query dir:
# for (i in 1:length(dateRange)) {
# # - construct HiveQL query:
# y <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][1])
# m <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][2])
# d <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][3])
# q <- paste(
# "USE wmf;
# SELECT uri_query FROM webrequest
# WHERE uri_host = 'de.wikipedia.org'
# AND uri_path = '/beacon/impression'
# AND year = ", y,
# " AND month = ", m,
# " AND day = ", d,
# sep = "")
# # - write hql
# write(q, 'abc2017_BannerImpressions.hql')
# # - prepare output file:
# fileName <- "abc2017_BannerImpressions_"
# fileName <- paste0(fileName, dateRange[i], ".tsv")
# fileName <- paste0(outDir, "/", fileName)
# # - execute hql script:
# hiveArgs <-
# 'beeline -f'
# hiveInput <- paste0('abc2017_BannerImpressions.hql > ',
# fileName)
# # - command:
# hiveCommand <- paste(hiveArgs, hiveInput)
# system(command = hiveCommand, wait = TRUE)
# }
### -----------------------------------------------------------------------------
### 2. Banner Clicks and Landing Page Views
### -----------------------------------------------------------------------------
### --- campaign URL
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Entdeckungen-sortieren?campaign=wmde2017summer1&wmdesource=bannerclick
### --- Campaign Banner Tags:
# - (1) ?campaign=wmde_abc2017_bt1 - banner for Specific Task 1;
# - (2) ?campaign=wmde_abc2017_bt2 - banner for Specific Task 2;
# - (3) ?campaign=wmde_abc2017_bt3 - banner for Specific Task 3;
# - (4) ?campaign=wmde_abc2017_gib_lp - banner for the General Invitation
# - which leads to the Landing Page upon click;
# - (5) ?campaign=wmde_abc2017_gib_rg - banner for the General Invitation
# which leads directly to Registration upon click.
### --- Landing/Registration pages:
# - Landing Page, Specific Tasks, Banner bt1:
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren?campaign=wmde_abc2017_bt1
# - Landing Page, Specific Tasks, Banner bt2:
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren?campaign=wmde_abc2017_bt2
# - Landing Page, Specific Tasks, Banner bt3:
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren?campaign=wmde_abc2017_bt3
# - Landing Page, General, Banner gib_lp
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit?campaign=wmde_abc2017_gib_lp
# - Registration Page, banner gib_rg
# - https://de.wikipedia.org/wiki/Spezial:Benutzerkonto_anlegen?campaign=wmde_abc2017_gib_rg
# campaignQuery <- c('campaign=wmde_abc2017_bt1&wmdesource=bannerclick',
# 'campaign=wmde_abc2017_bt2&wmdesource=bannerclick',
# 'campaign=wmde_abc2017_bt3&wmdesource=bannerclick',
# 'campaign=wmde_abc2017_gib_lp&wmdesource=bannerclick',
# 'campaign=wmde_abc2017_gib_rg&wmdesource=bannerclick')
### --- loop over date range, create query, fetch, and store
dateRange <- seq(from = as.Date(startDate),
to = as.Date(endDate),
by = 'day')
dateRange <- as.character(dateRange)
# - set outDir
outDir <- bannerClicksDir
# - store query dir:
qDir <- bannerClicksDir
setwd(qDir)
# - set HiveQL query dir:
for (i in 1:length(dateRange)) {
# - construct HiveQL query:
y <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][1])
m <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][2])
d <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][3])
q <- paste(
"USE wmf;
SELECT uri_path, uri_query, referer FROM webrequest
WHERE (uri_host = 'de.wikipedia.org'
AND (uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren' OR uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit' OR uri_path = '/wiki/Spezial:Benutzerkonto_anlegen')
AND year = ", y,
" AND month = ", m,
" AND day = ", d, ");",
sep = "")
# - write hql
write(q, 'abc2017_BannerClicks.hql')
# - prepare output file:
fileName <- "abc2017_BannerClicks_"
fileName <- paste0(fileName, dateRange[i], ".tsv")
fileName <- paste0(outDir, "/", fileName)
# - execute hql script:
hiveArgs <-
'beeline -f'
hiveInput <- paste0('abc2017_BannerClicks.hql > ',
fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
}
### --- Wrangle this dataset:
### --- Landing pages:
specTaskPage <- '/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren'
genInvPage <- '/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit'
regPage <- '/wiki/Spezial:Benutzerkonto_anlegen'
### --- Banner tags:
specTaskBanner1 <- '?campaign=wmde_abc2017_bt1'
specTaskBanner2 <- '?campaign=wmde_abc2017_bt2'
specTaskBanner3 <- '?campaign=wmde_abc2017_bt3'
genInvPage_rg <- '?campaign=wmde_abc2017_gib_rg'
genInvPage_lp <- '?campaign=wmde_abc2017_gib_lp'
### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
dS <- readLines(lF[i], n = -1)
dS <- dS[8:(length(dS)-2)]
if (length(dS > 0)) {
c <- c + 1
dS <- lapply(dS, function(x) {
dat <- strsplit(x, split = "\t", fixed = T)[[1]]
data.frame(page = dat[1], banner = dat[2], refer = dat[3], stringsAsFactors = F)
})
}
dS <- rbindlist(dS)
dS$date <- strsplit(
strsplit(lF[i], split = "_", fixed = T)[[1]][3],
split = ".",
fixed = T)[[1]][1]
dataSet[[c]] <- dS
rm(dS); gc()
}
dataSet <- rbindlist(dataSet)
# - replace values:
dataSet$page <- sapply(dataSet$page, function(x) {
strsplit(x, split = "/", fixed = T)[[1]][length(strsplit(x, split = "/", fixed = T)[[1]])]
})
dataSet$banner[which(dataSet$banner %in% specTaskBanner1)] <- 'BT1'
dataSet$banner[which(dataSet$banner %in% specTaskBanner2)] <- 'BT2'
dataSet$banner[which(dataSet$banner %in% specTaskBanner3)] <- 'BT3'
dataSet$banner[which(dataSet$banner %in% genInvPage_rg)] <- 'GIP_RG'
dataSet$banner[which(dataSet$banner %in% genInvPage_lp)] <- 'GIP_LP'
dataSet$banner <- paste(dataSet$banner, "_click", sep = "")
dataSet$banner[which(!(dataSet$banner %in% c('BT1_click', 'BT2_click', 'BT3_click', 'GIP_RG_click', 'GIP_LP_click')))] <-
'Other'
colnames(dataSet) <- c('Page', 'Source', 'Referer', 'Date')
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Source == 'Other'] <-
str_extract(dataSet$Referer[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Source == 'Other'],
"campaign=wmde_abc(.)+$")
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_bt1", dataSet$Source)] <- "Fehler_korrigieren_BT1"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_bt2", dataSet$Source)] <- "Fehler_korrigieren_BT2"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_bt3", dataSet$Source)] <- "Fehler_korrigieren_BT3"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_gib_rg", dataSet$Source)] <- "GIP_RG_click"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_gib_lp", dataSet$Source)] <- "Mach_mit"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Referer %in% '-'] <- "Unknown"
dataSet$Source[dataSet$Page %in% 'Mach_mit' & dataSet$Referer %in% '-'] <- "Unknown"
dataSet$Source[dataSet$Page %in% 'Fehler_korrigieren' & dataSet$Referer %in% '-'] <- "Unknown"
dataSet$Source[is.na(dataSet$Source)] <- 'Other'
dataSet$Referer <- NULL
### --- store abc_BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "abc_BannerClicksPageViews_Update.csv")
### -----------------------------------------------------------------------------
### 3. User Registration Data
### -----------------------------------------------------------------------------
# - ServerSideAccountCreation_5487345
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-store.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20170920000000));\" > /home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017_DailyUpdate/abc2017_userRegistrations.tsv"
system(command = qCommand, wait = TRUE)
### -----------------------------------------------------------------------------
### 4. Guided Tour Data
### -----------------------------------------------------------------------------
### -----------------------------------------------------------------------------
### 5. User Edits Data
### -----------------------------------------------------------------------------
### --- Campaign User Registrations
lF <- list.files(path = "./_dailyUpdateDATA/")
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(paste("./userRegDATA/", lF, sep = ""),
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
EOF within quoted string
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 5, 6)
d <- substr(x, 7, 8)
paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
dplyr::select(id, event_userId, timestamp, event_isSelfMade, event_campaign) %>%
filter(event_isSelfMade == 1)
regPlotSet <- userReg %>%
dplyr::filter(event_isSelfMade == 1 & grepl("wmde_abc2017", event_campaign)) %>%
group_by(event_campaign, timestamp) %>%
summarise(Registrations = n()) %>%
arrange(timestamp)
colnames(regPlotSet) <- c('Campaign', 'Date', 'Registrations')
regPlotSet$Campaign <- sapply(regPlotSet$Campaign, function(x) {
out <- toupper(strsplit(x, split = "_", fixed = T)[[1]][3])
})
ggplot(regPlotSet, aes(x = Date,
y = Registrations,
group = Campaign,
color = Campaign,
fill = Campaign,
label = Registrations)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5) +
scale_fill_manual("legend", values = campaignChartColors) +
scale_color_manual("legend", values = campaignChartColors) +
ggtitle('Autumn Banner Campaign 2017: User Registrations') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
NOTE: the following code is not fully reproducible from this Report. The data are collected by running the script abc2017_PROD_GuidedTour.R
on stat1005.eqiad.wmnet, collecting the data as .csv
files, copying manually, and processing locally.
### --- NOTE:
# For the guided tours it would be great to have two numbers:
#
# Number of people who started the guided tour / completed the first step /and than did whatever (exited, or not)
# Number of people who completed the guided tour
# log.GuidedTourExited_8690566
# https://meta.wikimedia.org/wiki/Schema:GuidedTourExited
# ALSO TAKE A LOOK AT: log.GuidedTourGuiderHidden_8690549
# ALSO TAKE A LOOK AT: https://meta.wikimedia.org/wiki/Schema:GuidedTourGuiderHidden
### 2.1 Collect User Registration Data
### --- Script: abc2017_PROD_GuidedTour.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_Registrations.R
# - ServerSideAccountCreation_5487345
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-store.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20170920000000));\" > /home/goransm/_miscWMDE/abc2017_DataOUT/abc_test/abc2017_guidedTours.tsv"
system(command = qCommand, wait = TRUE)