Feedback should be send to goran.milovanovic_ext@wikimedia.de
.
The campaign is run from 2017/10/05 to 2017/10/13.
CURRENT UPDATE: Complete dataset, collected on 2017/10/14.
NOTE: the Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script abc2017_PROD_OverallDailyUpdate.R
on stat1005.eqiad.wmnet, collecting the data as .tsv
and .csv
files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_OverallDailyUpdate.R
.
### --- Script: abc2017_PROD_OverallDailyUpdate.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_OverallDailyUpdate.R
### --- The script collects and wrangles all datasets
### --- for the WMDE Autumn Banner Campaign 2017.
### --- Goran S. Milovanovic, Data Analyst, WMDE
### --- September 26, 2017.
### -----------------------------------------------------------------------------
### 0. Setup
### -----------------------------------------------------------------------------
rm(list = ls())
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
startDate <- '2017-10-05'
endDate <- '2017-10-14'
bannerImpressionsDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017BannerImpressions/'
bannerClicksDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017BannerClicksLandingPages/'
dailyUpdateDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017_DailyUpdate/'
### -----------------------------------------------------------------------------
### 1. Banner Impressions
### -----------------------------------------------------------------------------
### --- Campaign Banner Tags:
# - (1) ?campaign=wmde_abc2017_bt1 - banner for Specific Task 1;
# - (2) ?campaign=wmde_abc2017_bt2 - banner for Specific Task 2;
# - (3) ?campaign=wmde_abc2017_bt3 - banner for Specific Task 3;
# - (4) ?campaign=wmde_abc2017_gib_lp - banner for the General Invitation
# - which leads to the Landing Page upon click;
# - (5) ?campaign=wmde_abc2017_gib_rg - banner for the General Invitation
# which leads directly to Registration upon click.
### --- HiveQL for everything from
### --- uri_host = 'de.wikipedia.org' and
### --- uri_path = '/beacon/impression'
### --- and then look up the desired tags.
### --- loop over date range, create query, fetch, and store
dateRange <- seq.POSIXt(from = as.POSIXlt(startDate, tz = "CET"),
to = as.POSIXlt(endDate, tz = "CET"),
by = 'hour')
dateRange <- dateRange[-length(dateRange)]
cetDateRange <- as.character(dateRange)
cetDateRange <- sapply(cetDateRange, function(x) {
strsplit(x, split = " ", fixed = T)[[1]][1]
})
names(dateRange) <- cetDateRange
dateRange <- as.POSIXlt(dateRange, tz = "UTC")
# - up to today:
today <- as.POSIXlt(Sys.time(), tz = "UTC")
w <- which(dateRange > today)
if (length(w) > 0) {
dateRange <- dateRange[-w]
}
dR <- list()
for (i in 1:length(dateRange)) {
dR[[i]] <- data.frame(
cetName = names(dateRange[i]),
utcYear = year(dateRange[i]),
utcMonth = month(dateRange[i]),
utcDay = mday(dateRange[i]),
utcHour = hour(dateRange[i])
)
}
dR <- rbindlist(dR)
dR <- dR %>%
group_by(cetName, utcYear, utcMonth, utcDay) %>%
summarise(utcHour = paste("hour=", utcHour, collapse = " OR ", sep = ""))
# - set outDir
outDir <- bannerImpressionsDir
setwd(outDir)
# - set HiveQL query dir:
for (i in 1:length(unique(dR$cetName))) {
wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
for (j in 1:length(wCetName)) {
# - construct HiveQL query:
y <- dR$utcYear[wCetName[j]]
m <- dR$utcMonth[wCetName[j]]
d <- dR$utcDay[wCetName[j]]
hour <- dR$utcHour[wCetName[j]]
q <- paste(
"USE wmf;
SELECT uri_query FROM webrequest
WHERE uri_host = 'de.wikipedia.org'
AND uri_path = '/beacon/impression'
AND year = ", y,
" AND month = ", m,
" AND day = ", d,
" AND (", hour, ");",
sep = "")
# - write hql
write(q, 'abc2017_BannerImpressions.hql')
# - prepare output file:
fileName <- "abc2017_BannerImpressions_"
fileName <- paste0(fileName,
as.character(unique(dR$cetName)[i]),
"_", j,
".tsv")
fileName <- paste0(outDir, fileName)
# - execute hql script:
hiveArgs <-
'beeline -f'
hiveInput <- paste0('abc2017_BannerImpressions.hql > ',
fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
}
}
### --- wrangle this dataSet
lF <- list.files()
lF <- lF[grepl(".tsv", lF, fixed = T)]
lF <- lF[grepl("Impressions", lF, fixed = T)]
### --- load Dataset:
# - count non-empty files:
c <- 0
dataSet <- list()
for (i in 1:length(lF)) {
dS <- readLines(lF[i], n = -1)
dS <- dS[8:(length(dS) - 1)]
if (length(dS) > 0) {
c <- c + 1
dS <- data.frame(query = dS,
date = strsplit(lF[i], split = "_", fixed = T)[[1]][3],
stringsAsFactors = F)
dataSet[[c]] <- dS
rm(dS); gc()
}
}
dataSet <- rbindlist(dataSet)
dataSet <- filter(dataSet,
grepl("WMDE_editor_campaign_autumn17",
query)
)
# - produce analytics dataset
banner <- str_extract(dataSet$query, "banner=(_|[[:alnum:]])+&")
banner <- gsub("banner=", "", banner, fixed = T)
banner <- gsub("&", "", banner, fixed = T)
impressionRate <- str_extract(dataSet$query, "recordImpressionSampleRate=([[:digit:]]|\\.)+&")
impressionRate <- gsub("recordImpressionSampleRate=", "", impressionRate, fixed = T)
impressionRate <- gsub("&", "", impressionRate, fixed = T)
impressionRate <- as.numeric(impressionRate)
status <- str_extract(dataSet$query, "status=([[:alnum:]]|[[:punct:]])+&")
status <- gsub("status=", "", status)
status <- gsub("&", "", status)
statusCode <- str_extract(dataSet$query, "statusCode=[[:digit:]]&")
statusCode <- gsub("statusCode=", "", statusCode)
statusCode <- gsub("&", "", statusCode)
campaignCategory <- str_extract(dataSet$query, "campaignCategory=[[:alnum:]]+&")
campaignCategory <- gsub("campaignCategory=", "", campaignCategory)
campaignCategory <- gsub("&", "", campaignCategory)
result <- str_extract(dataSet$query, "result=[[:alnum:]]+")
result <- gsub("result=", "", result)
result <- gsub("&", "", result)
qdate <- dataSet$date
# - as.data.frame()
dataSet <- data.frame(banner = banner,
impressionRate = impressionRate,
status = status,
statusCode = statusCode,
campaignCategory = campaignCategory,
result = result,
date = qdate,
stringsAsFactors = F)
# - store analytics dataset:
setwd(dailyUpdateDir)
dataSet <- dataSet[!is.na(dataSet$banner), ]
write.csv(dataSet, 'abc_BannerImpressions_update.csv')
### -----------------------------------------------------------------------------
### 2. Banner Clicks and Landing Page Views
### -----------------------------------------------------------------------------
### --- Landing/Registration pages:
# - Landing Page, Specific Tasks, Banners bt1, bt2, bt3
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/JetztMitmachen
# - Specific bt banner anchors:
# - bt1 - #Bebildern, bt2 - Aktualisieren, bt3 - #Belegen
# - Landing Page, General, Banner gib_lp
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit
# - Registration Page, banner gib_rg
# - https://de.wikipedia.org/wiki/Spezial:Benutzerkonto_anlegen
# - set outDir
outDir <- bannerClicksDir
setwd(outDir)
for (i in 1:length(unique(dR$cetName))) {
wCetName <- which(dR$cetName %in% unique(dR$cetName)[i])
for (j in 1:length(wCetName)) {
# - construct HiveQL query:
y <- dR$utcYear[wCetName[j]]
m <- dR$utcMonth[wCetName[j]]
d <- dR$utcDay[wCetName[j]]
hour <- dR$utcHour[wCetName[j]]
q <- paste(
"USE wmf;
SELECT uri_path, uri_query, referer FROM webrequest
WHERE uri_host = 'de.wikipedia.org'
AND (uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/JetztMitmachen' OR uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit' OR uri_path = '/wiki/Spezial:Benutzerkonto_anlegen')
AND year = ", y,
" AND month = ", m,
" AND day = ", d,
" AND (", hour, ");",
sep = "")
# - write hql
write(q, 'abc2017_BannerClicks.hql')
# - prepare output file:
fileName <- "abc2017_BannerClicks_"
fileName <- paste0(fileName,
as.character(unique(dR$cetName)[i]),
"_", j,
".tsv")
fileName <- paste0(outDir, fileName)
# - execute hql script:
hiveArgs <-
'beeline -f'
hiveInput <- paste0('abc2017_BannerClicks.hql > ',
fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
}
}
### --- Wrangle this dataset:
### --- Landing pages:
specTaskPage <- '/wiki/Wikipedia:Wikimedia_Deutschland/JetztMitmachen'
genInvPage <- '/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit'
regPage <- '/wiki/Spezial:Benutzerkonto_anlegen'
### --- Banner tags:
specTaskBanner1 <- '?campaign=wmde_abc2017_bt1'
specTaskBanner2 <- '?campaign=wmde_abc2017_bt2'
specTaskBanner3 <- '?campaign=wmde_abc2017_bt3'
genInvPage_rg <- '?campaign=wmde_abc2017_gib_rg'
genInvPage_lp <- '?campaign=wmde_abc2017_gib_lp'
### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files()
lF <- lF[grepl('.tsv', lF, fixed = T)]
lF <- lF[grepl('Clicks', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
dS <- readLines(lF[i], n = -1)
dS <- dS[8:(length(dS) - 2)]
if (length(dS > 0)) {
c <- c + 1
dS <- lapply(dS, function(x) {
dat <- strsplit(x, split = "\t", fixed = T)[[1]]
data.frame(page = dat[1], banner = dat[2], refer = dat[3], stringsAsFactors = F)
})
}
dS <- rbindlist(dS)
dS$date <- strsplit(lF[i], split = "_", fixed = T)[[1]][3]
dataSet[[c]] <- dS
rm(dS); gc()
}
dataSet <- rbindlist(dataSet)
# - replace values:
dataSet$page <- sapply(dataSet$page, function(x) {
strsplit(x, split = "/", fixed = T)[[1]][length(strsplit(x, split = "/", fixed = T)[[1]])]
})
dataSet$banner[which(dataSet$banner %in% specTaskBanner1)] <- 'BT1'
dataSet$banner[which(dataSet$banner %in% specTaskBanner2)] <- 'BT2'
dataSet$banner[which(dataSet$banner %in% specTaskBanner3)] <- 'BT3'
dataSet$banner[which(dataSet$banner %in% genInvPage_rg)] <- 'GIP_RG'
dataSet$banner[which(dataSet$banner %in% genInvPage_lp)] <- 'GIP_LP'
dataSet$banner <- paste(dataSet$banner, "_click", sep = "")
dataSet$banner[which(!(dataSet$banner %in% c('BT1_click',
'BT2_click',
'BT3_click',
'GIP_RG_click',
'GIP_LP_click')))] <- 'Other'
colnames(dataSet) <- c('Page', 'Source', 'Referer', 'Date')
### --- store abc_BannerClicksPageViews_Update.csv
write.csv(dataSet, file = "abc_BannerClicksPageViews_Non-Refined.csv")
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Source == 'Other'] <-
str_extract(dataSet$Referer[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Source == 'Other'],
"campaign=wmde_abc(.)+$")
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_bt1", dataSet$Source)] <- "JetztMitmachen_BT1"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_bt2", dataSet$Source)] <- "JetztMitmachen_BT2"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_bt3", dataSet$Source)] <- "JetztMitmachen_BT3"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_gib_rg", dataSet$Source)] <- "GIP_RG_click"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_gib_lp", dataSet$Source)] <- "Mach_mit"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Referer %in% '-'] <- "Unknown"
dataSet$Source[dataSet$Page %in% 'Mach_mit' & dataSet$Referer %in% '-'] <- "Unknown"
dataSet$Source[dataSet$Page %in% 'JetztMitmachen' & dataSet$Referer %in% '-'] <- "Unknown"
dataSet$Source[is.na(dataSet$Source)] <- 'Other'
dataSet$Referer <- NULL
### --- store abc_BannerClicksPageViews_Update.csv
setwd(dailyUpdateDir)
write.csv(dataSet, file = "abc_BannerClicksPageViews_Update.csv")
### -----------------------------------------------------------------------------
### 3. User Registration Data
### -----------------------------------------------------------------------------
# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - ServerSideAccountCreation_5487345
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-store.eqiad.wmnet -A -e \"select * from log.ServerSideAccountCreation_5487345 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20171004220000));\" > /home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017_DailyUpdate/abc2017_userRegistrations.tsv"
system(command = qCommand, wait = TRUE)
### -----------------------------------------------------------------------------
### 4. Guided Tour Data
### -----------------------------------------------------------------------------
# - NOTE: UTC timestamps - adjustment for CE(S)T introduced.
# - ServerSideAccountCreation_5487345
qCommand <- "mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-store.eqiad.wmnet -A -e \"select * from log.GuidedTourExited_8690566 where ((webHost = 'de.wikipedia.org') and (timestamp >= 20171004220000));\" > /home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017_DailyUpdate/abc2017_guidedTours.tsv"
system(command = qCommand, wait = TRUE)
### -----------------------------------------------------------------------------
### 5. User Edits Data
### -----------------------------------------------------------------------------
# - get user IDs from registered:
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF,
quote = "",
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
userReg <- userReg %>%
dplyr::select(event_userId, event_isSelfMade) %>%
filter(event_isSelfMade == 1)
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT COUNT(*) as edits, rev_user FROM revision WHERE rev_user IN (',
paste(uid, collapse = ", "),
') GROUP BY rev_user;',
sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
paste('"', sqlQuery, '" > ', sep = ""),
'/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017_DailyUpdate/abc2017_userEdits.tsv', sep = "")
system(command = mySqlCommand,
wait = TRUE)
### --- Campaign User Registrations
lF <- list.files(path = "./_dailyUpdateDATA/")
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(paste("./_dailyUpdateDATA/", lF, sep = ""),
quote = "",
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
userReg$timestamp <- as.character(userReg$timestamp)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 5, 6)
d <- substr(x, 7, 8)
part1Date <- paste(y, m, d, sep = "-")
hr <- substr(x, 9, 10)
mi <- substr(x, 11, 12)
se <- substr(x, 13, 14)
part2Date <- paste(hr, mi, se, sep = ":")
paste(part1Date, part2Date, sep = " ")
})
userReg$timestamp <- as.POSIXct(userReg$timestamp, tz = "UTC")
timeDiff <-
as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
userReg$timestamp <- as.character(userReg$timestamp + timeDiff)
userReg$timestamp <- sapply(userReg$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 6, 7)
d <- substr(x, 9, 10)
paste(y, m, d, sep = "-")
})
userReg <- userReg %>%
dplyr::select(id, event_userId, timestamp, event_isSelfMade, event_campaign) %>%
filter(event_isSelfMade == 1 & grepl("wmde_abc2017", event_campaign))
print(paste(dim(userReg)[1], " users have registered via the Campaign."))
[1] "1054 users have registered via the Campaign."
regPlotSet <- userReg %>%
group_by(event_campaign, timestamp) %>%
summarise(Registrations = n()) %>%
arrange(timestamp)
colnames(regPlotSet) <- c('Campaign', 'Date', 'Registrations')
regPlotSet$Campaign <- factor(toupper(gsub("wmde_abc2017_", "", regPlotSet$Campaign)))
ggplot(regPlotSet, aes(x = Date,
y = Registrations,
group = Campaign,
color = Campaign,
fill = Campaign)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5) +
scale_fill_manual("legend", values = campaignChartColors) +
scale_color_manual("legend", values = campaignChartColors) +
scale_y_continuous(labels = comma) +
ggtitle('Autumn Banner Campaign 2017: User Registrations (daily)') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
### --- Full Dataset (Table Report)
datatable(regPlotSet)
regPlotSetTotal <- regPlotSet %>%
group_by(Campaign) %>%
summarise(Registrations = sum(Registrations)) %>%
arrange(Campaign)
ggplot(regPlotSetTotal, aes(x = Campaign,
y = Registrations,
group = Campaign,
color = Campaign,
fill = Campaign,
label = Registrations)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5) +
geom_label(fill = "white", color = "black") +
scale_fill_manual("legend", values = campaignChartColors) +
scale_color_manual("legend", values = campaignChartColors) +
scale_y_continuous(labels = comma) +
ggtitle('Autumn Banner Campaign 2017: User Registrations (totals)') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
regPlotSetDaily <- userReg %>%
dplyr::filter(event_isSelfMade == 1 & grepl("wmde_abc2017", event_campaign)) %>%
group_by(timestamp) %>%
summarise(Registrations = n()) %>%
arrange(timestamp)
colnames(regPlotSetDaily) <- c('Date', 'Registrations')
ggplot(regPlotSetDaily, aes(x = Date,
y = Registrations,
label = Registrations)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5,
fill = "darkblue",
color = "darkblue") +
geom_label() +
scale_y_continuous(labels = comma) +
ggtitle('Autumn Banner Campaign 2017: User Registrations') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
### --- Full Dataset (Table Report)
datatable(regPlotSet)
### --- Campaign User Registrations
gTourData <- read.table("./_dailyUpdateDATA/abc2017_guidedTours.tsv",
quote = "",
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
# - clean up: gTourData
gTourData <- gTourData[which(!duplicated(gTourData$event_userId)), ]
gTourData <- gTourData[which(!(gTourData$event_userId == 0)), ]
gTourData <- gTourData[which(gTourData$event_userId %in% userReg$event_userId), ]
gTourData$timestamp <- as.character(gTourData$timestamp)
gTourData$timestamp <- sapply(gTourData$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 5, 6)
d <- substr(x, 7, 8)
part1Date <- paste(y, m, d, sep = "-")
hr <- substr(x, 9, 10)
mi <- substr(x, 11, 12)
se <- substr(x, 13, 14)
part2Date <- paste(hr, mi, se, sep = ":")
paste(part1Date, part2Date, sep = " ")
})
gTourData$timestamp <- as.POSIXct(gTourData$timestamp, tz = "UTC")
timeDiff <-
as.POSIXct(as.character(Sys.time()), tz = "UTC") - as.POSIXct(as.character(Sys.time()), tz = "Europe/Berlin")
gTourData$timestamp <- as.character(gTourData$timestamp + timeDiff)
gTourData$timestamp <- sapply(gTourData$timestamp, function(x) {
y <- substr(x, 1, 4)
m <- substr(x, 6, 7)
d <- substr(x, 9, 10)
paste(y, m, d, sep = "-")
})
gTourData <- gTourData %>%
filter(event_tour %in% 'einfuhrung')
plotGTourData <- gTourData %>%
group_by(event_step, timestamp) %>%
summarise(Count = n())
colnames(plotGTourData) <- c('Tour Step', 'Date', 'Count')
ggplot(plotGTourData, aes(x = Date,
y = Count,
group = `Tour Step`,
color = `Tour Step`,
fill = `Tour Step`)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5) +
scale_y_continuous(labels = comma) +
ggtitle('Autumn Banner Campaign 2017: Guided Tour Steps (daily)') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
### --- Full Dataset (Table Report)
datatable(plotGTourData)
### --- Campaign User Registrations
plotGTourDataTotal <- plotGTourData %>%
group_by(`Tour Step`) %>%
summarise(Count = sum(Count)) %>%
arrange(desc(Count))
plotGTourDataTotal$`Tour Step` <-
factor(plotGTourDataTotal$`Tour Step`,
levels = plotGTourDataTotal$`Tour Step`[order(plotGTourDataTotal$Count)])
ggplot(plotGTourDataTotal, aes(x = `Tour Step`,
y = Count,
group = `Tour Step`,
color = `Tour Step`,
fill = `Tour Step`,
label = Count)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5) +
scale_y_continuous(labels = comma) +
geom_label(fill = "white", color = "black") +
ggtitle('Autumn Banner Campaign 2017: Guided Tour Steps (totals)') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank()) +
theme(legend.position = 'None')
Number of users not exiting the Guided Tour:
nRegistered <- dim(userReg)[1]
nExitedGT <- dim(gTourData)[1]
print(paste(nRegistered - nExitedGT,
" users out of ",
nRegistered,
" (", round((nRegistered - nExitedGT)/nRegistered*100, 2), "%) did not exit the Campaign Guided Tour",
sep = ""))
[1] "667 users out of 1054 (63.28%) did not exit the Campaign Guided Tour"
How many users exit the Guided Tour at the initial step? NOTE: The Others
category encompasses all users who did not exit at the initial step; they have either exited the Guided Tour later on or completed the tour.
exGTdata <- plotGTourData %>%
group_by(`Tour Step`) %>%
summarise(Count = sum(Count))
exGT1 <- exGTdata$Count[exGTdata$`Tour Step` %in% 'willkommen']
exGT2 <- nRegistered - exGT1
exGTourStep1 <- paste(exGT1, " (", round(exGT1/(exGT1 + exGT2)*100, 2), "%)", sep = "")
exGTourStep2 <- paste(exGT2, " (", round(exGT2/(exGT1 + exGT2)*100, 2), "%)", sep = "")
exGTour1 <- data.frame(`Users who exited at Step 1` = exGTourStep1,
`Others` = exGTourStep2,
check.names = F,
stringsAsFactors = F)
knitr::kable(exGTour1, format = "html") %>%
kable_styling(full_width = F, position = "left")
Users who exited at Step 1 | Others |
---|---|
129 (12.24%) | 925 (87.76%) |
# - determine userIDs
userReg <- userReg %>%
dplyr::select(id, event_userId, timestamp, event_isSelfMade, event_campaign) %>%
filter(event_isSelfMade == 1 & grepl("wmde_abc2017", event_campaign))
userIDs <- userReg$event_userId
editData <- read.table("./_dailyUpdateDATA/abc2017_userEdits.tsv",
sep = "\t",
quote = "",
header = T,
check.names = F,
stringsAsFactors = F) %>%
filter(rev_user %in% userIDs)
plEditData <- editData %>%
group_by(edits) %>%
summarise(Count = n())
colnames(plEditData) <- c('Num.Edits', 'Count')
print(paste(sum(plEditData$Count),
" out of ",
dim(userReg)[1],
" registered users (",
round(sum(plEditData$Count)/dim(userReg)[1]*100, 2),
"%) have made at least one edit.",
sep = "")
)
[1] "223 out of 1054 registered users (21.16%) have made at least one edit."
The y-axis represents log(Number of users)
to make the line plot more readable, while the data labels present exact user counts alongside the number of edits made.
ggplot(plEditData, aes(x = `Num.Edits`,
y = log(Count),
label = paste(Count, " (", `Num.Edits`, " edits)", sep = ""))
) +
geom_path(size = .25, color = "darkblue") +
geom_point(size = 1.5, color = "darkblue") +
geom_point(size = 1, color = "white") +
geom_text_repel(size = 3) +
scale_y_continuous(labels = comma) +
ylab('log(Num. of Users)') + xlab('Number of Edits') +
ggtitle('Autumn Banner Campaign 2017: User Edits Distribution') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
editCampaign <- left_join(editData, userReg,
by = c("rev_user" = "event_userId")) %>%
group_by(event_campaign) %>%
summarise(Edits = sum(edits))
colnames(editCampaign) <- c('Campaign', 'Edits')
# - recode:
editCampaign$Campaign <- recode(editCampaign$Campaign,
'wmde_abc2017_bt1' = 'BT1',
'wmde_abc2017_bt2' = 'BT2',
'wmde_abc2017_bt3' = 'BT3',
'wmde_abc2017_gib_rg' = 'GIB_RG',
'wmde_abc2017_gib_lp' = 'GIB_LP'
)
editCampaign$Campaign <- factor(editCampaign$Campaign,
levels = names(campaignChartColors))
ggplot(editCampaign, aes(x = Campaign,
y = Edits,
fill = Campaign,
color = Campaign,
label = Edits)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5) +
geom_label(fill = "white", color = "black") +
scale_y_continuous(labels = comma) +
scale_fill_manual("legend", values = campaignChartColors) +
scale_color_manual("legend", values = campaignChartColors) +
ggtitle('Autumn Banner Campaign 2017: User Edits per Campaign') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
### --- Full Dataset (Table Report)
datatable(editCampaign)
The percent of users who made any edits at all per campaign:
# - the dataset
editsMade <- left_join(userReg, editData,
by = c('event_userId' = 'rev_user'))
editsMade$event_campaign <- toupper(gsub("wmde_abc2017_", "", editsMade$event_campaign, fixed = T))
editsMade$edits[is.na(editsMade$edits)] <- 0
editsMade$Edit <- ifelse(editsMade$edits > 0, 'Edited', 'No edits')
editsMade <- dplyr::select(editsMade,
event_campaign, Edit)
colnames(editsMade)[1] <- 'Campaign'
editsMade <- editsMade %>%
group_by(Campaign, Edit) %>%
summarise(Count = n())
editsMade <- editsMade %>%
group_by(Campaign) %>%
mutate(Count = round(Count/sum(Count)*100, 2))
editsMade$Edit <- factor(editsMade$Edit, levels = c('Edited', 'No edits'))
ggplot(editsMade, aes(x = '', y = Count,
fill = Edit,
color = Edit,
group = Edit,
label = Count)) +
geom_bar(position = "stack",
stat = "identity",
width = 1,
color = "black") +
coord_polar("y", start = 0) +
facet_wrap(~ Campaign) +
scale_fill_manual("legend", values = c('firebrick', 'white')) +
scale_color_manual("legend", values = c('firebrick', 'white')) +
ggtitle('Autumn Banner Campaign 2017: User Edits Distributions per Campaign') +
xlab("") + ylab("Percent Edited") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
In the following table: No edits
: users with zero edits, Edited
: number of user who made any edits at all, 1 - 4 edits
: number of users with 1 - 4 edits, 5 - 10 edits
: number of users with 5 - 10 edits, and >10 edits
: number of user with more than ten edits.
### --- Full Dataset (Table Report)
pltEdits <- as.tbl(editData) %>%
dplyr::group_by(edits) %>%
count()
edits0 <- dim(userReg)[1] - sum(plEditData$Count)
edits <- sum(pltEdits$n[pltEdits$edits > 0])
edits1_4 <- sum(pltEdits$n[pltEdits$edits >= 1 & pltEdits$edits <= 4])
edits5_10 <- sum(pltEdits$n[pltEdits$edits >= 5 & pltEdits$edits <= 10])
edits10 <- sum(pltEdits$n[pltEdits$edits > 10])
editClasses <- data.frame(`No edits` = edits0,
`Edited` = edits,
`1 - 4 edits` = edits1_4,
`5 - 10 edits` = edits5_10,
`> 10 edits` = edits10,
check.names = F,
stringsAsFactors = F)
knitr::kable(editClasses, format = "html") %>%
kable_styling(full_width = F, position = "left")
No edits | Edited | 1 - 4 edits | 5 - 10 edits | > 10 edits |
---|---|---|---|---|
831 | 223 | 194 | 22 | 7 |
How many edits where made by users who did and did not exit the Campaign Guided Tour?
editGTData <- left_join(editData, userReg, by = c('rev_user' = 'event_userId'))
editGTData <- left_join(editGTData, gTourData, by = c('rev_user' = 'event_userId'))
exTourEdits <- sum(editGTData$edits[!is.na(editGTData$event_tour)])
notExTourEdits <- sum(editGTData$edits[is.na(editGTData$event_tour)])
exitedTourEdits <- paste(exTourEdits,
" (", round(exTourEdits/(exTourEdits + notExTourEdits)*100, 2), "%)",
sep = "")
notExitedTourEdits <- paste(notExTourEdits,
" (", round(notExTourEdits/(exTourEdits + notExTourEdits)*100, 2), "%)",
sep = "")
gtEdits <- data.frame(`Exited GT` = exitedTourEdits,
`Did not exit GT` = notExitedTourEdits,
check.names = F,
stringsAsFactors = F)
knitr::kable(gtEdits, format = "html") %>%
kable_styling(full_width = F, position = "left")
Exited GT | Did not exit GT |
---|---|
292 (48.5%) | 310 (51.5%) |
How does exiting vs. not exiting the Campaign Guided Tour influence whether the new user will make at least one edit or not? The following contingency table presents the number of registered users who made any edits at all (vs. those did not edit) separately for those who did and did not exit the Guided Tour.
userRegGT <- left_join(userReg, gTourData,
by = 'event_userId')
userRegGT <- left_join(userRegGT, editData,
by = c('event_userId' = 'rev_user'))
# - Contingency Table:
a <- length(userRegGT$event_userId[!is.na(userRegGT$edits) & is.na(userRegGT$event_tour)])
b <- length(userRegGT$event_userId[is.na(userRegGT$edits) & is.na(userRegGT$event_tour)])
c <- length(userRegGT$event_userId[!is.na(userRegGT$edits) & !is.na(userRegGT$event_tour)])
d <- length(userRegGT$event_userId[is.na(userRegGT$edits) & !is.na(userRegGT$event_tour)])
ct <- data.frame(`Edited` = c(a, c),
`No edits` = c(b, d),
check.names = F)
rownames(ct) <- c('GT Completed', 'GT Exited')
# - deltaP:
deltaP <- a/(a+b) - c/(c+d)
if (deltaP >= 0) {
causalP <- deltaP/(1 - c/(c+d))
} else {
causalP <- -deltaP/(c/(c+d))
}
knitr::kable(ct, format = "html") %>%
kable_styling(full_width = F, position = "left")
Edited | No edits | |
---|---|---|
GT Completed | 103 | 564 |
GT Exited | 120 | 267 |
The estimate of the Causal Power (it can range from 0 = no causal influence at all, to 1 = a cause completelly sufficient to bring about its effect) of the Guided Tour to bring about any edits at all is:
ceffect <- ifelse(deltaP >= 0, 'generative effect', 'preventive effect')
print(paste('Guided Tour Causal Power: ', round(causalP, 2), ' (', ceffect, ')', sep = ""))
[1] "Guided Tour Causal Power: 0.5 (preventive effect)"
print(paste('(NOTE: with a value of a probabilistic contrast deltaP of): ', round(deltaP, 2), sep = ""))
[1] "(NOTE: with a value of a probabilistic contrast deltaP of): -0.16"
SUGGESTION: remove the Guided Tour from our future campaigns; it has an preventive effect upon the number of new user edits.
The following model provides for the removal effects of the Campaign channels in respect to whether a user has made any edits at all or not. This procedure instantiates a model of a particular campaign as a directed graph in which every node represents a campaign channel (e.g. a banner, a page view, an act of a user doing something, etc), and then computes the probabilities of transition from one to another channel. In other words, the model estimates the probabilities of taking any of the possible user journeys in the campaign. Once the model is ready, the procedure simulates a large number of user journeys to produce an estimate of the probability of conversion for each of them. In the case of our campaign we consider the event of a user making at least one edit as a conversion. When this step is completed, the procedure starts removing one by one campaign channel from the model, and each time it re-computes the conversion probability to estimate how many conversions would be lost due to the removal of a particular channel. The larger the drop in probability of conversion due to the removal of a particular channel, the larger the removal effect for that channel. Channels with larger removal effects are considered to be more important. The value of the removal effect, being a probability in itself, can vary from 0 to 1.
In this case, the campaign channels are the following events:
BT1
- Specific Task Banner wmde_abc2017_bt1
is presented;BT2
- Specific Task Banner wmde_abc2017_bt2
is presented;BT3
- Specific Task Banner wmde_abc2017_bt3
is presented;GIB
- General Inviation Banner - wmde_abc2017_gib_lp
or wmde_abc2017_gib_rg
is presented;TLP
- Specific Task Page JetztMitmachen
is viewed (note: the same as a banner click on any of the following banners: BT1
, BT2
, BT3
);GLP
- General Page Mach_mit
is viewed; (note: the same as a banner click on GIB_LP
);RP
- Registration Page Benutzerkonto_anlegen
is viewed; (note: encompasses users who transit from JetztMitmachen
or Mach_mit
, as well as banner clicks on GIB_RG
);Reg
- The act of user registration;GT
- The act of completing the Guided Tour.Important: unlike in the Bayesian A/B tests that are presented above, where the criterion for pair-wise comparisons among the campaign banners was either the number of users registered (Section 5.1A
), or the number of edits made (Section 5.1B
), here the criterion (i.e. the definition of conversion, if you prefer) is whether a user has made any edits at all. The reason that motivates this criterion, and not a more strict criterion of making >= 10 edits
, is simply because there are only several users who have registered via this campaign and made more than ten edits until now. Removal Effects. The Removal Effect for a campaign channel represents the change in probability that a conversion would obtain if the respective channel was removed from the campaign. Once again, given that conversion here means a user making at least one edit, the removal effects tells us how much would the probability of obtaining at least one edit from a user drop if the respective campaign channel was removed. TECHNICAL NOTE: a Markov model of order 4 was used, with 1e8
total simulation runs from the transition matrix.
### --- Banner -> Exit paths --- ###
### --- Definition: N(Banner Impressions) - N(BannerClicks == Landing Page Views)
# - define: N(Banner Impressions)
bImp <- banImpSet %>%
group_by(Banner) %>%
summarise(Count = sum(Count))
nBT1 <- bImp$Count[which(bImp$Banner %in% 'BT1')]
nBT2 <- bImp$Count[which(bImp$Banner %in% 'BT2')]
nBT3 <- bImp$Count[which(bImp$Banner %in% 'BT3')]
nGIB <- bImp$Count[which(bImp$Banner %in% 'GIB_LP')] + bImp$Count[which(bImp$Banner %in% 'GIB_RG')]
# - define: N(BannerClicks/PageViews)
bClick <- clickPlotSet %>%
group_by(Source) %>%
summarise(Count = sum(Count))
bClick$Source <- gsub("_click", "", bClick$Source, fixed = T)
# - define: N(BannerImpressions) - N(BannerClicks/PageViews)
nBT1 <- nBT1 - bClick$Count[which(bClick$Source %in% 'BT1')]
nBT2 <- nBT2 - bClick$Count[which(bClick$Source %in% 'BT2')]
nBT3 <- nBT3 - bClick$Count[which(bClick$Source %in% 'BT3')]
nGIB <- nGIB - bClick$Count[which(bClick$Source %in% 'GIB_LP')] + bClick$Count[which(bClick$Source %in% 'GIB_RG')]
### --- Banner -> Landing Page -> Exit paths --- ###
### --- N(Banner Clicks == Landing Page Views) - N(Registration Page Views)
### --- NOTE: TLP == Task Landing Page (JetztMitmachen), GLP == General Landing Page (Mach mit)
nBT1_TLP <- bClick$Count[which(bClick$Source %in% 'BT1')] -
pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT1']
nBT2_TLP <- bClick$Count[which(bClick$Source %in% 'BT2')] -
pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT2']
nBT3_TLP <- bClick$Count[which(bClick$Source %in% 'BT3')] -
pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT3']
nGIB_GLP <- bClick$Count[which(bClick$Source %in% 'GIB_LP')] -
pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'Mach_mit']
### --- Banner (-> Landing Page) -> Registration Page -> Exit paths --- ###
### --- N(Registration Page Views) - N(User Registrations)
bUserReg <- userReg %>%
group_by(event_campaign) %>%
summarise(Count = n())
bUserReg$event_campaign <- toupper(gsub("wmde_abc2017_", "", bUserReg$event_campaign, fixed = T))
colnames(bUserReg)[1] <- 'Banner'
nBT1_TLP_RP <- pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT1'] -
bUserReg$Count[bUserReg$Banner %in% 'BT1']
nBT2_TLP_RP <- pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT2'] -
bUserReg$Count[bUserReg$Banner %in% 'BT2']
nBT3_TLP_RP <- pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT3'] -
bUserReg$Count[bUserReg$Banner %in% 'BT3']
nGIB_GLP_RP <- pageSource$n[pageSource$Page %in% 'Mach_mit' & pageSource$Source %in% 'GIB_LP_click'] -
bUserReg$Count[bUserReg$Banner %in% 'GIB_LP']
nGIB_RP <- pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'GIB_RG_click'] -
bUserReg$Count[bUserReg$Banner %in% 'GIB_RG']
### --- Banner (-> Landing Page) -> Registration Page -> Registration -> Exit --- ###
### --- N(User Registrations) - N(Edited) - N(Completed GT and Not Edited)
userRegGT <- left_join(userReg, gTourData,
by = 'event_userId')
userRegGT <- left_join(userRegGT, editData,
by = c('event_userId' = 'rev_user'))
nBT1_TLP_RP_Reg <- bUserReg$Count[bUserReg$Banner %in% 'BT1'] -
sum((userRegGT$event_campaign %in% 'wmde_abc2017_bt1' & is.na(userRegGT$event_tour)) |
(userRegGT$event_campaign %in% 'wmde_abc2017_bt1' & !is.na(userRegGT$event_tour) & !is.na(userRegGT$edits)))
nBT2_TLP_RP_Reg <- bUserReg$Count[bUserReg$Banner %in% 'BT2'] -
sum((userRegGT$event_campaign %in% 'wmde_abc2017_bt2' & is.na(userRegGT$event_tour)) |
(userRegGT$event_campaign %in% 'wmde_abc2017_bt2' & !is.na(userRegGT$event_tour) & !is.na(userRegGT$edits)))
nBT3_TLP_RP_Reg <- bUserReg$Count[bUserReg$Banner %in% 'BT3'] -
sum((userRegGT$event_campaign %in% 'wmde_abc2017_bt3' & is.na(userRegGT$event_tour)) |
(userRegGT$event_campaign %in% 'wmde_abc2017_bt3' & !is.na(userRegGT$event_tour) & !is.na(userRegGT$edits)))
nGIB_GLP_RP_Reg <- bUserReg$Count[bUserReg$Banner %in% 'GIB_LP'] -
sum((userRegGT$event_campaign %in% 'wmde_abc2017_gib_lp' & is.na(userRegGT$event_tour)) |
(userRegGT$event_campaign %in% 'wmde_abc2017_gib_lp' & !is.na(userRegGT$event_tour) & !is.na(userRegGT$edits)))
nGIB_RP_Reg <- bUserReg$Count[bUserReg$Banner %in% 'GIB_RG'] -
sum((userRegGT$event_campaign %in% 'wmde_abc2017_gib_rg' & is.na(userRegGT$event_tour)) |
(userRegGT$event_campaign %in% 'wmde_abc2017_gib_rg' & !is.na(userRegGT$event_tour) & !is.na(userRegGT$edits)))
### --- Banner (-> Landing Page) -> Registration Page -> Registration -> GT -> Exit --- ###
### --- N(User Registrations) - N(Edited) - N(Completed GT and Not Edited)
nBT1_TLP_RP_Reg_GT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_bt1') &
is.na(userRegGT$event_tour) &
is.na(userRegGT$edits)])
nBT2_TLP_RP_Reg_GT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_bt2') &
is.na(userRegGT$event_tour) &
is.na(userRegGT$edits)])
nBT3_TLP_RP_Reg_GT <- length(userRegGT$event_userId[(userRegGT$event_campaign.x %in% 'wmde_abc2017_bt3') &
is.na(userRegGT$event_tour) &
is.na(userRegGT$edits)])
nGIB_GLP_RP_Reg_GT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_gib_lp') &
is.na(userRegGT$event_tour) &
is.na(userRegGT$edits)])
nGIB_RP_Reg_GT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_gib_rg') &
is.na(userRegGT$event_tour) &
is.na(userRegGT$edits)])
### --- Banner (-> Landing Page) -> Registration Page -> Registration -> GT -> EDIT --- ###
nBT1_TLP_RP_Reg_GT_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_bt1') &
is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
nBT2_TLP_RP_Reg_GT_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_bt2') &
is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
nBT3_TLP_RP_Reg_GT_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign.x %in% 'wmde_abc2017_bt3') &
is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
nGIB_GLP_RP_Reg_GT_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_gib_lp') &
is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
nGIB_RP_Reg_GT_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_gib_rg') &
is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
### --- Banner (-> Landing Page) -> Registration Page -> Registration -> EDIT --- ###
nBT1_TLP_RP_Reg_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_bt1') &
!is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
nBT2_TLP_RP_Reg_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_bt2') &
!is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
nBT3_TLP_RP_Reg_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_bt3') &
!is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
nGIB_GLP_RP_Reg_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_gib_lp') &
!is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
nGIB_RP_Reg_EDIT <- length(userRegGT$event_userId[(userRegGT$event_campaign %in% 'wmde_abc2017_gib_rg') &
!is.na(userRegGT$event_tour) &
!is.na(userRegGT$edits)])
### --- dataset
mcaData <- data.frame(path = c(deparse(substitute(nBT1)),
deparse(substitute(nBT2)),
deparse(substitute(nBT3)),
deparse(substitute(nGIB)),
deparse(substitute(nBT1_TLP)),
deparse(substitute(nBT2_TLP)),
deparse(substitute(nBT3_TLP)),
deparse(substitute(nGIB_GLP)),
deparse(substitute(nBT1_TLP_RP)),
deparse(substitute(nBT2_TLP_RP)),
deparse(substitute(nBT3_TLP_RP)),
deparse(substitute(nGIB_GLP_RP)),
deparse(substitute(nGIB_RP)),
deparse(substitute(nBT1_TLP_RP_Reg)),
deparse(substitute(nBT2_TLP_RP_Reg)),
deparse(substitute(nBT3_TLP_RP_Reg)),
deparse(substitute(nGIB_GLP_RP_Reg)),
deparse(substitute(nGIB_RP_Reg)),
deparse(substitute(nBT1_TLP_RP_Reg_GT)),
deparse(substitute(nBT2_TLP_RP_Reg_GT)),
deparse(substitute(nBT3_TLP_RP_Reg_GT)),
deparse(substitute(nGIB_GLP_RP_Reg_GT)),
deparse(substitute(nGIB_RP_Reg_GT)),
deparse(substitute(nBT1_TLP_RP_Reg_GT_EDIT)),
deparse(substitute(nBT2_TLP_RP_Reg_GT_EDIT)),
deparse(substitute(nBT3_TLP_RP_Reg_GT_EDIT)),
deparse(substitute(nGIB_GLP_RP_Reg_GT_EDIT)),
deparse(substitute(nGIB_RP_Reg_GT_EDIT)),
deparse(substitute(nBT1_TLP_RP_Reg_EDIT)),
deparse(substitute(nBT2_TLP_RP_Reg_EDIT)),
deparse(substitute(nBT3_TLP_RP_Reg_EDIT)),
deparse(substitute(nGIB_GLP_RP_Reg_EDIT)),
deparse(substitute(nGIB_RP_Reg_EDIT))
),
total_conversions = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
nBT1_TLP_RP_Reg_GT_EDIT, nBT2_TLP_RP_Reg_GT_EDIT, nBT3_TLP_RP_Reg_GT_EDIT,
nGIB_GLP_RP_Reg_GT_EDIT, nGIB_RP_Reg_GT_EDIT, nBT1_TLP_RP_Reg_EDIT,
nBT2_TLP_RP_Reg_EDIT, nBT3_TLP_RP_Reg_EDIT, nGIB_GLP_RP_Reg_EDIT,
nGIB_RP_Reg_EDIT
),
total_null = c(nBT1, nBT2, nBT3, nGIB, nBT1_TLP, nBT2_TLP, nBT3_TLP, nGIB_GLP, nBT1_TLP_RP,
nBT2_TLP_RP, nBT3_TLP_RP, nGIB_GLP_RP, nGIB_RP,
nBT1_TLP_RP_Reg, nBT2_TLP_RP_Reg, nBT3_TLP_RP_Reg, nGIB_GLP_RP_Reg, nGIB_RP_Reg,
nBT1_TLP_RP_Reg_GT, nBT2_TLP_RP_Reg_GT, nBT3_TLP_RP_Reg_GT, nGIB_GLP_RP_Reg_GT,
nGIB_RP_Reg_GT, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
),
stringsAsFactors = F)
# - correct paths:
mcaData$path <- gsub("n", "", mcaData$path, fixed = T)
mcaData$path <- gsub("_", " > ", mcaData$path, fixed = T)
editEnds <- which(grepl("EDIT", mcaData$path, fixed = T))
for (i in 1:length(editEnds)) {
wPath <- which(mcaData$path %in% gsub(" > EDIT", "", mcaData$path[editEnds[i]], fixed = T))
wOut <- which(mcaData$path == mcaData$path[editEnds[i]])
wPath <- setdiff(wPath, wOut)
mcaData$total_conversions[wPath] <- mcaData$total_conversions[wOut]
}
mcaData <- mcaData[-which(grepl("EDIT", mcaData$path, fixed = T)), ]
### --- MCA model
abc2017Model <- markov_model(mcaData,
var_path = "path",
var_conv = "total_conversions",
var_null = "total_null",
order = 4,
nsim = 1e8,
out_more = T)
# - collect removal effects for the next plot:
re4order <- abc2017Model$removal_effects$removal_effects
### --- Removal Effects:
re <- as.data.frame(abc2017Model$removal_effects)
colnames(re) <- c('Channel', 'Removal Effect')
re$Channel <- factor(re$Channel, levels = as.character(abc2017Model$removal_effects$channel_name))
gplot <- ggplot(data = re,
aes(x = Channel,
y = `Removal Effect`,
label = round(`Removal Effect`, 2))
) +
geom_bar(width = .1, color = "darkblue", fill = "white", stat = "identity") +
geom_label(size = 3) +
scale_y_continuous(labels = comma) +
xlab('Campaign Channel') + ylab('Removal Effect') +
ylim(c(0, 1)) +
ggtitle('Campaign Multi-Channel Attribution: Removal Effects') +
theme_minimal() +
theme(plot.title = element_text(size = 10))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
suppressWarnings(print(gplot))
Each node in the following graph represents a particular campaign channel. The edges of the graph are labeled by the respective transition probabilities between the channels. The size of the node corresponds to its removal effect. TECHNICAL NOTE: the removal effects are derived from a Markov model of order 4, while the transitional probabilities are derived directly from the 1st order model.
### --- MCA model: 1st order for channel-to-channel transitions
abc2017Model <- markov_model(mcaData,
var_path = "path",
var_conv = "total_conversions",
var_null = "total_null",
order = 1,
out_more = T)
### --- plot w. {igraph}
abc2017Net <- data.frame(ougoing = abc2017Model$transition_matrix$channel_from,
incoming = abc2017Model$transition_matrix$channel_to,
stringsAsFactors = F)
abc2017Net$ougoing <- sapply(abc2017Net$ougoing, function(x) {
ch <- gsub("(start)", "START", fixed = T, x)
ch <- gsub("(null)", "EXIT", fixed = T, ch)
ch <- gsub("(conversion)", "EDIT", fixed = T, ch)
ch
})
abc2017Net$incoming <- sapply(abc2017Net$incoming, function(x) {
ch <- gsub("(start)", "START", fixed = T, x)
ch <- gsub("(null)", "EXIT", fixed = T, ch)
ch <- gsub("(conversion)", "EDIT", fixed = T, ch)
ch
})
abc2017Net <- graph.data.frame(abc2017Net,
directed = T)
E(abc2017Net)$label <- round(abc2017Model$transition_matrix$transition_probability, 2)
V(abc2017Net)$color <- c('white',
'indianred1', 'indianred2', 'indianred3', 'cadetblue',
'red', 'blue', 'yellow', 'orange', 'green',
'white', 'white')
V(abc2017Net)$size <- c(20, re4order*40, 20, 20)
V(abc2017Net)$frame.color <- 'white'
# - plot w. {igraph}
coords <- layout_(abc2017Net, as_tree())
par(mai=c(rep(0,4)))
plot(abc2017Net,
layout = coords,
edge.width = .75,
edge.color = "grey",
edge.arrow.size = 0.35,
edge.curved = 0.6,
edge.label.family = "sans",
edge.label.color = "black",
edge.label.cex = .6,
vertex.shape = "circle",
vertex.label.color = "black",
vertex.label.font = 1,
vertex.label.family = "sans",
vertex.label.cex = .75,
vertex.label.dist = .25,
vertex.label.dist = .45,
rescale = F,
xlim = c(-1, 1),
ylim = c(0, 4),
margin = c(rep(0,4)))
TECHNICAL NOTE: a Markov model of order 4 was used, with 1e8
total simulation runs from the transition matrix.
### --- Banner -> Exit paths --- ###
### --- Definition: N(Banner Impressions) - N(BannerClicks == Landing Page Views)
# - define: N(Banner Impressions)
bImp <- banImpSet %>%
group_by(Banner) %>%
summarise(Count = sum(Count))
nBT1 <- bImp$Count[which(bImp$Banner %in% 'BT1')]
nBT2 <- bImp$Count[which(bImp$Banner %in% 'BT2')]
nBT3 <- bImp$Count[which(bImp$Banner %in% 'BT3')]
nGIB <- bImp$Count[which(bImp$Banner %in% 'GIB_LP')] + bImp$Count[which(bImp$Banner %in% 'GIB_RG')]
# - define: N(BannerClicks/PageViews)
bClick <- clickPlotSet %>%
group_by(Source) %>%
summarise(Count = sum(Count))
bClick$Source <- gsub("_click", "", bClick$Source, fixed = T)
# - define: N(BannerImpressions) - N(BannerClicks/PageViews)
nBT1 <- nBT1 - bClick$Count[which(bClick$Source %in% 'BT1')]
nBT2 <- nBT2 - bClick$Count[which(bClick$Source %in% 'BT2')]
nBT3 <- nBT3 - bClick$Count[which(bClick$Source %in% 'BT3')]
nGIB <- nGIB - bClick$Count[which(bClick$Source %in% 'GIB_LP')] + bClick$Count[which(bClick$Source %in% 'GIB_RG')]
### --- Banner -> Landing Page -> Exit paths --- ###
### --- N(Banner Clicks == Landing Page Views) - N(Registration Page Views)
### --- NOTE: TLP == Task Landing Page (JetztMitmachen), GLP == General Landing Page (Mach mit)
nBT1_TLP <- bClick$Count[which(bClick$Source %in% 'BT1')] -
pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT1']
nBT2_TLP <- bClick$Count[which(bClick$Source %in% 'BT2')] -
pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT2']
nBT3_TLP <- bClick$Count[which(bClick$Source %in% 'BT3')] -
pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT3']
nGIB_GLP <- bClick$Count[which(bClick$Source %in% 'GIB_LP')] -
pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'Mach_mit']
### --- Banner (-> Landing Page) -> Registration Page -> Exit paths --- ###
### --- N(Registration Page Views) - N(User Registrations)
bUserReg <- userReg %>%
group_by(event_campaign) %>%
summarise(Count = n())
bUserReg$event_campaign <- toupper(gsub("wmde_abc2017_", "", bUserReg$event_campaign, fixed = T))
colnames(bUserReg)[1] <- 'Banner'
nBT1_TLP_RP <- pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT1'] -
bUserReg$Count[bUserReg$Banner %in% 'BT1']
nBT2_TLP_RP <- pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT2'] -
bUserReg$Count[bUserReg$Banner %in% 'BT2']
nBT3_TLP_RP <- pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'JetztMitmachen_BT3'] -
bUserReg$Count[bUserReg$Banner %in% 'BT3']
nGIB_GLP_RP <- pageSource$n[pageSource$Page %in% 'Mach_mit' & pageSource$Source %in% 'GIB_LP_click'] -
bUserReg$Count[bUserReg$Banner %in% 'GIB_LP']
nGIB_RP <- pageSource$n[pageSource$Page %in% 'Spezial:Benutzerkonto_anlegen' & pageSource$Source %in% 'GIB_RG_click'] -
bUserReg$Count[bUserReg$Banner %in% 'GIB_RG']
### --- Banner (-> Landing Page) -> Registration Page -> Registration
nBT1_TLP_RP_Reg <- regData$Registrations[which(regData$Campaign %in% 'BT1')]
nBT2_TLP_RP_Reg <- regData$Registrations[which(regData$Campaign %in% 'BT2')]
nBT3_TLP_RP_Reg <- regData$Registrations[which(regData$Campaign %in% 'BT3')]
nGIB_GLP_RP_Reg <- regData$Registrations[which(regData$Campaign %in% 'GIB_LP')]
nGIB_RP_Reg <- regData$Registrations[which(regData$Campaign %in% 'GIB_RG')]
### --- dataset
mcaData <- data.frame(path = c(deparse(substitute(nBT1)),
deparse(substitute(nBT2)),
deparse(substitute(nBT3)),
deparse(substitute(nGIB)),
deparse(substitute(nBT1_TLP)),
deparse(substitute(nBT2_TLP)),
deparse(substitute(nBT3_TLP)),
deparse(substitute(nGIB_GLP)),
deparse(substitute(nBT1_TLP_RP)),
deparse(substitute(nBT2_TLP_RP)),
deparse(substitute(nBT3_TLP_RP)),
deparse(substitute(nGIB_GLP_RP)),
deparse(substitute(nGIB_RP)),
deparse(substitute(nBT1_TLP_RP_Reg)),
deparse(substitute(nBT2_TLP_RP_Reg)),
deparse(substitute(nBT3_TLP_RP_Reg)),
deparse(substitute(nGIB_GLP_RP_Reg)),
deparse(substitute(nGIB_RP_Reg))
),
total_conversions = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
nBT1_TLP_RP_Reg, nBT2_TLP_RP_Reg, nBT1_TLP_RP_Reg, nGIB_GLP_RP_Reg, nGIB_RP_Reg
),
total_null = c(nBT1, nBT2, nBT3, nGIB, nBT1_TLP, nBT2_TLP, nBT3_TLP, nGIB_GLP, nBT1_TLP_RP,
nBT2_TLP_RP, nBT3_TLP_RP, nGIB_GLP_RP, nGIB_RP,
0, 0, 0, 0, 0
),
stringsAsFactors = F)
# - correct paths:
mcaData$path <- gsub("n", "", mcaData$path, fixed = T)
mcaData$path <- gsub("_", " > ", mcaData$path, fixed = T)
editEnds <- which(grepl("Reg", mcaData$path, fixed = T))
for (i in 1:length(editEnds)) {
wPath <- which(mcaData$path %in% gsub(" > Reg", "", mcaData$path[editEnds[i]], fixed = T))
wOut <- which(mcaData$path == mcaData$path[editEnds[i]])
wPath <- setdiff(wPath, wOut)
mcaData$total_conversions[wPath] <- mcaData$total_conversions[wOut]
}
mcaData <- mcaData[-which(grepl("Reg", mcaData$path, fixed = T)), ]
### --- MCA model
abc2017Model <- markov_model(mcaData,
var_path = "path",
var_conv = "total_conversions",
var_null = "total_null",
order = 4,
nsim = 1e8,
out_more = T)
# - collect removal effects for the next plot:
re4order <- abc2017Model$removal_effects$removal_effects
### --- Removal Effects:
re <- as.data.frame(abc2017Model$removal_effects)
colnames(re) <- c('Channel', 'Removal Effect')
re$Channel <- factor(re$Channel, levels = as.character(abc2017Model$removal_effects$channel_name))
gplot <- ggplot(data = re,
aes(x = Channel,
y = `Removal Effect`,
label = round(`Removal Effect`, 2))
) +
geom_bar(width = .1, color = "darkblue", fill = "white", stat = "identity") +
geom_label(size = 3) +
scale_y_continuous(labels = comma) +
xlab('Campaign Channel') + ylab('Removal Effect') +
ylim(c(0, 1)) +
ggtitle('Campaign Multi-Channel Attribution: Removal Effects') +
theme_minimal() +
theme(plot.title = element_text(size = 10))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
suppressWarnings(print(gplot))
Each node in the following graph represents a particular campaign channel. The edges of the graph are labeled by the respective transition probabilities between the channels. The size of the node corresponds to its removal effect. TECHNICAL NOTE: the removal effects are derived from a Markov model of order 4, while the transitional probabilities are derived directly from the 1st order model.
### --- MCA model: 1st order for channel-to-channel transitions
abc2017Model <- markov_model(mcaData,
var_path = "path",
var_conv = "total_conversions",
var_null = "total_null",
order = 1,
out_more = T)
### --- plot w. {igraph}
abc2017Net <- data.frame(ougoing = abc2017Model$transition_matrix$channel_from,
incoming = abc2017Model$transition_matrix$channel_to,
stringsAsFactors = F)
abc2017Net$ougoing <- sapply(abc2017Net$ougoing, function(x) {
ch <- gsub("(start)", "START", fixed = T, x)
ch <- gsub("(null)", "EXIT", fixed = T, ch)
ch <- gsub("(conversion)", "REGISTRATION", fixed = T, ch)
ch
})
abc2017Net$incoming <- sapply(abc2017Net$incoming, function(x) {
ch <- gsub("(start)", "START", fixed = T, x)
ch <- gsub("(null)", "EXIT", fixed = T, ch)
ch <- gsub("(conversion)", "REGISTRATION", fixed = T, ch)
ch
})
abc2017Net <- graph.data.frame(abc2017Net,
directed = T)
E(abc2017Net)$label <- round(abc2017Model$transition_matrix$transition_probability, 2)
V(abc2017Net)$color <- c('white',
'indianred1', 'indianred2', 'indianred3', 'cadetblue',
'red', 'blue', 'yellow',
'white', 'white')
V(abc2017Net)$size <- c(20, re4order*40, 20, 20)
V(abc2017Net)$frame.color <- 'white'
# - plot w. {igraph}
coords <- layout_(abc2017Net, as_tree())
par(mai=c(rep(0,4)))
plot(abc2017Net,
layout = coords,
edge.width = .75,
edge.color = "grey",
edge.arrow.size = 0.35,
edge.curved = 0.6,
edge.label.family = "sans",
edge.label.color = "black",
edge.label.cex = .6,
vertex.shape = "circle",
vertex.label.color = "black",
vertex.label.font = 1,
vertex.label.family = "sans",
vertex.label.cex = .75,
vertex.label.dist = .25,
vertex.label.dist = .45,
rescale = F,
xlim = c(-1, 1),
ylim = c(0, 4),
margin = c(rep(0,4)))
The landing page for specific tasks (JetztMitmachen
, the TLP
channel in the graph) and the GIB
campaign are essentially no different in respect to how much they influence user registration. We have learned from the A/B tests that no individual BT
(i.e. specific task) banner compares to the performance of GIB_RG
which leads directly to the registraion page. However, when considered together, the banners leading to the JetztMitmachen
have a performance comparable to GIB_RG
. The General Invitation landing page Mach_mit
lacks such an effect.
ASSUMPTIONS as stated in the Campaign KickOff Presentation:
Assumption 1: More users register when given a clear and low level entry task. RESULTS: When comparing individual banner campaigns, A/B testing shows that more users register via the General Invitation Banner campaign, especially when given no intermediate landing page prior to the registration page. However, the JetztMitmachen
campagin in general has a performance comparable to the GIB
campaign, while the JetztMitmachen
page was certainly more important for user registration than the general Mach_mit
page - as we have learned from the campaign Multi-Channel Attribution model.
Assumption 2: A landing page with more information before registration is necessary. RESULTS: A/B testing shows that more users register via GIB_RG
banner campaign that leads directly to the registration page than via the GIB_LP
banner campaign that has an intermediate landing page. However, Assumption2
is supported by a high removal effect of the JetztMitmachen
page.
Assumption 3: A general invitation has a lower conversion rate than specific invitations to register. RESULTS: The total number of registered users via the JetztMitmachen
campagin (BT1
, BT2
, and BT3
banner campaigns taken together) is 535, while the total number of users registered via the General Invitation campaign is 519, an almost 50-50 split.
NOTE: All these assumptions are valid if we consider the criterion of making an edit at all instead.
SUGGESTIONS
Suggestion No. 1. Remove the GIB_RG
banner campaign from future campaigns. It drives almost 90% of the traffic towards the registration page while being the least efficient in terms of influencing new user edits at the same time (NOTE: least efficient in terms of the expected number of user edits, not in terms of making any edits at all). That would probably mean that dewiki
would acquire less new users during the campaign, but again the goal is probably for it to acquire new editors. Or, even better, take a look at my Suggestion No. 2
.
Suggestion No. 2. Think about the possibility to integrate the campaign content (e.g. what is on the landing pages now) to the registration page directly. Ratio: the GIB_RG
banner campaign has no intermediate landing page between banner presentation and registration, leading to the highest number of registered new users; on the other hand, those banner campaigns that instantiate a specific task lead to having more user edits on the average than it (in general; this is not valid for BT2
). Maybe integrating the campaign content with the registration page can provide a more powerful combination that would affect positively both registration and future editing.
Suggestion No. 3. Remove the Guided Tour
from our future campaigns; the analysis of its causal power suggests that it has a negative influence towards making at least one edit on behalf of a newly registered user.
This section provides several insights that were sought on the behalf of the campaign management team following the end of the Autumn Banner Campaign 2017.
NOTE: the following Data Acquisition code chunk is not fully reproducible from this Report. The data are collected by running the script abc2017_PROD_RevertedEdits.R on stat1005.eqiad.wmnet, collecting the data as .tsv files, copying manually, and processing locally. Run from stat1005 stat box by executing Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_RevertedEdits.R
.
### --- Script: abc2017_PROD_OverallDailyUpdate.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_RevertedEdits.R
### --- The script collects and wrangles a dataset for ABC 2017 post-campaign analytics
### --- WMDE Autumn Banner Campaign 2017.
### --- Goran S. Milovanovic, Data Scientist, WMDE
### --- November 06, 2017.
### -----------------------------------------------------------------------------
### 0. Setup
### -----------------------------------------------------------------------------
rm(list = ls())
library(dplyr)
# - get user registration data: abc2017_userRegistrations.tsv
# - then get user IDs from registered:
setwd('/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017_DailyUpdate/')
lF <- list.files()
lF <- lF[grepl('userRegistrations', lF, fixed = T)]
userReg <- read.table(lF,
quote = "",
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
userReg <- userReg %>%
dplyr::select(event_userId, event_isSelfMade) %>%
filter(event_isSelfMade == 1)
# - uids:
uid <- userReg$event_userId
# - sql query
sqlQuery <- paste('SELECT rev_user, rev_id, rev_page, rev_timestamp, rev_sha1, rev_content_model, rev_content_format FROM revision WHERE rev_user IN (',
paste(uid, collapse = ", "),
') AND (rev_timestamp >= 20171004220000) AND (rev_timestamp <= 20171014220000);',
sep = "")
mySqlCommand <- paste('mysql -h analytics-store.eqiad.wmnet dewiki -e ',
paste('"', sqlQuery, '" > ', sep = ""),
'/home/goransm/_miscWMDE/abc2017_DataOUT/abc2017_OfficialDatasets/abc2017_DailyUpdate/abc2017_completeUserRevisions.tsv', sep = "")
system(command = mySqlCommand,
wait = TRUE)
Analyse reverted edits locally:
userRevisions <- read.table('./_dailyUpdateDATA/abc2017_completeUserRevisions.tsv',
quote = "",
sep = "\t",
header = T,
check.names = F,
stringsAsFactors = F)
userRevisions <- left_join(userRevisions,
userReg,
by = c("rev_user" = "event_userId"))
userRevisions <- userRevisions %>%
filter(!is.na(event_campaign))
# - keep only those users who made any edits at all:
userRevisions <- userRevisions %>%
filter(rev_user %in% editData$rev_user)
# - Note: UTC times, conversion to CET is not necessary here
userRevisions$rev_timestamp <- as.character(userRevisions$rev_timestamp)
revertsPerUser <- lapply(unique(userRevisions$rev_user), function(x) {
dataset <- dplyr::arrange(userRevisions[userRevisions$rev_user == x, ], rev_timestamp)
return(data.frame(userId = x,
revCount = sum(table(dataset$rev_sha1) - 1),
stringsAsFactors = F))
})
revertsPerUser <- rbindlist(revertsPerUser)
print(paste(sum(revertsPerUser$revCount), " edits were reverted.", sep = ""))
[1] "4 edits were reverted."
The percent of registered users who did and did not complete the Guided Tour, per banner campaign:
uRGT <- userRegGT %>%
group_by(event_campaign) %>%
summarise(Complete = sum(is.na(event_tour)), `Incomplete` = sum(!is.na(event_tour)))
uRGT$event_campaign <- toupper(gsub("wmde_abc2017_", "", uRGT$event_campaign, fixed = T))
uRGT$`% Complete` = round(uRGT$Complete/(uRGT$Complete + uRGT$Incomplete)*100, 2)
uRGT$`% Incomplete` = round(uRGT$Incomplete/(uRGT$Complete + uRGT$Incomplete)*100, 2)
uRGT$Complete <- NULL
uRGT$Incomplete <- NULL
colnames(uRGT)[1] <- 'Campaign'
knitr::kable(uRGT, format = "html") %>%
kable_styling(full_width = F, position = "left")
Campaign | % Complete | % Incomplete |
---|---|---|
BT1 | 52.76 | 47.24 |
BT2 | 63.83 | 36.17 |
BT3 | 68.25 | 31.75 |
GIB_LP | 58.72 | 41.28 |
GIB_RG | 67.15 | 32.85 |
The number of users exiting the Guided Tour at a particular step, per banner campaign.
gtexitPlot <- userRegGT %>%
filter(!is.na(event_tour)) %>%
dplyr::select(event_campaign, event_step) %>%
group_by(event_campaign, event_step) %>%
summarise(Count = n())
colnames(gtexitPlot) <- c('Campaign', 'Exit Point', 'Count')
gtexitPlot$Campaign <- toupper(gsub("wmde_abc2017_", "", gtexitPlot$Campaign, fixed = T))
ggplot(gtexitPlot, aes(x = '', y = Count,
fill = `Exit Point`,
color = `Exit Point`,
group = `Exit Point`,
label = Count)) +
geom_bar(position = "dodge",
stat = "identity",
width = 1,
color = "black") +
facet_wrap(~ Campaign) +
ggtitle('Autumn Banner Campaign 2017: \nPoint of Guided Tour Exit per Registration Campaign') +
xlab("") + ylab("Num. Users") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())