This section presents all data and statistics on the campaign relevant banners.
1.1 Banner Impressions
NOTE: the following code is not fully reproducible from this Report. The data are collected by running the script abc2017_PROD_BannerImpressions.R
on stat1005.eqiad.wmnet, collecting the data as .tsv
files, copying manually, and processing locally.
### 1. 1 Banner Impressions
### --- Script: abc2017_PROD_BannerImpressions.R
### --- the following runs from stat1005.eqiad.wmnet
### --- Campaign Banner Tags:
# - (1) ?campaign=wmde_abc2017_bt1 - banner for Specific Task 1;
# - (2) ?campaign=wmde_abc2017_bt2 - banner for Specific Task 2;
# - (3) ?campaign=wmde_abc2017_bt3 - banner for Specific Task 3;
# - (4) ?campaign=wmde_abc2017_gib_lp - banner for the General Invitation
# - which leads to the Landing Page upon click;
# - (5) ?campaign=wmde_abc2017_gib_rg - banner for the General Invitation
# which leads directly to Registration upon click.
### --- from wmf.webrequest, run the following script on stat 1005
### --- to collect banner impressions
### --- Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_BannerImpressions.R
### --- HiveQL for everything with ?campaign like "wmde_abc2017"
### --- and then look for the desired tags.
rm(list = ls())
### --- loop over date range, create query, fetch, and store
startDate <- '2017-09-20'
endDate <- '2017-09-22'
dateRange <- seq(from = as.Date(startDate),
to = as.Date(endDate),
by = 'day')
dateRange <- as.character(dateRange)
# - set outDir
outDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc_test/'
# - store query dir:
qDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc_test/'
setwd(qDir)
# - set HiveQL query dir:
for (i in 1:length(dateRange)) {
# - construct HiveQL query:
y <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][1])
m <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][2])
d <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][3])
q <- paste(
"USE wmf;
SELECT uri_query FROM webrequest
WHERE uri_host = 'de.wikipedia.org'
AND uri_path = '/beacon/impression'
AND year = ", y,
" AND month = ", m,
" AND day = ", d,
sep = "")
# - write hql
write(q, 'abc2017_BannerImpressions.hql')
# - prepare output file:
fileName <- "abc2017_BannerImpressions_"
fileName <- paste0(fileName, dateRange[i], ".tsv")
fileName <- paste0(outDir, "/", fileName)
# - execute hql script:
hiveArgs <-
'beeline -f'
hiveInput <- paste0('abc2017_BannerImpressions.hql > ',
fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
}
The remainder of the code analyses the data from the wmf.webrequest
table and is run locally:
### --- extract only campaign relevant data
# - campaign banners:
BT1 <- 'wmde_abc2017_bt1'
BT2 <- 'wmde_abc2017_bt2'
BT3 <- 'wmde_abc2017_bt3'
GIP_RG <- 'wmde_abc2017_gib_rg'
GIP_LP <- 'wmde_abc2017_gib_lp'
# - count non-empty files:
c <- 0
lF <- list.files(path = "./bannerImpressionsDATA/")
lF <- lF[grepl('.tsv', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
dS <- readLines(paste("./bannerImpressionsDATA/",lF[i], sep = ""), n = -1)
dS <- dS[8:length(dS)]
wC <- unname(sapply(dS, function(x) {
grepl(paste(c(BT1, BT2, BT3, GIP_RG, GIP_LP), sep = "", collapse = "|"), x)
}))
if (length(which(wC) > 0)) {
c <- c + 1
dS <- dS[wC]
rm(wC); gc()
dS <- data.frame(query = dS, stringsAsFactors = F)
dS$date <- strsplit(
strsplit(lF[i], split = "_", fixed = T)[[1]][4],
split = ".",
fixed = T)[[1]][1]
dataSet[[c]] <- dS
}
rm(dS); gc()
}
dataSet <- rbindlist(dataSet)
dataSet$impressRate <- sapply(dataSet$query, function(x) {
as.numeric(
str_extract(
str_extract(x, pattern = "recordImpressionSampleRate=([[:digit:]]|\\.)+"),
pattern = "([[:digit:]]|\\.)+")
)
})
### --- Count daily banner impressions:
knitr::kable(bannerImp,
format = "html") %>%
kable_styling(full_width = F,
position = "left")
bannerImp <- gather(bannerImp,
key = Banner,
value = Impressions,
`Banner A`:`Banner B`)
# - Visualize w. {ggplot2}
chartCols <- brewer.pal(8, 'Set2')[c(1, 2, 3, 5, 6)]
ggplot(bannerImp, aes(x = Date,
y = Impressions,
group = Banner,
color = Banner,
fill = Banner,
label = Impressions)) +
geom_bar(stat = "identity",
position = "dodge",
width = .3) +
geom_label(aes(fill = Banner),
colour = "white",
fontface = "bold",
position = position_dodge(width = 1),
size = 2,
show.legend = FALSE) +
scale_fill_manual("legend", values = c("Banner A" = "orange", "Banner B" = "firebrick")) +
scale_color_manual("legend", values = c("Banner A" = "orange", "Banner B" = "firebrick")) +
ggtitle('Autumn Banner Campaign 2017: Banner Impressions\nSource: webrequest, hdfs') +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank())
1.2 Banner Clicks and Landing Page Views
NOTE: the following code is not fully reproducible from this Report. The data are collected by running the script abc2017_PROD_Clicks.R
on stat1005.eqiad.wmnet, collecting the data as .tsv
files, copying manually, and processing locally.
### 1.2 Banner Clicks and Landing Pages
### --- Script: abc2017_PROD_Clicks.R
### --- the following runs on stat1005.eqiad.wmnet
### --- Rscript /home/goransm/RScripts/abc2017/abc2017_PROD_Clicks.R
### --- campaign URL
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Entdeckungen-sortieren?campaign=wmde2017summer1&wmdesource=bannerclick
### --- Campaign Banner Tags:
# - (1) ?campaign=wmde_abc2017_bt1 - banner for Specific Task 1;
# - (2) ?campaign=wmde_abc2017_bt2 - banner for Specific Task 2;
# - (3) ?campaign=wmde_abc2017_bt3 - banner for Specific Task 3;
# - (4) ?campaign=wmde_abc2017_gib_lp - banner for the General Invitation
# - which leads to the Landing Page upon click;
# - (5) ?campaign=wmde_abc2017_gib_rg - banner for the General Invitation
# which leads directly to Registration upon click.
### --- Landing/Registration pages:
# - Landing Page, Specific Tasks, Banner bt1:
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren?campaign=wmde_abc2017_bt1
# - Landing Page, Specific Tasks, Banner bt2:
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren?campaign=wmde_abc2017_bt2
# - Landing Page, Specific Tasks, Banner bt3:
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren?campaign=wmde_abc2017_bt3
# - Landing Page, General, Banner gib_lp
# - https://de.wikipedia.org/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit?campaign=wmde_abc2017_gib_lp
# - Registration Page, banner gib_rg
# - https://de.wikipedia.org/wiki/Spezial:Benutzerkonto_anlegen?campaign=wmde_abc2017_gib_rg
rm(list = ls())
# campaignQuery <- c('campaign=wmde_abc2017_bt1&wmdesource=bannerclick',
# 'campaign=wmde_abc2017_bt2&wmdesource=bannerclick',
# 'campaign=wmde_abc2017_bt3&wmdesource=bannerclick',
# 'campaign=wmde_abc2017_gib_lp&wmdesource=bannerclick',
# 'campaign=wmde_abc2017_gib_rg&wmdesource=bannerclick')
### --- loop over date range, create query, fetch, and store
startDate <- '2017-09-20'
endDate <- '2017-09-22'
dateRange <- seq(from = as.Date(startDate),
to = as.Date(endDate),
by = 'day')
dateRange <- as.character(dateRange)
# - set outDir
outDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc_test/'
# - store query dir:
qDir <- '/home/goransm/_miscWMDE/abc2017_DataOUT/abc_test/'
setwd(qDir)
# - set HiveQL query dir:
for (i in 1:length(dateRange)) {
# - construct HiveQL query:
y <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][1])
m <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][2])
d <- as.numeric(strsplit(dateRange[i], split = "-")[[1]][3])
q <- paste(
"USE wmf;
SELECT uri_path, uri_query, referer FROM webrequest
WHERE (uri_host = 'de.wikipedia.org'
AND (uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren' OR uri_path = '/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit' OR uri_path = '/wiki/Spezial:Benutzerkonto_anlegen')
AND year = ", y,
" AND month = ", m,
" AND day = ", d, ");",
sep = "")
# - write hql
write(q, 'abc2017_BannerClicks.hql')
# - prepare output file:
fileName <- "abc2017_BannerClicks_"
fileName <- paste0(fileName, dateRange[i], ".tsv")
fileName <- paste0(outDir, "/", fileName)
# - execute hql script:
hiveArgs <-
'beeline -f'
hiveInput <- paste0('abc2017_BannerClicks.hql > ',
fileName)
# - command:
hiveCommand <- paste(hiveArgs, hiveInput)
system(command = hiveCommand, wait = TRUE)
}
The following code chunk: Dataset preparation.
### --- Landing pages:
specTaskPage <- '/wiki/Wikipedia:Wikimedia_Deutschland/Fehler_korrigieren'
genInvPage <- '/wiki/Wikipedia:Wikimedia_Deutschland/Mach_mit'
regPage <- '/wiki/Spezial:Benutzerkonto_anlegen'
### --- Banner tags:
specTaskBanner1 <- '?campaign=wmde_abc2017_bt1'
specTaskBanner2 <- '?campaign=wmde_abc2017_bt2'
specTaskBanner3 <- '?campaign=wmde_abc2017_bt3'
genInvPage_rg <- '?campaign=wmde_abc2017_gib_rg'
genInvPage_lp <- '?campaign=wmde_abc2017_gib_lp'
### --- Chart Element Colors:
# chartCols <- c('chocolate1', 'chocolate2', 'chocolate3',
# 'brown4',
# )
# names(chartCols) <- c('BT1_click', 'BT2_click', 'BT3_click',
# 'Fehler_korrigieren',
# 'GIP_LP_click', 'GIP_RG_click',
# 'Other', 'Unknown')
### --- Dataset:
# - count non-empty files:
c <- 0
lF <- list.files(path = "./bannerClicksDATA/")
lF <- lF[grepl('.tsv', lF, fixed = T)]
dataSet <- list()
for (i in 1:length(lF)) {
dS <- readLines(paste("./bannerClicksDATA/",lF[i], sep = ""), n = -1)
dS <- dS[8:(length(dS)-2)]
if (length(dS > 0)) {
c <- c + 1
dS <- lapply(dS, function(x) {
dat <- strsplit(x, split = "\t", fixed = T)[[1]]
data.frame(page = dat[1], banner = dat[2], refer = dat[3], stringsAsFactors = F)
})
}
dS <- rbindlist(dS)
dS$date <- strsplit(
strsplit(lF[i], split = "_", fixed = T)[[1]][3],
split = ".",
fixed = T)[[1]][1]
dataSet[[c]] <- dS
rm(dS); gc()
}
dataSet <- rbindlist(dataSet)
# - replace values:
dataSet$page <- sapply(dataSet$page, function(x) {
strsplit(x, split = "/", fixed = T)[[1]][length(strsplit(x, split = "/", fixed = T)[[1]])]
})
dataSet$banner[which(dataSet$banner %in% specTaskBanner1)] <- 'BT1'
dataSet$banner[which(dataSet$banner %in% specTaskBanner2)] <- 'BT2'
dataSet$banner[which(dataSet$banner %in% specTaskBanner3)] <- 'BT3'
dataSet$banner[which(dataSet$banner %in% genInvPage_rg)] <- 'GIP_RG'
dataSet$banner[which(dataSet$banner %in% genInvPage_lp)] <- 'GIP_LP'
dataSet$banner <- paste(dataSet$banner, "_click", sep = "")
dataSet$banner[which(dataSet$banner %in% '_click')] <- 'Other'
colnames(dataSet) <- c('Page', 'Source', 'Referer', 'Date')
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Source == 'Other'] <-
str_extract(dataSet$Referer[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Source == 'Other'],
"campaign=wmde_abc(.)+$")
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_bt", dataSet$Source)] <- "Fehler_korrigieren"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_gib_rg", dataSet$Source)] <- "GIP_RG_click"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & grepl("wmde_abc2017_gib_lp", dataSet$Source)] <- "Mach_mit"
dataSet$Source[dataSet$Page %in% 'Spezial:Benutzerkonto_anlegen' & dataSet$Referer %in% '-'] <- "Unknown"
dataSet$Source[dataSet$Page %in% 'Mach_mit' & dataSet$Referer %in% '-'] <- "Unknown"
dataSet$Source[is.na(dataSet$Source)] <- 'Other'
dataSet$Referer <- NULL
1.2.1 Landing Pages: Referers
The following charts represents the breakdown of referers (i.e. sources) for the campaign pages: one registration page, and two landing pages.
### --- Banner clicks and Landing Page Views
# - Table Report
tableSet <- dataSet %>%
dplyr::group_by(Page, Source, Date) %>%
dplyr::summarise(Count = n()) %>%
dplyr::arrange(Date, Page, Source)
# - Chart
chartCols <- brewer.pal(8, 'Set2')
ggplot(tableSet, aes(x = Page,
y = Count,
group = Source,
color = Source,
fill = Source,
label = Count)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5) +
scale_fill_manual("legend", values = chartCols) +
scale_color_manual("legend", values = chartCols) +
ggtitle('Autumn Banner Campaign 2017:\nBreakdown of Landing Page Views sources') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
1.2.2 Landing Pages: Referer Breakdown
The following three pie charts present a breakdown of referers (i.e. sources) for the Campaign pages (two landing pages and one registration page.
### --- Page Views: Sources
# - Spezial:Benutzerkonto_anlegen
pageSource <- dataSet %>%
count(Page, Source) %>%
group_by(Page) %>%
mutate(Percent = n/sum(n))
pageSource$Percent <- paste(round(pageSource$Percent*100, 2), "%", sep = "")
pageSourcePlot <- filter(pageSource, Page %in% 'Spezial:Benutzerkonto_anlegen')
ggplot(pageSourcePlot, aes(x = '',
y = n,
color = Source,
fill = Source,
label = Percent)) +
geom_bar(aes(x = '',
y = n,
color = Source,
fill = Source),
stat = "identity",
width = 1) +
coord_polar("y", start = 0) +
geom_text(aes(x = 1),
colour = "white",
fontface = "bold",
position = position_stack(vjust = 0.5),
size = 3,
show.legend = F) +
scale_fill_manual("legend", values = chartCols) +
scale_color_manual("legend", values = chartCols) +
ggtitle('Autumn Banner Campaign 2017:\nPage Views Sources for Spezial:Benutzerkonto_anlegen') +
xlab("Outter = Count") + ylab("") +
theme_minimal() +
# theme(axis.text.x = element_blank()) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.y = element_blank()) +
theme(panel.grid.minor.y = element_blank())
# - Fehler_korrigieren
pageSource <- dataSet %>%
count(Page, Source) %>%
group_by(Page) %>%
mutate(Percent = n/sum(n))
pageSource$Percent <- paste(round(pageSource$Percent*100, 2), "%", sep = "")
pageSourcePlot <- filter(pageSource, Page %in% 'Fehler_korrigieren')
ggplot(pageSourcePlot, aes(x = '',
y = n,
color = Source,
fill = Source,
label = Percent)) +
geom_bar(aes(x = '',
y = n,
color = Source,
fill = Source),
stat = "identity",
width = 1) +
coord_polar("y", start = 0) +
geom_text(aes(x = 1),
colour = "white",
fontface = "bold",
position = position_stack(vjust = 0.5),
size = 3,
show.legend = F) +
scale_fill_manual("legend", values = chartCols) +
scale_color_manual("legend", values = chartCols) +
ggtitle('Autumn Banner Campaign 2017:\nPage Views Sources for Fehler_korrigieren') +
xlab("Outter = Count") + ylab("") +
theme_minimal() +
# theme(axis.text.x = element_blank()) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.y = element_blank()) +
theme(panel.grid.minor.y = element_blank()) +
theme(panel.background = element_blank())
# - Mach_mit
pageSource <- dataSet %>%
count(Page, Source) %>%
group_by(Page) %>%
mutate(Percent = n/sum(n))
pageSource$Percent <- paste(round(pageSource$Percent*100, 2), "%", sep = "")
pageSourcePlot <- filter(pageSource, Page %in% 'Mach_mit')
ggplot(pageSourcePlot, aes(x = '',
y = n,
color = Source,
fill = Source,
label = Percent)) +
geom_bar(aes(x = '',
y = n,
color = Source,
fill = Source),
stat = "identity",
width = 1) +
coord_polar("y", start = 0) +
geom_text(aes(x = 1),
colour = "white",
fontface = "bold",
position = position_stack(vjust = 0.5),
size = 3,
show.legend = F) +
scale_fill_manual("legend", values = chartCols) +
scale_color_manual("legend", values = chartCols) +
ggtitle('Autumn Banner Campaign 2017:\nPage Views Sources for Mach_mit') +
xlab("Outter = Count") + ylab("") +
theme_minimal() +
# theme(axis.text.x = element_blank()) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.y = element_blank()) +
theme(panel.grid.minor.y = element_blank()) +
theme(panel.background = element_blank())
1.2.3 Banner Clicks: Campaign Total
The following charts represents the number of banner clicks for each campaign banner during the course of the campaign.
### --- Temporal Banner Clicks
# - Chart
chartCols <- brewer.pal(8, 'Set2')[c(1, 2, 3, 5, 6)]
clickPlotSet <- dataSet %>%
dplyr::select(Source, Date) %>%
dplyr::filter(grepl("_click", Source)) %>%
dplyr::group_by(Source, Date) %>%
dplyr::summarise(Count = n()) %>%
dplyr::arrange(Date)
ggplot(clickPlotSet, aes(x = Date,
y = Count,
group = Source,
color = Source,
fill = Source,
label = Count)) +
geom_bar(stat = "identity",
position = "dodge",
width = .5) +
scale_fill_manual("legend", values = chartCols) +
scale_color_manual("legend", values = chartCols) +
ggtitle('Autumn Banner Campaign 2017: Banner Clicks') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
1.2.4 Page Views: Campaign Total
The following chart presents the number of page views for the two landing pages and one registration page during the course of the campaign:
### --- Temporal Page Views
# - Chart
pagePlotSet <- dataSet %>%
dplyr::filter(!(Source == 'Other' | Source == 'Unknown')) %>%
dplyr::select(Page, Date) %>%
dplyr::group_by(Page, Date) %>%
dplyr::summarise(Count = n()) %>%
dplyr::arrange(Date)
ggplot(pagePlotSet, aes(x = Date,
y = Count,
group = Page,
color = Page,
fill = Page,
label = Count)) +
geom_bar(stat = "identity",
position = "dodge",
width = .2) +
scale_fill_manual("legend", values = chartCols) +
scale_color_manual("legend", values = chartCols) +
ggtitle('Autumn Banner Campaign 2017: Page Views') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, size = 8, hjust = 1)) +
theme(plot.title = element_text(size = 10)) +
theme(legend.title = element_blank()) +
theme(panel.grid.major.x = element_blank()) +
theme(panel.grid.minor.x = element_blank()) +
theme(panel.background = element_blank())
1.2.4 Page Views/Banner Clicks Dataset
The Page
column refers to either one of the two campaign landing pages or the registration page. The Source
column encompasses both campaign banner clicks and campaign pages as referers of the Page
. The Count
data have a daily resolution.
### --- Full Dataset (Table Report)
datatable(tableSet)