Summary
In June 2019, there was a slight decline in year-over-year pageviews (-0.9%). This was the first YOY decline in total pageviews since May/June 2018. We'd like to look into possible reasons for this yoy decline this month and flattening the past quarter.
Approach: Review pageviews in broken down by the following:
Data Notes:
shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(magrittr); library(zeallot); library(glue); library(tidyverse); library(glue); library(lubridate)
library(scales)
})
#pageviews by platform since May 2019.
query <- "
SELECT CONCAT(year,'-',LPAD(month,2,'0'),'-',LPAD(day,2,'0')) AS date,
access_method, SUM(view_Count)as views
FROM wmf.pageview_hourly
WHERE year = 2019 and month >= 5
AND agent_type='user'
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
GROUP BY year, month, day, access_method;"
pageviews_byplatform <- wmf::query_hive(query)
pageviews_byplatform$date <- as.Date(pageviews_byplatform$date, format = "%Y-%m-%d")
p <- pageviews_byplatform %>%
filter(date < '2019-07-01') %>%
ggplot(aes(x=date, y =views, color = access_method)) +
geom_line() +
scale_y_continuous("daily pageviews", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%Y-%m-%d"), date_breaks = "1 week")+
labs(title = "Total daily Wikimedia pageviews by access method") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
panel.grid = element_line("gray70"))
p
There was a yoy decline in desktop (-11.9%) while mobile web pageviews increased yoy by +7.8%. We typically report a yoy decline in desktop each month but this one was higher compared to previous months (-8.3% in May and -9.4% in April).
I will investigate the decline in desktop by breaking down by various factors.
#Find desktop pageviews by country to calculate yoy and mom changes on desktop
query <- "
SELECT country,
SUM(IF((month =5 and year = 2019), view_count, null)) AS may19_views,
SUM(IF((month =6 and year = 2019), view_count, null)) AS june19_views,
SUM(IF((month =6 and year = 2018), view_count, null)) AS june18_views
FROM wmf.pageview_hourly
WHERE
((year = 2018 and month =6) OR (year=2019 and (month = 5 OR month =6)))
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
AND agent_type = 'user'
AND access_method = 'desktop'
GROUP BY country;"
pageviews_bycountry <- wmf::query_hive(query)
pageviews_bycountry_clean <- pageviews_bycountry
pageviews_bycountry_clean$june18_views <- as.numeric(pageviews_bycountry_clean$june18_views)
pageviews_bycountry_clean$june19_views <- as.numeric(pageviews_bycountry_clean$june19_views)
pageviews_bycountry_clean$may19_views <- as.numeric(pageviews_bycountry_clean$may19_views)
top_declines_bycountry <- pageviews_bycountry_clean %>%
filter(june19_views > 100000) %>% ## filter out countries with extremely low traffic
group_by(country) %>%
mutate(yoy_percent = (june19_views/june18_views-1)*100,
mom_percent = (june19_views/may19_views-1)* 100) %>%
arrange(yoy_percent)
head(top_declines_bycountry, 10)
There was a significant yoy decline (> 50%) in China, Angola and Sudan. China has been blocked on all Wikipedias since April 2019 and has had a significant decrease in overall pageviews since that date. http://bit.ly/2OFQq7V I'll look further into China YOY changes to see the rates of YoY decrease.
#Daily china pageviews since 2018
#Did not apply IE correction since bug was not relevant for views from China
query <- "
SELECT CONCAT(year,'-',LPAD(month,2,'0'),'-',LPAD(day,2,'0')) AS date,
SUM(view_Count)as views
FROM wmf.pageview_hourly
WHERE year >= 2018
AND agent_type='user'
AND access_method = 'desktop'
AND country = 'China'
GROUP BY year, month, day;"
pageviews_china <- wmf::query_hive(query)
pageviews_china$date <- as.Date(pageviews_china$date, format = "%Y-%m-%d")
#Various adjustments to allow for yoy comparison.
pageviews_china_monthly <- pageviews_china %>%
filter(date < '2019-07-01') %>%
mutate(date = floor_date(date, "month")) %>%
group_by(date) %>%
summarise(views = sum(as.numeric(views))) %>%
mutate(year = ifelse(date >= '2018-01-01' & date < '2019-01-01', '2018', '2019'),
MonthN =as.factor(format(as.Date(date),"%m")),
Month = months(as.Date(date), abbreviate=TRUE))
pageviews_china_monthly
pageviews_china_yoy <- pageviews_china_monthly %>%
ggplot(aes(x=MonthN, y = views, group = year, color = year)) +
geom_line(size = 1) +
scale_y_continuous("pageviews per month", labels = polloi::compress) +
scale_x_discrete(breaks = pageviews_china_monthly$MonthN, labels = pageviews_china_monthly$Month)+
labs(title = "Monthly wikimedia pageviews from China \n year over year comparison") +
ggthemes::theme_tufte(base_size = 14, base_family = "Gill Sans") +
theme(plot.title = element_text(hjust = 0.5),
legend.title = element_blank(),
legend.position = "bottom",
panel.grid = element_line("gray70"),
legend.key.width=unit(1.5,"cm")) +
xlab("Month") +
scale_linetype_manual(breaks=c('2018','2019'), values=c(2,1))
pageviews_china_yoy
# Daily Wikimedia pageviews on desktop in China for 2019
pageviews_china_weekly <- pageviews_china %>%
filter(date < '2019-07-01') %>%
mutate(date = floor_date(date, "week")) %>%
group_by(date) %>%
summarise(views = sum(views)) %>%
ggplot(aes(x=date, y =views)) +
geom_line(color= "blue") +
geom_vline(xintercept = as.numeric(as.Date("2019-04-23")),
linetype = "dashed", color = "black") +
geom_text(aes(x=as.Date('2019-04-23'), y=30E6, label="All Wikipedias blocked in China"), size=3.5, vjust = -1.2, angle = 90, color = "black") +
scale_y_continuous("Weekly pageviews", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%Y-%m-%d"), date_breaks = "1 month")+
labs(title = "Weekly wikimedia pageviews from China") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
panel.grid = element_line("gray70"))
ggsave("Figures/pageviews_china_weekly.png", pageviews_china_weekly, width = 18, height = 9, units = "in", dpi = 150)
pageviews_china_weekly
#Find estimated percentage decline in overall pageviews due to China drop in pageviews.
china_drop <- top_declines_bycountry %>%
filter(country == 'China') %>%
summarise(num_decrease = june18_views - june19_views)
#Total Wikimedia June 2018 Pageviews across all countries is 6570949411
china_drop_percent <- china_drop$num_decrease/ 6570949411 *100
china_drop_percent
Following the China block at the end of April 2019, pageviews have declined with the high YoY declines . In June 2019, there was a 74M year over year pageview drop in China across all Wikimedia projects on desktop. This represents about a little over 1% year over year decrease in all desktop Wikimedia traffic.
#Find pageviews by continent to calculate yoy and mom changes on desktop
query <- "
SELECT continent,
SUM(IF((month =5 and year = 2019), view_count, null)) AS may19_views,
SUM(IF((month =6 and year = 2019), view_count, null)) AS june19_views,
SUM(IF((month =6 and year = 2018), view_count, null)) AS june18_views
FROM wmf.pageview_hourly
WHERE
((year = 2018 and month =6) OR (year=2019 and (month = 5 OR month =6)))
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
AND agent_type = 'user'
AND access_method = 'desktop'
GROUP BY continent;"
pageviews_bycontinent <- wmf::query_hive(query)
pageviews_bycontinent_clean <- pageviews_bycontinent
pageviews_bycontinent_clean$june18_views <- as.numeric(pageviews_bycontinent_clean$june18_views)
pageviews_bycontinent_clean$june19_views <- as.numeric(pageviews_bycontinent_clean$june19_views)
pageviews_bycontinent_clean$may19_views <- as.numeric(pageviews_bycontinent_clean$may19_views)
top_declines_bycontinent <- pageviews_bycontinent_clean %>%
group_by(continent) %>%
mutate(yoy_percent = (june19_views/june18_views-1)*100,
mom_percent = (june19_views/may19_views-1)* 100) %>%
arrange(yoy_percent)
head(top_declines_bycontinent, 10)
#Daily pageviews by continent
query <- "
SELECT CONCAT(year,'-',LPAD(month,2,'0'),'-',LPAD(day,2,'0')) AS date,
continent, SUM(view_Count)as views
FROM wmf.pageview_hourly
WHERE year = 2019 and month >= 5
AND agent_type='user'
AND access_method = 'desktop'
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
GROUP BY year, month, day, continent;"
pageviews_bycontinent_daily <- wmf::query_hive(query)
pageviews_bycontinent_daily$date <- as.Date(pageviews_bycontinent_daily$date, format = "%Y-%m-%d")
p <- pageviews_bycontinent_daily %>%
filter(date < '2019-07-01') %>%
ggplot(aes(x=date, y =views, color = continent)) +
geom_line() +
scale_y_continuous("daily pageviews", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%Y-%m-%d"), date_breaks = "1 week")+
labs(title = "Total daily Wikimedia pageviews by Continent") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
panel.grid = element_line("gray70"))
p
All continents saw YoY declines in desktop pageviews ranging from -8% (Oceania) to -24% (Africa). YoY declines were not strongly isolated to one individual continent.
#Pageviews by project, YoY and MoM changes
query <- "
SELECT project,
SUM(IF((month =5 and year = 2019), view_count, null)) AS may19_views,
SUM(IF((month =6 and year = 2019), view_count, null)) AS june19_views,
SUM(IF((month =6 and year = 2018), view_count, null)) AS june18_views
FROM wmf.pageview_hourly
WHERE
((year = 2018 and month =6) OR (year=2019 and (month = 5 OR month =6)))
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
AND agent_type = 'user'
AND access_method = 'desktop'
GROUP BY project;"
pageviews_byproject <- wmf::query_hive(query)
pageviews_byproject_clean <- pageviews_byproject
pageviews_byproject_clean$june18_views <- as.numeric(pageviews_byproject_clean$june18_views)
pageviews_byproject_clean$june19_views <- as.numeric(pageviews_byproject_clean$june19_views)
pageviews_byproject_clean$may19_views <- as.numeric(pageviews_byproject_clean$may19_views)
top_declines_byproject <- pageviews_byproject_clean %>%
filter(project %in% c("en.wikipedia", 'es.wikipedia', 'de.wikipedia', 'ja.wikipedia',
'fr.wikipedia', 'ru.wikipedia', 'commons.wikimedia', 'it.wikipedia', 'zh.wikipedia', 'pt.wikipedia')) %>%
## Isolate to projects that account for largets portion of June traffic
group_by(project) %>%
mutate(yoy_percent = (june19_views/june18_views-1)*100,
mom_percent = (june19_views/may19_views-1)* 100) %>%
arrange(yoy_percent)
top_declines_byproject
Commons Wikimedia saw the only YoY increase out of the top 10 visited wikimedia projects. All the other projects had yoy decreases ranging from -21% to -8%. Note that Wikimedia Commons has not been reported as blocked by China, which may be an indication that the block is attributing to a portion in yoy decline on the other language Wikipedias.
#Comparison of june 2018 and June 2019 Pageviews by Projects
pageviews_byproject_yoy <- top_declines_byproject %>%
gather(year, views, june19_views:june18_views) %>%
ggplot(aes(x=year, y= views, fill = year)) +
geom_col() +
scale_y_continuous("total pageviews", labels = polloi::compress) +
facet_wrap(~project) +
labs(title = "Comparison of June 2018 and June 2019 Pageviews by Projects") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_blank(),
panel.grid = element_line("gray70"),
axis.title.x=element_blank())
ggsave("Figures/pageviews_byproject_yoy .png", pageviews_byproject_yoy , width = 18, height = 9, units = "in", dpi = 150)
pageviews_byproject_yoy
#Daily pageviews by project
query <- "
SELECT CONCAT(year,'-',LPAD(month,2,'0'),'-',LPAD(day,2,'0')) AS date,
project, SUM(view_Count)as views
FROM wmf.pageview_hourly
WHERE year = 2019 and month >= 5
AND agent_type='user'
AND access_method = 'desktop'
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
GROUP BY year, month, day, project;"
pageviews_byproject_daily <- wmf::query_hive(query)
pageviews_byproject_daily$date <- as.Date(pageviews_byproject_daily$date, format = "%Y-%m-%d")
p <- pageviews_byproject_daily %>%
filter(date < '2019-07-01',
project %in% c('es.wikipedia', 'de.wikipedia', 'ja.wikipedia',
'fr.wikipedia', 'ru.wikipedia', 'commons.wikimedia', 'it.wikipedia', 'zh.wikipedia', 'pt.wikipedia')) %>%
ggplot(aes(x=date, y =views, color = project)) +
geom_line() +
scale_y_continuous("daily pageviews", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%Y-%m-%d"), date_breaks = "1 week")+
labs(title = "Total daily Wikimedia pageviews by Project") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
panel.grid = element_line("gray70"))
p
#Pageviews by referer project, YoY and MoM changes
query <- "
SELECT referer_class,
SUM(IF((month =5 and year = 2019), view_count, null)) AS may19_views,
SUM(IF((month =6 and year = 2019), view_count, null)) AS june19_views,
SUM(IF((month =6 and year = 2018), view_count, null)) AS june18_views
FROM wmf.pageview_hourly
WHERE
((year = 2018 and month =6) OR (year=2019 and (month = 5 OR month =6)))
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
AND agent_type = 'user'
AND access_method = 'desktop'
GROUP BY referer_class;"
pageviews_byreferer <- wmf::query_hive(query)
top_declines_byreferer <- pageviews_byreferer %>%
group_by(referer_class) %>%
mutate(yoy_percent = (june19_views/june18_views-1)*100,
mom_percent = (june19_views/may19_views-1)* 100) %>%
arrange(yoy_percent)
top_declines_byreferer
There was a yoy decline across all referer_class types.
#Comparison of june 2018 and June 2019 Pageviews by Referer Class
pageviews_byreferer_yoy <- top_declines_byreferer %>%
filter(referer_class != "unknown") %>% #remove unknown referers
gather(year, views, june19_views:june18_views) %>%
ggplot(aes(x=year, y= views, fill = year)) +
geom_col() +
scale_y_continuous("total pageviews", labels = polloi::compress) +
facet_wrap(~referer_class) +
labs(title = "Comparison of June 2018 and June 2019 \n pageviews by referer class") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_blank(),
panel.grid = element_line("gray70"),
axis.title.x=element_blank())
pageviews_byreferer_yoy
I also reviewed a breakdown of pageviews by major search engine using the discovery wmflabs dashboard
There are no signficant changes across June seen for any of the major external serarch engines.
Historically, there are fewer views on desktop on the weekends compared to weekdays. To investigate if the higher number of weekend days in June led to a decrease I'll look at the total views on weekends and weekydays in June 2019 and compare to June 2018 last year.
I first calculated calulcated the average weekday and weekend pageviews in both June 2018 and June 2019.
## Collect daily pageviews on desktop in June 2018 and June 2019.
query <- "
SELECT CONCAT(year,'-',LPAD(month,2,'0'),'-',LPAD(day,2,'0')) AS date,
SUM(view_count)as views
FROM wmf.pageview_hourly
WHERE ((year = 2019 and month = 6) OR (year = 2018 and month = 6))
AND agent_type='user'
AND access_method = 'desktop'
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
GROUP BY year, month, day;"
pageviews_daily <- wmf::query_hive(query)
pageviews_daily$date <- as.Date(pageviews_daily$date, format = "%Y-%m-%d")
#Insert day of the week
pageviews_daily_wday <- pageviews_daily
pageviews_daily_wday$day <- weekdays(as.Date(pageviews_daily_wday$date))
# Find average weekend and weekday views on desktop
wday_avg <- pageviews_daily_wday %>%
mutate(date = floor_date(date, "month"),
day_type = ifelse(day %in% c('Saturday', 'Sunday'), 'weekend', 'weekday'),
year = ifelse(date =='2019-06-01', 'June 2019', 'June 2018')) %>%
group_by(year, day_type) %>%
summarise(avg_daily_views = mean(as.numeric(views)),
total_views = sum(as.numeric(views)))
wday_avg
There was a YoY decrease in both weekday and weekend pageviews in June consistent with an overall decrease in traffic.
There was one additional weekend day in June 2019 compared to June 2018. Since there are fewer views on desktop on the weekends compared to weekdays, this might have also attributed to a small portion of the overall YoY decline on desktop.
To calculate, I multiplied the difference in number of weekend days for each June (1) by the average 2019 difference in weekend and weekday daily pageviews to estimate the drop in pageviews from the additional weekend day.
#Count number of day types
wday_counts <- pageviews_daily_wday %>%
mutate(date = floor_date(date, "month"),
day_type = ifelse(day %in% c('Saturday', 'Sunday'), 'weekend', 'weekday'),
year = ifelse(date =='2019-06-01', 'June 2019', 'June 2018')) %>%
group_by(year) %>%
summarise(weekday_num = sum(day_type == 'weekday'),
weekend_num = sum(day_type == 'weekend'))
wday_counts
# Calculate pageview drop in June attibuted to number of weekends.
# Proposed equation: Multiply the difference in weekend numbers by avg 2019 difference in weekend and weekday previews.
drop_num <- (wday_counts[2, 3] - wday_counts[1, 3]) * (wday_avg[3, 3] - wday_avg[4, 3])
#Find estimated percentage drop in June 2019 traffic due to additional weekend day
#Note: 6570949411 is the total num of June 2018 pageviews)
drop_percent <- (drop_num/ 6570949411) *100
drop_percent
The additional weekend day in June 2019 led to an estimated 46.9M pageview drop across all Wikimedia projects on desktop or about 0.7% decrease in overall desktop pageviews in June 2019.
#Find desktop pageviews by browser family to calculate yoy and mom changes
query <- "
SELECT user_agent_map['browser_family'] as browser,
SUM(IF((month =5 and year = 2019), view_count, null)) AS may19_views,
SUM(IF((month =6 and year = 2019), view_count, null)) AS june19_views,
SUM(IF((month =6 and year = 2018), view_count, null)) AS june18_views
FROM wmf.pageview_hourly
WHERE
((year = 2018 and month =6) OR (year=2019 and (month = 5 OR month =6)))
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
AND agent_type = 'user'
AND access_method = 'desktop'
AND user_agent_map['browser_family'] IN ('IE', 'Chrome', 'Firefox', 'Safari', 'Edge', 'Other', 'Opera')
GROUP BY user_agent_map['browser_family'];"
pageviews_bybrowser <- wmf::query_hive(query)
head(pageviews_bybrowser)
top_declines_bybrowser <- pageviews_bybrowser %>%
filter(june19_views > 1) %>%
group_by(browser) %>%
mutate(yoy_percent = (june19_views/june18_views-1)*100,
mom_percent = (june19_views/may19_views-1)* 100) %>%
arrange(yoy_percent)
top_declines_bybrowser
#Daily pageviews by browser
query <- "
SELECT CONCAT(year,'-',LPAD(month,2,'0'),'-',LPAD(day,2,'0')) AS date,
user_agent_map['browser_family'] as browser, SUM(view_Count)as views
FROM wmf.pageview_hourly
WHERE year = 2019 and month >= 5
AND agent_type='user'
AND access_method = 'desktop'
AND NOT (country_code IN ('PK', 'IR', 'AF') -- https://phabricator.wikimedia.org/T157404#3194046
AND user_agent_map['browser_family'] = 'IE') -- https://phabricator.wikimedia.org/T193578#4300284
AND user_agent_map['browser_family'] IN ('IE', 'Chrome', 'Firefox', 'Safari', 'Edge', 'Other', 'Opera')
GROUP BY year, month, day, user_agent_map['browser_family'];"
pageviews_bybrowser_daily <- wmf::query_hive(query)
pageviews_bybrowser_daily$date <- as.Date(pageviews_bybrowser_daily$date, format = "%Y-%m-%d")
p <- pageviews_bybrowser_daily %>%
filter(date < '2019-07-01') %>%
ggplot(aes(x=date, y =views, color = browser)) +
geom_line() +
scale_y_continuous("daily pageviews", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%Y-%m-%d"), date_breaks = "1 week")+
labs(title = "Total daily Wikimedia pageviews by Browser") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
panel.grid = element_line("gray70"))
p
There were YoY declines across all browser groups except Edge and Opera but no significants drops associated with any one browser family.