https://phabricator.wikimedia.org/T228721
The purpose of this task is to QC the edits_hourly data in Turnilo and Superset and confirm the content page edit counts are correct by comparing mediawiki_history to mariadb replicas.
Documentation and analaysis of previous issues leading to missing content page info: https://github.com/wikimedia-research/2019-02-active-editors-discrepancy/blob/master/analysis.ipynb and task https://phabricator.wikimedia.org/T221338
# https://stackoverflow.com/a/35018739/1091835
library(IRdisplay)
display_html(
'<script>
code_show=true;
function code_toggle() {
if (code_show){
$(\'div.input\').hide();
} else {
$(\'div.input\').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
<input type="submit" value="Click here to toggle on/off the raw code.">
</form>'
)
shhh <- function(expr) suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
shhh({
library(magrittr); library(zeallot); library(glue); library(tidyverse); library(lubridate)
library(scales)
})
# date range - limiting to 2019
start_date <- "20190101"
start_date_human <- format(as.Date(start_date, '%Y%m%d'), '%d %B %Y')
end_date <- "20190630"
end_date_human <- format(as.Date(end_date, '%Y%m%d'), '%d %B %Y')
# Pass a sample of top sized ranked wiki and ones dentified as having a high volume of editors
# Also some wikis identified as having missing page info in https://phabricator.wikimedia.org/T221338]
target_wikis <- c(
"jawiki" = "Japanese", "itwiki" = "Italian",
"fawiki" = "Persian", "arwiki" = "Arabic", "idwiki" = "Indonesian", "thwiki" = "Thai", "enwiki" = "English",
"dewiki" = "German", "frwiki" = "French", "ruwiki" = "Russian",
"zhwiki" = "Chinese", "ptwiki" = "Portuguese", "nlwiki" = "Nederlandstalige"
)
## Find content space edits on MariaDB replicas.
## Refer to https://meta.wikimedia.org/wiki/Research:Editor_month_dataset
query <- "SELECT
database() as wiki,
date,
ifnull(sum(content_edits), 0) as content_edits
FROM
(
SELECT
DATE(LEFT(rev_timestamp, 8)) as date,
sum(page_namespace = 0 or cn.namespace is not null) as content_edits
FROM revision
left join page on rev_page = page_id
left join staging.content_namespaces cn on database() = wiki and page_namespace = namespace
WHERE rev_timestamp >= '{start_date}' AND
rev_timestamp < '{end_date}'
union all
SELECT
DATE(LEFT(ar_timestamp, 8)) as date,
sum(ar_namespace = 0 or cn.namespace is not null) as content_edits
FROM archive
left join staging.content_namespaces cn on database() = wiki and ar_namespace = namespace
WHERE ar_timestamp >= '{start_date}' AND
ar_timestamp < '{end_date}'
) revs
group by date;"
content_edits_replicas <- map_df(
set_names(names(target_wikis), names(target_wikis)),
~ shhh(wmf::mysql_read(glue(query), .x)),
.id = "wiki"
)
save(content_edits_replicas, file="Data/content_edits_replicas.RData")
load("Data/content_edits_replicas.RData")
content_edits_replicas$date <- as.Date(content_edits_replicas$date, format = "%Y-%m-%d")
content_edits_replicas_monthly <- content_edits_replicas %>%
mutate(date = floor_date(date, "month")) %>%
group_by(date) %>%
summarise(content_edits = sum(content_edits)) %>%
mutate(type = 'MariaDB')
head(content_edits_replicas_monthly)
I verified the monthly content numbers from the replicas with the data available in Turnilo and Superset. Now to check against the mediawiki_history on hive.
## Select main page namespace edits from mediawiki_history using the page_namespace_is_content_historical
# In terminal
# spark2R --master yarn --executor-memory 2G --executor-cores 1 --driver-memory 4G
# use https://github.com/wikimedia-research/Editing-movement-metrics/blob/master/queries/update_editor_month.sql as a reference
query <- "SELECT date_format(event_timestamp, 'yyyy-MM-dd') as date,
wiki_db as wiki,
coalesce(sum(cast(page_namespace_is_content_historical as int)), 0
) as content_edits
FROM wmf.mediawiki_history
WHERE
event_entity = 'revision' AND
event_type = 'create' and
event_timestamp IS NOT NULL and
wiki_db in ('jawiki', 'itwiki', 'fawiki', 'arwiki' , 'idwiki' , 'thwiki' , 'enwiki', 'dewiki' ,
'frwiki' , 'ruwiki' , 'zhwiki' , 'ptwiki' , 'nlwiki' ) and
event_timestamp >= '2019-01-01' and
event_timestamp < '2019-06-30' and
snapshot = '2019-07'
GROUP BY wiki_db, date_format(event_timestamp, 'yyyy-MM-dd')"
results <- collect(sql(query))
save(results, file="R/Data/content_edits_hive.RData")
load("Data/content_edits_hive.RData")
content_edits_datalake$date <- as.Date(content_edits_datalake$date, format = "%Y-%m-%d")
Confirmed that the hive numbers match with the edit counts available in Turnilo and Superset.
content_edits_datalake_monthly <- content_edits_datalake %>%
mutate(date = floor_date(date, "month")) %>%
group_by(date) %>%
summarise(content_edits = sum(content_edits)) %>%
mutate(type = 'data_lake')
head(content_edits_datalake)
Verified hive content edits with the edits_hourly data available in Superset and Turnilo. Now to double check any discrepancies.
content_edits_combined_monthly <- rbind(content_edits_datalake_monthly,
content_edits_replicas_monthly)
content_edits_combined_monthly
p <- ggplot(content_edits_combined_monthly, (aes(x=date, y= content_edits, color = type)))+
geom_line() +
scale_y_continuous("total content edits per day", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%b %Y"), date_breaks = "1 month") +
labs(title = "Monthly Content Edits: Replicas vs Data Lake") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid = element_line("gray70"))
p
content_edits_diff_monthly <- content_edits_combined_monthly %>%
spread(type, content_edits) %>%
mutate(diff = data_lake - MariaDB,
diff_percent = (data_lake - MariaDB)/MariaDB *100)
head(content_edits_diff_monthly)
# Plot the difference beween Hive and MariaDB edits
p <- ggplot(content_edits_diff_monthly, (aes(x=date, y= abs(diff))))+
geom_col(fill = 'dark blue') +
scale_y_continuous("Diff in edit count per month", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%b %Y"), date_breaks = "1 month") +
labs(title = "Difference between total monthly Content Edits \n from DataLake and MariaDB") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5),
panel.grid = element_line("gray70"))
p
There is a slight difference (under 0.1%) between the DataLake and MariaDB content namespace edit counts using the two query above, which is likely due to difference in the two datasets.
Note: MariaDB query shows the numbers of edits to the main namespace. This is not exactly the same thing as edits to content namespaces, but it's a pretty close approximation. See https://meta.wikimedia.org/wiki/Research:Content_namespace
query <-
"SELECT
revision_is_deleted_by_page_deletion as rev_deleted,
page_is_deleted as page_deleted,
page_title IS NULL as null_title,
page_namespace IS NULL as null_namespace,
page_namespace_is_content IS NULL as null_is_content_namespace,
page_title_historical IS NULL as null_historical_title,
page_namespace_historical IS NULL as null_historical_namespace,
page_namespace_is_content_historical IS NULL as null_historical_is_content_namespace,
COUNT(1) as c
FROM wmf.mediawiki_history
WHERE snapshot = '2019-06'
AND event_entity = 'revision'
GROUP BY
revision_is_deleted_by_page_deletion,
page_is_deleted,
page_title IS NULL,
page_namespace IS NULL,
page_namespace_is_content IS NULL,
page_title_historical IS NULL,
page_namespace_historical IS NULL,
page_namespace_is_content_historical IS NULL
ORDER BY
rev_deleted,
page_deleted,
null_title,
null_namespace,
null_is_content_namespace,
null_historical_title,
null_historical_namespace,
null_historical_is_content_namespace
LIMIT 1000"
namespace_check <- wmf::query_hive(query)
namespace_check
Results of the above query look as expected. About 95% of all revisions that are not deleted do not belong to a deleted page and have all their page_namespace info set correctly.
Nlwiki previously had 5160672 deleted pages with null set for page_namespace_historical, page_namespace_is_content and page_namespace_is_content_historical. I did a spot check on wmf.mediawikihistory to confirm this is no longer true.
nlwiki_query <- "SELECT COUNT(*)
FROM wmf.mediawiki_history
WHERE
wiki_db = 'nlwiki'
AND page_is_deleted = TRUE
AND page_namespace_historical = NULL
AND page_namespace_is_content = NULL
AND page_namespace_is_content_historical = NULL"
nlwiki_page_check <- wmf::query_hive(nlwiki_query)
Confirmed that there are 0 pages in nlwiki where this condition is met using the current snapshot of mediawiki_history.
Redo check of active counts using Neil's method documented here
# User previsous "official" active editor count data pulled using MariaDB
metrics_url <- "https://raw.githubusercontent.com/wikimedia-research/Editing-movement-metrics/75b3251727f8c766e4872f775f57a09632df6500/metrics/metrics.tsv"
official_ae_raw <- read.csv(url(metrics_url),sep="\t")
tail(official_ae_raw)
official_ae_raw$month <- as.Date(official_ae_raw$month, format = "%Y-%m-%d")
official_ae <- official_ae_raw %>%
select('month', 'active_editors')%>%
filter(month != '2005-06-01') %>% #Maria DB appears to have a duplicate value for this date
mutate(type = "MariaDb")
#Create intermediate hive table to calculate results
hive.run(["""
CREATE TABLE IF NOT EXISTS mneisler.editor_month_new (
`wiki` STRING,
`month` TIMESTAMP, -- Hive 1.1 does not support the DATE type
`local_user_id` BIGINT,
`user_name` STRING,
`edits` BIGINT,
`content_edits` BIGINT,
`bot_flag` BOOLEAN,
`user_registration` TIMESTAMP
)
STORED AS PARQUET
""", """
insert into mneisler.editor_month_new
select
wiki_db as wiki,
trunc(event_timestamp, "MONTH") as month,
event_user_id as local_user_id,
max(event_user_text) as user_name, -- Some rows incorrectly have a null `event_user_text`
count(*) as edits,
coalesce(
sum(cast(page_namespace_is_content_historical as int)),
0
) as content_edits,
(
max(array_contains(event_user_groups, "bot")) or
max(array_contains(event_user_groups_historical, "bot"))
) as bot,
min(event_user_creation_timestamp) as user_registration
from wmf.mediawiki_history
where
event_timestamp < "2019-02-01" and
event_entity = "revision" and
event_type = "create" and
snapshot = "2019-06"
group by
trunc(event_timestamp, "MONTH"),
wiki_db,
event_user_id
"""])
#Look at active editor numbers via hive.
query <- "
select
month,
count(*) as active_editors
from (
select
cast(month as date) as month,
user_name,
sum(content_edits) as content_edits,
max(bot_flag) as bot_flag
from mneisler.editor_month_new
where
local_user_id != 0
group by month, user_name
) global_edits
where
content_edits >= 5 and
not bot_flag and
user_name not regexp 'bot\\b'
group by month
"
new_ae_raw = wmf::query_hive(query)
new_ae_raw$month <- as.Date(new_ae_raw$month, format = "%Y-%m-%d")
new_ae <- new_ae_raw %>%
filter(month != '2005-06-01') %>% # filtered out duplicate month that was in MariaDB so I can do direct comparison
mutate(type = "data_lake")
deviation <- rbind(new_ae, official_ae)
comparison_plot <- rbind(new_ae, official_ae) %>%
ggplot(aes(x=month, y= active_editors, color = type)) +
geom_line() +
scale_y_continuous("monthly active editors", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%b %Y"), date_breaks = "12 months") +
labs(title = "Monthly active editors from Data Lake and MariaDb") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5))
comparison_plot
It looks like is minimal difference between active editor date from the official and new ae metrics. This metric appears to be fixed. Let's confirm the difference.
deviation_plot <- rbind(new_ae, official_ae) %>%
spread(type, active_editors) %>%
mutate(diff = data_lake-MariaDb) %>%
ggplot(aes(x=month, y= diff)) +
geom_line() +
scale_y_continuous("Diff in monthly active edtiors", labels = polloi::compress) +
scale_x_date("Date", labels = date_format("%b %Y"), date_breaks = "12 months") +
labs(title = "Difference between total monthly active edits \n from DataLake and MariaDB") +
ggthemes::theme_tufte(base_size = 12, base_family = "Gill Sans") +
theme(axis.text.x=element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5))
deviation_plot
The difference between the two is under 1%. The highest monthly difference between the two is 750 active editors per month which is a signficiantly improved from the previous difference of 10,000 active editors between the two dataset.
head(official_ae)
head(new_ae)