cx_translations
when the translation drafts are purged
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pd_display_all
# Apply the standard wmfdata chart styling to all Matplotlib plots below.
charting.set_mpl_style()
# Partition of wmf.mediawiki_history to query (interpolated into the SQL below).
MWH_SNAPSHOT = "2019-01"
# Per-revision detail for ContentTranslation edits since 2018-09: joins the
# revision-tags-change event stream to mediawiki_history (for deletion status)
# and to account-creation events (to classify translator experience).
# NOTE(review): the SQL comment equates 6 months with 26 weeks = 252 days,
# but 26 weeks is 182 days — confirm which threshold is actually intended.
translation_detail_r = hive.run("""
select
    date_format(event_timestamp, "YYYY-MM") as month,
    if(array_contains(rtc.tags, "contenttranslation-v2"), 2, 1) as cx_version,
    -- 6 months ≈ 26 weeks = 252 days
    event_user_text,
    event_timestamp,
    ssac.dt,
    if(
        coalesce(datediff(event_timestamp, ssac.dt) > 252, true),
        "experienced",
        "new"
    ) as user_experience,
    if(revision_is_deleted, "deleted", "surviving") as status
from event.mediawiki_revision_tags_change rtc -- First full month of data is 2018-09
inner join wmf.mediawiki_history mh
on
    mh.snapshot = "{mwh_snapshot}" and
    mh.event_timestamp >= "2018-09" and
    rtc.rev_id = mh.revision_id and
    `rtc`.`database` = mh.wiki_db
left join event_sanitized.serversideaccountcreation ssac
on
    ssac.event.username = rtc.performer.user_text and
    ssac.year >= 0
where
    event_entity = "revision" and
    event_type = "create" and
    array_contains(rtc.tags, "contenttranslation") and
    ((rtc.year = 2018 and rtc.month >= 9) or rtc.year = 2019)
""".format(
    mwh_snapshot=MWH_SNAPSHOT
))

# Monthly translation counts and distinct translators by experience level.
# The original referenced the IPython output-history variable `_116`, which
# only exists in the live notebook session; use the named result instead.
(
    translation_detail_r
    .groupby(["month", "user_experience"])
    .agg({
        "event_timestamp": "count",
        "event_user_text": "nunique"
    })
    .rename(columns={
        "event_timestamp": "translations",
        "event_user_text": "translators"
    })
    .unstack()
)
# Censoring based on varying time to deletion? Minimum time-to-deletion filter?
# Aggregate counts of CX translations per month, CX version, translator
# experience, and deletion status. Fixes the "surving" typo in the status
# label ("surviving"); the label is only used for display and column names.
# NOTE(review): the SQL comment equates 6 months with 26 weeks = 252 days,
# but 26 weeks is 182 days — confirm which threshold is actually intended.
translation_deletion_r = hive.run("""
select
    month,
    cx_version,
    user_experience,
    status,
    count(*) as translations
from (
    select
        date_format(event_timestamp, "YYYY-MM") as month,
        if(array_contains(rtc.tags, "contenttranslation-v2"), 2, 1) as cx_version,
        -- 6 months ≈ 26 weeks = 252 days
        if(
            coalesce(datediff(event_timestamp, ssac.dt) > 252, true),
            "experienced",
            "new"
        ) as user_experience,
        if(revision_is_deleted, "deleted", "surviving") as status
    from event.mediawiki_revision_tags_change rtc -- First full month of data is 2018-09
    inner join wmf.mediawiki_history mh
    on
        mh.snapshot = "{mwh_snapshot}" and
        mh.event_timestamp >= "2018-09" and
        rtc.rev_id = mh.revision_id and
        `rtc`.`database` = mh.wiki_db
    left join event_sanitized.serversideaccountcreation ssac
    on
        ssac.event.username = rtc.performer.user_text and
        ssac.year >= 0
    where
        event_entity = "revision" and
        event_type = "create" and
        array_contains(rtc.tags, "contenttranslation") and
        ((rtc.year = 2018 and rtc.month >= 9) or rtc.year = 2019)
) translations
group by
    month,
    cx_version,
    user_experience,
    status
""".format(
    mwh_snapshot=MWH_SNAPSHOT
))
# Display the raw aggregate, then split by CX version for per-version views.
translation_deletion_r

# NOTE: originally the wide-table display below ran *before*
# translation_deletion_cx1 was defined (out-of-order notebook cells), which
# raises NameError as a script; the definitions now come first.
translation_deletion_cx1 = translation_deletion_r.query("cx_version == 1")
translation_deletion_cx2 = translation_deletion_r.query("cx_version == 2")

# Wide table of CX1 translations per month: deleted vs. surviving counts,
# indexed by month with user_experience pulled back out as a column.
(translation_deletion_cx1
    .drop(["cx_version"], axis=1)
    .assign(month=lambda df: pd.to_datetime(df["month"]))
    .set_index(["month", "user_experience", "status"])
    .unstack()
    .reset_index("user_experience")
)

# One stacked bar chart per CX version: monthly translations by status.
dfs = [translation_deletion_cx1, translation_deletion_cx2]
for df in dfs:
    (df
        .drop(["cx_version"], axis=1)
        .assign(month=lambda df: pd.to_datetime(df["month"]))
        .set_index(["month", "user_experience", "status"])
        .unstack()
        .plot.bar(
            stacked=True
        ))
translators = mariadb.run("""
select
month,
cx_version,
user_experience,
count(distinct user_name) as users_starting_translations
from (
select
date_format(str_to_date(translation_start_timestamp, "%Y%m%d%H%i%S"), "%Y-%m") as month,
translation_cx_version as cx_version,
if(
datediff(
str_to_date(translation_start_timestamp, "%Y%m%d%H%i%S"),
-- For old accounts, this is when the user account was centralized, but since we're looking
-- post-2017 only, the distiction doesn't matter
str_to_date(gu_registration, "%Y%m%d%H%i%S")
) > 252,
"experienced",
"new"
) as user_experience,
gu_name as user_name
from
wikishared.cx_translations
left join
centralauth.globaluser
on gu_id = translation_started_by
where
translation_start_timestamp > "2018"
) translators
group by
month,
cx_version,
user_experience
""")
translators.head()
# One bar chart per CX version: monthly users starting translations,
# broken down by new vs. experienced accounts.
translators_cx1 = translators.query("cx_version == 1")
translators_cx2 = translators.query("cx_version == 2")
dfs = [translators_cx1, translators_cx2]
for frame in dfs:
    monthly = frame.drop(["cx_version"], axis=1)
    monthly = monthly.assign(month=lambda d: pd.to_datetime(d["month"]))
    monthly = monthly.set_index(["month", "user_experience"]).unstack()
    monthly.plot.bar()
The "token" in CX event logging is the user name... 😛
# NOTE(review): incomplete query — the select list is cut off and there is no
# FROM keyword before wikishared.cx_translation, so this cell fails if run.
# Left as-is pending the author's intended query.
mariadb.run("""
select
date_format(str_to_date(translation_start_timestamp, "%Y%m%d%H%i%S"), "%Y-%m") as month,
wikishared.cx_translation
""")
Have to query the cx_translations table, get the output, and upload it to Hive so I can combine it with publishing-failure data from EventLogging.
# Rebuild the Hive copy of the translators dataset so it can be joined with
# EventLogging publishing-failure data.
hive.run("drop table neilpquinn.cx_translators")

path = "data/cx_translators.csv"
# index=False: without it, to_csv writes the integer index as an extra
# unnamed first column, shifting every field relative to the Hive schema.
translators.to_csv(path, index=False)

# NOTE(review): the original field_spec ("start_month string, user_name
# string, cx_version int") did not match the columns of `translators`
# (month, cx_version, user_experience, users_starting_translations) —
# corrected to match; confirm against the intended table schema.
hive.load_csv(
    path,
    field_spec="month string, cx_version int, user_experience string, users_starting_translations int",
    db_name="neilpquinn",
    table_name="cx_translators"
)
# Placeholder for a forthcoming translator-stats query; intentionally empty.
translators_stats = """
"""