Issues and notes¶

Have to rewrite data access script
Rows are not deleted from cx_translations when the translation drafts are purged

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pd_display_all

charting.set_mpl_style()

MWH_SNAPSHOT = "2019-01"

Surviving translations by newcomers¶

hive.run("""
select
    date_format(event_timestamp, "YYYY-MM") as month,
    if(array_contains(rtc.tags, "contenttranslation-v2"), 2, 1) as cx_version,
    -- 6 months ≈ 26 weeks = 252 days
    event_user_text,
    event_timestamp,
    ssac.dt,
    if(
        coalesce(datediff(event_timestamp, ssac.dt) > 252, true),
        "experienced",
        "new"
    ) as user_experience,
    if(revision_is_deleted, "deleted", "surving") as status
from event.mediawiki_revision_tags_change rtc -- First full month of data is 2018-09
inner join wmf.mediawiki_history mh
on
    mh.snapshot = "{mwh_snapshot}" and
    mh.event_timestamp >= "2018-09" and
    rtc.rev_id = mh.revision_id and
    `rtc`.`database` = mh.wiki_db
left join event_sanitized.serversideaccountcreation ssac
on
    ssac.event.username = rtc.performer.user_text and
    ssac.year >= 0
where
    event_entity = "revision" and
    event_type = "create" and
    array_contains(rtc.tags, "contenttranslation") and
    ((rtc.year = 2018 and rtc.month >= 9) or rtc.year = 2019)
""".format(
    mwh_snapshot=MWH_SNAPSHOT
))

_116.groupby(["month", "user_experience"]).agg({
    "event_timestamp": "count",
    "event_user_text": "nunique"
}).rename(columns={
    "event_timestamp": "translations",
    "event_user_text": "translators"
}).unstack()

# Censoring based on varying time to deletion? Minimum time-to-deletion filter?
translation_deletion_r = hive.run("""
select
    month, 
    cx_version,
    user_experience,
    status,
    count(*) as translations
from (
    select
        date_format(event_timestamp, "YYYY-MM") as month,
        if(array_contains(rtc.tags, "contenttranslation-v2"), 2, 1) as cx_version,
        -- 6 months ≈ 26 weeks = 252 days
        if(
            coalesce(datediff(event_timestamp, ssac.dt) > 252, true),
            "experienced",
            "new"
        ) as user_experience,
        if(revision_is_deleted, "deleted", "surving") as status
    from event.mediawiki_revision_tags_change rtc -- First full month of data is 2018-09
    inner join wmf.mediawiki_history mh
    on
        mh.snapshot = "{mwh_snapshot}" and
        mh.event_timestamp >= "2018-09" and
        rtc.rev_id = mh.revision_id and
        `rtc`.`database` = mh.wiki_db
    left join event_sanitized.serversideaccountcreation ssac
    on
        ssac.event.username = rtc.performer.user_text and
        ssac.year >= 0
    where
        event_entity = "revision" and
        event_type = "create" and
        array_contains(rtc.tags, "contenttranslation") and
        ((rtc.year = 2018 and rtc.month >= 9) or rtc.year = 2019)
) translations
group by 
    month, 
    cx_version,
    user_experience,
    status
""".format(
    mwh_snapshot=MWH_SNAPSHOT
))

(translation_deletion_cx1
.drop(["cx_version"], axis=1)
.assign(month=lambda df: pd.to_datetime(df["month"]))
.set_index(["month", "user_experience", "status"])
.unstack()
.reset_index("user_experience")
)

translation_deletion_r

translation_deletion_cx1 = translation_deletion_r.query("cx_version == 1")
translation_deletion_cx2 = translation_deletion_r.query("cx_version == 2")

dfs = [translation_deletion_cx1, translation_deletion_cx2]

for df in dfs:
    (df
    .drop(["cx_version"], axis=1)
    .assign(month=lambda df: pd.to_datetime(df["month"]))
    .set_index(["month", "user_experience", "status"])
    .unstack()
    .plot.bar(
        stacked=True
    ))

Translators¶

translators = mariadb.run("""
select
    month,
    cx_version,
    user_experience,
    count(distinct user_name) as users_starting_translations
from (
    select
        date_format(str_to_date(translation_start_timestamp, "%Y%m%d%H%i%S"), "%Y-%m") as month,
        translation_cx_version as cx_version,
        if(
            datediff(
                str_to_date(translation_start_timestamp, "%Y%m%d%H%i%S"),
                -- For old accounts, this is when the user account was centralized, but since we're looking
                -- post-2017 only, the distiction doesn't matter
                str_to_date(gu_registration, "%Y%m%d%H%i%S")
            ) > 252,
            "experienced",
            "new"
        ) as user_experience,
        gu_name as user_name
    from
        wikishared.cx_translations
    left join
        centralauth.globaluser
    on gu_id = translation_started_by
    where 
        translation_start_timestamp > "2018"
) translators
group by
    month,
    cx_version,
    user_experience
""")

translators.head()

translators_cx1 = translators.query("cx_version == 1")
translators_cx2 = translators.query("cx_version == 2")

dfs = [translators_cx1, translators_cx2]

for df in dfs:
    (
        df
        .drop(["cx_version"], axis=1)
        .assign(month=lambda df: pd.to_datetime(df["month"]))
        .set_index(["month", "user_experience"])
        .unstack()
        .plot.bar()
    )

Publication rate¶

The "token" in CX event logging is the user name... 😛

mariadb.run("""
select
    date_format(str_to_date(translation_start_timestamp, "%Y%m%d%H%i%S"), "%Y-%m") as month,

wikishared.cx_translation


""")

Have to query the cx_translation table, get the output, and upload it to Hive so I can combine it with publishing failure data from event_logging.

hive.run("drop table neilpquinn.cx_translators")

path = "data/cx_translators.csv"
translators.to_csv(path)
hive.load_csv(
    path,
    field_spec="start_month string, user_name string, cx_version int",
    db_name = "neilpquinn",
    table_name="cx_translators"
)

translators_stats = """


"""

	month	cx_version	event_user_text	event_timestamp	dt	user_experience	status
0	2018-10	1	(।जय महलवाल।)	2018-10-31 13:59:49.0	2018-10-01T11:53:37Z	new	surving
1	2018-10	1	(।जय महलवाल।)	2018-10-18 07:25:28.0	2018-10-01T11:53:37Z	new	surving
2	2018-10	1	(।जय महलवाल।)	2018-10-08 13:51:03.0	2018-10-01T11:53:37Z	new	surving
3	2018-10	1	(।जय महलवाल।)	2018-10-06 13:37:00.0	2018-10-01T11:53:37Z	new	surving
4	2019-01	2	00matt00	2019-01-29 00:54:26.0	None	experienced	surving
5	2018-11	1	00matt00	2018-11-07 17:34:50.0	None	experienced	surving
6	2018-11	1	00matt00	2018-11-05 18:50:38.0	None	experienced	surving
7	2018-12	1	103ful6	2018-12-09 05:13:46.0	None	experienced	surving
8	2019-01	2	103ful6	2019-01-31 16:01:14.0	None	experienced	surving
9	2019-01	2	103ful6	2019-01-26 10:32:05.0	None	experienced	surving
10	2018-10	1	13568zyu	2018-10-30 03:18:06.0	None	experienced	surving
11	2018-10	1	1618033golden	2018-10-01 00:42:50.0	None	experienced	surving
12	2018-10	2	1618033golden	2018-10-02 00:17:48.0	None	experienced	surving
13	2019-01	2	1Mmarek	2019-01-10 11:20:49.0	None	experienced	surving
14	2018-12	2	2001.demchuk	2018-12-02 23:41:08.0	2018-12-02T20:16:48Z	new	surving
15	2018-12	2	2001.demchuk	2018-12-03 19:33:06.0	2018-12-02T20:16:48Z	new	surving
16	2018-12	2	2001.demchuk	2018-12-03 19:26:14.0	2018-12-02T20:16:48Z	new	surving
17	2018-12	2	2001.demchuk	2018-12-03 18:32:47.0	2018-12-02T20:16:48Z	new	surving
18	2018-12	2	2001.demchuk	2018-12-03 17:51:03.0	2018-12-02T20:16:48Z	new	surving
19	2018-10	1	2Оленка	2018-10-20 12:13:33.0	2018-10-20T10:26:39Z	new	surving
20	2018-11	1	3eaccount	2018-11-03 15:15:25.0	2018-11-03T14:24:39Z	new	surving
21	2018-10	1	420peace	2018-10-12 11:31:13.0	None	experienced	deleted
22	2019-01	2	420peace	2019-01-02 13:13:11.0	None	experienced	surving
23	2018-10	1	4th protocol	2018-10-18 15:51:48.0	None	experienced	surving
24	2019-01	2	4th protocol	2019-01-23 15:04:06.0	None	experienced	surving
25	2019-01	1	4th protocol	2019-01-09 14:48:19.0	None	experienced	surving
26	2018-09	1	4th protocol	2018-09-30 15:06:16.0	None	experienced	surving
27	2018-11	1	4th protocol	2018-11-09 14:00:29.0	None	experienced	surving
28	2019-01	1	4th protocol	2019-01-03 13:50:21.0	None	experienced	surving
29	2018-11	1	4th protocol	2018-11-09 15:50:26.0	None	experienced	surving
...	...	...	...	...	...	...	...
62047	2018-12	2	거북이	2018-12-29 15:51:05.0	None	experienced	surving
62048	2019-01	2	거북이	2019-01-16 13:19:22.0	None	experienced	surving
62049	2018-10	1	거북이	2018-10-27 17:34:59.0	None	experienced	surving
62050	2019-01	2	거북이	2019-01-25 04:39:24.0	None	experienced	surving
62051	2019-01	2	거북이	2019-01-20 12:14:40.0	None	experienced	surving
62052	2018-10	1	거북이	2018-10-28 14:33:30.0	None	experienced	surving
62053	2018-09	1	거북이	2018-09-10 16:28:12.0	None	experienced	surving
62054	2019-01	2	거북이	2019-01-25 01:53:45.0	None	experienced	surving
62055	2018-12	2	거북이	2018-12-29 14:41:18.0	None	experienced	surving
62056	2018-12	1	부작용	2018-12-18 21:33:40.0	2018-07-29T07:03:30Z	new	surving
62057	2018-12	1	부작용	2018-12-15 16:08:39.0	2018-07-29T07:03:30Z	new	surving
62058	2018-11	1	이천양정여고 박지희	2018-11-21 10:44:12.0	2018-11-11T16:08:40Z	new	surving
62059	2018-11	1	이천양정여고 박지희	2018-11-18 16:33:41.0	2018-11-11T16:08:40Z	new	surving
62060	2018-11	1	GustavHerbert	2018-11-10 15:56:13.0	2018-05-07T01:51:45	new	surving
62061	2018-11	1	GustavHerbert	2018-11-10 14:27:18.0	2018-05-07T01:51:45	new	surving
62062	2018-11	1	GustavHerbert	2018-11-11 05:31:56.0	2018-05-07T01:51:45	new	surving
62063	2018-11	1	GustavHerbert	2018-11-11 00:29:54.0	2018-05-07T01:51:45	new	surving
62064	2018-11	1	GustavHerbert	2018-11-10 14:04:45.0	2018-05-07T01:51:45	new	surving
62065	2018-11	1	GustavHerbert	2018-11-10 13:30:51.0	2018-05-07T01:51:45	new	surving
62066	2018-11	1	GustavHerbert	2018-11-11 00:30:55.0	2018-05-07T01:51:45	new	surving
62067	2018-11	1	GustavHerbert	2018-11-10 13:35:40.0	2018-05-07T01:51:45	new	surving
62068	2018-11	1	GustavHerbert	2018-11-10 14:31:37.0	2018-05-07T01:51:45	new	surving
62069	2018-11	1	GustavHerbert	2018-11-11 00:54:42.0	2018-05-07T01:51:45	new	surving
62070	2018-11	1	GustavHerbert	2018-11-10 14:07:38.0	2018-05-07T01:51:45	new	surving
62071	2018-11	1	GustavHerbert	2018-11-11 00:47:00.0	2018-05-07T01:51:45	new	surving
62072	2018-11	1	GustavHerbert	2018-11-11 05:29:12.0	2018-05-07T01:51:45	new	surving
62073	2019-01	2	책읽는달팽	2019-01-09 11:43:30.0	None	experienced	surving
62074	2018-12	1	탕탕탕탕탕수육	2018-12-19 12:14:37.0	2018-12-19T11:43:52Z	new	surving
62075	2018-09	1	헤진이	2018-09-08 08:24:09.0	2018-03-27T06:58:05	new	surving
62076	2018-09	1	헤진이	2018-09-04 11:36:32.0	2018-03-27T06:58:05	new	surving

	translations		translators
user_experience	experienced	new	experienced	new
month
2018-09	9648	2183	1367	666
2018-10	10053	2849	1435	827
2018-11	8630	3243	1494	1014
2018-12	7569	2910	1364	867
2019-01	10840	3460	1660	991
2019-12	524	168	196	74

	user_experience	translations
status		deleted	surving
month
2018-09-01	experienced	8	93
2018-09-01	new	408	1751
2018-10-01	experienced	51	172
2018-10-01	new	619	2075
2018-11-01	experienced	84	339
2018-11-01	new	566	2322
2018-12-01	experienced	79	194
2018-12-01	new	371	1626
2019-01-01	experienced	12	212
2019-01-01	new	192	598
2019-12-01	experienced	2	7
2019-12-01	new	13	35

	month	cx_version	user_experience	users_starting_translations
0	2018-01	1	experienced	2261
1	2018-01	1	new	1275
2	2018-01	2	experienced	3
3	2018-01	2	new	1
4	2018-02	1	experienced	2017