from IPython.display import HTML as display_html
import numpy as np
import pandas as pd
from wmfdata import hive, utils
from wmfdata.utils import num_str

We have clean, complete data in EditAttemptStep starting on 8 February 2019 and we started oversampling mobile visual editor events 2019-03-06T00:23:49Z (https://phabricator.wikimedia.org/T212253).

display_html("""
<script>  
    code_show=true; 
    function code_toggle() {
        if (code_show){
            $('div.input').hide();
        } else {
            $('div.input').show();
        }
        code_show = !code_show
    }  
    $( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()">
    <input type="submit" value="Click here to toggle on/off the raw code.">
</form>
""")

Attempts and completion by interface and experience¶

attempts = hive.run("""
-- Transform the steps into attempts
with steps as (
    select
        event.editing_session_id as attempt_id,
        unix_timestamp(
            dt,
            "yyyy-MM-dd'T'hh:mm:ssX"
        ) as ts,
        wiki,
        database_group,
        event.platform as platform,
        event.editor_interface as interface,
        event.action as action,
        event.is_oversample as is_oversample,
        event.user_editcount as user_edit_count,
        coalesce(event.user_class = "IP", False) as user_is_unregistered
    from event_sanitized.editattemptstep
    left join canonical_data.wikis
    on wiki = database_code
    where
        -- Remove Flow and other non-standard edits
        event.integration = "page" and
        -- Remove events not needed to calculate session starts or completions
        event.action in ("ready", "saveSuccess") and
        -- One month of data
        year = 2019 and (
            (month = 5 and day >= 17) or
            (month = 6 and day < 16)
        )
    -- Ensure that steps within each session are in chronological order without having to do a global order by
    distribute by attempt_id
    sort by ts asc
)
-- Group those steps into attempts
select
    attempt_id,
    from_unixtime(min(ts), "yyyy-MM-dd'T'hh:mm:ssX") as dt,
    min(wiki) as wiki,
    case
        when min(database_group) != "wikipedia" then Null
        when min(wiki) in ("enwiki", "nlwiki", "eswiki") then False
        else True
    end as ve_default_wiki,
    min(platform) as platform,
    -- Select the last editor in the session as the "real" one
    collect_list(interface)[
        size(collect_list(interface)) - 1
    ] as editor,
    sum(cast(action = "saveSuccess" as int)) >= 1 as is_completed,
    max(is_oversample) as is_oversample,
    case
        when max(user_is_unregistered) or max(user_edit_count) is null then "unregistered"
        when max(user_edit_count) = 0 then "0"
        when max(user_edit_count) < 5 then "1-4"
        when max(user_edit_count) < 100 then "5-99"
        when max(user_edit_count) < 1000 then "100-999"
        when max(user_edit_count) < 10000 then "1000-9999"
        else "10000+"
    end as user_experience
from steps        
group by attempt_id
""").assign(
    dt=lambda df: pd.to_datetime(df["dt"]),
    wiki=lambda df: pd.Categorical(df["wiki"]),
    platform=lambda df: pd.Categorical(df["platform"]),
    editor=lambda df: pd.Categorical(df["editor"]),
    user_experience=lambda df: pd.Categorical(
        df["user_experience"],
        categories=["unregistered", "0", "1-4", "5-99", "100-999", "1000-9999", "10000+"], ordered=True
    )
)

Average daily edit attempts¶

(
    attempts
    .query("~is_oversample")
    .groupby(["platform", "editor", "user_experience"])
    .apply(len)
    .unstack().T
    .multiply((16/30)) # Correct for sampling, reduce to daily average
    .style
    .background_gradient(axis=None)
    .format(lambda x: num_str(int(x)))
)

Completion rates¶

completion_rates = (
    attempts
    .groupby(["platform", "editor", "user_experience"])
    .apply(lambda grp: grp["is_completed"].sum() / len(grp))
    .unstack()
    .T
)

(
    completion_rates
    .style
    .background_gradient(axis=None)
    .format("{:.0%}")
)

Desktop completion rate by interface, experience, and VE default status¶

(
    attempts
    .query("platform == 'desktop' & editor == 'visualeditor' & ~ve_default_wiki.isnull()")
    .groupby(["ve_default_wiki", "user_experience"])
    .apply(lambda grp: grp["is_completed"].sum() / len(grp))
    .unstack()
    .transpose()
    .join(completion_rates[("desktop", "wikitext")].rename("wikitext"))
    .rename(columns={
        False: "visualeditor (non-default wiki)", 
        True: "visualeditor (default wiki)"
    })
    .rename_axis(None, axis=1)
    .style
    .background_gradient(axis=None)
    .format("{:.0%}")
)

User agent breakdown¶

Has to be computed separately, because user agent info isn't included in the event_santitized database.

hive.run("""
select
    useragent.browser_family,
    useragent.device_family,
    useragent.os_family
from event.editattemptstep
where year = 2019 and month = 6 and day = 16 and hour = 1
limit 50
""")

platform	desktop			phone
editor	visualeditor	wikitext	wikitext-2017	visualeditor	wikitext
user_experience
unregistered	170,000	390,000	9	7,900	1,100,000
0	4,000	8,400	56	960	4,300
1-4	3,600	8,100	94	1,300	3,500
5-99	7,700	24,000	480	1,700	6,600
100-999	4,700	28,000	1,100	760	6,600
1000-9999	4,100	59,000	2,800	320	6,900
10000+	2,700	160,000	3,100	44	3,600

platform	desktop			phone
editor	visualeditor	wikitext	wikitext-2017	visualeditor	wikitext
user_experience
unregistered	4%	10%	0%	22%	2%
0	27%	20%	10%	27%	22%
1-4	43%	38%	19%	32%	31%
5-99	54%	53%	41%	42%	40%
100-999	64%	66%	53%	56%	50%
1000-9999	69%	74%	66%	62%	57%
10000+	75%	82%	75%	58%	63%

	visualeditor (non-default wiki)	visualeditor (default wiki)	wikitext
user_experience
unregistered	8%	4%	10%
0	31%	26%	20%
1-4	47%	41%	38%
5-99	54%	54%	53%
100-999	62%	65%	66%
1000-9999	67%	69%	74%
10000+	77%	75%	82%