observations

  • wikidatawiki: the most revisions (4x enwiki), but very few missing events (0.003% vs the 1.3% on average for all others)
  • some wikis miss more events. e.g. ukwikisource misses 34% of ~60k revisions for Feb21
  • minor edits are more likely to have missing events: 2.3% vs 0.4%
  • revision with rev_parent_id==0 are more likely to be missing (3% vs 1%). Also, events are missing for revisions with (minor_revision==true and parent_id!=0) and (minor_revision==false and parent_id==0), not for other combinations - why this strange xor behavior?
  • archived revisions are more likely to be missing (5.9% vs 1.2%), but <2% of revisions are archived
  • there are spikes in missing events when looking at hourly timeseries (for Feb21), especially for the fields (minor edit, archived, no parent id) seemingly correlated with the occurence of missing events. @joseph in a previous comment also observed bulks of missing revisions.
  • nothing stands out looking at missing events as a function of 'hour of day' / 'minute of hour'
In [85]:
import datetime
import functools
from IPython.core.display import display, HTML, Markdown
import matplotlib.pyplot as plt
In [3]:
def displayHtml(self, limit=20):
    display(HTML(self.limit(limit).toPandas().to_html(escape=False,index=False)))

DataFrame.displayHtml = displayHtml
In [57]:
filter_year = lambda df, v: df.where(F.col('year')==v)
filter_month = lambda df, v: df.where(F.col('month')==v)
filter_day = lambda df, v: df.where(F.col('day')==v)
filter_hour = lambda df, v: df.where(F.col('hour')==v)

def filter_date_pattern(df, date_pattern):    
    date = [int(v) for v in date_pattern.split('/')]
    df = filter_year(df, date[0])
    if len(date) >= 2:
        df = filter_month(df, date[1])
    if len(date) >= 3:
        df = filter_day(df, date[2])
    return df
In [5]:
event_revisions = spark.sql(f"""select * from event.mediawiki_revision_create""")

snapshot='2021-02'
mediawiki_revisions = spark.sql(f"""
    select 
        rev_id,
        rev_page,
        rev_text_id,
        rev_comment,
        rev_user,
        rev_user_text,
        rev_timestamp,
        rev_minor_edit,
        rev_deleted,
        rev_len,
        rev_parent_id,
        rev_sha1,
        rev_content_model,
        rev_content_format,
        rev_actor,
        rev_comment_id,
        wiki_db,
        false as archived
    from wmf_raw.mediawiki_revision
    where snapshot = '{snapshot}'

    union all

    select 
        ar_rev_id as rev_id,
        ar_page_id as rev_page,
        ar_text_id as rev_text_id,
        ar_comment as rev_comment,
        ar_user as rev_user,
        ar_user_text as rev_user_text,
        ar_timestamp as rev_timestamp,
        ar_minor_edit as rev_minor_edit,
        ar_deleted as rev_deleted,
        ar_len as rev_len,
        ar_parent_id as rev_parent_id,
        ar_sha1 as rev_sha1,
        ar_content_model as rev_content_model,
        ar_content_format as rev_content_format,
        ar_actor as rev_actor,
        ar_comment_id as rev_comment_id,        
        wiki_db,
        true as archived
    from wmf_raw.mediawiki_archive
    where snapshot = '{snapshot}'""")

# unused fields in archived table
        # ar_namespace,
        # ar_title,
        # ar_text,
        # ar_flags,
In [58]:
pattern = '2021/02'

events = filter_date_pattern(event_revisions, pattern)
revisions = mediawiki_revisions.where(F.col('rev_timestamp').like(f"{pattern.replace('/','')}%"))
    
#%%
# filter for mediawiki revisions that are not present in the kafka revision event streams 
joined_mediakiki_revisions = (revisions
    .join(events, on=['rev_id'], how='left')
    .withColumn('missing_event', F.col('event.mediawiki_revision_create.rev_timestamp').isNull())
    .withColumn('timestamp', F.to_timestamp('wmf_raw.mediawiki_revision.rev_timestamp', 'yyyyMMddHHmmss'))
).cache()

missing_mediakiki_revisions = joined_mediakiki_revisions.where(F.col('missing_event'))
In [66]:
def count_missing(df):
    return (df
        .agg(
            F.sum(F.col('missing_event').cast('int')).alias('missing'),
            F.count('*').alias('total')
        )
        .withColumn('missing %', F.col('missing') / F.col('total') * 100))

display(Markdown('## Missing kafka events present in mediawiki revisions'))
display(Markdown('### All projects'))
count_missing(joined_mediakiki_revisions).displayHtml()   

display(Markdown('#### Without wikidata'))
count_missing(joined_mediakiki_revisions.where(F.col('wiki_db')!='wikidatawiki')).displayHtml()   

display(Markdown('### Project '))
by_project = count_missing(joined_mediakiki_revisions.groupBy(['wiki_db']))

display(Markdown('#### Top by missing events'))
(by_project
    .orderBy('missing',ascending=False)
    .displayHtml(10))

display(Markdown('#### Top by total revisions'))
(by_project
    .orderBy('total',ascending=False)
    .displayHtml(10))

display(Markdown('#### Top by percentage of events missing'))
(by_project
    .where(F.col('total')>1000)
    .orderBy('missing %',ascending=False)
    .displayHtml(10))

display(Markdown('### Minor edits '))

(count_missing(joined_mediakiki_revisions.where(F.col('wiki_db')!='wikidatawiki').groupBy(['wmf_raw.mediawiki_revision.rev_minor_edit']))
    .orderBy('missing %',ascending=False)
    .displayHtml())


display(Markdown('#### No parent revision '))

parent_0_col = (F.col('wmf_raw.mediawiki_revision.rev_parent_id')==0).alias('rev_parent_is_0')

(count_missing(joined_mediakiki_revisions.where(F.col('wiki_db')!='wikidatawiki').groupBy([parent_0_col]))
    .orderBy('missing %',ascending=False)
    .displayHtml())

(count_missing(joined_mediakiki_revisions.where(F.col('wiki_db')!='wikidatawiki').groupBy(['wmf_raw.mediawiki_revision.rev_minor_edit', parent_0_col]))
    .orderBy('missing %',ascending=False)
    .displayHtml())


display(Markdown('#### Archived'))

(count_missing(joined_mediakiki_revisions.where(F.col('wiki_db')!='wikidatawiki').groupBy(['archived']))
    .orderBy('missing %',ascending=False)
    .displayHtml())

(count_missing(joined_mediakiki_revisions.where(F.col('wiki_db')!='wikidatawiki').groupBy(['wmf_raw.mediawiki_revision.rev_minor_edit', 'archived']))
    .orderBy('missing %',ascending=False)
    .displayHtml())


(count_missing(joined_mediakiki_revisions.where(F.col('wiki_db')!='wikidatawiki').groupBy(['archived',parent_0_col]))
    .orderBy('missing %',ascending=False)
    .displayHtml())

Missing kafka events present in mediawiki revisions

All projects

missing total missing %
359345 48601016 0.739378

Without wikidata

missing total missing %
358748 28188183 1.272689

Project

Top by missing events

wiki_db missing total missing %
commonswiki 102832 7931186 1.296553
enwiki 66744 5431087 1.228925
ukwikisource 21338 60973 34.995818
frwiki 15643 980622 1.595212
dewiki 14164 946148 1.497017
zhwiki 10028 685623 1.462611
ruwiki 7926 588549 1.346702
eswiki 7860 755418 1.040484
itwiki 7065 603621 1.170436
fawiki 6995 272452 2.567425

Top by total revisions

wiki_db missing total missing %
wikidatawiki 597 20412833 0.002925
commonswiki 102832 7931186 1.296553
enwiki 66744 5431087 1.228925
frwiki 15643 980622 1.595212
dewiki 14164 946148 1.497017
eswiki 7860 755418 1.040484
zhwiki 10028 685623 1.462611
itwiki 7065 603621 1.170436
ruwiki 7926 588549 1.346702
arzwiki 997 540574 0.184434

Top by percentage of events missing

wiki_db missing total missing %
ukwikisource 21338 60973 34.995818
dewikisource 4334 13964 31.036952
suwiki 1530 5195 29.451396
eowikinews 304 1180 25.762712
tawikisource 1614 6506 24.807870
ptwikisource 430 1931 22.268255
scowiki 542 3264 16.605392
itwikibooks 2171 13414 16.184583
tgwiki 520 4635 11.218986
trwikisource 874 8295 10.536468

Minor edits

rev_minor_edit missing total missing %
True 189043 8049535 2.348496
False 169705 20138648 0.842683

No parent revision

rev_parent_is_0 missing total missing %
True 105830 3584729 2.952245
False 252918 24603454 1.027978
rev_minor_edit rev_parent_is_0 missing total missing %
False True 105827 3455962 3.062158
True False 189040 7920768 2.386637
False False 63878 16682686 0.382900
True True 3 128767 0.002330

Archived

archived missing total missing %
True 25580 432607 5.912988
False 333168 27755576 1.200364
rev_minor_edit archived missing total missing %
False True 22365 369885 6.046474
True True 3215 62722 5.125793
True False 185828 7986813 2.326685
False False 147340 19768763 0.745317
archived rev_parent_is_0 missing total missing %
True True 20796 132879 15.650328
False True 85034 3451850 2.463433
True False 4784 299728 1.596114
False False 248134 24303726 1.020971
In [102]:
top_wikis = ['commonswiki','enwiki','frwiki','dewiki','eswiki','zhwiki','itwiki','ruwiki','arzwiki']

groupby_cols = (joined_mediakiki_revisions
    .select([
        'missing_event', 
        'wiki_db',
        'wmf_raw.mediawiki_revision.rev_minor_edit', 
        'archived', 
        'wmf_raw.mediawiki_revision.rev_page',
        (F.unix_timestamp('timestamp')/60/60).cast('int').alias('hour_bucket'),
        F.hour('timestamp').alias('hour'), 
        F.minute('timestamp').alias('minute'),
        (F.col('wmf_raw.mediawiki_revision.rev_parent_id')==0).alias('rev_parent_is_0')])
    .where(F.col('wiki_db')!='wikidatawiki')
    .where(F.col('wiki_db').isin(top_wikis)))

def named_col(col):
    name_udf = F.udf(lambda xx: f'{col} = {xx}', 'string')
    return name_udf(col)

def ts_plot(xbucket, pivot_on):
    to_plot = (groupby_cols
        .withColumn(pivot_on, named_col(pivot_on))
        .groupby(xbucket)
        .pivot(pivot_on)
        .agg(
            F.sum(F.col('missing_event').cast('int')) / F.count('*') * 100
        )
        .orderBy(xbucket))

    to_plot.toPandas().plot(title='% missing kafka events',x=xbucket)
In [104]:
display(Markdown('## Timeseries hourly buckets'))
plt.rcParams["figure.figsize"] = (20,15)
ts_plot('hour_bucket', 'rev_minor_edit')
ts_plot('hour_bucket', 'archived')
ts_plot('hour_bucket', 'rev_parent_is_0')
ts_plot('hour_bucket', 'wiki_db')
plt.rcParams["figure.figsize"] = (6,4)

Timeseries hourly buckets

In [105]:
display(Markdown('### hour of day, minute of hour)'))
ts_plot('hour', 'rev_minor_edit')
ts_plot('minute', 'rev_minor_edit')
ts_plot('hour', 'archived')
ts_plot('minute', 'archived')
ts_plot('hour', 'rev_parent_is_0')
ts_plot('minute', 'rev_parent_is_0')

hour of day, minute of hour)

In [ ]: