In [338]:
# Imports

import re
import pandas as pd
from urllib import parse
import numpy as np
In [8]:
# These are the old log files

!ls 20191020T20/old/*.log
20191020T20/old/landingpages.tsv.20191020-190002.log
20191020T20/old/landingpages.tsv.20191020-191501.log
20191020T20/old/landingpages.tsv.20191020-193001.log
20191020T20/old/landingpages.tsv.20191020-194502.log
20191020T20/old/landingpages.tsv.20191020-200001.log
20191020T20/old/landingpages.tsv.20191020-201501.log
20191020T20/old/landingpages.tsv.20191020-203001.log
20191020T20/old/landingpages.tsv.20191020-204501.log
20191020T20/old/landingpages.tsv.20191020-210001.log
20191020T20/old/landingpages.tsv.20191020-211501.log
20191020T20/old/landingpages.tsv.20191020-213001.log
20191020T20/old/landingpages.tsv.20191020-214501.log
In [137]:
# Create a single tsv file with only entries for 20191020T20
# Note: For some reason this doesn't work from Jupyter. Ran directly on the command line.

!cat 20191020T20/old/*.log | awk '$3 ~/^2019-10-20T20/' > 20191020T20/old/20191020T20.tsv
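In [ ]:
# In case the shell pipeline above keeps failing from Jupyter: a rough pure-Python
# equivalent of the awk filter (untested sketch), keeping lines whose third tab-separated
# field starts with 2019-10-20T20. Paths are the same as in the cell above.
import glob

with open( '20191020T20/old/20191020T20.tsv', 'w' ) as out:
    for path in sorted( glob.glob( '20191020T20/old/*.log' ) ):
        with open( path ) as f:
            for line in f:
                fields = line.split( '\t' )
                if len( fields ) > 2 and fields[ 2 ].startswith( '2019-10-20T20' ):
                    out.write( line )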
In [379]:
# Load the tsv file of old logs as a Pandas data frame
# Columns of the log files come from this template:
#     %{hostname} %{sequence} %{dt} %{time_firstbyte} %{ip} %{cache_status}/%{http_status} %{response_size}
#     %{http_method} http://%{uri_host}%{uri_path}%{uri_query} - %{content_type} %{referer} %{x_forwarded_for}
#     %{user_agent} %{accept_language} %{x_analytics}

col_names = [
    'hostname',
    'sequence',
    'dt', 
    'time_firstbyte',
    'ip',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'url',
    '(blank)',
    'content_type',
    'referer',
    'x_forwarded_for',
    'user_agent',
    'accept_language',
    'x_analytics'
]

old = pd.read_csv( '20191020T20/old/20191020T20.tsv', names = col_names, header = None, sep='\t' )
In [380]:
# Get rid of columns we don't care about, and add 'source', 'nocookies' and 'id' columns
old.drop( [ 'hostname', 'sequence', 'time_firstbyte', '(blank)', ], inplace = True, axis = 1 )
old[ 'source' ] = 'old'
old[ 'nocookies' ] = old[ 'x_analytics' ].apply( lambda x: 'nocookies=1' in x )
old.insert( 0, 'id', 'old' + pd.Series( range( 0, len( old) ) ).astype( str) )

# convert dt to datetime
old[ 'dt' ] = pd.to_datetime( old[ 'dt' ] )
In [381]:
# Parse URLs to create detailed fields for the old log entries

# If the wiki page title is in the URL in the pretty format, parse out everything after '/wiki/'. This will
# go in the landingpage column, as will full URL paths for non-wiki pages. The legacy scripts only ingest wiki
# pages in the ugly URL format, so this lets us easily filter out everything that's not that.
lp_in_path_re = re.compile( '(?<=/wiki/).*$' )

# As per ext.fundraiserLandingPage.LogPageview.js
query_el_map = {
    "language": [ "uselang", "language" ],
    "country": [ "country" ],
    "utm_source": [ "utm_source" ],
    "utm_campaign": [ "utm_campaign" ],
    "utm_medium": [ "utm_medium" ],
    "utm_key": [ "utm_key" ],
    "contact_id": [ "contact_id" ],
    "link_id": [ "link_id" ],
    "template": [ "template" ],
    "appeal": [ "appeal" ],
    "appeal_template": [ "appeal_template", "appeal-template" ],
    "form_template": [ "form_template", "form-template" ],
    "form_countryspecific": [ "form_countryspecific", "form-countryspecific" ]
}

def fields_from_url( url ):
    url_str = str( url )
    parsed_url = parse.urlsplit( url_str )
    query_dict = parse.parse_qs( parsed_url.query )

    lp_in_path = lp_in_path_re.search( parsed_url.path )
    if lp_in_path:
        lp = lp_in_path.group()
        url_type = 'pretty_url'
    elif 'title' in query_dict:
        lp = query_dict[ 'title' ][ 0 ]
        url_type = 'ugly_url'
    else:
        lp = parsed_url.path
        url_type = 'non_wikipage'

    event_data = {
        "landingpage": lp,
        "url_type": url_type,
        "has_action": 'action' in query_dict
    }

    # same logic as ext.fundraiserLandingPage.LogPageview.js to generate event data from the URL
    for el_key, url_params in query_el_map.items():
        for url_param in url_params:
            if url_param in query_dict:
                event_data[ el_key ] = query_dict[ url_param ][ 0 ]

    return pd.Series( event_data )

old = old.join( old[ 'url' ].apply( fields_from_url ) )
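In [ ]:
# Quick sanity check of fields_from_url (the URL below is made up, not taken from the logs):
# an ugly-format URL should come back as url_type 'ugly_url', with the title in landingpage
# and uselang mapped to the language field.
fields_from_url( 'https://donate.wikimedia.org/w/index.php?title=Special:LandingPage&uselang=en&country=US&utm_source=test' )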
In [22]:
# These are the new log files

!ls 20191020T20/new/*.log
20191020T20/new/landingpage-impressions.20191020-190002.json.log
20191020T20/new/landingpage-impressions.20191020-191501.json.log
20191020T20/new/landingpage-impressions.20191020-193001.json.log
20191020T20/new/landingpage-impressions.20191020-194502.json.log
20191020T20/new/landingpage-impressions.20191020-200001.json.log
20191020T20/new/landingpage-impressions.20191020-201501.json.log
20191020T20/new/landingpage-impressions.20191020-203001.json.log
20191020T20/new/landingpage-impressions.20191020-204501.json.log
20191020T20/new/landingpage-impressions.20191020-210001.json.log
20191020T20/new/landingpage-impressions.20191020-211501.json.log
20191020T20/new/landingpage-impressions.20191020-213001.json.log
20191020T20/new/landingpage-impressions.20191020-214502.json.log
In [29]:
# Create a single csv file with only entries for 20191020T20 and with
# the columns we want

!cat 20191020T20/new/*.log | jq -r 'select( .dt | test("2019-10-20T20") ) | [ .dt, .ip, .event.landingpage, .event.language // null, .event.country // null, .event.utm_source // null, .event.utm_campaign // null, .event.utm_medium // null, .event.utm_key // null, .event.contact_id // null, .event.link_id // null, .event.template // null, .event.appeal // null, .event.appeal_template // null, .event.form_template // null, .event.form_countryspecific  // null, .userAgent.browser_family, .userAgent.browser_major, .userAgent.browser_minor, .userAgent.device_family, .userAgent.is_bot, .userAgent.is_mediawiki, .userAgent.os_family, .userAgent.os_major, .userAgent.os_minor] | @csv' > 20191020T20/new/20191020T20.csv
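In [ ]:
# Rough Python equivalent of the jq pipeline above (untested sketch), for environments
# without jq. It assumes one JSON object per line with the structure the jq filter expects
# ('dt', 'ip', 'event' and 'userAgent' keys); missing event fields become blanks.
import csv, glob, json

event_fields = [
    'landingpage', 'language', 'country', 'utm_source', 'utm_campaign', 'utm_medium',
    'utm_key', 'contact_id', 'link_id', 'template', 'appeal', 'appeal_template',
    'form_template', 'form_countryspecific'
]
ua_fields = [
    'browser_family', 'browser_major', 'browser_minor', 'device_family', 'is_bot',
    'is_mediawiki', 'os_family', 'os_major', 'os_minor'
]

with open( '20191020T20/new/20191020T20.csv', 'w', newline = '' ) as out:
    writer = csv.writer( out )
    for path in sorted( glob.glob( '20191020T20/new/*.log' ) ):
        with open( path ) as f:
            for line in f:
                entry = json.loads( line )
                if '2019-10-20T20' not in entry[ 'dt' ]:
                    continue
                row = [ entry[ 'dt' ], entry[ 'ip' ] ]
                row += [ entry[ 'event' ].get( k ) for k in event_fields ]
                row += [ entry[ 'userAgent' ].get( k ) for k in ua_fields ]
                writer.writerow( row )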
In [393]:
# Load the csv file of new logs as a Pandas data frame

col_names = [
    'dt',
    'ip',
    'landingpage',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_family',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_bot',
    'is_mediawiki',
    'os_family',
    'os_major',
    'os_minor'
]

new = pd.read_csv( '20191020T20/new/20191020T20.csv', names = col_names, header = None )
In [394]:
# Add 'source' and 'id' columns
new[ 'source' ] = 'new'
new.insert( 0, 'id', 'new' + pd.Series( range( 0, len( new ) ) ).astype( str ) )

# convert dt to datetime
new[ 'dt' ] = pd.to_datetime( new[ 'dt' ] )
In [402]:
# Create a pared-down version of the dataset

# Remove non-wiki pages
old_purged = old[ old.url_type.isin( [ 'ugly_url', 'pretty_url' ] ) ]

# Remove non-OK HTTP statuses from entries from old logs
old_purged = old_purged[
    ( old_purged[ 'cache_status-http_status' ].str.contains( '200' ) ) |
    ( old_purged[ 'cache_status-http_status' ].str.contains( '304' ) )
]

# Remove hits with no cookies from entries from old logs
old_purged = old_purged[ old_purged.nocookies == False ]

# Remove hits with an action URL param (often action=raw, used for background URL requests)
old_purged = old_purged[ old_purged.has_action == False ]

# Remove IP addresses that are clearly bots or hacking attempts
bad_ips = [
    # 5 IPs purged for publication
]

old_purged = old_purged[ ~( old_purged[ 'ip' ].isin( bad_ips ) ) ]

# Remove language code from the end of titles, since there seems to be some mismatch for
# titles with language codes at the end.
lp_ends_in_lang_re = re.compile( '^.*/[a-z]{2}$' )
old_purged[ 'landingpage' ] = old_purged[ 'landingpage' ].apply(
    lambda x: x[:-3] if lp_ends_in_lang_re.match( x ) else x
)
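In [ ]:
# Quick check of the language-code stripping (made-up title, not from the data):
# a title ending in '/fr' should lose the trailing language code.
example_title = 'L2011/fr'
example_title[:-3] if lp_ends_in_lang_re.match( example_title ) else example_title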
In [403]:
# purge the same bad ips and identified bots from the new logs
new_purged = new[ ~( new[ 'ip' ].isin( bad_ips ) ) ]
new_purged = new_purged[ new_purged[ 'is_bot'] == False ]

# Also remove language code from the end of titles
new_purged[ 'landingpage' ] = new_purged[ 'landingpage' ].apply(
    lambda x: x[:-3] if lp_ends_in_lang_re.match( x ) else x
)
In [404]:
# rows in new_purged
len( new_purged )
Out[404]:
1079
In [405]:
# rows in old_purged
len( old_purged )
Out[405]:
1442
In [269]:
# Create a single data frame with data from both old and new logs

both = pd.concat( [ old_purged, new_purged ], sort = False )
In [270]:
# reorder columns, putting the most interesting ones first, to make the csv easier to read in
# a spreadsheet

reordered_cols = [
    'dt',
    'ip',
    'source',
    'landingpage',
    'nocookies',
    'is_bot',
    'url',
    'content_type',
    'user_agent',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'referer',
    'x_forwarded_for',
    'accept_language',
    'x_analytics',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_family',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_family',
    'os_major',
    'os_minor'
]

both = both[ reordered_cols ]
In [271]:
# order by ip and timestamp
both.sort_values( [ 'ip', 'dt' ], inplace = True )
In [272]:
# Write data out to file for closer examination in a spreadsheet
both.to_csv( '20191020T20/both.csv' )
old_purged.to_csv( '20191020T20/old_purged.csv' )
new_purged.to_csv( '20191020T20/new_purged.csv' )
In [288]:
allowed_lp_from_script = [
    'L11',
    'L12',
    'L2011',
    'L2012',
    'WMF',
    'WP',
    'Special:LandingPage'
]

new_purged_for_script = new_purged[ new_purged.landingpage.isin( allowed_lp_from_script ) ]
old_purged_for_script = old_purged[ old_purged.landingpage.isin( allowed_lp_from_script ) ]
In [289]:
len( new_purged_for_script )
Out[289]:
487
In [290]:
len( old_purged_for_script )
Out[290]:
804
In [426]:
# merge on ip, landingpage, other logged URL parameters, and nearest timestamp
# (where the new log timestamp must be at or later than the old log one)

# first order by timestamp
old_purged.sort_values( [ 'dt' ], inplace = True )
new_purged.sort_values( [ 'dt' ], inplace = True )

# put old logs on the left to include all old rows and only new ones that match
merged_purged_old_left = pd.merge_asof(
    old_purged,
    new_purged,
    direction = 'forward',
    on = 'dt',
    by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
    suffixes = [ '_old', '_new' ]
)

# put new logs on the left to include all new rows and only old ones that match
merged_purged_new_left = pd.merge_asof(
    new_purged,
    old_purged,
    direction = 'backward',
    on = 'dt',
    by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
    suffixes = [ '_new', '_old']
)
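In [ ]:
# Illustrative check of the merge_asof semantics used above (toy data, not from the logs):
# with direction = 'forward', each left row matches the nearest right row at the same or a
# later timestamp within the same 'by' group; left rows with no later match get NaN.
toy_old = pd.DataFrame( {
    'dt': pd.to_datetime( [ '2019-10-20T20:00:00', '2019-10-20T20:10:00' ] ),
    'ip': [ 'a', 'a' ]
} )
toy_new = pd.DataFrame( {
    'dt': pd.to_datetime( [ '2019-10-20T20:00:05' ] ),
    'ip': [ 'a' ],
    'marker': [ 1 ]
} )
pd.merge_asof( toy_old, toy_new, direction = 'forward', on = 'dt', by = 'ip' )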
In [ ]:
# Give the columns a nicer order

reordered_merged_cols = [
    'dt',
    'ip',
    'id_old',
    'id_new',
    'landingpage',
    'user_agent',
    'browser_family',
    'os_family',
    'nocookies',
    'is_bot',
    'url',
    'content_type',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'referer',
    'x_forwarded_for',
    'accept_language',
    'x_analytics',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_major',
    'os_minor',
    'source_old',
    'source_new'
]

merged_purged_old_left = merged_purged_old_left[ reordered_merged_cols ]
merged_purged_new_left = merged_purged_new_left[ reordered_merged_cols ]
In [424]:
# Sort for easier analysis
merged_purged_old_left.sort_values( [ 'source_new', 'user_agent' ], inplace = True )
merged_purged_new_left.sort_values( [ 'source_old', 'browser_family', 'os_family' ], inplace = True )
In [425]:
# Write out the merged dataframes for examination in spreadsheets
merged_purged_old_left.to_csv( '20191020T20/merged_purged_old_left.csv' )
merged_purged_new_left.to_csv( '20191020T20/merged_purged_new_left.csv' )
In [427]:
# Total unmatched entries in old logs
len( merged_purged_old_left[ merged_purged_old_left[ 'source_new' ].isna() ] )
Out[427]:
515
In [428]:
# Total unmatched entries in new logs
len( merged_purged_new_left[ merged_purged_new_left[ 'source_old' ].isna() ] )
Out[428]:
147
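In [ ]:
# Rough match rates (illustrative follow-up): the share of purged rows in each log that
# found a counterpart in the other log via the asof merges above.
old_match_rate = 1 - len( merged_purged_old_left[ merged_purged_old_left[ 'source_new' ].isna() ] ) / len( merged_purged_old_left )
new_match_rate = 1 - len( merged_purged_new_left[ merged_purged_new_left[ 'source_old' ].isna() ] ) / len( merged_purged_new_left )
old_match_rate, new_match_rate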
In [ ]: