In [1]:
# Imports

import re
import pandas as pd
from urllib import parse
import numpy as np
In [2]:
# These are the old log files

!ls 20191020T20/old/*.log
20191020T20/old/landingpages.tsv.20191020-190002.log
20191020T20/old/landingpages.tsv.20191020-191501.log
20191020T20/old/landingpages.tsv.20191020-193001.log
20191020T20/old/landingpages.tsv.20191020-194502.log
20191020T20/old/landingpages.tsv.20191020-200001.log
20191020T20/old/landingpages.tsv.20191020-201501.log
20191020T20/old/landingpages.tsv.20191020-203001.log
20191020T20/old/landingpages.tsv.20191020-204501.log
20191020T20/old/landingpages.tsv.20191020-210001.log
20191020T20/old/landingpages.tsv.20191020-211501.log
20191020T20/old/landingpages.tsv.20191020-213001.log
20191020T20/old/landingpages.tsv.20191020-214501.log
In [3]:
# Create a single tsv file with only entries for 20191020T20
# Note: For some reason this doesn't work from Jupyter. Ran directly on the command line.

!cat 20191020T20/old/*.log | awk '$3 ~/^2019-10-20T20/' > 20191020T20/old/20191020T20.tsv
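In [ ]:
# A pure-Python alternative sketch for the same hourly filter, in case the shell pipeline above
# keeps misbehaving inside Jupyter. Assumes the log fields are tab-separated and that the third
# field is the timestamp, as in the log template shown in the next cell.
import glob

with open( '20191020T20/old/20191020T20.tsv', 'w' ) as out_file:
    for path in sorted( glob.glob( '20191020T20/old/*.log' ) ):
        with open( path ) as log_file:
            for line in log_file:
                fields = line.split( '\t' )
                if len( fields ) > 2 and fields[ 2 ].startswith( '2019-10-20T20' ):
                    out_file.write( line )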
In [4]:
# Load the tsv file of old logs as a Pandas data frame
# Columns of the log files come from this template:
#     %{hostname} %{sequence} %{dt} %{time_firstbyte} %{ip} %{cache_status}/%{http_status} %{response_size}
#     %{http_method} http://%{uri_host}%{uri_path}%{uri_query} - %{content_type} %{referer} %{x_forwarded_for}
#     %{user_agent} %{accept_language} %{x_analytics}

col_names = [
    'hostname',
    'sequence',
    'dt', 
    'time_firstbyte',
    'ip',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'url',
    '(blank)',
    'content_type',
    'referer',
    'x_forwarded_for',
    'user_agent',
    'accept_language',
    'x_analytics'
]

old = pd.read_csv( '20191020T20/old/20191020T20.tsv', names = col_names, header = None, sep='\t' )
In [5]:
# Get rid of columns we don't care about, and add 'source', 'nocookies' and 'id' columns
old.drop( [ 'hostname', 'sequence', 'time_firstbyte', '(blank)', ], inplace = True, axis = 1 )
old[ 'source' ] = 'old'
old[ 'nocookies' ] = old[ 'x_analytics' ].apply( lambda x: 'nocookies=1' in x )
old.insert( 0, 'id', 'old' + pd.Series( range( 0, len( old ) ) ).astype( str ) )

# convert dt to datetime
old[ 'dt' ] = pd.to_datetime( old[ 'dt' ] )
In [6]:
# Parse URL to create detailed fields for old log

# If the wiki page is in the URL in the pretty format, parse out the page title (the regex below drops the
# leading slash). This goes in the landingpage column, as do full URL paths for non-wiki pages. The legacy
# scripts only ingest wiki pages in the ugly URL format, so this lets us easily filter out everything else.
lp_in_path_re = re.compile( '(?<=/wiki/).*$' )

# As per ext.fundraiserLandingPage.LogPageview.js
query_el_map = {
    "language": [ "uselang", "language" ],
    "country": [ "country" ],
    "utm_source": [ "utm_source" ],
    "utm_campaign": [ "utm_campaign" ],
    "utm_medium": [ "utm_medium" ],
    "utm_key": [ "utm_key" ],
    "contact_id": [ "contact_id" ],
    "link_id": [ "link_id" ],
    "template": [ "template" ],
    "appeal": [ "appeal" ],
    "appeal_template": [ "appeal_template", "appeal-template" ],
    "form_template": [ "form_template", "form-template" ],
    "form_countryspecific": [ "form_countryspecific", "form-countryspecific" ]
}

def fields_from_url( url ):
    url_str = str( url )
    parsed_url = parse.urlsplit( url_str )
    query_dict = parse.parse_qs( parsed_url.query )

    lp_in_path = lp_in_path_re.search( parsed_url.path )
    if lp_in_path:
        lp = lp_in_path.group()
        url_type = 'pretty_url'
    elif 'title' in query_dict:
        lp = query_dict[ 'title' ][ 0 ]
        url_type = 'ugly_url'
    else:
        lp = parsed_url.path
        url_type = 'non_wikipage'

    event_data = {
        "landingpage": lp,
        "url_type": url_type,
        "has_action": 'action' in query_dict
    }

    # Same logic as ext.fundraiserLandingPage.LogPageview.js to generate event data from the URL
    for el_key, url_params in query_el_map.items():
        for url_param in url_params:
            if url_param in query_dict:
                event_data[ el_key ] = query_dict[ url_param ][ 0 ]

    return pd.Series( event_data )

old = old.join( old[ 'url' ].apply( fields_from_url ) )
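In [ ]:
# Quick sanity check of fields_from_url on a couple of made-up URLs (illustrative values only,
# not taken from the logs). The first should come back as url_type 'pretty_url' with
# landingpage 'Ways_to_Give'; the second as 'ugly_url' with has_action True.
fields_from_url( 'https://donate.wikimedia.org/wiki/Ways_to_Give?country=GB&uselang=en&utm_source=test' )
fields_from_url( 'https://donate.wikimedia.org/w/index.php?title=Ways_to_Give&country=GB&action=raw' )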
In [7]:
old.to_csv( '20191020T20/old.csv' )
In [8]:
# These are the new log files

!ls 20191020T20/new/*.log
20191020T20/new/landingpage-impressions.20191020-190002.json.log
20191020T20/new/landingpage-impressions.20191020-191501.json.log
20191020T20/new/landingpage-impressions.20191020-193001.json.log
20191020T20/new/landingpage-impressions.20191020-194502.json.log
20191020T20/new/landingpage-impressions.20191020-200001.json.log
20191020T20/new/landingpage-impressions.20191020-201501.json.log
20191020T20/new/landingpage-impressions.20191020-203001.json.log
20191020T20/new/landingpage-impressions.20191020-204501.json.log
20191020T20/new/landingpage-impressions.20191020-210001.json.log
20191020T20/new/landingpage-impressions.20191020-211501.json.log
20191020T20/new/landingpage-impressions.20191020-213001.json.log
20191020T20/new/landingpage-impressions.20191020-214502.json.log
In [9]:
# Create a single csv file with only entries for 20191020T20 and with
# the columns we want

!cat 20191020T20/new/*.log | jq -r 'select( .dt | test("2019-10-20T20") ) | [ .dt, .ip, .event.landingpage, .event.language // null, .event.country // null, .event.utm_source // null, .event.utm_campaign // null, .event.utm_medium // null, .event.utm_key // null, .event.contact_id // null, .event.link_id // null, .event.template // null, .event.appeal // null, .event.appeal_template // null, .event.form_template // null, .event.form_countryspecific  // null, .userAgent.browser_family, .userAgent.browser_major, .userAgent.browser_minor, .userAgent.device_family, .userAgent.is_bot, .userAgent.is_mediawiki, .userAgent.os_family, .userAgent.os_major, .userAgent.os_minor] | @csv' > 20191020T20/new/20191020T20.csv
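In [ ]:
# Rough pandas equivalent of the jq extraction above (just a sketch): read the JSON-lines logs,
# keep the 20191020T20 hour, and flatten the nested 'event' and 'userAgent' objects. Uses
# pd.json_normalize (pandas >= 1.0; older versions have it under pandas.io.json).
import glob
import json

records = []
for path in sorted( glob.glob( '20191020T20/new/*.log' ) ):
    with open( path ) as log_file:
        for line in log_file:
            record = json.loads( line )
            if record[ 'dt' ].startswith( '2019-10-20T20' ):
                records.append( record )

# columns come out as e.g. 'event.landingpage' and 'userAgent.is_bot'
new_alt = pd.json_normalize( records )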
In [10]:
# Load the csv file of new logs as a Pandas data frame

col_names = [
    'dt',
    'ip',
    'landingpage',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_family',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_bot',
    'is_mediawiki',
    'os_family',
    'os_major',
    'os_minor'
]

new = pd.read_csv( '20191020T20/new/20191020T20.csv', names = col_names, header = None )
In [11]:
# Add 'source' and 'id' columns
new[ 'source' ] = 'new'
new.insert( 0, 'id', 'new' + pd.Series( range( 0, len( new ) ) ).astype( str ) )

# convert dt to datetime
new[ 'dt' ] = pd.to_datetime( new[ 'dt' ] )
In [12]:
# Create a pared-down version of the dataset

# Remove non-wiki pages
old_purged = old[ old.url_type.isin( [ 'ugly_url', 'pretty_url' ] ) ]

# Remove non-OK HTTP statuses from entries from old logs
old_purged = old_purged[
    ( old_purged[ 'cache_status-http_status' ].str.contains( '200' ) ) |
    ( old_purged[ 'cache_status-http_status' ].str.contains( '304' ) )
]

# Remove hits with no cookies from entries from old logs
old_purged = old_purged[ old_purged.nocookies == False ]

# Remove hits with an action URL param (often action=raw, used for background URL requests)
old_purged = old_purged[ old_purged.has_action == False ]

# Remove IP addresses that are clearly bots or hacking attempts
bad_ips = [
    # Censored
]

# .copy() so the landingpage assignment below doesn't trigger SettingWithCopyWarning
old_purged = old_purged[ ~( old_purged[ 'ip' ].isin( bad_ips ) ) ].copy()

# Remove language code from the end of titles, since there seems to be some mismatch for
# titles with language codes at the end.
lp_ends_in_lang_re = re.compile( '^.*/[a-z]{2}$' )
old_purged[ 'landingpage' ] = old_purged[ 'landingpage' ].apply(
    lambda x: x[:-3] if lp_ends_in_lang_re.match( x ) else x
)
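In [ ]:
# Tiny illustration of the language-code stripping above, on made-up titles:
# 'Ways_to_Give/fr' loses the trailing '/fr', while plain 'Ways_to_Give' is left alone.
[ t[ :-3 ] if lp_ends_in_lang_re.match( t ) else t for t in [ 'Ways_to_Give/fr', 'Ways_to_Give' ] ]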
In [13]:
# Purge the same bad IPs and identified bots from the new logs
# (.copy() so the landingpage assignment below doesn't trigger SettingWithCopyWarning)
new_purged = new[ ~( new[ 'ip' ].isin( bad_ips ) ) ]
new_purged = new_purged[ new_purged[ 'is_bot' ] == False ].copy()

# Also remove language code from the end of titles
new_purged[ 'landingpage' ] = new_purged[ 'landingpage' ].apply(
    lambda x: x[:-3] if lp_ends_in_lang_re.match( x ) else x
)
In [14]:
# rows in new_purged
len( new_purged )
Out[14]:
1107
In [15]:
# rows in old_purged
len( old_purged )
Out[15]:
1463
In [16]:
# Create a single data frame with data from both old and new logs

both = pd.concat( [ old_purged, new_purged ], sort = False )
In [17]:
# reorder columns, putting the most interesting ones first, to make the csv easier to read in
# a spreadsheet

reordered_cols = [
    'dt',
    'ip',
    'source',
    'landingpage',
    'nocookies',
    'is_bot',
    'url',
    'content_type',
    'user_agent',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'referer',
    'x_forwarded_for',
    'accept_language',
    'x_analytics',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_family',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_family',
    'os_major',
    'os_minor'
]

both = both[ reordered_cols ]
In [18]:
# order by ip and timestamp
both.sort_values( [ 'ip', 'dt' ], inplace = True )
In [19]:
# Write data out to file for closer examination in a spreadsheet
both.to_csv( '20191020T20/both.csv' )
old_purged.to_csv( '20191020T20/old_purged.csv' )
new_purged.to_csv( '20191020T20/new_purged.csv' )
In [20]:
allowed_lp_from_script = [
    'L11',
    'L12',
    'L2011',
    'L2012',
    'WMF',
    'WP',
    'Special:LandingPage'
]

new_purged_for_script = new_purged[ new_purged.landingpage.isin( allowed_lp_from_script ) ]
old_purged_for_script = old_purged[ old_purged.landingpage.isin( allowed_lp_from_script ) ]
In [21]:
len( new_purged_for_script )
Out[21]:
515
In [22]:
len( old_purged_for_script )
Out[22]:
825
In [23]:
# Merge on most event fields, where new log timestamp is after old log timestamp
# Note: Results of this merge not used in the final data

# merge on ip, landingpage, other logged URL parameters, and nearest timestamp
# (where new log timestamp must be later than the old log one)

# first order by timestamp
old_purged.sort_values( [ 'dt' ], inplace = True )
new_purged.sort_values( [ 'dt' ], inplace = True )

# put old logs on the left to include all old rows and only new ones that match
merged_purged_old_left_after = pd.merge_asof(
    old_purged,
    new_purged,
    direction = 'forward',
    tolerance= pd.Timedelta( '2m' ),
    on = 'dt',
    by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
    suffixes = [ '_old', '_new' ]
)

# put new logs on the left to include all new rows and only old ones that match
merged_purged_new_left_after = pd.merge_asof(
    new_purged,
    old_purged,
    direction = 'backward',
    tolerance= pd.Timedelta( '2m' ),
    on = 'dt',
    by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
    suffixes = [ '_new', '_old']
)

# Give the columns a nicer order
reordered_merged_cols = [
    'dt',
    'ip',
    'id_old',
    'id_new',
    'landingpage',
    'user_agent',
    'browser_family',
    'os_family',
    'nocookies',
    'is_bot',
    'url',
    'content_type',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'referer',
    'x_forwarded_for',
    'accept_language',
    'x_analytics',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_major',
    'os_minor',
    'source_old',
    'source_new'
]

merged_purged_old_left_after = merged_purged_old_left_after[ reordered_merged_cols ]
merged_purged_new_left_after = merged_purged_new_left_after[ reordered_merged_cols ]

# Sort for easier analysis
merged_purged_old_left_after.sort_values( [ 'source_new', 'user_agent' ], inplace = True )
merged_purged_new_left_after.sort_values( [ 'source_old', 'browser_family', 'os_family' ], inplace = True )

# Write out the merged dataframes for examination in spreadsheets
merged_purged_old_left_after.to_csv( '20191020T20/merged_purged_old_left_after.csv' )
merged_purged_new_left_after.to_csv( '20191020T20/merged_purged_new_left_after.csv' )

# Note: later discovered these stats are not quite right, because of duplicates from merge_asof. Fixed below.

# Print out some stats here
(
    "Unmatched in old log: " + str( len( merged_purged_old_left_after[ merged_purged_old_left_after[ 'source_new' ].isna() ] ) ),
    "Unmatched in new log: " + str( len( merged_purged_new_left_after[ merged_purged_new_left_after[ 'source_old' ].isna() ] ) )
)
Out[23]:
('Unmatched in old log: 518', 'Unmatched in new log: 155')
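In [ ]:
# Toy illustration (made-up rows) of the merge_asof direction used above: with the old log on
# the left and direction = 'forward', each old row picks the nearest *later* new row within the
# tolerance; unmatched old rows get NaN in the new columns.
toy_old = pd.DataFrame( {
    'dt': pd.to_datetime( [ '2019-10-20T20:00:00', '2019-10-20T20:05:00' ] ),
    'ip': [ 'a', 'a' ],
    'id_old': [ 'old0', 'old1' ]
} )
toy_new = pd.DataFrame( {
    'dt': pd.to_datetime( [ '2019-10-20T20:00:30' ] ),
    'ip': [ 'a' ],
    'id_new': [ 'new0' ]
} )
# expected: old0 pairs with new0; old1 has no later new row within 2 minutes, so id_new is NaN
pd.merge_asof( toy_old, toy_new, on = 'dt', by = 'ip', direction = 'forward', tolerance = pd.Timedelta( '2m' ) )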
In [24]:
# Merge on most event fields but *not ip*, where new log timestamp is after old log timestamp, 30s tolerance
# Note: Results of this merge not used in the final data

# merge on landingpage, other logged URL parameters, and nearest timestamp
# (where new log timestamp must be later than the old log one)

# first order by timestamp
old_purged.sort_values( [ 'dt' ], inplace = True )
new_purged.sort_values( [ 'dt' ], inplace = True )

# put old logs on the left to include all old rows and only new ones that match
merged_purged_old_left_no_ip = pd.merge_asof(
    old_purged,
    new_purged,
    direction = 'forward',
    tolerance= pd.Timedelta( '30s' ),
    on = 'dt',
    by = [ 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
    suffixes = [ '_old', '_new' ]
)

# put new logs on the left to include all new rows and only old ones that match
merged_purged_new_left_no_ip = pd.merge_asof(
    new_purged,
    old_purged,
    direction = 'backward',
    tolerance= pd.Timedelta( '30s' ),
    on = 'dt',
    by = [ 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
    suffixes = [ '_new', '_old']
)

# Give the columns a nicer order
reordered_merged_cols = [
    'dt',
    'ip_old',
    'ip_new',
    'id_old',
    'id_new',
    'landingpage',
    'user_agent',
    'browser_family',
    'os_family',
    'nocookies',
    'is_bot',
    'url',
    'content_type',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'referer',
    'x_forwarded_for',
    'accept_language',
    'x_analytics',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_major',
    'os_minor',
    'source_old',
    'source_new'
]

merged_purged_old_left_no_ip = merged_purged_old_left_no_ip[ reordered_merged_cols ]
merged_purged_new_left_no_ip = merged_purged_new_left_no_ip[ reordered_merged_cols ]

# Sort for easier analysis
merged_purged_old_left_no_ip.sort_values( [ 'source_new', 'user_agent' ], inplace = True )
merged_purged_new_left_no_ip.sort_values( [ 'source_old', 'browser_family', 'os_family' ], inplace = True )

# Write out the merged dataframes for examination in spreadsheets
merged_purged_old_left_no_ip.to_csv( '20191020T20/merged_purged_old_left_no_ip.csv' )
merged_purged_new_left_no_ip.to_csv( '20191020T20/merged_purged_new_left_no_ip.csv' )

# Note: later discovered these stats are not quite right, because of duplicates from merge_asof. Fixed below.

# Print out some stats here
(
    "Unmatched in old log: " + str( len( merged_purged_old_left_no_ip[ merged_purged_old_left_no_ip[ 'source_new' ].isna() ] ) ),
    "Unmatched in new log: " + str( len( merged_purged_new_left_no_ip[ merged_purged_new_left_no_ip[ 'source_old' ].isna() ] ) )
)
Out[24]:
('Unmatched in old log: 427', 'Unmatched in new log: 64')
In [28]:
# Merge on most event fields, where new log timestamp is closest to old log timestamp, after *or* before
# This is the merge code used for actual data.

# merge on ip, landingpage, other logged URL parameters, and nearest timestamp

# first order by timestamp
old_purged.sort_values( [ 'dt' ], inplace = True )
new_purged.sort_values( [ 'dt' ], inplace = True )

# put old logs on the left to include all old rows and only new ones that match
merged_purged_old_left_20s = pd.merge_asof(
    old_purged,
    new_purged,
    direction = 'nearest',
    tolerance= pd.Timedelta( '20s' ),
    on = 'dt',
    by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
    suffixes = [ '_old', '_new' ]
)

# put new logs on the left to include all new rows and only old ones that match
merged_purged_new_left_20s = pd.merge_asof(
    new_purged,
    old_purged,
    direction = 'nearest',
    tolerance= pd.Timedelta( '20s' ),
    on = 'dt',
    by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
    suffixes = [ '_new', '_old']
)

# Give the columns a nicer order
reordered_merged_cols = [
    'dt',
    'ip',
    'id_old',
    'id_new',
    'landingpage',
    'user_agent',
    'browser_family',
    'os_family',
    'nocookies',
    'is_bot',
    'url',
    'content_type',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'referer',
    'x_forwarded_for',
    'accept_language',
    'x_analytics',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_major',
    'os_minor',
    'source_old',
    'source_new'
]

merged_purged_old_left_20s = merged_purged_old_left_20s[ reordered_merged_cols ]
merged_purged_new_left_20s = merged_purged_new_left_20s[ reordered_merged_cols ]

# Sort for easier analysis
merged_purged_old_left_20s.sort_values( [ 'source_new', 'user_agent' ], inplace = True )
merged_purged_new_left_20s.sort_values( [ 'source_old', 'browser_family', 'os_family' ], inplace = True )

# Write out the merged dataframes for examination in spreadsheets
merged_purged_old_left_20s.to_csv( '20191020T20/merged_purged_old_left_20s.csv' )
merged_purged_new_left_20s.to_csv( '20191020T20/merged_purged_new_left_20s.csv' )

# We want to count and analyse one-to-one pairs between the logs, and events that are not part of one-to-one
# pairs. However, merge_asof is a one-to-many merge. So, we create pair identities in both merge_asof results
# and inner-join them, keeping only the pair identities that appear in both.
merged_purged_old_left_20s[ 'pair_id' ] = merged_purged_old_left_20s.apply(
    lambda x: str( x[ 'id_old' ] ) + '|' + str( x[ 'id_new' ] ),
    axis = 1
)

merged_purged_new_left_20s[ 'pair_id' ] = merged_purged_new_left_20s.apply(
    lambda x: str( x[ 'id_old' ] ) + '|' + str( x[ 'id_new' ] ),
    axis = 1
)

really_matched = merged_purged_old_left_20s.merge(
    merged_purged_new_left_20s[ [ 'pair_id' ] ],
    on = 'pair_id',
    how = 'inner',
    validate = '1:1',
    suffixes = [ '_old_left', '_new_left' ]
)

# Create a dataframe of orphaned new logs
orphanded_new_logs = new_purged.merge(
    really_matched[ [ 'id_new', 'pair_id' ] ],
    left_on = 'id',
    right_on = 'id_new',
    how = 'outer',
    indicator = True,
    validate = '1:1',
    suffixes = [ '', '_really_matched']
)

orphanded_new_logs = orphanded_new_logs[ orphanded_new_logs[ '_merge' ] == 'left_only' ]
    
orphanded_new_logs.sort_values( [ 'os_family', 'os_major', 'browser_family', 'browser_major' ], inplace = True )
orphanded_new_logs.to_csv( '20191020T20/orphanded_new_logs.csv' )

# Print out some stats here
(
    "Matched: " + str( len( really_matched ) ),
    "Unmatched in old log: " + str( len( old_purged ) - len( really_matched ) ),
    "Unmatched in new log: " + str( len( new_purged ) - len( really_matched ) )
)
Out[28]:
('Matched: 933', 'Unmatched in old log: 530', 'Unmatched in new log: 174')
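In [ ]:
# Toy illustration (made-up rows) of why the pair_id intersection above is needed: with the old
# log on the left, merge_asof can hand the *same* new row to several old rows, while with the
# new log on the left that new row picks only its single nearest old row. Only pairs appearing
# in both results are genuine one-to-one matches.
toy_old = pd.DataFrame( {
    'dt': pd.to_datetime( [ '2019-10-20T20:00:05', '2019-10-20T20:00:15' ] ),
    'id_old': [ 'old0', 'old1' ]
} )
toy_new = pd.DataFrame( {
    'dt': pd.to_datetime( [ '2019-10-20T20:00:20' ] ),
    'id_new': [ 'new0' ]
} )
# old on the left: both old0 and old1 get new0
pd.merge_asof( toy_old, toy_new, on = 'dt', direction = 'nearest', tolerance = pd.Timedelta( '20s' ) )
# new on the left: new0 gets only old1, its nearest neighbour
pd.merge_asof( toy_new, toy_old, on = 'dt', direction = 'nearest', tolerance = pd.Timedelta( '20s' ) )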
In [29]:
# Load tsv with contents of landingpageimpression_raw table as generated from the same new
# logs using FRUEC

# To get this data: we generated a single log file with this command:
# cat 20191020T20/new/*.log | jq -c 'select( .dt | test("2019-10-20T20") )' > new_logs.20191020-20_landingpageimpression_raw.tsv

# Then we ran FRUEC to consume that log file, starting from an empty database.

# tsv file that we load here was generated using the following query:
# SELECT
#  timestamp AS dt,
#  utm_source,
#  utm_campaign,
#  utm_medium,
#  utm_key,
#  landingpage,
#  language.iso_code AS language,
#  country.iso_code AS country,
#  project.project AS project
# FROM
#  landingpageimpression_raw
# LEFT JOIN language ON landingpageimpression_raw.language_id = language.id
# LEFT JOIN country ON landingpageimpression_raw.country_id = country.id
# LEFT JOIN project ON landingpageimpression_raw.project_id = project.id;

new_lpi_raw = pd.read_csv( 'new_logs.20191020-20_Se.tsv', sep='\t' )
In [30]:
# convert timestamp to datetime
new_lpi_raw[ 'dt' ] = pd.to_datetime( new_lpi_raw[ 'dt' ] )
In [31]:
# fill in XX as default value in country column in dataframe from logs
new_with_defaults = new.copy()
new_with_defaults[ 'country' ] = new_with_defaults[ 'country' ].fillna( 'XX' )
In [32]:
# Merge the dataframe of new logs with the landingpageimpression_raw data

new_lpi_and_log_merged = new_lpi_raw.merge(
    new_with_defaults,
    on = [
        'dt',
        'utm_source',
        'utm_campaign',
        'utm_medium',
        'utm_key',
        'country'
    ],
    how = 'outer',
    suffixes = [ '_lpi_raw', '_logs' ],
    indicator = True
)

# reorder columns
reordered_merged_cols = [
    'dt',
    '_merge',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'landingpage_lpi_raw',
    'landingpage_logs',
    'language_lpi_raw',
    'language_logs',
    'country',
    'project',
    'id',
    'ip',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_family',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_bot',
    'is_mediawiki',
    'os_family',
    'os_major',
    'os_minor',
    'source'
]

new_lpi_and_log_merged = new_lpi_and_log_merged[ reordered_merged_cols ]

# order by the merge indicator so unmatched rows are grouped together
new_lpi_and_log_merged.sort_values( [ '_merge' ], inplace = True )

new_lpi_and_log_merged.to_csv( '20191020T20/new_lpi_and_log_merged.csv' )
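In [ ]:
# Quick diagnostic on the merge indicator: how many rows were found in both sources, only in
# landingpageimpression_raw (left_only), or only in the logs (right_only).
new_lpi_and_log_merged[ '_merge' ].value_counts()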
In [33]:
len( new_lpi_and_log_merged[ new_lpi_and_log_merged[ '_merge' ] != 'both' ] )
Out[33]:
8
In [34]:
# Merge the full purged old log with the merged dataframe containing fields that indicate whether there was a
# match in the new log. We should be able to merge on all the columns present in the old log dataframe. This
# avoids overcrowding the resulting dataframe with repeated columns.

old_with__merge_info = old_purged.merge(
    merged_purged_old_left_20s,
    left_on = [ 'id', 'dt', 'source', 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal', 'cache_status-http_status', 'response_size', 'http_method', 'url', 'content_type', 'referer', 'x_forwarded_for', 'user_agent', 'accept_language', 'x_analytics', 'nocookies' ],
    right_on = [ 'id_old', 'dt', 'source_old', 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal', 'cache_status-http_status', 'response_size', 'http_method', 'url', 'content_type', 'referer', 'x_forwarded_for', 'user_agent', 'accept_language', 'x_analytics', 'nocookies' ],
    how = 'outer',
    suffixes = [ '_full', '_matches' ]
)

# Reorder the columns to put important ones first
reordered_merged_cols = [
    'id',
    'dt',
    'ip',
    'source',
    'source_old',
    'source_new',
    'landingpage',
    'language',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'country',
    'nocookies',
    'browser_family',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'url',
    'content_type',
    'referer',
    'x_forwarded_for',
    'user_agent',
    'accept_language',
    'x_analytics',
    'url_type',
    'has_action',
    'contact_id',
    'link_id',
    'appeal',
    'id_old',
    'id_new',
    'os_family',
    'is_bot',
    'utm_key',
    'template',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_major',
    'os_minor'
]

old_with__merge_info = old_with__merge_info[ reordered_merged_cols ]

old_with__merge_info.to_csv( '20191020T20/old_with__merge_info.csv' )
In [35]:
# Now merge in the dataframe of new logs, including info on whether each event matched.
# We should be able to merge on all the columns present in the new log dataframe. This avoids overcrowding
# the resulting dataframe with repeated columns.

both_with__merge_info = old_with__merge_info.merge(
    merged_purged_new_left_20s,
    left_on = [ 'id_new', 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal', 'user_agent', 'browser_family', 'os_family', 'nocookies', 'is_bot', 'url', 'content_type', 'cache_status-http_status', 'response_size', 'http_method', 'referer', 'x_forwarded_for', 'accept_language', 'x_analytics', 'utm_key', 'template', 'appeal_template', 'form_template', 'form_countryspecific', 'browser_major', 'browser_minor', 'device_family', 'is_mediawiki', 'os_major', 'os_minor' ],
    right_on = [ 'id_new', 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal', 'user_agent', 'browser_family', 'os_family', 'nocookies', 'is_bot', 'url', 'content_type', 'cache_status-http_status', 'response_size', 'http_method', 'referer', 'x_forwarded_for', 'accept_language', 'x_analytics', 'utm_key', 'template', 'appeal_template', 'form_template', 'form_countryspecific', 'browser_major', 'browser_minor', 'device_family', 'is_mediawiki', 'os_major', 'os_minor' ],
    how = 'outer',
    suffixes = [ '_all_old', '_matches_new' ]
)

# Add a dt column that contains new dt if old dt is empty
both_with__merge_info[ 'dt' ] = both_with__merge_info.apply( 
    lambda x: x[ 'dt_matches_new' ] if pd.isnull( x[ 'dt_all_old' ] ) else x[ 'dt_all_old' ],
    axis = 1
)

reordered_merged_cols = [
    'id',
    'id_old_matches_new',
    'id_old_all_old',
    'dt',
    'dt_all_old',
    'dt_matches_new',
    'ip',
    'source',
    'source_old_all_old',
    'source_new_all_old',
    'source_old_matches_new',
    'source_new_matches_new',
    'browser_family',
    'os_family',
    'landingpage',
    'language',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'country',
    'nocookies',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'url',
    'content_type',
    'referer',
    'x_forwarded_for',
    'user_agent',
    'accept_language',
    'x_analytics',
    'url_type',
    'has_action',
    'contact_id',
    'link_id',
    'appeal',
    'id_new',
    'is_bot',
    'utm_key',
    'template',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_major',
    'os_minor'
]

both_with__merge_info = both_with__merge_info[ reordered_merged_cols ]

both_with__merge_info.sort_values( [ 'dt' ], inplace = True )

both_with__merge_info.to_csv( '20191020T20/both_with__merge_info.csv' )
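In [ ]:
# The apply/lambda above is a row-wise coalesce of the two timestamp columns; a vectorized
# equivalent (a sketch, same result) is fillna, which fills missing old timestamps with the
# corresponding new ones.
both_with__merge_info[ 'dt_all_old' ].fillna( both_with__merge_info[ 'dt_matches_new' ] )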
In [36]:
orphanded_new_logs_by_browser = orphanded_new_logs.groupby( 'browser_family', as_index = False ) \
    .count()[ [ 'browser_family', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

orphanded_new_logs_by_browser[ 'percent' ] = orphanded_new_logs_by_browser[ 'count' ].apply(
    lambda x: ( x / len( orphanded_new_logs ) ) * 100
)

orphanded_new_logs_by_browser
Out[36]:
browser_family count percent
14 Mobile Safari 89 51.149425
16 Safari 21 12.068966
1 Chrome 16 9.195402
2 Chrome Mobile 12 6.896552
17 Samsung Internet 11 6.321839
4 Chrome Mobile iOS 5 2.873563
10 Google 3 1.724138
9 Firefox iOS 3 1.724138
7 Firefox 2 1.149425
11 IE 2 1.149425
12 Instagram 2 1.149425
5 Edge 2 1.149425
8 Firefox Mobile 1 0.574713
6 Facebook 1 0.574713
13 MiuiBrowser 1 0.574713
15 Mobile Safari UI/WKWebView 1 0.574713
3 Chrome Mobile WebView 1 0.574713
0 Apple Mail 1 0.574713
In [37]:
new_purged_by_browser = new_purged.groupby( 'browser_family', as_index = False ) \
    .count()[ [ 'browser_family', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

new_purged_by_browser[ 'percent' ] = new_purged_by_browser[ 'count' ].apply(
    lambda x: ( x / len( new_purged ) ) * 100
)

new_purged_by_browser
Out[37]:
browser_family count percent
2 Chrome 265 23.938573
16 Mobile Safari 239 21.589883
3 Chrome Mobile 141 12.737127
19 Safari 129 11.653117
9 Firefox 109 9.846432
7 Edge 69 6.233062
20 Samsung Internet 53 4.787715
5 Chrome Mobile iOS 25 2.258356
13 IE 19 1.716350
10 Firefox Mobile 12 1.084011
8 Facebook 9 0.813008
12 Google 8 0.722674
18 Opera 6 0.542005
15 MiuiBrowser 5 0.451671
4 Chrome Mobile WebView 4 0.361337
11 Firefox iOS 3 0.271003
6 Chromium 3 0.271003
17 Mobile Safari UI/WKWebView 3 0.271003
14 Instagram 2 0.180668
0 Apple Mail 2 0.180668
1 Bytespider 1 0.090334
In [38]:
orphanded_new_logs_by_os = orphanded_new_logs.groupby( 'os_family', as_index = False ) \
    .count()[ [ 'os_family', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

orphanded_new_logs_by_os[ 'percent' ] = orphanded_new_logs_by_os[ 'count' ].apply(
    lambda x: ( x / len( orphanded_new_logs ) ) * 100
)

orphanded_new_logs_by_os
Out[38]:
os_family count percent
3 iOS 104 59.770115
0 Android 26 14.942529
1 Mac OS X 25 14.367816
2 Windows 19 10.919540
In [39]:
new_purged_by_os = new_purged.groupby( 'os_family', as_index = False ) \
    .count()[ [ 'os_family', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

new_purged_by_os[ 'percent' ] = new_purged_by_os[ 'count' ].apply(
    lambda x: ( x / len( new_purged ) ) * 100
)

new_purged_by_os
Out[39]:
os_family count percent
6 Windows 382 34.507678
7 iOS 283 25.564589
0 Android 228 20.596206
4 Mac OS X 197 17.795845
3 Linux 6 0.542005
5 Ubuntu 5 0.451671
1 Chrome OS 4 0.361337
2 Firefox OS 2 0.180668
In [40]:
orphanded_new_logs_by_lp = orphanded_new_logs.groupby( 'landingpage', as_index = False ) \
    .count()[ [ 'landingpage', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

orphanded_new_logs_by_lp[ 'percent' ] = orphanded_new_logs_by_lp[ 'count' ].apply(
    lambda x: ( x / len( orphanded_new_logs ) ) * 100
)

orphanded_new_logs_by_lp
Out[40]:
landingpage count percent
3 Thank_You 110 63.218391
2 Special:LandingPage 45 25.862069
4 Ways_to_Give 13 7.471264
0 Matching_Gifts 5 2.873563
1 Problems_donating 1 0.574713
In [41]:
new_purged_by_lp = new_purged.groupby( 'landingpage', as_index = False ) \
    .count()[ [ 'landingpage', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

new_purged_by_lp[ 'percent' ] = new_purged_by_lp[ 'count' ].apply(
    lambda x: ( x / len( new_purged ) ) * 100
)

new_purged_by_lp
Out[41]:
landingpage count percent
4 Special:LandingPage 515 46.522132
8 Thank_You 456 41.192412
9 Ways_to_Give 65 5.871725
2 Matching_Gifts 21 1.897019
1 FAQ 17 1.535682
3 Problems_donating 14 1.264679
0 Cancel_or_change_recurring_giving 11 0.993677
7 Tax_deductibility 6 0.542005
5 Special:SpecialPages 1 0.090334
6 Special:TrackingCategories 1 0.090334
In [ ]: