In [58]:
# Imports

import re
import pandas as pd
from urllib import parse 
In [8]:
# These are the old log files

!ls 20191020T20/old/*.log
20191020T20/old/landingpages.tsv.20191020-190002.log
20191020T20/old/landingpages.tsv.20191020-191501.log
20191020T20/old/landingpages.tsv.20191020-193001.log
20191020T20/old/landingpages.tsv.20191020-194502.log
20191020T20/old/landingpages.tsv.20191020-200001.log
20191020T20/old/landingpages.tsv.20191020-201501.log
20191020T20/old/landingpages.tsv.20191020-203001.log
20191020T20/old/landingpages.tsv.20191020-204501.log
20191020T20/old/landingpages.tsv.20191020-210001.log
20191020T20/old/landingpages.tsv.20191020-211501.log
20191020T20/old/landingpages.tsv.20191020-213001.log
20191020T20/old/landingpages.tsv.20191020-214501.log
In [137]:
# Create a single tsv file with only entries for 20191020T20
# Note: For some reason this doesn't work from Jupyter. Ran directly on the command line.

!cat 20191020T20/old/*.log | awk '$3 ~/^2019-10-20T20/' > 20191020T20/old/20191020T20.tsv
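In [ ]:
# Illustrative sketch (not part of the original run): the awk filter above keeps only rows
# whose third whitespace-separated field (the dt column) starts with the target hour.
# A pure-Python equivalent, assuming the same file layout, would look roughly like this:
import glob

with open( '20191020T20/old/20191020T20.tsv', 'w' ) as out:
    for path in sorted( glob.glob( '20191020T20/old/*.log' ) ):
        with open( path ) as f:
            for line in f:
                fields = line.split()
                if len( fields ) > 2 and fields[ 2 ].startswith( '2019-10-20T20' ):
                    out.write( line )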
In [204]:
# Load the tsv file of old logs as a Pandas data frame
# Columns of the log files come from this template:
#     %{hostname} %{sequence} %{dt} %{time_firstbyte} %{ip} %{cache_status}/%{http_status} %{response_size}
#     %{http_method} http://%{uri_host}%{uri_path}%{uri_query} - %{content_type} %{referer} %{x_forwarded_for}
#     %{user_agent} %{accept_language} %{x_analytics}

col_names = [
    'hostname',
    'sequence',
    'dt', 
    'time_firstbyte',
    'ip',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'url',
    '(blank)',
    'content_type',
    'referer',
    'x_forwarded_for',
    'user_agent',
    'accept_language',
    'x_analytics'
]

old = pd.read_csv( '20191020T20/old/20191020T20.tsv', names = col_names, header = None, sep='\t' )
In [205]:
# Get rid of columns we don't care about, and add 'source' and 'nocookies' columns

old.drop( [ 'hostname', 'sequence', 'time_firstbyte', '(blank)', ], inplace = True, axis = 1 )
old[ 'source' ] = 'old'
old[ 'nocookies' ] = old[ 'x_analytics' ].apply( lambda x: 'nocookies=1' in x )
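In [ ]:
# Illustrative example (the value below is made up): x_analytics is a semicolon-separated
# string of key=value pairs, so the substring test above flags hits sent without cookies.
sample_x_analytics = 'ns=0;nocookies=1;https=1'
'nocookies=1' in sample_x_analytics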
In [206]:
# Parse the URL column to obtain the landing page

lp_in_path_re = re.compile( '(?<=/wiki/).*$' )

def landing_page_from_url( url ):
    url_str = str( url )
    parsed_url = parse.urlsplit( url_str )
    
    lp_in_path = lp_in_path_re.search( parsed_url.path )
    if lp_in_path:
        return lp_in_path.group()

    query_dict = parse.parse_qs( parsed_url.query )
    if 'title' in query_dict:
        return query_dict[ 'title' ][0]
    else:
        return parsed_url.path

old[ 'landingpage' ] = old[ 'url' ].apply( landing_page_from_url )
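In [ ]:
# Quick sanity check of landing_page_from_url() on two made-up URLs (illustrative only,
# not part of the original analysis); both forms should resolve to the page title.
landing_page_from_url( 'https://donate.wikimedia.org/wiki/Ways_to_Give?utm_medium=sitenotice' )
# -> 'Ways_to_Give'
landing_page_from_url( 'https://donate.wikimedia.org/w/index.php?title=Special:LandingPage&country=US' )
# -> 'Special:LandingPage'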
In [22]:
# These are the new log files

!ls 20191020T20/new/*.log
20191020T20/new/landingpage-impressions.20191020-190002.json.log
20191020T20/new/landingpage-impressions.20191020-191501.json.log
20191020T20/new/landingpage-impressions.20191020-193001.json.log
20191020T20/new/landingpage-impressions.20191020-194502.json.log
20191020T20/new/landingpage-impressions.20191020-200001.json.log
20191020T20/new/landingpage-impressions.20191020-201501.json.log
20191020T20/new/landingpage-impressions.20191020-203001.json.log
20191020T20/new/landingpage-impressions.20191020-204501.json.log
20191020T20/new/landingpage-impressions.20191020-210001.json.log
20191020T20/new/landingpage-impressions.20191020-211501.json.log
20191020T20/new/landingpage-impressions.20191020-213001.json.log
20191020T20/new/landingpage-impressions.20191020-214502.json.log
In [29]:
# Create a single csv file with only entries for 20191020T20 and with
# the columns we want

!cat 20191020T20/new/*.log | jq -r 'select( .dt | test("2019-10-20T20") ) | [ .dt, .ip, .event.landingpage, .event.language // null, .event.country // null, .event.utm_source // null, .event.utm_campaign // null, .event.utm_medium // null, .event.utm_key // null, .event.contact_id // null, .event.link_id // null, .event.template // null, .event.appeal // null, .event.appeal_template // null, .event.form_template // null, .event.form_countryspecific  // null, .userAgent.browser_family, .userAgent.browser_major, .userAgent.browser_minor, .userAgent.device_family, .userAgent.is_bot, .userAgent.is_mediawiki, .userAgent.os_family, .userAgent.os_major, .userAgent.os_minor] | @csv' > 20191020T20/new/20191020T20.csv
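In [ ]:
# Illustrative sketch (not part of the original run) of what the jq filter above extracts
# from a single JSON log line, assuming the field layout implied by that filter. Lines
# whose dt does not match '2019-10-20T20' would be skipped before calling this.
import json

def extract_row( line ):
    rec = json.loads( line )
    event = rec.get( 'event', {} )
    ua = rec.get( 'userAgent', {} )
    return [
        rec.get( 'dt' ), rec.get( 'ip' ),
        event.get( 'landingpage' ), event.get( 'language' ), event.get( 'country' ),
        event.get( 'utm_source' ), event.get( 'utm_campaign' ), event.get( 'utm_medium' ),
        event.get( 'utm_key' ), event.get( 'contact_id' ), event.get( 'link_id' ),
        event.get( 'template' ), event.get( 'appeal' ), event.get( 'appeal_template' ),
        event.get( 'form_template' ), event.get( 'form_countryspecific' ),
        ua.get( 'browser_family' ), ua.get( 'browser_major' ), ua.get( 'browser_minor' ),
        ua.get( 'device_family' ), ua.get( 'is_bot' ), ua.get( 'is_mediawiki' ),
        ua.get( 'os_family' ), ua.get( 'os_major' ), ua.get( 'os_minor' ),
    ]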
In [133]:
# Load the csv file of new logs as a Pandas data frame

col_names = [
    'dt',
    'ip',
    'landingpage',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_family',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_bot',
    'is_mediawiki',
    'os_family',
    'os_major',
    'os_minor'
]

new = pd.read_csv( '20191020T20/new/20191020T20.csv', names = col_names, header = None )
In [134]:
# Add a 'source' column
new[ 'source' ] = 'new'
In [267]:
# Create a pared-down version of the dataset

# Remove non-wiki pages
old_purged = old[ old.landingpage.str.match( '^[^/]' ) ]

# Keep only entries with HTTP status 200 (OK) or 304 (Not Modified) from the old logs
old_purged = old_purged[
    ( old_purged[ 'cache_status-http_status' ].str.contains( '200' ) ) |
    ( old_purged[ 'cache_status-http_status' ].str.contains( '304' ) )
]

# Remove hits with no cookies from entries from old logs
old_purged = old_purged[ old_purged.nocookies == False ]

# Remove hits from background requests for .js files from old logs
old_purged = old_purged[ ~old_purged.landingpage.str.contains( r'\.js' ) ]

# Remove IP addresses that are clearly bots or hacking attempts
bad_ips = [
    # Five entries here removed for publicly visible version of this notebook
]

old_purged = old_purged[ ~( old_purged[ 'ip' ].isin( bad_ips ) ) ]

# order by ip and timestamp
old_purged.sort_values( [ 'ip', 'dt' ], inplace = True )
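In [ ]:
# Illustrative check (made-up values) of the '^[^/]' filter above: parsed page titles are
# kept, while entries that still look like paths (starting with '/') are dropped.
pd.Series( [ 'Ways_to_Give', '/w/index.php', 'Special:LandingPage' ] ).str.match( '^[^/]' )
# -> True, False, True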
In [268]:
# Purge the same bad IPs and identified bots from the new logs
new_purged = new[ ~( new[ 'ip' ].isin( bad_ips ) ) ]
new_purged = new_purged[ new_purged[ 'is_bot'] == False ]
In [266]:
# rows in new_purged
len( new_purged )
Out[266]:
1079
In [263]:
# rows in old_purged
len( old_purged )
Out[263]:
1442
In [269]:
# Create a single data frame with data from both old and new logs

both = pd.concat( [ old_purged, new_purged ], sort = False )
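In [ ]:
# Tiny illustration (made-up data) of how the concat above behaves: the result keeps the
# union of columns from both frames, filling NaN where a column exists in only one source.
pd.concat(
    [ pd.DataFrame( { 'ip': [ '10.0.0.1' ], 'nocookies': [ False ] } ),
      pd.DataFrame( { 'ip': [ '10.0.0.2' ], 'is_bot': [ False ] } ) ],
    sort = False
)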
In [270]:
# Reorder columns, putting the most interesting ones first, to make the CSV easier to read in
# a spreadsheet

reordered_cols = [
    'dt',
    'ip',
    'source',
    'landingpage',
    'nocookies',
    'is_bot',
    'url',
    'content_type',
    'user_agent',
    'cache_status-http_status',
    'response_size',
    'http_method',
    'referer',
    'x_forwarded_for',
    'accept_language',
    'x_analytics',
    'language',
    'country',
    'utm_source',
    'utm_campaign',
    'utm_medium',
    'utm_key',
    'contact_id',
    'link_id',
    'template',
    'appeal',
    'appeal_template',
    'form_template',
    'form_countryspecific',
    'browser_family',
    'browser_major',
    'browser_minor',
    'device_family',
    'is_mediawiki',
    'os_family',
    'os_major',
    'os_minor'
]

both = both[ reordered_cols ]
In [271]:
# order by ip and timestamp
both.sort_values( [ 'ip', 'dt' ], inplace = True )
In [272]:
# Write data out to file for closer examination in a spreadsheet
both.to_csv( '20191020T20/both.csv' )
old_purged.to_csv( '20191020T20/old_purged.csv' )
new_purged.to_csv( '20191020T20/new_purged.csv' )
In [ ]: