# Imports
import re
import pandas as pd
from urllib import parse
# These are the old log files
!ls 20191020T20/old/*.log
# Create a single tsv file with only entries for 20191020T20
# Note: for some reason this doesn't work from inside Jupyter; it was run directly on the command line.
!cat 20191020T20/old/*.log | awk '$3 ~/^2019-10-20T20/' > 20191020T20/old/20191020T20.tsv
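# The same filtering can also be done in pure Python when running the shell pipeline from
# Jupyter is a problem. This is a sketch: it assumes the log fields are tab-separated with
# dt as the third column, matching the template described below.
import glob

with open( '20191020T20/old/20191020T20.tsv', 'w' ) as out:
    for log_file in sorted( glob.glob( '20191020T20/old/*.log' ) ):
        with open( log_file ) as f:
            for line in f:
                fields = line.split( '\t' )
                if len( fields ) > 2 and fields[2].startswith( '2019-10-20T20' ):
                    out.write( line )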
# Load the tsv file of old logs as a Pandas data frame
# Columns of the log files come from this template:
# %{hostname} %{sequence} %{dt} %{time_firstbyte} %{ip} %{cache_status}/%{http_status} %{response_size}
# %{http_method} http://%{uri_host}%{uri_path}%{uri_query} - %{content_type} %{referer} %{x_forwarded_for}
# %{user_agent} %{accept_language} %{x_analytics}
col_names = [
'hostname',
'sequence',
'dt',
'time_firstbyte',
'ip',
'cache_status-http_status',
'response_size',
'http_method',
'url',
'(blank)',
'content_type',
'referer',
'x_forwarded_for',
'user_agent',
'accept_language',
'x_analytics'
]
old = pd.read_csv( '20191020T20/old/20191020T20.tsv', names = col_names, header = None, sep='\t' )
# Get rid of columns we don't care about, and add 'source' and 'nocookies' columns
old.drop( [ 'hostname', 'sequence', 'time_firstbyte', '(blank)', ], inplace = True, axis = 1 )
old[ 'source' ] = 'old'
old[ 'nocookies' ] = old[ 'x_analytics' ].apply( lambda x: 'nocookies=1' in x )
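# The substring check above is a shortcut. The x_analytics field appears to be a
# semicolon-delimited list of key=value pairs, so a fuller parse would look roughly
# like this sketch (field format assumed; the example value is made up):
def parse_x_analytics( value ):
    # Split 'k1=v1;k2=v2' into a dict, skipping any malformed pieces
    pairs = [ kv.split( '=', 1 ) for kv in str( value ).split( ';' ) if '=' in kv ]
    return dict( pairs )

# e.g. parse_x_analytics( 'ns=0;nocookies=1' ).get( 'nocookies' ) == '1'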
# Parse the URL column to obtain the landing page
lp_in_path_re = re.compile( '(?<=/wiki/).*$' )
def landing_page_from_url( url ):
url_str = str( url )
parsed_url = parse.urlsplit( url_str )
lp_in_path = lp_in_path_re.search( parsed_url.path )
if lp_in_path:
return lp_in_path.group()
query_dict = parse.parse_qs( parsed_url.query )
if 'title' in query_dict:
return query_dict[ 'title' ][0]
else:
return parsed_url.path
old[ 'landingpage' ] = old[ 'url' ].apply( landing_page_from_url )
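# Quick sanity check of the three branches above, using made-up example URLs
# (hypothetical; not taken from the logs):
assert landing_page_from_url( 'https://en.wikipedia.org/wiki/Main_Page' ) == 'Main_Page'
assert landing_page_from_url( 'https://en.wikipedia.org/w/index.php?title=Main_Page' ) == 'Main_Page'
assert landing_page_from_url( 'https://en.wikipedia.org/w/api.php' ) == '/w/api.php'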
# These are the new log files
!ls 20191020T20/new/*.log
# Create a single csv file with only entries for 20191020T20 and with
# the columns we want
!cat 20191020T20/new/*.log | jq -r 'select( .dt | test("2019-10-20T20") ) | [ .dt, .ip, .event.landingpage, .event.language // null, .event.country // null, .event.utm_source // null, .event.utm_campaign // null, .event.utm_medium // null, .event.utm_key // null, .event.contact_id // null, .event.link_id // null, .event.template // null, .event.appeal // null, .event.appeal_template // null, .event.form_template // null, .event.form_countryspecific // null, .userAgent.browser_family, .userAgent.browser_major, .userAgent.browser_minor, .userAgent.device_family, .userAgent.is_bot, .userAgent.is_mediawiki, .userAgent.os_family, .userAgent.os_major, .userAgent.os_minor] | @csv' > 20191020T20/new/20191020T20.csv
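# If jq isn't available, roughly the same extraction can be done in Python. This is a
# sketch: it assumes one JSON object per line with the dt/ip/event/userAgent structure
# used above, and booleans and missing values may serialize slightly differently than jq's @csv.
import csv, glob, json

event_keys = [ 'landingpage', 'language', 'country', 'utm_source', 'utm_campaign',
               'utm_medium', 'utm_key', 'contact_id', 'link_id', 'template', 'appeal',
               'appeal_template', 'form_template', 'form_countryspecific' ]
ua_keys = [ 'browser_family', 'browser_major', 'browser_minor', 'device_family',
            'is_bot', 'is_mediawiki', 'os_family', 'os_major', 'os_minor' ]

with open( '20191020T20/new/20191020T20.csv', 'w', newline = '' ) as out:
    writer = csv.writer( out )
    for log_file in sorted( glob.glob( '20191020T20/new/*.log' ) ):
        with open( log_file ) as f:
            for line in f:
                record = json.loads( line )
                if not record.get( 'dt', '' ).startswith( '2019-10-20T20' ):
                    continue
                event = record.get( 'event', {} )
                ua = record.get( 'userAgent', {} )
                row = [ record.get( 'dt' ), record.get( 'ip' ) ]
                row += [ event.get( k ) for k in event_keys ]
                row += [ ua.get( k ) for k in ua_keys ]
                writer.writerow( row )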
# Load the csv file of new logs as a Pandas data frame
col_names = [
'dt',
'ip',
'landingpage',
'language',
'country',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'contact_id',
'link_id',
'template',
'appeal',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_family',
'browser_major',
'browser_minor',
'device_family',
'is_bot',
'is_mediawiki',
'os_family',
'os_major',
'os_minor'
]
new = pd.read_csv( '20191020T20/new/20191020T20.csv', names = col_names, header = None )
# Add a 'source' column
new[ 'source' ] = 'new'
# Create a pared-down version of the dataset
# Remove non-wiki pages (landing pages extracted from /wiki/ paths or the title param don't start with '/')
old_purged = old[ old.landingpage.str.match( '^[^/]' ) ]
# Remove non-OK HTTP statuses from entries from old logs
old_purged = old_purged[
( old_purged[ 'cache_status-http_status' ].str.contains( '200' ) ) |
( old_purged[ 'cache_status-http_status' ].str.contains( '304' ) )
]
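# Note: the substring check above could in principle also match other values containing
# '200'; a stricter alternative would anchor the HTTP status at the end of the field,
# e.g. .str.contains( r'/(?:200|304)$' ).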
# Remove hits with no cookies from entries from old logs
old_purged = old_purged[ old_purged.nocookies == False ]
# Remove hits from background JavaScript requests from the old logs
old_purged = old_purged[ ~old_purged.landingpage.str.contains( '.js', regex = False ) ]
# Remove IP addresses that are clearly bots or hacking attempts
bad_ips = [
# Five entries here removed for publicly visible version of this notebook
]
old_purged = old_purged[ ~( old_purged[ 'ip' ].isin( bad_ips ) ) ]
# order by ip and timestamp
old_purged.sort_values( [ 'ip', 'dt' ], inplace = True )
# purge the same bad ips and identified bots from the new logs
new_purged = new[ ~( new[ 'ip' ].isin( bad_ips ) ) ]
new_purged = new_purged[ new_purged[ 'is_bot'] == False ]
# rows in new_purged
len( new_purged )
# rows in old_purged
len( old_purged )
# Create a single data frame with data from both old and new logs
both = pd.concat( [ old_purged, new_purged ], sort = False )
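# Quick sanity check that rows from both sources made it into the combined frame
both.groupby( 'source' ).size()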
# reorder columns, putting the most interesting ones first, to make the csv easier to read in
# a spreadsheet
reordered_cols = [
'dt',
'ip',
'source',
'landingpage',
'nocookies',
'is_bot',
'url',
'content_type',
'user_agent',
'cache_status-http_status',
'response_size',
'http_method',
'referer',
'x_forwarded_for',
'accept_language',
'x_analytics',
'language',
'country',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'contact_id',
'link_id',
'template',
'appeal',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_family',
'browser_major',
'browser_minor',
'device_family',
'is_mediawiki',
'os_family',
'os_major',
'os_minor'
]
both = both[ reordered_cols ]
# order by ip and timestamp
both.sort_values( [ 'ip', 'dt' ], inplace = True )
# Write data out to file for closer examination in a spreadsheet
both.to_csv( '20191020T20/both.csv' )
old_purged.to_csv( '20191020T20/old_purged.csv' )
new_purged.to_csv( '20191020T20/new_purged.csv' )
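# Note: to_csv() writes the data frame index as an extra unnamed first column by default;
# pass index = False if that column isn't wanted in the spreadsheet, e.g.:
# both.to_csv( '20191020T20/both.csv', index = False )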