# Imports
import re
import pandas as pd
from urllib import parse
import numpy as np
# These are the old log files
!ls 20191020T20/old/*.log
# Create a single tsv file with only entries for 20191020T20
# Note: For some reason this doesn't work from Jupyter. Ran directly on the command line.
!cat 20191020T20/old/*.log | awk '$3 ~/^2019-10-20T20/' > 20191020T20/old/20191020T20.tsv
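# Since the shell pipeline above had to be run outside Jupyter, here is a pure-Python
# equivalent (a sketch; like the awk pattern, it assumes the timestamp is the third
# whitespace-separated field of each log line).
import glob
with open( '20191020T20/old/20191020T20.tsv', 'w' ) as out_file:
    for log_path in glob.glob( '20191020T20/old/*.log' ):
        with open( log_path ) as log_file:
            for line in log_file:
                fields = line.split()
                if len( fields ) > 2 and fields[ 2 ].startswith( '2019-10-20T20' ):
                    out_file.write( line )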
# Load the tsv file of old logs as a Pandas data frame
# Columns of the log files come from this template:
# %{hostname} %{sequence} %{dt} %{time_firstbyte} %{ip} %{cache_status}/%{http_status} %{response_size}
# %{http_method} http://%{uri_host}%{uri_path}%{uri_query} - %{content_type} %{referer} %{x_forwarded_for}
# %{user_agent} %{accept_language} %{x_analytics}
col_names = [
'hostname',
'sequence',
'dt',
'time_firstbyte',
'ip',
'cache_status-http_status',
'response_size',
'http_method',
'url',
'(blank)',
'content_type',
'referer',
'x_forwarded_for',
'user_agent',
'accept_language',
'x_analytics'
]
old = pd.read_csv( '20191020T20/old/20191020T20.tsv', names = col_names, header = None, sep='\t' )
# Get rid of columns we don't care about, and add 'source', 'nocookies' and 'id' columns
old.drop( [ 'hostname', 'sequence', 'time_firstbyte', '(blank)', ], inplace = True, axis = 1 )
old[ 'source' ] = 'old'
old[ 'nocookies' ] = old[ 'x_analytics' ].apply( lambda x: 'nocookies=1' in x )
old.insert( 0, 'id', 'old' + pd.Series( range( 0, len( old) ) ).astype( str) )
# convert dt to datetime
old[ 'dt' ] = pd.to_datetime( old[ 'dt' ] )
# Parse the URL to create detailed fields for the old log
# If the wiki page appears in the URL in the pretty format, parse it out (leaving an initial slash). This
# goes in the landingpage column, as do URL paths for non-wiki pages. The legacy scripts only ingest wiki
# pages in the ugly URL format, so this makes it easy to filter out everything that isn't.
lp_in_path_re = re.compile( '(?<=/wiki/).*$' )
# As per ext.fundraiserLandingPage.LogPageview.js
query_el_map = {
"language": [ "uselang", "language" ],
"country": [ "country" ],
"utm_source": [ "utm_source" ],
"utm_campaign": [ "utm_campaign" ],
"utm_medium": [ "utm_medium" ],
"utm_key": [ "utm_key" ],
"contact_id": [ "contact_id" ],
"link_id": [ "link_id" ],
"template": [ "template" ],
"appeal": [ "appeal" ],
"appeal_template": [ "appeal_template", "appeal-template" ],
"form_template": [ "form_template", "form-template" ],
"form_countryspecific": [ "form_countryspecific", "form-countryspecific" ]
}
def fields_from_url( url ):
url_str = str( url )
parsed_url = parse.urlsplit( url_str )
query_dict = parse.parse_qs( parsed_url.query )
lp_in_path = lp_in_path_re.search( parsed_url.path )
if lp_in_path:
lp = lp_in_path.group()
url_type = 'pretty_url'
elif 'title' in query_dict:
lp = query_dict[ 'title' ][ 0 ]
url_type = 'ugly_url'
else:
lp = parsed_url.path
url_type = 'non_wikipage'
event_data = {
"landingpage": lp,
"url_type": url_type,
"has_action": 'action' in query_dict
}
# same logic as ext.fundraiserLandingPage.LogPageview.js to generate event data from URL....
for el_key, url_params in query_el_map.items():
for url_param in url_params:
if url_param in query_dict:
event_data[ el_key ] = query_dict[ url_param ][ 0 ]
return pd.Series( event_data )
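# Quick sanity check of fields_from_url with made-up example URLs, one per branch
# (pretty wiki URL, ugly wiki URL, non-wikipage path); these values are hypothetical.
for example_url in [
    'https://donate.wikimedia.org/wiki/Ways_to_Give?uselang=en&utm_source=test',
    'https://donate.wikimedia.org/w/index.php?title=Ways_to_Give&country=XX',
    'https://donate.wikimedia.org/static/favicon.ico'
]:
    print( fields_from_url( example_url ) )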
old = old.join( old[ 'url' ].apply( fields_from_url ) )
old.to_csv( '20191020T20/old.csv' )
# These are the new log files
!ls 20191020T20/new/*.log
# Create a single csv file with only entries for 20191020T20 and with
# the columns we want
!cat 20191020T20/new/*.log | jq -r 'select( .dt | test("2019-10-20T20") ) | [ .dt, .ip, .event.landingpage, .event.language // null, .event.country // null, .event.utm_source // null, .event.utm_campaign // null, .event.utm_medium // null, .event.utm_key // null, .event.contact_id // null, .event.link_id // null, .event.template // null, .event.appeal // null, .event.appeal_template // null, .event.form_template // null, .event.form_countryspecific // null, .userAgent.browser_family, .userAgent.browser_major, .userAgent.browser_minor, .userAgent.device_family, .userAgent.is_bot, .userAgent.is_mediawiki, .userAgent.os_family, .userAgent.os_major, .userAgent.os_minor] | @csv' > 20191020T20/new/20191020T20.csv
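# In case jq isn't available, a pure-Python equivalent of the pipeline above (a sketch;
# assumes one JSON object per line, with the same dt/ip/event/userAgent structure the
# jq filter relies on).
import csv, glob, json
event_keys = [ 'landingpage', 'language', 'country', 'utm_source', 'utm_campaign', 'utm_medium',
    'utm_key', 'contact_id', 'link_id', 'template', 'appeal', 'appeal_template',
    'form_template', 'form_countryspecific' ]
ua_keys = [ 'browser_family', 'browser_major', 'browser_minor', 'device_family', 'is_bot',
    'is_mediawiki', 'os_family', 'os_major', 'os_minor' ]
with open( '20191020T20/new/20191020T20.csv', 'w', newline = '' ) as out_file:
    writer = csv.writer( out_file )
    for log_path in glob.glob( '20191020T20/new/*.log' ):
        with open( log_path ) as log_file:
            for line in log_file:
                record = json.loads( line )
                if '2019-10-20T20' not in record[ 'dt' ]:
                    continue
                row = [ record[ 'dt' ], record[ 'ip' ] ]
                row += [ record[ 'event' ].get( key ) for key in event_keys ]
                row += [ record[ 'userAgent' ].get( key ) for key in ua_keys ]
                writer.writerow( row )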
# Load the csv file of new logs as a Pandas data frame
col_names = [
'dt',
'ip',
'landingpage',
'language',
'country',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'contact_id',
'link_id',
'template',
'appeal',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_family',
'browser_major',
'browser_minor',
'device_family',
'is_bot',
'is_mediawiki',
'os_family',
'os_major',
'os_minor'
]
new = pd.read_csv( '20191020T20/new/20191020T20.csv', names = col_names, header = None )
# Add 'source' and 'id' columns
new[ 'source' ] = 'new'
new.insert( 0, 'id', 'new' + pd.Series( range( 0, len( new ) ) ).astype( str ) )
# convert dt to datetime
new[ 'dt' ] = pd.to_datetime( new[ 'dt' ] )
# Create a pared-down version of the dataset
# Remove non-wiki pages
old_purged = old[ old.url_type.isin( [ 'ugly_url', 'pretty_url' ] ) ]
# Remove non-OK HTTP statuses from entries from old logs
old_purged = old_purged[
( old_purged[ 'cache_status-http_status' ].str.contains( '200' ) ) |
( old_purged[ 'cache_status-http_status' ].str.contains( '304' ) )
]
# Remove hits with no cookies from entries from old logs
old_purged = old_purged[ old_purged.nocookies == False ]
# Remove hits with an action URL param (often action=raw, used for background URL requests)
old_purged = old_purged[ old_purged.has_action == False ]
# Remove IP addresses that are clearly bots or hacking attempts
bad_ips = [
# Censored
]
old_purged = old_purged[ ~( old_purged[ 'ip' ].isin( bad_ips ) ) ]
# Remove language code from the end of titles, since there seems to be some mismatch for
# titles with language codes at the end.
lp_ends_in_lang_re = re.compile( '^.*/[a-z]{2}$' )
old_purged[ 'landingpage' ] = old_purged[ 'landingpage' ].apply(
lambda x: x[:-3] if lp_ends_in_lang_re.match( x ) else x
)
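# Quick check of the language-code stripping with made-up titles: a title ending in '/fr'
# loses the suffix, one without a language code is left alone.
[ x[:-3] if lp_ends_in_lang_re.match( x ) else x
    for x in [ 'Ways_to_Give/fr', 'Ways_to_Give' ] ]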
# purge the same bad ips and identified bots from the new logs
new_purged = new[ ~( new[ 'ip' ].isin( bad_ips ) ) ]
new_purged = new_purged[ new_purged[ 'is_bot'] == False ]
# Also remove language code from the end of titles
new_purged[ 'landingpage' ] = new_purged[ 'landingpage' ].apply(
lambda x: x[:-3] if lp_ends_in_lang_re.match( x ) else x
)
# rows in new_purged
len( new_purged )
# rows in old_purged
len( old_purged )
# Create a single data frame with data from both old and new logs
both = pd.concat( [ old_purged, new_purged ], sort = False )
# reorder columns, putting the most interesting ones first, to make the csv easier to read in
# a spreadsheet
reordered_cols = [
'dt',
'ip',
'source',
'landingpage',
'nocookies',
'is_bot',
'url',
'content_type',
'user_agent',
'cache_status-http_status',
'response_size',
'http_method',
'referer',
'x_forwarded_for',
'accept_language',
'x_analytics',
'language',
'country',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'contact_id',
'link_id',
'template',
'appeal',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_family',
'browser_major',
'browser_minor',
'device_family',
'is_mediawiki',
'os_family',
'os_major',
'os_minor'
]
both = both[ reordered_cols ]
# order by ip and timestamp
both.sort_values( [ 'ip', 'dt' ], inplace = True )
# Write data out to file for closer examination in a spreadsheet
both.to_csv( '20191020T20/both.csv' )
old_purged.to_csv( '20191020T20/old_purged.csv' )
new_purged.to_csv( '20191020T20/new_purged.csv' )
allowed_lp_from_script = [
'L11',
'L12',
'L2011',
'L2012',
'WMF',
'WP',
'Special:LandingPage'
]
new_purged_for_script = new_purged[ new_purged.landingpage.isin( allowed_lp_from_script ) ]
old_purged_for_script = old_purged[ old_purged.landingpage.isin( allowed_lp_from_script ) ]
len( new_purged_for_script )
len( old_purged_for_script )
# Merge on most event fields, where new log timestamp is after old log timestamp
# Note: Results of this merge not used in the final data
# merge on ip, landingpage, other logged URL parameters, and nearest timestamp
# (where new log timestamp must be later than the old log one)
# first order by timestamp
old_purged.sort_values( [ 'dt' ], inplace = True )
new_purged.sort_values( [ 'dt' ], inplace = True )
# put old logs on the left to include all old rows and only new ones that match
merged_purged_old_left_after = pd.merge_asof(
old_purged,
new_purged,
direction = 'forward',
tolerance= pd.Timedelta( '2m' ),
on = 'dt',
by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
suffixes = [ '_old', '_new' ]
)
# put new logs on the left to include all new rows and only old ones that match
merged_purged_new_left_after = pd.merge_asof(
new_purged,
old_purged,
direction = 'backward',
tolerance= pd.Timedelta( '2m' ),
on = 'dt',
by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
suffixes = [ '_new', '_old']
)
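# Toy illustration (made-up data) of how merge_asof pairs rows here: each left row picks at
# most one right row with the same 'by' values whose timestamp lies within the tolerance,
# in the given direction.
toy_old = pd.DataFrame( {
    'dt': pd.to_datetime( [ '2019-10-20T20:00:00', '2019-10-20T20:05:00' ] ),
    'ip': [ '10.0.0.1', '10.0.0.2' ],
    'id_old': [ 'old0', 'old1' ]
} )
toy_new = pd.DataFrame( {
    'dt': pd.to_datetime( [ '2019-10-20T20:00:30', '2019-10-20T20:10:00' ] ),
    'ip': [ '10.0.0.1', '10.0.0.2' ],
    'id_new': [ 'new0', 'new1' ]
} )
# old0 pairs with new0 (30s later, same ip); old1 is left unmatched because the only
# candidate for its ip is 5 minutes away, beyond the 2-minute tolerance.
pd.merge_asof( toy_old, toy_new, on = 'dt', by = 'ip',
    direction = 'forward', tolerance = pd.Timedelta( '2m' ) )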
# Give the columns a nicer order
reordered_merged_cols = [
'dt',
'ip',
'id_old',
'id_new',
'landingpage',
'user_agent',
'browser_family',
'os_family',
'nocookies',
'is_bot',
'url',
'content_type',
'cache_status-http_status',
'response_size',
'http_method',
'referer',
'x_forwarded_for',
'accept_language',
'x_analytics',
'language',
'country',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'contact_id',
'link_id',
'template',
'appeal',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_major',
'browser_minor',
'device_family',
'is_mediawiki',
'os_major',
'os_minor',
'source_old',
'source_new'
]
merged_purged_old_left_after = merged_purged_old_left_after[ reordered_merged_cols ]
merged_purged_new_left_after = merged_purged_new_left_after[ reordered_merged_cols ]
# Sort for easier analysis
merged_purged_old_left_after.sort_values( [ 'source_new', 'user_agent' ], inplace = True )
merged_purged_new_left_after.sort_values( [ 'source_old', 'browser_family', 'os_family' ], inplace = True )
# Write out the merged dataframes for examination in spreadsheets
merged_purged_old_left_after.to_csv( '20191020T20/merged_purged_old_left_after.csv' )
merged_purged_new_left_after.to_csv( '20191020T20/merged_purged_new_left_after.csv' )
# Note: later discovered these stats are not quite right, because of duplicates from merge_asof. Fixed below.
# Print out some stats here
(
"Unmatched in old log: " + str( len( merged_purged_old_left_after[ merged_purged_old_left_after[ 'source_new' ].isna() ] ) ),
"Unmatched in new log: " + str( len( merged_purged_new_left_after[ merged_purged_new_left_after[ 'source_old' ].isna() ] ) )
)
# Merge on most event fields but *not ip*, where new log timestamp is after old log timestamp, 30s tolerance
# Note: Results of this merge not used in the final data
# merge on landingpage, other logged URL parameters, and nearest timestamp
# (where new log timestamp must be later than the old log one)
# first order by timestamp
old_purged.sort_values( [ 'dt' ], inplace = True )
new_purged.sort_values( [ 'dt' ], inplace = True )
# put old logs on the left to include all old rows and only new ones that match
merged_purged_old_left_no_ip = pd.merge_asof(
old_purged,
new_purged,
direction = 'forward',
tolerance= pd.Timedelta( '30s' ),
on = 'dt',
by = [ 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
suffixes = [ '_old', '_new' ]
)
# put new logs on the left to include all new rows and only old ones that match
merged_purged_new_left_no_ip = pd.merge_asof(
new_purged,
old_purged,
direction = 'backward',
tolerance= pd.Timedelta( '30s' ),
on = 'dt',
by = [ 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
suffixes = [ '_new', '_old']
)
# Give the columns a nicer order
reordered_merged_cols = [
'dt',
'ip_old',
'ip_new',
'id_old',
'id_new',
'landingpage',
'user_agent',
'browser_family',
'os_family',
'nocookies',
'is_bot',
'url',
'content_type',
'cache_status-http_status',
'response_size',
'http_method',
'referer',
'x_forwarded_for',
'accept_language',
'x_analytics',
'language',
'country',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'contact_id',
'link_id',
'template',
'appeal',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_major',
'browser_minor',
'device_family',
'is_mediawiki',
'os_major',
'os_minor',
'source_old',
'source_new'
]
merged_purged_old_left_no_ip = merged_purged_old_left_no_ip[ reordered_merged_cols ]
merged_purged_new_left_no_ip = merged_purged_new_left_no_ip[ reordered_merged_cols ]
# Sort for easier analysis
merged_purged_old_left_no_ip.sort_values( [ 'source_new', 'user_agent' ], inplace = True )
merged_purged_new_left_no_ip.sort_values( [ 'source_old', 'browser_family', 'os_family' ], inplace = True )
# Write out the merged dataframes for examination in spreadsheets
merged_purged_old_left_no_ip.to_csv( '20191020T20/merged_purged_old_left_no_ip.csv' )
merged_purged_new_left_no_ip.to_csv( '20191020T20/merged_purged_new_left_no_ip.csv' )
# Note: later discovered these stats are not quite right, because of duplicates from merge_asof. Fixed below.
# Print out some stats here
(
"Unmatched in old log: " + str( len( merged_purged_old_left_no_ip[ merged_purged_old_left_no_ip[ 'source_new' ].isna() ] ) ),
"Unmatched in new log: " + str( len( merged_purged_new_left_no_ip[ merged_purged_new_left_no_ip[ 'source_old' ].isna() ] ) )
)
# Merge on most event fields, where new log timestamp is closest to old log timestamp, after *or* before
# This is the merge code used for actual data.
# merge on ip, landingpage, other logged URL parameters, and nearest timestamp
# first order by timestamp
old_purged.sort_values( [ 'dt' ], inplace = True )
new_purged.sort_values( [ 'dt' ], inplace = True )
# put old logs on the left to include all old rows and only new ones that match
merged_purged_old_left_20s = pd.merge_asof(
old_purged,
new_purged,
direction = 'nearest',
tolerance= pd.Timedelta( '20s' ),
on = 'dt',
by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
suffixes = [ '_old', '_new' ]
)
# put new logs on the left to include all new rows and only old ones that match
merged_purged_new_left_20s = pd.merge_asof(
new_purged,
old_purged,
direction = 'nearest',
tolerance= pd.Timedelta( '20s' ),
on = 'dt',
by = [ 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal' ],
suffixes = [ '_new', '_old']
)
# Give the columns a nicer order
reordered_merged_cols = [
'dt',
'ip',
'id_old',
'id_new',
'landingpage',
'user_agent',
'browser_family',
'os_family',
'nocookies',
'is_bot',
'url',
'content_type',
'cache_status-http_status',
'response_size',
'http_method',
'referer',
'x_forwarded_for',
'accept_language',
'x_analytics',
'language',
'country',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'contact_id',
'link_id',
'template',
'appeal',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_major',
'browser_minor',
'device_family',
'is_mediawiki',
'os_major',
'os_minor',
'source_old',
'source_new'
]
merged_purged_old_left_20s = merged_purged_old_left_20s[ reordered_merged_cols ]
merged_purged_new_left_20s = merged_purged_new_left_20s[ reordered_merged_cols ]
# Sort for easier analysis
merged_purged_old_left_20s.sort_values( [ 'source_new', 'user_agent' ], inplace = True )
merged_purged_new_left_20s.sort_values( [ 'source_old', 'browser_family', 'os_family' ], inplace = True )
# Write out the merged dataframes for examination in spreadsheets
merged_purged_old_left_20s.to_csv( '20191020T20/merged_purged_old_left_20s.csv' )
merged_purged_new_left_20s.to_csv( '20191020T20/merged_purged_new_left_20s.csv' )
# We want to count and analyse one-to-one pairs between the logs, and events that are not part of one-to-one
# pairs. However, merge_asof can match the same right-hand row to several left-hand rows, so it is not a
# one-to-one merge. So, we have to create pair identities in the datasets of both merge_asof results, and
# merge those, counting only the pair identities that appear in both.
merged_purged_old_left_20s[ 'pair_id' ] = merged_purged_old_left_20s.apply(
lambda x: str( x[ 'id_old' ] ) + '|' + str( x[ 'id_new' ] ),
axis = 1
)
merged_purged_new_left_20s[ 'pair_id' ] = merged_purged_new_left_20s.apply(
lambda x: str( x[ 'id_old' ] ) + '|' + str( x[ 'id_new' ] ),
axis = 1
)
really_matched = merged_purged_old_left_20s.merge(
merged_purged_new_left_20s[ [ 'pair_id' ] ],
on = 'pair_id',
how = 'inner',
validate = '1:1',
suffixes = [ '_old_left', '_new_left' ]
)
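# Optional sanity check: pair_id should be unique on both sides (id_old is unique in the
# old-left merge and id_new is unique in the new-left merge); validate = '1:1' above would
# have raised otherwise.
( merged_purged_old_left_20s[ 'pair_id' ].is_unique,
    merged_purged_new_left_20s[ 'pair_id' ].is_unique )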
# Create a dataframe of orphaned new logs
orphaned_new_logs = new_purged.merge(
    really_matched[ [ 'id_new', 'pair_id' ] ],
    left_on = 'id',
    right_on = 'id_new',
    how = 'outer',
    indicator = True,
    validate = '1:1',
    suffixes = [ '', '_really_matched' ]
)
orphaned_new_logs = orphaned_new_logs[ orphaned_new_logs[ '_merge' ] == 'left_only' ]
orphaned_new_logs.sort_values( [ 'os_family', 'os_major', 'browser_family', 'browser_major' ], inplace = True )
orphaned_new_logs.to_csv( '20191020T20/orphaned_new_logs.csv' )
# Print out some stats here
(
"Matched: " + str( len( really_matched ) ),
"Unmatched in old log: " + str( len( old_purged ) - len( really_matched ) ),
"Unmatched in new log: " + str( len( new_purged ) - len( really_matched ) )
)
# Load tsv with contents of landingpageimpression_raw table as generated from the same new
# logs using FRUEC
# To get this data: we generated a single log file with this command:
# cat 20191020T20/new/*.log | jq -c 'select( .dt | test("2019-10-20T20") )' > new_logs.20191020-20_landingpageimpression_raw.tsv
# Then we ran FRUEC to consume that log file, starting from an empty database.
# The tsv file we load here was generated using the following query:
# SELECT
# timestamp AS dt,
# utm_source,
# utm_campaign,
# utm_medium,
# utm_key,
# landingpage,
# language.iso_code AS language,
# country.iso_code AS country,
# project.project AS project
# FROM
# landingpageimpression_raw
# LEFT JOIN language ON landingpageimpression_raw.language_id = language.id
# LEFT JOIN country ON landingpageimpression_raw.country_id = country.id
# LEFT JOIN project ON landingpageimpression_raw.project_id = project.id;
new_lpi_raw = pd.read_csv( 'new_logs.20191020-20_Se.tsv', sep='\t' )
# convert timestamp to datetime
new_lpi_raw[ 'dt' ] = pd.to_datetime( new_lpi_raw[ 'dt' ] )
# fill in XX as default value in country column in dataframe from logs
new_with_defaults = new.copy()
new_with_defaults[ 'country' ] = new_with_defaults[ 'country' ].fillna( 'XX' )
# merge dataframes with new logs and data in landingpage impressions
new_lpi_and_log_merged = new_lpi_raw.merge(
new_with_defaults,
on = [
'dt',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'country'
],
how = 'outer',
suffixes = [ '_lpi_raw', '_logs' ],
indicator = True
)
# reorder columns
reordered_merged_cols = [
'dt',
'_merge',
'utm_source',
'utm_campaign',
'utm_medium',
'utm_key',
'landingpage_lpi_raw',
'landingpage_logs',
'language_lpi_raw',
'language_logs',
'country',
'project',
'id',
'ip',
'contact_id',
'link_id',
'template',
'appeal',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_family',
'browser_major',
'browser_minor',
'device_family',
'is_bot',
'is_mediawiki',
'os_family',
'os_major',
'os_minor',
'source'
]
new_lpi_and_log_merged = new_lpi_and_log_merged[ reordered_merged_cols ]
# order by merge indicator, so unmatched rows group together
new_lpi_and_log_merged.sort_values( [ '_merge' ], inplace = True )
new_lpi_and_log_merged.to_csv( '20191020T20/new_lpi_and_log_merged.csv' )
len( new_lpi_and_log_merged[ new_lpi_and_log_merged[ '_merge' ] != 'both' ] )
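# Breakdown of the mismatch by merge side: left_only rows are in landingpageimpression_raw
# but unmatched in the logs; right_only rows are in the logs but unmatched in the table.
new_lpi_and_log_merged[ '_merge' ].value_counts()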
# Merge the full old log with the old-log dataframe containing fields that indicate whether there was a
# match in the new log. We should be able to merge on all the columns present in the old log dataframe,
# which avoids overcrowding the merged dataframe with repeated columns.
old_with__merge_info = old_purged.merge(
merged_purged_old_left_20s,
left_on = [ 'id', 'dt', 'source', 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal', 'cache_status-http_status', 'response_size', 'http_method', 'url', 'content_type', 'referer', 'x_forwarded_for', 'user_agent', 'accept_language', 'x_analytics', 'nocookies' ],
right_on = [ 'id_old', 'dt', 'source_old', 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal', 'cache_status-http_status', 'response_size', 'http_method', 'url', 'content_type', 'referer', 'x_forwarded_for', 'user_agent', 'accept_language', 'x_analytics', 'nocookies' ],
how = 'outer',
suffixes = [ '_full', '_matches' ]
)
# Reorder the columns to put important ones first
reordered_merged_cols = [
'id',
'dt',
'ip',
'source',
'source_old',
'source_new',
'landingpage',
'language',
'utm_source',
'utm_campaign',
'utm_medium',
'country',
'nocookies',
'browser_family',
'cache_status-http_status',
'response_size',
'http_method',
'url',
'content_type',
'referer',
'x_forwarded_for',
'user_agent',
'accept_language',
'x_analytics',
'url_type',
'has_action',
'contact_id',
'link_id',
'appeal',
'id_old',
'id_new',
'os_family',
'is_bot',
'utm_key',
'template',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_major',
'browser_minor',
'device_family',
'is_mediawiki',
'os_major',
'os_minor'
]
old_with__merge_info = old_with__merge_info[ reordered_merged_cols ]
old_with__merge_info.to_csv( '20191020T20/old_with__merge_info.csv' )
# Now merge in the dataframe of new logs, including info on whether each event matched.
# We should be able to merge on all the columns present in the new log dataframe, which avoids
# overcrowding the merged dataframe with repeated columns.
both_with__merge_info = old_with__merge_info.merge(
merged_purged_new_left_20s,
left_on = [ 'id_new', 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal', 'user_agent', 'browser_family', 'os_family', 'nocookies', 'is_bot', 'url', 'content_type', 'cache_status-http_status', 'response_size', 'http_method', 'referer', 'x_forwarded_for', 'accept_language', 'x_analytics', 'utm_key', 'template', 'appeal_template', 'form_template', 'form_countryspecific', 'browser_major', 'browser_minor', 'device_family', 'is_mediawiki', 'os_major', 'os_minor' ],
right_on = [ 'id_new', 'ip', 'landingpage', 'language', 'utm_source', 'utm_campaign', 'utm_medium', 'country', 'contact_id', 'link_id', 'appeal', 'user_agent', 'browser_family', 'os_family', 'nocookies', 'is_bot', 'url', 'content_type', 'cache_status-http_status', 'response_size', 'http_method', 'referer', 'x_forwarded_for', 'accept_language', 'x_analytics', 'utm_key', 'template', 'appeal_template', 'form_template', 'form_countryspecific', 'browser_major', 'browser_minor', 'device_family', 'is_mediawiki', 'os_major', 'os_minor' ],
how = 'outer',
suffixes = [ '_all_old', '_matches_new' ]
)
# Add a dt column that contains new dt if old dt is empty
both_with__merge_info[ 'dt' ] = both_with__merge_info.apply(
lambda x: x[ 'dt_matches_new' ] if pd.isnull( x[ 'dt_all_old' ] ) else x[ 'dt_all_old' ],
axis = 1
)
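# Note: an equivalent, vectorized way to coalesce the two timestamp columns would be fillna,
# e.g. (sketch):
# both_with__merge_info[ 'dt' ] = both_with__merge_info[ 'dt_all_old' ].fillna(
#     both_with__merge_info[ 'dt_matches_new' ] )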
reordered_merged_cols = [
'id',
'id_old_matches_new',
'id_old_all_old',
'dt',
'dt_all_old',
'dt_matches_new',
'ip',
'source',
'source_old_all_old',
'source_new_all_old',
'source_old_matches_new',
'source_new_matches_new',
'browser_family',
'os_family',
'landingpage',
'language',
'utm_source',
'utm_campaign',
'utm_medium',
'country',
'nocookies',
'cache_status-http_status',
'response_size',
'http_method',
'url',
'content_type',
'referer',
'x_forwarded_for',
'user_agent',
'accept_language',
'x_analytics',
'url_type',
'has_action',
'contact_id',
'link_id',
'appeal',
'id_new',
'is_bot',
'utm_key',
'template',
'appeal_template',
'form_template',
'form_countryspecific',
'browser_major',
'browser_minor',
'device_family',
'is_mediawiki',
'os_major',
'os_minor'
]
both_with__merge_info = both_with__merge_info[ reordered_merged_cols ]
both_with__merge_info.sort_values( [ 'dt' ], inplace = True )
both_with__merge_info.to_csv( '20191020T20/both_with__merge_info.csv' )
orphaned_new_logs_by_browser = orphaned_new_logs.groupby( 'browser_family', as_index = False ) \
    .count()[ [ 'browser_family', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )
orphaned_new_logs_by_browser[ 'percent' ] = orphaned_new_logs_by_browser[ 'count' ].apply(
    lambda x: ( x / len( orphaned_new_logs ) ) * 100
)
orphaned_new_logs_by_browser
new_purged_by_browser = new_purged.groupby( 'browser_family', as_index = False ) \
.count()[ [ 'browser_family', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
new_purged_by_browser[ 'percent' ] = new_purged_by_browser[ 'count' ].apply(
lambda x: ( x / len( new_purged ) ) * 100
)
new_purged_by_browser
orphaned_new_logs_by_os = orphaned_new_logs.groupby( 'os_family', as_index = False ) \
    .count()[ [ 'os_family', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )
orphaned_new_logs_by_os[ 'percent' ] = orphaned_new_logs_by_os[ 'count' ].apply(
    lambda x: ( x / len( orphaned_new_logs ) ) * 100
)
orphaned_new_logs_by_os
new_purged_by_os = new_purged.groupby( 'os_family', as_index = False ) \
.count()[ [ 'os_family', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
new_purged_by_os[ 'percent' ] = new_purged_by_os[ 'count' ].apply(
lambda x: ( x / len( new_purged ) ) * 100
)
new_purged_by_os
orphaned_new_logs_by_lp = orphaned_new_logs.groupby( 'landingpage', as_index = False ) \
    .count()[ [ 'landingpage', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )
orphaned_new_logs_by_lp[ 'percent' ] = orphaned_new_logs_by_lp[ 'count' ].apply(
    lambda x: ( x / len( orphaned_new_logs ) ) * 100
)
orphaned_new_logs_by_lp
new_purged_by_lp = new_purged.groupby( 'landingpage', as_index = False ) \
.count()[ [ 'landingpage', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
new_purged_by_lp[ 'percent' ] = new_purged_by_lp[ 'count' ].apply(
lambda x: ( x / len( new_purged ) ) * 100
)
new_purged_by_lp
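# The five count/percent breakdowns above all follow the same pattern; for reference, a small
# helper (a sketch, not used above) that produces the same kind of table for any column:
def breakdown( df, col ):
    # Count rows per value of col, then express each count as a percentage of all rows in df.
    counts = df.groupby( col, as_index = False ) \
        .count()[ [ col, 'dt' ] ] \
        .rename( columns = { 'dt': 'count' } ) \
        .sort_values( [ 'count' ], ascending = False )
    counts[ 'percent' ] = counts[ 'count' ] / len( df ) * 100
    return counts
# For example, this reproduces new_purged_by_lp:
breakdown( new_purged, 'landingpage' )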