"""
Data used here was generated with the following Hive query:
SELECT
old.dt,
old.ip,
old.cache_status,
old.http_status,
old.http_method,
old.uri_host,
old.uri_path,
old.uri_query,
old.content_type,
old.referer,
old.x_forwarded_for,
old.user_agent,
old.accept_language,
old.x_analytics,
old.range,
old.is_pageview,
old.record_version,
old.client_ip,
old.geocoded_data,
old.x_cache,
old.user_agent_map,
old.x_analytics_map,
old.ts,
old.access_method,
old.agent_type,
old.is_zero,
old.referer_class,
old.normalized_host,
old.pageview_info,
old.page_id,
old.namespace_id,
old.tags,
old.isp_data,
old.random_campaign AS old_random_campaign,
new.random_campaign AS new_random_campaign
FROM
( SELECT
*,
regexp_extract( uri_query, 'randomcampaign%22%3A([0-9.]*)', 1 ) AS random_campaign
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 5
AND uri_query LIKE "%CentralNoticeImpression%"
AND uri_path = '/beacon/event'
AND dt >= '2019-11-05T01:00:00Z'
AND dt <= '2019-11-05T22:59:59Z'
AND uri_query LIKE '%WMDE_Authors_Campaign_2019_Austria_Switzerland%' ) new
RIGHT JOIN
( SELECT
*,
parse_url( concat( 'http://bla.org/woo/', uri_query ), 'QUERY', 'randomcampaign' ) AS random_campaign
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 5
AND uri_path = '/beacon/impression'
AND dt >= '2019-11-05T01:00:00Z'
AND dt <= '2019-11-05T22:59:59Z'
AND uri_query LIKE '%WMDE_Authors_Campaign_2019_Austria_Switzerland%' ) old
ON
( new.random_campaign = old.random_campaign );
"""
# Imports
import re
import pandas as pd
from urllib import parse
import numpy as np
import json
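# The Hive query above extracts randomcampaign from uri_query in two different ways: in the new
# pipeline the value sits inside a URL-encoded JSON event payload (so '%22%3A' is the encoded
# '":"' and regexp_extract is used), while in the old /beacon/impression pipeline it is a plain
# query-string parameter (so parse_url works). A minimal sketch of the same two extractions in
# Python; the example uri_query values below are hypothetical, for illustration only.
example_new_uri_query = '?%7B%22event%22%3A%7B%22randomcampaign%22%3A0.123456%7D%7D'
example_old_uri_query = '?banner=B19_WMDE_test&randomcampaign=0.123456'
(
    re.search( 'randomcampaign%22%3A([0-9.]*)', example_new_uri_query ).group( 1 ),
    parse.parse_qs( example_old_uri_query.lstrip( '?' ) )[ 'randomcampaign' ][ 0 ]
)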
# Load data into a dataframe
auth_c_merge_old = pd.read_csv( 'T236834_auth_c_merge_1-23h.tsv', sep='\t' )
# Expand the Hive user_agent_map and isp_data map columns (JSON strings in the TSV export) into pandas columns
auth_c_merge_old = auth_c_merge_old.join(
    pd.json_normalize( auth_c_merge_old.user_agent_map.apply( json.loads ) ) )
auth_c_merge_old = auth_c_merge_old.join(
    pd.json_normalize( auth_c_merge_old.isp_data.apply( json.loads ) ) )
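# json_normalize turns each JSON string into a row of flat columns, which join() appends to the
# frame; the os_family and isp columns used below come from user_agent_map and isp_data
# respectively. A small sketch with a hypothetical user_agent_map value, for illustration only:
pd.json_normalize( [ json.loads( '{"os_family": "Android", "browser_family": "Chrome Mobile"}' ) ] )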
# Get a dataframe with only orphaned events (old-pipeline rows with no matching new-pipeline event)
auth_c_merge_old_orphaned = auth_c_merge_old[ auth_c_merge_old[ 'new_random_campaign' ].isna() ]
# General stats about orphans
total = len( auth_c_merge_old )
orphaned_total = len( auth_c_merge_old_orphaned )
(
"Total old pipeline events: " + str( total ),
"Orphaned old pipeline events: " + str( orphaned_total ),
"Percent of events orphaned in old pipeline: " + str( orphaned_total * 100 / total )
)
# Compare average number of hits per client ip
overall_average_ip_hits = auth_c_merge_old.groupby( 'client_ip', as_index = False ) \
.count()[ [ 'client_ip', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } )[ 'count' ].mean()
orphaned_average_ip_hits = auth_c_merge_old_orphaned.groupby( 'client_ip', as_index = False ) \
.count()[ [ 'client_ip', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } )[ 'count' ].mean()
(
'Overall average ip hits: ' + str( overall_average_ip_hits ),
'Orphaned average ip hits: ' + str( orphaned_average_ip_hits ),
)
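# Note: the groupby/count/rename pattern above just counts rows per client_ip (assuming dt is
# never null), so an equivalent, shorter formulation would be, for example:
# auth_c_merge_old[ 'client_ip' ].value_counts().mean()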
# Overall top OS families
auth_c_merge_old_by_os = auth_c_merge_old.groupby( 'os_family', as_index = False ) \
.count()[ [ 'os_family', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
auth_c_merge_old_by_os[ 'percent' ] = auth_c_merge_old_by_os[ 'count' ].apply(
lambda x: ( x / len( auth_c_merge_old ) ) * 100
)
auth_c_merge_old_by_os[ [ 'os_family', 'percent' ] ]
# Orphaned top OS families
auth_c_merge_old_orphaned_by_os = auth_c_merge_old_orphaned.groupby( 'os_family', as_index = False ) \
.count()[ [ 'os_family', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
auth_c_merge_old_orphaned_by_os[ 'percent' ] = auth_c_merge_old_orphaned_by_os[ 'count' ].apply(
lambda x: ( x / len( auth_c_merge_old_orphaned ) ) * 100
)
auth_c_merge_old_orphaned_by_os[ [ 'os_family', 'percent' ] ]
# Overall top ISP
auth_c_merge_old_by_isp = auth_c_merge_old.groupby( 'isp', as_index = False ) \
.count()[ [ 'isp', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
auth_c_merge_old_by_isp[ 'percent' ] = auth_c_merge_old_by_isp[ 'count' ].apply(
lambda x: ( x / len( auth_c_merge_old ) ) * 100
)
auth_c_merge_old_by_isp.head( 10 )[ [ 'isp', 'percent' ] ]
# Orphaned top ISP
auth_c_merge_old_orphaned_by_isp = auth_c_merge_old_orphaned.groupby( 'isp', as_index = False ) \
.count()[ [ 'isp', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
auth_c_merge_old_orphaned_by_isp[ 'percent' ] = auth_c_merge_old_orphaned_by_isp[ 'count' ].apply(
lambda x: ( x / len( auth_c_merge_old_orphaned ) ) * 100
)
auth_c_merge_old_orphaned_by_isp.head( 10 )[ [ 'isp', 'percent' ] ]
# Compare average number of hits per referer
overall_average_referer_hits = auth_c_merge_old.groupby( 'referer', as_index = False ) \
    .count()[ [ 'referer', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } )[ 'count' ].mean()
orphaned_average_referer_hits = auth_c_merge_old_orphaned.groupby( 'referer', as_index = False ) \
    .count()[ [ 'referer', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } )[ 'count' ].mean()
(
    'Overall average referer hits: ' + str( overall_average_referer_hits ),
    'Orphaned average referer hits: ' + str( orphaned_average_referer_hits ),
)
# Compare average percentage of hits from GET instead of POST
overall_get_hits_percentage = len( auth_c_merge_old[ auth_c_merge_old[ 'http_method' ] == 'GET' ] ) * 100 / total
orphaned_get_hits_percentage = len( auth_c_merge_old_orphaned[ auth_c_merge_old_orphaned[ 'http_method' ] == 'GET' ] ) * 100 / orphaned_total
(
'Overall percent GETs: ' + str( overall_get_hits_percentage ),
    'Orphaned percent GETs: ' + str( orphaned_get_hits_percentage ),
)