In [ ]:
"""
Data used here was generated with the following Hive query:

SELECT
  old.dt,
  old.ip,
  old.cache_status,
  old.http_status,
  old.http_method,
  old.uri_host,
  old.uri_path,
  old.uri_query,
  old.content_type,
  old.referer,
  old.x_forwarded_for,
  old.user_agent,
  old.accept_language,
  old.x_analytics,
  old.range,
  old.is_pageview,
  old.record_version,
  old.client_ip,
  old.geocoded_data,
  old.x_cache,
  old.user_agent_map,
  old.x_analytics_map,
  old.ts,
  old.access_method,
  old.agent_type,
  old.is_zero,
  old.referer_class,
  old.normalized_host,
  old.pageview_info,
  old.page_id,
  old.namespace_id,
  old.tags,
  old.isp_data,
  old.random_campaign AS old_random_campaign,
  new.random_campaign AS new_random_campaign
FROM
  ( SELECT
    *,
    regexp_extract( uri_query, 'randomcampaign%22%3A([0-9.]*)', 1 ) AS random_campaign
  FROM
    wmf.webrequest
  WHERE
    year = 2019
    AND month = 11
    AND day = 5
    AND uri_query LIKE "%CentralNoticeImpression%"
    AND uri_path = '/beacon/event'
    AND dt >= '2019-11-05T01:00:00Z'
    AND dt <= '2019-11-05T22:59:59Z'
    AND uri_query LIKE '%WMDE_Authors_Campaign_2019_Austria_Switzerland%' ) new
RIGHT JOIN
  ( SELECT
    *,
    parse_url( concat( 'http://bla.org/woo/', uri_query ), 'QUERY', 'randomcampaign' ) AS random_campaign
  FROM
    wmf.webrequest
  WHERE
    year = 2019
    AND month = 11
    AND day = 5
    AND uri_path = '/beacon/impression'
    AND dt >= '2019-11-05T01:00:00Z'
    AND dt <= '2019-11-05T22:59:59Z'
    AND uri_query LIKE '%WMDE_Authors_Campaign_2019_Austria_Switzerland%' ) old
ON
  ( new.random_campaign = old.random_campaign );
"""
In [1]:
# Imports

import re
import pandas as pd
from urllib import parse
import numpy as np
import json
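In [ ]:
# Illustrative sketch only (not run against the data): the Hive query above
# recovers the shared "randomcampaign" value from uri_query in two ways, and
# the Python equivalents below use the re and urllib.parse imports from the
# previous cell. The query strings here are made up, not real webrequest rows.

# New pipeline (/beacon/event): the value sits URL-encoded inside the event
# JSON, so it is pulled out with a regex, as regexp_extract() does above.
new_uri_query = '%7B%22randomcampaign%22%3A0.123456%2C%22banner%22%3A%22example%22%7D'  # hypothetical
re.search( r'randomcampaign%22%3A([0-9.]*)', new_uri_query ).group( 1 )

# Old pipeline (/beacon/impression): the value is a plain query parameter, so
# Hive's parse_url() above, or Python's parse.parse_qs() here, reads it directly.
old_uri_query = 'randomcampaign=0.123456&banner=example'  # hypothetical
parse.parse_qs( old_uri_query )[ 'randomcampaign' ][ 0 ]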
In [2]:
# Load data into a dataframe

auth_c_merge_old = pd.read_csv( 'T236834_auth_c_merge_1-23h.tsv', sep='\t' )
In [3]:
# Add some pandas columns based on Hive UA and ISP map columns

auth_c_merge_old = auth_c_merge_old.join(
    pd.io.json.json_normalize( auth_c_merge_old.user_agent_map.apply( json.loads ) ) )

auth_c_merge_old = auth_c_merge_old.join(
    pd.io.json.json_normalize( auth_c_merge_old.isp_data.apply( json.loads ) ) )
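In [ ]:
# Illustrative sketch of what the json_normalize() joins above add: each JSON
# string becomes a set of flat columns. The record below is made up; of the
# resulting columns, only os_family (from user_agent_map) and isp (from
# isp_data) are relied on later in this notebook.
example_ua_map = json.loads( '{"os_family": "Windows", "browser_family": "Chrome"}' )  # hypothetical record
pd.io.json.json_normalize( example_ua_map )
# -> a one-row dataframe with columns os_family and browser_family, joined
#    back onto auth_c_merge_old by index in the cell above.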
In [4]:
# Get a dataframe with only orphaned old-pipeline events (no matching new-pipeline event)

auth_c_merge_old_orphaned = auth_c_merge_old[ auth_c_merge_old[ 'new_random_campaign' ].isna() ]
In [5]:
# General stats about orphans

total = len( auth_c_merge_old )
orphaned_total = len( auth_c_merge_old_orphaned )

(
    "Total old pipeline events: " + str( total ),
    "Orphaned old pipeline events: " + str( orphaned_total ),
    "Percent of events orphaned in old pipeline: " + str( orphaned_total * 100 / total  )
)
Out[5]:
('Total old pipeline events: {big number omitted}',
 'Orphaned old pipeline events: {less big number omitted}',
 'Percent of events orphaned in old pipeline: 11.38056693218098')
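In [ ]:
# Equivalent cross-check for the orphan rate above (sketch, not re-run here):
# the mean of the boolean isna() mask is the orphaned fraction.
auth_c_merge_old[ 'new_random_campaign' ].isna().mean() * 100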
In [6]:
# Compare average number of hits per client ip

overall_average_ip_hits = auth_c_merge_old.groupby( 'client_ip', as_index = False ) \
    .count()[ [ 'client_ip', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } )[ 'count' ].mean()

orphaned_average_ip_hits = auth_c_merge_old_orphaned.groupby( 'client_ip', as_index = False ) \
    .count()[ [ 'client_ip', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } )[ 'count' ].mean()

(
    'Overall average ip hits: ' + str( overall_average_ip_hits ),
    'Orphaned average ip hits: ' + str( orphaned_average_ip_hits ),
)
Out[6]:
('Overall average ip hits: 1.245391381179967',
 'Orphaned average ip hits: 1.1697050049718263')
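In [ ]:
# Equivalent, more direct form of the per-IP averages above (sketch, not
# re-run here): value_counts() gives the number of hits per client_ip, and
# its mean is the average.
(
    auth_c_merge_old[ 'client_ip' ].value_counts().mean(),
    auth_c_merge_old_orphaned[ 'client_ip' ].value_counts().mean(),
)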
In [9]:
# Overall top OS families

auth_c_merge_old_by_os = auth_c_merge_old.groupby( 'os_family', as_index = False ) \
    .count()[ [ 'os_family', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

auth_c_merge_old_by_os[ 'percent' ] = auth_c_merge_old_by_os[ 'count' ].apply(
    lambda x: ( x / len( auth_c_merge_old ) ) * 100
)

auth_c_merge_old_by_os[ [ 'os_family', 'percent' ] ]
Out[9]:
        os_family    percent
 9        Windows  38.966107
 1        Android  27.840304
11            iOS  23.489955
 6       Mac OS X   8.813570
 8         Ubuntu   0.393434
 5          Linux   0.328937
 2      Chrome OS   0.090296
10  Windows Phone   0.032249
 3         Fedora   0.029024
 7          Tizen   0.009675
 0              -   0.003225
 4        FreeBSD   0.003225
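In [ ]:
# The groupby/count/percent pattern in the cell above is repeated below for
# orphaned OS families and for ISPs. A hypothetical helper like this one (an
# illustrative refactor, not used by the cells that follow) captures the
# shared logic.
def breakdown_by( df, column ):
    counts = df.groupby( column, as_index = False ) \
        .count()[ [ column, 'dt' ] ] \
        .rename( columns = { 'dt': 'count' } ) \
        .sort_values( [ 'count' ], ascending = False )
    counts[ 'percent' ] = counts[ 'count' ] * 100 / len( df )
    return counts[ [ column, 'percent' ] ]

# e.g. breakdown_by( auth_c_merge_old, 'os_family' ) should reproduce Out[9].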
In [10]:
# Orphaned top OS families

auth_c_merge_old_orphaned_by_os = auth_c_merge_old_orphaned.groupby( 'os_family', as_index = False ) \
    .count()[ [ 'os_family', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

auth_c_merge_old_orphaned_by_os[ 'percent' ] = auth_c_merge_old_orphaned_by_os[ 'count' ].apply(
    lambda x: ( x / len( auth_c_merge_old_orphaned ) ) * 100
)

auth_c_merge_old_orphaned_by_os[ [ 'os_family', 'percent' ] ]
Out[10]:
        os_family    percent
 9        Windows  63.700765
 1        Android  17.257013
11            iOS  11.079626
 6       Mac OS X   6.120714
 8         Ubuntu   0.906772
 5          Linux   0.595069
 3         Fedora   0.170020
10  Windows Phone   0.056673
 0              -   0.028337
 2      Chrome OS   0.028337
 4        FreeBSD   0.028337
 7          Tizen   0.028337
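In [ ]:
# Side-by-side view of the two OS breakdowns above (sketch, not re-run here):
# merging on os_family lines up the overall and orphaned shares, which makes
# the Windows skew among orphans easier to read off.
auth_c_merge_old_by_os[ [ 'os_family', 'percent' ] ].merge(
    auth_c_merge_old_orphaned_by_os[ [ 'os_family', 'percent' ] ],
    on = 'os_family',
    suffixes = ( '_overall', '_orphaned' )
)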
In [11]:
# Overall top ISP

auth_c_merge_old_by_isp = auth_c_merge_old.groupby( 'isp', as_index = False ) \
    .count()[ [ 'isp', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

auth_c_merge_old_by_isp[ 'percent' ] = auth_c_merge_old_by_isp[ 'count' ].apply(
    lambda x: ( x / len( auth_c_merge_old ) ) * 100
)

auth_c_merge_old_by_isp.head( 10 )[ [ 'isp', 'percent' ] ]
Out[11]:
                                 isp    percent
  3           A1 Telekom Austria  19.104131
428                     Swisscom  15.924409
433        T-Mobile Austria GmbH  12.283531
192  Hutchison Drei Austria GmbH   8.426586
421    Sunrise Communications AG   4.485794
474                  UPC Schweiz   3.776323
473                  UPC Austria   3.531233
  2                    A1 Mobile   2.205811
368               Salt Mobile SA   1.738205
  4                       ACONET   1.676933
In [12]:
# Orphaned top ISP

auth_c_merge_old_orphaned_by_isp = auth_c_merge_old_orphaned.groupby( 'isp', as_index = False ) \
    .count()[ [ 'isp', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } ) \
    .sort_values( [ 'count' ], ascending = False )

auth_c_merge_old_orphaned_by_isp[ 'percent' ] = auth_c_merge_old_orphaned_by_isp[ 'count' ].apply(
    lambda x: ( x / len( auth_c_merge_old_orphaned ) ) * 100
)

auth_c_merge_old_orphaned_by_isp.head( 10 )[ [ 'isp', 'percent' ] ]
Out[12]:
                                 isp    percent
  1           A1 Telekom Austria  18.248796
189                     Swisscom  15.046755
192        T-Mobile Austria GmbH  10.909606
 87  Hutchison Drei Austria GmbH   6.007368
184    Sunrise Communications AG   3.627090
212                  UPC Austria   3.570417
213                  UPC Schweiz   3.258714
  2                       ACONET   1.756872
  0                    A1 Mobile   1.586852
268               kabelplus GmbH   1.445169
In [13]:
# Compare average number of hits per referer

overall_average_referer_hits = auth_c_merge_old.groupby( 'referer', as_index = False ) \
    .count()[ [ 'referer', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } )[ 'count' ].mean()

orphaned_average_referer_hits = auth_c_merge_old_orphaned.groupby( 'referer', as_index = False ) \
    .count()[ [ 'referer', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } )[ 'count' ].mean()

(
    'Overall average referer hits: ' + str( overall_average_referer_hits ),
    'Orphaned average referer hits: ' + str( orphaned_average_referer_hits ),
)
Out[13]:
('Overall average referer hits: 1.1968427959396348',
 'Orphaned average referer hits: 1.1073109507373706')
In [14]:
# Compare the percentage of hits made with GET instead of POST

overall_get_hits_percentage = len( auth_c_merge_old[ auth_c_merge_old[ 'http_method' ] == 'GET' ] ) * 100 / total

orphaned_get_hits_percentage = len( auth_c_merge_old_orphaned[ auth_c_merge_old_orphaned[ 'http_method' ] == 'GET' ] ) * 100 / orphaned_total

(
    'Overall percent GETs: ' + str( overall_get_hits_percentage ),
    'Orphaned percent GETs: ' + str( orphaned_get_hits_percentage ),
)
Out[14]:
('Overall percent GETs: 10.23896288174401',
 'Orphaned percent GETs: 27.628223292717482')
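In [ ]:
# Equivalent form of the GET percentages above (sketch, not re-run here):
# the mean of the boolean http_method == 'GET' mask is the GET fraction.
(
    ( auth_c_merge_old[ 'http_method' ] == 'GET' ).mean() * 100,
    ( auth_c_merge_old_orphaned[ 'http_method' ] == 'GET' ).mean() * 100,
)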