"""
Data used here was generated with the following Hive query:
SELECT
old.dt,
old.ip,
old.cache_status,
old.http_status,
old.http_method,
old.uri_host,
old.uri_path,
old.uri_query,
old.content_type,
old.referer,
old.x_forwarded_for,
old.user_agent,
old.accept_language,
old.x_analytics,
old.range,
old.is_pageview,
old.record_version,
old.client_ip,
old.geocoded_data,
old.x_cache,
old.user_agent_map,
old.x_analytics_map,
old.ts,
old.access_method,
old.agent_type,
old.is_zero,
old.referer_class,
old.normalized_host,
old.pageview_info,
old.page_id,
old.namespace_id,
old.tags,
old.isp_data,
old.random_campaign AS old_random_campaign,
new.random_campaign AS new_random_campaign
FROM
( SELECT
*,
regexp_extract( uri_query, 'randomcampaign%22%3A([0-9.]*)', 1 ) AS random_campaign
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 5
AND uri_query LIKE "%CentralNoticeImpression%"
AND uri_path = '/beacon/event'
AND dt >= '2019-11-05T01:00:00Z'
AND dt <= '2019-11-05T22:59:59Z'
AND uri_query LIKE '%WMDE_Authors_Campaign_2019_Austria_Switzerland%' ) new
RIGHT JOIN
( SELECT
*,
parse_url( concat( 'http://bla.org/woo/', uri_query ), 'QUERY', 'randomcampaign' ) AS random_campaign
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 5
AND uri_path = '/beacon/impression'
AND dt >= '2019-11-05T01:00:00Z'
AND dt <= '2019-11-05T22:59:59Z'
AND uri_query LIKE '%WMDE_Authors_Campaign_2019_Austria_Switzerland%' ) old
ON
( new.random_campaign = old.random_campaign );
"""
# Imports
import re
import pandas as pd
from urllib import parse
import numpy as np
import json
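# The Hive query above extracts randomcampaign from uri_query in two different ways: in the new
# pipeline the value sits inside a URL-encoded JSON event payload (so '%22%3A' is the encoded
# '":"' and regexp_extract is used), while in the old /beacon/impression pipeline it is a plain
# query-string parameter (so parse_url works). A minimal sketch of the same two extractions in
# Python; the example uri_query values below are hypothetical, for illustration only.
example_new_uri_query = '?%7B%22event%22%3A%7B%22randomcampaign%22%3A0.123456%7D%7D'
example_old_uri_query = '?banner=B19_WMDE_test&randomcampaign=0.123456'
(
    re.search( 'randomcampaign%22%3A([0-9.]*)', example_new_uri_query ).group( 1 ),
    parse.parse_qs( example_old_uri_query.lstrip( '?' ) )[ 'randomcampaign' ][ 0 ]
)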
# Load data into a dataframe
auth_c_merge_old = pd.read_csv( 'T236834_auth_c_merge_1-23h.tsv', sep='\t' )
# Expand the Hive user_agent_map and isp_data map columns (JSON strings in the TSV export) into pandas columns
auth_c_merge_old = auth_c_merge_old.join(
    pd.json_normalize( auth_c_merge_old.user_agent_map.apply( json.loads ) ) )
auth_c_merge_old = auth_c_merge_old.join(
    pd.json_normalize( auth_c_merge_old.isp_data.apply( json.loads ) ) )
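# json_normalize turns each JSON string into a row of flat columns, which join() appends to the
# frame; the os_family and isp columns used below come from user_agent_map and isp_data
# respectively. A small sketch with a hypothetical user_agent_map value, for illustration only:
pd.json_normalize( [ json.loads( '{"os_family": "Android", "browser_family": "Chrome Mobile"}' ) ] )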
# Get a dataframe with only orphaned events (old-pipeline rows with no matching new-pipeline event)
auth_c_merge_old_orphaned = auth_c_merge_old[ auth_c_merge_old[ 'new_random_campaign' ].isna() ]
# General stats about orphans
total = len( auth_c_merge_old )
orphaned_total = len( auth_c_merge_old_orphaned )
(
"Total old pipeline events: " + str( total ),
"Orphaned old pipeline events: " + str( orphaned_total ),
"Percent of events orphaned in old pipeline: " + str( orphaned_total * 100 / total )
)
# Compare average number of hits per client ip
overall_average_ip_hits = auth_c_merge_old.groupby( 'client_ip', as_index = False ) \
.count()[ [ 'client_ip', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } )[ 'count' ].mean()
orphaned_average_ip_hits = auth_c_merge_old_orphaned.groupby( 'client_ip', as_index = False ) \
.count()[ [ 'client_ip', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } )[ 'count' ].mean()
(
'Overall average ip hits: ' + str( overall_average_ip_hits ),
'Orphaned average ip hits: ' + str( orphaned_average_ip_hits ),
)
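# Note: the groupby/count/rename pattern above just counts rows per client_ip (assuming dt is
# never null), so an equivalent, shorter formulation would be, for example:
# auth_c_merge_old[ 'client_ip' ].value_counts().mean()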
# Overall top OS families
auth_c_merge_old_by_os = auth_c_merge_old.groupby( 'os_family', as_index = False ) \
.count()[ [ 'os_family', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
auth_c_merge_old_by_os[ 'percent' ] = auth_c_merge_old_by_os[ 'count' ].apply(
lambda x: ( x / len( auth_c_merge_old ) ) * 100
)
auth_c_merge_old_by_os[ [ 'os_family', 'percent' ] ]
# Orphaned top OS families
auth_c_merge_old_orphaned_by_os = auth_c_merge_old_orphaned.groupby( 'os_family', as_index = False ) \
.count()[ [ 'os_family', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
auth_c_merge_old_orphaned_by_os[ 'percent' ] = auth_c_merge_old_orphaned_by_os[ 'count' ].apply(
lambda x: ( x / len( auth_c_merge_old_orphaned ) ) * 100
)
auth_c_merge_old_orphaned_by_os[ [ 'os_family', 'percent' ] ]
# Overall top ISP
auth_c_merge_old_by_isp = auth_c_merge_old.groupby( 'isp', as_index = False ) \
.count()[ [ 'isp', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
auth_c_merge_old_by_isp[ 'percent' ] = auth_c_merge_old_by_isp[ 'count' ].apply(
lambda x: ( x / len( auth_c_merge_old ) ) * 100
)
auth_c_merge_old_by_isp.head( 10 )[ [ 'isp', 'percent' ] ]
# Orphaned top ISP
auth_c_merge_old_orphaned_by_isp = auth_c_merge_old_orphaned.groupby( 'isp', as_index = False ) \
.count()[ [ 'isp', 'dt' ] ] \
.rename( columns = { 'dt': 'count' } ) \
.sort_values( [ 'count' ], ascending = False )
auth_c_merge_old_orphaned_by_isp[ 'percent' ] = auth_c_merge_old_orphaned_by_isp[ 'count' ].apply(
lambda x: ( x / len( auth_c_merge_old_orphaned ) ) * 100
)
auth_c_merge_old_orphaned_by_isp.head( 10 )[ [ 'isp', 'percent' ] ]
# Compare average number of hits per referer
overall_average_referer_hits = auth_c_merge_old.groupby( 'referer', as_index = False ) \
    .count()[ [ 'referer', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } )[ 'count' ].mean()
orphaned_average_referer_hits = auth_c_merge_old_orphaned.groupby( 'referer', as_index = False ) \
    .count()[ [ 'referer', 'dt' ] ] \
    .rename( columns = { 'dt': 'count' } )[ 'count' ].mean()
(
    'Overall average referer hits: ' + str( overall_average_referer_hits ),
    'Orphaned average referer hits: ' + str( orphaned_average_referer_hits ),
)
# Compare average percentage of hits from GET instead of POST
overall_get_hits_percentage = len( auth_c_merge_old[ auth_c_merge_old[ 'http_method' ] == 'GET' ] ) * 100 / total
orphaned_get_hits_percentage = len( auth_c_merge_old_orphaned[ auth_c_merge_old_orphaned[ 'http_method' ] == 'GET' ] ) * 100 / orphaned_total
(
'Overall percent GETs: ' + str( overall_get_hits_percentage ),
    'Orphaned percent GETs: ' + str( orphaned_get_hits_percentage ),
)