from pyspark.sql.types import ArrayType, StringType
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pandas as pd
# Count total CentralNoticeImpression events sent via /beacon/event in wmf.webrequest
# for the 1st window investigated
query = """
SELECT
count(*)
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 5
AND hour >= 20
AND hour <= 23
AND uri_query LIKE "%CentralNoticeImpression%"
AND uri_path = '/beacon/event'
AND dt >= '2019-11-05T21:00:00Z'
AND dt <= '2019-11-05T22:59:59Z'
"""
df = spark.sql( query )
df.show()
# Count total record impression calls sent via /beacon/impression in wmf.webrequest
# for the 1st window investigated
query = """
SELECT
count(*)
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 05
AND hour >= 20
AND hour <= 23
AND uri_path = '/beacon/impression'
AND dt >= '2019-11-05T21:00:00Z'
AND dt <= '2019-11-05T22:59:59Z'
"""
df = spark.sql( query )
df.show()
# Count total CentralNoticeImpression events sent via /beacon/event in wmf.webrequest
# for the 2nd window investigated
query = """
SELECT
count(*)
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 10
AND hour >=10
AND hour <= 19
AND uri_query LIKE "%CentralNoticeImpression%"
AND uri_path = '/beacon/event'
AND dt >= '2019-11-10T11:00:00Z'
AND dt <= '2019-11-10T18:59:59Z'
"""
df = spark.sql( query )
df
df.show()
# Count total record impression calls sent via /beacon/impression in wmf.webrequest
# for the 2nd window investigated
query = """
SELECT
count(*)
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 10
AND hour >=10
AND hour <= 19
AND uri_path = '/beacon/impression'
AND dt >= '2019-11-10T11:00:00Z'
AND dt <= '2019-11-10T18:59:59Z'
"""
df = spark.sql( query )
df.show()
# Count total CentralNoticeImpression events sent via /beacon/event in wmf.webrequest
# for the 3rd window investigated
query = """
SELECT
count(*)
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 20
AND hour = 17
AND uri_query LIKE "%CentralNoticeImpression%"
AND uri_path = '/beacon/event'
AND dt >= '2019-11-20T17:00:00Z'
AND dt <= '2019-11-20T17:59:59Z'
"""
df = spark.sql( query )
df.show()
# Count total record impression calls sent via /beacon/impression in wmf.webrequest
# for the 3rd window investigated
query = """
SELECT
count(*)
FROM
wmf.webrequest
WHERE
year = 2019
AND month = 11
AND day = 20
AND hour = 17
AND uri_path = '/beacon/impression'
AND dt >= '2019-11-20T17:00:00Z'
AND dt <= '2019-11-20T17:59:59Z'
"""
df = spark.sql( query )
df.show()
# Count invalid CentralNoticeImpression events for the 1st window
# 21:00:00-22:59:59
query = """
SELECT
count(*)
FROM
event.eventerror
WHERE
year = 2019
AND month = 11
AND day = 5
AND hour >= 21
AND hour <= 22
AND event.schema = 'CentralNoticeImpression'
"""
df = spark.sql( query )
df.show()