In [1]:
from pyspark.sql.types import ArrayType, StringType

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

import pandas as pd

# Every cell below calls `spark.sql(...)`, but no session is created in this
# file — presumably the hosting environment pre-provisions `spark`.
# getOrCreate() is backward-compatible: it returns the existing session if one
# is already active, and builds a fresh one otherwise, so the notebook also
# works on a cold kernel (Restart Kernel -> Run All).
spark = SparkSession.builder.getOrCreate()
In [7]:
# Count total CentralNoticeImpression events sent via /beacon/event in wmf.webrequest
# for the 1st window investigated (2019-11-05 21:00:00-22:59:59).
#
# The hour partition filter (20-23) is deliberately wider than the dt window;
# the dt predicate does the precise filtering. The LIKE pattern now uses single
# quotes, consistent with every other string literal in this notebook — double
# quotes are parsed as identifiers under Spark's ANSI SQL mode.

query = """
SELECT
  count(*)
FROM
  wmf.webrequest
WHERE
  year = 2019
  AND month = 11
  AND day = 5
  AND hour >= 20
  AND hour <= 23
  AND uri_query LIKE '%CentralNoticeImpression%'
  AND uri_path = '/beacon/event'
  AND dt >= '2019-11-05T21:00:00Z'
  AND dt <= '2019-11-05T22:59:59Z'
"""

df = spark.sql(query)
df.show()
+--------+
|count(1)|
+--------+
|  347720|
+--------+

In [8]:
# Count total record impression calls sent via /beacon/impression in wmf.webrequest
# for the 1st window investigated (2019-11-05 21:00:00-22:59:59).
#
# `day = 05` normalized to `day = 5` for consistency with the companion
# /beacon/event query for this same window (leading-zero integer literals are
# easy to misread and inconsistent with the rest of the notebook).

query = """
SELECT
  count(*)
FROM
  wmf.webrequest
WHERE
  year = 2019
  AND month = 11
  AND day = 5
  AND hour >= 20
  AND hour <= 23
  AND uri_path = '/beacon/impression'
  AND dt >= '2019-11-05T21:00:00Z'
  AND dt <= '2019-11-05T22:59:59Z'
"""

df = spark.sql(query)
df.show()
+--------+
|count(1)|
+--------+
|   91239|
+--------+

In [3]:
# Count total CentralNoticeImpression events sent via /beacon/event in wmf.webrequest
# for the 2nd window investigated (2019-11-10 11:00:00-18:59:59).
#
# LIKE pattern switched to single quotes and `hour >=10` spacing normalized,
# consistent with the other queries in this notebook (double quotes are parsed
# as identifiers under Spark's ANSI SQL mode).

query = """
SELECT
  count(*)
FROM
  wmf.webrequest
WHERE
  year = 2019
  AND month = 11
  AND day = 10
  AND hour >= 10
  AND hour <= 19
  AND uri_query LIKE '%CentralNoticeImpression%'
  AND uri_path = '/beacon/event'
  AND dt >= '2019-11-10T11:00:00Z'
  AND dt <= '2019-11-10T18:59:59Z'
"""

df = spark.sql(query)

# Bare expression displays the DataFrame repr (schema only, no job is run);
# the count itself is materialized by df.show() in the next cell.
df
Out[3]:
DataFrame[count(1): bigint]
In [5]:
# Materialize and display the count from the query defined in the previous
# cell (depends on `df` assigned there — this cell cannot run standalone).
df.show()
+--------+
|count(1)|
+--------+
|  184572|
+--------+

In [6]:
# Count total record impression calls sent via /beacon/impression
# in wmf.webrequest for the 2nd window investigated
# (2019-11-10 11:00:00-18:59:59; hour partitions pruned a bit wider,
# the dt predicate does the precise filtering).

query = """
SELECT
  count(*)
FROM
  wmf.webrequest
WHERE
  year = 2019
  AND month = 11
  AND day = 10
  AND hour >=10
  AND hour <= 19
  AND uri_path = '/beacon/impression'
  AND dt >= '2019-11-10T11:00:00Z'
  AND dt <= '2019-11-10T18:59:59Z'
"""

df = spark.sql(query)
df.show()
+--------+
|count(1)|
+--------+
|  429539|
+--------+

In [9]:
# Count total CentralNoticeImpression events sent via /beacon/event in wmf.webrequest
# for the 3rd window investigated (2019-11-20 17:00:00-17:59:59).
#
# LIKE pattern switched to single quotes, consistent with the other string
# literals in this notebook (double quotes are parsed as identifiers under
# Spark's ANSI SQL mode).

query = """
SELECT
  count(*)
FROM
  wmf.webrequest
WHERE
  year = 2019
  AND month = 11
  AND day = 20
  AND hour = 17
  AND uri_query LIKE '%CentralNoticeImpression%'
  AND uri_path = '/beacon/event'
  AND dt >= '2019-11-20T17:00:00Z'
  AND dt <= '2019-11-20T17:59:59Z'
"""

df = spark.sql(query)
df.show()
+--------+
|count(1)|
+--------+
|   64339|
+--------+

In [10]:
# Count total record impression calls sent via /beacon/impression
# in wmf.webrequest for the 3rd window investigated
# (2019-11-20 17:00:00-17:59:59; a single hour partition, so the hour
# filter and the dt predicate cover the same span).

query = """
SELECT
  count(*)
FROM
  wmf.webrequest
WHERE
  year = 2019
  AND month = 11
  AND day = 20
  AND hour = 17
  AND uri_path = '/beacon/impression'
  AND dt >= '2019-11-20T17:00:00Z'
  AND dt <= '2019-11-20T17:59:59Z'
"""

df = spark.sql(query)
df.show()
+--------+
|count(1)|
+--------+
| 1837888|
+--------+

In [12]:
# Count invalid CentralNoticeImpression events for the 1st window
# (2019-11-05 21:00:00-22:59:59) by querying the EventLogging error
# table; hour partitions 21-22 exactly cover that window.

query = """
SELECT
  count(*)
FROM
  event.eventerror
WHERE
  year = 2019
  AND month = 11
  AND day = 5
  AND hour >= 21
  AND hour <= 22
  AND event.schema = 'CentralNoticeImpression'
"""

df = spark.sql(query)
df.show()
+--------+
|count(1)|
+--------+
|       1|
+--------+