{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Constructing reading patterns of new users\n",
"\n",
"**Aim**:\n",
"\n",
"For a given time-window we construct reading sessions in which users register a new account.\n",
"\n",
"**How**:\n",
"\n",
"We identify unique users by a standard fingerprint-technique by hashing client_ip and user_agent for desktop (and potentially mobile) users.\n",
"\n",
"We first extract all users that made at least 1 request to a 'Special:CreateAccount' page.\n",
"\n",
"We then filter those users for which the sesssion followed a pattern which suggests a registration-event:\n",
"- visit the create-account page\n",
"- not being logged in while visiting this page\n",
"- being logged in when visiting the next page\n",
"\n",
"\n",
"For an example-case (1 hour):\n",
"- there are 94 users that register\n",
"- it takes around 30 seconds to query\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# %load_ext autoreload\n",
"# %autoreload 2\n",
"\n",
"import os, sys\n",
"import numpy as np\n",
"import datetime\n",
"from pyspark.sql import functions as F, types as T, Window\n",
"import calendar\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"date_start = datetime.datetime(2019, 7, 15, 10)\n",
"date_end = datetime.datetime(2019, 7, 15, 11)\n",
"\n",
"\n",
"ts_start = calendar.timegm(date_start.timetuple())\n",
"ts_end = calendar.timegm(date_end.timetuple())\n",
"\n",
"row_timestamp = F.unix_timestamp(F.concat(\n",
" F.col('year'), F.lit('-'), F.col('month'), F.lit('-'), F.col('day'), \n",
" F.lit(' '), F.col('hour'), F.lit(':00:00')))\n",
"\n",
"\n",
"# hash for user-fingerprinting\n",
"user_id = F.hash(F.concat(F.col('client_ip'),F.lit('-'),F.col('user_agent'))) ## only client and user\n",
"## seems to be the default way https://meta.wikimedia.org/wiki/Research:Unique_Devices/Other_Possible_Implementations#Fingerprinting\n",
"# user_id = F.hash(F.concat(F.col('client_ip'),F.lit('-'),F.col('user_agent'),F.lit('-'),F.col('accept_language')))\n",
"\n",
"# whether request came from loggIn user: 0/1\n",
"logged_in = F.when(F.col('x_analytics_map.loggedIn')==1,1).otherwise(0)\n",
"\n",
"# whether page == Special:CreateAccount\n",
"page_create_account = F.when(F.col('pageview_info.page_title')=='Special:CreateAccount',1).otherwise(0)\n",
"\n",
"w = Window.partitionBy(F.col('user_id'))\n",
"w_user_ts = Window.partitionBy().orderBy(F.col('user_id'),F.col('ts'))\n",
"\n",
"# maximum number of requests per session of an individual\n",
"n_p_max = 500"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"## dataframe with all webrequest of all users that visited at least once a createaccount page\n",
"df = (\n",
" ## select table\n",
" spark.read.table('wmf.webrequest')\n",
" ## user-hash as user_id\n",
" .withColumn('user_id',user_id)\n",
" .withColumn('logged_in',logged_in)\n",
" .withColumn('page_create_account',page_create_account)\n",
" .withColumn('page_title',F.col('pageview_info.page_title'))\n",
" \n",
" ## select time partition\n",
" .where(row_timestamp >= ts_start)\n",
" .where(row_timestamp < ts_end)\n",
" ## select wiki project\n",
" .where(F.col('normalized_host.project_class') == \"wikipedia\")\n",
" .where(F.col('normalized_host.project') == \"en\")\n",
" ## only requests marked as pageviews\n",
" .where(F.col('is_pageview') == 1)\n",
" ## agent-type\n",
" .where(F.col('agent_type') == \"user\")\n",
" ## user: desktop/mobile/mobile app; isaac filters != mobile app\n",
" .where(F.col('access_method') == \"desktop\")\n",
" ## only traffic from inside wiki\n",
"# .where(F.col('referer_class') == \"internal\")\n",
" ## unclear yet; done by isaac\n",
" .where(F.col('webrequest_source') == 'text')\n",
"\n",
" ## count requests per user\n",
" ## n_pca_by_user: number of requests to page-create-account\n",
" ## n_p_by_user: number of requests to pages\n",
" .withColumn('n_pca_by_user', F.sum( F.col('page_create_account') ).over(w) )\n",
" .withColumn('n_p_by_user', F.count(F.lit(1)).over(w) )\n",
" \n",
" ## filter: \n",
" ## ## user rquested at least 1 page view for create account\n",
" .where(F.col('n_pca_by_user') >= 1 )\n",
" ## ## user requested at most n_p_max pages in total\n",
" .where(F.col('n_p_by_user') <= n_p_max)\n",
"\n",
" .orderBy('user_id','ts')\n",
" ## puts a 1 if next webrequest is also \n",
" .withColumn('logged_in_next', \n",
" F.when( F.col('user_id') == F.lag('user_id',count=-1).over(w_user_ts), F.lag('logged_in',count=-1).over(w_user_ts)).otherwise(0) )\n",
"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"## identify user-sesssions which we identify as account-creation\n",
"## 1) visit the 'Special:CreateAccount' page\n",
"## 2) not being logged in when making that request\n",
"## 3) being logged in when visiting the next page\n",
"\n",
"## for this we make an aggregation into user sessions.\n",
"\n",
"## define an aggregation function on the level of users which we accept as new users\n",
"df_users = (\n",
" df.groupBy(F.col('user_id'))\n",
" .agg( \n",
" ## whether there was one requst to Special:CreateAccount AND next request was logged in.\n",
" F.max(F.when( (F.col('logged_in') == 0) & (F.col('logged_in_next') == 1) & (F.col('page_title')=='Special:CreateAccount'),1).otherwise(0))\n",
" .alias('new_user') \n",
" )\n",
" \n",
" ## filter \n",
" .where(F.col('new_user')==1)\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## use join to filter those users whose reading sessions can be considered as registration events\n",
"df_sessions = df.join(df_users, df['user_id'] == df_users['user_id'] , \"left_semi\" ).select('user_id',\n",
" 'ts',\n",
" 'page_title',\n",
" 'logged_in')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"41.16013836860657\n"
]
}
],
"source": [
"t1 = time.time()\n",
"## execute query -- here: convert to pandas. \n",
"df_sessions = df_sessions.toPandas()\n",
"t2 = time.time()\n",
"print(t2-t1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" ts | \n",
" page_title | \n",
" logged_in | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 496025975 | \n",
" 2019-07-15 10:06:48 | \n",
" Main_Page | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 496025975 | \n",
" 2019-07-15 10:06:52 | \n",
" Portal:Science | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 496025975 | \n",
" 2019-07-15 10:06:54 | \n",
" Portal:Science/Categories_and_Main_topics | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 496025975 | \n",
" 2019-07-15 10:07:02 | \n",
" Category:Chemical_engineering | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 496025975 | \n",
" 2019-07-15 10:07:32 | \n",
" Category:Artificial_intelligence | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id ts page_title \\\n",
"0 496025975 2019-07-15 10:06:48 Main_Page \n",
"1 496025975 2019-07-15 10:06:52 Portal:Science \n",
"2 496025975 2019-07-15 10:06:54 Portal:Science/Categories_and_Main_topics \n",
"3 496025975 2019-07-15 10:07:02 Category:Chemical_engineering \n",
"4 496025975 2019-07-15 10:07:32 Category:Artificial_intelligence \n",
"\n",
" logged_in \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## get an over on the session\n",
"df_sessions.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"94"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## group the dataframe by users\n",
"df_sessions_by_user = df_sessions.groupby('user_id')\n",
"users = sorted(list(df_sessions_by_user.groups.keys()))\n",
"len(users) ## number of unique users"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" ts | \n",
" page_title | \n",
" logged_in | \n",
"
\n",
" \n",
" \n",
" \n",
" 706 | \n",
" -2142415641 | \n",
" 2019-07-15 10:06:52 | \n",
" Main_Page | \n",
" 0 | \n",
"
\n",
" \n",
" 707 | \n",
" -2142415641 | \n",
" 2019-07-15 10:07:31 | \n",
" Special:CreateAccount | \n",
" 0 | \n",
"
\n",
" \n",
" 708 | \n",
" -2142415641 | \n",
" 2019-07-15 10:07:31 | \n",
" Special:CreateAccount | \n",
" 0 | \n",
"
\n",
" \n",
" 709 | \n",
" -2142415641 | \n",
" 2019-07-15 10:11:11 | \n",
" Special:CreateAccount | \n",
" 0 | \n",
"
\n",
" \n",
" 710 | \n",
" -2142415641 | \n",
" 2019-07-15 10:15:11 | \n",
" Main_Page | \n",
" 1 | \n",
"
\n",
" \n",
" 711 | \n",
" -2142415641 | \n",
" 2019-07-15 10:19:26 | \n",
" Special:Search | \n",
" 1 | \n",
"
\n",
" \n",
" 712 | \n",
" -2142415641 | \n",
" 2019-07-15 10:21:12 | \n",
" Special:Search | \n",
" 1 | \n",
"
\n",
" \n",
" 713 | \n",
" -2142415641 | \n",
" 2019-07-15 10:22:18 | \n",
" Ranganathittu_Bird_Sanctuary | \n",
" 1 | \n",
"
\n",
" \n",
" 714 | \n",
" -2142415641 | \n",
" 2019-07-15 10:22:19 | \n",
" Ranganathittu_Bird_Sanctuary | \n",
" 1 | \n",
"
\n",
" \n",
" 715 | \n",
" -2142415641 | \n",
" 2019-07-15 10:23:37 | \n",
" Special:UserLogout | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id ts page_title logged_in\n",
"706 -2142415641 2019-07-15 10:06:52 Main_Page 0\n",
"707 -2142415641 2019-07-15 10:07:31 Special:CreateAccount 0\n",
"708 -2142415641 2019-07-15 10:07:31 Special:CreateAccount 0\n",
"709 -2142415641 2019-07-15 10:11:11 Special:CreateAccount 0\n",
"710 -2142415641 2019-07-15 10:15:11 Main_Page 1\n",
"711 -2142415641 2019-07-15 10:19:26 Special:Search 1\n",
"712 -2142415641 2019-07-15 10:21:12 Special:Search 1\n",
"713 -2142415641 2019-07-15 10:22:18 Ranganathittu_Bird_Sanctuary 1\n",
"714 -2142415641 2019-07-15 10:22:19 Ranganathittu_Bird_Sanctuary 1\n",
"715 -2142415641 2019-07-15 10:23:37 Special:UserLogout 0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## session of an individual user (change index)\n",
"df_sessions_by_user.get_group(users[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark - YARN",
"language": "python",
"name": "spark_yarn_pyspark"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}