{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Constructing reading patterns of new users\n", "\n", "**Aim**:\n", "\n", "For a given time-window we construct reading sessions in which users register a new account.\n", "\n", "**How**:\n", "\n", "We identify unique users by a standard fingerprint-technique by hashing client_ip and user_agent for desktop (and potentially mobile) users.\n", "\n", "We first extract all users that made at least 1 request to a 'Special:CreateAccount' page.\n", "\n", "We then filter those users for which the sesssion followed a pattern which suggests a registration-event:\n", "- visit the create-account page\n", "- not being logged in while visiting this page\n", "- being logged in when visiting the next page\n", "\n", "\n", "For an example-case (1 hour):\n", "- there are 94 users that register\n", "- it takes around 30 seconds to query\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# %load_ext autoreload\n", "# %autoreload 2\n", "\n", "import os, sys\n", "import numpy as np\n", "import datetime\n", "from pyspark.sql import functions as F, types as T, Window\n", "import calendar\n", "import time" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "date_start = datetime.datetime(2019, 7, 15, 10)\n", "date_end = datetime.datetime(2019, 7, 15, 11)\n", "\n", "\n", "ts_start = calendar.timegm(date_start.timetuple())\n", "ts_end = calendar.timegm(date_end.timetuple())\n", "\n", "row_timestamp = F.unix_timestamp(F.concat(\n", " F.col('year'), F.lit('-'), F.col('month'), F.lit('-'), F.col('day'), \n", " F.lit(' '), F.col('hour'), F.lit(':00:00')))\n", "\n", "\n", "# hash for user-fingerprinting\n", "user_id = F.hash(F.concat(F.col('client_ip'),F.lit('-'),F.col('user_agent'))) ## only client and user\n", "## seems to be the default way https://meta.wikimedia.org/wiki/Research:Unique_Devices/Other_Possible_Implementations#Fingerprinting\n", "# user_id = F.hash(F.concat(F.col('client_ip'),F.lit('-'),F.col('user_agent'),F.lit('-'),F.col('accept_language')))\n", "\n", "# whether request came from loggIn user: 0/1\n", "logged_in = F.when(F.col('x_analytics_map.loggedIn')==1,1).otherwise(0)\n", "\n", "# whether page == Special:CreateAccount\n", "page_create_account = F.when(F.col('pageview_info.page_title')=='Special:CreateAccount',1).otherwise(0)\n", "\n", "w = Window.partitionBy(F.col('user_id'))\n", "w_user_ts = Window.partitionBy().orderBy(F.col('user_id'),F.col('ts'))\n", "\n", "# maximum number of requests per session of an individual\n", "n_p_max = 500" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "## dataframe with all webrequest of all users that visited at least once a createaccount page\n", "df = (\n", " ## select table\n", " spark.read.table('wmf.webrequest')\n", " ## user-hash as user_id\n", " .withColumn('user_id',user_id)\n", " .withColumn('logged_in',logged_in)\n", " .withColumn('page_create_account',page_create_account)\n", " .withColumn('page_title',F.col('pageview_info.page_title'))\n", " \n", " ## select time partition\n", " .where(row_timestamp >= ts_start)\n", " .where(row_timestamp < ts_end)\n", " ## select wiki project\n", " .where(F.col('normalized_host.project_class') == \"wikipedia\")\n", " .where(F.col('normalized_host.project') == \"en\")\n", " ## only requests marked as pageviews\n", " .where(F.col('is_pageview') == 1)\n", " ## agent-type\n", " .where(F.col('agent_type') == \"user\")\n", " ## user: desktop/mobile/mobile app; isaac filters != mobile app\n", " .where(F.col('access_method') == \"desktop\")\n", " ## only traffic from inside wiki\n", "# .where(F.col('referer_class') == \"internal\")\n", " ## unclear yet; done by isaac\n", " .where(F.col('webrequest_source') == 'text')\n", "\n", " ## count requests per user\n", " ## n_pca_by_user: number of requests to page-create-account\n", " ## n_p_by_user: number of requests to pages\n", " .withColumn('n_pca_by_user', F.sum( F.col('page_create_account') ).over(w) )\n", " .withColumn('n_p_by_user', F.count(F.lit(1)).over(w) )\n", " \n", " ## filter: \n", " ## ## user rquested at least 1 page view for create account\n", " .where(F.col('n_pca_by_user') >= 1 )\n", " ## ## user requested at most n_p_max pages in total\n", " .where(F.col('n_p_by_user') <= n_p_max)\n", "\n", " .orderBy('user_id','ts')\n", " ## puts a 1 if next webrequest is also \n", " .withColumn('logged_in_next', \n", " F.when( F.col('user_id') == F.lag('user_id',count=-1).over(w_user_ts), F.lag('logged_in',count=-1).over(w_user_ts)).otherwise(0) )\n", "\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "## identify user-sesssions which we identify as account-creation\n", "## 1) visit the 'Special:CreateAccount' page\n", "## 2) not being logged in when making that request\n", "## 3) being logged in when visiting the next page\n", "\n", "## for this we make an aggregation into user sessions.\n", "\n", "## define an aggregation function on the level of users which we accept as new users\n", "df_users = (\n", " df.groupBy(F.col('user_id'))\n", " .agg( \n", " ## whether there was one requst to Special:CreateAccount AND next request was logged in.\n", " F.max(F.when( (F.col('logged_in') == 0) & (F.col('logged_in_next') == 1) & (F.col('page_title')=='Special:CreateAccount'),1).otherwise(0))\n", " .alias('new_user') \n", " )\n", " \n", " ## filter \n", " .where(F.col('new_user')==1)\n", ")\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "## use join to filter those users whose reading sessions can be considered as registration events\n", "df_sessions = df.join(df_users, df['user_id'] == df_users['user_id'] , \"left_semi\" ).select('user_id',\n", " 'ts',\n", " 'page_title',\n", " 'logged_in')\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "41.16013836860657\n" ] } ], "source": [ "t1 = time.time()\n", "## execute query -- here: convert to pandas. \n", "df_sessions = df_sessions.toPandas()\n", "t2 = time.time()\n", "print(t2-t1)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idtspage_titlelogged_in
04960259752019-07-15 10:06:48Main_Page0
14960259752019-07-15 10:06:52Portal:Science0
24960259752019-07-15 10:06:54Portal:Science/Categories_and_Main_topics0
34960259752019-07-15 10:07:02Category:Chemical_engineering0
44960259752019-07-15 10:07:32Category:Artificial_intelligence0
\n", "
" ], "text/plain": [ " user_id ts page_title \\\n", "0 496025975 2019-07-15 10:06:48 Main_Page \n", "1 496025975 2019-07-15 10:06:52 Portal:Science \n", "2 496025975 2019-07-15 10:06:54 Portal:Science/Categories_and_Main_topics \n", "3 496025975 2019-07-15 10:07:02 Category:Chemical_engineering \n", "4 496025975 2019-07-15 10:07:32 Category:Artificial_intelligence \n", "\n", " logged_in \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## get an over on the session\n", "df_sessions.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "94" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## group the dataframe by users\n", "df_sessions_by_user = df_sessions.groupby('user_id')\n", "users = sorted(list(df_sessions_by_user.groups.keys()))\n", "len(users) ## number of unique users" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idtspage_titlelogged_in
706-21424156412019-07-15 10:06:52Main_Page0
707-21424156412019-07-15 10:07:31Special:CreateAccount0
708-21424156412019-07-15 10:07:31Special:CreateAccount0
709-21424156412019-07-15 10:11:11Special:CreateAccount0
710-21424156412019-07-15 10:15:11Main_Page1
711-21424156412019-07-15 10:19:26Special:Search1
712-21424156412019-07-15 10:21:12Special:Search1
713-21424156412019-07-15 10:22:18Ranganathittu_Bird_Sanctuary1
714-21424156412019-07-15 10:22:19Ranganathittu_Bird_Sanctuary1
715-21424156412019-07-15 10:23:37Special:UserLogout0
\n", "
" ], "text/plain": [ " user_id ts page_title logged_in\n", "706 -2142415641 2019-07-15 10:06:52 Main_Page 0\n", "707 -2142415641 2019-07-15 10:07:31 Special:CreateAccount 0\n", "708 -2142415641 2019-07-15 10:07:31 Special:CreateAccount 0\n", "709 -2142415641 2019-07-15 10:11:11 Special:CreateAccount 0\n", "710 -2142415641 2019-07-15 10:15:11 Main_Page 1\n", "711 -2142415641 2019-07-15 10:19:26 Special:Search 1\n", "712 -2142415641 2019-07-15 10:21:12 Special:Search 1\n", "713 -2142415641 2019-07-15 10:22:18 Ranganathittu_Bird_Sanctuary 1\n", "714 -2142415641 2019-07-15 10:22:19 Ranganathittu_Bird_Sanctuary 1\n", "715 -2142415641 2019-07-15 10:23:37 Special:UserLogout 0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## session of an individual user (change index)\n", "df_sessions_by_user.get_group(users[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN", "language": "python", "name": "spark_yarn_pyspark" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }