{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Constructing reading patterns of new users\n",
    "\n",
    "**Aim**:\n",
    "\n",
    "For a given time-window we construct reading sessions in which users register a new account.\n",
    "\n",
    "**How**:\n",
    "\n",
    "We identify unique users by a standard fingerprint-technique by hashing client_ip and user_agent for desktop (and potentially mobile) users.\n",
    "\n",
    "We first extract all users that made at least 1 request to a 'Special:CreateAccount' page.\n",
    "\n",
    "We then filter those users for which the sesssion followed a pattern which suggests a registration-event:\n",
    "- visit the create-account page\n",
    "- not being logged in while visiting this page\n",
    "- being logged in when visiting the next page\n",
    "\n",
    "\n",
    "For an example-case (1 hour):\n",
    "- there are 94 users that register\n",
    "- it takes around 30 seconds to query\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %load_ext autoreload\n",
    "# %autoreload 2\n",
    "\n",
    "import os, sys\n",
    "import numpy as np\n",
    "import datetime\n",
    "from pyspark.sql import functions as F, types as T, Window\n",
    "import calendar\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "date_start = datetime.datetime(2019, 7, 15, 10)\n",
    "date_end = datetime.datetime(2019, 7, 15, 11)\n",
    "\n",
    "\n",
    "ts_start = calendar.timegm(date_start.timetuple())\n",
    "ts_end = calendar.timegm(date_end.timetuple())\n",
    "\n",
    "row_timestamp = F.unix_timestamp(F.concat(\n",
    "    F.col('year'), F.lit('-'), F.col('month'), F.lit('-'), F.col('day'), \n",
    "    F.lit(' '), F.col('hour'), F.lit(':00:00')))\n",
    "\n",
    "\n",
    "# hash for user-fingerprinting\n",
    "user_id = F.hash(F.concat(F.col('client_ip'),F.lit('-'),F.col('user_agent'))) ## only client and user\n",
    "## seems to be the default way https://meta.wikimedia.org/wiki/Research:Unique_Devices/Other_Possible_Implementations#Fingerprinting\n",
    "# user_id = F.hash(F.concat(F.col('client_ip'),F.lit('-'),F.col('user_agent'),F.lit('-'),F.col('accept_language')))\n",
    "\n",
    "# whether request came from loggIn user: 0/1\n",
    "logged_in = F.when(F.col('x_analytics_map.loggedIn')==1,1).otherwise(0)\n",
    "\n",
    "# whether page  == Special:CreateAccount\n",
    "page_create_account = F.when(F.col('pageview_info.page_title')=='Special:CreateAccount',1).otherwise(0)\n",
    "\n",
    "w = Window.partitionBy(F.col('user_id'))\n",
    "w_user_ts = Window.partitionBy().orderBy(F.col('user_id'),F.col('ts'))\n",
    "\n",
    "# maximum number of requests per session of an individual\n",
    "n_p_max = 500"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## dataframe with all webrequest of all users that visited at least once a createaccount page\n",
    "df = (\n",
    "    ## select table\n",
    "    spark.read.table('wmf.webrequest')\n",
    "    ## user-hash as user_id\n",
    "    .withColumn('user_id',user_id)\n",
    "    .withColumn('logged_in',logged_in)\n",
    "    .withColumn('page_create_account',page_create_account)\n",
    "    .withColumn('page_title',F.col('pageview_info.page_title'))\n",
    "    \n",
    "    ## select time partition\n",
    "    .where(row_timestamp >= ts_start)\n",
    "    .where(row_timestamp < ts_end)\n",
    "    ## select wiki project\n",
    "    .where(F.col('normalized_host.project_class') == \"wikipedia\")\n",
    "    .where(F.col('normalized_host.project') == \"en\")\n",
    "    ## only requests marked as pageviews\n",
    "    .where(F.col('is_pageview') == 1)\n",
    "    ## agent-type\n",
    "    .where(F.col('agent_type') == \"user\")\n",
    "    ## user: desktop/mobile/mobile app; isaac filters != mobile app\n",
    "    .where(F.col('access_method') == \"desktop\")\n",
    "    ## only traffic from inside wiki\n",
    "#     .where(F.col('referer_class') == \"internal\")\n",
    "    ## unclear yet; done by isaac\n",
    "    .where(F.col('webrequest_source') ==  'text')\n",
    "\n",
    "    ## count requests per user\n",
    "    ## n_pca_by_user: number of requests to page-create-account\n",
    "    ## n_p_by_user:  number of requests to pages\n",
    "    .withColumn('n_pca_by_user', F.sum( F.col('page_create_account') ).over(w) )\n",
    "    .withColumn('n_p_by_user', F.count(F.lit(1)).over(w) )\n",
    "    \n",
    "    ## filter: \n",
    "    ## ## user rquested at least 1 page view for create account\n",
    "    .where(F.col('n_pca_by_user') >= 1 )\n",
    "    ## ## user requested at most n_p_max pages in total\n",
    "    .where(F.col('n_p_by_user') <= n_p_max)\n",
    "\n",
    "    .orderBy('user_id','ts')\n",
    "    ## puts a 1 if next webrequest is also \n",
    "    .withColumn('logged_in_next', \n",
    "                F.when( F.col('user_id') == F.lag('user_id',count=-1).over(w_user_ts), F.lag('logged_in',count=-1).over(w_user_ts)).otherwise(0)   )\n",
    "\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## identify user-sesssions which we identify as account-creation\n",
    "## 1) visit the 'Special:CreateAccount' page\n",
    "## 2) not being logged in when making that request\n",
    "## 3) being logged in when visiting the next page\n",
    "\n",
    "## for this we make an aggregation into user sessions.\n",
    "\n",
    "## define an aggregation function on the level of users which we accept as new users\n",
    "df_users = (\n",
    "    df.groupBy(F.col('user_id'))\n",
    "    .agg( \n",
    "        ## whether there was one requst to Special:CreateAccount AND next request was logged in.\n",
    "        F.max(F.when( (F.col('logged_in') == 0) & (F.col('logged_in_next') == 1) & (F.col('page_title')=='Special:CreateAccount'),1).otherwise(0))\n",
    "        .alias('new_user') \n",
    "    )\n",
    "        \n",
    "    ## filter \n",
    "    .where(F.col('new_user')==1)\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## use join to filter those users whose reading sessions can be considered as registration events\n",
    "df_sessions =  df.join(df_users, df['user_id'] == df_users['user_id'] , \"left_semi\" ).select('user_id',\n",
    "            'ts',\n",
    "            'page_title',\n",
    "            'logged_in')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "41.16013836860657\n"
     ]
    }
   ],
   "source": [
    "t1 = time.time()\n",
    "## execute query -- here: convert to pandas. \n",
    "df_sessions = df_sessions.toPandas()\n",
    "t2 = time.time()\n",
    "print(t2-t1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>ts</th>\n",
       "      <th>page_title</th>\n",
       "      <th>logged_in</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>496025975</td>\n",
       "      <td>2019-07-15 10:06:48</td>\n",
       "      <td>Main_Page</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>496025975</td>\n",
       "      <td>2019-07-15 10:06:52</td>\n",
       "      <td>Portal:Science</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>496025975</td>\n",
       "      <td>2019-07-15 10:06:54</td>\n",
       "      <td>Portal:Science/Categories_and_Main_topics</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>496025975</td>\n",
       "      <td>2019-07-15 10:07:02</td>\n",
       "      <td>Category:Chemical_engineering</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>496025975</td>\n",
       "      <td>2019-07-15 10:07:32</td>\n",
       "      <td>Category:Artificial_intelligence</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     user_id                  ts                                 page_title  \\\n",
       "0  496025975 2019-07-15 10:06:48                                  Main_Page   \n",
       "1  496025975 2019-07-15 10:06:52                             Portal:Science   \n",
       "2  496025975 2019-07-15 10:06:54  Portal:Science/Categories_and_Main_topics   \n",
       "3  496025975 2019-07-15 10:07:02              Category:Chemical_engineering   \n",
       "4  496025975 2019-07-15 10:07:32           Category:Artificial_intelligence   \n",
       "\n",
       "   logged_in  \n",
       "0          0  \n",
       "1          0  \n",
       "2          0  \n",
       "3          0  \n",
       "4          0  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## get an over on the session\n",
    "df_sessions.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "94"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## group the dataframe by users\n",
    "df_sessions_by_user = df_sessions.groupby('user_id')\n",
    "users = sorted(list(df_sessions_by_user.groups.keys()))\n",
    "len(users) ## number of unique users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>ts</th>\n",
       "      <th>page_title</th>\n",
       "      <th>logged_in</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>706</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:06:52</td>\n",
       "      <td>Main_Page</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>707</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:07:31</td>\n",
       "      <td>Special:CreateAccount</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>708</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:07:31</td>\n",
       "      <td>Special:CreateAccount</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>709</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:11:11</td>\n",
       "      <td>Special:CreateAccount</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>710</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:15:11</td>\n",
       "      <td>Main_Page</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>711</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:19:26</td>\n",
       "      <td>Special:Search</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>712</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:21:12</td>\n",
       "      <td>Special:Search</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>713</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:22:18</td>\n",
       "      <td>Ranganathittu_Bird_Sanctuary</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>714</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:22:19</td>\n",
       "      <td>Ranganathittu_Bird_Sanctuary</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>715</th>\n",
       "      <td>-2142415641</td>\n",
       "      <td>2019-07-15 10:23:37</td>\n",
       "      <td>Special:UserLogout</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        user_id                  ts                    page_title  logged_in\n",
       "706 -2142415641 2019-07-15 10:06:52                     Main_Page          0\n",
       "707 -2142415641 2019-07-15 10:07:31         Special:CreateAccount          0\n",
       "708 -2142415641 2019-07-15 10:07:31         Special:CreateAccount          0\n",
       "709 -2142415641 2019-07-15 10:11:11         Special:CreateAccount          0\n",
       "710 -2142415641 2019-07-15 10:15:11                     Main_Page          1\n",
       "711 -2142415641 2019-07-15 10:19:26                Special:Search          1\n",
       "712 -2142415641 2019-07-15 10:21:12                Special:Search          1\n",
       "713 -2142415641 2019-07-15 10:22:18  Ranganathittu_Bird_Sanctuary          1\n",
       "714 -2142415641 2019-07-15 10:22:19  Ranganathittu_Bird_Sanctuary          1\n",
       "715 -2142415641 2019-07-15 10:23:37            Special:UserLogout          0"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## session of an individual user (change index)\n",
    "df_sessions_by_user.get_group(users[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PySpark - YARN",
   "language": "python",
   "name": "spark_yarn_pyspark"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}