{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "from wmfdata.spark import get_session" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.\n" ] } ], "source": [ "# We use wmfdata boilerplate to init a spark session.\n", "# Under the hood the library uses findspark to initialise\n", "# Spark's environment. pyspark imports will be available \n", "# after initialisation\n", "spark = get_session(type='regular', app_name=\"ImageRec Training\")\n", "import pyspark\n", "import pyspark.sql" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# This assumes we have a file called babel.commonswiki.20210816.csv in hadoop, which is a dump (via quarry) of the `babel` table in commonswiki\n", "sdf = spark.read.csv('babel.commonswiki.20210816.csv').toDF(\"user_id\", \"lang\", \"lang_level\")\n", "sdf.createOrReplaceTempView('babel')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "lang = 'ru'\n", "# Assumes that the following files are available in hadoop\n", "# {{lang}}.unillustrated.csv - a csv containing unillustrated articles for the wiki, with page id in the first column and page title in the second\n", "# {{lang}}.suggested.csv - a csv containing suggested images for unillustrated articles in the wiki, a single column containing page title" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 97420 45267\n" ] } ], "source": [ "# Count \n", "# - unillustrated articles with suggested images, and their watchers, on {{lang}}wiki\n", "\n", "sdf = spark.read.csv(lang + '.unillustrated.csv').toDF(\"page_id\", 
"page_title\")\n", "sdf.createOrReplaceTempView('unillustrated')\n", "query = \"\"\"\n", " SELECT count(DISTINCT unillustrated.page_title), count(DISTINCT wl_user) \n", " FROM wmf_raw.mediawiki_private_watchlist wl\n", " JOIN unillustrated on unillustrated.page_title=wl.wl_title\n", " WHERE wl.wiki_db='\"\"\" + lang + \"\"\"wiki'\n", " AND wl.snapshot='2021-06'\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 82606 30586\n" ] } ], "source": [ "# Count \n", "# - images and watchers for images \n", "# - that have been suggested for an article on {{lang}}wiki\n", "\n", "sdf = spark.read.csv(lang + '.suggested.csv').toDF(\"page_title\")\n", "sdf.createOrReplaceTempView('suggested')\n", "query = \"\"\"\n", " SELECT count(DISTINCT suggested.page_title),count(DISTINCT wl_user) \n", " FROM wmf_raw.mediawiki_private_watchlist wl\n", " JOIN suggested on suggested.page_title=wl.wl_title\n", " WHERE wl.wiki_db='commonswiki'\n", " AND wl.snapshot='2021-06'\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 3838 305\n" ] } ], "source": [ "# Count \n", "# - images and watchers for images \n", "# - that have been suggested for an article on {{lang}}wiki\n", "# - where the language of the wiki matches one of the user's Babel languages on commons\n", "\n", "sdf = spark.read.csv(lang + '.suggested.csv').toDF(\"page_title\")\n", "sdf.createOrReplaceTempView('suggested')\n", "\n", "query = \"\"\"\n", " SELECT count(DISTINCT suggested.page_title),count(DISTINCT wl_user) \n", " FROM wmf_raw.mediawiki_private_watchlist wl\n", " JOIN suggested on 
suggested.page_title=wl.wl_title\n", " JOIN babel on wl.wl_user=babel.user_id\n", " WHERE wl.wiki_db='commonswiki'\n", " AND wl.snapshot='2021-06'\n", " AND babel.lang='\"\"\" + lang + \"\"\"'\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 336 569\n" ] } ], "source": [ "# Count \n", "# - images and watchers for images uploaded in May 2021 \n", "# - that have been suggested for an article on {{lang}}wiki\n", "\n", "sdf = spark.read.csv(lang + '.suggested.csv').toDF(\"page_title\")\n", "sdf.createOrReplaceTempView('suggested')\n", "query = \"\"\"\n", " SELECT count(DISTINCT suggested.page_title),count(DISTINCT wl_user) \n", " FROM wmf_raw.mediawiki_private_watchlist wl\n", " JOIN suggested on suggested.page_title=wl.wl_title\n", " JOIN wmf_raw.mediawiki_image img on img.img_name=wl.wl_title\n", " WHERE wl.wiki_db='commonswiki'\n", " AND wl.snapshot='2021-06'\n", " AND img.wiki_db='commonswiki'\n", " AND img.snapshot='2021-06'\n", " AND img.img_timestamp > 20210501000000\n", " AND img.img_timestamp < 20210601000000\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 9 8\n" ] } ], "source": [ "# Count \n", "# - images and watchers for images uploaded in May 2021 \n", "# - that have been suggested for an article on {{lang}}wiki\n", "# - where the language of the wiki matches one of the user's Babel languages on commons\n", "\n", "sdf = spark.read.csv(lang + '.suggested.csv').toDF(\"page_title\")\n", "sdf.createOrReplaceTempView('suggested')\n", "query = \"\"\"\n", " SELECT count(DISTINCT suggested.page_title),count(DISTINCT wl_user) \n", " FROM 
wmf_raw.mediawiki_private_watchlist wl\n", " JOIN suggested on suggested.page_title=wl.wl_title\n", " JOIN babel on wl.wl_user=babel.user_id\n", " JOIN wmf_raw.mediawiki_image img on img.img_name=wl.wl_title\n", " WHERE wl.wiki_db='commonswiki'\n", " AND wl.snapshot='2021-06'\n", " AND img.wiki_db='commonswiki'\n", " AND img.snapshot='2021-06'\n", " AND img.img_timestamp > 20210501000000\n", " AND img.img_timestamp < 20210601000000\n", " AND babel.lang='\"\"\" + lang + \"\"\"'\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }