{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "from wmfdata.spark import get_session" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.\n" ] } ], "source": [ "# We use wmfdata boilerplate to init a spark session.\n", "# Under the hood the library uses findspark to initialise\n", "# Spark's environment. pyspark imports will be available \n", "# after initialisation\n", "spark = get_session(type='regular', app_name=\"ImageRec Training\")\n", "import pyspark\n", "import pyspark.sql" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# This assumes we have a file called babel.commonswiki.20210816.csv in hadoop, which is a dump (via quarry) of the `babel` table in commonswiki\n", "sdf = spark.read.csv('babel.commonswiki.20210816.csv').toDF(\"user_id\", \"lang\", \"lang_level\")\n", "sdf.createOrReplaceTempView('babel')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "lang = 'ru'\n", "# Assumes that the following files are available in hadoop\n", "# {{lang}}.unillustrated.csv - a csv containing unillustrated articles for the wiki, with page id in the first column and page title in the second\n", "# {{lang}}.suggested.csv - a csv containing suggested images for unillustrated articles in the wiki, a single column containing page title" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 97420 45267\n" ] } ], "source": [ "# Count \n", "# - unillustrated articles with suggested images, and their watchers, on {{lang}}wiki\n", "\n", "sdf = spark.read.csv(lang + '.unillustrated.csv').toDF(\"page_id\", 
"page_title\")\n", "sdf.createOrReplaceTempView('unillustrated')\n", "query = \"\"\"\n", " SELECT count(DISTINCT unillustrated.page_title), count(DISTINCT wl_user) \n", " FROM wmf_raw.mediawiki_private_watchlist wl\n", " JOIN unillustrated on unillustrated.page_title=wl.wl_title\n", " WHERE wl.wiki_db='\"\"\" + lang + \"\"\"wiki'\n", " AND wl.snapshot='2021-06'\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 82606 30586\n" ] } ], "source": [ "# Count \n", "# - images and watchers for images \n", "# - that have been suggested for an article on {{lang}}wiki\n", "\n", "sdf = spark.read.csv(lang + '.suggested.csv').toDF(\"page_title\")\n", "sdf.createOrReplaceTempView('suggested')\n", "query = \"\"\"\n", " SELECT count(DISTINCT suggested.page_title),count(DISTINCT wl_user) \n", " FROM wmf_raw.mediawiki_private_watchlist wl\n", " JOIN suggested on suggested.page_title=wl.wl_title\n", " WHERE wl.wiki_db='commonswiki'\n", " AND wl.snapshot='2021-06'\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 3838 305\n" ] } ], "source": [ "# Count \n", "# - images and watchers for images \n", "# - that have been suggested for an article on {{lang}}wiki\n", "# - where the language of the wiki matches one of the user's Babel languages on commons\n", "\n", "sdf = spark.read.csv(lang + '.suggested.csv').toDF(\"page_title\")\n", "sdf.createOrReplaceTempView('suggested')\n", "\n", "query = \"\"\"\n", " SELECT count(DISTINCT suggested.page_title),count(DISTINCT wl_user) \n", " FROM wmf_raw.mediawiki_private_watchlist wl\n", " JOIN suggested on 
suggested.page_title=wl.wl_title\n", " JOIN babel on wl.wl_user=babel.user_id\n", " WHERE wl.wiki_db='commonswiki'\n", " AND wl.snapshot='2021-06'\n", " AND babel.lang='\"\"\" + lang + \"\"\"'\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 336 569\n" ] } ], "source": [ "# Count \n", "# - images and watchers for images uploaded in May 2021 \n", "# - that have been suggested for an article on {{lang}}wiki\n", "\n", "sdf = spark.read.csv(lang + '.suggested.csv').toDF(\"page_title\")\n", "sdf.createOrReplaceTempView('suggested')\n", "query = \"\"\"\n", " SELECT count(DISTINCT suggested.page_title),count(DISTINCT wl_user) \n", " FROM wmf_raw.mediawiki_private_watchlist wl\n", " JOIN suggested on suggested.page_title=wl.wl_title\n", " JOIN wmf_raw.mediawiki_image img on img.img_name=wl.wl_title\n", " WHERE wl.wiki_db='commonswiki'\n", " AND wl.snapshot='2021-06'\n", " AND img.wiki_db='commonswiki'\n", " AND img.snapshot='2021-06'\n", " AND img.img_timestamp > 20210501000000\n", " AND img.img_timestamp < 20210601000000\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " count(DISTINCT page_title) count(DISTINCT wl_user)\n", "0 9 8\n" ] } ], "source": [ "# Count \n", "# - images and watchers for images uploaded in May 2021 \n", "# - that have been suggested for an article on {{lang}}wiki\n", "# - where the language of the wiki matches one of the user's Babel languages on commons\n", "\n", "sdf = spark.read.csv(lang + '.suggested.csv').toDF(\"page_title\")\n", "sdf.createOrReplaceTempView('suggested')\n", "query = \"\"\"\n", " SELECT count(DISTINCT suggested.page_title),count(DISTINCT wl_user) \n", " FROM 
wmf_raw.mediawiki_private_watchlist wl\n", " JOIN suggested on suggested.page_title=wl.wl_title\n", " JOIN babel on wl.wl_user=babel.user_id\n", " JOIN wmf_raw.mediawiki_image img on img.img_name=wl.wl_title\n", " WHERE wl.wiki_db='commonswiki'\n", " AND wl.snapshot='2021-06'\n", " AND img.wiki_db='commonswiki'\n", " AND img.snapshot='2021-06'\n", " AND img.img_timestamp > 20210501000000\n", " AND img.img_timestamp < 20210601000000\n", " AND babel.lang='\"\"\" + lang + \"\"\"'\n", " \"\"\"\n", "# Use a distinct name so we don't shadow the pandas alias 'pd' imported in the first cell\n", "result = spark.sql(query).toPandas()\n", "print(result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }