{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This notebook gathers all extra data required by https://phabricator.wikimedia.org/T286562 and writes it to a parquet file\n",
    "\n",
    "import re\n",
    "import math\n",
    "import os\n",
    "from wmfdata.spark import get_session"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pass in the full snapshot date\n",
    "snapshot = '2021-08-23'\n",
    "reg = r'^([\\w]+-[\\w]+)'\n",
    "short_snapshot = re.match(reg, snapshot).group()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.\n"
     ]
    }
   ],
   "source": [
    "# We use wmfdata boilerplate to init a spark session.\n",
    "# Under the hood the library uses findspark to initialise\n",
    "# Spark's environment. pyspark imports will be available \n",
    "# after initialisation\n",
    "spark = get_session(type='regular', app_name=\"T286562\")\n",
    "import pyspark\n",
    "import pyspark.sql"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Images containing these substrings are probably icons or placeholders \n",
    "\n",
    "disallowed_substrings = ['flag','noantimage','no_free_image','image_manquante',\n",
    "    'replace_this_image','disambig','regions','map','default',\n",
    "    'defaut','falta_imagem_','imageNA','noimage','noenzyimage']\n",
    "\n",
    "disallowed_sql = []\n",
    "for d in disallowed_substrings:\n",
    "    disallowed_sql.append(\"p.page_title like '%\" + d + \"%'\")\n",
    "\n",
    "query=\"\"\"\n",
    "SELECT p.page_id,p.page_title\n",
    "FROM wmf_raw.mediawiki_page p\n",
    "WHERE p.page_namespace=6 \n",
    "AND page_is_redirect=0 \n",
    "AND p.wiki_db='commonswiki' \n",
    "AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n",
    "AND (\"\"\" + \" OR \".join(disallowed_sql) + \"\"\")\n",
    "ORDER BY page_id\n",
    "\"\"\"\n",
    "\n",
    "disallowedDF = spark.sql(query)\n",
    "disallowedDF.createOrReplaceTempView(\"files_with_disallowed_substrings\")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# An image linked to from more than N pages is likely to be an icon or a placeholder, where the value of N depends on the size of the wiki\n",
    "\n",
    "query=\"\"\"WITH wiki_sizes as (\n",
    "    SELECT wiki_db,COUNT(*) as size,\n",
    "    IF (\n",
    "        COUNT(*) >= 50000,\n",
    "        CEILING((log10(COUNT(*)/50000)+1)*15),\n",
    "        CEILING(\n",
    "            IF(\n",
    "                ((COUNT(*)/50000 * 15) > 15/4), \n",
    "                COUNT(*)/50000 * 15, \n",
    "                15/4\n",
    "            )\n",
    "        )\n",
    "    ) as threshold\n",
    "    FROM wmf_raw.mediawiki_page\n",
    "    WHERE page_namespace=0 \n",
    "    AND page_is_redirect=0\n",
    "    AND snapshot='\"\"\"+short_snapshot+\"\"\"'\n",
    "    GROUP BY wiki_db\n",
    "),\n",
    "commons_file_pages as \n",
    " (\n",
    " SELECT p.page_id,p.page_title\n",
    " FROM wmf_raw.mediawiki_page p\n",
    " WHERE p.page_namespace=6 \n",
    " AND page_is_redirect=0 AND p.wiki_db='commonswiki' \n",
    " AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n",
    " ORDER BY page_id\n",
    " )\n",
    "SELECT cfp.page_id,cfp.page_title\n",
    "FROM wmf_raw.mediawiki_imagelinks il\n",
    "JOIN wmf_raw.mediawiki_page p\n",
    "ON (p.page_id=il.il_from and p.wiki_db=il.wiki_db)\n",
    "JOIN commons_file_pages cfp\n",
    "ON cfp.page_title=il.il_to\n",
    "JOIN wiki_sizes ws\n",
    "ON ws.wiki_db=p.wiki_db\n",
    "WHERE il.il_from_namespace=0 \n",
    "AND p.snapshot='\"\"\"+short_snapshot+\"\"\"'  \n",
    "AND il.snapshot='\"\"\"+short_snapshot+\"\"\"' \n",
    "GROUP BY ws.threshold,cfp.page_id,cfp.page_title\n",
    "HAVING COUNT(il.il_to)>ws.threshold\"\"\"\n",
    "disallowedDF = spark.sql(query)\n",
    "disallowedDF.createOrReplaceTempView(\"files_with_too_many_linkages\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We also have a parquet containing `page_title` for commons files that are in placeholder categories (see https://petscan.wmflabs.org/?psid=18699732&format=json)\n",
    "# Load those into another temp view\n",
    "\n",
    "placeholder_images = spark.read.parquet('hdfs:/user/gmodena/image_placeholders')\n",
    "placeholder_images.createOrReplaceTempView('files_in_placeholder_categories')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather all commons files that aren't placeholders or icons, with relevant linked wikidata items for each\n",
    "# \n",
    "# `commons_file_pages_with_reverse_p18` has every allowed commons File page that is linked to from wikidata via the P18 (image) property\n",
    "# Each row contains\n",
    "# - the page_id of the commons page\n",
    "# - a set of the wikidata ids that link to the commons page via P18 i.e. the Qid of every wikidata item that has statement 'P18=<the title of the commons page>'\n",
    "#\n",
    "# `commons_file_pages_with_reverse_p373` has every allowed commons File page that is contained in a commons category where\n",
    "# - the commons category is linked to a wikidata item via the P373 (commons category) property AND\n",
    "# - the wikidata item corresponds to a page in the main namespace in any non-commons wiki AND\n",
    "# - the category on commons contains fewer than 100k pages\n",
    "# Each row contains\n",
    "# - the page_id of the commons page\n",
    "# - a set of the wikidata ids that link to the commons category the commons page is in, concatenated with the number of pages in the category\n",
    "#     i.e. \n",
    "#     - the Qid of every wikidata item linked to a main-namespace page that has statement 'P373=<the title of a commons category the commons page is in>'\n",
    "#     - then the pipe symbol \n",
    "#     - then the number of pages in the category\n",
    "#\n",
    "# `commons_file_pages_with_container_page_qids` has every allowed commons File page that is used on a main-namespace page on a non-commons wiki \n",
    "# Each row contains\n",
    "# - the page_id of the commons page\n",
    "# - a set of the wikidata ids of pages that contain the commons image, concatenated with the wiki_db of the page, concatenated with the number of links to the page\n",
    "#     i.e. \n",
    "#     - the Qid of every wikidata item linked to a main-namespace page that includes the commons File page\n",
    "#     - then the pipe symbol \n",
    "#     - then the wiki the-page-that-includes-the-commons-File-page in on\n",
    "#     - then the pipe symbol \n",
    "#     - then the number of pagelinks to the-page-that-includes-the-commons-File-page\n",
    "#\n",
    "# `commons_file_pages` contains every File page on commons *excluding* those disallowed above\n",
    "# `non_commons_main_pages` contains every main-namespace page on every wiki EXCEPT commons\n",
    "# `qid_props` contains every wikidata item with its P18 (image) and P373 (commons category) values\n",
    "\n",
    "query=\"\"\"WITH commons_file_pages as \n",
    " (\n",
    " SELECT p.page_id,p.page_title\n",
    " FROM wmf_raw.mediawiki_page p\n",
    " LEFT ANTI JOIN files_with_disallowed_substrings dsub \n",
    " ON dsub.page_id=p.page_id\n",
    " LEFT ANTI JOIN files_with_too_many_linkages fml\n",
    " ON fml.page_id=p.page_id\n",
    " LEFT ANTI JOIN files_in_placeholder_categories fpc\n",
    " ON fpc.page_title=p.page_title\n",
    " WHERE p.page_namespace=6 \n",
    " AND page_is_redirect=0 AND p.wiki_db='commonswiki' \n",
    " AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n",
    " ORDER BY page_id\n",
    " ),\n",
    " non_commons_main_pages as \n",
    " (\n",
    " SELECT p.page_id,p.wiki_db,p.page_title,count(pl.pl_from) as incoming_links\n",
    " FROM wmf_raw.mediawiki_page p\n",
    " JOIN wmf_raw.mediawiki_pagelinks pl\n",
    " ON (pl.pl_title=p.page_title AND pl.wiki_db=p.wiki_db)\n",
    " WHERE p.page_namespace=0 \n",
    " AND page_is_redirect=0 \n",
    " AND p.wiki_db!='commonswiki' \n",
    " AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n",
    " AND pl.snapshot='\"\"\"+short_snapshot+\"\"\"' \n",
    " GROUP BY p.wiki_db,p.page_id,p.page_title\n",
    " ORDER BY p.page_id\n",
    " ),\n",
    " qid_props AS \n",
    " (\n",
    " SELECT we.id as item_id,\n",
    " MAX(CASE WHEN claim.mainSnak.property = 'P18' THEN claim.mainSnak.datavalue.value ELSE NULL END) AS hasimage,\n",
    " MAX(CASE WHEN claim.mainSnak.property = 'P373' THEN REPLACE(claim.mainSnak.datavalue.value,' ','_') ELSE NULL END) AS commonscategory\n",
    " FROM wmf.wikidata_entity we \n",
    " LATERAL VIEW OUTER explode(claims) c AS claim\n",
    " WHERE typ='item'\n",
    " AND snapshot='\"\"\"+snapshot+\"\"\"'\n",
    " AND claim.mainSnak.property in ('P18','P373')\n",
    " GROUP BY item_id\n",
    " ),\n",
    " commons_file_pages_with_reverse_p18 AS \n",
    " (\n",
    " SELECT p.page_id,collect_set(qid_props.item_id) as reverse_p18\n",
    " FROM commons_file_pages p\n",
    " JOIN qid_props on concat('\"',p.page_title,'\"')=qid_props.hasimage\n",
    " GROUP BY p.page_id\n",
    " ),\n",
    " commons_file_pages_with_reverse_p373 AS\n",
    " (\n",
    " SELECT cl.cl_from as page_id,collect_set(concat_ws('|',qid_props.item_id,c.cat_pages)) as reverse_p373\n",
    " FROM wmf_raw.mediawiki_categorylinks cl\n",
    " JOIN wmf_raw.mediawiki_category c\n",
    " ON (c.cat_title=cl.cl_to AND c.wiki_db='commonswiki' AND c.cat_pages<100000)\n",
    " JOIN qid_props\n",
    " ON qid_props.commonscategory=concat('\"',cl.cl_to,'\"')\n",
    " JOIN wmf.wikidata_item_page_link wipl\n",
    " ON wipl.item_id=qid_props.item_id\n",
    " WHERE cl.snapshot='\"\"\"+short_snapshot+\"\"\"'\n",
    " AND c.snapshot='\"\"\"+short_snapshot+\"\"\"'\n",
    " AND cl.wiki_db ='commonswiki'\n",
    " AND cl.cl_type='file'\n",
    " AND wipl.page_namespace=0\n",
    " AND wipl.wiki_db!='commonswiki'\n",
    " group by cl.cl_from\n",
    " ),\n",
    " commons_file_pages_with_container_page_qids AS \n",
    " (\n",
    " SELECT cfp.page_id,collect_set(concat_ws('|',wipl.item_id,ncmp.wiki_db,ncmp.incoming_links)) as container_page_qids\n",
    " FROM non_commons_main_pages ncmp\n",
    " JOIN wmf.wikidata_item_page_link wipl\n",
    " ON wipl.page_id=ncmp.page_id\n",
    " AND wipl.wiki_db=ncmp.wiki_db\n",
    " JOIN wmf_raw.mediawiki_page_props pp\n",
    " ON ncmp.page_id=pp.pp_page\n",
    " AND ncmp.wiki_db=pp.wiki_db\n",
    " AND pp.pp_propname in ('page_image','page_image_free')\n",
    " JOIN commons_file_pages cfp\n",
    " ON cfp.page_title=pp.pp_value\n",
    " WHERE wipl.snapshot='\"\"\"+snapshot+\"\"\"'\n",
    " AND pp.snapshot='\"\"\"+short_snapshot+\"\"\"'\n",
    " GROUP BY cfp.page_id\n",
    " )\n",
    " select cfp.page_id,reverse_p18,reverse_p373,container_page_qids\n",
    " FROM commons_file_pages cfp\n",
    " LEFT JOIN commons_file_pages_with_reverse_p18 rp18\n",
    " ON cfp.page_id=rp18.page_id\n",
    " LEFT JOIN commons_file_pages_with_reverse_p373 rp373\n",
    " ON cfp.page_id=rp373.page_id\n",
    " LEFT JOIN commons_file_pages_with_container_page_qids cpq\n",
    " ON cfp.page_id=cpq.page_id\n",
    " WHERE\n",
    " (reverse_p18 IS NOT NULL OR reverse_p373 IS NOT NULL or container_page_qids IS NOT NULL)\n",
    " GROUP BY cfp.page_id,reverse_p18,reverse_p373,container_page_qids\n",
    " \"\"\"\n",
    "commonsDF = spark.sql(query).cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the data to a parquet so we can use it later\n",
    "#\n",
    "# the fields are:\n",
    "# - page_id: the page id of the file on commons\n",
    "# - reverse_p18: a set of wikidata item-ids from which the commons file is linked via the P18 (image) property\n",
    "# - reverse_p373: a set of wikidata item-ids linked via P373 (commons category) to any commons category that the commons file belongs to, plus the total number of \n",
    "#    files in the category\n",
    "# - container_page_qids: a set of wikidata item-ids of wiki pages the commons file is used in, plus the wiki, plus the number of incoming links to the wiki page\n",
    "\n",
    "commonsDF.write.parquet('commons_files_related_wikidata_items')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}