{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# This notebook gathers all extra data required by https://phabricator.wikimedia.org/T286562 and writes it to a parquet file\n", "\n", "import re\n", "import math\n", "import os\n", "from wmfdata.spark import get_session" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Pass in the full snapshot date\n", "snapshot = '2021-08-23'\n", "reg = r'^([\\w]+-[\\w]+)'\n", "short_snapshot = re.match(reg, snapshot).group()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.\n" ] } ], "source": [ "# We use wmfdata boilerplate to init a spark session.\n", "# Under the hood the library uses findspark to initialise\n", "# Spark's environment. pyspark imports will be available \n", "# after initialisation\n", "spark = get_session(type='regular', app_name=\"T286562\")\n", "import pyspark\n", "import pyspark.sql" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Images containing these substrings are probably icons or placeholders \n", "\n", "disallowed_substrings = ['flag','noantimage','no_free_image','image_manquante',\n", " 'replace_this_image','disambig','regions','map','default',\n", " 'defaut','falta_imagem_','imageNA','noimage','noenzyimage']\n", "\n", "disallowed_sql = []\n", "for d in disallowed_substrings:\n", " disallowed_sql.append(\"p.page_title like '%\" + d + \"%'\")\n", "\n", "query=\"\"\"\n", "SELECT p.page_id,p.page_title\n", "FROM wmf_raw.mediawiki_page p\n", "WHERE p.page_namespace=6 \n", "AND page_is_redirect=0 \n", "AND p.wiki_db='commonswiki' \n", "AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n", "AND (\"\"\" + \" OR \".join(disallowed_sql) + \"\"\")\n", "ORDER BY page_id\n", "\"\"\"\n", "\n", "disallowedDF = spark.sql(query)\n", "disallowedDF.createOrReplaceTempView(\"files_with_disallowed_substrings\") " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# An image linked to from more than N pages is likely to be an icon or a placeholder, where the value of N depends on the size of the wiki\n", "\n", "query=\"\"\"WITH wiki_sizes as (\n", " SELECT wiki_db,COUNT(*) as size,\n", " IF (\n", " COUNT(*) >= 50000,\n", " CEILING((log10(COUNT(*)/50000)+1)*15),\n", " CEILING(\n", " IF(\n", " ((COUNT(*)/50000 * 15) > 15/4), \n", " COUNT(*)/50000 * 15, \n", " 15/4\n", " )\n", " )\n", " ) as threshold\n", " FROM wmf_raw.mediawiki_page\n", " WHERE page_namespace=0 \n", " AND page_is_redirect=0\n", " AND snapshot='\"\"\"+short_snapshot+\"\"\"'\n", " GROUP BY wiki_db\n", "),\n", "commons_file_pages as \n", " (\n", " SELECT p.page_id,p.page_title\n", " FROM wmf_raw.mediawiki_page p\n", " WHERE p.page_namespace=6 \n", " AND page_is_redirect=0 AND p.wiki_db='commonswiki' \n", " AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n", " ORDER BY page_id\n", " )\n", "SELECT cfp.page_id,cfp.page_title\n", "FROM wmf_raw.mediawiki_imagelinks il\n", "JOIN wmf_raw.mediawiki_page p\n", "ON (p.page_id=il.il_from and p.wiki_db=il.wiki_db)\n", "JOIN commons_file_pages cfp\n", "ON cfp.page_title=il.il_to\n", "JOIN wiki_sizes ws\n", "ON ws.wiki_db=p.wiki_db\n", "WHERE il.il_from_namespace=0 \n", "AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n", "AND il.snapshot='\"\"\"+short_snapshot+\"\"\"' \n", "GROUP BY ws.threshold,cfp.page_id,cfp.page_title\n", "HAVING COUNT(il.il_to)>ws.threshold\"\"\"\n", "disallowedDF = spark.sql(query)\n", "disallowedDF.createOrReplaceTempView(\"files_with_too_many_linkages\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# We also have a parquet containing `page_title` for commons files that are in placeholder categories (see https://petscan.wmflabs.org/?psid=18699732&format=json)\n", "# Load those into another temp view\n", "\n", "placeholder_images = spark.read.parquet('hdfs:/user/gmodena/image_placeholders')\n", "placeholder_images.createOrReplaceTempView('files_in_placeholder_categories')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Gather all commons files that aren't placeholders or icons, with relevant linked wikidata items for each\n", "# \n", "# `commons_file_pages_with_reverse_p18` has every allowed commons File page that is linked to from wikidata via the P18 (image) property\n", "# Each row contains\n", "# - the page_id of the commons page\n", "# - a set of the wikidata ids that link to the commons page via P18 i.e. the Qid of every wikidata item that has statement 'P18='\n", "#\n", "# `commons_file_pages_with_reverse_p373` has every allowed commons File page that is contained in a commons category where\n", "# - the commons category is linked to a wikidata item via the P373 (commons category) property AND\n", "# - the wikidata item corresponds to a page in the main namespace in any non-commons wiki AND\n", "# - the category on commons contains fewer than 100k pages\n", "# Each row contains\n", "# - the page_id of the commons page\n", "# - a set of the wikidata ids that link to the commons category the commons page is in, concatenated with the number of pages in the category\n", "# i.e. \n", "# - the Qid of every wikidata item linked to a main-namespace page that has statement 'P373='\n", "# - then the pipe symbol \n", "# - then the number of pages in the category\n", "#\n", "# `commons_file_pages_with_container_page_qids` has every allowed commons File page that is used on a main-namespace page on a non-commons wiki \n", "# Each row contains\n", "# - the page_id of the commons page\n", "# - a set of the wikidata ids of pages that contain the commons image, concatenated with the wiki_db of the page, concatenated with the number of links to the page\n", "# i.e. \n", "# - the Qid of every wikidata item linked to a main-namespace page that includes the commons File page\n", "# - then the pipe symbol \n", "# - then the wiki the-page-that-includes-the-commons-File-page in on\n", "# - then the pipe symbol \n", "# - then the number of pagelinks to the-page-that-includes-the-commons-File-page\n", "#\n", "# `commons_file_pages` contains every File page on commons *excluding* those disallowed above\n", "# `non_commons_main_pages` contains every main-namespace page on every wiki EXCEPT commons\n", "# `qid_props` contains every wikidata item with its P18 (image) and P373 (commons category) values\n", "\n", "query=\"\"\"WITH commons_file_pages as \n", " (\n", " SELECT p.page_id,p.page_title\n", " FROM wmf_raw.mediawiki_page p\n", " LEFT ANTI JOIN files_with_disallowed_substrings dsub \n", " ON dsub.page_id=p.page_id\n", " LEFT ANTI JOIN files_with_too_many_linkages fml\n", " ON fml.page_id=p.page_id\n", " LEFT ANTI JOIN files_in_placeholder_categories fpc\n", " ON fpc.page_title=p.page_title\n", " WHERE p.page_namespace=6 \n", " AND page_is_redirect=0 AND p.wiki_db='commonswiki' \n", " AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n", " ORDER BY page_id\n", " ),\n", " non_commons_main_pages as \n", " (\n", " SELECT p.page_id,p.wiki_db,p.page_title,count(pl.pl_from) as incoming_links\n", " FROM wmf_raw.mediawiki_page p\n", " JOIN wmf_raw.mediawiki_pagelinks pl\n", " ON (pl.pl_title=p.page_title AND pl.wiki_db=p.wiki_db)\n", " WHERE p.page_namespace=0 \n", " AND page_is_redirect=0 \n", " AND p.wiki_db!='commonswiki' \n", " AND p.snapshot='\"\"\"+short_snapshot+\"\"\"' \n", " AND pl.snapshot='\"\"\"+short_snapshot+\"\"\"' \n", " GROUP BY p.wiki_db,p.page_id,p.page_title\n", " ORDER BY p.page_id\n", " ),\n", " qid_props AS \n", " (\n", " SELECT we.id as item_id,\n", " MAX(CASE WHEN claim.mainSnak.property = 'P18' THEN claim.mainSnak.datavalue.value ELSE NULL END) AS hasimage,\n", " MAX(CASE WHEN claim.mainSnak.property = 'P373' THEN REPLACE(claim.mainSnak.datavalue.value,' ','_') ELSE NULL END) AS commonscategory\n", " FROM wmf.wikidata_entity we \n", " LATERAL VIEW OUTER explode(claims) c AS claim\n", " WHERE typ='item'\n", " AND snapshot='\"\"\"+snapshot+\"\"\"'\n", " AND claim.mainSnak.property in ('P18','P373')\n", " GROUP BY item_id\n", " ),\n", " commons_file_pages_with_reverse_p18 AS \n", " (\n", " SELECT p.page_id,collect_set(qid_props.item_id) as reverse_p18\n", " FROM commons_file_pages p\n", " JOIN qid_props on concat('\"',p.page_title,'\"')=qid_props.hasimage\n", " GROUP BY p.page_id\n", " ),\n", " commons_file_pages_with_reverse_p373 AS\n", " (\n", " SELECT cl.cl_from as page_id,collect_set(concat_ws('|',qid_props.item_id,c.cat_pages)) as reverse_p373\n", " FROM wmf_raw.mediawiki_categorylinks cl\n", " JOIN wmf_raw.mediawiki_category c\n", " ON (c.cat_title=cl.cl_to AND c.wiki_db='commonswiki' AND c.cat_pages<100000)\n", " JOIN qid_props\n", " ON qid_props.commonscategory=concat('\"',cl.cl_to,'\"')\n", " JOIN wmf.wikidata_item_page_link wipl\n", " ON wipl.item_id=qid_props.item_id\n", " WHERE cl.snapshot='\"\"\"+short_snapshot+\"\"\"'\n", " AND c.snapshot='\"\"\"+short_snapshot+\"\"\"'\n", " AND cl.wiki_db ='commonswiki'\n", " AND cl.cl_type='file'\n", " AND wipl.page_namespace=0\n", " AND wipl.wiki_db!='commonswiki'\n", " group by cl.cl_from\n", " ),\n", " commons_file_pages_with_container_page_qids AS \n", " (\n", " SELECT cfp.page_id,collect_set(concat_ws('|',wipl.item_id,ncmp.wiki_db,ncmp.incoming_links)) as container_page_qids\n", " FROM non_commons_main_pages ncmp\n", " JOIN wmf.wikidata_item_page_link wipl\n", " ON wipl.page_id=ncmp.page_id\n", " AND wipl.wiki_db=ncmp.wiki_db\n", " JOIN wmf_raw.mediawiki_page_props pp\n", " ON ncmp.page_id=pp.pp_page\n", " AND ncmp.wiki_db=pp.wiki_db\n", " AND pp.pp_propname in ('page_image','page_image_free')\n", " JOIN commons_file_pages cfp\n", " ON cfp.page_title=pp.pp_value\n", " WHERE wipl.snapshot='\"\"\"+snapshot+\"\"\"'\n", " AND pp.snapshot='\"\"\"+short_snapshot+\"\"\"'\n", " GROUP BY cfp.page_id\n", " )\n", " select cfp.page_id,reverse_p18,reverse_p373,container_page_qids\n", " FROM commons_file_pages cfp\n", " LEFT JOIN commons_file_pages_with_reverse_p18 rp18\n", " ON cfp.page_id=rp18.page_id\n", " LEFT JOIN commons_file_pages_with_reverse_p373 rp373\n", " ON cfp.page_id=rp373.page_id\n", " LEFT JOIN commons_file_pages_with_container_page_qids cpq\n", " ON cfp.page_id=cpq.page_id\n", " WHERE\n", " (reverse_p18 IS NOT NULL OR reverse_p373 IS NOT NULL or container_page_qids IS NOT NULL)\n", " GROUP BY cfp.page_id,reverse_p18,reverse_p373,container_page_qids\n", " \"\"\"\n", "commonsDF = spark.sql(query).cache()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# write the data to a parquet so we can use it later\n", "#\n", "# the fields are:\n", "# - page_id: the page id of the file on commons\n", "# - reverse_p18: a set of wikidata item-ids from which the commons file is linked via the P18 (image) property\n", "# - reverse_p373: a set of wikidata item-ids linked via P373 (commons category) to any commons category that the commons file belongs to, plus the total number of \n", "# files in the category\n", "# - container_page_qids: a set of wikidata item-ids of wiki pages the commons file is used in, plus the wiki, plus the number of incoming links to the wiki page\n", "\n", "commonsDF.write.parquet('commons_files_related_wikidata_items')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }