{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get all revisions that are redirects\n", "\n", "For a given wiki we want to find all revisions that are redirects.\n", "We query [wmf.mediawiki_wikitext_history](https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Content/XMLDumps/Mediawiki_wikitext_history).\n", "\n", "How:\n", "- get all redirect-aliases\n", " - for any wiki we can extract all redirect-aliases from '%s-latest-siteinfo-namespaces.json.gz'\n", "- query all revision-ids that are redirects\n", " - we look at the column 'revision_text'\n", " - we check whether the it starts with the string '#redirect [[*]]' or any of its aliases using sql's LIKE command\n", " - we only consider lower-case strings (syntax agnostic to capitalization)\n", "- regexp-extract to get redirect-page from text-field" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "from pyspark.sql.functions import col,regexp_extract ## for extracting redirect-page\n", "from redirect import get_redirect_aliases ## for getting redirect-aliases" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataFrame[]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## select the wmf table\n", "sqlContext.sql('USE wmf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Which wiki" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "wiki_date = '2019-07'\n", "wiki_name = 'rowiki'\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get all redirect - aliases" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['#REDIRECT', '#REDIRECTEAZA']\n" ] } ], "source": [ "## we look up the json of the latest siteinfo-namespaces file\n", "filename = os.path.join('/mnt','data','xmldatadumps','public','%s'%wiki_name,'latest','%s-latest-siteinfo-namespaces.json.gz'%wiki_name)\n", "list_redirect_aliases = get_redirect_aliases(filename)\n", "print(list_redirect_aliases)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Query\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(LOWER(revision_text) LIKE \"#redirect [[%]]%\" OR LOWER(revision_text) LIKE \"#redirecteaza [[%]]%\")\n" ] } ], "source": [ "## construct the string-matching condition\n", "str_revision_redirect_match = '('\n", "for i_redirect_alias, redirect_alias in enumerate(list_redirect_aliases):\n", " if i_redirect_alias > 0:\n", " str_revision_redirect_match += ' OR '\n", " str_revision_redirect_match += 'LOWER(revision_text) LIKE \"%s [[%%]]%%\"'%(redirect_alias.lower())\n", "str_revision_redirect_match += ')'\n", "print(str_revision_redirect_match)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT page_id, revision_id, page_title, revision_text FROM wmf.mediawiki_wikitext_history WHERE snapshot=\"2019-07\" AND wiki_db=\"rowiki\" AND page_namespace = 0 AND (LOWER(revision_text) LIKE \"#redirect [[%]]%\" OR LOWER(revision_text) LIKE \"#redirecteaza [[%]]%\") ORDER BY page_id, revision_id \n" ] } ], "source": [ "## query all revision which start with '#redirect [[*]]' or one of its redirect-aliases\n", "\n", "query = 'SELECT page_id, revision_id, page_title, revision_text \\\n", "FROM wmf.mediawiki_wikitext_history \\\n", "WHERE snapshot=\"%s\" \\\n", "AND wiki_db=\"%s\" \\\n", "AND page_namespace = 0 \\\n", "AND %s \\\n", "ORDER BY page_id, revision_id \\\n", "'\\\n", "%(wiki_date,wiki_name,str_revision_redirect_match)\n", "print(query)\n", "\n", "result = sqlContext.sql(query)\n", "\n", "## extracting the page [[ ]] where it redirects to\n", "## drop the revision-text (too much information)\n", "\n", "df_final = result.withColumn('revision_redirect_page',regexp_extract(col('revision_text'), '\\[\\[(.*?)\\]\\]',1 ) )\\\n", ".drop('revision_text')\\\n", ".toPandas()\n", "\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "## save result to csv-dataframe\n", "df_final.to_csv(df_redirect_revision_%s.csv'%wiki_name)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", " | page_id | \n", "revision_id | \n", "page_title | \n", "revision_redirect_page | \n", "
---|---|---|---|---|
0 | \n", "9 | \n", "66798 | \n", "Fizica | \n", "Fizică | \n", "
1 | \n", "9 | \n", "1303010 | \n", "Fizica | \n", "Fizică | \n", "
2 | \n", "9 | \n", "1705699 | \n", "Fizica | \n", "Fizică | \n", "
3 | \n", "9 | \n", "2126089 | \n", "Fizica | \n", "Fizică | \n", "
4 | \n", "9 | \n", "5693014 | \n", "Fizica | \n", "Fizică | \n", "
5 | \n", "9 | \n", "10723388 | \n", "Fizica | \n", "Fizică | \n", "
6 | \n", "13 | \n", "1962 | \n", "Limbi | \n", "Listă de limbi | \n", "
7 | \n", "13 | \n", "66801 | \n", "Limbi | \n", "Limbă | \n", "
8 | \n", "13 | \n", "3020259 | \n", "Limbi | \n", "Limbă (dezambiguizare) | \n", "
9 | \n", "13 | \n", "3914562 | \n", "Limbi | \n", "Limbă | \n", "
10 | \n", "13 | \n", "8881133 | \n", "Limbi | \n", "Limbă (dezambiguizare) | \n", "
11 | \n", "14 | \n", "66802 | \n", "Lingvistica | \n", "Lingvistică | \n", "
12 | \n", "15 | \n", "66803 | \n", "Limba romana | \n", "Limba română | \n", "
13 | \n", "17 | \n", "66805 | \n", "Arte martiale | \n", "Arte marţiale | \n", "
14 | \n", "17 | \n", "232040 | \n", "Arte martiale | \n", "Artă marţială | \n", "
15 | \n", "17 | \n", "4622678 | \n", "Arte martiale | \n", "Artă marțială | \n", "
16 | \n", "17 | \n", "12196093 | \n", "Arte martiale | \n", "Artele marțiale | \n", "
17 | \n", "17 | \n", "12206268 | \n", "Arte martiale | \n", "Arte marțiale | \n", "
18 | \n", "24 | \n", "66811 | \n", "Romania (dezambiguizare) | \n", "România | \n", "
19 | \n", "24 | \n", "747673 | \n", "Romania (dezambiguizare) | \n", "România (dezambiguizare) | \n", "
20 | \n", "24 | \n", "748034 | \n", "Romania (dezambiguizare) | \n", "România | \n", "
21 | \n", "49 | \n", "66833 | \n", "Religii | \n", "Religie | \n", "
22 | \n", "71 | \n", "2461256 | \n", "GPL (licență) | \n", "GPL (licenţă, versiunea 2) | \n", "
23 | \n", "71 | \n", "2896707 | \n", "GPL (licență) | \n", "GPL (licenţă, versiunea 2) | \n", "
24 | \n", "71 | \n", "2896715 | \n", "GPL (licență) | \n", "GPL (licenţă, versiunea 2) | \n", "
25 | \n", "71 | \n", "2986782 | \n", "GPL (licență) | \n", "Licenţa_Publică_Generală_GNU | \n", "
26 | \n", "71 | \n", "4044515 | \n", "GPL (licență) | \n", "Licența Publică Generală GNU | \n", "
27 | \n", "71 | \n", "4184309 | \n", "GPL (licență) | \n", "Licența Publică Generală GNU | \n", "
28 | \n", "76 | \n", "66859 | \n", "Limba valonă | \n", "Limba văleană | \n", "
29 | \n", "80 | \n", "66862 | \n", "Nazismul | \n", "Nazism | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
424323 | \n", "2423632 | \n", "12985859 | \n", "Crinul tigru | \n", "Lilium lancifolium | \n", "
424324 | \n", "2423754 | \n", "12986164 | \n", "Reclamă de Televiziune | \n", "Reclamă de televiziune | \n", "
424325 | \n", "2423877 | \n", "12986435 | \n", "Nomenclatorul drumurilor naționale din România | \n", "Lista drumurilor naționale din România | \n", "
424326 | \n", "2423919 | \n", "12986606 | \n", "NGC 242 | \n", "NGC 241 | \n", "
424327 | \n", "2423921 | \n", "12986619 | \n", "Iacob I de Aragon | \n", "Iacob I al Aragonului | \n", "
424328 | \n", "2423972 | \n", "12986802 | \n", "DN 5 | \n", "DN5 | \n", "
424329 | \n", "2424061 | \n", "12987400 | \n", "After School (grupă) | \n", "After School (grup muzical) | \n", "
424330 | \n", "2424071 | \n", "12987453 | \n", "Lactarius deterrimus | \n", "Râșcov de molid | \n", "
424331 | \n", "2424153 | \n", "12987875 | \n", "Dumbo (povestiri) | \n", "Dumbo (carte) | \n", "
424332 | \n", "2424173 | \n", "12988014 | \n", "Liudmila Saveleva | \n", "Liudmila Savelieva | \n", "
424333 | \n", "2424181 | \n", "12988040 | \n", "Reeducarea în România Comunistă | \n", "Reeducarea în România comunistă | \n", "
424334 | \n", "2424230 | \n", "12988331 | \n", "James B. Conant | \n", "James Bryant Conant | \n", "
424335 | \n", "2424236 | \n", "12988655 | \n", "Dirijor Mircea Popa | \n", "Mircea Popa (dirijor) | \n", "
424336 | \n", "2424257 | \n", "12989799 | \n", "Nürburg (Gemeinde) | \n", "Nürburg (comună) | \n", "
424337 | \n", "2424323 | \n", "12990052 | \n", "Hachioji | \n", "Hachiōji | \n", "
424338 | \n", "2424380 | \n", "12992295 | \n", "Procedeu Edeleanu | \n", "Proces Edeleanu | \n", "
424339 | \n", "2424381 | \n", "12992303 | \n", "Procedeu Leblanc | \n", "Proces Leblanc | \n", "
424340 | \n", "2424385 | \n", "12992662 | \n", "Vasile Șova | \n", "Vasilii Șova | \n", "
424341 | \n", "2424387 | \n", "12992922 | \n", "Oxid feric | \n", "Oxid de fier (III) | \n", "
424342 | \n", "2424388 | \n", "12992942 | \n", "Oxid feros | \n", "Oxid de fier (II) | \n", "
424343 | \n", "2424418 | \n", "12998633 | \n", "Charles Frederic Gerhardt | \n", "Charles Frédéric Gerhardt | \n", "
424344 | \n", "2424477 | \n", "13004007 | \n", "WP:PARITY | \n", "Wikipedia:Opinii marginale#Paritatea surselor | \n", "
424345 | \n", "2424494 | \n", "13008956 | \n", "Mariquina, Chile | \n", "Mariquina | \n", "
424346 | \n", "2424546 | \n", "13012179 | \n", "ARM | \n", "Arm (dezambiguizare) | \n", "
424347 | \n", "2424581 | \n", "13012369 | \n", "Acid fosfonic | \n", "Fosfonat | \n", "
424348 | \n", "2424595 | \n", "13012445 | \n", "Polimer de condensare | \n", "Policondensare | \n", "
424349 | \n", "2424683 | \n", "13013136 | \n", "UEFA Champions League 2018-2019 | \n", "Liga Campionilor 2018-2019 | \n", "
424350 | \n", "2424717 | \n", "13013430 | \n", "Galla, soția lui Teodosiu I | \n", "Galla (soția lui Teodosiu I) | \n", "
424351 | \n", "2424761 | \n", "13013681 | \n", "SSC Farul Constanța | \n", "FC Farul Constanța | \n", "
424352 | \n", "2424870 | \n", "13014422 | \n", "Swaziland | \n", "Eswatini | \n", "
424353 rows × 4 columns
\n", "