{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get all revisions that are redirects\n", "\n", "For a given wiki we want to find all revisions that are redirects.\n", "We query [wmf.mediawiki_wikitext_history](https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Content/XMLDumps/Mediawiki_wikitext_history).\n", "\n", "How:\n", "- get all redirect-aliases\n", " - for any wiki we can extract all redirect-aliases from '%s-latest-siteinfo-namespaces.json.gz'\n", "- query all revision-ids that are redirects\n", " - we look at the column 'revision_text'\n", " - we check whether the it starts with the string '#redirect [[*]]' or any of its aliases using sql's LIKE command\n", " - we only consider lower-case strings (syntax agnostic to capitalization)\n", "- regexp-extract to get redirect-page from text-field" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "from pyspark.sql.functions import col,regexp_extract ## for extracting redirect-page\n", "from redirect import get_redirect_aliases ## for getting redirect-aliases" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataFrame[]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## select the wmf table\n", "sqlContext.sql('USE wmf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Which wiki" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "wiki_date = '2019-07'\n", "wiki_name = 'rowiki'\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get all redirect - aliases" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['#REDIRECT', '#REDIRECTEAZA']\n" ] } ], "source": [ "## we look up the json of the latest siteinfo-namespaces file\n", "filename = os.path.join('/mnt','data','xmldatadumps','public','%s'%wiki_name,'latest','%s-latest-siteinfo-namespaces.json.gz'%wiki_name)\n", "list_redirect_aliases = get_redirect_aliases(filename)\n", "print(list_redirect_aliases)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Query\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(LOWER(revision_text) LIKE \"#redirect [[%]]%\" OR LOWER(revision_text) LIKE \"#redirecteaza [[%]]%\")\n" ] } ], "source": [ "## construct the string-matching condition\n", "str_revision_redirect_match = '('\n", "for i_redirect_alias, redirect_alias in enumerate(list_redirect_aliases):\n", " if i_redirect_alias > 0:\n", " str_revision_redirect_match += ' OR '\n", " str_revision_redirect_match += 'LOWER(revision_text) LIKE \"%s [[%%]]%%\"'%(redirect_alias.lower())\n", "str_revision_redirect_match += ')'\n", "print(str_revision_redirect_match)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SELECT page_id, revision_id, page_title, revision_text FROM wmf.mediawiki_wikitext_history WHERE snapshot=\"2019-07\" AND wiki_db=\"rowiki\" AND page_namespace = 0 AND (LOWER(revision_text) LIKE \"#redirect [[%]]%\" OR LOWER(revision_text) LIKE \"#redirecteaza [[%]]%\") ORDER BY page_id, revision_id \n" ] } ], "source": [ "## query all revision which start with '#redirect [[*]]' or one of its redirect-aliases\n", "\n", "query = 'SELECT page_id, revision_id, page_title, revision_text \\\n", "FROM wmf.mediawiki_wikitext_history \\\n", "WHERE snapshot=\"%s\" \\\n", "AND wiki_db=\"%s\" \\\n", "AND page_namespace = 0 \\\n", "AND %s \\\n", "ORDER BY page_id, revision_id \\\n", "'\\\n", "%(wiki_date,wiki_name,str_revision_redirect_match)\n", "print(query)\n", "\n", "result = sqlContext.sql(query)\n", "\n", "## extracting the page [[ ]] where it redirects to\n", "## drop the revision-text (too much information)\n", "\n", "df_final = result.withColumn('revision_redirect_page',regexp_extract(col('revision_text'), '\\[\\[(.*?)\\]\\]',1 ) )\\\n", ".drop('revision_text')\\\n", ".toPandas()\n", "\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "## save result to csv-dataframe\n", "df_final.to_csv(df_redirect_revision_%s.csv'%wiki_name)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
page_idrevision_idpage_titlerevision_redirect_page
0966798FizicaFizică
191303010FizicaFizică
291705699FizicaFizică
392126089FizicaFizică
495693014FizicaFizică
5910723388FizicaFizică
6131962LimbiListă de limbi
71366801LimbiLimbă
8133020259LimbiLimbă (dezambiguizare)
9133914562LimbiLimbă
10138881133LimbiLimbă (dezambiguizare)
111466802LingvisticaLingvistică
121566803Limba romanaLimba română
131766805Arte martialeArte marţiale
1417232040Arte martialeArtă marţială
15174622678Arte martialeArtă marțială
161712196093Arte martialeArtele marțiale
171712206268Arte martialeArte marțiale
182466811Romania (dezambiguizare)România
1924747673Romania (dezambiguizare)România (dezambiguizare)
2024748034Romania (dezambiguizare)România
214966833ReligiiReligie
22712461256GPL (licență)GPL (licenţă, versiunea 2)
23712896707GPL (licență)GPL (licenţă, versiunea 2)
24712896715GPL (licență)GPL (licenţă, versiunea 2)
25712986782GPL (licență)Licenţa_Publică_Generală_GNU
26714044515GPL (licență)Licența Publică Generală GNU
27714184309GPL (licență)Licența Publică Generală GNU
287666859Limba valonăLimba văleană
298066862NazismulNazism
...............
424323242363212985859Crinul tigruLilium lancifolium
424324242375412986164Reclamă de TeleviziuneReclamă de televiziune
424325242387712986435Nomenclatorul drumurilor naționale din RomâniaLista drumurilor naționale din România
424326242391912986606NGC 242NGC 241
424327242392112986619Iacob I de AragonIacob I al Aragonului
424328242397212986802DN 5DN5
424329242406112987400After School (grupă)After School (grup muzical)
424330242407112987453Lactarius deterrimusRâșcov de molid
424331242415312987875Dumbo (povestiri)Dumbo (carte)
424332242417312988014Liudmila SavelevaLiudmila Savelieva
424333242418112988040Reeducarea în România ComunistăReeducarea în România comunistă
424334242423012988331James B. ConantJames Bryant Conant
424335242423612988655Dirijor Mircea PopaMircea Popa (dirijor)
424336242425712989799Nürburg (Gemeinde)Nürburg (comună)
424337242432312990052HachiojiHachiōji‎
424338242438012992295Procedeu EdeleanuProces Edeleanu
424339242438112992303Procedeu LeblancProces Leblanc
424340242438512992662Vasile ȘovaVasilii Șova
424341242438712992922Oxid fericOxid de fier (III)
424342242438812992942Oxid ferosOxid de fier (II)
424343242441812998633Charles Frederic GerhardtCharles Frédéric Gerhardt
424344242447713004007WP:PARITYWikipedia:Opinii marginale#Paritatea surselor
424345242449413008956Mariquina, ChileMariquina
424346242454613012179ARMArm (dezambiguizare)
424347242458113012369Acid fosfonicFosfonat
424348242459513012445Polimer de condensarePolicondensare
424349242468313013136UEFA Champions League 2018-2019Liga Campionilor 2018-2019
424350242471713013430Galla, soția lui Teodosiu IGalla (soția lui Teodosiu I)
424351242476113013681SSC Farul ConstanțaFC Farul Constanța
424352242487013014422SwazilandEswatini
\n", "

424353 rows × 4 columns

\n", "
" ], "text/plain": [ " page_id revision_id page_title \\\n", "0 9 66798 Fizica \n", "1 9 1303010 Fizica \n", "2 9 1705699 Fizica \n", "3 9 2126089 Fizica \n", "4 9 5693014 Fizica \n", "5 9 10723388 Fizica \n", "6 13 1962 Limbi \n", "7 13 66801 Limbi \n", "8 13 3020259 Limbi \n", "9 13 3914562 Limbi \n", "10 13 8881133 Limbi \n", "11 14 66802 Lingvistica \n", "12 15 66803 Limba romana \n", "13 17 66805 Arte martiale \n", "14 17 232040 Arte martiale \n", "15 17 4622678 Arte martiale \n", "16 17 12196093 Arte martiale \n", "17 17 12206268 Arte martiale \n", "18 24 66811 Romania (dezambiguizare) \n", "19 24 747673 Romania (dezambiguizare) \n", "20 24 748034 Romania (dezambiguizare) \n", "21 49 66833 Religii \n", "22 71 2461256 GPL (licență) \n", "23 71 2896707 GPL (licență) \n", "24 71 2896715 GPL (licență) \n", "25 71 2986782 GPL (licență) \n", "26 71 4044515 GPL (licență) \n", "27 71 4184309 GPL (licență) \n", "28 76 66859 Limba valonă \n", "29 80 66862 Nazismul \n", "... ... ... ... \n", "424323 2423632 12985859 Crinul tigru \n", "424324 2423754 12986164 Reclamă de Televiziune \n", "424325 2423877 12986435 Nomenclatorul drumurilor naționale din România \n", "424326 2423919 12986606 NGC 242 \n", "424327 2423921 12986619 Iacob I de Aragon \n", "424328 2423972 12986802 DN 5 \n", "424329 2424061 12987400 After School (grupă) \n", "424330 2424071 12987453 Lactarius deterrimus \n", "424331 2424153 12987875 Dumbo (povestiri) \n", "424332 2424173 12988014 Liudmila Saveleva \n", "424333 2424181 12988040 Reeducarea în România Comunistă \n", "424334 2424230 12988331 James B. Conant \n", "424335 2424236 12988655 Dirijor Mircea Popa \n", "424336 2424257 12989799 Nürburg (Gemeinde) \n", "424337 2424323 12990052 Hachioji \n", "424338 2424380 12992295 Procedeu Edeleanu \n", "424339 2424381 12992303 Procedeu Leblanc \n", "424340 2424385 12992662 Vasile Șova \n", "424341 2424387 12992922 Oxid feric \n", "424342 2424388 12992942 Oxid feros \n", "424343 2424418 12998633 Charles Frederic Gerhardt \n", "424344 2424477 13004007 WP:PARITY \n", "424345 2424494 13008956 Mariquina, Chile \n", "424346 2424546 13012179 ARM \n", "424347 2424581 13012369 Acid fosfonic \n", "424348 2424595 13012445 Polimer de condensare \n", "424349 2424683 13013136 UEFA Champions League 2018-2019 \n", "424350 2424717 13013430 Galla, soția lui Teodosiu I \n", "424351 2424761 13013681 SSC Farul Constanța \n", "424352 2424870 13014422 Swaziland \n", "\n", " revision_redirect_page \n", "0 Fizică \n", "1 Fizică \n", "2 Fizică \n", "3 Fizică \n", "4 Fizică \n", "5 Fizică \n", "6 Listă de limbi \n", "7 Limbă \n", "8 Limbă (dezambiguizare) \n", "9 Limbă \n", "10 Limbă (dezambiguizare) \n", "11 Lingvistică \n", "12 Limba română \n", "13 Arte marţiale \n", "14 Artă marţială \n", "15 Artă marțială \n", "16 Artele marțiale \n", "17 Arte marțiale \n", "18 România \n", "19 România (dezambiguizare) \n", "20 România \n", "21 Religie \n", "22 GPL (licenţă, versiunea 2) \n", "23 GPL (licenţă, versiunea 2) \n", "24 GPL (licenţă, versiunea 2) \n", "25 Licenţa_Publică_Generală_GNU \n", "26 Licența Publică Generală GNU \n", "27 Licența Publică Generală GNU \n", "28 Limba văleană \n", "29 Nazism \n", "... ... \n", "424323 Lilium lancifolium \n", "424324 Reclamă de televiziune \n", "424325 Lista drumurilor naționale din România \n", "424326 NGC 241 \n", "424327 Iacob I al Aragonului \n", "424328 DN5 \n", "424329 After School (grup muzical) \n", "424330 Râșcov de molid \n", "424331 Dumbo (carte) \n", "424332 Liudmila Savelieva \n", "424333 Reeducarea în România comunistă \n", "424334 James Bryant Conant \n", "424335 Mircea Popa (dirijor) \n", "424336 Nürburg (comună) \n", "424337 Hachiōji‎ \n", "424338 Proces Edeleanu \n", "424339 Proces Leblanc \n", "424340 Vasilii Șova \n", "424341 Oxid de fier (III) \n", "424342 Oxid de fier (II) \n", "424343 Charles Frédéric Gerhardt \n", "424344 Wikipedia:Opinii marginale#Paritatea surselor \n", "424345 Mariquina \n", "424346 Arm (dezambiguizare) \n", "424347 Fosfonat \n", "424348 Policondensare \n", "424349 Liga Campionilor 2018-2019 \n", "424350 Galla (soția lui Teodosiu I) \n", "424351 FC Farul Constanța \n", "424352 Eswatini \n", "\n", "[424353 rows x 4 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_final" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN (large)", "language": "python", "name": "spark_yarn_pyspark_large" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }