{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Joining the redirect-dataframe into the mediawiki history" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import os" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## join:\n", "\n", "Left: the mediawiki-history\n", "Right: the historical redirect\n", "\n", "- Left out join: keep rows with keys in left dataset" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "wiki_name = 'rowiki'\n", "wiki_date = '2019-07'" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "## register historical redirect as tabel\n", "filename_read = 'output/df_redirect_%s_%s.parquet'%(wiki_name,wiki_date)\n", "df_redirects = spark.read.load(filename_read, format='parquet')#.toPandas()\n", "# sqlContext.registerDataFrameAsTable(df, \"redirects\")\n", "\n", "## renaming column names for join later\n", "df_redirects = df_redirects.withColumnRenamed('page_id','page_id_tmp')\n", "df_redirects = df_redirects.withColumnRenamed('revision_id','revision_id_tmp')" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------+---------------+----------------+--------------------+\n", "|page_id_tmp|revision_id_tmp|redirect_command| redirect_page_title|\n", "+-----------+---------------+----------------+--------------------+\n", "| 2101801| 11527908| #REDIRECTEAZA|Comuna Comarova, ...|\n", "| 2131631| 11569024| #REDIRECTEAZA|Format:Comuna Hor...|\n", "| 2408163| 12874394| #redirect|Bară oblică spre ...|\n", "| 2251360| 11777589| #REDIRECTEAZA|Discuție:Nîjnie S...|\n", "| 2034537| 11416020| #redirect| Făgețelu, Olt|\n", "| 2209480| 11674505| #REDIRECTEAZA|Voloske, Dniprope...|\n", "| 2138801| 11577787| #REDIRECTEAZA|Wikipedia:Pagini ...|\n", "| 2134783| 11572282| #REDIRECTEAZA|Format:Localități...|\n", "| 2324299| 12242808| #REDIRECTEAZA| Ioan Drăghiciu|\n", "| 2104106| 11530336| #REDIRECTEAZA|Comuna Verșați, C...|\n", "+-----------+---------------+----------------+--------------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "df_redirects.show(10)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "sqlContext.sql('USE wmf')\n", "query = 'SELECT * \\\n", "FROM wmf.mediawiki_history \\\n", "WHERE snapshot=\"%s\" \\\n", "AND wiki_db=\"%s\" '\\\n", "%(wiki_date,wiki_name)\n", "df_revisions = sqlContext.sql(query)\n", "# sqlContext.registerDataFrameAsTable(df, \"revisions\")" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "join_expression = df_revisions['revision_id'] == df_redirects['revision_id_tmp']\n", "## note that this will lead to duplicate column names which will become problematic when referring to them\n", "## instead change to str or seq instead of boolean\n", "\n", "join_type = 'left_outer'\n", "\n", "df_join = df_revisions.join(df_redirects,join_expression,join_type).drop('revision_id_tmp','revision_id_tmp')" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# df_join.show(10)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+-----------+----------------+----------------+-------------------+\n", "|page_id|revision_id|page_is_redirect|redirect_command|redirect_page_title|\n", "+-------+-----------+----------------+----------------+-------------------+\n", "|2012614| 26| false| null| null|\n", "| 22| 29| false| null| null|\n", "| 27| 474| false| null| null|\n", "|1968737| 964| false| null| null|\n", "| 609| 1677| false| null| null|\n", "| 618| 1697| false| null| null|\n", "| 765| 1806| false| null| null|\n", "| 799| 1950| false| null| null|\n", "| 882| 2040| false| null| null|\n", "|1968737| 2214| false| null| null|\n", "| 1118| 2250| false| null| null|\n", "| 1066| 2453| false| null| null|\n", "| 1553| 2509| false| null| null|\n", "| 1579| 2529| false| null| null|\n", "| 1864| 2927| false| null| null|\n", "|1968737| 3091| false| null| null|\n", "| 2019| 3506| false| null| null|\n", "| 2098| 3764| false| null| null|\n", "| 2622| 4590| false| null| null|\n", "| 2033| 4823| false| null| null|\n", "|1968737| 4894| false| null| null|\n", "| 1763| 5385| false| null| null|\n", "| 1787| 5409| false| null| null|\n", "| 2342| 5556| false| null| null|\n", "| 3333| 6721| false| null| null|\n", "| 56| 7225| false| null| null|\n", "| 3966| 7279| false| null| null|\n", "| 3818| 7747| false| null| null|\n", "| 3711| 8075| false| null| null|\n", "| 3269| 8440| false| null| null|\n", "| 3227| 8484| false| null| null|\n", "| 1936| 9233| false| null| null|\n", "| 105| 9458| false| null| null|\n", "| 246| 9715| false| null| null|\n", "| 1713| 9945| false| null| null|\n", "|1968737| 9968| false| null| null|\n", "|1968737| 9978| false| null| null|\n", "| 5548| 10156| false| null| null|\n", "| 5802| 10422| false| null| null|\n", "| 1713| 10871| false| null| null|\n", "| 6484| 10959| false| null| null|\n", "| 666| 11190| false| null| null|\n", "| 5045| 11276| false| null| null|\n", "| 6546| 11434| true| null| null|\n", "| 6626| 11567| false| null| null|\n", "| 6649| 11619| false| null| null|\n", "| 6634| 11625| false| null| null|\n", "| 2159| 11938| false| null| null|\n", "| 5518| 11945| false| null| null|\n", "| 5544| 12044| false| null| null|\n", "| 6890| 12568| false| null| null|\n", "| 6484| 13098| false| null| null|\n", "| 5030| 13248| false| null| null|\n", "| 73| 13401| false| null| null|\n", "| 1544| 13452| false| null| null|\n", "| null| 13460| null| null| null|\n", "| 4031| 13518| false| null| null|\n", "| 5039| 13638| false| null| null|\n", "| 7167| 13723| false| null| null|\n", "| 2371| 14117| false| null| null|\n", "| 5392| 14719| false| null| null|\n", "| 7798| 14846| false| null| null|\n", "| 7853| 15057| false| null| null|\n", "| 8051| 15173| false| null| null|\n", "| 7805| 15194| false| null| null|\n", "|1968737| 15237| false| null| null|\n", "| 4021| 15322| false| null| null|\n", "| 8215| 15371| false| null| null|\n", "| 8230| 15375| true| #REDIRECT| Corint|\n", "| 4209| 15432| false| null| null|\n", "| 2435| 15437| false| null| null|\n", "| 8058| 15663| false| null| null|\n", "| null| 15846| null| null| null|\n", "| 9325| 16530| false| null| null|\n", "| 9227| 16597| false| null| null|\n", "| 9134| 16742| false| null| null|\n", "| 9016| 16896| false| null| null|\n", "| 9147| 17043| false| null| null|\n", "| 9148| 17048| false| null| null|\n", "| 9255| 17499| false| null| null|\n", "| 9399| 17703| false| null| null|\n", "| 5669| 17971| false| null| null|\n", "| 9023| 17979| false| null| null|\n", "| 3378| 18147| false| null| null|\n", "| 76| 18196| false| null| null|\n", "| 1579| 18295| false| null| null|\n", "| 7059| 18348| false| null| null|\n", "| 6891| 18628| false| null| null|\n", "| 2081| 18730| false| null| null|\n", "| 5137| 19141| false| null| null|\n", "| 5766| 19158| false| null| null|\n", "| 9861| 19163| false| null| null|\n", "| 10397| 19771| false| null| null|\n", "| 10431| 19907| false| null| null|\n", "| null| 19979| null| null| null|\n", "| 10244| 20532| false| null| null|\n", "| 10559| 21209| false| null| null|\n", "| 225| 21223| false| null| null|\n", "| 10567| 21342| false| null| null|\n", "| 10534| 21899| false| null| null|\n", "+-------+-----------+----------------+----------------+-------------------+\n", "only showing top 100 rows\n", "\n" ] } ], "source": [ "## get some entries (most will not be revision with redirect)\n", "df_join[['page_id','revision_id','page_is_redirect','redirect_command','redirect_page_title']].show(100)" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+-----------+--------------------+----------------+----------------+--------------------+\n", "|page_id|revision_id| page_title|page_is_redirect|redirect_command| redirect_page_title|\n", "+-------+-----------+--------------------+----------------+----------------+--------------------+\n", "| 8230| 15375| Korinthos| true| #REDIRECT| Corint|\n", "| 8501| 74512| Thessalonica| true| #REDIRECT| Salonic|\n", "| 10768| 76584|Partidul_Reformis...| true| #REDIRECT|Partidul Reformis...|\n", "| 13414| 79106| Nou-născut| false| #REDIRECT| Bebeluş|\n", "| 15723| 81068|Wikimedia_România...| true| #REDIRECT|Wikipedia:Wikimed...|\n", "| 17815| 82876| Gcbirzan| true| #REDIRECT| :w:en:User:Gcbirzan|\n", "| 29406| 123068|Liceul_„Gheoghe_Ș...| true| #redirect|Liceul Gheoghe Şi...|\n", "| 30461| 129421|Patrimoniul_mondi...| true| #REDIRECT|Locuri din patrim...|\n", "| 32884| 144245|Războiul_de_O_Sut...| true| #redirect|Războiul de o sut...|\n", "| 32955| 144705| Ștefan_Odobleja| false| #redirect| Stefan Odobleja|\n", "+-------+-----------+--------------------+----------------+----------------+--------------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "## get some entries revision ids that are redirects\n", "df_join[df_join['redirect_command'].isNotNull()][['page_id','revision_id','page_title','page_is_redirect','redirect_command','redirect_page_title']].show(10)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "533144" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## count how many revision ids there are that are redirects\n", "df_join[df_join['redirect_command'].isNotNull()][['page_id','revision_id','page_is_redirect','redirect_command','redirect_page_title']].count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "PySpark - YARN", "language": "python", "name": "spark_yarn_pyspark" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.3" } }, "nbformat": 4, "nbformat_minor": 2 }