For a given wiki we want to find all revisions that are redirects. We query wmf.mediawiki_wikitext_history.
How:
import os
from pyspark.sql.functions import col,regexp_extract ## for extracting redirect-page
from redirect import get_redirect_aliases ## for getting redirect-aliases
## select the wmf table
sqlContext.sql('USE wmf')
wiki_date = '2019-07'
wiki_name = 'rowiki'
## we look up the json of the latest siteinfo-namespaces file
filename = os.path.join('/mnt','data','xmldatadumps','public','%s'%wiki_name,'latest','%s-latest-siteinfo-namespaces.json.gz'%wiki_name)
list_redirect_aliases = get_redirect_aliases(filename)
print(list_redirect_aliases)
## construct the string-matching condition
str_revision_redirect_match = '('
for i_redirect_alias, redirect_alias in enumerate(list_redirect_aliases):
if i_redirect_alias > 0:
str_revision_redirect_match += ' OR '
str_revision_redirect_match += 'LOWER(revision_text) LIKE "%s [[%%]]%%"'%(redirect_alias.lower())
str_revision_redirect_match += ')'
print(str_revision_redirect_match)
## query all revision which start with '#redirect [[*]]' or one of its redirect-aliases
query = 'SELECT page_id, revision_id, page_title, revision_text \
FROM wmf.mediawiki_wikitext_history \
WHERE snapshot="%s" \
AND wiki_db="%s" \
AND page_namespace = 0 \
AND %s \
ORDER BY page_id, revision_id \
'\
%(wiki_date,wiki_name,str_revision_redirect_match)
print(query)
result = sqlContext.sql(query)
## extracting the page [[ ]] where it redirects to
## drop the revision-text (too much information)
df_final = result.withColumn('revision_redirect_page',regexp_extract(col('revision_text'), '\[\[(.*?)\]\]',1 ) )\
.drop('revision_text')\
.toPandas()
## save result to csv-dataframe
df_final.to_csv(df_redirect_revision_%s.csv'%wiki_name)
df_final