{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "topic-ids-overview",
   "metadata": {},
   "source": [
    "# Extract topic ids from a Flow XML dump\n",
    "\n",
    "Streams the bz2-compressed dump `ptwikibooks-latest-flow.xml.bz2` and writes the `id` attribute of every `topic` element to `topics.txt`, one id per line."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e293777-488b-44ce-b8c9-9a49fb341521",
   "metadata": {},
   "outputs": [],
   "source": [
    "import bz2\n",
    "import re\n",
    "# xml.etree.cElementTree was removed in Python 3.9; ElementTree picks the C accelerator itself.\n",
    "import xml.etree.ElementTree as ElementTree\n",
    "\n",
    "# Compressed XML dump to scan, and the file that receives one topic id per line.\n",
    "inputfile = 'ptwikibooks-latest-flow.xml.bz2'\n",
    "outputfile = 'topics.txt'\n",
    "\n",
    "# XML namespace of the dump; ElementTree reports tags as '{namespace}name'.\n",
    "ns = 'http://www.mediawiki.org/xml/flow-1.0/'\n",
    "bracketed_ns = '{%s}' % (ns,)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "topic-ids-helpers",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tag( name ):\n",
    "    \"\"\"Return `name` qualified with the dump namespace, as '{namespace}name'.\"\"\"\n",
    "    return f'{ bracketed_ns }{name}'\n",
    "\n",
    "\n",
    "def bare_tag( ns_tag ):\n",
    "    \"\"\"Return `ns_tag` with every occurrence of the '{namespace}' marker removed.\"\"\"\n",
    "    return ns_tag.replace( bracketed_ns, '' )\n",
    "\n",
    "\n",
    "def readStream( xmlfile ):\n",
    "    \"\"\"Yield (event, element) pairs for the 'start' and 'end' of every element.\n",
    "\n",
    "    The bz2-compressed file `xmlfile` is decompressed and parsed incrementally;\n",
    "    it stays open until the generator is exhausted or closed.\n",
    "    \"\"\"\n",
    "    with bz2.open( xmlfile, 'rb' ) as file:\n",
    "        yield from ElementTree.iterparse( file, events=( 'start', 'end' ) )\n",
    "\n",
    "\n",
    "def readTagStream( stream, tags ):\n",
    "    \"\"\"Yield each completed element whose namespace-stripped tag is in `tags`.\n",
    "\n",
    "    `stream` is an iterable of (event, element) pairs such as `readStream`\n",
    "    produces. A matching element is yielded on its 'end' event, once its\n",
    "    subtree is complete. Any element that ends while no matching element is\n",
    "    open is cleared, so a yielded element may be emptied as soon as the\n",
    "    generator is resumed: read what you need from it before asking for the\n",
    "    next one.\n",
    "    \"\"\"\n",
    "    open_matches = 0  # number of matching elements currently open\n",
    "    for event, element in stream:\n",
    "        if event == 'start':\n",
    "            if bare_tag( element.tag ) in tags:\n",
    "                open_matches += 1\n",
    "        elif event == 'end':\n",
    "            if bare_tag( element.tag ) in tags:\n",
    "                yield element\n",
    "                open_matches -= 1\n",
    "                assert open_matches >= 0\n",
    "            if open_matches == 0:\n",
    "                # Outside every matching element: drop its text, attributes and children.\n",
    "                element.clear()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "topic-ids-export",
   "metadata": {},
   "outputs": [],
   "source": [
    "stream = readTagStream( readStream( inputfile ), [ 'topic' ] )\n",
    "# Lazy on purpose: each id is read while its element is still populated.\n",
    "topics = ( e.get( 'id' ) for e in stream )\n",
    "\n",
    "with open( outputfile, 'w', encoding='utf-8' ) as file:\n",
    "    for topic in topics:\n",
    "        file.write( topic + '\\n' )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}