"""Merge duplicate rows of the WLM "ge (ka)" type mapping table.

Fetches the wiki table from Wikidata, normalises the type name of every
row with ``clean_type``, merges rows that end up with the same name
(summing their counts), sorts the merged rows by count in descending order
and writes the rebuilt page text to ``tmp.wiki``.
"""
from collections import OrderedDict

import pywikibot as pwb

site = pwb.Site('wikidata', 'wikidata')
page = pwb.Page(site, 'Wikidata:WikiProject WLM/Mapping tables/ge (ka)/types')
contents = page.get()

# Everything before the first row marker is kept as the table header and
# everything from the last '|}' onwards is kept as the footer.
header, sep, rest = contents.partition('|-')
rest, sep, footer = rest.rpartition('|}')
footer = '|}' + footer

NATIONAL_IMPORTANCE_STR = "ეროვნული"


def clean_type(text):
    """
    Return a cleaned, normalised version of a raw type string.

    Multiple types may exist, separated either by a line break or by ",".
    Types may include NATIONAL_IMPORTANCE_STR, which should be used only
    for heritage status and is therefore dropped here.

    :param text: raw contents of the type cell.
    :return: the lower-cased, stripped, non-empty types joined by ", ".
    """
    # NOTE(review): the first separator is a raw line break in the source
    # this was recovered from; confirm it is not meant to be e.g. "<br/>".
    raw_type = text.lower().replace("\n", ",")
    # Drop empty entries and *every* occurrence of the importance marker
    # (list.remove() would only have dropped the first one).
    types = [
        typ
        for typ in (piece.strip() for piece in raw_type.split(','))
        if typ and typ != NATIONAL_IMPORTANCE_STR
    ]
    return ', '.join(types)


# Merge rows sharing the same cleaned type name.
d = {}
for entry in rest.split('|-'):
    # Cells of a row are introduced by a newline followed by '|';
    # parts[0] is whatever trails the row marker itself.
    parts = entry.split('\n|')
    orig_name = parts[1].strip()
    name = clean_type(orig_name)
    num = parts[2].strip() or "0"  # an empty count cell counts as zero
    qid = parts[3].strip()
    com = parts[4].strip()

    merged = d.setdefault(
        name, {'num': 0, 'qid': '', 'com': '', 'orig': []})
    merged['num'] += int(num)

    # Report conflicts: two merged rows both carry a value for the same
    # column. The first value seen wins.
    if qid and merged['qid']:
        print('doh qid: {} {}'.format(qid, merged['qid']))
    if com and merged['com']:
        print('doh com: {} {}'.format(com, merged['com']))
    merged['qid'] = merged['qid'] or qid
    merged['com'] = merged['com'] or com
    # The original spellings are collected but not written to the output.
    merged['orig'].append(orig_name)

# Largest counts first; sorted() is stable, so ties keep first-seen order.
od = OrderedDict(sorted(d.items(), key=lambda t: t[1]['num'], reverse=True))

txt = ''.join(
    '|- \n| {}\n| {}\n| {}\n| {}\n'.format(k, v['num'], v['qid'], v['com'])
    for k, v in od.items()
)

# The rebuilt rows live in ``txt``; the previous code referenced an
# undefined name ``text`` here and failed with NameError.
page_text = header + txt + footer
with open('tmp.wiki', 'w', encoding='utf-8') as f:
    f.write(page_text)