"""Move plain-domain entries from MediaWiki:Spam-blacklist to BlockedExternalDomains.

The script downloads the wiki's regex-based spam blacklist and its JSON list of
blocked external domains.  Every blacklist line of the form ``\\bexample\\.com\\b``
(a literal host name wrapped in word boundaries, with only its dots escaped) is
removed from the blacklist and appended to the blocked-domain list.  Anything
more complex (comments, paths, real regex syntax) stays in the blacklist.

Two documents are printed to stdout: the reduced blacklist, then the extended
blocked-domain list as tab-indented JSON.
"""
import json
import re

WIKI_URL = 'https://en.wikipedia.beta.wmflabs.org'

# Seconds to wait for each HTTP request before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# Body of a migratable pattern: one or more characters, each either an escaped
# dot (``\.``) or a character with no special meaning in the blacklist regex.
# Using a positive full match (rather than searching for bad sequences) also
# rejects an empty body, a metacharacter in first position and a dangling
# trailing backslash.
_PLAIN_DOMAIN_RE = re.compile(r'(?:\\\.|[^\\/.$^{\[\]()|*+?])+')


def extract_plain_domain(line):
    """Return the literal domain a blacklist line stands for, or ``None``.

    A line qualifies only when, after stripping surrounding whitespace, it is
    ``\\b<body>\\b`` and ``<body>`` consists solely of ordinary characters and
    escaped dots.  Comment lines (starting with ``#``), lines containing a
    ``/`` and lines using any other regex syntax return ``None``.

    Args:
        line: One raw line of the spam blacklist, without its newline.

    Returns:
        The domain with the escaping backslashes removed, or ``None`` when the
        line must stay in the regex blacklist.
    """
    if line.startswith('#'):
        return None
    pattern = line.strip()
    if not (pattern.startswith('\\b') and pattern.endswith('\\b')):
        return None
    body = pattern[2:-2]
    if not _PLAIN_DOMAIN_RE.fullmatch(body):
        return None
    return body.replace('\\', '')


def split_blacklist(blacklist_text, blocked_domains, rev_id):
    """Separate migratable domains from the rest of the blacklist.

    Args:
        blacklist_text: Full text of the spam blacklist page.
        blocked_domains: Existing blocked-domain entries (a list of dicts with
            a ``'domain'`` key).  The list is not modified.
        rev_id: Revision id of the blacklist, recorded in each new entry's
            ``'notes'`` as a permalink.

    Returns:
        A ``(remaining_lines, merged_entries)`` tuple: the blacklist lines that
        were not migrated, and a new list holding the existing entries followed
        by one entry per newly migrated domain.  A domain that is already
        listed (or occurs twice in the blacklist) is dropped from the
        blacklist without adding a duplicate entry.
    """
    remaining_lines = []
    merged_entries = list(blocked_domains)
    known_domains = {
        entry.get('domain') for entry in merged_entries if isinstance(entry, dict)
    }
    note = 'Moved from [[Special:PermaLink/{}|MediaWiki:Spam-blacklist]]'.format(rev_id)

    for line in blacklist_text.split('\n'):
        host = extract_plain_domain(line)
        if host is None:
            remaining_lines.append(line)
            continue
        if host in known_domains:
            continue
        known_domains.add(host)
        merged_entries.append({'domain': host, 'notes': note})

    return remaining_lines, merged_entries


def main():
    """Fetch both pages from the wiki, run the migration and print the results."""
    # Third-party dependency, imported here so the pure helpers above can be
    # imported and tested on a machine without ``requests`` installed.
    import requests

    blacklist_response = requests.get(
        WIKI_URL + '/wiki/MediaWiki:Spam-blacklist?action=raw',
        timeout=REQUEST_TIMEOUT,
    )
    blacklist_response.raise_for_status()

    blocked_response = requests.get(
        WIKI_URL + '/wiki/MediaWiki:BlockedExternalDomains.json?action=raw',
        timeout=REQUEST_TIMEOUT,
    )
    if blocked_response.status_code == 404:
        # The JSON page has not been created yet: start from an empty list.
        blocked_domains = []
    else:
        blocked_response.raise_for_status()
        blocked_domains = blocked_response.json()

    revision_response = requests.get(
        WIKI_URL + '/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'prop': 'revisions',
            'titles': 'MediaWiki:Spam-blacklist',
            'rvslots': 'main',
            # The parameter is ``rvprop``; the misspelt ``rvprops`` is ignored
            # by the API.
            'rvprop': 'ids',
        },
        timeout=REQUEST_TIMEOUT,
    )
    revision_response.raise_for_status()
    pages = revision_response.json()['query']['pages']
    # ``pages`` is keyed by page id; exactly one title was requested.
    rev_id = next(iter(pages.values()))['revisions'][0]['revid']

    remaining_lines, merged_entries = split_blacklist(
        blacklist_response.text, blocked_domains, rev_id
    )
    print('\n'.join(remaining_lines))
    print(json.dumps(merged_entries, ensure_ascii=False, indent='\t'))


if __name__ == '__main__':
    main()