"""Just get some basic stats from Proton externally.

Picks random pages from a handful of Wikipedias, requests each page through
the public REST API's ``page/pdf`` endpoint and tallies, both per wiki and
per HTTP status code:

* ``cl-matches-bytes`` / ``cl-fails-bytes``: whether the ``Content-Length``
  response header equals the number of body bytes received.
* ``valid`` / ``invalid``: whether pdfminer could extract text from the body.

The collected counters are pretty-printed once all samples are done.
"""
import io
import pprint
import random
import urllib.parse

import requests
from pdfminer.high_level import extract_text

WIKIS = ['en', 'es', 'el', 'it', 'fr', 'simple', 'de', 'ar', 'bg', 'no', 'tr']

# Number of random pages to sample in one run.
SAMPLES = 100

# Seconds to wait on each HTTP request. Kept generous so slow responses are
# still measured, while a dead connection cannot hang the run forever.
REQUEST_TIMEOUT = 120


def _new_counters():
    """Return a fresh, zeroed counter dict for one wiki or one status code."""
    return {
        'cl-matches-bytes': 0,
        'cl-fails-bytes': 0,
        'valid': 0,
        'invalid': 0,
    }


# Keyed by wiki code (str) and by HTTP status code (int) in the same dict.
stats = {}

for _ in range(SAMPLES):
    wiki = random.choice(WIKIS)
    wiki_stats = stats.setdefault(wiki, _new_counters())

    r = requests.get(
        'https://{}.wikipedia.org/api/rest_v1/page/random/title'.format(wiki),
        timeout=REQUEST_TIMEOUT,
    )
    title = r.json()['items'][0]['title']
    print('Random page title: {}'.format(title))

    # Percent-encode the title as a single path segment: a raw '/', '?', '#'
    # or '%' in a title would otherwise change the path or start a query
    # string, so the request would not be for the intended page.
    pdf = requests.get(
        'https://{}.wikipedia.org/api/rest_v1/page/pdf/{}'.format(
            wiki, urllib.parse.quote(title, safe='')),
        timeout=REQUEST_TIMEOUT,
    )
    code = pdf.status_code
    if code == 404:
        print('https://{}.wikipedia.org/api/rest_v1/page/pdf/{} 404ed'.format(wiki, title))
    code_stats = stats.setdefault(code, _new_counters())

    # The header may be absent (e.g. a chunked response); count that as a
    # mismatch instead of crashing and losing every sample gathered so far.
    # NOTE(review): requests decodes gzip/deflate bodies, so a compressed
    # response would also land in 'cl-fails-bytes' -- confirm that is intended.
    declared_length = pdf.headers.get('content-length')
    if declared_length is not None and int(declared_length) == len(pdf.content):
        code_stats['cl-matches-bytes'] += 1
        wiki_stats['cl-matches-bytes'] += 1
    else:
        code_stats['cl-fails-bytes'] += 1
        wiki_stats['cl-fails-bytes'] += 1

    # The body is parsed whatever the status code, so error responses are
    # tallied as 'invalid' under their own status code.
    try:
        extract_text(io.BytesIO(pdf.content))
    except Exception:
        # Any parser failure means the body is not a usable PDF. Catching
        # Exception (not a bare except) lets Ctrl-C still abort the run.
        code_stats['invalid'] += 1
        wiki_stats['invalid'] += 1
    else:
        code_stats['valid'] += 1
        wiki_stats['valid'] += 1

pprint.pprint(stats)