""" Just get some status for PDF rendering """ import io import pprint import random import requests from pdfminer.high_level import extract_text stats = { '200s': 0, '4xx': 0, '5xx': 0, 'valid': 0, 'invalid': 0, 'cl-matches-bytes': 0, 'cl-fails-bytes': 0, } WIKIS = ['en', 'es', 'el', 'it', 'fr', 'simple', 'de', 'ar', 'bg', 'no', 'tr'] for i in range(100): wiki = random.choice(WIKIS) r = requests.get('https://{}.wikipedia.org/api/rest_v1/page/random/title'.format(wiki)) title = r.json()['items'][0]['title'] print('Random page title: {}'.format(title)) pdf = requests.get('https://en.wikipedia.org/api/rest_v1/page/pdf/{}'.format(title)) code = pdf.status_code if code == 200: stats['200s'] += 1 elif 400 <= code < 500: stats['4xx'] += 1 if 500 <= code < 600: stats['5xx'] += 1 if int(pdf.headers['content-length']) == len(pdf.content): stats['cl-matches-bytes'] += 1 else: stats['cl-fails-bytes'] += 1 f = io.BytesIO(pdf.content) try: extract_text(f) stats['valid'] += 1 except : stats['invalid'] += 1 pprint.pprint(stats)