"""Sample random French Wikipedia pages and check that their PDF renderings parse.

For each sampled page the script asks the Wikimedia REST API for a random
title, downloads that page's PDF rendering, and runs pdfminer's
``extract_text`` over the bytes.  A page counts as "valid" when the text
extraction finishes without raising, and "invalid" otherwise.

Unlike the first interactive version of this loop (which used a bare
``except:``), a failed download and an unparsable document are told apart
and the reason is printed, so an "invalid" result can be diagnosed.
"""
import io
from urllib.parse import quote

import requests
from pdfminer.high_level import extract_text

API_ROOT = 'https://fr.wikipedia.org/api/rest_v1/page'
SAMPLE_SIZE = 20
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the run


def fetch_random_title(session):
    """Return the title of a random page as reported by the REST API.

    Raises:
        requests.RequestException: if the request fails or returns an HTTP
            error status.
    """
    response = session.get(
        '{}/random/title'.format(API_ROOT), timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return response.json()['items'][0]['title']


def pdf_failure_reason(session, title):
    """Download the PDF rendering of *title* and try to extract its text.

    Returns:
        ``None`` when the PDF was downloaded and parsed successfully,
        otherwise a short human-readable string describing what went wrong.
    """
    # Percent-encode the whole title (including "/") so that characters such
    # as "/", "?" or "#" cannot change the structure of the request URL.
    url = '{}/pdf/{}'.format(API_ROOT, quote(title, safe=''))

    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        # An HTTP error body is not a PDF; report it as a download failure
        # instead of handing it to the parser.
        response.raise_for_status()
    except requests.RequestException as exc:
        return 'download failed: {}'.format(exc)

    try:
        extract_text(io.BytesIO(response.content))
    except Exception as exc:
        # Deliberately broad: the parser can fail in many ways on malformed
        # input.  Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'parse failed: {}: {}'.format(type(exc).__name__, exc)

    return None


def main(sample_size=SAMPLE_SIZE):
    """Check *sample_size* random pages and return ``(valid, invalid)`` counts."""
    valid = 0
    invalid = 0

    # One session for the whole run so connections to the API host are reused.
    with requests.Session() as session:
        for _ in range(sample_size):
            title = fetch_random_title(session)
            print('Random page title: {}'.format(title))

            reason = pdf_failure_reason(session, title)
            if reason is None:
                valid += 1
                print('Valid pdf: {}'.format(title))
            else:
                invalid += 1
                print('Invalid pdf: {}'.format(title))
                print('  reason: {}'.format(reason))

    print('Summary: {} valid, {} invalid'.format(valid, invalid))
    return valid, invalid


if __name__ == '__main__':
    # Keep the counters available as module-level names, as in the original
    # interactive session.
    valid, invalid = main()

# ---------------------------------------------------------------------------
# Output recorded in the original interactive session (produced by the first
# version of the loop, which used a bare ``except:`` and printed no reason):
#
#   Random page title: Phare_de_Sainte-Marine
#   Valid pdf: Phare_de_Sainte-Marine
#   Random page title: Musées_royaux_des_Beaux-Arts_de_Belgique
#   Invalid pdf: Musées_royaux_des_Beaux-Arts_de_Belgique
#   Random page title: Argentine_(papillon)
#   Invalid pdf: Argentine_(papillon)
#   Random page title: Dampmart
#   Invalid pdf: Dampmart
#   Random page title: Canton_de_Romans-sur-Isère-1
#   Valid pdf: Canton_de_Romans-sur-Isère-1
#   Random page title: Béton_armé
#   Invalid pdf: Béton_armé
#   Random page title: Dobroudja
#   Invalid pdf: Dobroudja
#   Random page title: Saint-Maurice_(Nièvre)
#   Invalid pdf: Saint-Maurice_(Nièvre)
#   Random page title: Gare_de_Tangen
#   Valid pdf: Gare_de_Tangen
#   Random page title: Commune_de_Kohila
#   Invalid pdf: Commune_de_Kohila
#   Random page title: Basse-Ham
#   Invalid pdf: Basse-Ham
#   Random page title: Dennis_Reimer
#   Invalid pdf: Dennis_Reimer
#   Random page title: Sind
#   Invalid pdf: Sind
#   Random page title: Alliage
#   Invalid pdf: Alliage
#   Random page title: Columbia_Pictures
#   Invalid pdf: Columbia_Pictures
#   Random page title: RIPEMD-160
#   Invalid pdf: RIPEMD-160
#   Random page title: Université_de_Neuchâtel
#   Valid pdf: Université_de_Neuchâtel
#   Random page title: Livie
#   Invalid pdf: Livie
#   Random page title: Albrecht_Wilhelm_Roth
#   Invalid pdf: Albrecht_Wilhelm_Roth
#   Random page title: Shōgakukan
#   Invalid pdf: Shōgakukan
# ---------------------------------------------------------------------------