"""Spot-check whether pdfminer can parse PDFs rendered by the Wikipedia REST API.

For a number of random page titles, download the rendered PDF and try to
extract its text with pdfminer.  A PDF counts as valid when extraction
finishes without raising, and as invalid when it raises.  Requests that fail
at the HTTP level are counted separately so that an error response is never
mistaken for a broken PDF.
"""
import io
from urllib.parse import quote

import requests
from pdfminer.high_level import extract_text

# NOTE(review): titles are drawn from fr.wikipedia but PDFs are requested from
# en.wikipedia.  A title with no English page presumably yields an HTTP error
# instead of a PDF -- confirm which wiki is intended and align the two hosts.
RANDOM_TITLE_URL = 'https://fr.wikipedia.org/api/rest_v1/page/random/title'
PDF_URL_TEMPLATE = 'https://en.wikipedia.org/api/rest_v1/page/pdf/{}'

SAMPLE_COUNT = 20
REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled render hangs forever


def fetch_random_title():
    """Return the title of one random page.

    Raises:
        requests.RequestException: on network failure or a non-2xx status.
    """
    response = requests.get(RANDOM_TITLE_URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()['items'][0]['title']


def fetch_pdf(title):
    """Return the raw bytes of the rendered PDF for *title*.

    Raises:
        requests.RequestException: on network failure or a non-2xx status.
    """
    # Percent-encode the whole title (safe='') so that characters such as
    # '/' or '?' stay inside the single path segment.
    url = PDF_URL_TEMPLATE.format(quote(title, safe=''))
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content


def parse_error(pdf_bytes):
    """Return the exception raised while extracting text, or None on success."""
    try:
        extract_text(io.BytesIO(pdf_bytes))
    except Exception as exc:
        # Deliberately broad: any parser failure marks the PDF as invalid.
        # Unlike a bare ``except:``, Ctrl-C and SystemExit still propagate.
        return exc
    return None


def main(samples=SAMPLE_COUNT):
    """Check *samples* random pages and return ``(valid, invalid, skipped)``."""
    valid = 0
    invalid = 0
    skipped = 0
    for _ in range(samples):
        try:
            title = fetch_random_title()
        except requests.RequestException as exc:
            skipped += 1
            print('Skipped, could not fetch a random title: {}'.format(exc))
            continue
        print('Random page title: {}'.format(title))

        try:
            pdf_bytes = fetch_pdf(title)
        except requests.RequestException as exc:
            skipped += 1
            print('Skipped, could not fetch pdf: {} ({})'.format(title, exc))
            continue

        error = parse_error(pdf_bytes)
        if error is None:
            valid += 1
            print('Valid pdf: {}'.format(title))
        else:
            invalid += 1
            print('Invalid pdf: {}'.format(title))
            print('  reason: {}: {}'.format(type(error).__name__, error))
    return valid, invalid, skipped


if __name__ == '__main__':
    # Bound at module level so the counts can be inspected after ``%run``.
    valid, invalid, skipped = main()
    print('valid: {}'.format(valid))
    print('invalid: {}'.format(invalid))
    print('skipped: {}'.format(skipped))


# Output recorded from the original interactive session.  That version did not
# check the HTTP status, so any non-PDF response body it received would have
# been counted under "Invalid pdf".
#
# Random page title: Felipe_de_Jesús_Villanueva_Gutiérrez
# Valid pdf: Felipe_de_Jesús_Villanueva_Gutiérrez
# Random page title: Lycée_François-Ier_(Le_Havre)
# Invalid pdf: Lycée_François-Ier_(Le_Havre)
# Random page title: John_LeKay
# Invalid pdf: John_LeKay
# Random page title: Grigny_(métropole_de_Lyon)
# Invalid pdf: Grigny_(métropole_de_Lyon)
# Random page title: Hulk
# Valid pdf: Hulk
# Random page title: Bethoncourt
# Valid pdf: Bethoncourt
# Random page title: Volta_Limburg_Classic
# Valid pdf: Volta_Limburg_Classic
# Random page title: Erna_Hennicot-Schoepges
# Valid pdf: Erna_Hennicot-Schoepges
# Random page title: Pothières
# Valid pdf: Pothières
# Random page title: Fabrice_Loiseau
# Invalid pdf: Fabrice_Loiseau
# Random page title: Ligne_3_Scarborough
# Invalid pdf: Ligne_3_Scarborough
# Random page title: Tarbes_Gespe_Bigorre
# Valid pdf: Tarbes_Gespe_Bigorre
# Random page title: Lustre_(ameublement)
# Invalid pdf: Lustre_(ameublement)
# Random page title: Julius_Robert_von_Mayer
# Valid pdf: Julius_Robert_von_Mayer
# Random page title: Bombardier_(avion)
# Invalid pdf: Bombardier_(avion)
# Random page title: Courléon
# Valid pdf: Courléon
# Random page title: Montreuil-Juigné
# Valid pdf: Montreuil-Juigné
# Random page title: Bibliothèque_universitaire_des_langues_et_civilisations
# Valid pdf: Bibliothèque_universitaire_des_langues_et_civilisations
# Random page title: Florent_III_de_Hollande
# Invalid pdf: Florent_III_de_Hollande
# Random page title: Orenbourg
# Invalid pdf: Orenbourg
#
# valid   -> 11
# invalid -> 9