from dataknead import Knead
from pathlib import Path
import json
import xmltodict
import re

KEYS = [
    "dc:description",
    "dc:date",
    "dc:identifier",
    "dcterms:spatial",
    "dc:creator",
    "europeana:isShownAt",
    "europeana:type",
    "europeana:rights",
    "europeana:isShownBy"
]

BATCH_SIZE = 500

def load_xml(path):
    # Parse an OAI-PMH XML file into nested dicts
    with open(path) as f:
        return xmltodict.parse(f.read())

def parse(d):
    ret = {
        "identifier" : d["header"]["identifier"]
    }

    for key in KEYS:
        ret[key] = d["metadata"]["europeana:record"].get(key, None)

        # Sometimes items have multiple descriptions...
        if isinstance(ret[key], list):
            # Filter out items with value None
            ret[key] = [v for v in ret[key] if v is not None]
            # Join elements with a space
            ret[key] = ' '.join(ret[key])

        # Clean up descriptions
        if key == "dc:description" and ret[key] is not None:
            # Remove all line endings
            ret[key] = ret[key].replace('\n', ' ').replace('\r', '')
            # Collapse runs of spaces into a single space
            ret[key] = re.sub(' +', ' ', ret[key])

    return ret

def write_json(path, data):
    # Helper for dumping intermediate results as pretty-printed JSON
    with open(path, "w") as f:
        f.write(json.dumps(data, indent = 4))

def main():
    results = []

    # sorted() makes the processing (and hence the chunk) order deterministic
    for path in sorted(Path(".").glob("download_data/*.xml")):
        data = load_xml(path)
        records = data["OAI-PMH"]["ListRecords"]["record"]

        # xmltodict returns a single dict instead of a list when a file
        # contains exactly one <record>, so normalize to a list first
        if not isinstance(records, list):
            records = [records]

        results.extend(parse(r) for r in records)

    # Split the results into chunks of at most BATCH_SIZE records each
    chunks = [
        results[i:i + BATCH_SIZE] for i in range(0, len(results), BATCH_SIZE)
    ]

    for index, chunk in enumerate(chunks):
        Knead(chunk).write(f"results-{index:05d}.csv")

if __name__ == "__main__":
    main()
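
# For reference, a sketch of the record shape that parse() expects after
# xmltodict has converted an OAI-PMH response. The values below are
# hypothetical; the exact fields present depend on which Europeana set
# was harvested:
#
# {
#     "header": {
#         "identifier": "oai:europeana.eu/some-record-id"  # hypothetical id
#     },
#     "metadata": {
#         "europeana:record": {
#             "dc:description": "A description (may also be a list of strings)",
#             "dc:creator": "Some creator",
#             "europeana:isShownAt": "https://example.org/item"
#         }
#     }
# }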