from dataknead import Knead
from pathlib import Path
import json
import xmltodict
import re

KEYS = [
    "dc:description",
    "dc:date",
    "dc:identifier",
    "dcterms:spatial",
    "dc:creator",
    "europeana:isShownAt",
    "europeana:type",
    "europeana:rights",
    "europeana:isShownBy"
]

BATCH_SIZE = 500

def load_xml(path):
    # Parse an OAI-PMH XML file into nested dicts
    with open(path) as f:
        return xmltodict.parse(f.read())

def parse(d):
    ret = {
        "identifier" : d["header"]["identifier"]
    }

    for key in KEYS:
        ret[key] = d["metadata"]["europeana:record"].get(key, None)

        # Sometimes items have multiple descriptions...
        if isinstance(ret[key], list):
            # Filter out items with value None
            ret[key] = [v for v in ret[key] if v is not None]
            # Join elements with a space
            ret[key] = ' '.join(ret[key])

        # Clean up descriptions
        if key == "dc:description" and ret[key] is not None:
            # Remove all line endings
            ret[key] = ret[key].replace('\n', ' ').replace('\r', '')
            # Collapse runs of spaces into a single space
            ret[key] = re.sub(' +', ' ', ret[key])

    return ret

def write_json(path, data):
    # Helper for dumping intermediate results as pretty-printed JSON
    with open(path, "w") as f:
        f.write(json.dumps(data, indent = 4))

def main():
    results = []

    # sorted() makes the processing (and hence the chunk) order deterministic
    for path in sorted(Path(".").glob("download_data/*.xml")):
        data = load_xml(path)
        records = data["OAI-PMH"]["ListRecords"]["record"]

        # xmltodict returns a single dict instead of a list when a file
        # contains exactly one <record>, so normalize to a list first
        if not isinstance(records, list):
            records = [records]

        results.extend(parse(r) for r in records)

    # Split the results into chunks of at most BATCH_SIZE records each
    chunks = [
        results[i:i + BATCH_SIZE] for i in range(0, len(results), BATCH_SIZE)
    ]

    for index, chunk in enumerate(chunks):
        Knead(chunk).write(f"results-{index:05d}.csv")

if __name__ == "__main__":
    main()
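
# For reference, a sketch of the record shape that parse() expects after
# xmltodict has converted an OAI-PMH response. The values below are
# hypothetical; the exact fields present depend on which Europeana set
# was harvested:
#
# {
#     "header": {
#         "identifier": "oai:europeana.eu/some-record-id"  # hypothetical id
#     },
#     "metadata": {
#         "europeana:record": {
#             "dc:description": "A description (may also be a list of strings)",
#             "dc:creator": "Some creator",
#             "europeana:isShownAt": "https://example.org/item"
#         }
#     }
# }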