This repository has been archived on 2022-12-21. You can view files and clone it, but cannot push or open issues or pull requests.
ddm/hw02/import.py

336 lines
9.9 KiB
Python

import json
import sys
import random
import os
from faker import Faker
from pymongo import MongoClient
fake = Faker()
# pip install faker pymongo
# - article doi or JSON filename can be used as paper id
# - no author id, authors are equal based on name for lack of info
# - use issn as journal id
# - author affiliation Faked
# - journal name Faked
# - publication date Faked (year ignored)
# Pre-generate a pool of 500 plausible university names from faked
# on-land map locations (index 2 of the tuple is the place name).
fake_unis = [
    "University of " + fake.location_on_land()[2] for _ in range(500)
]
# Cache of author documents keyed by full name: the source data has no
# author ids, so equal names are treated as the same person.
authors = {}


def fake_author(name: str, email: str) -> dict:
    """Return the author document for `name`, creating it on first sight.

    The real name/email are kept; affiliation and bio are synthesized
    with Faker. Subsequent calls with the same name return the cached
    document (the first email seen wins).
    """
    if name not in authors:
        authors[name] = {
            "name": name,
            "email": email,
            "affiliation": random.choice(fake_unis),
            "bio": fake.paragraph(nb_sentences=5),
        }
    return authors[name]
# Cache of journal documents keyed by ISSN; volumes accumulate paper DOIs.
journals = {}


def fake_journal(issn: str, volume: str, paper_doi: str) -> dict:
    """Register `paper_doi` under journal `issn` / `volume`; return the journal.

    The journal and the volume are created on first sight; the journal
    name is synthesized with Faker (ISSN is the only real journal datum).
    """
    if issn not in journals:
        journals[issn] = {
            "issn": issn,
            # Drop the leading verb of a faked "business phrase" to get a
            # noun-ish journal name.
            "name": " ".join(fake.bs().split(" ")[1:]).capitalize(),
            "volumes": [{
                "name": volume,
                "papers": [paper_doi],
            }],
        }
        return journals[issn]
    for vol in journals[issn]["volumes"]:
        if vol["name"] == volume:
            vol["papers"].append(paper_doi)
            break
    else:
        # No matching volume yet: open a new one for this paper.
        journals[issn]["volumes"].append({
            "name": volume,
            "papers": [paper_doi],
        })
    return journals[issn]
def getProp(obj: dict, props: str, default=None):
    """Resolve a dotted property path (e.g. "metadata.doi") against obj.

    Returns `default` when obj is None, when any path segment is missing,
    or when an intermediate value is not a dict (e.g. an explicit null in
    the source JSON) — the original `prop not in target` raised TypeError
    on None intermediates.
    """
    target = obj
    for prop in props.split("."):
        if not isinstance(target, dict) or prop not in target:
            return default
        target = target[prop]
    return target
def save_sentence(body: dict, parents: list, sentence: str) -> None:
    """Append `sentence` to the nested section of `body` addressed by `parents`.

    `body` has the shape {"sections": {id: {"title", "content", "sections"}}}.
    One level is descended per parent (each a dict with "id" and "title"),
    creating missing intermediate sections on demand. Sentences accumulate
    in "content" separated by single spaces.

    Note: the original annotation `[dict]` was a list literal, not a type.
    """
    target = body
    for parent in parents:
        sections = target["sections"]
        if parent["id"] not in sections:
            sections[parent["id"]] = {
                "title": parent["title"],
                "content": "",
                "sections": {},
            }
        target = sections[parent["id"]]
    target["content"] += sentence + " "
def transform_section(sec: dict, figures: list, references: list) -> dict:
    """Recursively flatten a section tree into {"title", "content"} documents.

    Child sections are emitted in sorted key order. With probability 1/5
    each, a faked figure label and a faked reference marker are injected
    into the section's content. The root pseudo-section (no "title") just
    returns the list of its transformed children.

    Note: the original annotations `[dict]` were list literals, not types.
    """
    children = [
        transform_section(sec["sections"][key], figures, references)
        for key in sorted(sec["sections"].keys())
    ]
    if "title" not in sec:
        # Root of the tree: no title, so just return the children list.
        return children
    content = []
    if random.randint(0, 4) == 0 and len(figures) > 0:
        content.append(
            {"label": figures[random.randint(0, len(figures) - 1)]["label"]}
        )
    if "content" in sec and sec["content"] != "":
        content.append(sec["content"])
    if random.randint(0, 4) == 0 and len(references) > 0:
        content.append({"reference": random.randint(1, len(references))})
    content += children
    # Drop a trailing empty child list left over from the recursion.
    if content and isinstance(content[-1], list) and not content[-1]:
        del content[-1]
    return {
        "title": sec["title"],
        "content": content,
    }
def get_author_name(author: dict) -> str:
    """Join an author's first and last names into one display name.

    Tolerates parts that are None, empty, or entirely missing (the
    original raised KeyError when a bib entry lacked "first"/"last").
    """
    first = author.get("first") or ""
    last = author.get("last") or ""
    return (first + " " + last).strip()
def json_to_paper(filename: str, jsonObj: dict) -> dict:
    """Convert one parsed article JSON object into a paper document.

    Real fields (title, abstract, DOI, authors, keywords, ISSN, pages,
    references, body text) come from jsonObj; the journal name, the
    publication date, figures and the issue number are synthesized with
    Faker/random (see the notes at the top of the file). `filename` is
    currently unused — the DOI serves as the paper id.
    """
    paper = {}
    paper["title"] = getProp(jsonObj, "metadata.title")
    paper["abstract"] = getProp(jsonObj, "abstract")
    paper["doi"] = getProp(jsonObj, "metadata.doi")
    authors = []
    for author in getProp(jsonObj, "metadata.authors", []):
        email = getProp(author, "email")
        # Deliberately rebinds `author` to the cached/faked author record
        # (shadows the loop variable).
        author = fake_author(get_author_name(author), email)
        authors.append({
            "email": author["email"],
            "name": author["name"],
            "affiliation": author["affiliation"]
        })
    paper["authors"] = authors
    paper["keywords"] = getProp(jsonObj, "metadata.keywords")
    publicationDetails = {}
    publicationDetails["issn"] = getProp(jsonObj, "metadata.issn")
    # The publication date is faked; its year doubles as a fallback volume
    # name when the source JSON does not carry one.
    date = fake.date_object()
    volume = getProp(jsonObj, "metadata.volume")
    if volume is None:
        volume = str(date.year) + " Issue"
    # Registers this paper's DOI under the journal/volume as a side effect.
    journal = fake_journal(
        publicationDetails["issn"],
        volume,
        getProp(jsonObj, "metadata.doi")
    )
    publicationDetails["journal"] = journal["name"]
    publicationDetails["volume"] = volume
    publicationDetails["number"] = random.randint(1, 99)
    publicationDetails["date"] = date.isoformat()
    publicationDetails["pages"] = {
        "start": getProp(jsonObj, "metadata.firstpage"),
        "end": getProp(jsonObj, "metadata.lastpage")
    }
    paper["publicationDetails"] = publicationDetails
    # Figures are entirely faked: 3-15 per paper, labeled fig0, fig1, ...
    figures = []
    for i in range(0, random.randint(3, 15)):
        figures.append({
            "page": random.randint(1, 10),
            "label": "fig" + str(i),
            "caption": fake.paragraph(nb_sentences=1),
            "imageURL": fake.image_url()
        })
    paper["figures"] = figures
    i = 0
    references = []
    for key, value in getProp(jsonObj, "bib_entries", {}).items():
        if value is None:
            continue
        # referenceNumber is 1-based and skips null bib entries.
        i += 1
        ref = {
            "referenceNumber": i,
            "doi": getProp(value, "doi"),
            "title": getProp(value, "title"),
            "authors": [],
            "issn": getProp(value, "issn"),
            "volume": getProp(value, "volume"),
            "year": getProp(value, "year")
        }
        for author in getProp(value, "authors", []):
            ref["authors"].append({ "name": get_author_name(author) })
        references.append(ref)
    paper["references"] = references
    # Rebuild the nested section tree from the flat body_text sentences,
    # ordered by their original character offsets, then flatten it into
    # the stored "content" representation.
    body = {
        "sections": {}
    }
    l = getProp(jsonObj, "body_text", [])
    l.sort(key=lambda x: x["startOffset"])
    for e in l:
        parents = []
        for p in getProp(e, "parents", []):
            parents.append(p)
        parents.append({ "id": e["secId"], "title": e["title"] })
        save_sentence(body, parents, e["sentence"])
    paper["content"] = transform_section(body, figures, references)
    return paper
mongo_conn_str = "mongodb://localhost:27017"


def main():
    """Import paper JSON files into MongoDB (papers/authors/journals).

    Usage: import.py <source_folder> [limit]

    Drops and repopulates the three collections from the JSON files in
    <source_folder>, then back-fills cross-collection ObjectId references
    (paper -> journal, reference -> paper/author/journal, volume -> paper).
    """
    source_folder: str = sys.argv[1]
    # Optional cap on the number of papers imported; -1 means "no limit".
    limit: int = int(sys.argv[2]) if len(sys.argv) > 2 else -1
    mongo = MongoClient(mongo_conn_str)
    db = mongo["ddm"]
    # Start from a clean slate on every run.
    db["papers"].drop()
    db["authors"].drop()
    db["journals"].drop()
    # Natural key -> inserted ObjectId maps, filled during the insert pass.
    # (The original annotated these with an undefined name `ID`.)
    paper_ids = {}
    author_ids = {}
    journal_ids = {}
    i = 0
    j = 0
    for filename in os.listdir(source_folder):
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(source_folder, filename), "r") as jsonFile:
            jsonObj = json.load(jsonFile)
        if getProp(jsonObj, "metadata.issn") is None or \
           getProp(jsonObj, "metadata.doi") is None:
            j += 1
            continue  # SKIP papers with no journal ISSN or paper DOI
        paper = json_to_paper(filename, jsonObj)
        x = db["papers"].insert_one(paper)
        paper_ids[paper["doi"]] = x.inserted_id
        i += 1
        if i % 100 == 0:
            print("Papers processed: ", i)
        if j % 100 == 0 and j > 0:
            print("Papers skipped: ", j)
        if limit > 0 and i == limit:
            break
    print("Papers skipped: ", j)
    i = 0
    for name, author in authors.items():
        x = db["authors"].insert_one(author)
        author_ids[name] = x.inserted_id
        i += 1
        if i % 1000 == 0:
            print("Authors processed: ", i)
    i = 0
    for issn, journal in journals.items():
        x = db["journals"].insert_one(journal)
        journal_ids[issn] = x.inserted_id
        i += 1
        if i % 100 == 0:
            print("Journals processed: ", i)
    # Second pass over papers: replace natural keys with ObjectId refs.
    i = 0
    for paper in db["papers"].find():
        mongo_filter = {"_id": paper["_id"]}
        update = {}
        mongo_update = {"$set": update}
        issn = getProp(paper, "publicationDetails.issn", "")
        if issn in journal_ids:
            update["publicationDetails.journalRef"] = journal_ids[issn]
        references = getProp(paper, "references", [])
        for ref in references:
            if ref["doi"] in paper_ids:
                ref["paperId"] = paper_ids[ref["doi"]]
            for author in ref["authors"]:
                name = author["name"]
                if name in author_ids:
                    author["authorId"] = author_ids[name]
            # BUGFIX: link the *reference's* journal; the original looked up
            # journal_ids[issn] (the citing paper's ISSN) after testing
            # ref["issn"], attaching the wrong journal id.
            if ref["issn"] in journal_ids:
                ref["journalId"] = journal_ids[ref["issn"]]
        update["references"] = references
        authors_loc = getProp(paper, "authors", [])
        for author in authors_loc:
            name = author["name"]
            if name in author_ids:
                author["authorId"] = author_ids[name]
        update["authors"] = authors_loc
        db["papers"].update_one(mongo_filter, mongo_update)
        i += 1
        if i % 100 == 0:
            print("Papers updated with refs: ", i)
    # Final pass over journals: swap stored paper DOIs for ObjectIds.
    i = 0
    for journal in db["journals"].find():
        mongo_filter = {"_id": journal["_id"]}
        update = {}
        mongo_update = {"$set": update}
        volumes = getProp(journal, "volumes", [])
        for volume in volumes:
            volume["papers"] = [paper_ids[p] for p in volume["papers"]]
        update["volumes"] = volumes
        db["journals"].update_one(mongo_filter, mongo_update)
        i += 1
        if i % 100 == 0:
            print("Journals updated with refs: ", i)


if __name__ == "__main__":
    main()