From ce1c4f88f381dc1b2240119d2477c1f7bf93afb7 Mon Sep 17 00:00:00 2001
From: Claudio Maggioni
Date: Sun, 20 Nov 2022 13:35:01 +0100
Subject: [PATCH] wip

---
 hw02/Papers.ts  |  68 ++++++++++++
 hw02/convert.py | 282 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 350 insertions(+)
 create mode 100644 hw02/Papers.ts
 create mode 100644 hw02/convert.py

diff --git a/hw02/Papers.ts b/hw02/Papers.ts
new file mode 100644
index 0000000..bbc8bdd
--- /dev/null
+++ b/hw02/Papers.ts
@@ -0,0 +1,68 @@
+type ID = string;
+type UInt = number;
+
+// each interface is a mongodb collection
+
+type Section = { title: string, content: Content[] }
+type Content = Section | string /* text */ | { reference: number } /* bib reference */ | { label: string } /* figure reference */
+
+interface Paper {
+    _id: ID,
+    title: string,
+    abstract: string,
+    authors: { // attributes here reflect the data found when the paper was published
+        affiliation: string, // university of blah
+        email: string | null,
+        name: string,
+        authorId: ID
+    }[],
+    keywords: string[],
+    publicationDetails: {
+        journalRef: ID,
+        journal: string, // "Journal of AI"
+        volume: string, // "Spring 2020 edition"
+        number: string, // reference for this paper in the volume
+        date: Date,
+        pages: { // pages in the volume where this paper is
+            start: number,
+            end: number
+        } // [0, 1]
+    },
+    content: Section[],
+    figures: {
+        page: number, // page number where the figure is placed
+        label: string,
+        caption: string,
+        imageURL?: string // some figures are tables; modeling tables is out of scope for this assignment
+    }[],
+    references: {
+        referenceNumber: number,
+        paperId?: ID, // optional
+        title: string,
+        authors: {
+            name: string,
+            authorId?: ID // optional
+        }[], // list of author names
+        journal: string,
+        journalId?: ID,
+        volume: string,
+        number: string
+    }[]
+}
+
+interface Author { // author data here reflects the most up-to-date info about the author
+    _id: ID,
+    name: string,
+    email: string,
+    affiliation: string,
+    bio: string
+}
+
+interface Journal {
+    _id: ID,
+    name: string,
+    volumes: {
+        name: string,
+        papers: ID[] // references to papers in this volume
+    }[]
+}
\ No newline at end of file
diff --git a/hw02/convert.py b/hw02/convert.py
new file mode 100644
index 0000000..3ec91dd
--- /dev/null
+++ b/hw02/convert.py
@@ -0,0 +1,282 @@
+import json
+import sys
+import random
+import os
+from bson import ObjectId
+from faker import Faker
+from pymongo import MongoClient
+
+fake = Faker()
+
+# pip install faker pymongo
+
+# TODO: number on publicationDetails (random)
+# TODO: figures (fake)
+# TODO: inject (fake) figures and references in content
+# TODO: references are a dictionary
+
+# - article doi or JSON filename can be used as paper id
+# - no author id in the source data, so authors are considered equal based on name
+# - use issn as journal id
+# - author affiliation Faked
+# - journal name Faked
+# - publication date Faked (year ignored)
+
+fake_unis = []
+for i in range(0, 500):
+    (_, _, loc, _, _) = fake.location_on_land()
+    fake_unis.append("University of " + loc)
+
+authors = {}
+def fake_author(name: str, email: str) -> dict:
+    if name not in authors:
+        authors[name] = {
+            "name": name,
+            "email": email,
+            "affiliation": random.choice(fake_unis),
+            "bio": fake.paragraph(nb_sentences=5)
+        }
+    return authors[name]
+
+journals = {}
+def fake_journal(issn: str, volume: str, paper_doi: str) -> dict:
+    if issn not in journals:
+        journals[issn] = {
+            "issn": issn,
+            "name": " ".join(fake.bs().split(" ")[1:]).capitalize(),
+            "volumes": [{
+                "name": volume,
+                "papers": [paper_doi]
+            }]
+        }
+    else:
+        not_found = True
+        for v in journals[issn]["volumes"]:
+            if v["name"] == volume:
+                not_found = False
+                v["papers"].append(paper_doi)
+                break
+        if not_found:
+            journals[issn]["volumes"].append({
+                "name": volume,
+                "papers": [paper_doi]
+            })
+
+    return journals[issn]
+
+def getProp(obj: dict, props: str, default = None):
+    target = obj
+    if target is None:
+        return default
+    for prop in props.split("."):
+        if prop not in target:
+            return default
+        target = target[prop]
+    return target
+
+def setProp(obj: dict, props: str, toSave):
+    # "obj" is always the payload of a "$set" update here: MongoDB
+    # accepts dot-notation paths as $set keys, so the dotted path is
+    # kept as a flat key instead of being expanded into a nested
+    # document (which $set would overwrite wholesale).
+    obj[props] = toSave
+
+def save_sentence(body: dict, parents: list[dict], sentence: str):
+    target = body
+    for p in parents:
+        if p["id"] not in target["sections"]:
+            target["sections"][p["id"]] = {
+                "title": p["title"],
+                "content": "",
+                "sections": {},
+            }
+        target = target["sections"][p["id"]]
+    target["content"] += sentence + " "
+
+def transform_section(sec: dict) -> dict:
+    arr = []
+    ks = []
+    for k in sec["sections"].keys():
+        ks.append(k)
+    ks.sort()
+    for k in ks:
+        arr.append(transform_section(sec["sections"][k]))
+
+    content = ([sec["content"]] if "content" in sec and sec["content"] != "" else []) + arr
+
+    if "title" not in sec:
+        return content
+
+    return {
+        "title": sec["title"],
+        "content": content
+    }
+
+def get_author_name(author: dict) -> str:
+    first = "" if author["first"] is None else author["first"]
+    last = "" if author["last"] is None else author["last"]
+    return (first + " " + last).strip()
+
+def json_to_paper(filename: str, jsonObj: dict) -> dict:
+    paper = {}
+
+    paper["title"] = getProp(jsonObj, "metadata.title")
+    paper["abstract"] = getProp(jsonObj, "abstract")
+    paper["doi"] = getProp(jsonObj, "metadata.doi")
+
+    paper_authors = []
+    for author in getProp(jsonObj, "metadata.authors", []):
+        email = getProp(author, "email")
+
+        author = fake_author(get_author_name(author), email)
+
+        # TODO: authorID
+        paper_authors.append({
+            "email": author["email"],
+            "name": author["name"],
+            "affiliation": author["affiliation"]
+        })
+
+    paper["authors"] = paper_authors
+    paper["keywords"] = getProp(jsonObj, "metadata.keywords")
+
+    publicationDetails = {}
+    publicationDetails["issn"] = getProp(jsonObj, "metadata.issn") # ISBN-like code, not a name
+
+    date = fake.date_object()
+    volume = getProp(jsonObj, "metadata.volume")
+    if volume is None:
+        volume = str(date.year) + " Issue"
+
+    journal = fake_journal(
+        publicationDetails["issn"],
+        volume,
+        getProp(jsonObj, "metadata.doi")
+    )
+
+    publicationDetails["journal"] = journal["name"]
+    publicationDetails["volume"] = volume
+
+    publicationDetails["date"] = date.isoformat()
+
+    publicationDetails["pages"] = {
+        "start": getProp(jsonObj, "metadata.firstpage"),
+        "end": getProp(jsonObj, "metadata.lastpage")
+    }
+    paper["publicationDetails"] = publicationDetails
+
+    references = {}
+    for key, value in getProp(jsonObj, "bib_entries", {}).items():
+        if value is None:
+            continue
+
+        references[key] = {
+            "doi": getProp(value, "doi"),
+            "title": getProp(value, "title"),
+            "authors": [],
+            "issn": getProp(value, "issn"),
+            "volume": getProp(value, "volume"),
+            "year": getProp(value, "year")
+        }
+
+        for author in getProp(value, "authors", []):
+            references[key]["authors"].append({ "name": get_author_name(author) })
+    paper["references"] = references
+
+    body = {
+        "sections": {}
+    }
+
+    l = getProp(jsonObj, "body_text", [])
+    l.sort(key=lambda x: x["startOffset"])
+    for e in l:
+        parents = []
+        for p in getProp(e, "parents", []):
+            parents.append(p)
+
+        parents.append({ "id": e["secId"], "title": e["title"] })
+        save_sentence(body, parents, e["sentence"])
+
+    paper["content"] = transform_section(body)
+    return paper
+
+
+mongo_conn_str = "mongodb://localhost:27017"
+def main():
+    source_folder: str = sys.argv[1]
+
+    mongo = MongoClient(mongo_conn_str)
+    db = mongo["ddm"]
+
+    db["papers"].drop()
+    db["authors"].drop()
+    db["journals"].drop()
+
+    paper_ids: dict[str, ObjectId] = {}
+    author_ids: dict[str, ObjectId] = {}
+    journal_ids: dict[str, ObjectId] = {}
+
+    i = 0
+    for filename in os.listdir(source_folder):
+        if filename.endswith(".json"):
+            jsonObj = {}
+            with open(os.path.join(source_folder, filename), 'r') as jsonFile:
+                jsonStr = jsonFile.read()
+                d = json.JSONDecoder()
+                jsonObj = d.decode(jsonStr)
+
+            paper = json_to_paper(filename, jsonObj)
+
+            x = db["papers"].insert_one(paper)
+            paper_ids[paper["doi"]] = x.inserted_id
+
+            i += 1
+            if i % 20 == 0:
+                print("Papers processed: " + str(i))
+            if i == 10: # TODO: remove
+                break
+
+    i = 0
+    for name, author in authors.items():
+        x = db["authors"].insert_one(author)
+        author_ids[name] = x.inserted_id
+
+    for issn, journal in journals.items():
+        x = db["journals"].insert_one(journal)
+        journal_ids[issn] = x.inserted_id
+
+    for paper in db["papers"].find():
+        mongo_filter = { "_id": paper["_id"] }
+        update = {}
+        mongo_update = { "$set": update }
+
+        issn = getProp(paper, "publicationDetails.issn", "")
+        if issn in journal_ids:
+            setProp(update, "publicationDetails.journalRef", journal_ids[issn])
+
+        for key, ref in getProp(paper, "references", {}).items():
+            if ref["doi"] in paper_ids:
+                setProp(update, "references." + key + ".paperId", paper_ids[ref["doi"]])
+
+            authors_loc = getProp(ref, "authors", [])
+            for author in authors_loc:
+                if author["name"] in author_ids:
+                    author["authorId"] = author_ids[author["name"]]
+            setProp(update, "references." + key + ".authors", authors_loc)
+
+            issn = getProp(ref, "issn", "")
+            if issn in journal_ids:
+                setProp(update, "references." + key + ".journalId", journal_ids[issn])
+
+        authors_loc = getProp(paper, "authors", [])
+        for author in authors_loc:
+            if author["name"] in author_ids:
+                author["authorId"] = author_ids[author["name"]]
+        setProp(update, "authors", authors_loc)
+
+        print(mongo_update)
+        db["papers"].update_one(mongo_filter, mongo_update)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file