From ce1c4f88f381dc1b2240119d2477c1f7bf93afb7 Mon Sep 17 00:00:00 2001
From: Claudio Maggioni
Date: Sun, 20 Nov 2022 13:35:01 +0100
Subject: [PATCH] wip

---
 hw02/Papers.ts  |  68 ++++++++++++
 hw02/convert.py | 282 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 350 insertions(+)
 create mode 100644 hw02/Papers.ts
 create mode 100644 hw02/convert.py

diff --git a/hw02/Papers.ts b/hw02/Papers.ts
new file mode 100644
index 0000000..bbc8bdd
--- /dev/null
+++ b/hw02/Papers.ts
@@ -0,0 +1,68 @@
+type ID = string;
+type UInt = number;
+
+// each interface is a mongodb collection
+
+type Section = { title: string, content: Content[] }
+type Content = Section | string /* text */ | { reference: number } /* bib reference */ | { label: string } /* figure reference */
+
+interface Paper {
+    _id: ID,
+    title: string,
+    abstract: string,
+    authors: { // attributes here reflect the data found when the paper was published
+        affiliation: string, // university of blah
+        email: string | null,
+        name: string,
+        authorId: ID
+    }[],
+    keywords: string[],
+    publicationDetails: {
+        journalRef: ID,
+        journal: string, // "Journal of AI"
+        volume: string, // "Spring 2020 edition"
+        number: string, // reference for this paper in the volume
+        date: Date,
+        pages: { // pages in the volume where this paper is
+            start: number,
+            end: number
+        } // [0, 1]
+    },
+    content: Section[],
+    figures: {
+        page: number, // page number where the figure is placed
+        label: string,
+        caption: string,
+        imageURL?: string // some figures are tables; modeling tables is out of scope for this assignment
+    }[],
+    references: {
+        referenceNumber: number,
+        paperId?: ID, // optional
+        title: string,
+        authors: {
+            name: string,
+            authorId?: ID // optional
+        }[], // list of author names
+        journal: string,
+        journalId?: ID,
+        volume: string,
+        number: string
+    }[]
+}
+
+interface Author { // author data here reflects the most up-to-date info about the author
+    _id: ID,
+    name: string,
+    email: string,
+    affiliation: string,
+    bio: string
+}
+
+interface Journal {
+    _id: ID,
+    name: string,
+    volumes: {
+        name: string,
+        papers: ID[] // references to papers in this volume
+    }[]
+}
\ No newline at end of file
diff --git a/hw02/convert.py b/hw02/convert.py
new file mode 100644
index 0000000..3ec91dd
--- /dev/null
+++ b/hw02/convert.py
@@ -0,0 +1,282 @@
+import json
+import sys
+import random
+import os
+from bson import ObjectId
+from faker import Faker
+from pymongo import MongoClient
+
+fake = Faker()
+
+# pip install faker pymongo
+
+# TODO: number on publicationDetails (random)
+# TODO: figures (fake)
+# TODO: inject (fake) figures and references in content
+# TODO: references are a dictionary
+
+# - article doi or JSON filename can be used as paper id
+# - no author id in the source data, so authors are considered equal based on name
+# - use issn as journal id
+# - author affiliation Faked
+# - journal name Faked
+# - publication date Faked (year ignored)
+
+fake_unis = []
+for i in range(0, 500):
+    (_, _, loc, _, _) = fake.location_on_land()
+    fake_unis.append("University of " + loc)
+
+authors = {}
+def fake_author(name: str, email: str) -> dict:
+    if name not in authors:
+        authors[name] = {
+            "name": name,
+            "email": email,
+            "affiliation": random.choice(fake_unis),
+            "bio": fake.paragraph(nb_sentences=5)
+        }
+    return authors[name]
+
+journals = {}
+def fake_journal(issn: str, volume: str, paper_doi: str) -> dict:
+    if issn not in journals:
+        journals[issn] = {
+            "issn": issn,
+            "name": " ".join(fake.bs().split(" ")[1:]).capitalize(),
+            "volumes": [{
+                "name": volume,
+                "papers": [paper_doi]
+            }]
+        }
+    else:
+        not_found = True
+        for v in journals[issn]["volumes"]:
+            if v["name"] == volume:
+                not_found = False
+                v["papers"].append(paper_doi)
+                break
+        if not_found:
+            journals[issn]["volumes"].append({
+                "name": volume,
+                "papers": [paper_doi]
+            })
+
+    return journals[issn]
+
+def getProp(obj: dict, props: str, default = None):
+    target = obj
+    if target is None:
+        return default
+    for prop in props.split("."):
+        if prop not in target:
+            return default
+        target = target[prop]
+    return target
+
+def setProp(obj: dict, props: str, toSave):
+    # "obj" is always the payload of a "$set" update here: MongoDB
+    # accepts dot-notation paths as $set keys, so the dotted path is
+    # kept as a flat key instead of being expanded into a nested
+    # document (which $set would overwrite wholesale).
+    obj[props] = toSave
+
+def save_sentence(body: dict, parents: list[dict], sentence: str):
+    target = body
+    for p in parents:
+        if p["id"] not in target["sections"]:
+            target["sections"][p["id"]] = {
+                "title": p["title"],
+                "content": "",
+                "sections": {},
+            }
+        target = target["sections"][p["id"]]
+    target["content"] += sentence + " "
+
+def transform_section(sec: dict) -> dict:
+    arr = []
+    ks = []
+    for k in sec["sections"].keys():
+        ks.append(k)
+    ks.sort()
+    for k in ks:
+        arr.append(transform_section(sec["sections"][k]))
+
+    content = ([sec["content"]] if "content" in sec and sec["content"] != "" else []) + arr
+
+    if "title" not in sec:
+        return content
+
+    return {
+        "title": sec["title"],
+        "content": content
+    }
+
+def get_author_name(author: dict) -> str:
+    first = "" if author["first"] is None else author["first"]
+    last = "" if author["last"] is None else author["last"]
+    return (first + " " + last).strip()
+
+def json_to_paper(filename: str, jsonObj: dict) -> dict:
+    paper = {}
+
+    paper["title"] = getProp(jsonObj, "metadata.title")
+    paper["abstract"] = getProp(jsonObj, "abstract")
+    paper["doi"] = getProp(jsonObj, "metadata.doi")
+
+    paper_authors = []
+    for author in getProp(jsonObj, "metadata.authors", []):
+        email = getProp(author, "email")
+
+        author = fake_author(get_author_name(author), email)
+
+        # TODO: authorID
+        paper_authors.append({
+            "email": author["email"],
+            "name": author["name"],
+            "affiliation": author["affiliation"]
+        })
+
+    paper["authors"] = paper_authors
+    paper["keywords"] = getProp(jsonObj, "metadata.keywords")
+
+    publicationDetails = {}
+    publicationDetails["issn"] = getProp(jsonObj, "metadata.issn") # ISBN-like code, not a name
+
+    date = fake.date_object()
+    volume = getProp(jsonObj, "metadata.volume")
+    if volume is None:
+        volume = str(date.year) + " Issue"
+
+    journal = fake_journal(
+        publicationDetails["issn"],
+        volume,
+        getProp(jsonObj, "metadata.doi")
+    )
+
+    publicationDetails["journal"] = journal["name"]
+    publicationDetails["volume"] = volume
+
+    publicationDetails["date"] = date.isoformat()
+
+    publicationDetails["pages"] = {
+        "start": getProp(jsonObj, "metadata.firstpage"),
+        "end": getProp(jsonObj, "metadata.lastpage")
+    }
+    paper["publicationDetails"] = publicationDetails
+
+    references = {}
+    for key, value in getProp(jsonObj, "bib_entries", {}).items():
+        if value is None:
+            continue
+
+        references[key] = {
+            "doi": getProp(value, "doi"),
+            "title": getProp(value, "title"),
+            "authors": [],
+            "issn": getProp(value, "issn"),
+            "volume": getProp(value, "volume"),
+            "year": getProp(value, "year")
+        }
+
+        for author in getProp(value, "authors", []):
+            references[key]["authors"].append({ "name": get_author_name(author) })
+    paper["references"] = references
+
+    body = {
+        "sections": {}
+    }
+
+    l = getProp(jsonObj, "body_text", [])
+    l.sort(key=lambda x: x["startOffset"])
+    for e in l:
+        parents = []
+        for p in getProp(e, "parents", []):
+            parents.append(p)
+
+        parents.append({ "id": e["secId"], "title": e["title"] })
+        save_sentence(body, parents, e["sentence"])
+
+    paper["content"] = transform_section(body)
+    return paper
+
+
+mongo_conn_str = "mongodb://localhost:27017"
+def main():
+    source_folder: str = sys.argv[1]
+
+    mongo = MongoClient(mongo_conn_str)
+    db = mongo["ddm"]
+
+    db["papers"].drop()
+    db["authors"].drop()
+    db["journals"].drop()
+
+    paper_ids: dict[str, ObjectId] = {}
+    author_ids: dict[str, ObjectId] = {}
+    journal_ids: dict[str, ObjectId] = {}
+
+    i = 0
+    for filename in os.listdir(source_folder):
+        if filename.endswith(".json"):
+            jsonObj = {}
+            with open(os.path.join(source_folder, filename), 'r') as jsonFile:
+                jsonStr = jsonFile.read()
+                d = json.JSONDecoder()
+                jsonObj = d.decode(jsonStr)
+
+            paper = json_to_paper(filename, jsonObj)
+
+            x = db["papers"].insert_one(paper)
+            paper_ids[paper["doi"]] = x.inserted_id
+
+            i += 1
+            if i % 20 == 0:
+                print("Papers processed: " + str(i))
+            if i == 10: # TODO: remove
+                break
+
+    i = 0
+    for name, author in authors.items():
+        x = db["authors"].insert_one(author)
+        author_ids[name] = x.inserted_id
+
+    for issn, journal in journals.items():
+        x = db["journals"].insert_one(journal)
+        journal_ids[issn] = x.inserted_id
+
+    for paper in db["papers"].find():
+        mongo_filter = { "_id": paper["_id"] }
+        update = {}
+        mongo_update = { "$set": update }
+
+        issn = getProp(paper, "publicationDetails.issn", "")
+        if issn in journal_ids:
+            setProp(update, "publicationDetails.journalRef", journal_ids[issn])
+
+        for key, ref in getProp(paper, "references", {}).items():
+            if ref["doi"] in paper_ids:
+                setProp(update, "references." + key + ".paperId", paper_ids[ref["doi"]])
+
+            authors_loc = getProp(ref, "authors", [])
+            for author in authors_loc:
+                if author["name"] in author_ids:
+                    author["authorId"] = author_ids[author["name"]]
+            setProp(update, "references." + key + ".authors", authors_loc)
+
+            issn = getProp(ref, "issn", "")
+            if issn in journal_ids:
+                setProp(update, "references." + key + ".journalId", journal_ids[issn])
+
+        authors_loc = getProp(paper, "authors", [])
+        for author in authors_loc:
+            if author["name"] in author_ids:
+                author["authorId"] = author_ids[author["name"]]
+        setProp(update, "authors", authors_loc)
+
+        print(mongo_update)
+        db["papers"].update_one(mongo_filter, mongo_update)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file