2022-11-20 12:35:01 +00:00
|
|
|
import json
|
|
|
|
import sys
|
|
|
|
import random
|
|
|
|
import os
|
|
|
|
from faker import Faker
|
|
|
|
from pymongo import MongoClient
|
|
|
|
|
|
|
|
fake = Faker()

# pip install faker pymongo

# TODO: figures (fake)
# TODO: inject (fake) figures and references in content

# - article doi or JSON filename can be used as paper id
# - no author id, authors are equal based on name for lack of info
# - use issn as journal id
# - author affiliation Faked
# - journal name Faked
# - publication date Faked (year ignored)

# Pool of 500 fake university names; location_on_land() returns a
# 5-tuple whose third element is the place name.
fake_unis = ["University of " + fake.location_on_land()[2] for _ in range(500)]
|
|
|
|
|
|
|
|
# Registry of every author seen so far, keyed by name (the source data
# has no author ids, so identical names are treated as one author).
authors = {}


def fake_author(name: str, email: str) -> dict:
    """Return the author record for *name*, creating it on first sight.

    Affiliation and bio are faked; records are cached in the module-level
    ``authors`` registry so repeated names share a single entry.
    (Fixed: the original annotated the return type as ``str`` although a
    dict is returned.)
    """
    if name not in authors:
        authors[name] = {
            "name": name,
            "email": email,
            # random.choice is the idiomatic uniform pick from a sequence
            "affiliation": random.choice(fake_unis),
            "bio": fake.paragraph(nb_sentences=5),
        }
    return authors[name]
|
|
|
|
|
|
|
|
# Registry of every journal seen so far, keyed by ISSN.
journals = {}


def fake_journal(issn: str, volume: str, paper_doi: str) -> dict:
    """Return the journal record for *issn*, filing *paper_doi* under *volume*.

    Journal and volume entries are created on demand; the journal name is
    faked from a Faker "bs" phrase. (Fixed: the original annotated the
    return type as ``str`` although a dict is returned.)
    """
    if issn not in journals:
        journals[issn] = {
            "issn": issn,
            # drop the first word of the faked phrase to get a name
            "name": " ".join(fake.bs().split(" ")[1:]).capitalize(),
            "volumes": [{
                "name": volume,
                "papers": [paper_doi],
            }],
        }
    else:
        # for-else replaces the original not_found flag: the else branch
        # runs only when no existing volume matched.
        for vol in journals[issn]["volumes"]:
            if vol["name"] == volume:
                vol["papers"].append(paper_doi)
                break
        else:
            journals[issn]["volumes"].append({
                "name": volume,
                "papers": [paper_doi],
            })

    return journals[issn]
|
|
|
|
|
|
|
|
def getProp(obj: dict, props: str, default=None):
    """Safely resolve a dotted path *props* (e.g. "metadata.doi") in *obj*.

    Returns *default* when *obj* is None, when any path segment is
    missing, or when an intermediate value is not a dict — the original
    raised TypeError in that last case (``"b" in 1``).
    """
    target = obj
    if target is None:
        return default
    for prop in props.split("."):
        # guard non-dict intermediates before the membership test
        if not isinstance(target, dict) or prop not in target:
            return default
        target = target[prop]
    return target
|
|
|
|
|
|
|
|
def save_sentence(body: dict, parents: list[dict], sentence: str) -> None:
    """Append *sentence* to the section of *body* addressed by *parents*.

    *parents* is an ordered chain of ``{"id", "title"}`` dicts from
    outermost to innermost section; missing intermediate sections are
    created on the way down. (Fixed: the original annotated *parents* as
    ``[dict]`` — a list literal, not a valid type annotation.)
    """
    target = body
    for p in parents:
        if p["id"] not in target["sections"]:
            target["sections"][p["id"]] = {
                "title": p["title"],
                "content": "",
                "sections": {},
            }
        target = target["sections"][p["id"]]
    # trailing space keeps consecutive sentences separated
    target["content"] += sentence + " "
|
|
|
|
|
|
|
|
def transform_section(sec: dict):
    """Convert a section tree into nested content lists.

    Subsections are emitted in sorted key order. The root section (no
    "title") yields a plain list; titled sections yield a
    ``{"title", "content"}`` dict.
    """
    subsections = [transform_section(sec["sections"][k])
                   for k in sorted(sec["sections"].keys())]

    # BUG FIX: the original assignment ended with a trailing comma,
    # which made `content` a 1-tuple wrapping the list.
    own = [sec["content"]] if "content" in sec and sec["content"] != "" else []
    content = own + subsections

    if "title" not in sec:
        return content

    return {
        "title": sec["title"],
        "content": content,
    }
|
|
|
|
|
|
|
|
def get_author_name(author: dict) -> str:
    """Build "First Last" from an author record, tolerating missing parts.

    Uses .get() so an absent "first"/"last" key behaves like an explicit
    None (the original raised KeyError when a key was missing).
    """
    first = author.get("first") or ""
    last = author.get("last") or ""
    return (first + " " + last).strip()
|
|
|
|
|
|
|
|
def json_to_paper(filename: str, jsonObj: dict) -> dict:
    """Transform a parsed source-article JSON object into a paper document.

    Missing metadata (affiliations, journal name, dates, figures) is
    faked; authors and journals are registered in the module-level
    caches as a side effect. *filename* is currently unused but kept as
    a future paper-id candidate.
    """
    paper = {}

    paper["title"] = getProp(jsonObj, "metadata.title")
    paper["abstract"] = getProp(jsonObj, "abstract")
    paper["doi"] = getProp(jsonObj, "metadata.doi")

    paper_authors = []
    for author in getProp(jsonObj, "metadata.authors", []):
        email = getProp(author, "email")

        record = fake_author(get_author_name(author), email)

        # TODO: authorID
        paper_authors.append({
            "email": record["email"],
            "name": record["name"],
            "affiliation": record["affiliation"],
        })

    paper["authors"] = paper_authors
    paper["keywords"] = getProp(jsonObj, "metadata.keywords")

    publicationDetails = {}
    publicationDetails["issn"] = getProp(jsonObj, "metadata.issn")  # ISBN-like, not a name

    date = fake.date_object()
    volume = getProp(jsonObj, "metadata.volume")
    if volume is None:
        # fall back to a synthetic volume name from the faked date
        volume = str(date.year) + " Issue"

    journal = fake_journal(
        publicationDetails["issn"],
        volume,
        getProp(jsonObj, "metadata.doi"),
    )

    publicationDetails["journal"] = journal["name"]
    publicationDetails["volume"] = volume
    publicationDetails["number"] = random.randint(1, 99)
    publicationDetails["date"] = date.isoformat()
    publicationDetails["pages"] = {
        "start": getProp(jsonObj, "metadata.firstpage"),
        "end": getProp(jsonObj, "metadata.lastpage"),
    }
    paper["publicationDetails"] = publicationDetails

    figures = []
    for i in range(random.randint(3, 15)):
        # BUG FIX: the original called figures.push(), which does not
        # exist on Python lists (AttributeError); append() is correct.
        figures.append({
            "page": random.randint(1, 10),
            "label": "fig" + str(i),
            "caption": fake.paragraph(nb_sentences=1),
            "imageURL": fake.image_url(),
        })
    paper["figures"] = figures

    references = []
    # enumerate replaces the original hand-maintained counter; numbering
    # still advances on skipped (None) entries, as before.
    for i, (key, value) in enumerate(getProp(jsonObj, "bib_entries", {}).items(), start=1):
        if value is None:
            continue

        ref = {
            "referenceNumber": i,
            "doi": getProp(value, "doi"),
            "title": getProp(value, "title"),
            "authors": [],
            "issn": getProp(value, "issn"),
            "volume": getProp(value, "volume"),
            "year": getProp(value, "year"),
        }

        for author in getProp(value, "authors", []):
            ref["authors"].append({"name": get_author_name(author)})

        references.append(ref)

    paper["references"] = references

    body = {
        "sections": {}
    }

    sentences = getProp(jsonObj, "body_text", [])
    sentences.sort(key=lambda x: x["startOffset"])
    for e in sentences:
        # parent chain (outermost first), then the sentence's own section
        parents = list(getProp(e, "parents", []))
        parents.append({"id": e["secId"], "title": e["title"]})
        save_sentence(body, parents, e["sentence"])

    paper["content"] = transform_section(body)
    return paper
|
|
|
|
|
|
|
|
|
|
|
|
# MongoDB connection string (local development instance).
mongo_conn_str = "mongodb://localhost:27017"


def main():
    """Load article JSON files from sys.argv[1] into MongoDB.

    Drops and repopulates the papers/authors/journals collections, then
    performs a second pass to cross-link papers, authors and journals by
    the ObjectIds assigned on insert.
    """
    source_folder: str = sys.argv[1]

    mongo = MongoClient(mongo_conn_str)
    db = mongo["ddm"]

    # start from a clean database on every run
    db["papers"].drop()
    db["authors"].drop()
    db["journals"].drop()

    # natural key (doi / name / issn) -> ObjectId of the inserted doc.
    # (Fixed: the original annotated these as dict[str, ID] with an
    # undefined name ID.)
    paper_ids = {}
    author_ids = {}
    journal_ids = {}

    i = 0
    for filename in os.listdir(source_folder):
        if filename.endswith(".json"):
            # json.load replaces the original manual readlines/JSONDecoder
            with open(source_folder + "/" + filename, 'r') as jsonFile:
                jsonObj = json.load(jsonFile)

            paper = json_to_paper(filename, jsonObj)

            x = db["papers"].insert_one(paper)
            paper_ids[paper["doi"]] = x.inserted_id

            i += 1
            if i % 100 == 0:
                print("Papers processed: ", i)
            if i == 1000:  # TODO: remove
                break

    i = 0
    for name, author in authors.items():
        x = db["authors"].insert_one(author)
        author_ids[name] = x.inserted_id
        i += 1
        if i % 1000 == 0:
            print("Authors processed: ", i)

    i = 0
    for issn, journal in journals.items():
        x = db["journals"].insert_one(journal)
        journal_ids[issn] = x.inserted_id
        i += 1
        if i % 100 == 0:
            print("Journals processed: ", i)

    # second pass over papers: resolve journal, reference and author ids
    i = 0
    for paper in db["papers"].find():
        mongo_filter = {"_id": paper["_id"]}
        update = {}
        mongo_update = {"$set": update}

        issn = getProp(paper, "publicationDetails.issn", "")
        if issn in journal_ids:
            update["publicationDetails.journalRef"] = journal_ids[issn]

        references = getProp(paper, "references", [])
        for ref in references:
            if ref["doi"] in paper_ids:
                ref["paperId"] = paper_ids[ref["doi"]]

            for author in ref["authors"]:
                name = author["name"]
                if name in author_ids:
                    author["authorId"] = author_ids[name]

            # BUG FIX: the original tested ref["issn"] but then looked
            # up journal_ids[issn] — the containing paper's ISSN — so
            # every reference pointed at the wrong journal.
            if ref["issn"] in journal_ids:
                ref["journalId"] = journal_ids[ref["issn"]]
        update["references"] = references

        authors_loc = getProp(paper, "authors", [])
        for author in authors_loc:
            name = author["name"]
            if name in author_ids:
                author["authorId"] = author_ids[name]
        update["authors"] = authors_loc

        db["papers"].update_one(mongo_filter, mongo_update)

        i += 1
        if i % 100 == 0:
            print("Papers updated with refs: ", i)

    # second pass over journals: replace paper DOIs with ObjectIds
    i = 0
    for journal in db["journals"].find():
        mongo_filter = {"_id": journal["_id"]}
        update = {}
        mongo_update = {"$set": update}

        volumes = getProp(journal, "volumes", [])
        for volume in volumes:
            volume["papers"] = [paper_ids[p] for p in volume["papers"]]
        update["volumes"] = volumes

        db["journals"].update_one(mongo_filter, mongo_update)
        i += 1
        if i % 100 == 0:
            print("Journals updated with refs: ", i)


if __name__ == "__main__":
    main()
|