This commit is contained in:
Claudio Maggioni 2022-11-20 13:35:01 +01:00
parent 282d41194a
commit ce1c4f88f3
2 changed files with 350 additions and 0 deletions

68
hw02/Papers.ts Normal file
View File

@ -0,0 +1,68 @@
/** Identifier of a document, represented as a string. */
type ID = string;

/** Alias for `number`; the type system does not enforce non-negativity. */
type UInt = number;

// each interface is a mongodb collection

/** A titled part of a paper holding an ordered list of content items. */
type Section = {
  title: string;
  content: Content[];
};

/** One item inside a section's content list. */
type Content =
  | Section // nested sub-section
  | string // text
  | { reference: number } // bib reference
  | { label: string }; // figure reference
/**
 * A paper document.
 *
 * List-valued fields are declared as array types (`{...}[]`). The previous
 * `[{...}]` spelling is a TypeScript tuple of exactly one element, which
 * would reject papers with zero or several authors, figures or references.
 */
interface Paper {
  _id: ID,
  title: string,
  abstract: string,
  // attributes here reflect the data found when the paper was published
  authors: {
    affiliation: string, // university of blah
    email: string | null,
    name: string,
    authorId: ID
  }[],
  keywords: string[],
  publicationDetails: {
    journalRef: ID,
    journal: string, // "Journal of AI"
    volume: string, // "Spring 2020 edition"
    number: string, // reference for this paper in the volume
    date: Date,
    pages: { // pages in the volume where this paper is
      start: number,
      end: number
    } // [0, 1]
  },
  content: Section[],
  figures: {
    page: number, // page number where the figure is placed
    label: string,
    caption: string,
    imageURL?: string // some figures are tables, not in the scope of this assignment to model a table
  }[],
  references: {
    referenceNumber: number,
    paperId?: ID, // optional
    title: string,
    // list of author names
    authors: {
      name: string,
      authorId?: ID // optional
    }[],
    journal: string,
    journalId?: ID,
    volume: string,
    number: string
  }[]
}
/**
 * An author document.
 *
 * Author data here reflects the most up to date info on the author.
 */
interface Author {
  _id: ID;
  name: string;
  email: string;
  affiliation: string;
  bio: string;
}
/**
 * A journal document.
 *
 * `volumes` is an array type; the previous `[{...}]` spelling declared a
 * tuple of exactly one volume.
 */
interface Journal {
  _id: ID,
  name: string,
  volumes: {
    name: string,
    papers: ID[] // references to papers in volumes
  }[]
}

282
hw02/convert.py Normal file
View File

@ -0,0 +1,282 @@
import json
import sys
import random
import os
from faker import Faker
from pymongo import MongoClient
fake = Faker()
# pip install faker pymongo
# TODO: number on publicationDetails (random)
# TODO: figures (fake)
# TODO: inject (fake) figures and references in content
# TODO: references are a dictionary
# - article doi or JSON filename can be used as paper id
# - no author id, authors are equal based on name for lack of info
# - use issn as journal id
# - author affiliation Faked
# - journal name Faked
# - publication date Faked (year ignored)
# Pool of 500 made-up university names. Each one is "University of " followed
# by the third element of a Faker on-land location tuple (presumably the
# place name).
fake_unis = [
    "University of " + fake.location_on_land()[2]
    for _ in range(0, 500)
]
# Registry of every author created so far, keyed by author name.
authors = {}


def fake_author(name: str, email: "str | None") -> dict:
    """Return the author record for *name*, creating it on first use.

    Authors are identified by name only. The first call for a given name
    stores a record with the supplied email, a random affiliation drawn from
    ``fake_unis`` and a generated five-sentence bio; later calls return that
    same record and ignore the ``email`` argument.

    Args:
        name: Author name, used as the registry key.
        email: Email to store when the record is first created (may be None).

    Returns:
        The (shared, mutable) record stored in ``authors`` for this name.
    """
    if name not in authors:
        authors[name] = {
            "name": name,
            "email": email,
            "affiliation": random.choice(fake_unis),
            "bio": fake.paragraph(nb_sentences=5),
        }
    return authors[name]
# Registry of every journal created so far, keyed by ISSN.
journals = {}


def fake_journal(issn: str, volume: str, paper_doi: str) -> dict:
    """Record *paper_doi* under *volume* of the journal identified by *issn*.

    The journal is created on first use; its name is a Faker "bs" phrase with
    the first word dropped, capitalized. If the journal already has a volume
    with this name the paper is appended to it, otherwise a new volume is
    added.

    Args:
        issn: Journal identifier, used as the registry key.
        volume: Name of the volume the paper belongs to.
        paper_doi: Identifier of the paper to list in that volume.

    Returns:
        The (shared, mutable) record stored in ``journals`` for this ISSN.
    """
    journal = journals.get(issn)
    if journal is None:
        journal = journals[issn] = {
            "issn": issn,
            "name": " ".join(fake.bs().split(" ")[1:]).capitalize(),
            "volumes": [],
        }

    for existing in journal["volumes"]:
        if existing["name"] == volume:
            existing["papers"].append(paper_doi)
            break
    else:
        # No volume with this name yet (always the case for a new journal).
        journal["volumes"].append({
            "name": volume,
            "papers": [paper_doi],
        })
    return journal
def getProp(obj: dict, props: str, default=None):
    """Look up a dot-separated path in nested dictionaries.

    Args:
        obj: Root dictionary (may be None).
        props: Path such as ``"metadata.title"``.
        default: Value returned when the path cannot be followed.

    Returns:
        The value stored at the path, or *default* when a key is missing or
        an intermediate value is not a dictionary (None, a string, a list,
        ...). A value that is present but None is returned as None, not as
        *default*.
    """
    target = obj
    for prop in props.split("."):
        # Guard against non-dict intermediates: `in` on None raises and on a
        # str silently performs a substring test.
        if not isinstance(target, dict) or prop not in target:
            return default
        target = target[prop]
    return target
def setProp(obj: dict, props: str, toSave: object):
    """Store *toSave* at a dot-separated path, creating nested dicts as needed.

    ``setProp(d, "a.b", 1)`` leaves ``d == {"a": {"b": 1}}``.

    Args:
        obj: Root dictionary, modified in place.
        props: Path such as ``"publicationDetails.journalRef"``.
        toSave: Value to store at the last path segment.
    """
    *parents, leaf = props.split(".")
    target = obj
    # Only descend through the parent segments; the last segment is the key
    # that receives the value (descending into it too would nest the value
    # one level too deep under a duplicated key).
    for prop in parents:
        if prop not in target:
            target[prop] = {}
        target = target[prop]
    target[leaf] = toSave
def save_sentence(body: dict, parents: list[dict], sentence: str):
    """Append *sentence* to the section reached by following *parents*.

    Sections are stored as ``{"title", "content", "sections"}`` dictionaries
    nested under their parent's ``"sections"`` mapping, keyed by section id.
    Missing sections along the path are created with empty content.

    Args:
        body: Root node; must contain a ``"sections"`` dict. Modified in place.
        parents: Path from the outermost section to the target one; each
            entry needs an ``"id"`` and a ``"title"``.
        sentence: Text to append; a single trailing space is added after it.
    """
    target = body
    for parent in parents:
        children = target["sections"]
        if parent["id"] not in children:
            children[parent["id"]] = {
                "title": parent["title"],
                "content": "",
                "sections": {},
            }
        target = children[parent["id"]]
    # The root node has no "content" key until something is written to it
    # (which happens when `parents` is empty).
    target["content"] = target.get("content", "") + sentence + " "
def transform_section(sec: dict) -> "dict | list":
    """Convert a section tree built by ``save_sentence`` into its final shape.

    Child sections are visited in sorted key order. A node's content becomes
    a flat list: its own text first (omitted when empty or absent), followed
    by the transformed child sections.

    Args:
        sec: Node with a ``"sections"`` dict and optional ``"title"`` and
            ``"content"`` keys.

    Returns:
        ``{"title": ..., "content": [...]}`` for a titled node, or just the
        content list for an untitled node (the root).
    """
    children = [
        transform_section(sec["sections"][key])
        for key in sorted(sec["sections"])
    ]
    text = sec.get("content", "")
    # No trailing comma here: it would wrap the list in a one-element tuple.
    content = ([text] if text != "" else []) + children
    if "title" not in sec:
        return content
    return {
        "title": sec["title"],
        "content": content,
    }
def get_author_name(author: dict) -> str:
    """Return the author's display name as "first last".

    A missing or None ``"first"``/``"last"`` entry is treated as empty, and
    surrounding whitespace is stripped, so a lone first or last name is
    returned without a stray space.
    """
    first = author.get("first") or ""
    last = author.get("last") or ""
    return (first + " " + last).strip()
def json_to_paper(filename: str, jsonObj: dict) -> dict:
    """Build a paper document from one parsed source JSON file.

    Real data is read from ``jsonObj`` (title, abstract, DOI, authors,
    keywords, ISSN, volume, pages, bibliography, body sentences). Author
    affiliation, journal name and publication date are faked. As a side
    effect the authors and the journal are registered through
    ``fake_author`` and ``fake_journal``.

    Args:
        filename: Name of the source file (currently unused).
        jsonObj: Parsed JSON content; it is not modified.

    Returns:
        The paper document, with references keyed by bibliography entry key
        and without any cross-collection ids.
    """
    doi = getProp(jsonObj, "metadata.doi")
    paper = {}
    paper["title"] = getProp(jsonObj, "metadata.title")
    paper["abstract"] = getProp(jsonObj, "abstract")
    paper["doi"] = doi

    # Named paper_authors so the module-level `authors` registry is not
    # shadowed inside this function.
    paper_authors = []
    for raw_author in getProp(jsonObj, "metadata.authors", []) or []:
        email = getProp(raw_author, "email")
        record = fake_author(get_author_name(raw_author), email)
        # TODO: authorID
        paper_authors.append({
            "email": record["email"],
            "name": record["name"],
            "affiliation": record["affiliation"],
        })
    paper["authors"] = paper_authors
    paper["keywords"] = getProp(jsonObj, "metadata.keywords")

    publicationDetails = {}
    publicationDetails["issn"] = getProp(jsonObj, "metadata.issn")  # ISBN-like, not a name
    date = fake.date_object()
    volume = getProp(jsonObj, "metadata.volume")
    if volume is None:
        volume = str(date.year) + " Issue"
    journal = fake_journal(publicationDetails["issn"], volume, doi)
    publicationDetails["journal"] = journal["name"]
    publicationDetails["volume"] = volume
    publicationDetails["date"] = date.isoformat()
    publicationDetails["pages"] = {
        "start": getProp(jsonObj, "metadata.firstpage"),
        "end": getProp(jsonObj, "metadata.lastpage"),
    }
    paper["publicationDetails"] = publicationDetails

    references = {}
    for key, value in (getProp(jsonObj, "bib_entries", {}) or {}).items():
        if value is None:
            continue
        references[key] = {
            "doi": getProp(value, "doi"),
            "title": getProp(value, "title"),
            "authors": [
                {"name": get_author_name(ref_author)}
                for ref_author in getProp(value, "authors", []) or []
            ],
            "issn": getProp(value, "issn"),
            "volume": getProp(value, "volume"),
            "year": getProp(value, "year"),
        }
    paper["references"] = references

    body = {
        "sections": {}
    }
    # sorted() rather than list.sort(): leave the caller's JSON untouched.
    sentences = sorted(
        getProp(jsonObj, "body_text", []) or [],
        key=lambda entry: entry["startOffset"],
    )
    for entry in sentences:
        # Path to the sentence's section: its ancestors, then the section itself.
        parents = list(getProp(entry, "parents", []) or [])
        parents.append({"id": entry["secId"], "title": entry["title"]})
        save_sentence(body, parents, entry["sentence"])
    paper["content"] = transform_section(body)
    return paper
mongo_conn_str = "mongodb://localhost:27017"


def main(limit=10):
    """Import the JSON papers of a folder into the ``ddm`` MongoDB database.

    The source folder is taken from ``sys.argv[1]``. The ``papers``,
    ``authors`` and ``journals`` collections are dropped and rebuilt:

    1. every ``*.json`` file is converted with ``json_to_paper`` and inserted;
    2. the authors and journals collected during conversion are inserted;
    3. each paper is updated with the ids of its journal, its authors and,
       for every reference, the matching paper, authors and journal.

    Args:
        limit: Stop after this many papers have been imported; ``None``
            imports every file. Defaults to 10, the cap that was previously
            hard-coded (TODO: remove the cap).
    """
    if len(sys.argv) < 2:
        sys.exit("usage: convert.py <source_folder>")
    source_folder: str = sys.argv[1]

    mongo = MongoClient(mongo_conn_str)
    db = mongo["ddm"]
    db["papers"].drop()
    db["authors"].drop()
    db["journals"].drop()

    # Natural key (doi / author name / issn) -> inserted MongoDB _id.
    paper_ids: dict = {}
    author_ids: dict = {}
    journal_ids: dict = {}

    processed = 0
    for filename in os.listdir(source_folder):
        if not filename.endswith(".json"):
            continue
        path = os.path.join(source_folder, filename)
        with open(path, "r", encoding="utf-8") as jsonFile:
            jsonObj = json.load(jsonFile)
        paper = json_to_paper(filename, jsonObj)
        x = db["papers"].insert_one(paper)
        paper_ids[paper["doi"]] = x.inserted_id
        processed += 1
        if processed % 20 == 0:
            print("Papers processed: " + str(processed))
        if limit is not None and processed >= limit:  # TODO: remove
            break

    for name, author in authors.items():
        x = db["authors"].insert_one(author)
        author_ids[name] = x.inserted_id
    for issn, journal in journals.items():
        x = db["journals"].insert_one(journal)
        journal_ids[issn] = x.inserted_id

    for paper in db["papers"].find():
        mongo_filter = {"_id": paper["_id"]}
        # Keys use MongoDB dot notation so that $set only touches the named
        # fields; a nested document would replace the whole sub-document.
        # NOTE: this assumes reference keys contain no "." characters.
        update = {}
        mongo_update = {"$set": update}

        issn = getProp(paper, "publicationDetails.issn", "")
        if issn in journal_ids:
            update["publicationDetails.journalRef"] = journal_ids[issn]

        for key, ref in (getProp(paper, "references", {}) or {}).items():
            prefix = "references." + key
            # Skip null DOIs/ISSNs so that a reference lacking one is not
            # linked to whichever paper/journal was stored under None.
            doi = getProp(ref, "doi")
            if doi is not None and doi in paper_ids:
                update[prefix + ".paperId"] = paper_ids[doi]
            ref_authors = getProp(ref, "authors", []) or []
            for ref_author in ref_authors:
                # Look up each author by their own name.
                if ref_author["name"] in author_ids:
                    ref_author["authorId"] = author_ids[ref_author["name"]]
            update[prefix + ".authors"] = ref_authors
            ref_issn = getProp(ref, "issn")
            if ref_issn is not None and ref_issn in journal_ids:
                update[prefix + ".journalId"] = journal_ids[ref_issn]

        paper_authors = getProp(paper, "authors", []) or []
        for paper_author in paper_authors:
            if paper_author["name"] in author_ids:
                paper_author["authorId"] = author_ids[paper_author["name"]]
        update["authors"] = paper_authors

        print(mongo_update)
        db["papers"].update_one(mongo_filter, mongo_update)


if __name__ == "__main__":
    main()