This commit is contained in:
Claudio Maggioni 2022-11-20 14:16:27 +01:00
parent ce1c4f88f3
commit 4cc72067e4

View file

@ -9,10 +9,8 @@ fake = Faker()
# pip install faker pymongo # pip install faker pymongo
# TODO: number on publicationDetails (random)
# TODO: figures (fake) # TODO: figures (fake)
# TODO: inject (fake) figures and references in content # TODO: inject (fake) figures and references in content
# TODO: references are a dictionary
# - article doi or JSON filename can be used as paper id # - article doi or JSON filename can be used as paper id
# - no author id, authors are equal based on name for lack of info # - no author id, authors are equal based on name for lack of info
@ -73,14 +71,6 @@ def getProp(obj: dict, props: str, default = None):
target = target[prop] target = target[prop]
return target return target
def setProp(obj: dict, props: str, toSave: object) -> None:
    """Store ``toSave`` in ``obj`` at the dot-separated key path ``props``.

    Missing intermediate dictionaries are created on the way down, so
    ``setProp(d, "a.b", 1)`` on an empty ``d`` leaves ``d == {"a": {"b": 1}}``.
    An existing value at the final key is overwritten.

    Args:
        obj: Dictionary that is modified in place.
        props: Dot-separated key path, e.g. ``"publicationDetails.journalRef"``.
        toSave: Value to store under the last key of the path.
    """
    # Split the path into the containers to walk through and the key to assign.
    *parents, leaf = props.split(".")

    target = obj
    for prop in parents:
        # Reuse an existing nested dict or create an empty one for this level.
        target = target.setdefault(prop, {})

    # Assign on the parent of the last key rather than descending into it, so
    # the value is not stored one level too deep.
    target[leaf] = toSave
def save_sentence(body: dict, parents: [dict], sentence: str): def save_sentence(body: dict, parents: [dict], sentence: str):
target = body target = body
for p in parents: for p in parents:
@ -156,21 +146,33 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
publicationDetails["journal"] = journal["name"] publicationDetails["journal"] = journal["name"]
publicationDetails["volume"] = volume publicationDetails["volume"] = volume
publicationDetails["number"] = random.randint(1, 99)
publicationDetails["date"] = date.isoformat() publicationDetails["date"] = date.isoformat()
publicationDetails["pages"] = { publicationDetails["pages"] = {
"start": getProp(jsonObj, "metadata.firstpage"), "start": getProp(jsonObj, "metadata.firstpage"),
"end": getProp(jsonObj, "metadata.lastpage") "end": getProp(jsonObj, "metadata.lastpage")
} }
paper["publicationDetails"] = publicationDetails paper["publicationDetails"] = publicationDetails
references = {} figures = []
for i in range(0, random.randint(3, 15)):
figures.push({
"page": random.randint(1, 10),
"label": "fig" + str(i),
"caption": fake.paragraph(nb_sentences=1),
"imageURL": fake.image_url()
})
paper["figures"] = figures
i = 0
references = []
for key, value in getProp(jsonObj, "bib_entries", {}).items(): for key, value in getProp(jsonObj, "bib_entries", {}).items():
i += 1
if value is None: if value is None:
continue continue
references[key] = { ref = {
"referenceNumber": i,
"doi": getProp(value, "doi"), "doi": getProp(value, "doi"),
"title": getProp(value, "title"), "title": getProp(value, "title"),
"authors": [], "authors": [],
@ -180,7 +182,8 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
} }
for author in getProp(value, "authors", []): for author in getProp(value, "authors", []):
references[key]["authors"].append({ "name": get_author_name(author) }) ref["authors"].append({ "name": get_author_name(author) })
references.append(ref)
paper["references"] = references paper["references"] = references
body = { body = {
@ -231,20 +234,28 @@ def main():
paper_ids[paper["doi"]] = x.inserted_id paper_ids[paper["doi"]] = x.inserted_id
i += 1 i += 1
if i % 20 == 0: if i % 100 == 0:
print("Papers processed: " + i) print("Papers processed: ", i)
if i == 10: # TODO: remove if i == 1000: # TODO: remove
break break
i = 0 i = 0
for name, author in authors.items(): for name, author in authors.items():
x = db["authors"].insert_one(author) x = db["authors"].insert_one(author)
author_ids[name] = x.inserted_id author_ids[name] = x.inserted_id
i += 1
if i % 1000 == 0:
print("Authors processed: ", i)
i = 0
for issn, journal in journals.items(): for issn, journal in journals.items():
x = db["journals"].insert_one(journal) x = db["journals"].insert_one(journal)
journal_ids[issn] = x.inserted_id journal_ids[issn] = x.inserted_id
i += 1
if i % 100 == 0:
print("Journals processed: ", i)
i = 0
for paper in db["papers"].find(): for paper in db["papers"].find():
mongo_filter = { "_id": paper["_id"] } mongo_filter = { "_id": paper["_id"] }
update = {} update = {}
@ -252,30 +263,54 @@ def main():
issn = getProp(paper, "publicationDetails.issn", "") issn = getProp(paper, "publicationDetails.issn", "")
if issn in journal_ids: if issn in journal_ids:
setProp(update, "publicationDetails.journalRef", journal_ids[issn]) update["publicationDetails.journalRef"] = journal_ids[issn]
for key, ref in getProp(paper, "references", {}).items(): references = getProp(paper, "references", [])
for ref in references:
if ref["doi"] in paper_ids: if ref["doi"] in paper_ids:
setProp(update, "references." + key + ".paperId", paper_ids[ref["doi"]]) ref["paperId"] = paper_ids[ref["doi"]]
authors_loc = getProp(ref, "authors", []) for author in ref["authors"]:
for author in authors_loc: name = author["name"]
if name in author_ids: if name in author_ids:
author["authorId"] = author_ids[name] author["authorId"] = author_ids[name]
setProp(update, "references." + key + ".authors", authors_loc)
issn = getProp(ref, "issn", "") if ref["issn"] in journal_ids:
if issn in journal_ids: ref["journalId"] = journal_ids[issn]
setProp(update, "references." + key + ".journalId", journal_ids[issn]) update["references"] = references
authors_loc = getProp(paper, "authors", []) authors_loc = getProp(paper, "authors", [])
for author in authors_loc: for author in authors_loc:
name = author["name"]
if name in author_ids: if name in author_ids:
author["authorId"] = author_ids[name] author["authorId"] = author_ids[name]
setProp(update, "authors", authors_loc) update["authors"] = authors_loc
print(mongo_update)
db["papers"].update_one(mongo_filter, mongo_update) db["papers"].update_one(mongo_filter, mongo_update)
i += 1
if i % 100 == 0:
print("Papers updated with refs: ", i)
i = 0
for journal in db["journals"].find():
mongo_filter = { "_id": journal["_id"] }
update = {}
mongo_update = { "$set": update }
volumes = getProp(journal, "volumes", [])
for volume in volumes:
v_papers = []
for p in volume["papers"]:
v_papers.append(paper_ids[p])
volume["papers"] = v_papers
update["volumes"] = volumes
db["journals"].update_one(mongo_filter, mongo_update)
i += 1
if i % 100 == 0:
print("Journals updated with refs: ", i)
if __name__ == "__main__": if __name__ == "__main__":