wip
This commit is contained in:
parent
ce1c4f88f3
commit
4cc72067e4
1 changed file with 67 additions and 32 deletions
|
@ -9,10 +9,8 @@ fake = Faker()
|
||||||
|
|
||||||
# pip install faker pymongo
|
# pip install faker pymongo
|
||||||
|
|
||||||
# TODO: number on publicationDetails (random)
|
|
||||||
# TODO: figures (fake)
|
# TODO: figures (fake)
|
||||||
# TODO: inject (fake) figures and references in content
|
# TODO: inject (fake) figures and references in content
|
||||||
# TODO: references are a dictionary
|
|
||||||
|
|
||||||
# - article doi or JSON filename can be used as paper id
|
# - article doi or JSON filename can be used as paper id
|
||||||
# - no author id, authors are equal based on name for lack of info
|
# - no author id, authors are equal based on name for lack of info
|
||||||
|
@ -73,14 +71,6 @@ def getProp(obj: dict, props: str, default = None):
|
||||||
target = target[prop]
|
target = target[prop]
|
||||||
return target
|
return target
|
||||||
|
|
||||||
def setProp(obj: dict, props: str, toSave: any):
|
|
||||||
target = obj
|
|
||||||
for prop in props.split("."):
|
|
||||||
if prop not in target:
|
|
||||||
target[prop] = {}
|
|
||||||
target = target[prop]
|
|
||||||
target[prop] = toSave
|
|
||||||
|
|
||||||
def save_sentence(body: dict, parents: [dict], sentence: str):
|
def save_sentence(body: dict, parents: [dict], sentence: str):
|
||||||
target = body
|
target = body
|
||||||
for p in parents:
|
for p in parents:
|
||||||
|
@ -156,21 +146,33 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
|
||||||
|
|
||||||
publicationDetails["journal"] = journal["name"]
|
publicationDetails["journal"] = journal["name"]
|
||||||
publicationDetails["volume"] = volume
|
publicationDetails["volume"] = volume
|
||||||
|
publicationDetails["number"] = random.randint(1, 99)
|
||||||
publicationDetails["date"] = date.isoformat()
|
publicationDetails["date"] = date.isoformat()
|
||||||
|
|
||||||
publicationDetails["pages"] = {
|
publicationDetails["pages"] = {
|
||||||
"start": getProp(jsonObj, "metadata.firstpage"),
|
"start": getProp(jsonObj, "metadata.firstpage"),
|
||||||
"end": getProp(jsonObj, "metadata.lastpage")
|
"end": getProp(jsonObj, "metadata.lastpage")
|
||||||
}
|
}
|
||||||
paper["publicationDetails"] = publicationDetails
|
paper["publicationDetails"] = publicationDetails
|
||||||
|
|
||||||
references = {}
|
figures = []
|
||||||
|
for i in range(0, random.randint(3, 15)):
|
||||||
|
figures.push({
|
||||||
|
"page": random.randint(1, 10),
|
||||||
|
"label": "fig" + str(i),
|
||||||
|
"caption": fake.paragraph(nb_sentences=1),
|
||||||
|
"imageURL": fake.image_url()
|
||||||
|
})
|
||||||
|
paper["figures"] = figures
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
references = []
|
||||||
for key, value in getProp(jsonObj, "bib_entries", {}).items():
|
for key, value in getProp(jsonObj, "bib_entries", {}).items():
|
||||||
|
i += 1
|
||||||
if value is None:
|
if value is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
references[key] = {
|
ref = {
|
||||||
|
"referenceNumber": i,
|
||||||
"doi": getProp(value, "doi"),
|
"doi": getProp(value, "doi"),
|
||||||
"title": getProp(value, "title"),
|
"title": getProp(value, "title"),
|
||||||
"authors": [],
|
"authors": [],
|
||||||
|
@ -180,7 +182,8 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
|
||||||
}
|
}
|
||||||
|
|
||||||
for author in getProp(value, "authors", []):
|
for author in getProp(value, "authors", []):
|
||||||
references[key]["authors"].append({ "name": get_author_name(author) })
|
ref["authors"].append({ "name": get_author_name(author) })
|
||||||
|
references.append(ref)
|
||||||
paper["references"] = references
|
paper["references"] = references
|
||||||
|
|
||||||
body = {
|
body = {
|
||||||
|
@ -231,20 +234,28 @@ def main():
|
||||||
paper_ids[paper["doi"]] = x.inserted_id
|
paper_ids[paper["doi"]] = x.inserted_id
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
if i % 20 == 0:
|
if i % 100 == 0:
|
||||||
print("Papers processed: " + i)
|
print("Papers processed: ", i)
|
||||||
if i == 10: # TODO: remove
|
if i == 1000: # TODO: remove
|
||||||
break
|
break
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
for name, author in authors.items():
|
for name, author in authors.items():
|
||||||
x = db["authors"].insert_one(author)
|
x = db["authors"].insert_one(author)
|
||||||
author_ids[name] = x.inserted_id
|
author_ids[name] = x.inserted_id
|
||||||
|
i += 1
|
||||||
|
if i % 1000 == 0:
|
||||||
|
print("Authors processed: ", i)
|
||||||
|
|
||||||
|
i = 0
|
||||||
for issn, journal in journals.items():
|
for issn, journal in journals.items():
|
||||||
x = db["journals"].insert_one(journal)
|
x = db["journals"].insert_one(journal)
|
||||||
journal_ids[issn] = x.inserted_id
|
journal_ids[issn] = x.inserted_id
|
||||||
|
i += 1
|
||||||
|
if i % 100 == 0:
|
||||||
|
print("Journals processed: ", i)
|
||||||
|
|
||||||
|
i = 0
|
||||||
for paper in db["papers"].find():
|
for paper in db["papers"].find():
|
||||||
mongo_filter = { "_id": paper["_id"] }
|
mongo_filter = { "_id": paper["_id"] }
|
||||||
update = {}
|
update = {}
|
||||||
|
@ -252,30 +263,54 @@ def main():
|
||||||
|
|
||||||
issn = getProp(paper, "publicationDetails.issn", "")
|
issn = getProp(paper, "publicationDetails.issn", "")
|
||||||
if issn in journal_ids:
|
if issn in journal_ids:
|
||||||
setProp(update, "publicationDetails.journalRef", journal_ids[issn])
|
update["publicationDetails.journalRef"] = journal_ids[issn]
|
||||||
|
|
||||||
for key, ref in getProp(paper, "references", {}).items():
|
references = getProp(paper, "references", [])
|
||||||
|
for ref in references:
|
||||||
if ref["doi"] in paper_ids:
|
if ref["doi"] in paper_ids:
|
||||||
setProp(update, "references." + key + ".paperId", paper_ids[ref["doi"]])
|
ref["paperId"] = paper_ids[ref["doi"]]
|
||||||
|
|
||||||
authors_loc = getProp(ref, "authors", [])
|
for author in ref["authors"]:
|
||||||
for author in authors_loc:
|
name = author["name"]
|
||||||
if name in author_ids:
|
if name in author_ids:
|
||||||
author["authorId"] = author_ids[name]
|
author["authorId"] = author_ids[name]
|
||||||
setProp(update, "references." + key + ".authors", authors_loc)
|
|
||||||
|
|
||||||
issn = getProp(ref, "issn", "")
|
if ref["issn"] in journal_ids:
|
||||||
if issn in journal_ids:
|
ref["journalId"] = journal_ids[issn]
|
||||||
setProp(update, "references." + key + ".journalId", journal_ids[issn])
|
update["references"] = references
|
||||||
|
|
||||||
authors_loc = getProp(paper, "authors", [])
|
authors_loc = getProp(paper, "authors", [])
|
||||||
for author in authors_loc:
|
for author in authors_loc:
|
||||||
|
name = author["name"]
|
||||||
if name in author_ids:
|
if name in author_ids:
|
||||||
author["authorId"] = author_ids[name]
|
author["authorId"] = author_ids[name]
|
||||||
setProp(update, "authors", authors_loc)
|
update["authors"] = authors_loc
|
||||||
|
|
||||||
print(mongo_update)
|
|
||||||
db["papers"].update_one(mongo_filter, mongo_update)
|
db["papers"].update_one(mongo_filter, mongo_update)
|
||||||
|
|
||||||
|
i += 1
|
||||||
|
if i % 100 == 0:
|
||||||
|
print("Papers updated with refs: ", i)
|
||||||
|
|
||||||
|
i = 0
|
||||||
|
for journal in db["journals"].find():
|
||||||
|
mongo_filter = { "_id": journal["_id"] }
|
||||||
|
update = {}
|
||||||
|
mongo_update = { "$set": update }
|
||||||
|
|
||||||
|
volumes = getProp(journal, "volumes", [])
|
||||||
|
for volume in volumes:
|
||||||
|
v_papers = []
|
||||||
|
for p in volume["papers"]:
|
||||||
|
v_papers.append(paper_ids[p])
|
||||||
|
volume["papers"] = v_papers
|
||||||
|
update["volumes"] = volumes
|
||||||
|
|
||||||
|
db["journals"].update_one(mongo_filter, mongo_update)
|
||||||
|
i += 1
|
||||||
|
if i % 100 == 0:
|
||||||
|
print("Journals updated with refs: ", i)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Reference in a new issue