From 4cc72067e4158e01cbff3b32d0a3ef480c24c25a Mon Sep 17 00:00:00 2001 From: Claudio Maggioni Date: Sun, 20 Nov 2022 14:16:27 +0100 Subject: [PATCH] wip --- hw02/convert.py | 99 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 67 insertions(+), 32 deletions(-) diff --git a/hw02/convert.py b/hw02/convert.py index 3ec91dd..895256e 100644 --- a/hw02/convert.py +++ b/hw02/convert.py @@ -9,10 +9,8 @@ fake = Faker() # pip install faker pymongo -# TODO: number on publicationDetails (random) # TODO: figures (fake) # TODO: inject (fake) figures and references in content -# TODO: references are a dictionary # - article doi or JSON filename can be used as paper id # - no author id, authors are equal based on name for lack of info @@ -73,14 +71,6 @@ def getProp(obj: dict, props: str, default = None): target = target[prop] return target -def setProp(obj: dict, props: str, toSave: any): - target = obj - for prop in props.split("."): - if prop not in target: - target[prop] = {} - target = target[prop] - target[prop] = toSave - def save_sentence(body: dict, parents: [dict], sentence: str): target = body for p in parents: @@ -156,21 +146,33 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict: publicationDetails["journal"] = journal["name"] publicationDetails["volume"] = volume - + publicationDetails["number"] = random.randint(1, 99) publicationDetails["date"] = date.isoformat() - publicationDetails["pages"] = { "start": getProp(jsonObj, "metadata.firstpage"), "end": getProp(jsonObj, "metadata.lastpage") } paper["publicationDetails"] = publicationDetails - references = {} + figures = [] + for i in range(0, random.randint(3, 15)): + figures.push({ + "page": random.randint(1, 10), + "label": "fig" + str(i), + "caption": fake.paragraph(nb_sentences=1), + "imageURL": fake.image_url() + }) + paper["figures"] = figures + + i = 0 + references = [] for key, value in getProp(jsonObj, "bib_entries", {}).items(): + i += 1 if value is None: continue - references[key] = { + ref = { + "referenceNumber": i, "doi": getProp(value, "doi"), "title": getProp(value, "title"), "authors": [], @@ -180,7 +182,8 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict: } for author in getProp(value, "authors", []): - references[key]["authors"].append({ "name": get_author_name(author) }) + ref["authors"].append({ "name": get_author_name(author) }) + references.append(ref) paper["references"] = references body = { @@ -231,20 +234,28 @@ def main(): paper_ids[paper["doi"]] = x.inserted_id i += 1 - if i % 20 == 0: - print("Papers processed: " + i) - if i == 10: # TODO: remove + if i % 100 == 0: + print("Papers processed: ", i) + if i == 1000: # TODO: remove break i = 0 for name, author in authors.items(): x = db["authors"].insert_one(author) author_ids[name] = x.inserted_id - + i += 1 + if i % 1000 == 0: + print("Authors processed: ", i) + + i = 0 for issn, journal in journals.items(): x = db["journals"].insert_one(journal) journal_ids[issn] = x.inserted_id + i += 1 + if i % 100 == 0: + print("Journals processed: ", i) + i = 0 for paper in db["papers"].find(): mongo_filter = { "_id": paper["_id"] } update = {} @@ -252,30 +263,54 @@ def main(): issn = getProp(paper, "publicationDetails.issn", "") if issn in journal_ids: - setProp(update, "publicationDetails.journalRef", journal_ids[issn]) - - for key, ref in getProp(paper, "references", {}).items(): + update["publicationDetails.journalRef"] = journal_ids[issn] + + references = getProp(paper, "references", []) + for ref in references: if ref["doi"] in paper_ids: - setProp(update, "references." + key + ".paperId", paper_ids[ref["doi"]]) + ref["paperId"] = paper_ids[ref["doi"]] - authors_loc = getProp(ref, "authors", []) - for author in authors_loc: + for author in ref["authors"]: + name = author["name"] if name in author_ids: author["authorId"] = author_ids[name] - setProp(update, "references." + key + ".authors", authors_loc) - issn = getProp(ref, "issn", "") - if issn in journal_ids: - setProp(update, "references." + key + ".journalId", journal_ids[issn]) + if ref["issn"] in journal_ids: + ref["journalId"] = journal_ids[issn] + update["references"] = references authors_loc = getProp(paper, "authors", []) for author in authors_loc: + name = author["name"] if name in author_ids: author["authorId"] = author_ids[name] - setProp(update, "authors", authors_loc) - - print(mongo_update) + update["authors"] = authors_loc + db["papers"].update_one(mongo_filter, mongo_update) + + i += 1 + if i % 100 == 0: + print("Papers updated with refs: ", i) + + i = 0 + for journal in db["journals"].find(): + mongo_filter = { "_id": journal["_id"] } + update = {} + mongo_update = { "$set": update } + + volumes = getProp(journal, "volumes", []) + for volume in volumes: + v_papers = [] + for p in volume["papers"]: + v_papers.append(paper_ids[p]) + volume["papers"] = v_papers + update["volumes"] = volumes + + db["journals"].update_one(mongo_filter, mongo_update) + i += 1 + if i % 100 == 0: + print("Journals updated with refs: ", i) + if __name__ == "__main__":