import json
import sys
import random
import os
from faker import Faker
from pymongo import MongoClient
from bson import ObjectId  # ObjectId ships with pymongo (bson); used for the id-map type hints

fake = Faker()

# pip install faker pymongo
# - article DOI or JSON filename can be used as paper id
# - no author id: authors are considered equal based on name, for lack of better info
# - the journal ISSN is used as journal id
# - author affiliation is faked
# - journal name is faked
# - publication date is faked (year ignored)

# Pre-generate a pool of fake university names used as author affiliations.
fake_unis = []
for _ in range(500):
    (_, _, loc, _, _) = fake.location_on_land()
    fake_unis.append("University of " + loc)

authors = {}

def fake_author(name: str, email: str) -> dict:
    """Return the author entry for `name`, creating it with faked details on first use."""
    if name not in authors:
        authors[name] = {
            "name": name,
            "email": email,
            "affiliation": random.choice(fake_unis),
            "bio": fake.paragraph(nb_sentences=5)
        }
    return authors[name]

journals = {}

def fake_journal(issn: str, volume: str, paper_doi: str) -> dict:
    """Return the journal entry for `issn`, registering `paper_doi` under `volume`."""
    if issn not in journals:
        journals[issn] = {
            "issn": issn,
            "name": " ".join(fake.bs().split(" ")[1:]).capitalize(),
            "volumes": [{
                "name": volume,
                "papers": [paper_doi]
            }]
        }
    else:
        not_found = True
        for v in journals[issn]["volumes"]:
            if v["name"] == volume:
                not_found = False
                v["papers"].append(paper_doi)
                break
        if not_found:
            journals[issn]["volumes"].append({
                "name": volume,
                "papers": [paper_doi]
            })
    return journals[issn]

def getProp(obj: dict, props: str, default=None):
    """Safely read a nested property given a dot-separated path, e.g. "metadata.title"."""
    target = obj
    if target is None:
        return default
    for prop in props.split("."):
        if not isinstance(target, dict) or prop not in target:
            return default
        target = target[prop]
    return target
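# Illustrative sketch of how getProp resolves paths (example values are made up,
# they are not taken from the dataset):
#
#   getProp({"metadata": {"title": "A paper"}}, "metadata.title")       -> "A paper"
#   getProp({"metadata": {}}, "metadata.doi")                           -> None
#   getProp({"metadata": None}, "metadata.doi", default="missing")      -> "missing"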
def save_sentence(body: dict, parents: list[dict], sentence: str):
    """Append `sentence` to the section of `body` identified by the chain of `parents`."""
    target = body
    for p in parents:
        if p["id"] not in target["sections"]:
            target["sections"][p["id"]] = {
                "title": p["title"],
                "content": "",
                "sections": {},
            }
        target = target["sections"][p["id"]]
    target["content"] += sentence + " "

def transform_section(sec: dict, figures: list[dict], references: list[dict]):
    """Recursively convert the intermediate section tree into the final nested structure,
    randomly sprinkling figure labels and reference pointers into the content."""
    arr = []
    ks = sorted(sec["sections"].keys())
    for k in ks:
        arr.append(transform_section(sec["sections"][k], figures, references))
    if "title" not in sec:
        return arr
    content = []
    if random.randint(0, 4) == 0 and len(figures) > 0:
        content += [{
            "label": figures[random.randint(0, len(figures) - 1)]["label"]
        }]
    if "content" in sec and sec["content"] != "":
        content += [sec["content"]]
    if random.randint(0, 4) == 0 and len(references) > 0:
        content += [{
            "reference": random.randint(1, len(references))
        }]
    content += arr
    if len(content) > 0 and isinstance(content[-1], list) and len(content[-1]) == 0:
        del content[-1]
    return {
        "title": sec["title"],
        "content": content
    }

def get_author_name(author: dict) -> str:
    first = "" if author["first"] is None else author["first"]
    last = "" if author["last"] is None else author["last"]
    return (first + " " + last).strip()

def json_to_paper(filename: str, jsonObj: dict) -> dict:
    """Build a paper document from one parsed-article JSON object."""
    paper = {}
    paper["title"] = getProp(jsonObj, "metadata.title")
    paper["abstract"] = getProp(jsonObj, "abstract")
    paper["doi"] = getProp(jsonObj, "metadata.doi")

    paper_authors = []
    for author in getProp(jsonObj, "metadata.authors", []):
        email = getProp(author, "email")
        author = fake_author(get_author_name(author), email)
        paper_authors.append({
            "email": author["email"],
            "name": author["name"],
            "affiliation": author["affiliation"]
        })
    paper["authors"] = paper_authors

    paper["keywords"] = getProp(jsonObj, "metadata.keywords")

    publicationDetails = {}
    publicationDetails["issn"] = getProp(jsonObj, "metadata.issn")

    date = fake.date_object()
    volume = getProp(jsonObj, "metadata.volume")
    if volume is None:
        volume = str(date.year) + " Issue"

    journal = fake_journal(
        publicationDetails["issn"], volume, getProp(jsonObj, "metadata.doi")
    )
    publicationDetails["journal"] = journal["name"]
    publicationDetails["volume"] = volume
    publicationDetails["number"] = random.randint(1, 99)
    publicationDetails["date"] = date.isoformat()
    publicationDetails["pages"] = {
        "start": getProp(jsonObj, "metadata.firstpage"),
        "end": getProp(jsonObj, "metadata.lastpage")
    }
    paper["publicationDetails"] = publicationDetails

    # Figures are entirely faked: a random number of them, with random pages and captions.
    figures = []
    for i in range(0, random.randint(3, 15)):
        figures.append({
            "page": random.randint(1, 10),
            "label": "fig" + str(i),
            "caption": fake.paragraph(nb_sentences=1),
            "imageURL": fake.image_url()
        })
    paper["figures"] = figures

    i = 0
    references = []
    for key, value in getProp(jsonObj, "bib_entries", {}).items():
        if value is None:
            continue
        i += 1
        ref = {
            "referenceNumber": i,
            "doi": getProp(value, "doi"),
            "title": getProp(value, "title"),
            "authors": [],
            "issn": getProp(value, "issn"),
            "volume": getProp(value, "volume"),
            "year": getProp(value, "year")
        }
        for author in getProp(value, "authors", []):
            ref["authors"].append({
                "name": get_author_name(author)
            })
        references.append(ref)
    paper["references"] = references

    # Rebuild the section tree from the flat list of body sentences, then flatten it
    # into the final content structure.
    body = {
        "sections": {}
    }
    sentences = getProp(jsonObj, "body_text", [])
    sentences.sort(key=lambda x: x["startOffset"])
    for e in sentences:
        parents = []
        for p in getProp(e, "parents", []):
            parents.append(p)
        parents.append({
            "id": e["secId"],
            "title": e["title"]
        })
        save_sentence(body, parents, e["sentence"])
    paper["content"] = transform_section(body, figures, references)

    return paper
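# Rough shape of a document produced by json_to_paper (illustrative only, values abridged):
# {
#   "title": "...", "abstract": "...", "doi": "...",
#   "authors": [{"email": "...", "name": "...", "affiliation": "University of ..."}],
#   "keywords": [...],
#   "publicationDetails": {
#       "issn": "...", "journal": "...", "volume": "...", "number": 12,
#       "date": "2020-01-01", "pages": {"start": "...", "end": "..."}
#   },
#   "figures": [{"page": 3, "label": "fig0", "caption": "...", "imageURL": "..."}],
#   "references": [{"referenceNumber": 1, "doi": "...", "title": "...", "authors": [...],
#                   "issn": "...", "volume": "...", "year": "..."}],
#   "content": [{"title": "Introduction", "content": ["...", {"reference": 3}]}]
# }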
mongo_conn_str = "mongodb://localhost:27017"

def main():
    source_folder: str = sys.argv[1]
    if len(sys.argv) > 2:
        limit: int = int(sys.argv[2])
    else:
        limit: int = -1

    mongo = MongoClient(mongo_conn_str)
    db = mongo["ddm"]
    # Start from a clean database on every run.
    db["papers"].drop()
    db["authors"].drop()
    db["journals"].drop()

    # Maps from natural keys (DOI, author name, ISSN) to the ObjectIds assigned on
    # insertion; used in the second pass to wire up cross-references.
    paper_ids: dict[str, ObjectId] = {}
    author_ids: dict[str, ObjectId] = {}
    journal_ids: dict[str, ObjectId] = {}

    i = 0
    j = 0
    for filename in os.listdir(source_folder):
        if filename.endswith(".json"):
            with open(os.path.join(source_folder, filename), 'r') as jsonFile:
                jsonObj = json.load(jsonFile)
            if getProp(jsonObj, "metadata.issn") is None or \
               getProp(jsonObj, "metadata.doi") is None:
                j += 1
                continue  # SKIP papers with no journal ISSN or paper DOI

            paper = json_to_paper(filename, jsonObj)
            x = db["papers"].insert_one(paper)
            paper_ids[paper["doi"]] = x.inserted_id

            i += 1
            if i % 100 == 0:
                print("Papers processed: ", i)
            if j % 100 == 0 and j > 0:
                print("Papers skipped: ", j)
            if limit > 0 and i == limit:
                break
    print("Papers skipped: ", j)

    i = 0
    for name, author in authors.items():
        x = db["authors"].insert_one(author)
        author_ids[name] = x.inserted_id
        i += 1
        if i % 1000 == 0:
            print("Authors processed: ", i)

    i = 0
    for issn, journal in journals.items():
        x = db["journals"].insert_one(journal)
        journal_ids[issn] = x.inserted_id
        i += 1
        if i % 100 == 0:
            print("Journals processed: ", i)

    # Second pass over papers: replace natural keys with ObjectId references.
    i = 0
    for paper in db["papers"].find():
        mongo_filter = { "_id": paper["_id"] }
        update = {}
        mongo_update = { "$set": update }

        issn = getProp(paper, "publicationDetails.issn", "")
        if issn in journal_ids:
            update["publicationDetails.journalRef"] = journal_ids[issn]

        references = getProp(paper, "references", [])
        for ref in references:
            if ref["doi"] in paper_ids:
                ref["paperId"] = paper_ids[ref["doi"]]
            for author in ref["authors"]:
                name = author["name"]
                if name in author_ids:
                    author["authorId"] = author_ids[name]
            if ref["issn"] in journal_ids:
                ref["journalId"] = journal_ids[ref["issn"]]
        update["references"] = references

        authors_loc = getProp(paper, "authors", [])
        for author in authors_loc:
            name = author["name"]
            if name in author_ids:
                author["authorId"] = author_ids[name]
        update["authors"] = authors_loc

        db["papers"].update_one(mongo_filter, mongo_update)
        i += 1
        if i % 100 == 0:
            print("Papers updated with refs: ", i)

    # Second pass over journals: swap the paper DOIs inside volumes for ObjectId references.
    i = 0
    for journal in db["journals"].find():
        mongo_filter = { "_id": journal["_id"] }
        update = {}
        mongo_update = { "$set": update }

        volumes = getProp(journal, "volumes", [])
        for volume in volumes:
            v_papers = []
            for p in volume["papers"]:
                v_papers.append(paper_ids[p])
            volume["papers"] = v_papers
        update["volumes"] = volumes

        db["journals"].update_one(mongo_filter, mongo_update)
        i += 1
        if i % 100 == 0:
            print("Journals updated with refs: ", i)

if __name__ == "__main__":
    main()
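# Example invocation (hypothetical script name and paths; assumes MongoDB is reachable at
# mongo_conn_str and that the folder contains the parsed-article JSON files):
#
#   python import_papers.py ./json_papers 1000
#
# The optional second argument limits how many papers are imported; omit it to import everything.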