2022-11-20 12:35:01 +00:00
|
|
|
import json
|
|
|
|
import sys
|
|
|
|
import random
|
|
|
|
import os
|
|
|
|
from faker import Faker
|
|
|
|
from pymongo import MongoClient
|
|
|
|
|
|
|
|
fake = Faker()

# pip install faker pymongo

# TODO: figures (fake)
# TODO: inject (fake) figures and references in content

# - article doi or JSON filename can be used as paper id
# - no author id, authors are equal based on name for lack of info
# - use issn as journal id
# - author affiliation Faked
# - journal name Faked
# - publication date Faked (year ignored)

# Pool of 500 fake university names; location_on_land() returns a
# 5-tuple whose third element is the place name.
fake_unis = ["University of " + fake.location_on_land()[2] for _ in range(500)]
|
|
|
|
|
|
|
|
# Registry of every author seen so far, keyed by name (the source data
# has no author ids, so identical names are treated as one author).
authors = {}


def fake_author(name: str, email: str) -> dict:
    """Return the author record for *name*, creating it on first sight.

    Affiliation and bio are faked; records are cached in the module-level
    ``authors`` registry so repeated names share a single entry.
    (Fixed: the original annotated the return type as ``str`` although a
    dict is returned.)
    """
    if name not in authors:
        authors[name] = {
            "name": name,
            "email": email,
            # random.choice is the idiomatic uniform pick from a sequence
            "affiliation": random.choice(fake_unis),
            "bio": fake.paragraph(nb_sentences=5),
        }
    return authors[name]
|
|
|
|
|
|
|
|
# Registry of every journal seen so far, keyed by ISSN.
journals = {}


def fake_journal(issn: str, volume: str, paper_doi: str) -> dict:
    """Return the journal record for *issn*, filing *paper_doi* under *volume*.

    Journal and volume entries are created on demand; the journal name is
    faked from a Faker "bs" phrase. (Fixed: the original annotated the
    return type as ``str`` although a dict is returned.)
    """
    if issn not in journals:
        journals[issn] = {
            "issn": issn,
            # drop the first word of the faked phrase to get a name
            "name": " ".join(fake.bs().split(" ")[1:]).capitalize(),
            "volumes": [{
                "name": volume,
                "papers": [paper_doi],
            }],
        }
    else:
        # for-else replaces the original not_found flag: the else branch
        # runs only when no existing volume matched.
        for vol in journals[issn]["volumes"]:
            if vol["name"] == volume:
                vol["papers"].append(paper_doi)
                break
        else:
            journals[issn]["volumes"].append({
                "name": volume,
                "papers": [paper_doi],
            })

    return journals[issn]
|
|
|
|
|
|
|
|
def getProp(obj: dict, props: str, default=None):
    """Safely resolve a dotted path *props* (e.g. "metadata.doi") in *obj*.

    Returns *default* when *obj* is None, when any path segment is
    missing, or when an intermediate value is not a dict — the original
    raised TypeError in that last case (``"b" in 1``).
    """
    target = obj
    if target is None:
        return default
    for prop in props.split("."):
        # guard non-dict intermediates before the membership test
        if not isinstance(target, dict) or prop not in target:
            return default
        target = target[prop]
    return target
|
|
|
|
|
|
|
|
def save_sentence(body: dict, parents: list[dict], sentence: str) -> None:
    """Append *sentence* to the section of *body* addressed by *parents*.

    *parents* is an ordered chain of ``{"id", "title"}`` dicts from
    outermost to innermost section; missing intermediate sections are
    created on the way down. (Fixed: the original annotated *parents* as
    ``[dict]`` — a list literal, not a valid type annotation.)
    """
    target = body
    for p in parents:
        if p["id"] not in target["sections"]:
            target["sections"][p["id"]] = {
                "title": p["title"],
                "content": "",
                "sections": {},
            }
        target = target["sections"][p["id"]]
    # trailing space keeps consecutive sentences separated
    target["content"] += sentence + " "
|
|
|
|
|
|
|
|
def transform_section(sec: dict):
    """Convert a section tree into nested content lists.

    Subsections are emitted in sorted key order. The root section (no
    "title") yields a plain list; titled sections yield a
    ``{"title", "content"}`` dict.
    """
    subsections = [transform_section(sec["sections"][k])
                   for k in sorted(sec["sections"].keys())]

    # BUG FIX: the original assignment ended with a trailing comma,
    # which made `content` a 1-tuple wrapping the list.
    own = [sec["content"]] if "content" in sec and sec["content"] != "" else []
    content = own + subsections

    if "title" not in sec:
        return content

    return {
        "title": sec["title"],
        "content": content,
    }
|
|
|
|
|
|
|
|
def get_author_name(author: dict) -> str:
    """Build "First Last" from an author record, tolerating missing parts.

    Uses .get() so an absent "first"/"last" key behaves like an explicit
    None (the original raised KeyError when a key was missing).
    """
    first = author.get("first") or ""
    last = author.get("last") or ""
    return (first + " " + last).strip()
|
|
|
|
|
|
|
|
def json_to_paper(filename: str, jsonObj: dict) -> dict:
    """Transform a parsed source-article JSON object into a paper document.

    Missing metadata (affiliations, journal name, dates, figures) is
    faked; authors and journals are registered in the module-level
    caches as a side effect. *filename* is currently unused but kept as
    a future paper-id candidate.
    """
    paper = {}

    paper["title"] = getProp(jsonObj, "metadata.title")
    paper["abstract"] = getProp(jsonObj, "abstract")
    paper["doi"] = getProp(jsonObj, "metadata.doi")

    paper_authors = []
    for author in getProp(jsonObj, "metadata.authors", []):
        email = getProp(author, "email")

        record = fake_author(get_author_name(author), email)

        # TODO: authorID
        paper_authors.append({
            "email": record["email"],
            "name": record["name"],
            "affiliation": record["affiliation"],
        })

    paper["authors"] = paper_authors
    paper["keywords"] = getProp(jsonObj, "metadata.keywords")

    publicationDetails = {}
    publicationDetails["issn"] = getProp(jsonObj, "metadata.issn")  # ISBN-like, not a name

    date = fake.date_object()
    volume = getProp(jsonObj, "metadata.volume")
    if volume is None:
        # fall back to a synthetic volume name from the faked date
        volume = str(date.year) + " Issue"

    journal = fake_journal(
        publicationDetails["issn"],
        volume,
        getProp(jsonObj, "metadata.doi"),
    )

    publicationDetails["journal"] = journal["name"]
    publicationDetails["volume"] = volume
    publicationDetails["number"] = random.randint(1, 99)
    publicationDetails["date"] = date.isoformat()
    publicationDetails["pages"] = {
        "start": getProp(jsonObj, "metadata.firstpage"),
        "end": getProp(jsonObj, "metadata.lastpage"),
    }
    paper["publicationDetails"] = publicationDetails

    figures = []
    for i in range(random.randint(3, 15)):
        # BUG FIX: the original called figures.push(), which does not
        # exist on Python lists (AttributeError); append() is correct.
        figures.append({
            "page": random.randint(1, 10),
            "label": "fig" + str(i),
            "caption": fake.paragraph(nb_sentences=1),
            "imageURL": fake.image_url(),
        })
    paper["figures"] = figures

    references = []
    # enumerate replaces the original hand-maintained counter; numbering
    # still advances on skipped (None) entries, as before.
    for i, (key, value) in enumerate(getProp(jsonObj, "bib_entries", {}).items(), start=1):
        if value is None:
            continue

        ref = {
            "referenceNumber": i,
            "doi": getProp(value, "doi"),
            "title": getProp(value, "title"),
            "authors": [],
            "issn": getProp(value, "issn"),
            "volume": getProp(value, "volume"),
            "year": getProp(value, "year"),
        }

        for author in getProp(value, "authors", []):
            ref["authors"].append({"name": get_author_name(author)})

        references.append(ref)

    paper["references"] = references

    body = {
        "sections": {}
    }

    sentences = getProp(jsonObj, "body_text", [])
    sentences.sort(key=lambda x: x["startOffset"])
    for e in sentences:
        # parent chain (outermost first), then the sentence's own section
        parents = list(getProp(e, "parents", []))
        parents.append({"id": e["secId"], "title": e["title"]})
        save_sentence(body, parents, e["sentence"])

    paper["content"] = transform_section(body)
    return paper
|
|
|
|
|
|
|
|
|
|
|
|
# MongoDB connection string (local development instance).
mongo_conn_str = "mongodb://localhost:27017"


def main():
    """Load article JSON files from sys.argv[1] into MongoDB.

    Drops and repopulates the papers/authors/journals collections, then
    performs a second pass to cross-link papers, authors and journals by
    the ObjectIds assigned on insert.
    """
    source_folder: str = sys.argv[1]

    mongo = MongoClient(mongo_conn_str)
    db = mongo["ddm"]

    # start from a clean database on every run
    db["papers"].drop()
    db["authors"].drop()
    db["journals"].drop()

    # natural key (doi / name / issn) -> ObjectId of the inserted doc.
    # (Fixed: the original annotated these as dict[str, ID] with an
    # undefined name ID.)
    paper_ids = {}
    author_ids = {}
    journal_ids = {}

    i = 0
    for filename in os.listdir(source_folder):
        if filename.endswith(".json"):
            # json.load replaces the original manual readlines/JSONDecoder
            with open(source_folder + "/" + filename, 'r') as jsonFile:
                jsonObj = json.load(jsonFile)

            paper = json_to_paper(filename, jsonObj)

            x = db["papers"].insert_one(paper)
            paper_ids[paper["doi"]] = x.inserted_id

            i += 1
            if i % 100 == 0:
                print("Papers processed: ", i)
            if i == 1000:  # TODO: remove
                break

    i = 0
    for name, author in authors.items():
        x = db["authors"].insert_one(author)
        author_ids[name] = x.inserted_id
        i += 1
        if i % 1000 == 0:
            print("Authors processed: ", i)

    i = 0
    for issn, journal in journals.items():
        x = db["journals"].insert_one(journal)
        journal_ids[issn] = x.inserted_id
        i += 1
        if i % 100 == 0:
            print("Journals processed: ", i)

    # second pass over papers: resolve journal, reference and author ids
    i = 0
    for paper in db["papers"].find():
        mongo_filter = {"_id": paper["_id"]}
        update = {}
        mongo_update = {"$set": update}

        issn = getProp(paper, "publicationDetails.issn", "")
        if issn in journal_ids:
            update["publicationDetails.journalRef"] = journal_ids[issn]

        references = getProp(paper, "references", [])
        for ref in references:
            if ref["doi"] in paper_ids:
                ref["paperId"] = paper_ids[ref["doi"]]

            for author in ref["authors"]:
                name = author["name"]
                if name in author_ids:
                    author["authorId"] = author_ids[name]

            # BUG FIX: the original tested ref["issn"] but then looked
            # up journal_ids[issn] — the containing paper's ISSN — so
            # every reference pointed at the wrong journal.
            if ref["issn"] in journal_ids:
                ref["journalId"] = journal_ids[ref["issn"]]
        update["references"] = references

        authors_loc = getProp(paper, "authors", [])
        for author in authors_loc:
            name = author["name"]
            if name in author_ids:
                author["authorId"] = author_ids[name]
        update["authors"] = authors_loc

        db["papers"].update_one(mongo_filter, mongo_update)

        i += 1
        if i % 100 == 0:
            print("Papers updated with refs: ", i)

    # second pass over journals: replace paper DOIs with ObjectIds
    i = 0
    for journal in db["journals"].find():
        mongo_filter = {"_id": journal["_id"]}
        update = {}
        mongo_update = {"$set": update}

        volumes = getProp(journal, "volumes", [])
        for volume in volumes:
            volume["papers"] = [paper_ids[p] for p in volume["papers"]]
        update["volumes"] = volumes

        db["journals"].update_one(mongo_filter, mongo_update)
        i += 1
        if i % 100 == 0:
            print("Journals updated with refs: ", i)


if __name__ == "__main__":
    main()
|