This repository has been archived on 2022-12-21. You can view files and clone it, but cannot push or open issues or pull requests.
ddm/hw02/import.py

336 lines
9.9 KiB
Python

import json
import sys
import random
import os
from faker import Faker
from pymongo import MongoClient
fake = Faker()
# pip install faker pymongo
# - article doi or JSON filename can be used as paper id
# - no author id, authors are equal based on name for lack of info
# - use issn as journal id
# - author affiliation Faked
# - journal name Faked
# - publication date Faked (year ignored)
# Pre-generate a pool of 500 plausible university names from faked
# on-land map locations (index 2 of the tuple is the place name).
fake_unis = [
    "University of " + fake.location_on_land()[2] for _ in range(500)
]
# Cache of author documents keyed by full name: the source data has no
# author ids, so equal names are treated as the same person.
authors = {}


def fake_author(name: str, email: str) -> dict:
    """Return the author document for `name`, creating it on first sight.

    The real name/email are kept; affiliation and bio are synthesized
    with Faker. Subsequent calls with the same name return the cached
    document (the first email seen wins).
    """
    if name not in authors:
        authors[name] = {
            "name": name,
            "email": email,
            "affiliation": random.choice(fake_unis),
            "bio": fake.paragraph(nb_sentences=5),
        }
    return authors[name]
# Cache of journal documents keyed by ISSN; volumes accumulate paper DOIs.
journals = {}


def fake_journal(issn: str, volume: str, paper_doi: str) -> dict:
    """Register `paper_doi` under journal `issn` / `volume`; return the journal.

    The journal and the volume are created on first sight; the journal
    name is synthesized with Faker (ISSN is the only real journal datum).
    """
    if issn not in journals:
        journals[issn] = {
            "issn": issn,
            # Drop the leading verb of a faked "business phrase" to get a
            # noun-ish journal name.
            "name": " ".join(fake.bs().split(" ")[1:]).capitalize(),
            "volumes": [{
                "name": volume,
                "papers": [paper_doi],
            }],
        }
        return journals[issn]
    for vol in journals[issn]["volumes"]:
        if vol["name"] == volume:
            vol["papers"].append(paper_doi)
            break
    else:
        # No matching volume yet: open a new one for this paper.
        journals[issn]["volumes"].append({
            "name": volume,
            "papers": [paper_doi],
        })
    return journals[issn]
def getProp(obj: dict, props: str, default=None):
    """Resolve a dotted property path (e.g. "metadata.doi") against obj.

    Returns `default` when obj is None, when any path segment is missing,
    or when an intermediate value is not a dict (e.g. an explicit null in
    the source JSON) — the original `prop not in target` raised TypeError
    on None intermediates.
    """
    target = obj
    for prop in props.split("."):
        if not isinstance(target, dict) or prop not in target:
            return default
        target = target[prop]
    return target
def save_sentence(body: dict, parents: list, sentence: str) -> None:
    """Append `sentence` to the nested section of `body` addressed by `parents`.

    `body` has the shape {"sections": {id: {"title", "content", "sections"}}}.
    One level is descended per parent (each a dict with "id" and "title"),
    creating missing intermediate sections on demand. Sentences accumulate
    in "content" separated by single spaces.

    Note: the original annotation `[dict]` was a list literal, not a type.
    """
    target = body
    for parent in parents:
        sections = target["sections"]
        if parent["id"] not in sections:
            sections[parent["id"]] = {
                "title": parent["title"],
                "content": "",
                "sections": {},
            }
        target = sections[parent["id"]]
    target["content"] += sentence + " "
def transform_section(sec: dict, figures: list, references: list) -> dict:
    """Recursively flatten a section tree into {"title", "content"} documents.

    Child sections are emitted in sorted key order. With probability 1/5
    each, a faked figure label and a faked reference marker are injected
    into the section's content. The root pseudo-section (no "title") just
    returns the list of its transformed children.

    Note: the original annotations `[dict]` were list literals, not types.
    """
    children = [
        transform_section(sec["sections"][key], figures, references)
        for key in sorted(sec["sections"].keys())
    ]
    if "title" not in sec:
        # Root of the tree: no title, so just return the children list.
        return children
    content = []
    if random.randint(0, 4) == 0 and len(figures) > 0:
        content.append(
            {"label": figures[random.randint(0, len(figures) - 1)]["label"]}
        )
    if "content" in sec and sec["content"] != "":
        content.append(sec["content"])
    if random.randint(0, 4) == 0 and len(references) > 0:
        content.append({"reference": random.randint(1, len(references))})
    content += children
    # Drop a trailing empty child list left over from the recursion.
    if content and isinstance(content[-1], list) and not content[-1]:
        del content[-1]
    return {
        "title": sec["title"],
        "content": content,
    }
def get_author_name(author: dict) -> str:
    """Join an author's first and last names into one display name.

    Tolerates parts that are None, empty, or entirely missing (the
    original raised KeyError when a bib entry lacked "first"/"last").
    """
    first = author.get("first") or ""
    last = author.get("last") or ""
    return (first + " " + last).strip()
def json_to_paper(filename: str, jsonObj: dict) -> dict:
    """Convert one parsed article JSON object into a paper document.

    Real fields (title, abstract, DOI, authors, keywords, ISSN, pages,
    references, body text) come from jsonObj; the journal name, the
    publication date, figures and the issue number are synthesized with
    Faker/random (see the notes at the top of the file). `filename` is
    currently unused — the DOI serves as the paper id.
    """
    paper = {}
    paper["title"] = getProp(jsonObj, "metadata.title")
    paper["abstract"] = getProp(jsonObj, "abstract")
    paper["doi"] = getProp(jsonObj, "metadata.doi")
    authors = []
    for author in getProp(jsonObj, "metadata.authors", []):
        email = getProp(author, "email")
        # Deliberately rebinds `author` to the cached/faked author record
        # (shadows the loop variable).
        author = fake_author(get_author_name(author), email)
        authors.append({
            "email": author["email"],
            "name": author["name"],
            "affiliation": author["affiliation"]
        })
    paper["authors"] = authors
    paper["keywords"] = getProp(jsonObj, "metadata.keywords")
    publicationDetails = {}
    publicationDetails["issn"] = getProp(jsonObj, "metadata.issn")
    # The publication date is faked; its year doubles as a fallback volume
    # name when the source JSON does not carry one.
    date = fake.date_object()
    volume = getProp(jsonObj, "metadata.volume")
    if volume is None:
        volume = str(date.year) + " Issue"
    # Registers this paper's DOI under the journal/volume as a side effect.
    journal = fake_journal(
        publicationDetails["issn"],
        volume,
        getProp(jsonObj, "metadata.doi")
    )
    publicationDetails["journal"] = journal["name"]
    publicationDetails["volume"] = volume
    publicationDetails["number"] = random.randint(1, 99)
    publicationDetails["date"] = date.isoformat()
    publicationDetails["pages"] = {
        "start": getProp(jsonObj, "metadata.firstpage"),
        "end": getProp(jsonObj, "metadata.lastpage")
    }
    paper["publicationDetails"] = publicationDetails
    # Figures are entirely faked: 3-15 per paper, labeled fig0, fig1, ...
    figures = []
    for i in range(0, random.randint(3, 15)):
        figures.append({
            "page": random.randint(1, 10),
            "label": "fig" + str(i),
            "caption": fake.paragraph(nb_sentences=1),
            "imageURL": fake.image_url()
        })
    paper["figures"] = figures
    i = 0
    references = []
    for key, value in getProp(jsonObj, "bib_entries", {}).items():
        if value is None:
            continue
        # referenceNumber is 1-based and skips null bib entries.
        i += 1
        ref = {
            "referenceNumber": i,
            "doi": getProp(value, "doi"),
            "title": getProp(value, "title"),
            "authors": [],
            "issn": getProp(value, "issn"),
            "volume": getProp(value, "volume"),
            "year": getProp(value, "year")
        }
        for author in getProp(value, "authors", []):
            ref["authors"].append({ "name": get_author_name(author) })
        references.append(ref)
    paper["references"] = references
    # Rebuild the nested section tree from the flat body_text sentences,
    # ordered by their original character offsets, then flatten it into
    # the stored "content" representation.
    body = {
        "sections": {}
    }
    l = getProp(jsonObj, "body_text", [])
    l.sort(key=lambda x: x["startOffset"])
    for e in l:
        parents = []
        for p in getProp(e, "parents", []):
            parents.append(p)
        parents.append({ "id": e["secId"], "title": e["title"] })
        save_sentence(body, parents, e["sentence"])
    paper["content"] = transform_section(body, figures, references)
    return paper
mongo_conn_str = "mongodb://localhost:27017"


def main():
    """Import paper JSON files into MongoDB (papers/authors/journals).

    Usage: import.py <source_folder> [limit]

    Drops and repopulates the three collections from the JSON files in
    <source_folder>, then back-fills cross-collection ObjectId references
    (paper -> journal, reference -> paper/author/journal, volume -> paper).
    """
    source_folder: str = sys.argv[1]
    # Optional cap on the number of papers imported; -1 means "no limit".
    limit: int = int(sys.argv[2]) if len(sys.argv) > 2 else -1
    mongo = MongoClient(mongo_conn_str)
    db = mongo["ddm"]
    # Start from a clean slate on every run.
    db["papers"].drop()
    db["authors"].drop()
    db["journals"].drop()
    # Natural key -> inserted ObjectId maps, filled during the insert pass.
    # (The original annotated these with an undefined name `ID`.)
    paper_ids = {}
    author_ids = {}
    journal_ids = {}
    i = 0
    j = 0
    for filename in os.listdir(source_folder):
        if not filename.endswith(".json"):
            continue
        with open(os.path.join(source_folder, filename), "r") as jsonFile:
            jsonObj = json.load(jsonFile)
        if getProp(jsonObj, "metadata.issn") is None or \
           getProp(jsonObj, "metadata.doi") is None:
            j += 1
            continue  # SKIP papers with no journal ISSN or paper DOI
        paper = json_to_paper(filename, jsonObj)
        x = db["papers"].insert_one(paper)
        paper_ids[paper["doi"]] = x.inserted_id
        i += 1
        if i % 100 == 0:
            print("Papers processed: ", i)
        if j % 100 == 0 and j > 0:
            print("Papers skipped: ", j)
        if limit > 0 and i == limit:
            break
    print("Papers skipped: ", j)
    i = 0
    for name, author in authors.items():
        x = db["authors"].insert_one(author)
        author_ids[name] = x.inserted_id
        i += 1
        if i % 1000 == 0:
            print("Authors processed: ", i)
    i = 0
    for issn, journal in journals.items():
        x = db["journals"].insert_one(journal)
        journal_ids[issn] = x.inserted_id
        i += 1
        if i % 100 == 0:
            print("Journals processed: ", i)
    # Second pass over papers: replace natural keys with ObjectId refs.
    i = 0
    for paper in db["papers"].find():
        mongo_filter = {"_id": paper["_id"]}
        update = {}
        mongo_update = {"$set": update}
        issn = getProp(paper, "publicationDetails.issn", "")
        if issn in journal_ids:
            update["publicationDetails.journalRef"] = journal_ids[issn]
        references = getProp(paper, "references", [])
        for ref in references:
            if ref["doi"] in paper_ids:
                ref["paperId"] = paper_ids[ref["doi"]]
            for author in ref["authors"]:
                name = author["name"]
                if name in author_ids:
                    author["authorId"] = author_ids[name]
            # BUGFIX: link the *reference's* journal; the original looked up
            # journal_ids[issn] (the citing paper's ISSN) after testing
            # ref["issn"], attaching the wrong journal id.
            if ref["issn"] in journal_ids:
                ref["journalId"] = journal_ids[ref["issn"]]
        update["references"] = references
        authors_loc = getProp(paper, "authors", [])
        for author in authors_loc:
            name = author["name"]
            if name in author_ids:
                author["authorId"] = author_ids[name]
        update["authors"] = authors_loc
        db["papers"].update_one(mongo_filter, mongo_update)
        i += 1
        if i % 100 == 0:
            print("Papers updated with refs: ", i)
    # Final pass over journals: swap stored paper DOIs for ObjectIds.
    i = 0
    for journal in db["journals"].find():
        mongo_filter = {"_id": journal["_id"]}
        update = {}
        mongo_update = {"$set": update}
        volumes = getProp(journal, "volumes", [])
        for volume in volumes:
            volume["papers"] = [paper_ids[p] for p in volume["papers"]]
        update["volumes"] = volumes
        db["journals"].update_one(mongo_filter, mongo_update)
        i += 1
        if i % 100 == 0:
            print("Journals updated with refs: ", i)


if __name__ == "__main__":
    main()