Added instructions

This commit is contained in:
Claudio Maggioni 2022-11-20 15:04:45 +01:00
parent da395105d1
commit a3a6ea1d9f
4 changed files with 41 additions and 17 deletions

2
hw02/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/articles-dataset/*
!/articles-dataset/.gitkeep

7
hw02/README.md Normal file
View file

@ -0,0 +1,7 @@
# DDM HW02 Data Importer
Instructions:
- Run `pip3 install faker pymongo`
- Install MongoDB (the connection string is stored in the variable `mongo_conn_str`, which by default points to a local DB with no password)
- Download the ZIP file at https://elsevier.digitalcommonsdata.com/datasets/zm33cdndxs/ and extract all the JSON files into the `articles-dataset` folder
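- Run the script with `python3 import.py articles-dataset`; an optional second argument limits the number of papers imported (e.g. `python3 import.py articles-dataset 1000`)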

View file

View file

@ -9,8 +9,6 @@ fake = Faker()
# pip install faker pymongo
# TODO: inject (fake) figures and references in content
# - article doi or JSON filename can be used as paper id
# - no author id, authors are equal based on name for lack of info
# - use issn as journal id
@ -83,14 +81,6 @@ def save_sentence(body: dict, parents: [dict], sentence: str):
target["content"] += sentence + " "
def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
content = []
if random.randint(0, 10) == 0 and len(figures) > 0:
content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]
if "content" in sec and sec["content"] != "":
content += [sec["content"]]
if random.randint(0, 10) == 0 and len(references) > 0:
content += [{ "reference": random.randint(1, len(references)) }]
arr = []
ks = []
for k in sec["sections"].keys():
@ -98,10 +88,24 @@ def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
ks.sort()
for k in ks:
arr.append(transform_section(sec["sections"][k], figures, references))
content += arr,
if "title" not in sec:
return content
return arr
content = []
if random.randint(0, 4) == 0 and len(figures) > 0:
content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]
if "content" in sec and sec["content"] != "":
content += [sec["content"]]
if random.randint(0, 4) == 0 and len(references) > 0:
content += [{ "reference": random.randint(1, len(references)) }]
content += arr
if len(content) > 0 and isinstance(content[-1], list) and len(content[-1]) == 0:
del content[-1]
return {
"title": sec["title"],
@ -213,6 +217,10 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
mongo_conn_str = "mongodb://localhost:27017"
def main():
source_folder: str = sys.argv[1]
if len(sys.argv) > 2:
limit: int = int(sys.argv[2])
else:
limit: int = -1
mongo = MongoClient(mongo_conn_str)
db = mongo["ddm"]
@ -226,6 +234,7 @@ def main():
journal_ids: dict[str, ID] = {}
i = 0
j = 0
for filename in os.listdir(source_folder):
if filename.endswith(".json"):
jsonObj = {}
@ -234,6 +243,10 @@ def main():
d = json.JSONDecoder()
jsonObj = d.decode(jsonStr)
if getProp(jsonObj, "metadata.issn") is None or getProp(jsonObj, "metadata.doi") is None:
j += 1
continue # SKIP papers with no journal ISSN or paper DOI
paper = json_to_paper(filename, jsonObj)
x = db["papers"].insert_one(paper)
@ -242,9 +255,13 @@ def main():
i += 1
if i % 100 == 0:
print("Papers processed: ", i)
if i == 1000: # TODO: remove
if j % 100 == 0 and j > 0:
print("Papers skipped: ", j)
if limit > 0 and i == limit:
break
print("Papers skipped: ", j)
i = 0
for name, author in authors.items():
x = db["authors"].insert_one(author)
@ -317,7 +334,5 @@ def main():
if i % 100 == 0:
print("Journals updated with refs: ", i)
if __name__ == "__main__":
main()