Added instructions
parent da395105d1
commit a3a6ea1d9f

4 changed files with 41 additions and 17 deletions
hw02/.gitignore (new file, +2)

@@ -0,0 +1,2 @@
+/articles-dataset/*
+!/articles-dataset/.gitkeep
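This pattern pair ignores everything inside `articles-dataset/` while the negated `.gitkeep` rule keeps the otherwise-empty folder tracked, so a fresh clone has the directory ready for the extracted dataset.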
hw02/README.md (new file, +7)

@@ -0,0 +1,7 @@
+# DDM HW02 Data Importer
+
+Instructions:
+- Run `pip3 install faker pymongo`
+- Install MongoDB (the connection string is the variable `mongo_conn_str`, set to a local DB with no password by default)
+- Download the ZIP file at https://elsevier.digitalcommonsdata.com/datasets/zm33cdndxs/ and extract all the JSON files into the `articles-dataset` folder
+- Run the script with `python3 import.py articles-dataset`
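As the `import.py` hunks below show, this commit also adds an optional numeric second argument that caps the import, e.g. `python3 import.py articles-dataset 500` stops after 500 papers.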
hw02/articles-dataset/.gitkeep (new empty file)

hw02/import.py (modified)
@@ -9,8 +9,6 @@ fake = Faker()
 
 # pip install faker pymongo
 
-# TODO: inject (fake) figures and references in content
-
 # - article doi or JSON filename can be used as paper id
 # - no author id, authors are equal based on name for lack of info
 # - use issn as journal id
@@ -83,14 +81,6 @@ def save_sentence(body: dict, parents: [dict], sentence: str):
         target["content"] += sentence + " "
 
 def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
-    content = []
-    if random.randint(0, 10) == 0 and len(figures) > 0:
-        content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]
-    if "content" in sec and sec["content"] != "":
-        content += [sec["content"]]
-    if random.randint(0, 10) == 0 and len(references) > 0:
-        content += [{ "reference": random.randint(1, len(references)) }]
-
     arr = []
     ks = []
     for k in sec["sections"].keys():
@@ -98,10 +88,24 @@ def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
     ks.sort()
     for k in ks:
         arr.append(transform_section(sec["sections"][k], figures, references))
-    content += arr,
 
     if "title" not in sec:
-        return content
+        return arr
+
+    content = []
+    if random.randint(0, 4) == 0 and len(figures) > 0:
+        content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]
+    if "content" in sec and sec["content"] != "":
+        content += [sec["content"]]
+    if random.randint(0, 4) == 0 and len(references) > 0:
+        content += [{ "reference": random.randint(1, len(references)) }]
+
+    content += arr
+
+    if len(content) > 0 and isinstance(content[-1], list) and len(content[-1]) == 0:
+        del content[-1]
+
 
     return {
         "title": sec["title"],
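The deleted `content += arr,` line above is subtle: the trailing comma makes the right-hand side a one-element tuple, so the whole subsection list was appended as a single nested element, whereas the new `content += arr` concatenates (and the follow-up check drops a trailing empty list). A minimal standalone sketch of the difference (values are illustrative, not from the script):

```python
content = ["intro paragraph"]
arr = [{"title": "Subsection"}]

# Old form: "content += arr," parses as content += (arr,); list +=
# extends with the tuple's single element, nesting arr whole.
content += arr,
assert content == ["intro paragraph", [{"title": "Subsection"}]]

# New form: extends with arr's elements, splicing them in flat.
content = ["intro paragraph"]
content += arr
assert content == ["intro paragraph", {"title": "Subsection"}]
```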
@@ -213,6 +217,10 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
 mongo_conn_str = "mongodb://localhost:27017"
 def main():
     source_folder: str = sys.argv[1]
+    if len(sys.argv) > 2:
+        limit: int = int(sys.argv[2])
+    else:
+        limit: int = -1
 
     mongo = MongoClient(mongo_conn_str)
     db = mongo["ddm"]
@@ -226,6 +234,7 @@ def main():
     journal_ids: dict[str, ID] = {}
 
     i = 0
+    j = 0
     for filename in os.listdir(source_folder):
         if filename.endswith(".json"):
             jsonObj = {}
@@ -233,6 +242,10 @@ def main():
                 jsonStr = "".join(jsonFile.readlines())
                 d = json.JSONDecoder()
                 jsonObj = d.decode(jsonStr)
 
+            if getProp(jsonObj, "metadata.issn") is None or getProp(jsonObj, "metadata.doi") is None:
+                j += 1
+                continue # SKIP papers with no journal ISSN or paper DOI
+
             paper = json_to_paper(filename, jsonObj)
 
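`getProp` is defined elsewhere in `import.py` and is not part of this diff; judging from the dotted `"metadata.issn"` / `"metadata.doi"` arguments, it is presumably a safe nested-dict lookup that returns `None` when a key is missing. A hypothetical sketch under that assumption (not the script's actual code):

```python
def getProp(obj: dict, path: str):
    # Assumed behavior: walk a dotted path (e.g. "metadata.issn")
    # through nested dicts, returning None if any segment is absent.
    cur = obj
    for key in path.split("."):
        if not isinstance(cur, dict) or key not in cur:
            return None
        cur = cur[key]
    return cur
```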
@@ -242,9 +255,13 @@ def main():
             i += 1
             if i % 100 == 0:
                 print("Papers processed: ", i)
-            if i == 1000: # TODO: remove
+            if j % 100 == 0 and j > 0:
+                print("Papers skipped: ", j)
+            if limit > 0 and i == limit:
                 break
 
+    print("Papers skipped: ", j)
+
     i = 0
     for name, author in authors.items():
         x = db["authors"].insert_one(author)
@@ -315,9 +332,7 @@ def main():
         db["journals"].update_one(mongo_filter, mongo_update)
         i += 1
         if i % 100 == 0:
             print("Journals updated with refs: ", i)
-
-
 
 if __name__ == "__main__":
     main()