Added instructions
This commit is contained in:
parent
da395105d1
commit
a3a6ea1d9f
4 changed files with 41 additions and 17 deletions
2
hw02/.gitignore
vendored
Normal file
2
hw02/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
/articles-dataset/*
|
||||
!/articles-dataset/.gitkeep
|
7
hw02/README.md
Normal file
7
hw02/README.md
Normal file
|
@ -0,0 +1,7 @@
|
|||
# DDM HW02 Data Importer
|
||||
|
||||
Instructions:
|
||||
- Run `pip3 install faker pymongo`
|
||||
- Install MongoDB (the connection string is stored in the variable `mongo_conn_str`, which by default points to a local database with no password)
|
||||
- Download the ZIP file from https://elsevier.digitalcommonsdata.com/datasets/zm33cdndxs/ and extract all the JSON files into the `articles-dataset` folder
|
||||
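- Run the script with `python3 import.py articles-dataset` (an optional second argument limits the number of papers imported, e.g. `python3 import.py articles-dataset 1000`)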
|
0
hw02/articles-dataset/.gitkeep
Normal file
0
hw02/articles-dataset/.gitkeep
Normal file
|
@ -9,8 +9,6 @@ fake = Faker()
|
|||
|
||||
# pip install faker pymongo
|
||||
|
||||
# TODO: inject (fake) figures and references in content
|
||||
|
||||
# - article doi or JSON filename can be used as paper id
|
||||
# - no author id: authors are treated as the same person when their names match, for lack of better info
|
||||
# - use issn as journal id
|
||||
|
@ -83,14 +81,6 @@ def save_sentence(body: dict, parents: [dict], sentence: str):
|
|||
target["content"] += sentence + " "
|
||||
|
||||
def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
|
||||
content = []
|
||||
if random.randint(0, 10) == 0 and len(figures) > 0:
|
||||
content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]
|
||||
if "content" in sec and sec["content"] != "":
|
||||
content += [sec["content"]]
|
||||
if random.randint(0, 10) == 0 and len(references) > 0:
|
||||
content += [{ "reference": random.randint(1, len(references)) }]
|
||||
|
||||
arr = []
|
||||
ks = []
|
||||
for k in sec["sections"].keys():
|
||||
|
@ -98,10 +88,24 @@ def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
|
|||
ks.sort()
|
||||
for k in ks:
|
||||
arr.append(transform_section(sec["sections"][k], figures, references))
|
||||
content += arr,
|
||||
|
||||
if "title" not in sec:
|
||||
return content
|
||||
return arr
|
||||
|
||||
content = []
|
||||
if random.randint(0, 4) == 0 and len(figures) > 0:
|
||||
content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]
|
||||
if "content" in sec and sec["content"] != "":
|
||||
content += [sec["content"]]
|
||||
if random.randint(0, 4) == 0 and len(references) > 0:
|
||||
content += [{ "reference": random.randint(1, len(references)) }]
|
||||
|
||||
content += arr
|
||||
|
||||
|
||||
if len(content) > 0 and isinstance(content[-1], list) and len(content[-1]) == 0:
|
||||
del content[-1]
|
||||
|
||||
|
||||
|
||||
return {
|
||||
"title": sec["title"],
|
||||
|
@ -213,6 +217,10 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
|
|||
mongo_conn_str = "mongodb://localhost:27017"
|
||||
def main():
|
||||
source_folder: str = sys.argv[1]
|
||||
if len(sys.argv) > 2:
|
||||
limit: int = int(sys.argv[2])
|
||||
else:
|
||||
limit: int = -1
|
||||
|
||||
mongo = MongoClient(mongo_conn_str)
|
||||
db = mongo["ddm"]
|
||||
|
@ -226,6 +234,7 @@ def main():
|
|||
journal_ids: dict[str, ID] = {}
|
||||
|
||||
i = 0
|
||||
j = 0
|
||||
for filename in os.listdir(source_folder):
|
||||
if filename.endswith(".json"):
|
||||
jsonObj = {}
|
||||
|
@ -234,6 +243,10 @@ def main():
|
|||
d = json.JSONDecoder()
|
||||
jsonObj = d.decode(jsonStr)
|
||||
|
||||
if getProp(jsonObj, "metadata.issn") is None or getProp(jsonObj, "metadata.doi") is None:
|
||||
j += 1
|
||||
continue # SKIP papers with no journal ISSN or paper DOI
|
||||
|
||||
paper = json_to_paper(filename, jsonObj)
|
||||
|
||||
x = db["papers"].insert_one(paper)
|
||||
|
@ -242,9 +255,13 @@ def main():
|
|||
i += 1
|
||||
if i % 100 == 0:
|
||||
print("Papers processed: ", i)
|
||||
if i == 1000: # TODO: remove
|
||||
if j % 100 == 0 and j > 0:
|
||||
print("Papers skipped: ", j)
|
||||
if limit > 0 and i == limit:
|
||||
break
|
||||
|
||||
print("Papers skipped: ", j)
|
||||
|
||||
i = 0
|
||||
for name, author in authors.items():
|
||||
x = db["authors"].insert_one(author)
|
||||
|
@ -317,7 +334,5 @@ def main():
|
|||
if i % 100 == 0:
|
||||
print("Journals updated with refs: ", i)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Reference in a new issue