Added instructions
parent da395105d1
commit a3a6ea1d9f

4 changed files with 41 additions and 17 deletions
hw02/.gitignore (new file, +2)

@@ -0,0 +1,2 @@
+/articles-dataset/*
+!/articles-dataset/.gitkeep
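This pattern pair ignores everything inside `articles-dataset/` while the negated `.gitkeep` rule keeps the otherwise-empty folder tracked, so a fresh clone has the directory ready for the extracted dataset.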
hw02/README.md (new file, +7)

@@ -0,0 +1,7 @@
+# DDM HW02 Data Importer
+
+Instructions:
+- Run `pip3 install faker pymongo`
+- Install MongoDB (the connection string is the variable `mongo_conn_str`, set to a local DB with no password by default)
+- Download the ZIP file at https://elsevier.digitalcommonsdata.com/datasets/zm33cdndxs/ and extract all the JSON files into the `articles-dataset` folder
+- Run the script with `python3 import.py articles-dataset`
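As the `import.py` hunks below show, this commit also adds an optional numeric second argument that caps the import, e.g. `python3 import.py articles-dataset 500` stops after 500 papers.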
hw02/articles-dataset/.gitkeep (new empty file)

hw02/import.py (modified)
@@ -9,8 +9,6 @@ fake = Faker()
 
 # pip install faker pymongo
 
-# TODO: inject (fake) figures and references in content
-
 # - article doi or JSON filename can be used as paper id
 # - no author id, authors are equal based on name for lack of info
 # - use issn as journal id
@@ -83,14 +81,6 @@ def save_sentence(body: dict, parents: [dict], sentence: str):
         target["content"] += sentence + " "
 
 def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
-    content = []
-    if random.randint(0, 10) == 0 and len(figures) > 0:
-        content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]
-    if "content" in sec and sec["content"] != "":
-        content += [sec["content"]]
-    if random.randint(0, 10) == 0 and len(references) > 0:
-        content += [{ "reference": random.randint(1, len(references)) }]
-
     arr = []
     ks = []
     for k in sec["sections"].keys():
@@ -98,10 +88,24 @@ def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
     ks.sort()
     for k in ks:
         arr.append(transform_section(sec["sections"][k], figures, references))
-    content += arr,
 
     if "title" not in sec:
-        return content
+        return arr
+
+    content = []
+    if random.randint(0, 4) == 0 and len(figures) > 0:
+        content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]
+    if "content" in sec and sec["content"] != "":
+        content += [sec["content"]]
+    if random.randint(0, 4) == 0 and len(references) > 0:
+        content += [{ "reference": random.randint(1, len(references)) }]
+
+    content += arr
+
+    if len(content) > 0 and isinstance(content[-1], list) and len(content[-1]) == 0:
+        del content[-1]
+
 
     return {
         "title": sec["title"],
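The deleted `content += arr,` line above is subtle: the trailing comma makes the right-hand side a one-element tuple, so the whole subsection list was appended as a single nested element, whereas the new `content += arr` concatenates (and the follow-up check drops a trailing empty list). A minimal standalone sketch of the difference (values are illustrative, not from the script):

```python
content = ["intro paragraph"]
arr = [{"title": "Subsection"}]

# Old form: "content += arr," parses as content += (arr,); list +=
# extends with the tuple's single element, nesting arr whole.
content += arr,
assert content == ["intro paragraph", [{"title": "Subsection"}]]

# New form: extends with arr's elements, splicing them in flat.
content = ["intro paragraph"]
content += arr
assert content == ["intro paragraph", {"title": "Subsection"}]
```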
@@ -213,6 +217,10 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
 mongo_conn_str = "mongodb://localhost:27017"
 def main():
     source_folder: str = sys.argv[1]
+    if len(sys.argv) > 2:
+        limit: int = int(sys.argv[2])
+    else:
+        limit: int = -1
 
     mongo = MongoClient(mongo_conn_str)
     db = mongo["ddm"]
@@ -226,6 +234,7 @@ def main():
     journal_ids: dict[str, ID] = {}
 
     i = 0
+    j = 0
     for filename in os.listdir(source_folder):
         if filename.endswith(".json"):
             jsonObj = {}
@@ -233,6 +242,10 @@ def main():
                 jsonStr = "".join(jsonFile.readlines())
                 d = json.JSONDecoder()
                 jsonObj = d.decode(jsonStr)
 
+            if getProp(jsonObj, "metadata.issn") is None or getProp(jsonObj, "metadata.doi") is None:
+                j += 1
+                continue # SKIP papers with no journal ISSN or paper DOI
+
             paper = json_to_paper(filename, jsonObj)
 
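`getProp` is defined elsewhere in `import.py` and is not part of this diff; judging from the dotted `"metadata.issn"` / `"metadata.doi"` arguments, it is presumably a safe nested-dict lookup that returns `None` when a key is missing. A hypothetical sketch under that assumption (not the script's actual code):

```python
def getProp(obj: dict, path: str):
    # Assumed behavior: walk a dotted path (e.g. "metadata.issn")
    # through nested dicts, returning None if any segment is absent.
    cur = obj
    for key in path.split("."):
        if not isinstance(cur, dict) or key not in cur:
            return None
        cur = cur[key]
    return cur
```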
@@ -242,9 +255,13 @@ def main():
             i += 1
             if i % 100 == 0:
                 print("Papers processed: ", i)
-            if i == 1000: # TODO: remove
+            if j % 100 == 0 and j > 0:
+                print("Papers skipped: ", j)
+            if limit > 0 and i == limit:
                 break
 
+    print("Papers skipped: ", j)
+
     i = 0
     for name, author in authors.items():
         x = db["authors"].insert_one(author)
@@ -315,9 +332,7 @@ def main():
         db["journals"].update_one(mongo_filter, mongo_update)
         i += 1
         if i % 100 == 0:
             print("Journals updated with refs: ", i)
-
-
 
 if __name__ == "__main__":
     main()