Added instructions

2022-11-20 15:04:45 +01:00 · 2022-11-20 15:04:45 +01:00 · a3a6ea1d9f
commit a3a6ea1d9f
parent da395105d1
4 changed files with 41 additions and 17 deletions
--- a/hw02/.gitignore
+++ b/hw02/.gitignore
@ -0,0 +1,2 @@
+/articles-dataset/*
+!/articles-dataset/.gitkeep
--- a/hw02/README.md
+++ b/hw02/README.md
@ -0,0 +1,7 @@
+# DDM HW02 Data Importer
+
+Instructions:
+- Run `pip3 install faker pymongo`
+- Install MongoDB (the connection string is stored in the variable `mongo_conn_str`, which by default points to a local database with no password)
+- Download the ZIP file at https://elsevier.digitalcommonsdata.com/datasets/zm33cdndxs/ and extract all the JSON files into the `articles-dataset` folder
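+- Run the script with `python3 convert.py articles-dataset` (an optional second argument limits the number of papers imported, e.g. `python3 convert.py articles-dataset 1000`)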
--- a/hw02/articles-dataset/.gitkeep
+++ b/hw02/articles-dataset/.gitkeep
--- a/hw02/convert.py
+++ b/hw02/convert.py
@ -9,8 +9,6 @@ fake = Faker()

 # pip install faker pymongo

-# TODO: inject (fake) figures and references in content
-
 # - article doi or JSON filename can be used as paper id
 # - no author id, authors are equal based on name for lack of info
 # - use issn as journal id
@ -83,14 +81,6 @@ def save_sentence(body: dict, parents: [dict], sentence: str):
    target["content"] += sentence + " "

 def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
-    content = []
-    if random.randint(0, 10) == 0 and len(figures) > 0:
-        content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]     
-    if "content" in sec and sec["content"] != "":
-        content += [sec["content"]] 
-    if random.randint(0, 10) == 0 and len(references) > 0:
-        content += [{ "reference": random.randint(1, len(references)) }] 
-    
    arr = []
    ks = []
    for k in sec["sections"].keys():
@ -98,10 +88,24 @@ def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict:
    ks.sort()
    for k in ks:
        arr.append(transform_section(sec["sections"][k], figures, references))
-    content += arr,
-
    if "title" not in sec:
-        return content
+        return arr
+
+    content = []
+    if random.randint(0, 4) == 0 and len(figures) > 0:
+        content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }]     
+    if "content" in sec and sec["content"] != "":
+        content += [sec["content"]] 
+    if random.randint(0, 4) == 0 and len(references) > 0:
+        content += [{ "reference": random.randint(1, len(references)) }] 
+
+    content += arr
+
+
+    if len(content) > 0 and isinstance(content[-1], list) and len(content[-1]) == 0:
+        del content[-1]
+
+

    return {
        "title": sec["title"],
@ -213,6 +217,10 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict:
 mongo_conn_str = "mongodb://localhost:27017"
 def main():
    source_folder: str = sys.argv[1]
+    if len(sys.argv) > 2:
+        limit: int = int(sys.argv[2])
+    else:
+        limit: int = -1

    mongo = MongoClient(mongo_conn_str)
    db = mongo["ddm"]
@ -226,6 +234,7 @@ def main():
    journal_ids: dict[str, ID] = {}

    i = 0
+    j = 0
    for filename in os.listdir(source_folder):
        if filename.endswith(".json"): 
            jsonObj = {}
@ -234,6 +243,10 @@ def main():
                d = json.JSONDecoder()
                jsonObj = d.decode(jsonStr)

+            if getProp(jsonObj, "metadata.issn") is None or getProp(jsonObj, "metadata.doi") is None:
+                j += 1
+                continue # SKIP papers with no journal ISSN or paper DOI
+                
            paper = json_to_paper(filename, jsonObj)
                
            x = db["papers"].insert_one(paper)
@ -242,9 +255,13 @@ def main():
            i += 1
            if i % 100 == 0:
                print("Papers processed: ", i)
-            if i == 1000: # TODO: remove
+            if j % 100 == 0 and j > 0:
+                print("Papers skipped: ", j)
+            if limit > 0 and i == limit:
                break
    
+    print("Papers skipped: ", j)
+
    i = 0
    for name, author in authors.items():
        x = db["authors"].insert_one(author)
@ -317,7 +334,5 @@ def main():
        if i % 100 == 0:
            print("Journals updated with refs: ", i)        

-        
-
 if __name__ == "__main__":
    main()