diff --git a/hw02/.gitignore b/hw02/.gitignore new file mode 100644 index 0000000..46878bb --- /dev/null +++ b/hw02/.gitignore @@ -0,0 +1,2 @@ +/articles-dataset/* +!/articles-dataset/.gitkeep \ No newline at end of file diff --git a/hw02/README.md b/hw02/README.md new file mode 100644 index 0000000..11c5db8 --- /dev/null +++ b/hw02/README.md @@ -0,0 +1,7 @@ +# DDM HW02 Data Importer + +Instructions: +- Run `pip3 install faker pymongo` +- Install MongoDB (the connect string is the variable `mongo_conn_str`, set to a local DB with no password by default) +- Download the ZIP file at https://elsevier.digitalcommonsdata.com/datasets/zm33cdndxs/ and extract all the JSON files in the `articles-dataset` folder +- Run the script with `python3 import.py articles-dataset` \ No newline at end of file diff --git a/hw02/articles-dataset/.gitkeep b/hw02/articles-dataset/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/hw02/convert.py b/hw02/import.py similarity index 92% rename from hw02/convert.py rename to hw02/import.py index 9acb896..3122b06 100644 --- a/hw02/convert.py +++ b/hw02/import.py @@ -9,8 +9,6 @@ fake = Faker() # pip install faker pymongo -# TODO: inject (fake) figures and references in content - # - article doi or JSON filename can be used as paper id # - no author id, authors are equal based on name for lack of info # - use issn as journal id @@ -83,14 +81,6 @@ def save_sentence(body: dict, parents: [dict], sentence: str): target["content"] += sentence + " " def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict: - content = [] - if random.randint(0, 10) == 0 and len(figures) > 0: - content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }] - if "content" in sec and sec["content"] != "": - content += [sec["content"]] - if random.randint(0, 10) == 0 and len(references) > 0: - content += [{ "reference": random.randint(1, len(references)) }] - arr = [] ks = [] for k in sec["sections"].keys(): @@ -98,10 +88,24 @@ def transform_section(sec: dict, figures: [dict], references: [dict]) -> dict: ks.sort() for k in ks: arr.append(transform_section(sec["sections"][k], figures, references)) - content += arr, - if "title" not in sec: - return content + return arr + + content = [] + if random.randint(0, 4) == 0 and len(figures) > 0: + content += [{ "label": figures[random.randint(0, len(figures)-1)]["label"] }] + if "content" in sec and sec["content"] != "": + content += [sec["content"]] + if random.randint(0, 4) == 0 and len(references) > 0: + content += [{ "reference": random.randint(1, len(references)) }] + + content += arr + + + if len(content) > 0 and isinstance(content[-1], list) and len(content[-1]) == 0: + del content[-1] + + return { "title": sec["title"], @@ -213,6 +217,10 @@ def json_to_paper(filename: str, jsonObj: dict) -> dict: mongo_conn_str = "mongodb://localhost:27017" def main(): source_folder: str = sys.argv[1] + if len(sys.argv) > 2: + limit: int = int(sys.argv[2]) + else: + limit: int = -1 mongo = MongoClient(mongo_conn_str) db = mongo["ddm"] @@ -226,6 +234,7 @@ def main(): journal_ids: dict[str, ID] = {} i = 0 + j = 0 for filename in os.listdir(source_folder): if filename.endswith(".json"): jsonObj = {} @@ -233,6 +242,10 @@ def main(): jsonStr = "".join(jsonFile.readlines()) d = json.JSONDecoder() jsonObj = d.decode(jsonStr) + + if getProp(jsonObj, "metadata.issn") is None or getProp(jsonObj, "metadata.doi") is None: + j += 1 + continue # SKIP papers with no journal ISSN or paper DOI paper = json_to_paper(filename, jsonObj) @@ -242,9 +255,13 @@ def main(): i += 1 if i % 100 == 0: print("Papers processed: ", i) - if i == 1000: # TODO: remove + if j % 100 == 0 and j > 0: + print("Papers skipped: ", j) + if limit > 0 and i == limit: break + print("Papers skipped: ", j) + i = 0 for name, author in authors.items(): x = db["authors"].insert_one(author) @@ -315,9 +332,7 @@ def main(): db["journals"].update_one(mongo_filter, mongo_update) i += 1 if i % 100 == 0: - print("Journals updated with refs: ", i) - - + print("Journals updated with refs: ", i) if __name__ == "__main__": main() \ No newline at end of file