From ad1d07623ca5a3073f1d23dec52429a5996dd6de Mon Sep 17 00:00:00 2001
From: Claudio Maggioni
Date: Wed, 11 Oct 2023 14:35:41 +0200
Subject: [PATCH] bow extraction words

---
 requirements.txt |  2 ++
 search-data.py   | 43 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..3012ac71
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+nltk==3.8.1
+pandas==2.1.1
diff --git a/search-data.py b/search-data.py
index 87eb44c5..9d5481d8 100644
--- a/search-data.py
+++ b/search-data.py
@@ -2,21 +2,62 @@ import re
 import argparse
 import os
 import pandas as pd
+import nltk
+import numpy as np
+from nltk.corpus import stopwords
+
+nltk.download('stopwords')
 
 SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
 IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
 
+# using nltk stop words and example words for now
+STOP_WORDS = set(stopwords.words('english')) \
+    .union(['test', 'tests', 'main', 'this'])
+
+
+def find_all(regex, word):
+    matches = re.finditer(regex, word)
+    return [m.group(0).lower() for m in matches]
+
+
+# https://stackoverflow.com/a/29920015
+def camel_case_split(word):
+    return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
+
+
+def identifier_split(identifier):
+    return [y for x in identifier.split("_") for y in camel_case_split(x)]
+
+
+def comment_split(comment):
+    return find_all('[A-Za-z0-9]+', comment)
+
+
+def remove_stopwords(input_bow_list):
+    return [word for word in input_bow_list if word not in STOP_WORDS]
+
+
+def get_bow(data, split_f):
+    if data is None or (type(data) == float and np.isnan(data)):
+        return []
+    return remove_stopwords(split_f(data))
+
 
 def search(query):
     df = pd.read_csv(IN_DATASET)
 
+    for i, row in df.iterrows():
+        name_bow = get_bow(row["name"], identifier_split)
+        comment_bow = get_bow(row["comment"], comment_split)
+
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("query", help="the query to search the corpus with", type=str)
 
     args = parser.parse_args()
-    search(query)
+    search(args.query)
 
 
 if __name__ == "__main__":
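
Note (not part of the patch): a minimal sketch of what the new splitting helpers produce. The helper bodies are copied from the hunk above so the snippet runs on its own; the sample identifier and comment string are made up for illustration, and stop-word removal (remove_stopwords/get_bow) is left out so the snippet does not need the nltk stop-word download.

import re

def find_all(regex, word):
    # all matches of `regex` in `word`, lower-cased
    return [m.group(0).lower() for m in re.finditer(regex, word)]

def camel_case_split(word):
    # split camelCase/PascalCase runs, keeping acronyms together
    return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)

def identifier_split(identifier):
    # split on underscores first, then on camelCase inside each part
    return [y for x in identifier.split("_") for y in camel_case_split(x)]

def comment_split(comment):
    # plain alphanumeric tokens from free-form text
    return find_all('[A-Za-z0-9]+', comment)

print(identifier_split("getHTTPResponse_code"))
# ['get', 'http', 'response', 'code']
print(comment_split("Returns the HTTP response code."))
# ['returns', 'the', 'http', 'response', 'code']

In the patch itself these token lists are then filtered through remove_stopwords via get_bow, which also drops NaN cells coming from the pandas CSV read.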