bow extraction words
parent d964c138a6
commit ad1d07623c
2 changed files with 44 additions and 1 deletion
requirements.txt (new file, +2)
@@ -0,0 +1,2 @@
+nltk==3.8.1
+pandas==2.1.1
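The two pinned dependencies can be installed up front with pip install -r requirements.txt before running the script.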
@@ -2,21 +2,62 @@ import re
 import argparse
 import os
 import pandas as pd
+import nltk
+import numpy as np
+from nltk.corpus import stopwords
+
+nltk.download('stopwords')
 
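nltk.download('stopwords') is executed at import time on every run. A guarded variant (a sketch, not part of this commit) only downloads when the corpus is missing from the local NLTK data directory:

try:
    nltk.data.find('corpora/stopwords')  # corpus already on disk?
except LookupError:                      # raised by nltk when the resource is absent
    nltk.download('stopwords')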
 SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
 IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
 
+# using nltk stop words and example words for now
+STOP_WORDS = set(stopwords.words('english')) \
+    .union(['test', 'tests', 'main', 'this'])
+
+
+def find_all(regex, word):
+    matches = re.finditer(regex, word)
+    return [m.group(0).lower() for m in matches]
+
+
+# https://stackoverflow.com/a/29920015
+def camel_case_split(word):
+    return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
+
+
+def identifier_split(identifier):
+    return [y for x in identifier.split("_") for y in camel_case_split(x)]
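For reference, the splitters defined above behave like this (values derived from the regex and the lowercasing in find_all):

camel_case_split("getUserName")          # ['get', 'user', 'name']
camel_case_split("HTTPResponse")         # ['http', 'response']
identifier_split("parse_HTTPResponse")   # ['parse', 'http', 'response']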
+
+
+def comment_split(comment):
+    return find_all('[A-Za-z0-9]+', comment)
+
+
+def remove_stopwords(input_bow_list):
+    return [word for word in input_bow_list if word not in STOP_WORDS]
+
+
+def get_bow(data, split_f):
+    if data is None or (type(data) == float and np.isnan(data)):
+        return []
+    return remove_stopwords(split_f(data))
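get_bow guards against empty CSV cells, which pandas hands back as float NaN, before splitting and stripping stop words. Roughly:

get_bow(float('nan'), comment_split)                   # []
get_bow("Tests the main parser loop", comment_split)   # ['parser', 'loop'] ('tests', 'the', 'main' are stop words here)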
+
+
 def search(query):
     df = pd.read_csv(IN_DATASET)
+
+    for i, row in df.iterrows():
+        name_bow = get_bow(row["name"], identifier_split)
+        comment_bow = get_bow(row["comment"], comment_split)
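So far the loop only builds the per-row bags; the query is not compared against them yet. A minimal sketch of the obvious next step, using the same helpers (an assumption, not part of this commit):

query_bow = remove_stopwords(comment_split(query))   # hypothetical follow-up step inside search()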
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("query", help="the query to search the corpus with", type=str)
     args = parser.parse_args()
-    search(query)
+    search(args.query)
 
 
 if __name__ == "__main__":
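The one behavioural fix in this hunk is search(query) becoming search(args.query); the old call referenced an undefined name and would raise NameError. Assuming the changed file is saved as search.py (its actual name is not shown in this extract), it is invoked as python search.py "http response parser", with the positional argument as the query string.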