From ad1d07623ca5a3073f1d23dec52429a5996dd6de Mon Sep 17 00:00:00 2001
From: Claudio Maggioni
Date: Wed, 11 Oct 2023 14:35:41 +0200
Subject: [PATCH] bow extraction words

---
 requirements.txt |  2 ++
 search-data.py   | 43 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..3012ac71
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+nltk==3.8.1
+pandas==2.1.1
diff --git a/search-data.py b/search-data.py
index 87eb44c5..9d5481d8 100644
--- a/search-data.py
+++ b/search-data.py
@@ -2,21 +2,62 @@ import re
 import argparse
 import os
 import pandas as pd
+import nltk
+import numpy as np
+from nltk.corpus import stopwords
+
+nltk.download('stopwords')
 
 SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
 IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
 
+# using nltk stop words and example words for now
+STOP_WORDS = set(stopwords.words('english')) \
+    .union(['test', 'tests', 'main', 'this'])
+
+
+def find_all(regex, word):
+    matches = re.finditer(regex, word)
+    return [m.group(0).lower() for m in matches]
+
+
+# https://stackoverflow.com/a/29920015
+def camel_case_split(word):
+    return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
+
+
+def identifier_split(identifier):
+    return [y for x in identifier.split("_") for y in camel_case_split(x)]
+
+
+def comment_split(comment):
+    return find_all('[A-Za-z0-9]+', comment)
+
+
+def remove_stopwords(input_bow_list):
+    return [word for word in input_bow_list if word not in STOP_WORDS]
+
+
+def get_bow(data, split_f):
+    if data is None or (type(data) == float and np.isnan(data)):
+        return []
+    return remove_stopwords(split_f(data))
+
 
 def search(query):
     df = pd.read_csv(IN_DATASET)
 
+    for i, row in df.iterrows():
+        name_bow = get_bow(row["name"], identifier_split)
+        comment_bow = get_bow(row["comment"], comment_split)
+
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("query", help="the query to search the corpus with", type=str)
 
     args = parser.parse_args()
-    search(query)
+    search(args.query)
 
 
 if __name__ == "__main__":
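
Note (not part of the patch): a minimal sketch of what the new splitting helpers produce. The helper bodies are copied from the hunk above so the snippet runs on its own; the sample identifier and comment string are made up for illustration, and stop-word removal (remove_stopwords/get_bow) is left out so the snippet does not need the nltk stop-word download.

import re

def find_all(regex, word):
    # all matches of `regex` in `word`, lower-cased
    return [m.group(0).lower() for m in re.finditer(regex, word)]

def camel_case_split(word):
    # split camelCase/PascalCase runs, keeping acronyms together
    return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)

def identifier_split(identifier):
    # split on underscores first, then on camelCase inside each part
    return [y for x in identifier.split("_") for y in camel_case_split(x)]

def comment_split(comment):
    # plain alphanumeric tokens from free-form text
    return find_all('[A-Za-z0-9]+', comment)

print(identifier_split("getHTTPResponse_code"))
# ['get', 'http', 'response', 'code']
print(comment_split("Returns the HTTP response code."))
# ['returns', 'the', 'http', 'response', 'code']

In the patch itself these token lists are then filtered through remove_stopwords via get_bow, which also drops NaN cells coming from the pandas CSV read.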