import argparse
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

nltk.download('stopwords')

SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")

# using nltk stop words and example words for now
STOP_WORDS = set(stopwords.words('english')) \
    .union(['test', 'tests', 'main', 'this'])


def find_all(regex, word):
    # return every match of `regex` in `word`, lowercased
    matches = re.finditer(regex, word)
    return [m.group(0).lower() for m in matches]


# https://stackoverflow.com/a/29920015
def camel_case_split(word):
    # split camelCase (including runs of capitals) into separate words,
    # e.g. "getHTTPResponse" -> ["get", "http", "response"]
    return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)


def identifier_split(identifier):
    # split on underscores first, then camelCase within each part,
    # e.g. "getHTTPResponse_code" -> ["get", "http", "response", "code"]
    return [y for x in identifier.split("_") for y in camel_case_split(x)]


def comment_split(comment):
    # split free text into alphanumeric tokens
    return find_all('[A-Za-z0-9]+', comment)


def remove_stopwords(input_bow_list):
    return [word for word in input_bow_list if word not in STOP_WORDS]


def get_bow(data, split_f):
    # missing CSV cells come back from pandas as NaN floats
    if data is None or (isinstance(data, float) and np.isnan(data)):
        return []
    return remove_stopwords(split_f(data))


def search(query):
    df = pd.read_csv(IN_DATASET)
    for _, row in df.iterrows():
        # build a bag of words per row; matching these against the
        # query is not implemented yet
        name_bow = get_bow(row["name"], identifier_split)
        comment_bow = get_bow(row["comment"], comment_split)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("query", help="the query to search the corpus with",
                        type=str)
    args = parser.parse_args()
    search(args.query)


if __name__ == "__main__":
    main()
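

# --- Sketch: ranking rows against the query ---------------------------------
# search() above builds a bag of words per row but does not yet compare it to
# the query. The helper below is a hypothetical illustration, not part of the
# original script: it splits the query the same way comments are split, then
# ranks rows by how many query terms appear in the combined name/comment bag
# of words. The scoring scheme is an assumption for demonstration only.
def rank_rows(query, df, top_n=10):
    query_bow = set(get_bow(query, comment_split))
    scores = []
    for i, row in df.iterrows():
        row_bow = set(get_bow(row["name"], identifier_split)) \
            | set(get_bow(row["comment"], comment_split))
        # score = number of query terms found in this row's bag of words
        scores.append((len(query_bow & row_bow), i))
    scores.sort(reverse=True)
    # keep only the top rows that matched at least one query term
    return [i for score, i in scores[:top_n] if score > 0]
    # usage (hypothetical): rank_rows("parse json file", pd.read_csv(IN_DATASET))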