kse-01/search-data.py

65 lines
1.5 KiB
Python
Raw Normal View History

2023-10-11 11:59:07 +00:00
import re
import argparse
import os
import pandas as pd
2023-10-11 12:35:41 +00:00
import nltk
import numpy as np
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus that STOP_WORDS is built from further down.
# NOTE: this call runs at import time, every time the module is loaded.
nltk.download('stopwords')
2023-10-11 11:59:07 +00:00
# Absolute path of the directory that contains this script.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# The CSV dataset is expected to sit next to the script.
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
2023-10-11 12:35:41 +00:00
# Words dropped from every bag of words: the NLTK English stop words plus a
# few example project-specific words (placeholder list for now).
STOP_WORDS = {
    *stopwords.words('english'),
    'test',
    'tests',
    'main',
    'this',
}
def find_all(regex, word):
    """Return every non-overlapping match of *regex* in *word*, lower-cased.

    Matches are returned in the order they occur; the list is empty when
    nothing matches. Only the full match (group 0) is kept.
    """
    pattern = re.compile(regex)
    lowered = []
    for match in pattern.finditer(word):
        lowered.append(match.group(0).lower())
    return lowered
# https://stackoverflow.com/a/29920015
def camel_case_split(word):
    """Split a camelCase / PascalCase word into its lower-cased parts.

    The pattern lazily consumes characters up to the next word boundary,
    so e.g. "HTTPServer" yields ["http", "server"]. An empty string yields
    an empty list.
    """
    boundary_pattern = (
        '.+?(?:'
        '(?<=[a-z])(?=[A-Z])'        # lower -> upper transition ("fooBar")
        '|(?<=[A-Z])(?=[A-Z][a-z])'  # end of an upper-case run ("HTTPServer")
        '|$)'                        # end of the word
    )
    return find_all(boundary_pattern, word)
def identifier_split(identifier):
    """Break an identifier into lower-cased words.

    The identifier is first cut at underscores, then each piece is split on
    camel-case boundaries; the resulting words are returned as one flat list.
    """
    words = []
    for chunk in identifier.split("_"):
        words.extend(camel_case_split(chunk))
    return words
def comment_split(comment):
    """Return the lower-cased runs of ASCII letters/digits found in *comment*."""
    alnum_run = '[A-Za-z0-9]+'
    return find_all(alnum_run, comment)
def remove_stopwords(input_bow_list):
    """Return the words of *input_bow_list* that are not in STOP_WORDS.

    Order and duplicates of the remaining words are preserved.
    """
    kept = []
    for token in input_bow_list:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept
def get_bow(data, split_f):
    """Build a stop-word-free bag of words from one dataset cell.

    Args:
        data: the cell value to tokenize, or a missing-value marker
            (``None`` or a floating-point NaN).
        split_f: callable that turns *data* into a list of words.

    Returns:
        The words produced by ``split_f(data)`` with stop words removed, or
        an empty list when *data* is missing.
    """
    # isinstance (rather than an exact type comparison) also recognises float
    # subclasses such as numpy.float64, so a NaN of that type is treated as
    # missing instead of being handed to split_f, which expects text.
    if data is None or (isinstance(data, float) and np.isnan(data)):
        return []
    return remove_stopwords(split_f(data))
2023-10-11 11:59:07 +00:00
def search(query):
    """Load the dataset and tokenize the name and comment of every row.

    Reads the CSV at IN_DATASET and, for each row, builds a bag of words
    from its "name" column (split as an identifier) and its "comment"
    column (split as free text).

    NOTE(review): *query* is not used yet and the computed bags of words
    are discarded; the function currently returns None.
    """
    corpus = pd.read_csv(IN_DATASET)

    for i, entry in corpus.iterrows():
        name_bow = get_bow(entry["name"], identifier_split)
        comment_bow = get_bow(entry["comment"], comment_split)
2023-10-11 11:59:07 +00:00
def main():
    """Parse the command line and run a search with the given query.

    Expects a single positional argument, ``query``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "query",
        type=str,
        help="the query to search the corpus with",
    )
    parsed = cli.parse_args()

    search(parsed.query)


# Only run the CLI when executed as a script, not when imported.
if __name__ == "__main__":
    main()