bow extraction words
parent d964c138a6
commit ad1d07623c
2 changed files with 44 additions and 1 deletion
requirements.txt (new file, +2)
@@ -0,0 +1,2 @@
+nltk==3.8.1
+pandas==2.1.1
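The two pinned dependencies can be installed up front with pip install -r requirements.txt before running the script.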
@@ -2,21 +2,62 @@ import re
 import argparse
 import os
 import pandas as pd
+import nltk
+import numpy as np
+from nltk.corpus import stopwords
+
+nltk.download('stopwords')
 
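nltk.download('stopwords') is executed at import time on every run. A guarded variant (a sketch, not part of this commit) only downloads when the corpus is missing from the local NLTK data directory:

try:
    nltk.data.find('corpora/stopwords')  # corpus already on disk?
except LookupError:                      # raised by nltk when the resource is absent
    nltk.download('stopwords')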
 SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
 IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
 
+# using nltk stop words and example words for now
+STOP_WORDS = set(stopwords.words('english')) \
+    .union(['test', 'tests', 'main', 'this'])
+
+
+def find_all(regex, word):
+    matches = re.finditer(regex, word)
+    return [m.group(0).lower() for m in matches]
+
+
+# https://stackoverflow.com/a/29920015
+def camel_case_split(word):
+    return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
+
+
+def identifier_split(identifier):
+    return [y for x in identifier.split("_") for y in camel_case_split(x)]
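For reference, the splitters defined above behave like this (values derived from the regex and the lowercasing in find_all):

camel_case_split("getUserName")          # ['get', 'user', 'name']
camel_case_split("HTTPResponse")         # ['http', 'response']
identifier_split("parse_HTTPResponse")   # ['parse', 'http', 'response']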
+
+
+def comment_split(comment):
+    return find_all('[A-Za-z0-9]+', comment)
+
+
+def remove_stopwords(input_bow_list):
+    return [word for word in input_bow_list if word not in STOP_WORDS]
+
+
+def get_bow(data, split_f):
+    if data is None or (type(data) == float and np.isnan(data)):
+        return []
+    return remove_stopwords(split_f(data))
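get_bow guards against empty CSV cells, which pandas hands back as float NaN, before splitting and stripping stop words. Roughly:

get_bow(float('nan'), comment_split)                   # []
get_bow("Tests the main parser loop", comment_split)   # ['parser', 'loop'] ('tests', 'the', 'main' are stop words here)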
+
+
 def search(query):
     df = pd.read_csv(IN_DATASET)
+
+    for i, row in df.iterrows():
+        name_bow = get_bow(row["name"], identifier_split)
+        comment_bow = get_bow(row["comment"], comment_split)
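So far the loop only builds the per-row bags; the query is not compared against them yet. A minimal sketch of the obvious next step, using the same helpers (an assumption, not part of this commit):

query_bow = remove_stopwords(comment_split(query))   # hypothetical follow-up step inside search()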
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("query", help="the query to search the corpus with", type=str)
     args = parser.parse_args()
-    search(query)
+    search(args.query)
 
 
 if __name__ == "__main__":
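The one behavioural fix in this hunk is search(query) becoming search(args.query); the old call referenced an undefined name and would raise NameError. Assuming the changed file is saved as search.py (its actual name is not shown in this extract), it is invoked as python search.py "http response parser", with the positional argument as the query string.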