kse-01/search-data.py

import re
import argparse
import os
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus that stopwords.words() reads further down.
# This is a module-level call, so it runs every time the module is loaded.
nltk.download('stopwords')

# Absolute directory containing this script; the dataset is looked up next to
# it, so the result does not depend on the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")

# NLTK's English stop words plus a few example words, for now
STOP_WORDS = {
    *stopwords.words('english'),
    'test', 'tests', 'main', 'this',
}


def find_all(regex, word):
    """Return every match of ``regex`` in ``word``, lower-cased.

    Matches are the non-overlapping ones produced by ``re.finditer``, in the
    order they occur; the whole match (group 0) is used for each.
    """
    return [hit.group().lower() for hit in re.finditer(regex, word)]


# https://stackoverflow.com/a/29920015
def camel_case_split(word):
    """Split a camelCase / PascalCase word into lower-cased pieces.

    A piece ends where a lowercase letter is followed by an uppercase one,
    where an uppercase letter is followed by an uppercase+lowercase pair
    (the end of an acronym), or at the end of the string.
    """
    piece_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return find_all(piece_pattern, word)


def identifier_split(identifier):
    """Break an identifier into words on underscores and camel-case boundaries.

    Each underscore-separated chunk is passed through ``camel_case_split`` and
    the resulting pieces are concatenated in order.
    """
    pieces = []
    for chunk in identifier.split("_"):
        pieces.extend(camel_case_split(chunk))
    return pieces


def comment_split(comment):
    """Return the lower-cased runs of ASCII letters and digits in ``comment``."""
    alnum_run = '[A-Za-z0-9]+'
    return find_all(alnum_run, comment)


def remove_stopwords(input_bow_list):
    """Return the words of ``input_bow_list`` that are not in ``STOP_WORDS``.

    Order and duplicates of the remaining words are preserved.
    """
    kept = []
    for token in input_bow_list:
        if token in STOP_WORDS:
            continue
        kept.append(token)
    return kept


def get_bow(data, split_f):
    """Build a bag of words, without stop words, from a single value.

    Args:
        data: the value to tokenise. ``None`` and floating-point NaN are
            treated as "no data" (presumably how empty CSV cells arrive
            from pandas -- confirm against the dataset).
        split_f: callable that turns ``data`` into a list of words.

    Returns:
        The words produced by ``split_f(data)`` with stop words removed, or
        an empty list when ``data`` is missing.
    """
    # isinstance (instead of an exact type comparison) also recognises float
    # subclasses such as numpy.float64, so their NaN is treated as missing
    # rather than being handed to split_f.
    if data is None or (isinstance(data, float) and np.isnan(data)):
        return []
    return remove_stopwords(split_f(data))


def search(query):
    """Load the dataset and compute a bag of words for every row.

    Work in progress: ``query`` is not used yet and the per-row bags of words
    are computed but neither stored nor returned.
    """
    corpus = pd.read_csv(IN_DATASET)

    for _index, entry in corpus.iterrows():
        # Words from the entity name (identifier-style splitting)
        name_bow = get_bow(entry["name"], identifier_split)
        # Words from the attached comment (plain alphanumeric splitting)
        comment_bow = get_bow(entry["comment"], comment_split)


def main():
    """Parse the command line and run a search with the given query."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "query",
        type=str,
        help="the query to search the corpus with",
    )
    parsed = cli.parse_args()
    search(parsed.query)


# Run the command-line entry point only when executed as a script, not on import
if __name__ == "__main__":
    main()
wip part 2 2023-10-11 11:59:07 +00:00			`import re`
			`import argparse`
			`import os`
			`import pandas as pd`
bow extraction words 2023-10-11 12:35:41 +00:00			`import nltk`
			`import numpy as np`
			`from nltk.corpus import stopwords`

			`nltk.download('stopwords')`
wip part 2 2023-10-11 11:59:07 +00:00
			`SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))`
			`IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")`

bow extraction words 2023-10-11 12:35:41 +00:00			`# using ntlk stop words and example words for now`
			`STOP_WORDS = set(stopwords.words('english')) \`
			`.union(['test', 'tests', 'main', 'this'])`


			`def find_all(regex, word):`
			`matches = re.finditer(regex, word)`
			`return [m.group(0).lower() for m in matches]`


			`# https://stackoverflow.com/a/29920015`
			`def camel_case_split(word):`
			`return find_all('.+?(?:(?<=[a-z])(?=[A-Z])\|(?<=[A-Z])(?=[A-Z][a-z])\|$)', word)`


			`def identifier_split(identifier):`
			`return [y for x in identifier.split("_") for y in camel_case_split(x)]`


			`def comment_split(comment):`
			`return find_all('[A-Za-z0-9]+', comment)`


			`def remove_stopwords(input_bow_list):`
			`return [word for word in input_bow_list if word not in STOP_WORDS]`


			`def get_bow(data, split_f):`
			`if data is None or (type(data) == float and np.isnan(data)):`
			`return []`
			`return remove_stopwords(split_f(data))`

wip part 2 2023-10-11 11:59:07 +00:00
			`def search(query):`
			`df = pd.read_csv(IN_DATASET)`

bow extraction words 2023-10-11 12:35:41 +00:00			`for i, row in df.iterrows():`
			`name_bow = get_bow(row["name"], identifier_split)`
			`comment_bow = get_bow(row["comment"], comment_split)`

wip part 2 2023-10-11 11:59:07 +00:00

			`def main():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("query", help="the query to search the corpus with", type=str)`
			`args = parser.parse_args()`
bow extraction words 2023-10-11 12:35:41 +00:00			`search(args.query)`
wip part 2 2023-10-11 11:59:07 +00:00

			`if __name__ == "__main__":`
			`main()`