2023-10-11 11:59:07 +00:00
|
|
|
import re
|
|
|
|
import argparse
|
|
|
|
import os
|
|
|
|
import pandas as pd
|
2023-10-11 12:35:41 +00:00
|
|
|
import nltk
|
|
|
|
import numpy as np
|
|
|
|
from nltk.corpus import stopwords
|
2023-10-11 15:49:38 +00:00
|
|
|
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
|
|
|
|
from gensim.models import TfidfModel, LsiModel, LdaModel
|
2023-10-16 13:10:45 +00:00
|
|
|
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
|
2023-10-11 15:49:38 +00:00
|
|
|
from gensim.corpora import Dictionary
|
|
|
|
from collections import defaultdict
|
2023-10-11 12:35:41 +00:00
|
|
|
|
|
|
|
# Download the NLTK 'stopwords' corpus that STOP_WORDS below is built from.
nltk.download('stopwords')

# The dataset is expected to sit next to this script.
SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")

# Using the nltk English stop words plus a few example words for now.
STOP_WORDS = set(stopwords.words('english')) | {'test', 'tests', 'main', 'this'}
|
|
|
|
|
|
|
|
|
|
|
|
def find_all(regex, word):
    """Return every non-overlapping match of *regex* in *word*, lower-cased.

    Matches are returned in the order they occur; an input with no match
    yields an empty list.
    """
    lowered = []
    for hit in re.finditer(regex, word):
        lowered.append(hit.group().lower())
    return lowered
|
|
|
|
|
|
|
|
|
|
|
|
# https://stackoverflow.com/a/29920015
def camel_case_split(word):
    """Split a camelCase / PascalCase word into its lower-cased parts.

    A part ends where a lowercase letter is followed by an uppercase one,
    or where an uppercase run is followed by an uppercase letter that starts
    a new capitalised word (so "HTTPServer" -> ["http", "server"]).
    An empty string yields an empty list.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return find_all(boundary_pattern, word)
|
|
|
|
|
|
|
|
|
|
|
|
def identifier_split(identifier):
    """Break an identifier into lower-cased words.

    The identifier is first cut at underscores, then each piece is split on
    camel-case boundaries via camel_case_split; the resulting words are
    returned as one flat list in their original order.
    """
    words = []
    for chunk in identifier.split("_"):
        words.extend(camel_case_split(chunk))
    return words
|
|
|
|
|
|
|
|
|
|
|
|
def comment_split(comment):
    """Return the lower-cased runs of ASCII letters/digits found in *comment*."""
    alnum_run = '[A-Za-z0-9]+'
    return find_all(alnum_run, comment)
|
|
|
|
|
|
|
|
|
|
|
|
def remove_stopwords(input_bow_list):
    """Return a new list with every word found in STOP_WORDS dropped.

    Order and duplicates of the remaining words are preserved.
    """
    kept = []
    for word in input_bow_list:
        if word in STOP_WORDS:
            continue
        kept.append(word)
    return kept
|
|
|
|
|
|
|
|
|
|
|
|
def get_bow(data, split_f):
    """Turn a raw text field into a stop-word-free bag of words.

    Args:
        data: the text to tokenise, or a missing value (None or a float NaN,
            presumably what pandas yields for an empty CSV cell).
        split_f: callable that splits *data* into a list of words.

    Returns:
        The words produced by ``split_f(data)`` with stop words removed, or
        an empty list when *data* is missing.
    """
    # isinstance (rather than an exact type comparison) also recognises NumPy
    # float scalars such as np.float64/np.float32 NaN as missing values.
    if data is None or (isinstance(data, (float, np.floating)) and np.isnan(data)):
        return []
    return remove_stopwords(split_f(data))
|
|
|
|
|
2023-10-11 11:59:07 +00:00
|
|
|
|
2023-10-11 15:49:38 +00:00
|
|
|
def print_sims(corpus, query, df, dictionary):
    """Rank the corpus documents against *query* and print the best matches.

    Args:
        corpus: the vectorised documents to index.
        query: the vectorised query, in the same representation as *corpus*.
        df: data frame holding one row per corpus document; forwarded to
            print_results to describe each hit.
        dictionary: used only for its length, which is passed to the
            similarity index as the number of features.
    """
    index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
    sims = index[query]
    pick_top = 5

    # Pair each score with its document position, best score first.
    ranked = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)
    # print_results needs the data frame to look up the matching rows.
    print_results(ranked[:pick_top], df)
|
|
|
|
|
|
|
|
def print_results(idxs_scores, df):
    """Print one report entry per (index, score) pair.

    Each index is looked up in *df* with ``df.loc``; the row must provide the
    "type", "name", "file" and "line" fields. The score is shown as a
    percentage with two decimals.
    """
    for position, similarity in idxs_scores:
        entity = df.loc[position]
        print("Similarity: {s:2.02f}%".format(s=similarity*100))
        description = "Python {feat}: {name}\nFile: {file}\nLine: {line}\n".format(
            feat=entity["type"],
            name=entity["name"],
            file=entity["file"],
            line=entity["line"],
        )
        print(description)
|
|
|
|
|
2023-10-11 12:35:41 +00:00
|
|
|
|
2023-10-11 15:49:38 +00:00
|
|
|
# Retrieval strategies understood by search().
_SEARCH_METHODS = ("tfidf", "freq", "lsi", "doc2vec")


def search(query, method):
    """Search the dataset for the entities most similar to *query*.

    Reads IN_DATASET, builds one bag of words per row from its "name" and
    "comment" columns, and prints the best matches using the chosen method.

    Args:
        query: free-text query; tokenised the same way as comments.
        method: one of "tfidf", "freq", "lsi" or "doc2vec".

    Raises:
        ValueError: if *method* is not one of the supported names.
    """
    # Fail fast, before any CSV parsing or model training.
    if method not in _SEARCH_METHODS:
        raise ValueError("method unknown")

    df = pd.read_csv(IN_DATASET)
    df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
    df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))

    # One document per row: the words of the name followed by the comment's.
    corpus_list = [
        name_words + comment_words
        for name_words, comment_words in zip(df["name_bow"], df["comment_bow"])
    ]

    dictionary = Dictionary(corpus_list)
    corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]

    query_w = get_bow(query, comment_split)
    query_bow = dictionary.doc2bow(query_w)

    if method == "tfidf":
        tfidf = TfidfModel(corpus_bow)
        print_sims(tfidf[corpus_bow], tfidf[query_bow], df, dictionary)
    elif method == "freq":
        print_sims(corpus_bow, query_bow, df, dictionary)
    elif method == "lsi":
        lsi = LsiModel(corpus_bow)
        print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
    elif method == "doc2vec":
        # Train on the token lists, not on the (id, count) pairs of
        # corpus_bow: the query is inferred from query_w, which holds word
        # strings, so the model vocabulary must be made of word strings too.
        dvdocs = [TaggedDocument(words, [i]) for i, words in enumerate(corpus_list)]
        model = Doc2Vec(vector_size=50, min_count=2, epochs=100)
        model.build_vocab(dvdocs)
        model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
        dvquery = model.infer_vector(query_w)
        # Document tags are the row positions, so they index df directly.
        print_results(model.dv.most_similar([dvquery], topn=5), df)
|
2023-10-11 11:59:07 +00:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Parse the command line (method, then query) and run the search."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "method",
        type=str,
        help="the method to compare similarities with",
    )
    cli.add_argument(
        "query",
        type=str,
        help="the query to search the corpus with",
    )
    parsed = cli.parse_args()
    search(parsed.query, parsed.method)


if __name__ == "__main__":
    main()
|