commit 453beeb980
parent ea74353ba3

    almost done part 4

3 changed files with 92 additions and 14 deletions
@@ -1,7 +1,12 @@
 import argparse
 from typing import Iterable, Optional
 
+import numpy as np
 import pandas as pd
+import seaborn as sns
+import tqdm
+from matplotlib import pyplot as plt
+from sklearn.manifold import TSNE
 
 search_data = __import__('search-data')
 
@@ -43,6 +48,22 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
     return None
 
 
+def plot_df(results, query: str) -> Optional[pd.DataFrame]:
+    if results.vectors is not None and results.query_vector is not None:
+        tsne_vectors = np.array(results.vectors + [results.query_vector])
+        # try perplexity = 1, 1.5, 2
+        tsne = TSNE(n_components=2, verbose=1, perplexity=1.5, n_iter=3000)
+        tsne_results = tsne.fit_transform(tsne_vectors)
+        df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'query', 'is_input'])
+        df['tsne-2d-one'] = tsne_results[:, 0]
+        df['tsne-2d-two'] = tsne_results[:, 1]
+        df['query'] = [query] * (len(results.vectors) + 1)
+        df['is_input'] = (['Result'] * len(results.vectors)) + ['Input query']
+        return df
+    else:
+        return None
+
+
 def main(method: str, file_path: str):
     df = search_data.load_data()
     test_set = list(read_ground_truth(file_path, df))
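Note: plot_df stacks the dense result vectors together with the query vector, projects the stack to 2D with t-SNE, and tags each row so the query can be styled differently from its results. A minimal smoke test, assuming plot_df is in scope (FakeResults is a hypothetical stand-in exposing the same vectors/query_vector attributes as SearchResults from search-data.py; the sizes are illustrative):

    from dataclasses import dataclass

    import numpy as np

    @dataclass
    class FakeResults:
        vectors: list              # dense vectors of the top-5 results
        query_vector: np.ndarray   # dense vector of the query itself

    rng = np.random.default_rng(0)
    fake = FakeResults(vectors=[rng.normal(size=20) for _ in range(5)],
                       query_vector=rng.normal(size=20))
    df_q = plot_df(fake, "binary search tree")  # t-SNE on 6 points: 5 results + query
    print(df_q[['query', 'is_input']])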
@@ -50,9 +71,16 @@ def main(method: str, file_path: str):
     precision_sum = 0
     recall_sum = 0
 
-    for query, expected in test_set:
-        indexes_values: list[tuple[int, float]] = search_data.search(query, method, df)
-        idx = better_index(indexes_values, expected)
+    dfs = []
+    for query, expected in tqdm.tqdm(test_set):
+        search_results = search_data.search(query, method, df)
+
+        df_q = plot_df(search_results, query)
+        if df_q is not None:
+            dfs.append(df_q)
+
+        idx = better_index(search_results.indexes_scores, expected)
+
 
         if idx is None:
             precision = 0
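Note: better_index is defined earlier in this file and is not touched by the commit; judging from the signature in the hunk header and the call site above, it plausibly returns the rank at which the expected document id appears, as in the sketch below (an assumption, not the committed code):

    from typing import Optional

    def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
        # Hypothetical reconstruction: rank of the expected id `e` in the
        # ranked (index, score) list, or None if it was not retrieved at all.
        for rank, (idx, _score) in enumerate(li):
            if idx == e:
                return rank
        return None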
@@ -67,6 +95,20 @@ def main(method: str, file_path: str):
     print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set)))
     print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set)))
 
+    df = pd.concat(dfs)
+
+    plt.figure(figsize=(4, 4))
+    ax = sns.scatterplot(
+        x="tsne-2d-one", y="tsne-2d-two",
+        hue="query",
+        style="is_input",
+        palette=sns.color_palette("husl", n_colors=10),
+        data=df,
+        legend="full",
+        alpha=1.0
+    )
+    plt.show()
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
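Note: palette=sns.color_palette("husl", n_colors=10) appears to assume at most ten distinct queries in the ground-truth file, since seaborn complains when the hue column has more levels than the palette has colors. Also, plt.show() needs an interactive backend; on a headless machine, saving the figure is a drop-in alternative (the filename is illustrative):

    plt.savefig("tsne_queries.png", dpi=150, bbox_inches="tight")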
requirements.txt
@@ -3,3 +3,6 @@ gensim==4.3.2
 nltk==3.8.1
 numpy==1.26.1
 pandas==2.1.1
+tqdm==4.66.1
+scikit-learn==1.3.2
+seaborn==0.13.0
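Note: with the three new pins in place, the dependencies install the usual way with pip install -r requirements.txt.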
search-data.py
@@ -2,6 +2,9 @@ import argparse
 import logging
 import os
 import re
+import typing
+from dataclasses import dataclass
+from typing import Optional
 
 import coloredlogs
 import nltk
@@ -19,7 +22,7 @@ SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
 IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
 DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
 
-# using ntlk stop words and example words for now
+# using nltk stop words and example words for now
 STOP_WORDS = set(stopwords.words('english')) \
     .union(['test', 'tests', 'main', 'this', 'self'])
 
@@ -52,7 +55,7 @@ def get_bow(data, split_f):
     return remove_stopwords(split_f(data))
 
 
-def pick_most_similar(corpus, query, dictionary):
+def pick_most_similar(corpus, query, dictionary) -> list[tuple[int, float]]:
     index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
     sims = index[query]
     pick_top = 5
@@ -74,7 +77,7 @@ def print_results(indexes_scores: list[tuple[int, float]], df):
         desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
 
         print("\nSimilarity: {s:2.02f}%".format(s=score * 100))
-        print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \
+        print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}"
               .format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
 
 
@@ -94,9 +97,33 @@ def load_data() -> pd.DataFrame:
     return df
 
 
-def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]:
+SparseVector = list[tuple[int, float]]
+DenseVector = np.array
+
+
+def to_dense(vector: SparseVector) -> DenseVector:
+    dense = [0.0] * len(vector)
+    for idx, value in vector:
+        dense[idx] = value
+    return np.array(dense)
+
+
+@dataclass
+class SearchResults:
+    indexes_scores: list[tuple[int, float]]
+    vectors: Optional[list[DenseVector]]
+    query_vector: Optional[DenseVector]
+
+    def __init__(self, indexes_values: list[tuple[int, float]], vectors: Optional[list[DenseVector]],
+                 query_vector: Optional[DenseVector]):
+        self.indexes_scores = indexes_values
+        self.vectors = vectors
+        self.query_vector = query_vector
+
+
+def search(query: str, method: str, df: pd.DataFrame) -> SearchResults:
     corpus_list = []
-    for idx, row in df.iterrows():
+    for _, row in df.iterrows():
         document_words = row["name_bow"] + row["comment_bow"]
         corpus_list.append(document_words)
 
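Note: to_dense sizes its output by the number of (index, value) pairs, so it implicitly assumes the indices run exactly from 0 to len(vector) - 1. That typically holds for the gensim LSI vectors it is fed here, where every topic carries a weight, but a genuinely sparse vector with gaps would index out of range. A worked example under that contiguous-index assumption:

    # assumes to_dense from search-data.py is in scope
    vec = [(0, 0.12), (1, -0.40), (2, 0.05)]  # (topic id, weight) pairs
    assert to_dense(vec).tolist() == [0.12, -0.40, 0.05]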
@@ -112,12 +139,15 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]:
 
     if method == "tfidf":
         tfidf = TfidfModel(corpus_bow)
-        return pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary)
+        return SearchResults(pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary), None, None)
     elif method == "freq":
-        return pick_most_similar(corpus_bow, query_bow, dictionary)
+        return SearchResults(pick_most_similar(corpus_bow, query_bow, dictionary), None, None)
     elif method == "lsi":
         lsi = LsiModel(corpus_bow)
-        return pick_most_similar(lsi[corpus_bow], lsi[query_bow], dictionary)
+        corpus = typing.cast(list[SparseVector], lsi[corpus_bow])
+        results = pick_most_similar(corpus, lsi[query_bow], dictionary)
+        result_vectors: list[DenseVector] = [to_dense(corpus[idx]) for idx, _ in results]
+        return SearchResults(results, result_vectors, to_dense(lsi[query_bow]))
     elif method == "doc2vec":
         if os.path.exists(DOC2VEC_MODEL):
             model = Doc2Vec.load(DOC2VEC_MODEL)
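Note: at runtime lsi[corpus_bow] yields one (topic, weight) list per document, so the typing.cast only tells the type checker what gensim already returns; no conversion happens. A toy round-trip showing the shapes involved (the corpus contents are illustrative):

    from gensim.corpora import Dictionary
    from gensim.models import LsiModel

    docs = [["binary", "search", "tree"], ["hash", "map"], ["sorted", "binary", "array"]]
    dictionary = Dictionary(docs)
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    lsi = LsiModel(corpus_bow, id2word=dictionary, num_topics=2)
    print(lsi[corpus_bow[0]])  # e.g. [(0, 0.85), (1, -0.11)]: one weight per topic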
@@ -125,7 +155,10 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]:
             model = build_doc2vec_model(corpus_list)
 
         dv_query = model.infer_vector(query_w)
-        return model.dv.most_similar([dv_query], topn=5)
+        results = model.dv.most_similar([dv_query], topn=5)
+
+        result_vectors = [model.infer_vector(corpus_list[idx]) for idx, _ in results]
+        return SearchResults(results, result_vectors, dv_query)
     else:
         raise ValueError("method unknown")
 
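Note: model.infer_vector is stochastic, so re-inferring a vector for each hit returns something close to, but not identical to, the vector the model trained for that document. If the training documents were tagged with their integer position, reading the stored vectors is a deterministic alternative (an assumption about the tagging scheme, which this diff does not show):

    result_vectors = [model.dv[idx] for idx, _ in results]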
@@ -137,8 +170,8 @@ def main():
     args = parser.parse_args()
 
     df = load_data()
-    indexes_scores = search(args.query, args.method, df)
-    print_results(indexes_scores, df)
+    results = search(args.query, args.method, df)
+    print_results(results.indexes_scores, df)
 
 
 if __name__ == "__main__":