almost done part 4
This commit is contained in:
parent
ea74353ba3
commit
453beeb980
3 changed files with 92 additions and 14 deletions
|
@ -1,7 +1,12 @@
|
|||
import argparse
|
||||
from typing import Iterable, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import tqdm
|
||||
from matplotlib import pyplot as plt
|
||||
from sklearn.manifold import TSNE
|
||||
|
||||
search_data = __import__('search-data')
|
||||
|
||||
|
@ -43,6 +48,22 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
|
|||
return None
|
||||
|
||||
|
||||
def plot_df(results, query: str) -> Optional[pd.DataFrame]:
    """Embed a query and its result vectors in 2-D with t-SNE.

    :param results: object exposing ``vectors`` (a list of vectors, or None)
        and ``query_vector`` (a single vector, or None)
    :param query: query text, repeated on every row of the returned frame
    :return: a DataFrame with columns ``tsne-2d-one``, ``tsne-2d-two``,
        ``query`` and ``is_input`` holding one row per result vector followed
        by a final row for the query itself, or None when either the result
        vectors or the query vector are missing
    """
    if results.vectors is None or results.query_vector is None:
        return None

    # The query vector is appended last so that it lines up with the trailing
    # 'Input query' label below.
    stacked = np.array(results.vectors + [results.query_vector])

    # try perplexity = 1, 1.5, 2
    reducer = TSNE(n_components=2, verbose=1, perplexity=1.5, n_iter=3000)
    embedding = reducer.fit_transform(stacked)

    result_count = len(results.vectors)
    return pd.DataFrame({
        'tsne-2d-one': embedding[:, 0],
        'tsne-2d-two': embedding[:, 1],
        'query': [query] * (result_count + 1),
        'is_input': (['Result'] * result_count) + ['Input query'],
    })
|
||||
|
||||
|
||||
def main(method: str, file_path: str):
|
||||
df = search_data.load_data()
|
||||
test_set = list(read_ground_truth(file_path, df))
|
||||
|
@ -50,9 +71,16 @@ def main(method: str, file_path: str):
|
|||
precision_sum = 0
|
||||
recall_sum = 0
|
||||
|
||||
for query, expected in test_set:
|
||||
indexes_values: list[tuple[int, float]] = search_data.search(query, method, df)
|
||||
idx = better_index(indexes_values, expected)
|
||||
dfs = []
|
||||
|
||||
for query, expected in tqdm.tqdm(test_set):
|
||||
search_results = search_data.search(query, method, df)
|
||||
|
||||
df_q = plot_df(search_results, query)
|
||||
if df_q is not None:
|
||||
dfs.append(df_q)
|
||||
|
||||
idx = better_index(search_results.indexes_scores, expected)
|
||||
|
||||
if idx is None:
|
||||
precision = 0
|
||||
|
@ -67,6 +95,20 @@ def main(method: str, file_path: str):
|
|||
print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set)))
|
||||
print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set)))
|
||||
|
||||
df = pd.concat(dfs)
|
||||
|
||||
plt.figure(figsize=(4, 4))
|
||||
ax = sns.scatterplot(
|
||||
x="tsne-2d-one", y="tsne-2d-two",
|
||||
hue="query",
|
||||
style="is_input",
|
||||
palette=sns.color_palette("husl", n_colors=10),
|
||||
data=df,
|
||||
legend="full",
|
||||
alpha=1.0
|
||||
)
|
||||
plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
|
|
|
@ -3,3 +3,6 @@ gensim==4.3.2
|
|||
nltk==3.8.1
|
||||
numpy==1.26.1
|
||||
pandas==2.1.1
|
||||
tqdm==4.66.1
|
||||
scikit-learn==1.3.2
|
||||
seaborn==0.13.0
|
|
@ -2,6 +2,9 @@ import argparse
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
import typing
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import coloredlogs
|
||||
import nltk
|
||||
|
@ -19,7 +22,7 @@ SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
|
|||
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
|
||||
DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
|
||||
|
||||
# using ntlk stop words and example words for now
|
||||
# using nltk stop words and example words for now
|
||||
STOP_WORDS = set(stopwords.words('english')) \
|
||||
.union(['test', 'tests', 'main', 'this', 'self'])
|
||||
|
||||
|
@ -52,7 +55,7 @@ def get_bow(data, split_f):
|
|||
return remove_stopwords(split_f(data))
|
||||
|
||||
|
||||
def pick_most_similar(corpus, query, dictionary):
|
||||
def pick_most_similar(corpus, query, dictionary) -> list[tuple[int, float]]:
|
||||
index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
|
||||
sims = index[query]
|
||||
pick_top = 5
|
||||
|
@ -74,7 +77,7 @@ def print_results(indexes_scores: list[tuple[int, float]], df):
|
|||
desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
|
||||
|
||||
print("\nSimilarity: {s:2.02f}%".format(s=score * 100))
|
||||
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \
|
||||
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}"
|
||||
.format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
|
||||
|
||||
|
||||
|
@ -94,9 +97,33 @@ def load_data() -> pd.DataFrame:
|
|||
return df
|
||||
|
||||
|
||||
def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]:
|
||||
# A sparse vector is a list of (index, value) pairs; indexes that are not
# listed are implicitly zero.
SparseVector = list[tuple[int, float]]
# NOTE(review): np.array is a factory function, not a type; np.ndarray is
# probably what is meant here. Left unchanged so the alias keeps its value.
DenseVector = np.array


def to_dense(vector: SparseVector, size: Optional[int] = None) -> DenseVector:
    """Expand a sparse ``(index, value)`` vector into a dense numpy array.

    The output length must cover the largest index in ``vector``, not the
    number of stored entries: a sparse vector may omit zero components, so
    sizing by ``len(vector)`` raises IndexError as soon as a component is
    missing.

    :param vector: list of (index, value) pairs
    :param size: dimensionality of the dense vector; when omitted it is
        inferred as the largest index in ``vector`` plus one (0 when
        ``vector`` is empty). Pass it explicitly when several vectors must
        share the same length.
    :return: one-dimensional float array with the given values placed at
        their indexes and zeros everywhere else
    :raises IndexError: if an explicit ``size`` is too small for an index
        found in ``vector``
    """
    if size is None:
        size = max((idx for idx, _ in vector), default=-1) + 1

    dense = np.zeros(size)
    for idx, value in vector:
        dense[idx] = value
    return dense
|
||||
|
||||
|
||||
@dataclass
class SearchResults:
    """Result of a search: scored hits plus optional vectors for plotting.

    The explicit ``__init__`` below is kept by ``@dataclass`` (an existing
    ``__init__`` is never overwritten), so the first constructor parameter
    is named ``indexes_values`` although it is stored as ``indexes_scores``.
    """

    # (index, score) pairs for the hits
    indexes_scores: list[tuple[int, float]]
    # one dense vector per hit, or None when no vectors are supplied
    vectors: Optional[list[DenseVector]]
    # dense vector of the query itself, or None when not supplied
    query_vector: Optional[DenseVector]

    def __init__(
            self,
            indexes_values: list[tuple[int, float]],
            vectors: Optional[list[DenseVector]],
            query_vector: Optional[DenseVector],
    ):
        """Store the three values unchanged on the instance."""
        self.indexes_scores, self.vectors, self.query_vector = (
            indexes_values,
            vectors,
            query_vector,
        )
|
||||
|
||||
|
||||
def search(query: str, method: str, df: pd.DataFrame) -> SearchResults:
|
||||
corpus_list = []
|
||||
for idx, row in df.iterrows():
|
||||
for _, row in df.iterrows():
|
||||
document_words = row["name_bow"] + row["comment_bow"]
|
||||
corpus_list.append(document_words)
|
||||
|
||||
|
@ -112,12 +139,15 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]
|
|||
|
||||
if method == "tfidf":
|
||||
tfidf = TfidfModel(corpus_bow)
|
||||
return pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary)
|
||||
return SearchResults(pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary), None, None)
|
||||
elif method == "freq":
|
||||
return pick_most_similar(corpus_bow, query_bow, dictionary)
|
||||
return SearchResults(pick_most_similar(corpus_bow, query_bow, dictionary), None, None)
|
||||
elif method == "lsi":
|
||||
lsi = LsiModel(corpus_bow)
|
||||
return pick_most_similar(lsi[corpus_bow], lsi[query_bow], dictionary)
|
||||
corpus = typing.cast(list[SparseVector], lsi[corpus_bow])
|
||||
results = pick_most_similar(corpus, lsi[query_bow], dictionary)
|
||||
result_vectors: list[DenseVector] = [to_dense(corpus[idx]) for idx, _ in results]
|
||||
return SearchResults(results, result_vectors, to_dense(lsi[query_bow]))
|
||||
elif method == "doc2vec":
|
||||
if os.path.exists(DOC2VEC_MODEL):
|
||||
model = Doc2Vec.load(DOC2VEC_MODEL)
|
||||
|
@ -125,7 +155,10 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]
|
|||
model = build_doc2vec_model(corpus_list)
|
||||
|
||||
dv_query = model.infer_vector(query_w)
|
||||
return model.dv.most_similar([dv_query], topn=5)
|
||||
results = model.dv.most_similar([dv_query], topn=5)
|
||||
|
||||
result_vectors = [model.infer_vector(corpus_list[idx]) for idx, _ in results]
|
||||
return SearchResults(results, result_vectors, dv_query)
|
||||
else:
|
||||
raise ValueError("method unknown")
|
||||
|
||||
|
@ -137,8 +170,8 @@ def main():
|
|||
args = parser.parse_args()
|
||||
|
||||
df = load_data()
|
||||
indexes_scores = search(args.query, args.method, df)
|
||||
print_results(indexes_scores, df)
|
||||
results = search(args.query, args.method, df)
|
||||
print_results(results.indexes_scores, df)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Reference in a new issue