almost done part 4
This commit is contained in:
3 changed files with 92 additions and 14 deletions
@ -1,7 +1,12 @@
import argparse
from typing import Iterable, Optional
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
search_data = __import__('search-data')
@ -43,6 +48,22 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
return None
def plot_df(results, query: str) -> Optional[pd.DataFrame]:
    """Embed a query and its result vectors in 2-D with t-SNE.

    Args:
        results: Search results exposing ``vectors`` (the vectors of the
            returned documents) and ``query_vector``; either may be None.
        query: The query text, repeated on every row so that the frames of
            several queries can be told apart after concatenation.

    Returns:
        A DataFrame with one row per result vector followed by one row for
        the query, holding the two t-SNE coordinates ('tsne-2d-one',
        'tsne-2d-two'), the query text ('query') and a 'Result' /
        'Input query' label ('is_input'). None when there is nothing to
        embed: a vector is missing or the result list is empty.
    """
    if results.vectors is None or results.query_vector is None:
        return None
    if len(results.vectors) == 0:
        # With no result vectors the query would be the only sample, and
        # t-SNE needs more samples than the perplexity used below.
        return None

    # The query vector goes last so it lines up with the 'Input query' label.
    tsne_vectors = np.array(list(results.vectors) + [results.query_vector])

    # try perplexity = 1, 1.5, 2
    tsne = TSNE(n_components=2, verbose=1, perplexity=1.5, n_iter=3000)
    tsne_results = tsne.fit_transform(tsne_vectors)

    df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'query', 'is_input'])
    df['tsne-2d-one'] = tsne_results[:, 0]
    df['tsne-2d-two'] = tsne_results[:, 1]
    df['query'] = [query] * (len(results.vectors) + 1)
    df['is_input'] = (['Result'] * len(results.vectors)) + ['Input query']
    return df
def main(method: str, file_path: str):
df = search_data.load_data()
test_set = list(read_ground_truth(file_path, df))
@ -50,9 +71,16 @@ def main(method: str, file_path: str):
precision_sum = 0
recall_sum = 0
for query, expected in test_set:
indexes_values: list[tuple[int, float]] =, method, df)
idx = better_index(indexes_values, expected)
dfs = []
for query, expected in tqdm.tqdm(test_set):
search_results =, method, df)
df_q = plot_df(search_results, query)
if df_q is not None:
idx = better_index(search_results.indexes_scores, expected)
if idx is None:
precision = 0
@ -67,6 +95,20 @@ def main(method: str, file_path: str):
print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set)))
print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set)))
df = pd.concat(dfs)
plt.figure(figsize=(4, 4))
ax = sns.scatterplot(
x="tsne-2d-one", y="tsne-2d-two",
palette=sns.color_palette("husl", n_colors=10),
if __name__ == '__main__':
parser = argparse.ArgumentParser()
@ -3,3 +3,6 @@ gensim==4.3.2
@ -2,6 +2,9 @@ import argparse
import logging
import os
import re
import typing
from dataclasses import dataclass
from typing import Optional
import coloredlogs
import nltk
@ -19,7 +22,7 @@ SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
# using ntlk stop words and example words for now
# using nltk stop words and example words for now
STOP_WORDS = set(stopwords.words('english')) \
.union(['test', 'tests', 'main', 'this', 'self'])
@ -52,7 +55,7 @@ def get_bow(data, split_f):
return remove_stopwords(split_f(data))
def pick_most_similar(corpus, query, dictionary):
def pick_most_similar(corpus, query, dictionary) -> list[tuple[int, float]]:
index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
sims = index[query]
pick_top = 5
@ -74,7 +77,7 @@ def print_results(indexes_scores: list[tuple[int, float]], df):
desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
print("\nSimilarity: {s:2.02f}%".format(s=score * 100))
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}"
.format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
@ -94,9 +97,33 @@ def load_data() -> pd.DataFrame:
return df
def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]:
# Sparse vectors are (index, value) pairs; entries that are not listed are zero.
SparseVector = list[tuple[int, float]]
# np.ndarray is the array type; np.array is only the factory function.
DenseVector = np.ndarray


def to_dense(vector: SparseVector, size: Optional[int] = None) -> DenseVector:
    """Expand a sparse (index, value) vector into a dense float array.

    Args:
        vector: Pairs of (position, value); positions not listed stay 0.0.
        size: Length of the dense array. When omitted it is derived from the
            input: large enough to hold the highest index, and never shorter
            than the number of pairs. Pass it explicitly when several
            vectors must share one dimensionality.

    Returns:
        A one-dimensional float array with each value at its index.

    Raises:
        IndexError: If an explicit ``size`` is too small for an index.
    """
    if size is None:
        # The pair count alone undersizes the array as soon as one position
        # is missing from the sparse form, so also honour the highest index.
        highest = max((idx for idx, _ in vector), default=-1)
        size = max(len(vector), highest + 1)
    dense = np.zeros(size, dtype=float)
    for idx, value in vector:
        dense[idx] = value
    return dense
class SearchResults:
    """Outcome of one search: the ranked hits plus optional vectors."""

    # (document index, similarity score) pairs of the returned hits
    indexes_scores: list[tuple[int, float]]
    # vectors of the returned documents; None when the caller supplies none
    vectors: Optional[list[DenseVector]]
    # vector of the query itself; None when the caller supplies none
    query_vector: Optional[DenseVector]

    def __init__(
        self,
        indexes_values: list[tuple[int, float]],
        vectors: Optional[list[DenseVector]],
        query_vector: Optional[DenseVector],
    ):
        """Store the hits (kept as ``indexes_scores``) and both vector fields."""
        self.indexes_scores, self.vectors, self.query_vector = (
            indexes_values,
            vectors,
            query_vector,
        )
def search(query: str, method: str, df: pd.DataFrame) -> SearchResults:
corpus_list = []
for idx, row in df.iterrows():
for _, row in df.iterrows():
document_words = row["name_bow"] + row["comment_bow"]
@ -112,12 +139,15 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]
if method == "tfidf":
tfidf = TfidfModel(corpus_bow)
return pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary)
return SearchResults(pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary), None, None)
elif method == "freq":
return pick_most_similar(corpus_bow, query_bow, dictionary)
return SearchResults(pick_most_similar(corpus_bow, query_bow, dictionary), None, None)
elif method == "lsi":
lsi = LsiModel(corpus_bow)
return pick_most_similar(lsi[corpus_bow], lsi[query_bow], dictionary)
corpus = typing.cast(list[SparseVector], lsi[corpus_bow])
results = pick_most_similar(corpus, lsi[query_bow], dictionary)
result_vectors: list[DenseVector] = [to_dense(corpus[idx]) for idx, _ in results]
return SearchResults(results, result_vectors, to_dense(lsi[query_bow]))
elif method == "doc2vec":
if os.path.exists(DOC2VEC_MODEL):
model = Doc2Vec.load(DOC2VEC_MODEL)
@ -125,7 +155,10 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]
model = build_doc2vec_model(corpus_list)
dv_query = model.infer_vector(query_w)
return model.dv.most_similar([dv_query], topn=5)
results = model.dv.most_similar([dv_query], topn=5)
result_vectors = [model.infer_vector(corpus_list[idx]) for idx, _ in results]
return SearchResults(results, result_vectors, dv_query)
raise ValueError("method unknown")
@ -137,8 +170,8 @@ def main():
args = parser.parse_args()
df = load_data()
indexes_scores = search(args.query, args.method, df)
print_results(indexes_scores, df)
results = search(args.query, args.method, df)
print_results(results.indexes_scores, df)
if __name__ == "__main__":
Reference in a new issue