almost done part 4

This commit is contained in:
Claudio Maggioni 2023-10-25 15:10:47 +02:00
parent ea74353ba3
commit 453beeb980
3 changed files with 92 additions and 14 deletions

View file

@ -1,7 +1,12 @@
import argparse import argparse
from typing import Iterable, Optional from typing import Iterable, Optional
import numpy as np
import pandas as pd import pandas as pd
import seaborn as sns
import tqdm
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
search_data = __import__('search-data') search_data = __import__('search-data')
@ -43,6 +48,22 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
return None return None
def plot_df(results, query: str) -> Optional[pd.DataFrame]:
    """Build a 2-D t-SNE projection of one search's result and query vectors.

    :param results: search outcome exposing ``vectors`` (list of result
        vectors) and ``query_vector``; either may be ``None``.
    :param query: the query text, repeated on every row so that rows from
        different queries can be told apart after concatenation.
    :return: a frame with columns ``tsne-2d-one``, ``tsne-2d-two``, ``query``
        and ``is_input`` (one row per result vector plus a final row for the
        query), or ``None`` when the results carry no vectors.
    """
    # Nothing to project when the search method did not produce vectors.
    if results.vectors is None or results.query_vector is None:
        return None

    # The query vector is appended last; the 'is_input' labels rely on that.
    points = np.array(results.vectors + [results.query_vector])

    # try perplexity = 1, 1.5, 2
    projector = TSNE(n_components=2, verbose=1, perplexity=1.5, n_iter=3000)
    embedding = projector.fit_transform(points)

    n_results = len(results.vectors)
    return pd.DataFrame({
        'tsne-2d-one': embedding[:, 0],
        'tsne-2d-two': embedding[:, 1],
        'query': [query] * (n_results + 1),
        'is_input': ['Result'] * n_results + ['Input query'],
    })
def main(method: str, file_path: str): def main(method: str, file_path: str):
df = search_data.load_data() df = search_data.load_data()
test_set = list(read_ground_truth(file_path, df)) test_set = list(read_ground_truth(file_path, df))
@ -50,9 +71,16 @@ def main(method: str, file_path: str):
precision_sum = 0 precision_sum = 0
recall_sum = 0 recall_sum = 0
for query, expected in test_set: dfs = []
indexes_values: list[tuple[int, float]] = search_data.search(query, method, df)
idx = better_index(indexes_values, expected) for query, expected in tqdm.tqdm(test_set):
search_results = search_data.search(query, method, df)
df_q = plot_df(search_results, query)
if df_q is not None:
dfs.append(df_q)
idx = better_index(search_results.indexes_scores, expected)
if idx is None: if idx is None:
precision = 0 precision = 0
@ -67,6 +95,20 @@ def main(method: str, file_path: str):
print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set))) print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set)))
print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set))) print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set)))
df = pd.concat(dfs)
plt.figure(figsize=(4, 4))
ax = sns.scatterplot(
x="tsne-2d-one", y="tsne-2d-two",
hue="query",
style="is_input",
palette=sns.color_palette("husl", n_colors=10),
data=df,
legend="full",
alpha=1.0
)
plt.show()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()

View file

@ -3,3 +3,6 @@ gensim==4.3.2
nltk==3.8.1 nltk==3.8.1
numpy==1.26.1 numpy==1.26.1
pandas==2.1.1 pandas==2.1.1
tqdm==4.66.1
scikit-learn==1.3.2
seaborn==0.13.0

View file

@ -2,6 +2,9 @@ import argparse
import logging import logging
import os import os
import re import re
import typing
from dataclasses import dataclass
from typing import Optional
import coloredlogs import coloredlogs
import nltk import nltk
@ -19,7 +22,7 @@ SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv") IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat") DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
# using ntlk stop words and example words for now # using nltk stop words and example words for now
STOP_WORDS = set(stopwords.words('english')) \ STOP_WORDS = set(stopwords.words('english')) \
.union(['test', 'tests', 'main', 'this', 'self']) .union(['test', 'tests', 'main', 'this', 'self'])
@ -52,7 +55,7 @@ def get_bow(data, split_f):
return remove_stopwords(split_f(data)) return remove_stopwords(split_f(data))
def pick_most_similar(corpus, query, dictionary): def pick_most_similar(corpus, query, dictionary) -> list[tuple[int, float]]:
index = SparseMatrixSimilarity(corpus, num_features=len(dictionary)) index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
sims = index[query] sims = index[query]
pick_top = 5 pick_top = 5
@ -74,7 +77,7 @@ def print_results(indexes_scores: list[tuple[int, float]], df):
desc = (desc[:75] + '...\n') if len(desc) > 75 else desc desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
print("\nSimilarity: {s:2.02f}%".format(s=score * 100)) print("\nSimilarity: {s:2.02f}%".format(s=score * 100))
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \ print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}"
.format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"])) .format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
@ -94,9 +97,33 @@ def load_data() -> pd.DataFrame:
return df return df
def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]: SparseVector = list[tuple[int, float]]
# Dense vectors are NumPy arrays. Alias the array *type* (np.ndarray) rather
# than the np.array() factory function, so the name is valid in annotations
# and usable with isinstance().
DenseVector = np.ndarray
def to_dense(vector: "SparseVector", length: Optional[int] = None) -> "DenseVector":
    """Expand a sparse ``(index, value)`` vector into a dense NumPy array.

    :param vector: ``(index, value)`` pairs; positions not listed are zero.
    :param length: dimensionality of the output. When omitted, the output is
        just long enough to hold every listed index (and never shorter than
        the number of pairs). Pass it explicitly when several vectors must
        share one length, e.g. before stacking them into a matrix.
    :return: one-dimensional float array holding ``value`` at each ``index``.
    :raises IndexError: if an explicit ``length`` is too small for an index
        present in ``vector``.
    """
    if length is None:
        # The number of pairs is NOT the dimensionality: a sparse vector may
        # omit entries, so size the output to fit the largest index as well.
        highest = max((idx for idx, _ in vector), default=-1)
        length = max(len(vector), highest + 1)

    dense = np.zeros(length, dtype=float)
    for idx, value in vector:
        dense[idx] = value
    return dense
@dataclass
class SearchResults:
    """Outcome of a single search: ranked hits plus optional vectors.

    ``vectors`` and ``query_vector`` are ``None`` when the search method
    does not provide vectors for its results.
    """

    # (index, score) pairs for the best matches.
    indexes_scores: list[tuple[int, float]]
    # One dense vector per entry of ``indexes_scores``, when available.
    vectors: Optional[list[DenseVector]]
    # Dense vector of the query itself, when available.
    query_vector: Optional[DenseVector]

    def __init__(
        self,
        indexes_values: list[tuple[int, float]],
        vectors: Optional[list[DenseVector]],
        query_vector: Optional[DenseVector],
    ):
        # An explicitly defined __init__ is kept by @dataclass (it is not
        # regenerated), so the first parameter stays named 'indexes_values'
        # even though it is stored as 'indexes_scores'.
        self.query_vector = query_vector
        self.vectors = vectors
        self.indexes_scores = indexes_values
def search(query: str, method: str, df: pd.DataFrame) -> SearchResults:
corpus_list = [] corpus_list = []
for idx, row in df.iterrows(): for _, row in df.iterrows():
document_words = row["name_bow"] + row["comment_bow"] document_words = row["name_bow"] + row["comment_bow"]
corpus_list.append(document_words) corpus_list.append(document_words)
@ -112,12 +139,15 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]
if method == "tfidf": if method == "tfidf":
tfidf = TfidfModel(corpus_bow) tfidf = TfidfModel(corpus_bow)
return pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary) return SearchResults(pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary), None, None)
elif method == "freq": elif method == "freq":
return pick_most_similar(corpus_bow, query_bow, dictionary) return SearchResults(pick_most_similar(corpus_bow, query_bow, dictionary), None, None)
elif method == "lsi": elif method == "lsi":
lsi = LsiModel(corpus_bow) lsi = LsiModel(corpus_bow)
return pick_most_similar(lsi[corpus_bow], lsi[query_bow], dictionary) corpus = typing.cast(list[SparseVector], lsi[corpus_bow])
results = pick_most_similar(corpus, lsi[query_bow], dictionary)
result_vectors: list[DenseVector] = [to_dense(corpus[idx]) for idx, _ in results]
return SearchResults(results, result_vectors, to_dense(lsi[query_bow]))
elif method == "doc2vec": elif method == "doc2vec":
if os.path.exists(DOC2VEC_MODEL): if os.path.exists(DOC2VEC_MODEL):
model = Doc2Vec.load(DOC2VEC_MODEL) model = Doc2Vec.load(DOC2VEC_MODEL)
@ -125,7 +155,10 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]
model = build_doc2vec_model(corpus_list) model = build_doc2vec_model(corpus_list)
dv_query = model.infer_vector(query_w) dv_query = model.infer_vector(query_w)
return model.dv.most_similar([dv_query], topn=5) results = model.dv.most_similar([dv_query], topn=5)
result_vectors = [model.infer_vector(corpus_list[idx]) for idx, _ in results]
return SearchResults(results, result_vectors, dv_query)
else: else:
raise ValueError("method unknown") raise ValueError("method unknown")
@ -137,8 +170,8 @@ def main():
args = parser.parse_args() args = parser.parse_args()
df = load_data() df = load_data()
indexes_scores = search(args.query, args.method, df) results = search(args.query, args.method, df)
print_results(indexes_scores, df) print_results(results.indexes_scores, df)
if __name__ == "__main__": if __name__ == "__main__":