From 453beeb980aa4fa9d318bd88df96808107ddeb18 Mon Sep 17 00:00:00 2001 From: Claudio Maggioni Date: Wed, 25 Oct 2023 15:10:47 +0200 Subject: [PATCH] almost done part 4 --- prec-recall.py | 48 +++++++++++++++++++++++++++++++++++++++--- requirements.txt | 3 +++ search-data.py | 55 ++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 92 insertions(+), 14 deletions(-) diff --git a/prec-recall.py b/prec-recall.py index d2944452..a3b975c6 100644 --- a/prec-recall.py +++ b/prec-recall.py @@ -1,7 +1,12 @@ import argparse from typing import Iterable, Optional +import numpy as np import pandas as pd +import seaborn as sns +import tqdm +from matplotlib import pyplot as plt +from sklearn.manifold import TSNE search_data = __import__('search-data') @@ -43,6 +48,22 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]: return None +def plot_df(results, query: str) -> Optional[pd.DataFrame]: + if results.vectors is not None and results.query_vector is not None: + tsne_vectors = np.array(results.vectors + [results.query_vector]) + # try perplexity = 1, 1.5, 2 + tsne = TSNE(n_components=2, verbose=1, perplexity=1.5, n_iter=3000) + tsne_results = tsne.fit_transform(tsne_vectors) + df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'query', 'is_input']) + df['tsne-2d-one'] = tsne_results[:, 0] + df['tsne-2d-two'] = tsne_results[:, 1] + df['query'] = [query] * (len(results.vectors) + 1) + df['is_input'] = (['Result'] * len(results.vectors)) + ['Input query'] + return df + else: + return None + + def main(method: str, file_path: str): df = search_data.load_data() test_set = list(read_ground_truth(file_path, df)) @@ -50,9 +71,16 @@ def main(method: str, file_path: str): precision_sum = 0 recall_sum = 0 - for query, expected in test_set: - indexes_values: list[tuple[int, float]] = search_data.search(query, method, df) - idx = better_index(indexes_values, expected) + dfs = [] + + for query, expected in tqdm.tqdm(test_set): + search_results = search_data.search(query, method, df) + + df_q = plot_df(search_results, query) + if df_q is not None: + dfs.append(df_q) + + idx = better_index(search_results.indexes_scores, expected) if idx is None: precision = 0 @@ -67,6 +95,20 @@ def main(method: str, file_path: str): print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set))) print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set))) + df = pd.concat(dfs) + + plt.figure(figsize=(4, 4)) + ax = sns.scatterplot( + x="tsne-2d-one", y="tsne-2d-two", + hue="query", + style="is_input", + palette=sns.color_palette("husl", n_colors=10), + data=df, + legend="full", + alpha=1.0 + ) + plt.show() + if __name__ == '__main__': parser = argparse.ArgumentParser() diff --git a/requirements.txt b/requirements.txt index 81c4505c..84b942c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,6 @@ gensim==4.3.2 nltk==3.8.1 numpy==1.26.1 pandas==2.1.1 +tqdm==4.66.1 +scikit-learn==1.3.2 +seaborn==0.13.0 \ No newline at end of file diff --git a/search-data.py b/search-data.py index f3a2632f..18682217 100644 --- a/search-data.py +++ b/search-data.py @@ -2,6 +2,9 @@ import argparse import logging import os import re +import typing +from dataclasses import dataclass +from typing import Optional import coloredlogs import nltk @@ -19,7 +22,7 @@ SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__)) IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv") DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat") -# using ntlk stop words and example words for now +# using nltk stop words and example words for now STOP_WORDS = set(stopwords.words('english')) \ .union(['test', 'tests', 'main', 'this', 'self']) @@ -52,7 +55,7 @@ def get_bow(data, split_f): return remove_stopwords(split_f(data)) -def pick_most_similar(corpus, query, dictionary): +def pick_most_similar(corpus, query, dictionary) -> list[tuple[int, float]]: index = SparseMatrixSimilarity(corpus, num_features=len(dictionary)) sims = index[query] pick_top = 5 @@ -74,7 +77,7 @@ def print_results(indexes_scores: list[tuple[int, float]], df): desc = (desc[:75] + '...\n') if len(desc) > 75 else desc print("\nSimilarity: {s:2.02f}%".format(s=score * 100)) - print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \ + print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" .format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"])) @@ -94,9 +97,33 @@ def load_data() -> pd.DataFrame: return df -def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]: +SparseVector = list[tuple[int, float]] +DenseVector = np.array + + +def to_dense(vector: SparseVector) -> DenseVector: + dense = [0.0] * len(vector) + for idx, value in vector: + dense[idx] = value + return np.array(dense) + + +@dataclass +class SearchResults: + indexes_scores: list[tuple[int, float]] + vectors: Optional[list[DenseVector]] + query_vector: Optional[DenseVector] + + def __init__(self, indexes_values: list[tuple[int, float]], vectors: Optional[list[DenseVector]], + query_vector: Optional[DenseVector]): + self.indexes_scores = indexes_values + self.vectors = vectors + self.query_vector = query_vector + + +def search(query: str, method: str, df: pd.DataFrame) -> SearchResults: corpus_list = [] - for idx, row in df.iterrows(): + for _, row in df.iterrows(): document_words = row["name_bow"] + row["comment_bow"] corpus_list.append(document_words) @@ -112,12 +139,15 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]] if method == "tfidf": tfidf = TfidfModel(corpus_bow) - return pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary) + return SearchResults(pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary), None, None) elif method == "freq": - return pick_most_similar(corpus_bow, query_bow, dictionary) + return SearchResults(pick_most_similar(corpus_bow, query_bow, dictionary), None, None) elif method == "lsi": lsi = LsiModel(corpus_bow) - return pick_most_similar(lsi[corpus_bow], lsi[query_bow], dictionary) + corpus = typing.cast(list[SparseVector], lsi[corpus_bow]) + results = pick_most_similar(corpus, lsi[query_bow], dictionary) + result_vectors: list[DenseVector] = [to_dense(corpus[idx]) for idx, _ in results] + return SearchResults(results, result_vectors, to_dense(lsi[query_bow])) elif method == "doc2vec": if os.path.exists(DOC2VEC_MODEL): model = Doc2Vec.load(DOC2VEC_MODEL) @@ -125,7 +155,10 @@ def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]] model = build_doc2vec_model(corpus_list) dv_query = model.infer_vector(query_w) - return model.dv.most_similar([dv_query], topn=5) + results = model.dv.most_similar([dv_query], topn=5) + + result_vectors = [model.infer_vector(corpus_list[idx]) for idx, _ in results] + return SearchResults(results, result_vectors, dv_query) else: raise ValueError("method unknown") @@ -137,8 +170,8 @@ def main(): args = parser.parse_args() df = load_data() - indexes_scores = search(args.query, args.method, df) - print_results(indexes_scores, df) + results = search(args.query, args.method, df) + print_results(results.indexes_scores, df) if __name__ == "__main__":