From 3196c1c199ec1693fe19a23a527980f9ce19e4e7 Mon Sep 17 00:00:00 2001
From: Claudio Maggioni
Date: Wed, 11 Oct 2023 17:49:38 +0200
Subject: [PATCH] part 2 done but Doc2Vec

---
 requirements.txt |  1 +
 search-data.py   | 44 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 3012ac71..17bdf3d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 nltk==3.8.1
 pandas==2.1.1
+gensim==4.3.2
diff --git a/search-data.py b/search-data.py
index 9d5481d8..526a9732 100644
--- a/search-data.py
+++ b/search-data.py
@@ -5,6 +5,10 @@ import pandas as pd
 import nltk
 import numpy as np
 from nltk.corpus import stopwords
+from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
+from gensim.models import TfidfModel, LsiModel, LdaModel
+from gensim.corpora import Dictionary
+from collections import defaultdict
 
 nltk.download('stopwords')
 
@@ -44,20 +48,48 @@ def get_bow(data, split_f):
     return remove_stopwords(split_f(data))
 
 
-def search(query):
-    df = pd.read_csv(IN_DATASET)
+def print_sims(corpus, query, df, dictionary):
+    index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
+    sims = index[query]
+    pick_top = 5
 
-    for i, row in df.iterrows():
-        name_bow = get_bow(row["name"], identifier_split)
-        comment_bow = get_bow(row["comment"], comment_split)
+    for idx, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top]:
+        row = df.loc[idx]
+        print("Similarity: {s:2.02f}%".format(s=score*100))
+        print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
+              .format(feat=row["type"], name=row["name"], file=row["file"], line=row["line"]))
 
 
+def search(query, method):
+    df = pd.read_csv(IN_DATASET)
+    df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
+    df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))
+
+    corpus_list = []
+    for idx, row in df.iterrows():
+        document_words = row["name_bow"] + row["comment_bow"]
+        corpus_list.append(document_words)
+
+    dictionary = Dictionary(corpus_list)
+    corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
+    query_bow = dictionary.doc2bow(get_bow(query, comment_split))
+
+    if method == "tfidf":
+        tfidf = TfidfModel(corpus_bow)
+        print_sims(tfidf[corpus_bow], tfidf[query_bow], df, dictionary)
+    elif method == "freq":
+        print_sims(corpus_bow, query_bow, df, dictionary)
+    elif method == "lsi":
+        lsi = LsiModel(corpus_bow)
+        print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
+
 def main():
     parser = argparse.ArgumentParser()
+    parser.add_argument("method", help="the method to compare similarities with", type=str)
     parser.add_argument("query", help="the query to search the corpus with", type=str)
     args = parser.parse_args()
-    search(args.query)
+    search(args.query, args.method)
 
 
 if __name__ == "__main__":