Part 2 done, except for Doc2Vec

This commit is contained in:
Claudio Maggioni 2023-10-11 17:49:38 +02:00
parent ad1d07623c
commit 3196c1c199
2 changed files with 39 additions and 6 deletions

View file

@ -1,2 +1,3 @@
nltk==3.8.1 nltk==3.8.1
pandas==2.1.1 pandas==2.1.1
gensim==4.3.2

View file

@ -5,6 +5,10 @@ import pandas as pd
import nltk import nltk
import numpy as np import numpy as np
from nltk.corpus import stopwords from nltk.corpus import stopwords
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
from gensim.models import TfidfModel, LsiModel, LdaModel
from gensim.corpora import Dictionary
from collections import defaultdict
nltk.download('stopwords') nltk.download('stopwords')
@ -44,20 +48,48 @@ def get_bow(data, split_f):
return remove_stopwords(split_f(data)) return remove_stopwords(split_f(data))
def print_sims(corpus, query, df, dictionary, top_n=5):
    """Print the entries of ``corpus`` that are most similar to ``query``.

    A gensim ``SparseMatrixSimilarity`` index is built over ``corpus`` and
    queried with ``query``. The highest-scoring documents are printed along
    with the ``type``, ``name``, ``file`` and ``line`` columns of the
    corresponding row of ``df``.

    Args:
        corpus: Iterable of document vectors; document ``i`` is assumed to
            describe the ``i``-th row of ``df``.
        query: Query vector expressed in the same vector space as ``corpus``.
        df: DataFrame providing the ``type``, ``name``, ``file`` and ``line``
            columns for each document.
        dictionary: Dictionary of the corpus; its length is passed to the
            index as the number of features.
        top_n: Maximum number of results to print (defaults to 5).
    """
    index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
    sims = index[query]

    # Rank (position, score) pairs from the most to the least similar.
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)

    for position, score in ranked[:top_n]:
        # enumerate() yields positions, not index labels, so the row has to be
        # looked up positionally to stay correct for any DataFrame index.
        row = df.iloc[position]
        print("Similarity: {s:.2f}%".format(s=score * 100))
        print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n".format(
            feat=row["type"], name=row["name"], file=row["file"], line=row["line"]))
def search(query, method):
    """Search the dataset for the entries most similar to ``query``.

    The dataset at ``IN_DATASET`` is loaded, every row is turned into a bag
    of words built from its ``name`` and ``comment`` columns, and the query is
    compared against the resulting corpus. The best matches are printed by
    ``print_sims``.

    Args:
        query: Free-text query; it is tokenised with ``comment_split``.
        method: Vector representation to compare with: ``"freq"`` (raw term
            frequencies), ``"tfidf"`` or ``"lsi"``.

    Raises:
        ValueError: If ``method`` is not one of the supported methods.
    """
    supported_methods = ("tfidf", "freq", "lsi")
    # Fail fast: an unknown method used to fall through silently and print
    # nothing after the whole corpus had already been built.
    if method not in supported_methods:
        raise ValueError("unknown method {!r}; expected one of: {}".format(
            method, ", ".join(supported_methods)))

    df = pd.read_csv(IN_DATASET)
    df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
    df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))

    # One document per row: the words of the name followed by those of the
    # comment.
    corpus_list = [
        name_words + comment_words
        for name_words, comment_words in zip(df["name_bow"], df["comment_bow"])
    ]

    dictionary = Dictionary(corpus_list)
    corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
    query_bow = dictionary.doc2bow(get_bow(query, comment_split))

    if method == "tfidf":
        tfidf = TfidfModel(corpus_bow)
        print_sims(tfidf[corpus_bow], tfidf[query_bow], df, dictionary)
    elif method == "freq":
        print_sims(corpus_bow, query_bow, df, dictionary)
    elif method == "lsi":
        lsi = LsiModel(corpus_bow)
        print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
def main():
    """Parse the command-line arguments and run the similarity search.

    Two positional arguments are expected, in this order: the comparison
    method and the query text. Unsupported methods are rejected by argparse
    with a usage error instead of being passed on to ``search``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("method", help="the method to compare similarities with", type=str,
                        choices=["tfidf", "freq", "lsi"])
    parser.add_argument("query", help="the query to search the corpus with", type=str)

    args = parser.parse_args()
    search(args.query, args.method)
if __name__ == "__main__": if __name__ == "__main__":