wip word2vec

This commit is contained in:
Claudio Maggioni 2023-10-16 15:10:45 +02:00
parent 3196c1c199
commit 06beb66d50

View file

@ -7,6 +7,7 @@ import numpy as np
from nltk.corpus import stopwords from nltk.corpus import stopwords
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
from gensim.models import TfidfModel, LsiModel, LdaModel from gensim.models import TfidfModel, LsiModel, LdaModel
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.corpora import Dictionary from gensim.corpora import Dictionary
from collections import defaultdict from collections import defaultdict
@ -53,7 +54,10 @@ def print_sims(corpus, query, df, dictionary):
sims = index[query] sims = index[query]
pick_top = 5 pick_top = 5
for idx, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top]: print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top])
def print_results(idxs_scores, df):
for idx, score in idxs_scores:
row = df.loc[idx] row = df.loc[idx]
print("Similarity: {s:2.02f}%".format(s=score*100)) print("Similarity: {s:2.02f}%".format(s=score*100))
print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \ print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
@ -72,7 +76,8 @@ def search(query, method):
dictionary = Dictionary(corpus_list) dictionary = Dictionary(corpus_list)
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list] corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
query_bow = dictionary.doc2bow(get_bow(query, comment_split)) query_w = get_bow(query, comment_split)
query_bow = dictionary.doc2bow(query_w)
if method == "tfidf": if method == "tfidf":
tfidf = TfidfModel(corpus_bow) tfidf = TfidfModel(corpus_bow)
@ -82,6 +87,15 @@ def search(query, method):
elif method == "lsi": elif method == "lsi":
lsi = LsiModel(corpus_bow) lsi = LsiModel(corpus_bow)
print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary) print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
elif method == "doc2vec":
dvdocs = [TaggedDocument(bow, [i]) for i, bow in enumerate(corpus_bow)]
model = Doc2Vec(vector_size=50, min_count=2, epochs=100)
model.build_vocab(dvdocs)
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
dvquery = model.infer_vector(query_w)
print_results(model.dv.most_similar([dvquery], topn=5), df)
else:
raise Error("method unknown")
def main(): def main():