wip word2vec
This commit is contained in:
parent
3196c1c199
commit
06beb66d50
1 changed file with 16 additions and 2 deletions
|
@@ -7,6 +7,7 @@ import numpy as np
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
|
from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
|
||||||
from gensim.models import TfidfModel, LsiModel, LdaModel
|
from gensim.models import TfidfModel, LsiModel, LdaModel
|
||||||
|
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
|
||||||
from gensim.corpora import Dictionary
|
from gensim.corpora import Dictionary
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
@@ -53,7 +54,10 @@ def print_sims(corpus, query, df, dictionary):
|
||||||
sims = index[query]
|
sims = index[query]
|
||||||
pick_top = 5
|
pick_top = 5
|
||||||
|
|
||||||
for idx, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top]:
|
print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top])
|
||||||
|
|
||||||
|
def print_results(idxs_scores, df):
|
||||||
|
for idx, score in idxs_scores:
|
||||||
row = df.loc[idx]
|
row = df.loc[idx]
|
||||||
print("Similarity: {s:2.02f}%".format(s=score*100))
|
print("Similarity: {s:2.02f}%".format(s=score*100))
|
||||||
print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
|
print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
|
||||||
|
@@ -72,7 +76,8 @@ def search(query, method):
|
||||||
|
|
||||||
dictionary = Dictionary(corpus_list)
|
dictionary = Dictionary(corpus_list)
|
||||||
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
|
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
|
||||||
query_bow = dictionary.doc2bow(get_bow(query, comment_split))
|
query_w = get_bow(query, comment_split)
|
||||||
|
query_bow = dictionary.doc2bow(query_w)
|
||||||
|
|
||||||
if method == "tfidf":
|
if method == "tfidf":
|
||||||
tfidf = TfidfModel(corpus_bow)
|
tfidf = TfidfModel(corpus_bow)
|
||||||
|
@@ -82,6 +87,15 @@ def search(query, method):
|
||||||
elif method == "lsi":
|
elif method == "lsi":
|
||||||
lsi = LsiModel(corpus_bow)
|
lsi = LsiModel(corpus_bow)
|
||||||
print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
|
print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
|
||||||
|
elif method == "doc2vec":
|
||||||
|
dvdocs = [TaggedDocument(bow, [i]) for i, bow in enumerate(corpus_bow)]
|
||||||
|
model = Doc2Vec(vector_size=50, min_count=2, epochs=100)
|
||||||
|
model.build_vocab(dvdocs)
|
||||||
|
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
|
||||||
|
dvquery = model.infer_vector(query_w)
|
||||||
|
print_results(model.dv.most_similar([dvquery], topn=5), df)
|
||||||
|
else:
|
||||||
|
raise Error("method unknown")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
Loading…
Reference in a new issue