doc2vec executes
This commit is contained in:
parent
06beb66d50
commit
72bfb2b778
3 changed files with 47 additions and 16 deletions
BIN
doc2vec_model.dat
Normal file
BIN
doc2vec_model.dat
Normal file
Binary file not shown.
|
@ -1,3 +1,5 @@
|
||||||
nltk==3.8.1
|
coloredlogs==15.0.1
|
||||||
pandas==2.1.1
|
|
||||||
gensim==4.3.2
|
gensim==4.3.2
|
||||||
|
nltk==3.8.1
|
||||||
|
numpy==1.26.1
|
||||||
|
pandas==2.1.1
|
||||||
|
|
|
@ -10,15 +10,21 @@ from gensim.models import TfidfModel, LsiModel, LdaModel
|
||||||
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
|
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
|
||||||
from gensim.corpora import Dictionary
|
from gensim.corpora import Dictionary
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
import coloredlogs
|
||||||
|
import logging
|
||||||
|
|
||||||
|
coloredlogs.install()
|
||||||
|
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
|
||||||
|
|
||||||
nltk.download('stopwords')
|
nltk.download('stopwords')
|
||||||
|
|
||||||
SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
|
SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
|
||||||
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
|
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
|
||||||
|
DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
|
||||||
|
|
||||||
# using nltk stop words and example words for now
|
# using nltk stop words and example words for now
|
||||||
STOP_WORDS = set(stopwords.words('english')) \
|
STOP_WORDS = set(stopwords.words('english')) \
|
||||||
.union(['test', 'tests', 'main', 'this'])
|
.union(['test', 'tests', 'main', 'this', 'self'])
|
||||||
|
|
||||||
|
|
||||||
def find_all(regex, word):
|
def find_all(regex, word):
|
||||||
|
@ -53,15 +59,35 @@ def print_sims(corpus, query, df, dictionary):
|
||||||
index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
|
index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
|
||||||
sims = index[query]
|
sims = index[query]
|
||||||
pick_top = 5
|
pick_top = 5
|
||||||
|
print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top], df)
|
||||||
|
|
||||||
print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top])
|
|
||||||
|
|
||||||
def print_results(idxs_scores, df):
|
def print_results(idxs_scores, df):
|
||||||
|
print("\n===== RESULTS: =====")
|
||||||
|
|
||||||
for idx, score in idxs_scores:
|
for idx, score in idxs_scores:
|
||||||
row = df.loc[idx]
|
row = df.loc[idx]
|
||||||
print("Similarity: {s:2.02f}%".format(s=score*100))
|
|
||||||
print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \
|
comment = row["comment"]
|
||||||
.format(feat=row["type"], name=row["name"], file=row["file"], line=row["line"]))
|
if type(comment) != str:
|
||||||
|
desc = ""
|
||||||
|
else:
|
||||||
|
comment = re.sub(re.compile(r'[\s\n]+', re.MULTILINE), ' ', comment)
|
||||||
|
desc = "Description: {c}\n".format(c=comment)
|
||||||
|
desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
|
||||||
|
|
||||||
|
print("\nSimilarity: {s:2.02f}%".format(s=score*100))
|
||||||
|
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \
|
||||||
|
.format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
|
||||||
|
|
||||||
|
|
||||||
|
def build_doc2vec_model(corpus_list):
|
||||||
|
dvdocs = [TaggedDocument(text, [i]) for i, text in enumerate(corpus_list)]
|
||||||
|
model = Doc2Vec(vector_size=100, epochs=100, sample=1e-5)
|
||||||
|
model.build_vocab(dvdocs)
|
||||||
|
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
|
||||||
|
model.save(DOC2VEC_MODEL)
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
def search(query, method):
|
def search(query, method):
|
||||||
|
@ -74,9 +100,11 @@ def search(query, method):
|
||||||
document_words = row["name_bow"] + row["comment_bow"]
|
document_words = row["name_bow"] + row["comment_bow"]
|
||||||
corpus_list.append(document_words)
|
corpus_list.append(document_words)
|
||||||
|
|
||||||
|
query_w = get_bow(query, comment_split)
|
||||||
|
|
||||||
|
if method != "doc2vec":
|
||||||
dictionary = Dictionary(corpus_list)
|
dictionary = Dictionary(corpus_list)
|
||||||
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
|
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
|
||||||
query_w = get_bow(query, comment_split)
|
|
||||||
query_bow = dictionary.doc2bow(query_w)
|
query_bow = dictionary.doc2bow(query_w)
|
||||||
|
|
||||||
if method == "tfidf":
|
if method == "tfidf":
|
||||||
|
@ -88,10 +116,11 @@ def search(query, method):
|
||||||
lsi = LsiModel(corpus_bow)
|
lsi = LsiModel(corpus_bow)
|
||||||
print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
|
print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
|
||||||
elif method == "doc2vec":
|
elif method == "doc2vec":
|
||||||
dvdocs = [TaggedDocument(bow, [i]) for i, bow in enumerate(corpus_bow)]
|
if os.path.exists(DOC2VEC_MODEL):
|
||||||
model = Doc2Vec(vector_size=50, min_count=2, epochs=100)
|
model = Doc2Vec.load(DOC2VEC_MODEL)
|
||||||
model.build_vocab(dvdocs)
|
else:
|
||||||
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
|
model = build_doc2vec_model(corpus_list)
|
||||||
|
|
||||||
dvquery = model.infer_vector(query_w)
|
dvquery = model.infer_vector(query_w)
|
||||||
print_results(model.dv.most_similar([dvquery], topn=5), df)
|
print_results(model.dv.most_similar([dvquery], topn=5), df)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in a new issue