doc2vec executes

This commit is contained in:
Claudio Maggioni 2023-10-16 16:36:25 +02:00
parent 06beb66d50
commit 72bfb2b778
3 changed files with 47 additions and 16 deletions

BIN
doc2vec_model.dat Normal file

Binary file not shown.

View file

@ -1,3 +1,5 @@
nltk==3.8.1 coloredlogs==15.0.1
pandas==2.1.1
gensim==4.3.2 gensim==4.3.2
nltk==3.8.1
numpy==1.26.1
pandas==2.1.1

View file

@ -10,15 +10,21 @@ from gensim.models import TfidfModel, LsiModel, LdaModel
from gensim.models.doc2vec import TaggedDocument, Doc2Vec from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.corpora import Dictionary from gensim.corpora import Dictionary
from collections import defaultdict from collections import defaultdict
import coloredlogs
import logging
coloredlogs.install()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
nltk.download('stopwords') nltk.download('stopwords')
SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__)) SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv") IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
# using nltk stop words and example words for now # using nltk stop words and example words for now
STOP_WORDS = set(stopwords.words('english')) \ STOP_WORDS = set(stopwords.words('english')) \
.union(['test', 'tests', 'main', 'this']) .union(['test', 'tests', 'main', 'this', 'self'])
def find_all(regex, word): def find_all(regex, word):
@ -52,16 +58,36 @@ def get_bow(data, split_f):
def print_sims(corpus, query, df, dictionary): def print_sims(corpus, query, df, dictionary):
index = SparseMatrixSimilarity(corpus, num_features=len(dictionary)) index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
sims = index[query] sims = index[query]
pick_top = 5 pick_top = 5
print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top], df)
print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top])
def print_results(idxs_scores, df): def print_results(idxs_scores, df):
print("\n===== RESULTS: =====")
for idx, score in idxs_scores: for idx, score in idxs_scores:
row = df.loc[idx] row = df.loc[idx]
print("Similarity: {s:2.02f}%".format(s=score*100))
print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \ comment = row["comment"]
.format(feat=row["type"], name=row["name"], file=row["file"], line=row["line"])) if type(comment) != str:
desc = ""
else:
comment = re.sub(re.compile(r'[\s\n]+', re.MULTILINE), ' ', comment)
desc = "Description: {c}\n".format(c=comment)
desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
print("\nSimilarity: {s:2.02f}%".format(s=score*100))
print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \
.format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
def build_doc2vec_model(corpus_list):
    """Train a Doc2Vec model on ``corpus_list``, save it, and return it.

    Every entry of ``corpus_list`` is wrapped in a ``TaggedDocument`` whose
    single tag is the entry's position in the list. The trained model is
    written to ``DOC2VEC_MODEL`` before being returned.
    """
    tagged_docs = []
    for position, tokens in enumerate(corpus_list):
        tagged_docs.append(TaggedDocument(tokens, [position]))

    d2v = Doc2Vec(vector_size=100, epochs=100, sample=1e-5)
    d2v.build_vocab(tagged_docs)
    d2v.train(
        tagged_docs,
        total_examples=d2v.corpus_count,
        epochs=d2v.epochs,
    )

    # Persist the trained model at the DOC2VEC_MODEL path.
    d2v.save(DOC2VEC_MODEL)
    return d2v
def search(query, method): def search(query, method):
@ -74,10 +100,12 @@ def search(query, method):
document_words = row["name_bow"] + row["comment_bow"] document_words = row["name_bow"] + row["comment_bow"]
corpus_list.append(document_words) corpus_list.append(document_words)
dictionary = Dictionary(corpus_list)
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
query_w = get_bow(query, comment_split) query_w = get_bow(query, comment_split)
query_bow = dictionary.doc2bow(query_w)
if method != "doc2vec":
dictionary = Dictionary(corpus_list)
corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
query_bow = dictionary.doc2bow(query_w)
if method == "tfidf": if method == "tfidf":
tfidf = TfidfModel(corpus_bow) tfidf = TfidfModel(corpus_bow)
@ -88,10 +116,11 @@ def search(query, method):
lsi = LsiModel(corpus_bow) lsi = LsiModel(corpus_bow)
print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary) print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
elif method == "doc2vec": elif method == "doc2vec":
dvdocs = [TaggedDocument(bow, [i]) for i, bow in enumerate(corpus_bow)] if os.path.exists(DOC2VEC_MODEL):
model = Doc2Vec(vector_size=50, min_count=2, epochs=100) model = Doc2Vec.load(DOC2VEC_MODEL)
model.build_vocab(dvdocs) else:
model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs) model = build_doc2vec_model(corpus_list)
dvquery = model.infer_vector(query_w) dvquery = model.infer_vector(query_w)
print_results(model.dv.most_similar([dvquery], topn=5), df) print_results(model.dv.most_similar([dvquery], topn=5), df)
else: else: