diff --git a/doc2vec_model.dat b/doc2vec_model.dat new file mode 100644 index 00000000..447ba9c0 Binary files /dev/null and b/doc2vec_model.dat differ diff --git a/requirements.txt b/requirements.txt index 17bdf3d0..81c4505c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ -nltk==3.8.1 -pandas==2.1.1 +coloredlogs==15.0.1 gensim==4.3.2 +nltk==3.8.1 +numpy==1.26.1 +pandas==2.1.1 diff --git a/search-data.py b/search-data.py index aff08089..19d9f0f3 100644 --- a/search-data.py +++ b/search-data.py @@ -10,15 +10,21 @@ from gensim.models import TfidfModel, LsiModel, LdaModel from gensim.models.doc2vec import TaggedDocument, Doc2Vec from gensim.corpora import Dictionary from collections import defaultdict +import coloredlogs +import logging + +coloredlogs.install() +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) nltk.download('stopwords') SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__)) IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv") +DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat") # using ntlk stop words and example words for now STOP_WORDS = set(stopwords.words('english')) \ - .union(['test', 'tests', 'main', 'this']) + .union(['test', 'tests', 'main', 'this', 'self']) def find_all(regex, word): @@ -52,16 +58,36 @@ def get_bow(data, split_f): def print_sims(corpus, query, df, dictionary): index = SparseMatrixSimilarity(corpus, num_features=len(dictionary)) sims = index[query] - pick_top = 5 - - print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top]) + pick_top = 5 + print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top], df) + def print_results(idxs_scores, df): + print("\n===== RESULTS: =====") + for idx, score in idxs_scores: row = df.loc[idx] - print("Similarity: {s:2.02f}%".format(s=score*100)) - print("Python {feat}: {name}\nFile: {file}\nLine: {line}\n" \ - .format(feat=row["type"], name=row["name"], file=row["file"], line=row["line"])) + + comment = row["comment"] + if type(comment) != str: + desc = "" + else: + comment = re.sub(re.compile(r'[\s\n]+', re.MULTILINE), ' ', comment) + desc = "Description: {c}\n".format(c=comment) + desc = (desc[:75] + '...\n') if len(desc) > 75 else desc + + print("\nSimilarity: {s:2.02f}%".format(s=score*100)) + print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \ + .format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"])) + + +def build_doc2vec_model(corpus_list): + dvdocs = [TaggedDocument(text, [i]) for i, text in enumerate(corpus_list)] + model = Doc2Vec(vector_size=100, epochs=100, sample=1e-5) + model.build_vocab(dvdocs) + model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs) + model.save(DOC2VEC_MODEL) + return model def search(query, method): @@ -74,10 +100,12 @@ def search(query, method): document_words = row["name_bow"] + row["comment_bow"] corpus_list.append(document_words) - dictionary = Dictionary(corpus_list) - corpus_bow = [dictionary.doc2bow(text) for text in corpus_list] query_w = get_bow(query, comment_split) - query_bow = dictionary.doc2bow(query_w) + + if method != "doc2vec": + dictionary = Dictionary(corpus_list) + corpus_bow = [dictionary.doc2bow(text) for text in corpus_list] + query_bow = dictionary.doc2bow(query_w) if method == "tfidf": tfidf = TfidfModel(corpus_bow) @@ -88,10 +116,11 @@ def search(query, method): lsi = LsiModel(corpus_bow) print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary) elif method == "doc2vec": - dvdocs = [TaggedDocument(bow, [i]) for i, bow in enumerate(corpus_bow)] - model = Doc2Vec(vector_size=50, min_count=2, epochs=100) - model.build_vocab(dvdocs) - model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs) + if os.path.exists(DOC2VEC_MODEL): + model = Doc2Vec.load(DOC2VEC_MODEL) + else: + model = build_doc2vec_model(corpus_list) + dvquery = model.infer_vector(query_w) print_results(model.dv.most_similar([dvquery], topn=5), df) else: