diff --git a/.gitignore b/.gitignore
index bdaab25d..f22a4c82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,161 @@
 env/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
\ No newline at end of file
diff --git a/prec-recall.py b/prec-recall.py
new file mode 100644
index 00000000..d2944452
--- /dev/null
+++ b/prec-recall.py
@@ -0,0 +1,76 @@
+import argparse
+from typing import Iterable, Optional
+
+import pandas as pd
+
+search_data = __import__('search-data')
+
+PREFIX: str = "./"
+
+
+def read_ground_truth(file_path: str, df: pd.DataFrame) -> Iterable[tuple[str, int]]:
+    records: list[list[str]] = []
+
+    with open(file_path) as f:
+        record_tmp = []
+        for line in f:
+            line = line.strip()
+            if line == '':
+                assert len(record_tmp) == 3
+                records.append(record_tmp)
+                record_tmp = []
+            else:
+                record_tmp.append(line)
+
+    if len(record_tmp) == 3:
+        records.append(record_tmp)
+
+    for query, name, file_name in records:
+        assert file_name.startswith(PREFIX)
+        file_name = file_name[len(PREFIX):]
+
+        row = df[(df.name == name) & (df.file == file_name)]
+        assert len(row) == 1
+
+        yield query, row.index[0]
+
+
+def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
+    for i, le in enumerate(li):
+        if le[0] == e:
+            return i
+
+    return None
+
+
+def main(method: str, file_path: str):
+    df = search_data.load_data()
+    test_set = list(read_ground_truth(file_path, df))
+
+    precision_sum = 0
+    recall_sum = 0
+
+    for query, expected in test_set:
+        indexes_values: list[tuple[int, float]] = search_data.search(query, method, df)
+        idx = better_index(indexes_values, expected)
+
+        if idx is None:
+            precision = 0
+            recall = 0
+        else:
+            precision = 1 / (idx + 1)
+            recall = 1
+
+        precision_sum += precision
+        recall_sum += recall
+
+    print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set)))
+    print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set)))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("method", help="the method to compare similarities with", type=str)
+    parser.add_argument("ground_truth_file", help="file where ground truth comes from", type=str)
+    args = parser.parse_args()
+    main(args.method, args.ground_truth_file)
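For context: read_ground_truth above expects blank-line-separated records of exactly three lines each (the query, the entity name, and a file path starting with "./"). The per-query "precision" is the reciprocal rank of the expected row within the top-5 results returned by search, and "recall" is a 0/1 hit indicator, so the printed averages amount to mean reciprocal rank and top-5 recall. A minimal self-contained sketch of that scoring step, with made-up values standing in for real search output:

# Hypothetical (index, score) pairs standing in for search_data.search() output.
sample_results: list[tuple[int, float]] = [(12, 0.91), (7, 0.85), (3, 0.42)]
expected_id = 7  # made-up DataFrame index of the ground-truth row

# Rank of the expected index in the results, as better_index() computes it.
rank = next((i for i, (idx, _) in enumerate(sample_results) if idx == expected_id), None)

precision = 1 / (rank + 1) if rank is not None else 0  # reciprocal rank: 0.50 here
recall = 1 if rank is not None else 0                  # 1 iff the row was retrieved at all

print(f"precision={precision:.2f} recall={recall}")    # precision=0.50 recall=1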
diff --git a/search-data.py b/search-data.py
index 19d9f0f3..f3a2632f 100644
--- a/search-data.py
+++ b/search-data.py
@@ -1,20 +1,17 @@
-import re
 import argparse
+import logging
 import os
-import pandas as pd
+import re
+
+import coloredlogs
 import nltk
 import numpy as np
-from nltk.corpus import stopwords
-from gensim.similarities import SparseMatrixSimilarity, MatrixSimilarity
-from gensim.models import TfidfModel, LsiModel, LdaModel
-from gensim.models.doc2vec import TaggedDocument, Doc2Vec
 from gensim.corpora import Dictionary
-from collections import defaultdict
-import coloredlogs
-import logging
-
-coloredlogs.install()
-logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+import pandas as pd
+from gensim.models import TfidfModel, LsiModel
+from gensim.models.doc2vec import TaggedDocument, Doc2Vec
+from gensim.similarities import SparseMatrixSimilarity
+from nltk.corpus import stopwords
 
 nltk.download('stopwords')
 
@@ -55,19 +52,19 @@ def get_bow(data, split_f):
     return remove_stopwords(split_f(data))
 
 
-def print_sims(corpus, query, df, dictionary):
+def pick_most_similar(corpus, query, dictionary):
     index = SparseMatrixSimilarity(corpus, num_features=len(dictionary))
     sims = index[query]
 
-    pick_top = 5
-    print_results(sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top], df)
+    pick_top = 5
+    return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[:pick_top]
 
 
-def print_results(idxs_scores, df):
+def print_results(indexes_scores: list[tuple[int, float]], df):
     print("\n===== RESULTS: =====")
-    for idx, score in idxs_scores:
+    for idx, score in indexes_scores:
         row = df.loc[idx]
-
+
         comment = row["comment"]
         if type(comment) != str:
             desc = ""
@@ -76,7 +73,7 @@
             desc = "Description: {c}\n".format(c=comment)
         desc = (desc[:75] + '...\n') if len(desc) > 75 else desc
 
-        print("\nSimilarity: {s:2.02f}%".format(s=score*100))
+        print("\nSimilarity: {s:2.02f}%".format(s=score * 100))
         print("Python {feat}: {name}\n{desc}File: {file}\nLine: {line}" \
               .format(feat=row["type"], name=row["name"], desc=desc, file=row["file"], line=row["line"]))
 
@@ -90,41 +87,47 @@ def build_doc2vec_model(corpus_list):
     return model
 
 
-def search(query, method):
-    df = pd.read_csv(IN_DATASET)
+def load_data() -> pd.DataFrame:
+    df = pd.read_csv(IN_DATASET, index_col=0)
     df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
     df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))
 
+    return df
+
+def search(query: str, method: str, df: pd.DataFrame) -> list[tuple[int, float]]:
     corpus_list = []
     for idx, row in df.iterrows():
         document_words = row["name_bow"] + row["comment_bow"]
         corpus_list.append(document_words)
 
     query_w = get_bow(query, comment_split)
-
+    dictionary = None
+    corpus_bow = None
+    query_bow = None
+
     if method != "doc2vec":
         dictionary = Dictionary(corpus_list)
         corpus_bow = [dictionary.doc2bow(text) for text in corpus_list]
         query_bow = dictionary.doc2bow(query_w)
-
+
     if method == "tfidf":
         tfidf = TfidfModel(corpus_bow)
-        print_sims(tfidf[corpus_bow], tfidf[query_bow], df, dictionary)
+        return pick_most_similar(tfidf[corpus_bow], tfidf[query_bow], dictionary)
     elif method == "freq":
-        print_sims(corpus_bow, query_bow, df, dictionary)
+        return pick_most_similar(corpus_bow, query_bow, dictionary)
    elif method == "lsi":
         lsi = LsiModel(corpus_bow)
-        print_sims(lsi[corpus_bow], lsi[query_bow], df, dictionary)
+        return pick_most_similar(lsi[corpus_bow], lsi[query_bow], dictionary)
     elif method == "doc2vec":
         if os.path.exists(DOC2VEC_MODEL):
             model = Doc2Vec.load(DOC2VEC_MODEL)
         else:
             model = build_doc2vec_model(corpus_list)
 
-        dvquery = model.infer_vector(query_w)
-        print_results(model.dv.most_similar([dvquery], topn=5), df)
+        dv_query = model.infer_vector(query_w)
+        return model.dv.most_similar([dv_query], topn=5)
     else:
-        raise Error("method unknown")
+        raise ValueError("method unknown")
 
 
 def main():
@@ -132,8 +135,13 @@ def main():
     parser.add_argument("method", help="the method to compare similarities with", type=str)
     parser.add_argument("query", help="the query to search the corpus with", type=str)
     args = parser.parse_args()
-    search(args.query, args.method)
+
+    df = load_data()
+    indexes_scores = search(args.query, args.method, df)
+    print_results(indexes_scores, df)
 
 
 if __name__ == "__main__":
+    coloredlogs.install()
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
     main()
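For context: the refactor above splits search-data.py into load_data(), search(), and print_results() precisely so prec-recall.py can drive them programmatically. A minimal usage sketch, assuming the dataset CSV referenced by IN_DATASET is in place; the query text and the "tfidf" method are arbitrary examples, and the hyphen in the file name is why the module is loaded via __import__ rather than an import statement:

# Load the module whose file name contains a hyphen.
search_data = __import__('search-data')

df = search_data.load_data()                              # dataset keyed by its original CSV index
hits = search_data.search("parse xml file", "tfidf", df)  # top-5 (index, score) pairs
search_data.print_results(hits, df)                       # same report the CLI entry point prints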