wip report
commit fd007afb60 (parent 678434abdf)
13 changed files with 279 additions and 45 deletions
README.md (57 changed lines)
@@ -12,3 +12,60 @@ In this repository, you can find the following files:
 For more information, see the Project-02 slides (available on iCourse)
 
 Note: Feel free to modify this file according to the project's necessities.
+
+## Environment setup
+
+To install the required dependencies, make sure `python3` points to a Python 3.10 or 3.11 installation and then run:
+
+```shell
+python3 -m venv env
+source env/bin/activate
+pip install -r requirements.txt
+```
+
+## Part 1: data extraction
+
+To extract the data into file `data.csv`, run the command:
+
+```shell
+python3 extract-data.py
+```
+
+The script prints the requested counts, namely:
+
+```
+Methods: 5817
+Functions: 4565
+Classes: 1882
+Python Files: 2817
+```
+
+## Part 2: Training
+
+To train a model and predict the output of a given query, run the command:
+
+```shell
+python3 search-data.py [method] "[query]"
+```
+
+where `[method]` is one of `{tfidf,freq,lsi,doc2vec}` or `all` to run all classifiers, and `[query]` is the natural
+language query to search. Outputs are printed on stdout; in the case of `doc2vec`, the trained model is saved to
+`./doc2vec_model.dat` and loaded from that path on subsequent executions.
+
+## Part 3: Evaluation
+
+To evaluate a model, run the command:
+
+```shell
+python3 search-data.py [method] ./ground-truth-unique.txt
+```
+
+where `[method]` is one of `{tfidf,freq,lsi,doc2vec}` or `all` to evaluate all classifiers. The script outputs the
+performance of the classifiers in terms of average precision and recall:
+
+| Engine  | Average Precision | Average Recall |
+|:--------|:------------------|:---------------|
+| tfidf   | 20.00%            | 20.00%         |
+| freq    | 27.00%            | 40.00%         |
+| lsi     | 4.00%             | 20.00%         |
+| doc2vec | 10.00%            | 10.00%         |
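As a concrete illustration of the Part 2 command documented above, an invocation might look like this (the query string is a made-up example, not one taken from the provided ground truth):

```shell
python3 search-data.py lsi "convert a tensor into a numpy array"
```

With `all` in place of `lsi`, the same query is run through all four engines in sequence.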
Binary file not shown.
@@ -15,7 +15,7 @@ def find_py_files(dir):
 
 
 def keep_name(name):
-    return not name.startswith("_") and not "main" in str(name).lower() and \
+    return not name.startswith("_") and "main" not in str(name).lower() and \
         "test" not in str(name).lower()
 
 
@@ -56,11 +56,11 @@ class FeatureVisitor(ast.NodeVisitor):
         })
 
 
 def main():
     df = pd.DataFrame(columns=["name", "file", "line", "type", "comment"])
+    files = list(find_py_files(IN_DIR))
 
-    for file in find_py_files(IN_DIR):
+    for file in files:
         with open(file, "r") as f:
             py_source = f.read()
 
@@ -71,6 +71,16 @@ def main():
         df_visitor = pd.DataFrame.from_records(visitor.rows)
         df = pd.concat([df, df_visitor])
 
+    counts = df["type"].apply(lambda ft: {
+        "function": "Functions",
+        "class": "Classes",
+        "method": "Methods"
+    }[ft]).value_counts().to_dict()
+    counts["Python Files"] = len(files)
+
+    for file_type, count in counts.items():
+        print(f"{file_type}: {count}")
+
     df.reset_index(drop=True).to_csv(OUT_FILE)
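A minimal standalone sketch of what the new counting logic in `main()` computes, run on a toy DataFrame rather than the real extraction output:

```python
import pandas as pd

# Toy stand-in for the extracted feature table built by the visitor
df = pd.DataFrame({"type": ["function", "method", "method", "class"]})

# Map raw type labels to display names, then tally occurrences
counts = df["type"].apply(lambda ft: {
    "function": "Functions",
    "class": "Classes",
    "method": "Methods"
}[ft]).value_counts().to_dict()
counts["Python Files"] = 2  # stands in for len(files) in the real script

for file_type, count in counts.items():
    print(f"{file_type}: {count}")
# Methods: 2
# Functions: 1
# Classes: 1
# Python Files: 2
```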
Binary file not shown (image, 89 KiB before and after).
@@ -1,2 +1,2 @@
-Precision: 30.00%
-Recall: 30.00%
+Precision: 10.00%
+Recall: 10.00%
@@ -1,2 +1,2 @@
-Precision: 24.50%
-Recall: 24.50%
+Precision: 27.00%
+Recall: 40.00%
BIN out/lsi_plot.png
Binary file not shown (image, 82 KiB before, 79 KiB after).
@@ -1,2 +1,2 @@
-Precision: 3.33%
-Recall: 3.33%
+Precision: 4.00%
+Recall: 20.00%
@@ -1,2 +1,2 @@
-Precision: 22.50%
-Recall: 22.50%
+Precision: 20.00%
+Recall: 20.00%
@@ -53,7 +53,7 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
 def plot_df(results, query: str) -> Optional[pd.DataFrame]:
     if results.vectors is not None and results.query_vector is not None:
         tsne_vectors = np.array(results.vectors + [results.query_vector])
-        tsne = TSNE(n_components=2, verbose=1, perplexity=1.5, n_iter=3000)
+        tsne = TSNE(n_components=2, perplexity=2, n_iter=3000)
         tsne_results = tsne.fit_transform(tsne_vectors)
         df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind'])
         df['tsne-2d-one'] = tsne_results[:, 0]
@@ -65,7 +65,7 @@ def plot_df(results, query: str) -> Optional[pd.DataFrame]:
     return None
 
 
-def main(method: str, file_path: str):
+def evaluate(method_name: str, file_path: str) -> tuple[float, float]:
     df = search_data.load_data()
     test_set = list(read_ground_truth(file_path, df))
 
@@ -75,7 +75,7 @@ def main(method: str, file_path: str):
     dfs = []
 
     for query, expected in tqdm.tqdm(test_set):
-        search_results = search_data.search(query, method, df)
+        search_results = search_data.search(query, method_name, df)
 
         df_q = plot_df(search_results, query)
         if df_q is not None:
@@ -96,10 +96,13 @@ def main(method: str, file_path: str):
     if not os.path.isdir(OUT_DIR):
         os.makedirs(OUT_DIR)
 
-    output = "Precision: {0:.2f}%\nRecall: {0:.2f}%\n".format(precision_sum * 100 / len(test_set))
+    precision = precision_sum * 100 / len(test_set)
+    recall = recall_sum * 100 / len(test_set)
+
+    output = "Precision: {0:.2f}%\nRecall: {1:.2f}%\n".format(precision, recall)
 
     print(output)
-    with open(os.path.join(OUT_DIR, "{0}_prec_recall.txt".format(method)), "w") as f:
+    with open(os.path.join(OUT_DIR, "{0}_prec_recall.txt".format(method_name)), "w") as f:
         f.write(output)
 
     if len(dfs) > 0:
@@ -114,12 +117,33 @@ def main(method: str, file_path: str):
             legend="full",
             alpha=1.0
         )
-        plt.savefig(os.path.join(OUT_DIR, "{0}_plot.png".format(method)))
+        plt.savefig(os.path.join(OUT_DIR, "{0}_plot.png".format(method_name)))
+
+    return precision, recall
+
+
+def main():
+    methods = ["tfidf", "freq", "lsi", "doc2vec"]
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("method", help="the method to compare similarities with", type=str, choices=methods + ["all"])
+    parser.add_argument("ground_truth_file", help="file where ground truth comes from", type=str)
+    args = parser.parse_args()
+
+    if args.method == "all":
+        df = pd.DataFrame(columns=["Engine", "Average Precision", "Average Recall"])
+
+        for i, method in enumerate(methods):
+            print(f"Applying method {method}:")
+            precision, recall = evaluate(method, args.ground_truth_file)
+            df.loc[i, "Engine"] = method
+            df.loc[i, "Average Precision"] = f"{precision:.2f}%"
+            df.loc[i, "Average Recall"] = f"{recall:.2f}%"
+
+        print(df.to_markdown(index=False))
+    else:
+        evaluate(args.method, args.ground_truth_file)
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("method", help="the method to compare similarities with", type=str)
-    parser.add_argument("ground_truth_file", help="file where ground truth comes from", type=str)
-    args = parser.parse_args()
-    main(args.method, args.ground_truth_file)
+    main()
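Since `read_ground_truth` and `better_index` fall outside this diff, the exact scoring criterion is not visible here. The sketch below shows one plausible shape of such an average precision/recall computation for single-answer queries; all names and data in it are hypothetical, not the script's actual logic:

```python
def average_precision_recall(ranked: dict[str, list[int]],
                             expected: dict[str, int],
                             k: int = 10) -> tuple[float, float]:
    """Average precision/recall over queries with exactly one relevant item each."""
    precision_sum = recall_sum = 0.0
    for query, results in ranked.items():
        top_k = results[:k]
        if expected[query] in top_k:
            rank = top_k.index(expected[query])
            precision_sum += 1 / (rank + 1)  # precision at the hit's rank
            recall_sum += 1                  # the single relevant item was found
    n = len(ranked)
    return precision_sum * 100 / n, recall_sum * 100 / n

# q1's answer is found at rank 2, q2's is missed entirely
print(average_precision_recall({"q1": [3, 7, 9], "q2": [4, 1]},
                               {"q1": 7, "q2": 8}))  # (25.0, 50.0)
```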
report/main.tex (new file, 110 lines)
@@ -0,0 +1,110 @@
+%!TEX TS-program = pdflatexmk
+\documentclass{article}
+
+\usepackage{algorithm}
+\usepackage{textcomp}
+\usepackage{xcolor}
+\usepackage{soul}
+\usepackage{booktabs}
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage{microtype}
+\usepackage{rotating}
+\usepackage{graphicx}
+\usepackage{paralist}
+\usepackage{tabularx}
+\usepackage{multicol}
+\usepackage{multirow}
+\usepackage{pbox}
+\usepackage{enumitem}
+\usepackage{colortbl}
+\usepackage{pifont}
+\usepackage{xspace}
+\usepackage{url}
+\usepackage{tikz}
+\usepackage{fontawesome}
+\usepackage{lscape}
+\usepackage{listings}
+\usepackage{color}
+\usepackage{anyfontsize}
+\usepackage{comment}
+\usepackage{multibib}
+\usepackage{float}
+\usepackage{caption}
+\usepackage{subcaption}
+\usepackage{amssymb}
+\usepackage{amsmath}
+\usepackage{hyperref}
+
+\title{Knowledge Management and Analysis \\ Project 01: Code Search}
+\author{Claudio Maggioni}
+\date{}
+
+\begin{document}
+
+\maketitle
+
+\subsection*{Section 1: Data Extraction}
+
+The data extraction process scans through the files in the TensorFlow project to extract Python docstrings and symbol
+names for functions, classes and methods. A summary of the number of features extracted can be found in
+table~\ref{tab:count1}.
+
+Report and comment figures about the extracted data (e.g., number of files; number of code
+entities of different kinds).
+
+\begin{table}[H]
+    \centering \scriptsize
+    \begin{tabular}{cc}
+        \hline
+        Type & Number \\
+        \hline
+        Python files & ? \\
+        Classes & ? \\
+        Functions & ? \\
+        Methods & ? \\
+        \hline
+    \end{tabular}
+    \caption{Count of created classes and properties.}
+    \label{tab:count1}
+\end{table}
+
+\subsection*{Section 2: Training of search engines}
+
+Report and comment an example of a query and the results.
+
+\subsection*{Section 3: Evaluation of search engines}
+
+Using the ground truth provided, evaluate and report recall and average precision for each of the four search engines;
+comment the differences among search engines.
+
+\begin{table}[H]
+    \centering \scriptsize
+    \begin{tabular}{ccc}
+        \hline
+        Engine & Avg Precision & Recall \\
+        \hline
+        Frequencies & ? & ? \\
+        TF-IDF & ? & ? \\
+        LSI & ? & ? \\
+        Doc2Vec & ? & ? \\
+        \hline
+    \end{tabular}
+    \caption{Evaluation of search engines.}
+    \label{tab:tab2}
+\end{table}
+
+\subsection*{Section 4: Visualisation of query results}
+
+Include, comment and compare the t-SNE plots for LSI and for Doc2Vec.
+
+\begin{figure}[H]
+    \begin{center}
+        \includegraphics[width=0.3\textwidth]{Figures/dummy_pic.png}
+        \caption{Caption.}
+        \label{fig:fig1}
+    \end{center}
+\end{figure}
+
+\end{document}
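For reference, the `?` placeholders in table~\ref{tab:count1} presumably correspond to the counts that `extract-data.py` prints (quoted in the README above); if that is indeed their intended source, the filled-in rows would read:

```latex
Python files & 2817 \\
Classes & 1882 \\
Functions & 4565 \\
Methods & 5817 \\
```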
@@ -2,7 +2,8 @@ coloredlogs==15.0.1
 gensim==4.3.2
 nltk==3.8.1
 numpy==1.26.1
-pandas==2.1.1
+pandas==2.1.2
 tqdm==4.66.1
 scikit-learn==1.3.2
 seaborn==0.13.0
+tabulate==0.9.0
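Side note on the `tabulate==0.9.0` addition: pandas' `DataFrame.to_markdown`, which the updated evaluation script calls to print the summary table, delegates markdown rendering to the tabulate package and raises an `ImportError` when it is missing.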
@@ -3,6 +3,7 @@ import logging
 import os
 import re
 import typing
+from collections import defaultdict
 from dataclasses import dataclass
 from typing import Optional
 
@@ -16,7 +17,7 @@ from gensim.models.doc2vec import TaggedDocument, Doc2Vec
 from gensim.similarities import SparseMatrixSimilarity
 from nltk.corpus import stopwords
 
-nltk.download('stopwords')
+nltk.download('stopwords', quiet=True)
 
 SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
 IN_DATASET = os.path.join(SCRIPT_DIR, "data.csv")
@@ -24,32 +25,35 @@ DOC2VEC_MODEL = os.path.join(SCRIPT_DIR, "doc2vec_model.dat")
 
 # using nltk stop words and example words for now
 STOP_WORDS = set(stopwords.words('english')) \
-    .union(['test', 'tests', 'main', 'this', 'self'])
+    .union(['test', 'tests', 'main', 'this', 'self', 'def', 'object', 'false', 'class', 'tuple', 'use', 'default',
+            'none', 'dtype', 'true', 'function', 'returns', 'int', 'get', 'set', 'new', 'return', 'list', 'python',
+            'numpy', 'type', 'name'])
 
 
-def find_all(regex, word):
+def find_all(regex: str, word: str, lower=True) -> list[str]:
     matches = re.finditer(regex, word)
-    return [m.group(0).lower() for m in matches]
+    return [m.group(0).lower() if lower else m.group(0) for m in matches]
 
 
 # https://stackoverflow.com/a/29920015
-def camel_case_split(word):
+def camel_case_split(word: str) -> list[str]:
     return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
 
 
-def identifier_split(identifier):
+def identifier_split(identifier: str) -> list[str]:
     return [y for x in identifier.split("_") for y in camel_case_split(x)]
 
 
-def comment_split(comment):
-    return find_all('[A-Za-z0-9]+', comment)
+def comment_split(comment: str) -> list[str]:
+    # Camel case split within "words" found takes care of referenced type names in the docstring comment
+    return [s for word in find_all('[A-Za-z]+', comment, lower=False) for s in camel_case_split(word)]
 
 
-def remove_stopwords(input_bow_list):
-    return [word for word in input_bow_list if word not in STOP_WORDS]
+def remove_stopwords(input_bow_list: list[str]) -> list[str]:
+    return [word for word in input_bow_list if word not in STOP_WORDS and len(word) > 2]
 
 
-def get_bow(data, split_f):
+def get_bow(data: Optional[float | str], split_f) -> list[str]:
     if data is None or (type(data) == float and np.isnan(data)):
         return []
     return remove_stopwords(split_f(data))
@@ -83,17 +87,31 @@ def print_results(indexes_scores: list[tuple[int, float]], df):
 
 def build_doc2vec_model(corpus_list):
     dvdocs = [TaggedDocument(text, [i]) for i, text in enumerate(corpus_list)]
-    model = Doc2Vec(vector_size=100, epochs=100, sample=1e-5)
+    model = Doc2Vec(vector_size=300, epochs=50, sample=0)
     model.build_vocab(dvdocs)
     model.train(dvdocs, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(DOC2VEC_MODEL)
     return model
 
 
-def load_data() -> pd.DataFrame:
+def load_data(print_frequent=False) -> pd.DataFrame:
     df = pd.read_csv(IN_DATASET, index_col=0)
     df["name_bow"] = df["name"].apply(lambda n: get_bow(n, identifier_split))
     df["comment_bow"] = df["comment"].apply(lambda c: get_bow(c, comment_split))
+
+    if print_frequent:
+        freq = defaultdict(int)
+        for bow in df["name_bow"].tolist():
+            for i in bow:
+                freq[i] += 1
+
+        for bow in df["comment_bow"].tolist():
+            for i in bow:
+                freq[i] += 1
+
+        for key, value in sorted(freq.items(), key=lambda k: k[1], reverse=True)[:100]:
+            print(f"{value}: {key}")
+
     return df
 
 
@@ -164,17 +182,31 @@ def search(query: str, method: str, df: pd.DataFrame) -> SearchResults:
 
 
 def main():
+    methods = ["tfidf", "freq", "lsi", "doc2vec"]
+
     parser = argparse.ArgumentParser()
-    parser.add_argument("method", help="the method to compare similarities with", type=str)
+    parser.add_argument("method", help="the method to compare similarities with", type=str,
+                        choices=methods + ["all"])
     parser.add_argument("query", help="the query to search the corpus with", type=str)
+    parser.add_argument("-v", "--verbose", help="enable verbose logging", action='store_true')
     args = parser.parse_args()
 
+    if args.verbose:
+        coloredlogs.install()
+        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
     df = load_data()
-    results = search(args.query, args.method, df)
-    print_results(results.indexes_scores, df)
+    if args.method == "all":
+        for method in methods:
+            print(f"Applying method {method}:")
+            results = search(args.query, method, df)
+            print_results(results.indexes_scores, df)
+            print()
+    else:
+        results = search(args.query, args.method, df)
+        print_results(results.indexes_scores, df)
 
 
 if __name__ == "__main__":
-    coloredlogs.install()
-    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
     main()
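As a sanity check on the tokenisation helpers changed above, this standalone snippet reproduces the camel-case splitting with the same regex (the example identifiers are made up):

```python
import re

def find_all(regex: str, word: str, lower=True) -> list[str]:
    return [m.group(0).lower() if lower else m.group(0)
            for m in re.finditer(regex, word)]

def camel_case_split(word: str) -> list[str]:
    # split before lower->UPPER and UPPER->UPPERlower boundaries
    return find_all('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)

def identifier_split(identifier: str) -> list[str]:
    return [y for x in identifier.split("_") for y in camel_case_split(x)]

print(camel_case_split("SparseMatrixSimilarity"))  # ['sparse', 'matrix', 'similarity']
print(identifier_split("build_doc2vec_model"))     # ['build', 'doc2vec', 'model']
print(identifier_split("getHTTPResponse"))         # ['get', 'http', 'response']
```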