diff --git a/report/main.pdf b/report/main.pdf
index d1459b1a..883481da 100644
Binary files a/report/main.pdf and b/report/main.pdf differ
diff --git a/report/main.tex b/report/main.tex
index cd0918fd..fbe47151 100644
--- a/report/main.tex
+++ b/report/main.tex
@@ -34,6 +34,7 @@
 \usepackage{subcaption}
 \usepackage{amssymb}
 \usepackage{amsmath}
+\usepackage{changepage}
 \usepackage{hyperref}
 \title{Knowledge Management and Analysis \\ Project 01: Code Search}
@@ -42,52 +43,63 @@
 \begin{document}
-\maketitle
+ \maketitle
-\subsection*{Section 1 - Data Extraction}
+ \begin{adjustwidth}{-4cm}{-4cm}
+ \centering
+ \begin{tabular}{cc}
+ \toprule
+ Repository URL & \url{https://github.com/kamclassroom2022/project-01-multi-search-maggicl} \\
+ Commit ID & \texttt{b8e0a2c3c41249e45b233b55607e0b04ebe1bad0} \\ \bottomrule
+ \end{tabular}
+ \end{adjustwidth}
+ \vspace{1cm}
-The data extraction (implemented in the script \texttt{extract-data.py}) process scans through the files in the
-TensorFlow project to extract Python docstrings and symbol names for functions, classes and methods. A summary of the
-number of features extracted can be found in table~\ref{tab:count1}. The collected figures show that the number of
-classes is more than half the number of files, while the number of functions is about twice the number of files.
-Additionally, the data shows that a class has slightly more than 2 methods in it on average.
-\begin{table}[H]
-\centering
-\begin{tabular}{cc}
-\hline
-Type & Number \\
-\hline
-Python files & 2817 \\
-Classes & 1882 \\
-Functions & 4565 \\
-Methods & 5817 \\
-\hline
-\end{tabular}
-\caption{Count of created classes and properties.}
-\label{tab:count1}
-\end{table}
+ \subsection*{Section 1: Data Extraction}
-\subsection*{Section 2: Training of search engines}
+ The data extraction process (implemented in the script \texttt{extract-data.py}) scans through the files in the
+ TensorFlow project and extracts Python docstrings and symbol names for functions, classes and methods. A summary of
+ the number of features extracted can be found in table~\ref{tab:count1}. The collected figures show that the number
+ of classes is about two thirds of the number of files, while the number of functions is about 1.6 times the number
+ of files. Additionally, the data shows that a class contains about three methods on average. A minimal sketch of the
+ extraction approach is given after the table.
-The training and model execution of the search engines is implemented in the Python script \texttt{search-data.py}.
-The training model loads the data extracted by \texttt{extract-data.py} and uses as classification features the
-identifier name and only the first line of the comment docstring. All other comment lines are filtered out as this
-significantly increases performance when evaluating the models.
+ \begin{table}[H]
+ \centering
+ \begin{tabular}{cc}
+ \toprule
+ Type & Number \\
+ \midrule
+ Python files & 2817 \\
+ Classes & 1882 \\
+ Functions & 4565 \\
+ Methods & 5817 \\
+ \bottomrule
+ \end{tabular}
+ \caption{Count of extracted Python files, classes, functions and methods.}
+ \label{tab:count1}
+ \end{table}
-The script is able to search a given natural language query among the extracted TensorFlow corpus using four techniques.
-These are namely: Word Frequency Similarity, Term-Frequency Inverse Document-Frequency (TF-IDF) Similarity, Latent
-Semantic Indexing (LSI), and Doc2Vec. 
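+
+ As an illustration, the following minimal sketch shows how such docstrings and symbol names can be collected with
+ Python's built-in \texttt{ast} module; it is not the actual \texttt{extract-data.py} code, and all names in it are
+ assumptions:
+
+ \begin{verbatim}
+import ast
+import pathlib
+
+def extract_entities(path):
+    """Yield (kind, name, first docstring line) for each definition."""
+    tree = ast.parse(pathlib.Path(path).read_text(encoding="utf-8"))
+    for node in ast.walk(tree):
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef,
+                             ast.ClassDef)):
+            kind = "class" if isinstance(node, ast.ClassDef) else "function"
+            doc = ast.get_docstring(node) or ""
+            first_line = doc.splitlines()[0] if doc else ""
+            yield kind, node.name, first_line
+ \end{verbatim}
+
+ Distinguishing methods from top-level functions additionally requires checking whether a definition is nested
+ inside a class body, which is omitted here for brevity.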
+ \subsection*{Section 2: Training of search engines}
-An example output of results generated from the query ``Gather gpu device info'' for the word frequency, TF-IDF, LSI
-and Doc2Vec models are shown in
-figures~\ref{fig:search-freq},~\ref{fig:search-tfidf},~\ref{fig:search-lsi}~and~\ref{fig:search-doc2vec} respectively.
-All four models are able to correctly report the ground truth required by the file \texttt{ground-truth-unique.txt} as
-the first result with $>90\%$ similarity, with the except of the Doc2Vec model which reports $71.63\%$ similarity.
+ The training and model execution of the search engines is implemented in the Python script \texttt{search-data.py}.
+ The training stage loads the data extracted by \texttt{extract-data.py} and uses as features the identifier name
+ and only the first line of the docstring. All other docstring lines are filtered out, as this significantly
+ improves performance when evaluating the models.
-\begin{figure}[b]
- \small
- \begin{verbatim}
+ The script is able to search a given natural language query over the extracted TensorFlow corpus using four
+ techniques, namely Word Frequency Similarity, Term-Frequency Inverse Document-Frequency (TF-IDF) Similarity,
+ Latent Semantic Indexing (LSI), and Doc2Vec. A sketch of how such models can be built is given later in this
+ section.
+
+ Example outputs generated from the query ``Gather gpu device info'' for the word frequency, TF-IDF, LSI
+ and Doc2Vec models are shown in
+ figures~\ref{fig:search-freq},~\ref{fig:search-tfidf},~\ref{fig:search-lsi}~and~\ref{fig:search-doc2vec} respectively.
+ All four models correctly report the ground truth required by the file \texttt{ground-truth-unique.txt} as the
+ first result, with $>90\%$ similarity in all cases except the Doc2Vec model, which reports $71.63\%$ similarity.
+
+ \begin{figure}[b]
+ \small
+ \begin{verbatim}
Similarity: 90.45% Python function: gather_gpu_devices Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
@@ -118,13 +130,13 @@ Description: Gather list of devices available to TensorFlow. Returns: A lis...
File: tensorflow/tensorflow/tools/test/system_info_lib.py Line: 126 \end{verbatim}
- \caption{Search result output for the query ``Gather gpu device info'' using the word frequency similarity model.}
- \label{fig:search-freq}
-\end{figure}
+ \caption{Search result output for the query ``Gather gpu device info'' using the word frequency similarity model.}
+ \label{fig:search-freq}
+ \end{figure}
-\begin{figure}[b]
- \small
- \begin{verbatim}
+ \begin{figure}[b]
+ \small
+ \begin{verbatim}
Similarity: 90.95% Python function: gather_gpu_devices Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
@@ -154,13 +166,13 @@ Python function: info
File: tensorflow/tensorflow/python/platform/tf_logging.py Line: 167 \end{verbatim}
- \caption{Search result output for the query ``Gather gpu device info'' using the TF-IDF model.}
- \label{fig:search-tfidf}
-\end{figure}
+ \caption{Search result output for the query ``Gather gpu device info'' using the TF-IDF model.}
+ \label{fig:search-tfidf}
+ \end{figure}
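+
+ To make the four techniques concrete, the following minimal sketch builds a TF-IDF index with an LSI layer on top
+ and queries it. It assumes an implementation based on the \texttt{gensim} library and uses a hypothetical stand-in
+ corpus; the report's scripts are not necessarily structured this way:
+
+ \begin{verbatim}
+from gensim import corpora, models, similarities
+
+# One token list per extracted code entity (hypothetical stand-in data).
+docs = [["gather", "gpu", "devices", "gather", "gpu", "device", "info"],
+        ["list", "local", "devices"],
+        ["tpu", "embedding", "host", "devices"]]
+dictionary = corpora.Dictionary(docs)
+bow = [dictionary.doc2bow(d) for d in docs]
+
+tfidf = models.TfidfModel(bow)             # TF-IDF weighting
+lsi = models.LsiModel(tfidf[bow], id2word=dictionary, num_topics=50)
+index = similarities.MatrixSimilarity(lsi[tfidf[bow]])
+
+query = dictionary.doc2bow("gather gpu device info".lower().split())
+sims = index[lsi[tfidf[query]]]            # cosine similarity per document
+ranking = sorted(enumerate(sims), key=lambda x: -x[1])
+ \end{verbatim}
+
+ In this framing, the word frequency model would rank documents by similarity over the raw bag-of-words vectors,
+ while Doc2Vec would replace the whole pipeline with a \texttt{gensim} \texttt{Doc2Vec} model trained on tagged
+ documents.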
-\begin{figure}[b]
- \small
- \begin{verbatim}
+ \begin{figure}[b]
+ \small
+ \begin{verbatim}
Similarity: 98.38% Python function: gather_gpu_devices Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
@@ -190,13 +202,13 @@ Python method: get_var_on_device
File: tensorflow/tensorflow/python/distribute/packed_distributed_variable.py Line: 90 \end{verbatim}
- \caption{Search result output for the query ``Gather gpu device info'' using the LSI model.}
- \label{fig:search-lsi}
-\end{figure}
+ \caption{Search result output for the query ``Gather gpu device info'' using the LSI model.}
+ \label{fig:search-lsi}
+ \end{figure}
-\begin{figure}[b]
- \small
- \begin{verbatim}
+ \begin{figure}[b]
+ \small
+ \begin{verbatim}
Similarity: 71.63% Python function: gather_gpu_devices Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf...
@@ -227,59 +239,59 @@ Description: A list of device names for CPU hosts. Returns: A list of devic...
File: tensorflow/tensorflow/python/tpu/tpu_embedding.py Line: 1011 \end{verbatim}
- \caption{Search result output for the query ``Gather gpu device info'' using the Doc2Vec model.}
- \label{fig:search-doc2vec}
-\end{figure}
+ \caption{Search result output for the query ``Gather gpu device info'' using the Doc2Vec model.}
+ \label{fig:search-doc2vec}
+ \end{figure}
-\subsection*{Section 3: Evaluation of search engines}
+ \subsection*{Section 3: Evaluation of search engines}
-The evaluation over the given ground truth to compute precision, recall, and the T-SNE plots is performed by the script
-\texttt{prec-recall.py}. The calculated average precision and recall values are reported in table~\ref{tab:tab2}.
+ The evaluation over the given ground truth, which computes precision and recall and produces the T-SNE plots, is
+ performed by the script \texttt{prec-recall.py}. The calculated average precision and recall values are reported in
+ table~\ref{tab:tab2}. A sketch of one possible way to compute these metrics is given after the table.
-Precision and recall are quite high for all models.
-The word frequency model has the highest precision and recall ($93.33\%$ and $100.00\%$ respectively), while the Doc2Vec
-model has the lowest precision ($73.33\%$) and lowest recall ($80.00\%$).
+ Precision and recall are quite high for all models.
+ The word frequency model has the highest precision and recall ($93.33\%$ and $100.00\%$ respectively), while the Doc2Vec
+ model has the lowest precision ($73.33\%$) and lowest recall ($80.00\%$).
-\begin{table}[H]
-\centering
-\begin{tabular}{ccc}
-\hline
-Engine & Avg Precision & Recall \\
-\hline
-Frequencies & 93.33\% & 100.00\% \\
-TD-IDF & 90.00\% & 90.00\% \\
-LSI & 90.00\% & 90.00\% \\
-Doc2Vec & 73.33\% & 80.00\% \\
-\hline
-\end{tabular}
-\caption{Evaluation of search engines.}
-\label{tab:tab2}
-\end{table}
+ \begin{table}[H]
+ \centering
+ \begin{tabular}{ccc}
+ \toprule
+ Engine & Avg.\ Precision & Avg.\ Recall \\
+ \midrule
+ Frequencies & 93.33\% & 100.00\% \\
+ TF-IDF & 90.00\% & 90.00\% \\
+ LSI & 90.00\% & 90.00\% \\
+ Doc2Vec & 73.33\% & 80.00\% \\
+ \bottomrule
+ \end{tabular}
+ \caption{Evaluation of search engines.}
+ \label{tab:tab2}
+ \end{table}
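+
+ For reference, the following sketch shows one common convention for these metrics when each query has a single
+ ground-truth item; this convention is an assumption for illustration, and \texttt{prec-recall.py} may use a
+ different definition:
+
+ \begin{verbatim}
+def evaluate(ranked, truth, k=5):
+    """Precision as the reciprocal rank of the ground truth within the
+    top k results; recall as 1 if the ground truth appears there."""
+    hits = [i for i, r in enumerate(ranked[:k], start=1) if r == truth]
+    precision = 1.0 / hits[0] if hits else 0.0
+    recall = 1.0 if hits else 0.0
+    return precision, recall
+
+# Hypothetical per-query data: (ranked result names, ground-truth name).
+runs = [(["gather_gpu_devices", "list_local_devices"], "gather_gpu_devices"),
+        (["info", "gather_hostname"], "gather_hostname")]
+scores = [evaluate(ranked, truth) for ranked, truth in runs]
+avg_precision = sum(p for p, _ in scores) / len(scores)
+avg_recall = sum(r for _, r in scores) / len(scores)
+ \end{verbatim}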
-\subsection*{TBD Section 4: Visualisation of query results}
+ \subsection*{Section 4: Visualisation of query results}
-The two-dimensional T-SNE plots (computed with perplexity $= 2$) for the LSI and Doc2Vec models are respectively in
-figures~\ref{fig:tsne-lsi}~and~\ref{fig:tsne-doc2vec}.
+ The two-dimensional T-SNE plots (computed with perplexity $= 2$) for the LSI and Doc2Vec models are shown in
+ figures~\ref{fig:tsne-lsi}~and~\ref{fig:tsne-doc2vec} respectively.
-The T-SNE plot for the LSI model shows evidently the presence of outliers in the search result. The Doc2Vec plot shows
-fewer outliers and more distinct clusters for the results of each query and the query vector itself. However, even
-considering the good performance for both models, it is hard to distinguish from the plots given distinct ``regions''
-where results and their respective query are located.
+ The T-SNE plot for the LSI model clearly shows the presence of outliers among the search results. The Doc2Vec plot
+ shows fewer outliers and more distinct clusters for the results of each query and for the query vector itself.
+ However, even considering the good performance of both models, it is hard to identify in the plots distinct
+ ``regions'' where the results and their respective query are located. A sketch of how such a projection can be
+ computed is given after the figures.
-\begin{figure}
-\begin{center}
-\includegraphics[width=\textwidth]{../out/lsi_plot}
-\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
-\label{fig:tsne-lsi}
-\end{center}
-\end{figure}
+ \begin{figure}
+ \begin{center}
+ \includegraphics[width=\textwidth]{../out/lsi_plot}
+ \caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
+ \label{fig:tsne-lsi}
+ \end{center}
+ \end{figure}
-\begin{figure}
-\begin{center}
-\includegraphics[width=\textwidth]{../out/doc2vec_plot}
-\caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
-\label{fig:tsne-doc2vec}
-\end{center}
-\end{figure}
+ \begin{figure}
+ \begin{center}
+ \includegraphics[width=\textwidth]{../out/doc2vec_plot}
+ \caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
+ \label{fig:tsne-doc2vec}
+ \end{center}
+ \end{figure}
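+
+ The projection itself can be computed along the following lines. This is a minimal sketch assuming scikit-learn's
+ \texttt{TSNE} and \texttt{matplotlib}, with random stand-in vectors in place of the real LSI or Doc2Vec embeddings
+ and a hypothetical output path:
+
+ \begin{verbatim}
+import numpy as np
+from sklearn.manifold import TSNE
+import matplotlib.pyplot as plt
+
+# Stand-in embeddings: one vector per query and per retrieved result.
+rng = np.random.default_rng(0)
+vectors = rng.random((12, 50))
+labels = [i // 6 for i in range(12)]    # colour points by their query
+
+coords = TSNE(n_components=2, perplexity=2,
+              random_state=0).fit_transform(vectors)
+plt.scatter(coords[:, 0], coords[:, 1], c=labels)
+plt.savefig("tsne_plot.png")
+ \end{verbatim}
+
 \end{document}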