diff --git a/.gitignore b/.gitignore index f22a4c82..9917ead1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -env/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -158,4 +157,303 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ \ No newline at end of file +.idea/ +**/.DS_Store +out/model/*.pt + +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Intermediate documents: +*.dvi +*.xdv +*-converted-to.* +# these rules might exclude image files for figures etc. +# *.ps +# *.eps +# *.pdf + +## Generated if empty string is given at "Please type another file name for output:" + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync + +## Build tool directories for auxiliary files +# latexrun +latex.out/ + +## Auxiliary and intermediate files from other packages: +# algorithms +*.alg +*.loa + +# achemso +acs-*.bib + +# amsthm +*.thm + +# beamer +*.nav +*.pre +*.snm +*.vrb + +# changes +*.soc + +# comment +*.cut + +# cprotect +*.cpt + +# elsarticle (documentclass of Elsevier journals) +*.spl + +# endnotes +*.ent + +*.lox + +# feynmf/feynmp +*.mf +*.mp +*.t[1-9] +*.t[1-9][0-9] +*.tfm + +#(r)(e)ledmac/(r)(e)ledpar +*.end +*.?end +*.[1-9] +*.[1-9][0-9] +*.[1-9][0-9][0-9] +*.[1-9]R +*.[1-9][0-9]R +*.[1-9][0-9][0-9]R +*.eledsec[1-9] +*.eledsec[1-9]R +*.eledsec[1-9][0-9] +*.eledsec[1-9][0-9]R +*.eledsec[1-9][0-9][0-9] +*.eledsec[1-9][0-9][0-9]R + +# glossaries +*.acn +*.acr +*.glg +*.glo +*.gls +*.glsdefs +*.lzo +*.lzs +*.slg +*.slo +*.sls + +# uncomment this for glossaries-extra (will ignore makeindex's style files!) +# *.ist + +# gnuplot +*.gnuplot +*.table + +# gnuplottex +*-gnuplottex-* + +# gregoriotex +*.gaux +*.glog +*.gtex + +# htlatex +*.4ct +*.4tc +*.idv +*.lg +*.trc +*.xref + +# hyperref +*.brf + +# knitr +*-concordance.tex +# *.tikz +*-tikzDictionary + +# listings +*.lol + +# luatexja-ruby +*.ltjruby + +# makeidx +*.idx +*.ilg +*.ind + +# minitoc +*.maf +*.mlf +*.mlt +*.mtc[0-9]* +*.slf[0-9]* +*.slt[0-9]* +*.stc[0-9]* + +# minted +_minted* +*.pyg + +# morewrites +*.mw + +# newpax +*.newpax + +# nomencl +*.nlg +*.nlo +*.nls + +# pax +*.pax + +# pdfpcnotes +*.pdfpc + +# sagetex +*.sagetex.sage +*.sagetex.py +*.sagetex.scmd + +# scrwfile +*.wrt + +# svg +svg-inkscape/ + +# sympy +*.sout +*.sympy +sympy-plots-for-*.tex/ + +# pdfcomment +*.upa +*.upb + +# pythontex +*.pytxcode +pythontex-files-*/ + +# tcolorbox +*.listing + +# thmtools +*.loe + +# TikZ & PGF +*.dpth +*.md5 +*.auxlock + +# titletoc +*.ptc + +# todonotes +*.tdo + +# vhistory +*.hst +*.ver + +*.lod + +# xcolor +*.xcp + +# xmpincl +*.xmpi + +# xindy +*.xdy + +# xypic precompiled matrices and outlines +*.xyc +*.xyd + +# endfloat +*.ttt +*.fff + +# Latexian +TSWLatexianTemp* + +## Editors: +# WinEdt +*.bak +*.sav + +# Texpad +.texpadtmp + +# LyX +*.lyx~ + +# Kile +*.backup + +# gummi +.*.swp + +# KBibTeX +*~[0-9]* + +# TeXnicCenter +*.tps + +# auto folder when using emacs and auctex +./auto/* +*.el + +# expex forward references with \gathertags +*-tags.tex + +# standalone packages +*.sta + +# Makeindex log files +*.lpz + +# xwatermark package +*.xwm + +# REVTeX puts footnotes in the bibliography by default, unless the nofootinbib +# option is specified. Footnotes are the stored in a file with suffix Notes.bib. +# Uncomment the next line to have this generated file ignored. +#*Notes.bib \ No newline at end of file diff --git a/README.md b/README.md index 3e1c4e5e..090c0664 100644 --- a/README.md +++ b/README.md @@ -69,3 +69,13 @@ performance of the classifiers in terms of average precision and recall, which a | freq | 27.00% | 40.00% | | lsi | 4.00% | 20.00% | | doc2vec | 10.00% | 10.00% | + +## Report + +To compile the report run: + +```shell +cd report +pdflatex -interaction=nonstopmode -output-directory=. main.tex +pdflatex -interaction=nonstopmode -output-directory=. main.tex +``` \ No newline at end of file diff --git a/out/doc2vec_plot.png b/out/doc2vec_plot.png index 6fa6aa3c..64ee0d4d 100644 Binary files a/out/doc2vec_plot.png and b/out/doc2vec_plot.png differ diff --git a/out/lsi_plot.png b/out/lsi_plot.png index 0f3b14b7..b51c5500 100644 Binary files a/out/lsi_plot.png and b/out/lsi_plot.png differ diff --git a/out/lsi_prec_recall.txt b/out/lsi_prec_recall.txt index 6bdeba6e..b15b7a95 100644 --- a/out/lsi_prec_recall.txt +++ b/out/lsi_prec_recall.txt @@ -1,2 +1,2 @@ -Precision: 4.00% +Precision: 4.50% Recall: 20.00% diff --git a/prec-recall.py b/prec-recall.py index 2869c442..2b8a2928 100644 --- a/prec-recall.py +++ b/prec-recall.py @@ -107,7 +107,7 @@ def evaluate(method_name: str, file_path: str) -> tuple[float, float]: if len(dfs) > 0: df = pd.concat(dfs) - plt.figure(figsize=(20, 16)) + plt.figure(figsize=(12, 10)) sns.scatterplot( x="tsne-2d-one", y="tsne-2d-two", hue="Query", diff --git a/report/main.pdf b/report/main.pdf new file mode 100644 index 00000000..d466baa8 Binary files /dev/null and b/report/main.pdf differ diff --git a/report/main.tex b/report/main.tex index 550f86fb..b9b89226 100644 --- a/report/main.tex +++ b/report/main.tex @@ -16,7 +16,7 @@ \usepackage{multicol} \usepackage{multirow} \usepackage{pbox} -\usepackage{enumitem} +\usepackage{enumitem} \usepackage{colortbl} \usepackage{pifont} \usepackage{xspace} @@ -28,7 +28,6 @@ \usepackage{color} \usepackage{anyfontsize} \usepackage{comment} -\usepackage{soul} \usepackage{multibib} \usepackage{float} \usepackage{caption} @@ -47,23 +46,22 @@ \subsection*{Section 1 - Data Extraction} -The data extraction process scans through the files in the TensorFlow project to extract Python docstrings and symbol -names for functions, classes and methods. A summary of the number of features extracted can be found in -table~\ref{tab:count1}. - -Report and comment figures about the extracted data (e.g., number of files; number of code -entities of different kinds). +The data extraction (implemented in the script \texttt{extract-data.py}) process scans through the files in the +TensorFlow project to extract Python docstrings and symbol names for functions, classes and methods. A summary of the +number of features extracted can be found in table~\ref{tab:count1}. The collected figures show that the number of +classes is more than half the number of files, while the number of functions is about twice the number of files. +Additionally, the data shows that a class has slightly more than 2 methods in it on average. \begin{table}[H] -\centering \scriptsize -\begin{tabular}{cccc} +\centering +\begin{tabular}{cc} \hline Type & Number \\ \hline -Python files & ? \\ -Classes & ? \\ -Functions & ? \\ -Methods & ? \\ +Python files & 2817 \\ +Classes & 1882 \\ +Functions & 4565 \\ +Methods & 5817 \\ \hline \end{tabular} \caption{Count of created classes and properties.} @@ -72,39 +70,196 @@ Methods & ? \\ \subsection*{Section 2: Training of search engines} -Report and comment an example of a query and the results. +The training and model execution of the search engines is implemented in the Python script \texttt{search-data.py}. +The script is able to search a given natural language query among the extracted TensorFlow corpus using four techniques. +These are namely: Word Frequency Similarity, Term-Frequency Inverse Document-Frequency (TF-IDF) Similarity, Latent +Semantic Indexing (LSI), and Doc2Vec. +An example output of results generated from the query ``Gather gpu device info'' for the word frequency, TF-IDF, LSI +and Doc2Vec models are shown in +figures~\ref{fig:search-freq},~\ref{fig:search-tfidf},~\ref{fig:search-lsi}~and~\ref{fig:search-doc2vec} respectively. +Both the word frequency and TF-IDF model identify the correct result (according to the provided ground truth for this +query) as the first recommendation to output. Both the LSI and Doc2Vec models fail to report the correct function in +all 5 results. -\subsection*{Section 3: Evaluation of search engines} +\begin{figure} + \small + \begin{verbatim} +Similarity: 87.29% +Python function: gather_gpu_devices +Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf... +File: tensorflow/tensorflow/tools/test/gpu_info_lib.py +Line: 167 -Using the ground truth provided, evaluate and report recall and average precision for each of the four search engines; comment the differences among search engines. +Similarity: 60.63% +Python function: compute_capability_from_device_desc +Description: Returns the GpuInfo given a DeviceAttributes proto. Args: devi... +File: tensorflow/tensorflow/python/framework/gpu_util.py +Line: 35 +Similarity: 60.30% +Python function: gpu_device_name +Description: Returns the name of a GPU device if available or the empty str... +File: tensorflow/tensorflow/python/framework/test_util.py +Line: 129 + +Similarity: 58.83% +Python function: gather_available_device_info +Description: Gather list of devices available to TensorFlow. Returns: A lis... +File: tensorflow/tensorflow/tools/test/system_info_lib.py +Line: 126 + +Similarity: 57.74% +Python function: gather_memory_info +Description: Gather memory info. +File: tensorflow/tensorflow/tools/test/system_info_lib.py +Line: 70 + \end{verbatim} + \caption{Search result output for the query ``Gather gpu device info'' using the word frequency similarity model.} + \label{fig:search-freq} +\end{figure} + +\begin{figure} + \small + \begin{verbatim} +Similarity: 86.62% +Python function: gather_gpu_devices +Description: Gather gpu device info. Returns: A list of test_log_pb2.GPUInf... +File: tensorflow/tensorflow/tools/test/gpu_info_lib.py +Line: 167 + +Similarity: 66.14% +Python function: gather_memory_info +Description: Gather memory info. +File: tensorflow/tensorflow/tools/test/system_info_lib.py +Line: 70 + +Similarity: 62.52% +Python function: gather_available_device_info +Description: Gather list of devices available to TensorFlow. Returns: A lis... +File: tensorflow/tensorflow/tools/test/system_info_lib.py +Line: 126 + +Similarity: 57.98% +Python function: gather +File: tensorflow/tensorflow/compiler/tf2xla/python/xla.py +Line: 452 + +Similarity: 57.98% +Python function: gather_v2 +File: tensorflow/tensorflow/python/ops/array_ops.py +Line: 4736 + \end{verbatim} + \caption{Search result output for the query ``Gather gpu device info'' using the TF-IDF model.} + \label{fig:search-tfidf} +\end{figure} + +\begin{figure} + \small + \begin{verbatim} +Similarity: 92.11% +Python function: device +Description: Uses gpu when requested and available. +File: tensorflow/tensorflow/python/framework/test_util.py +Line: 1581 + +Similarity: 92.11% +Python function: device +Description: Uses gpu when requested and available. +File: tensorflow/tensorflow/python/keras/testing_utils.py +Line: 925 + +Similarity: 89.04% +Python function: compute_capability_from_device_desc +Description: Returns the GpuInfo given a DeviceAttributes proto. Args: devi... +File: tensorflow/tensorflow/python/framework/gpu_util.py +Line: 35 + +Similarity: 85.96% +Python class: CUDADeviceProperties +File: tensorflow/tensorflow/tools/test/gpu_info_lib.py +Line: 51 + +Similarity: 85.93% +Python function: gpu_device_name +Description: Returns the name of a GPU device if available or the empty str... +File: tensorflow/tensorflow/python/framework/test_util.py +Line: 129 + \end{verbatim} + \caption{Search result output for the query ``Gather gpu device info'' using the LSI model.} + \label{fig:search-lsi} +\end{figure} + +\begin{figure} + \small + \begin{verbatim} +Similarity: 81.85% +Python method: benchmark_gather_nd_op +File: tensorflow/tensorflow/python/kernel_tests/gather_nd_op_test.py +Line: 389 + +Similarity: 81.83% +Python function: gather_hostname +File: tensorflow/tensorflow/tools/test/system_info_lib.py +Line: 66 + +Similarity: 81.07% +Python method: benchmarkNontrivialGatherAxis1XLA +File: tensorflow/tensorflow/compiler/tests/gather_test.py +Line: 210 + +Similarity: 80.53% +Python method: benchmarkNontrivialGatherAxis4 +File: tensorflow/tensorflow/compiler/tests/gather_test.py +Line: 213 + +Similarity: 80.45% +Python method: benchmarkNontrivialGatherAxis4XLA +File: tensorflow/tensorflow/compiler/tests/gather_test.py +Line: 216 + \end{verbatim} + \caption{Search result output for the query ``Gather gpu device info'' using the Doc2Vec model.} + \label{fig:search-doc2vec} +\end{figure} + +\subsection*{TBD Section 3: Evaluation of search engines} + +Using the ground truth provided, evaluate and report recall and average precision for each of the four search engines; +comment the differences among search engines. \begin{table} [H] -\centering \scriptsize +\centering \begin{tabular}{cccc} \hline Engine & Avg Precision & Recall \\ \hline -Frequencies & ? & ? \\ -TD-IDF & ? & ? \\ -LSI & ? & ? \\ -Doc2Vec & ? & ? \\ +Frequencies & 27.00\% & 40.00\% \\ +TD-IDF & 20.00\% & 20.00\% \\ +LSI & 4.00\% & 20.00\% \\ +Doc2Vec & 10.00\% & 10.00\% \\ \hline \end{tabular} \caption{Evaluation of search engines.} \label{tab:tab2} \end{table} -\subsection*{Section 4: Visualisation of query results} +\subsection*{TBD Section 4: Visualisation of query results} Include, comment and compare the t-SNE plots for LSI and for Doc2Vec. \begin{figure}[H] \begin{center} -\includegraphics[width=0.3\textwidth]{Figures/dummy_pic.png} -\caption{Caption.} -\label{fig:fig1} +\includegraphics[width=\textwidth]{../out/doc2vec_plot} +\caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.} +\label{fig:tsne-doc2vec} +\end{center} +\end{figure} + +\begin{figure}[H] +\begin{center} +\includegraphics[width=\textwidth]{../out/lsi_plot} +\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.} +\label{fig:lsi-doc2vec} \end{center} \end{figure} \end{document}