diff --git a/out/doc2vec_plot.png b/out/doc2vec_plot.png index 64ee0d4d..bda77281 100644 Binary files a/out/doc2vec_plot.png and b/out/doc2vec_plot.png differ diff --git a/out/lsi_plot.png b/out/lsi_plot.png index b51c5500..b8a4952d 100644 Binary files a/out/lsi_plot.png and b/out/lsi_plot.png differ diff --git a/prec-recall.py b/prec-recall.py index 2b8a2928..2644ee84 100644 --- a/prec-recall.py +++ b/prec-recall.py @@ -53,7 +53,7 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]: def plot_df(results, query: str) -> Optional[pd.DataFrame]: if results.vectors is not None and results.query_vector is not None: tsne_vectors = np.array(results.vectors + [results.query_vector]) - tsne = TSNE(n_components=2, perplexity=2, n_iter=3000) + tsne = TSNE(n_components=2, perplexity=1, n_iter=3000) tsne_results = tsne.fit_transform(tsne_vectors) df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind']) df['tsne-2d-one'] = tsne_results[:, 0] diff --git a/report/main.pdf b/report/main.pdf index d466baa8..99c68b2f 100644 Binary files a/report/main.pdf and b/report/main.pdf differ diff --git a/report/main.tex b/report/main.tex index b9b89226..01026458 100644 --- a/report/main.tex +++ b/report/main.tex @@ -82,7 +82,7 @@ Both the word frequency and TF-IDF model identify the correct result (according query) as the first recommendation to output. Both the LSI and Doc2Vec models fail to report the correct function in all 5 results. 
-\begin{figure} +\begin{figure}[b] \small \begin{verbatim} Similarity: 87.29% @@ -119,7 +119,7 @@ Line: 70 \label{fig:search-freq} \end{figure} -\begin{figure} +\begin{figure}[b] \small \begin{verbatim} Similarity: 86.62% @@ -154,7 +154,7 @@ Line: 4736 \label{fig:search-tfidf} \end{figure} -\begin{figure} +\begin{figure}[b] \small \begin{verbatim} Similarity: 92.11% @@ -190,7 +190,7 @@ Line: 129 \label{fig:search-lsi} \end{figure} -\begin{figure} +\begin{figure}[b] \small \begin{verbatim} Similarity: 81.85% @@ -222,14 +222,18 @@ Line: 216 \label{fig:search-doc2vec} \end{figure} -\subsection*{TBD Section 3: Evaluation of search engines} +\subsection*{Section 3: Evaluation of search engines} -Using the ground truth provided, evaluate and report recall and average precision for each of the four search engines; -comment the differences among search engines. +The evaluation over the given ground truth to compute precision, recall, and the T-SNE plots is performed by the script +\texttt{prec-recall.py}. The calculated average precision and recall values are reported in table~\ref{tab:tab2}. -\begin{table} [H] +Precision and recall are quite low for all models, although less so for the word frequency and TF-IDF models. +The word frequency model has the highest precision and recall (27\% and 40\% respectively), while the LSI model has the +lowest precision (4\%) and Doc2Vec has the lowest recall (10\%). + +\begin{table}[H] \centering -\begin{tabular}{cccc} +\begin{tabular}{ccc} \hline Engine & Avg Precision & Recall \\ \hline @@ -245,9 +249,21 @@ Doc2Vec & 10.00\% & 10.00\% \\ \subsection*{TBD Section 4: Visualisation of query results} -Include, comment and compare the t-SNE plots for LSI and for Doc2Vec. +The two-dimensional T-SNE plots (computed with perplexity $= 1$) for the LSI and Doc2Vec models are shown in +figures~\ref{fig:tsne-lsi}~and~\ref{fig:tsne-doc2vec}, respectively. 
-\begin{figure}[H] +The T-SNE plot for the LSI model clearly shows the presence of outliers in the search results. The Doc2Vec plot shows +fewer outliers and more distinct clusters for the results of each query and the query vector itself. + +\begin{figure} +\begin{center} +\includegraphics[width=\textwidth]{../out/lsi_plot} +\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.} +\label{fig:tsne-lsi} +\end{center} +\end{figure} + +\begin{figure} \begin{center} \includegraphics[width=\textwidth]{../out/doc2vec_plot} \caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.} @@ -255,11 +271,4 @@ Include, comment and compare the t-SNE plots for LSI and for Doc2Vec. \end{center} \end{figure} -\begin{figure}[H] -\begin{center} -\includegraphics[width=\textwidth]{../out/lsi_plot} -\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.} -\label{fig:lsi-doc2vec} -\end{center} -\end{figure} \end{document}