report done

This commit is contained in:
Claudio Maggioni 2023-11-07 15:07:15 +01:00
parent 43437c2bed
commit d8815d46f9
5 changed files with 28 additions and 19 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 78 KiB

After

Width:  |  Height:  |  Size: 77 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 69 KiB

After

Width:  |  Height:  |  Size: 71 KiB

View file

@ -53,7 +53,7 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
def plot_df(results, query: str) -> Optional[pd.DataFrame]: def plot_df(results, query: str) -> Optional[pd.DataFrame]:
if results.vectors is not None and results.query_vector is not None: if results.vectors is not None and results.query_vector is not None:
tsne_vectors = np.array(results.vectors + [results.query_vector]) tsne_vectors = np.array(results.vectors + [results.query_vector])
tsne = TSNE(n_components=2, perplexity=2, n_iter=3000) tsne = TSNE(n_components=2, perplexity=1, n_iter=3000)
tsne_results = tsne.fit_transform(tsne_vectors) tsne_results = tsne.fit_transform(tsne_vectors)
df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind']) df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind'])
df['tsne-2d-one'] = tsne_results[:, 0] df['tsne-2d-one'] = tsne_results[:, 0]

Binary file not shown.

View file

@ -82,7 +82,7 @@ Both the word frequency and TF-IDF model identify the correct result (according
query) as the first recommendation to output. Both the LSI and Doc2Vec models fail to report the correct function in query) as the first recommendation to output. Both the LSI and Doc2Vec models fail to report the correct function in
all 5 results. all 5 results.
\begin{figure} \begin{figure}[b]
\small \small
\begin{verbatim} \begin{verbatim}
Similarity: 87.29% Similarity: 87.29%
@ -119,7 +119,7 @@ Line: 70
\label{fig:search-freq} \label{fig:search-freq}
\end{figure} \end{figure}
\begin{figure} \begin{figure}[b]
\small \small
\begin{verbatim} \begin{verbatim}
Similarity: 86.62% Similarity: 86.62%
@ -154,7 +154,7 @@ Line: 4736
\label{fig:search-tfidf} \label{fig:search-tfidf}
\end{figure} \end{figure}
\begin{figure} \begin{figure}[b]
\small \small
\begin{verbatim} \begin{verbatim}
Similarity: 92.11% Similarity: 92.11%
@ -190,7 +190,7 @@ Line: 129
\label{fig:search-lsi} \label{fig:search-lsi}
\end{figure} \end{figure}
\begin{figure} \begin{figure}[b]
\small \small
\begin{verbatim} \begin{verbatim}
Similarity: 81.85% Similarity: 81.85%
@ -222,14 +222,18 @@ Line: 216
\label{fig:search-doc2vec} \label{fig:search-doc2vec}
\end{figure} \end{figure}
\subsection*{TBD Section 3: Evaluation of search engines} \subsection*{Section 3: Evaluation of search engines}
Using the ground truth provided, evaluate and report recall and average precision for each of the four search engines; The evaluation over the given ground truth to compute precision, recall, and the T-SNE plots is performed by the script
comment the differences among search engines. \texttt{prec-recall.py}. The calculated average precision and recall values are reported in table~\ref{tab:tab2}.
Precision and recall are quite low for all models, although less so for the word frequency and TF-IDF models.
The word frequency model has the highest precision and recall (27\% and 40\% respectively), while the LSI model has the
lowest precision (4\%) and Doc2Vec has the lowest recall (10\%).
\begin{table}[H] \begin{table}[H]
\centering \centering
\begin{tabular}{cccc} \begin{tabular}{ccc}
\hline \hline
Engine & Avg Precision & Recall \\ Engine & Avg Precision & Recall \\
\hline \hline
@ -245,9 +249,21 @@ Doc2Vec & 10.00\% & 10.00\% \\
\subsection*{Section 4: Visualisation of query results} \subsection*{Section 4: Visualisation of query results}
Include, comment and compare the t-SNE plots for LSI and for Doc2Vec. The two-dimensional T-SNE plots (computed with perplexity $= 1$) for the LSI and Doc2Vec models are shown in
figures~\ref{fig:tsne-lsi}~and~\ref{fig:tsne-doc2vec}, respectively.
\begin{figure}[H] The T-SNE plot for the LSI model clearly shows the presence of outliers in the search results. The Doc2Vec plot shows
fewer outliers and more distinct clusters for the results of each query and the query vector itself.
\begin{figure}
\begin{center}
\includegraphics[width=\textwidth]{../out/lsi_plot}
\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
\label{fig:tsne-lsi}
\end{center}
\end{figure}
\begin{figure}
\begin{center} \begin{center}
\includegraphics[width=\textwidth]{../out/doc2vec_plot} \includegraphics[width=\textwidth]{../out/doc2vec_plot}
\caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.} \caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
@ -255,11 +271,4 @@ Include, comment and compare the t-SNE plots for LSI and for Doc2Vec.
\end{center} \end{center}
\end{figure} \end{figure}
\begin{figure}[H]
\begin{center}
\includegraphics[width=\textwidth]{../out/lsi_plot}
\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
\label{fig:lsi-doc2vec}
\end{center}
\end{figure}
\end{document} \end{document}