report done
This commit is contained in:
parent
43437c2bed
commit
d8815d46f9
5 changed files with 28 additions and 19 deletions
Binary file not shown.
Before Width: | Height: | Size: 78 KiB After Width: | Height: | Size: 77 KiB |
BIN
out/lsi_plot.png
BIN
out/lsi_plot.png
Binary file not shown.
Before Width: | Height: | Size: 69 KiB After Width: | Height: | Size: 71 KiB |
|
@ -53,7 +53,7 @@ def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
|
||||||
def plot_df(results, query: str) -> Optional[pd.DataFrame]:
|
def plot_df(results, query: str) -> Optional[pd.DataFrame]:
|
||||||
if results.vectors is not None and results.query_vector is not None:
|
if results.vectors is not None and results.query_vector is not None:
|
||||||
tsne_vectors = np.array(results.vectors + [results.query_vector])
|
tsne_vectors = np.array(results.vectors + [results.query_vector])
|
||||||
tsne = TSNE(n_components=2, perplexity=2, n_iter=3000)
|
tsne = TSNE(n_components=2, perplexity=1, n_iter=3000)
|
||||||
tsne_results = tsne.fit_transform(tsne_vectors)
|
tsne_results = tsne.fit_transform(tsne_vectors)
|
||||||
df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind'])
|
df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'Query', 'Vector kind'])
|
||||||
df['tsne-2d-one'] = tsne_results[:, 0]
|
df['tsne-2d-one'] = tsne_results[:, 0]
|
||||||
|
|
BIN
report/main.pdf
BIN
report/main.pdf
Binary file not shown.
|
@ -82,7 +82,7 @@ Both the word frequency and TF-IDF model identify the correct result (according
|
||||||
query) as the first recommendation to output. Both the LSI and Doc2Vec models fail to report the correct function in
|
query) as the first recommendation to output. Both the LSI and Doc2Vec models fail to report the correct function in
|
||||||
all 5 results.
|
all 5 results.
|
||||||
|
|
||||||
\begin{figure}
|
\begin{figure}[b]
|
||||||
\small
|
\small
|
||||||
\begin{verbatim}
|
\begin{verbatim}
|
||||||
Similarity: 87.29%
|
Similarity: 87.29%
|
||||||
|
@ -119,7 +119,7 @@ Line: 70
|
||||||
\label{fig:search-freq}
|
\label{fig:search-freq}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
\begin{figure}
|
\begin{figure}[b]
|
||||||
\small
|
\small
|
||||||
\begin{verbatim}
|
\begin{verbatim}
|
||||||
Similarity: 86.62%
|
Similarity: 86.62%
|
||||||
|
@ -154,7 +154,7 @@ Line: 4736
|
||||||
\label{fig:search-tfidf}
|
\label{fig:search-tfidf}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
\begin{figure}
|
\begin{figure}[b]
|
||||||
\small
|
\small
|
||||||
\begin{verbatim}
|
\begin{verbatim}
|
||||||
Similarity: 92.11%
|
Similarity: 92.11%
|
||||||
|
@ -190,7 +190,7 @@ Line: 129
|
||||||
\label{fig:search-lsi}
|
\label{fig:search-lsi}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
\begin{figure}
|
\begin{figure}[b]
|
||||||
\small
|
\small
|
||||||
\begin{verbatim}
|
\begin{verbatim}
|
||||||
Similarity: 81.85%
|
Similarity: 81.85%
|
||||||
|
@ -222,14 +222,18 @@ Line: 216
|
||||||
\label{fig:search-doc2vec}
|
\label{fig:search-doc2vec}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
\subsection*{TBD Section 3: Evaluation of search engines}
|
\subsection*{Section 3: Evaluation of search engines}
|
||||||
|
|
||||||
Using the ground truth provided, evaluate and report recall and average precision for each of the four search engines;
|
The evaluation over the given ground truth, which computes precision and recall and produces the T-SNE plots, is performed by the script
|
||||||
comment the differences among search engines.
|
\texttt{prec-recall.py}. The calculated average precision and recall values are reported in table~\ref{tab:tab2}.
|
||||||
|
|
||||||
|
Precision and recall are quite low for all models, though less so for the word frequency and TF-IDF models.
|
||||||
|
The word frequency model has the highest precision and recall (27\% and 40\% respectively), while the LSI model has the
|
||||||
|
lowest precision (4\%) and Doc2Vec has the lowest recall (10\%).
|
||||||
|
|
||||||
\begin{table}[H]
|
\begin{table}[H]
|
||||||
\centering
|
\centering
|
||||||
\begin{tabular}{cccc}
|
\begin{tabular}{ccc}
|
||||||
\hline
|
\hline
|
||||||
Engine & Avg Precision & Recall \\
|
Engine & Avg Precision & Recall \\
|
||||||
\hline
|
\hline
|
||||||
|
@ -245,9 +249,21 @@ Doc2Vec & 10.00\% & 10.00\% \\
|
||||||
|
|
||||||
\subsection*{TBD Section 4: Visualisation of query results}
|
\subsection*{TBD Section 4: Visualisation of query results}
|
||||||
|
|
||||||
Include, comment and compare the t-SNE plots for LSI and for Doc2Vec.
|
The two-dimensional T-SNE plots (computed with perplexity $= 1$) for the LSI and Doc2Vec models are shown, respectively, in
|
||||||
|
figures~\ref{fig:tsne-lsi}~and~\ref{fig:tsne-doc2vec}.
|
||||||
|
|
||||||
\begin{figure}[H]
|
The T-SNE plot for the LSI model clearly shows the presence of outliers in the search results. The Doc2Vec plot shows
|
||||||
|
fewer outliers and more distinct clusters for the results of each query and the query vector itself.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\begin{center}
|
||||||
|
\includegraphics[width=\textwidth]{../out/lsi_plot}
|
||||||
|
\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
|
||||||
|
\label{fig:tsne-lsi}
|
||||||
|
\end{center}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
\begin{center}
|
\begin{center}
|
||||||
\includegraphics[width=\textwidth]{../out/doc2vec_plot}
|
\includegraphics[width=\textwidth]{../out/doc2vec_plot}
|
||||||
\caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
|
\caption{T-SNE plot for the Doc2Vec model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
|
||||||
|
@ -255,11 +271,4 @@ Include, comment and compare the t-SNE plots for LSI and for Doc2Vec.
|
||||||
\end{center}
|
\end{center}
|
||||||
\end{figure}
|
\end{figure}
|
||||||
|
|
||||||
\begin{figure}[H]
|
|
||||||
\begin{center}
|
|
||||||
\includegraphics[width=\textwidth]{../out/lsi_plot}
|
|
||||||
\caption{T-SNE plot for the LSI model over the queries and ground truths given in \texttt{ground-truth-unique.txt}.}
|
|
||||||
\label{fig:lsi-doc2vec}
|
|
||||||
\end{center}
|
|
||||||
\end{figure}
|
|
||||||
\end{document}
|
\end{document}
|
||||||
|
|
Loading…
Reference in a new issue