kse-01/prec-recall.py

import argparse
from typing import Iterable, Optional

import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE

search_data = __import__('search-data')

PREFIX: str = "./"


def read_ground_truth(file_path: str, df: pd.DataFrame) -> Iterable[tuple[str, int]]:
    records: list[list[str]] = []

    with open(file_path) as f:
        record_tmp = []
        for line in f:
            line = line.strip()
            if line == '':
                assert len(record_tmp) == 3
                records.append(record_tmp)
                record_tmp = []
            else:
                record_tmp.append(line)

    if len(record_tmp) == 3:
        records.append(record_tmp)

    for query, name, file_name in records:
        assert file_name.startswith(PREFIX)
        file_name = file_name[len(PREFIX):]

        row = df[(df.name == name) & (df.file == file_name)]
        assert len(row) == 1

        yield query, row.index[0]


def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:
    for i, le in enumerate(li):
        if le[0] == e:
            return i

    return None


def plot_df(results, query: str) -> Optional[pd.DataFrame]:
    if results.vectors is not None and results.query_vector is not None:
        tsne_vectors = np.array(results.vectors + [results.query_vector])
        # try perplexity = 1, 1.5, 2
        tsne = TSNE(n_components=2, verbose=1, perplexity=1.5, n_iter=3000)
        tsne_results = tsne.fit_transform(tsne_vectors)
        df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two',  'query', 'is_input'])
        df['tsne-2d-one'] = tsne_results[:, 0]
        df['tsne-2d-two'] = tsne_results[:, 1]
        df['query'] = [query] * (len(results.vectors) + 1)
        df['is_input'] = (['Result'] * len(results.vectors)) + ['Input query']
        return df
    else:
        return None


def main(method: str, file_path: str):
    df = search_data.load_data()
    test_set = list(read_ground_truth(file_path, df))

    precision_sum = 0
    recall_sum = 0

    dfs = []

    for query, expected in tqdm.tqdm(test_set):
        search_results = search_data.search(query, method, df)

        df_q = plot_df(search_results, query)
        if df_q is not None:
            dfs.append(df_q)

        idx = better_index(search_results.indexes_scores, expected)

        if idx is None:
            precision = 0
            recall = 0
        else:
            precision = 1 / (idx + 1)
            recall = 1

        precision_sum += precision
        recall_sum += recall

    print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set)))
    print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set)))

    df = pd.concat(dfs)

    plt.figure(figsize=(4, 4))
    ax = sns.scatterplot(
        x="tsne-2d-one", y="tsne-2d-two",
        hue="query",
        style="is_input",
        palette=sns.color_palette("husl", n_colors=10),
        data=df,
        legend="full",
        alpha=1.0
    )
    plt.show()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("method", help="the method to compare similarities with", type=str)
    parser.add_argument("ground_truth_file", help="file where ground truth comes from", type=str)
    args = parser.parse_args()
    main(args.method, args.ground_truth_file)
part 3 done 2023-10-23 13:42:25 +00:00			`import argparse`
			`from typing import Iterable, Optional`

almost done part 4 2023-10-25 13:10:47 +00:00			`import numpy as np`
part 3 done 2023-10-23 13:42:25 +00:00			`import pandas as pd`
almost done part 4 2023-10-25 13:10:47 +00:00			`import seaborn as sns`
			`import tqdm`
			`from matplotlib import pyplot as plt`
			`from sklearn.manifold import TSNE`
part 3 done 2023-10-23 13:42:25 +00:00
			`search_data = __import__('search-data')`

			`PREFIX: str = "./"`


			`def read_ground_truth(file_path: str, df: pd.DataFrame) -> Iterable[tuple[str, int]]:`
			`records: list[list[str]] = []`

			`with open(file_path) as f:`
			`record_tmp = []`
			`for line in f:`
			`line = line.strip()`
			`if line == '':`
			`assert len(record_tmp) == 3`
			`records.append(record_tmp)`
			`record_tmp = []`
			`else:`
			`record_tmp.append(line)`

			`if len(record_tmp) == 3:`
			`records.append(record_tmp)`

			`for query, name, file_name in records:`
			`assert file_name.startswith(PREFIX)`
			`file_name = file_name[len(PREFIX):]`

			`row = df[(df.name == name) & (df.file == file_name)]`
			`assert len(row) == 1`

			`yield query, row.index[0]`


			`def better_index(li: list[tuple[int, float]], e: int) -> Optional[int]:`
			`for i, le in enumerate(li):`
			`if le[0] == e:`
			`return i`

			`return None`


almost done part 4 2023-10-25 13:10:47 +00:00			`def plot_df(results, query: str) -> Optional[pd.DataFrame]:`
			`if results.vectors is not None and results.query_vector is not None:`
			`tsne_vectors = np.array(results.vectors + [results.query_vector])`
			`# try perplexity = 1, 1.5, 2`
			`tsne = TSNE(n_components=2, verbose=1, perplexity=1.5, n_iter=3000)`
			`tsne_results = tsne.fit_transform(tsne_vectors)`
			`df = pd.DataFrame(columns=['tsne-2d-one', 'tsne-2d-two', 'query', 'is_input'])`
			`df['tsne-2d-one'] = tsne_results[:, 0]`
			`df['tsne-2d-two'] = tsne_results[:, 1]`
			`df['query'] = [query] * (len(results.vectors) + 1)`
			`df['is_input'] = (['Result'] * len(results.vectors)) + ['Input query']`
			`return df`
			`else:`
			`return None`


part 3 done 2023-10-23 13:42:25 +00:00			`def main(method: str, file_path: str):`
			`df = search_data.load_data()`
			`test_set = list(read_ground_truth(file_path, df))`

			`precision_sum = 0`
			`recall_sum = 0`

almost done part 4 2023-10-25 13:10:47 +00:00			`dfs = []`

			`for query, expected in tqdm.tqdm(test_set):`
			`search_results = search_data.search(query, method, df)`

			`df_q = plot_df(search_results, query)`
			`if df_q is not None:`
			`dfs.append(df_q)`

			`idx = better_index(search_results.indexes_scores, expected)`
part 3 done 2023-10-23 13:42:25 +00:00
			`if idx is None:`
			`precision = 0`
			`recall = 0`
			`else:`
			`precision = 1 / (idx + 1)`
			`recall = 1`

			`precision_sum += precision`
			`recall_sum += recall`

			`print("Precision: {0:.2f}%".format(precision_sum * 100 / len(test_set)))`
			`print("Recall: {0:.2f}%".format(recall_sum * 100 / len(test_set)))`

almost done part 4 2023-10-25 13:10:47 +00:00			`df = pd.concat(dfs)`

			`plt.figure(figsize=(4, 4))`
			`ax = sns.scatterplot(`
			`x="tsne-2d-one", y="tsne-2d-two",`
			`hue="query",`
			`style="is_input",`
			`palette=sns.color_palette("husl", n_colors=10),`
			`data=df,`
			`legend="full",`
			`alpha=1.0`
			`)`
			`plt.show()`

part 3 done 2023-10-23 13:42:25 +00:00
			`if __name__ == '__main__':`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("method", help="the method to compare similarities with", type=str)`
			`parser.add_argument("ground_truth_file", help="file where ground truth comes from", type=str)`
			`args = parser.parse_args()`
			`main(args.method, args.ground_truth_file)`