ima01/ground_truth.py

#!/usr/bin/env python3
import os
import pandas as pd
import glob


# Directory containing this script, derived from the real path of __file__.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Feature-vector CSVs are read from IN_DIR; ground-truth CSVs go to OUT_DIR.
IN_DIR: str = f'{DIR}/feature_vectors'
OUT_DIR: str = f'{DIR}/clustering'


def clean_output():
    """Delete every ``*_groundtruth.csv`` file located directly in OUT_DIR."""
    stale_pattern = OUT_DIR + '/*_groundtruth.csv'
    for stale_path in glob.glob(stale_pattern):
        os.remove(stale_path)


def ground_truth(method_name: str, keywords: list[str]) -> int:
    """Return the cluster id of a method name based on a keyword list.

    Keywords are tried in list order and the first one that occurs as a
    substring of ``method_name`` wins. The comparison is case-sensitive, so
    callers that want case-insensitive matching must normalise both sides.

    Args:
        method_name: Name to classify.
        keywords: Ordered keywords; the keyword at index ``i`` maps to
            cluster ``i + 1``. Empty keywords never match.

    Returns:
        The 1-based position of the first matching keyword, or 0 when no
        keyword is contained in ``method_name``.
    """
    for position, key in enumerate(keywords, start=1):
        # An empty string is a substring of every name. Skip it so a blank
        # keyword cannot swallow all methods, but keep counting its position
        # so the remaining keywords retain their cluster ids.
        if key and key in method_name:
            return position

    return 0


def create_ground_truth(path: str, keywords: list[str]):
    """Write the keyword-based cluster labels for one CSV file.

    The first column of the CSV at ``path`` is read as the index. Each index
    value is lower-cased and passed to ``ground_truth`` as a method name; the
    resulting id is stored in a ``cluster`` column. The output, containing
    only the index and that column, is saved to
    ``OUT_DIR/<file name without extension>_groundtruth.csv``.

    Args:
        path: Path of the input CSV file.
        keywords: Ordered keyword list forwarded to ``ground_truth``.
    """
    # splitext leaves a name without an extension intact, whereas slicing at
    # rfind('.') would drop its last character when no dot is present.
    clazz_name, _ = os.path.splitext(os.path.basename(path))

    # filter([]) discards every data column and keeps only the index.
    df = pd.read_csv(path, index_col=0).filter([])
    df['cluster'] = df.index.map(lambda m: ground_truth(m.lower(), keywords))
    df.to_csv(OUT_DIR + '/' + clazz_name + '_groundtruth.csv')


def main():
    """Regenerate the ground-truth CSV for every CSV file in IN_DIR.

    Reads one keyword per line from ``keyword_list.txt`` next to this script
    (stripped and lower-cased), removes ground-truth files left over from a
    previous run, then writes a fresh ground-truth file per input CSV.
    """
    with open(DIR + '/keyword_list.txt', 'r') as f:
        # strip() already removes the trailing newline along with any other
        # surrounding whitespace.
        keywords: list[str] = [line.strip().lower() for line in f]

    # Writing the results fails if the output directory is missing, so make
    # sure it exists before anything is written.
    os.makedirs(OUT_DIR, exist_ok=True)
    clean_output()

    for csv_path in glob.glob(IN_DIR + '/*.csv'):
        create_ground_truth(csv_path, keywords)


# Run the generation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()