#!/usr/bin/env python3
"""Generate keyword-based ground-truth cluster labels for feature-vector CSVs.

For every ``*.csv`` file in ``IN_DIR`` the script reads the row index,
assigns each entry a cluster id based on the first keyword from
``keyword_list.txt`` that occurs in its lower-cased name, and writes the
result to ``OUT_DIR/<name>_groundtruth.csv``.
"""
import glob
import os

import pandas as pd

# All paths are resolved relative to the directory containing this script.
DIR: str = os.path.dirname(os.path.realpath(__file__))
IN_DIR: str = DIR + '/feature_vectors'
OUT_DIR: str = DIR + '/clustering'


def clean_output() -> None:
    """Delete every previously generated ``*_groundtruth.csv`` in ``OUT_DIR``."""
    for stale in glob.glob(OUT_DIR + '/*_groundtruth.csv'):
        os.remove(stale)


def ground_truth(method_name: str, keywords: list[str]) -> int:
    """Return the 1-based position of the first keyword found in *method_name*.

    The comparison is a plain case-sensitive substring test, so callers are
    expected to normalise case on both sides beforehand.

    Args:
        method_name: Name to classify.
        keywords: Candidate keywords, in priority order.

    Returns:
        ``i + 1`` for the first ``keywords[i]`` contained in *method_name*,
        or ``0`` when no keyword matches.
    """
    for label, key in enumerate(keywords, start=1):
        # An empty keyword (e.g. a blank line in the keyword file) is a
        # substring of every name; skip it so it cannot swallow all names.
        if key and key in method_name:
            return label
    return 0


def create_ground_truth(path: str, keywords: list[str]) -> None:
    """Write the ground-truth cluster file for one feature-vector CSV.

    Args:
        path: Path of the input CSV; its first column is used as the index.
        keywords: Keywords passed on to :func:`ground_truth`.
    """
    # splitext leaves a name without an extension intact instead of
    # chopping off its last character.
    clazz_name = os.path.splitext(os.path.basename(path))[0]

    # filter([]) drops every data column and keeps only the index.
    df = pd.read_csv(path, index_col=0).filter([])
    # str() guards against index values that pandas parsed as non-strings.
    df['cluster'] = df.index.map(
        lambda m: ground_truth(str(m).lower(), keywords)
    )

    # Make sure the target directory exists before writing.
    os.makedirs(OUT_DIR, exist_ok=True)
    df.to_csv(OUT_DIR + '/' + clazz_name + '_groundtruth.csv')


def main() -> None:
    """Regenerate the ground-truth files for every CSV in ``IN_DIR``."""
    with open(DIR + '/keyword_list.txt', 'r') as keyword_file:
        # One keyword per line. Blank lines are kept (and later ignored by
        # ground_truth) so that a cluster id always equals the 1-based line
        # number of its keyword.
        keywords: list[str] = [line.strip().lower() for line in keyword_file]

    clean_output()
    for csv_path in glob.glob(IN_DIR + '/*.csv'):
        create_ground_truth(csv_path, keywords)


if __name__ == '__main__':
    main()