This repository has been archived on 2023-06-18. You can view files and clone it, but cannot push or open issues or pull requests.
ima01/ground_truth.py

48 lines
1.1 KiB
Python
Executable File

#!/usr/bin/env python3
import os
import pandas as pd
import glob
# Directory that contains this script (symlinks resolved); every data path
# below is anchored here so the script works from any working directory.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Folder scanned for the input CSV files.
IN_DIR: str = f'{DIR}/feature_vectors'
# Folder that receives the generated *_groundtruth.csv files.
OUT_DIR: str = f'{DIR}/clustering'
def clean_output():
    """Delete every previously generated ground-truth CSV from OUT_DIR.

    Only files directly inside OUT_DIR whose name ends in
    ``_groundtruth.csv`` are removed; anything else is left untouched.
    """
    stale_pattern = OUT_DIR + '/*_groundtruth.csv'
    for stale_file in glob.glob(stale_pattern):
        os.remove(stale_file)
def ground_truth(method_name: str, keywords: list[str]) -> int:
    """Return the 1-based position of the first keyword found in *method_name*.

    Keywords are tried in list order with a case-sensitive substring test, so
    an earlier keyword wins when several match. Callers that want a
    case-insensitive match must normalise both arguments beforehand.

    Args:
        method_name: Text to search in.
        keywords: Candidate substrings; the position in this list (starting
            at 1) is the value returned for a match.

    Returns:
        The 1-based index of the first matching keyword, or 0 when no
        keyword occurs in *method_name*.
    """
    for position, key in enumerate(keywords, start=1):
        # The empty string is a substring of every string; skipping it keeps
        # a blank keyword entry from claiming every method, while the
        # positions of the remaining keywords stay unchanged.
        if key and key in method_name:
            return position
    return 0
def create_ground_truth(path: str, keywords: list[str]) -> None:
    """Write the keyword-based ground-truth clustering for one input CSV.

    The CSV at *path* is read with its first column as the index. Each index
    label is lower-cased and mapped to a cluster number by ``ground_truth``.
    The result, consisting of the index and a single ``cluster`` column, is
    written to ``OUT_DIR/<name>_groundtruth.csv``, where ``<name>`` is the
    input file name without its last extension.

    Args:
        path: Path of the input CSV file.
        keywords: Keywords forwarded to ``ground_truth``.
    """
    file_name = os.path.basename(path)
    # Strip only the last extension. rpartition reports whether a dot was
    # found, so a name without any dot is kept whole instead of losing its
    # final character (which slicing with rfind's -1 result would do).
    stem, dot, _ = file_name.rpartition('.')
    clazz_name = stem if dot else file_name
    # filter([]) discards every data column and leaves just the index.
    df = pd.read_csv(path, index_col=0).filter([])
    # NOTE(review): index labels are assumed to be strings (.lower() is
    # called on each one) - confirm against the input files.
    df['cluster'] = df.index.map(lambda m: ground_truth(m.lower(), keywords))
    df.to_csv(OUT_DIR + '/' + clazz_name + '_groundtruth.csv')
def main() -> None:
    """Regenerate the ground-truth CSV for every input file in IN_DIR.

    Reads one keyword per line from ``keyword_list.txt`` next to this script
    (surrounding whitespace removed, lower-cased), deletes the ground-truth
    files left over from an earlier run, and then processes each ``*.csv``
    file found directly inside IN_DIR.
    """
    with open(DIR + '/keyword_list.txt', 'r') as f:
        # strip() already removes the trailing newline, so no second strip
        # is needed; iterating the file yields the same lines as readlines().
        keywords: list[str] = [line.strip().lower() for line in f]
    # DataFrame.to_csv does not create missing directories, so make sure the
    # output folder exists before anything is written into it.
    os.makedirs(OUT_DIR, exist_ok=True)
    clean_output()
    for csv_path in glob.glob(IN_DIR + '/*.csv'):
        create_ground_truth(csv_path, keywords)
# Run the generation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()