48 lines
1.1 KiB
Python
48 lines
1.1 KiB
Python
|
#!/usr/bin/env python3
|
||
|
import os
|
||
|
import pandas as pd
|
||
|
import glob
|
||
|
|
||
|
|
||
|
# Directory that contains this script; realpath() resolves symlinks first.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Directory scanned for input '*.csv' files.
IN_DIR: str = f'{DIR}/feature_vectors'
# Directory that receives the generated '*_groundtruth.csv' files.
OUT_DIR: str = f'{DIR}/clustering'
|
||
|
|
||
|
|
||
|
def clean_output():
    """Delete every ``*_groundtruth.csv`` file found directly in ``OUT_DIR``."""
    pattern = OUT_DIR + '/*_groundtruth.csv'
    # glob.glob returns a complete list up front, so deleting entries while
    # looping over it is safe.
    for stale_csv in glob.glob(pattern):
        os.remove(stale_csv)
|
||
|
|
||
|
|
||
|
def ground_truth(method_name: str, keywords: list[str]) -> int:
    """Return the 1-based position of the first keyword found in *method_name*.

    Keywords are tried in list order and matched as plain substrings; no
    case folding is done here.

    Args:
        method_name: Name to classify.
        keywords: Ordered keywords; the earliest matching entry wins.

    Returns:
        ``i + 1`` where ``i`` is the index of the first non-empty keyword
        contained in ``method_name``, or ``0`` when no keyword matches.
    """
    for position, key in enumerate(keywords, start=1):
        # An empty string is a substring of every name. Skipping it stops a
        # blank entry from capturing all names, while enumerate still counts
        # it so the ids of the remaining keywords are unchanged.
        if key and key in method_name:
            return position
    return 0
|
||
|
|
||
|
|
||
|
def create_ground_truth(path: str, keywords: list[str]):
    """Write the ground-truth cluster assignment for one input CSV.

    The CSV at *path* is read with its first column as the index and all
    other columns are discarded. Each index entry is lower-cased and passed
    to ``ground_truth``; the result is stored in a ``cluster`` column. The
    frame is written to ``OUT_DIR/<name>_groundtruth.csv``, where ``<name>``
    is the input file name without its extension.

    Args:
        path: Path of the input CSV file.
        keywords: Ordered keywords forwarded to ``ground_truth``.
    """
    # splitext leaves a name without any '.' intact, whereas slicing at
    # rfind('.') (-1 in that case) would chop off its last character.
    clazz_name, _ = os.path.splitext(os.path.basename(path))

    # filter([]) selects no columns, so only the index survives.
    df = pd.read_csv(path, index_col=0).filter([])
    df['cluster'] = df.index.map(lambda m: ground_truth(m.lower(), keywords))

    # to_csv creates the file but not missing parent directories.
    os.makedirs(OUT_DIR, exist_ok=True)
    df.to_csv(OUT_DIR + '/' + clazz_name + '_groundtruth.csv')
|
||
|
|
||
|
|
||
|
def main():
    """Regenerate the ground-truth CSVs for every input file.

    Reads one keyword per line from ``keyword_list.txt`` in ``DIR`` (each
    line trimmed and lower-cased), removes previously generated outputs,
    then processes every ``*.csv`` file found in ``IN_DIR``.
    """
    with open(DIR + '/keyword_list.txt', 'r') as handle:
        # str.strip() with no argument already removes the trailing newline.
        keywords: list[str] = [line.strip().lower() for line in handle]

    clean_output()

    for csv_path in glob.glob(IN_DIR + '/*.csv'):
        create_ground_truth(csv_path, keywords)
|
||
|
|
||
|
|
||
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|