#!/usr/bin/env python3
"""Silhouette evaluation for the k-means and hierarchical clusterings.

For every feature-vector CSV found in ``IN_DIR`` the script prints the
silhouette score of the clusterings stored in ``OUT_DIR``. With
``--validate`` it first sweeps the number of clusters for both algorithms,
stores the per-k scores in ``<class>_silhouette.csv`` and reports the k that
maximises the silhouette for each algorithm.
"""
import argparse
import glob
import os

# NOTE(review): `np` and `AgglomerativeClustering` are not referenced in this
# script; the imports are kept in case something else relies on them.
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

from k_means import cluster_kmeans
from hierarchical import cluster_hierarchical

DIR: str = os.path.dirname(os.path.realpath(__file__))
OUT_DIR: str = DIR + '/clustering'
IN_DIR: str = DIR + '/feature_vectors'
# Exclusive upper bound on the number of clusters tried by validate().
K_MAX: int = 65


def clean_output() -> None:
    """Delete every ``*_silhouette.csv`` file found in ``OUT_DIR``."""
    for stale in glob.glob(OUT_DIR + '/*_silhouette.csv'):
        os.remove(stale)


def validate(path: str, clazz_name: str, autorun: bool) -> None:
    """Sweep the cluster count and report the best k for each algorithm.

    For every k in ``range(2, min(K_MAX, <distinct rows>))`` both algorithms
    are run without saving to disk and their silhouette scores are recorded.
    The score table is written to ``OUT_DIR/<clazz_name>_silhouette.csv``.

    Args:
        path: Feature-vector CSV to cluster.
        clazz_name: Name used to build the output CSV file name.
        autorun: When true, re-run both algorithms with their optimal k
            using the clustering functions' default ``save_to_disk``
            behaviour.

    Raises:
        ValueError: If the dataset has too few distinct feature vectors for
            any k >= 2 to be evaluated.
    """
    # We bound the number of clusters by the number of distinct points in our
    # dataset. To count them, we compute the number of "distinct" feature
    # vectors and we bound to the minimum of K_MAX and this number.
    nodup = pd.read_csv(path, index_col=0).drop_duplicates()
    max_distinct = len(nodup)
    candidates = range(2, min(K_MAX, max_distinct))
    if not candidates:
        # Fail early with a clear message instead of an opaque idxmax error
        # on an empty score table.
        raise ValueError(
            "Not enough distinct feature vectors in " + path
            + " to validate (found " + str(max_distinct) + ")")

    df = pd.DataFrame(columns=['k_means', 'hierarchical'], dtype=float)
    for n in candidates:
        # Each score is stored under the column of the algorithm that
        # produced it; the optimal-k lookups below depend on this pairing.
        X_h, Y_h = cluster_hierarchical(path, n, save_to_disk=False)
        df.loc[n, 'hierarchical'] = silhouette_score(X_h, Y_h)

        X_k, Y_k = cluster_kmeans(path, n, save_to_disk=False)
        df.loc[n, 'k_means'] = silhouette_score(X_k, Y_k)

    # Series.idxmax returns the index label (the k) directly; no positional
    # lookup on a label-indexed Series is needed.
    k_kmeans = int(df['k_means'].idxmax())
    k_hierarchical = int(df['hierarchical'].idxmax())

    print("K_means optimal value: " + str(k_kmeans))
    print("Hierarchical optimal value: " + str(k_hierarchical))

    df.to_csv(OUT_DIR + '/' + clazz_name + '_silhouette.csv')

    if autorun:
        cluster_hierarchical(path, k_hierarchical)
        cluster_kmeans(path, k_kmeans)


def compute_silhouette(path: str, clazz_name: str, suffix: str) -> None:
    """Print the silhouette score of a clustering already stored on disk.

    Labels are taken from the second column of
    ``OUT_DIR/<clazz_name>_<suffix>.csv``; the feature matrix is every column
    of ``path`` except the first.

    Args:
        path: Feature-vector CSV the clustering was computed from.
        clazz_name: Name used to locate the clustering CSV.
        suffix: Algorithm suffix of the clustering CSV (``main`` passes
            ``'kmeans'`` and ``'hierarchical'``).
    """
    df_y = pd.read_csv(OUT_DIR + '/' + clazz_name + '_' + suffix + '.csv')
    Y = df_y.iloc[:, 1].values

    df = pd.read_csv(path)
    # Presumably the first column is an identifier rather than a feature;
    # it is dropped before scoring.
    X = df.drop(df.columns[0], axis=1).to_numpy()

    print("Silhouette for " + suffix + ": " + str(silhouette_score(X, Y)))


def main() -> None:
    """Parse the command line and evaluate every CSV in ``IN_DIR``."""
    parser = argparse.ArgumentParser(description='Compute silhouette metric.')
    parser.add_argument('--validate', action='store_true',
                        help='compute optimal k for each algorithm')
    parser.add_argument('--autorun', action='store_true',
                        help='if validating, computes CSV for optimal '
                             'clustering automatically')
    args = parser.parse_args()

    if args.validate:
        # Remove score tables from previous runs before regenerating them.
        clean_output()

    for f in glob.glob(IN_DIR + '/*.csv'):
        # Strip the extension from the file name to get the class name.
        clazz_name = os.path.basename(f)
        clazz_name = clazz_name[:clazz_name.rfind('.')]
        print(clazz_name)

        if args.validate:
            validate(f, clazz_name, args.autorun)

        compute_silhouette(f, clazz_name, 'kmeans')
        compute_silhouette(f, clazz_name, 'hierarchical')
        print()


if __name__ == '__main__':
    main()