This repository has been archived on 2023-06-18. You can view files and clone it, but cannot push or open issues or pull requests.
ima01/silhouette.py

96 lines
2.9 KiB
Python
Executable File

#!/usr/bin/env python3
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import numpy as np
import glob
import os
import pandas as pd
import argparse
from k_means import cluster_kmeans
from hierarchical import cluster_hierarchical
# Directory that contains this script (symlinks resolved); every data path
# below is derived from it.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Folder holding the clustering CSVs and the generated silhouette tables.
OUT_DIR: str = f'{DIR}/clustering'
# Folder scanned for the input feature-vector CSV files.
IN_DIR: str = f'{DIR}/feature_vectors'
# Exclusive upper bound on the number of clusters tried by validate().
K_MAX: int = 65
def clean_output():
    """Delete every ``*_silhouette.csv`` file located directly in OUT_DIR."""
    pattern = OUT_DIR + '/*_silhouette.csv'
    for stale_csv in glob.glob(pattern):
        os.remove(stale_csv)
def validate(path: str, clazz_name: str, autorun: bool):
    """Sweep the cluster count and record silhouette scores for both algorithms.

    For every candidate ``k`` the data in ``path`` is clustered with both
    ``cluster_hierarchical`` and ``cluster_kmeans`` (with ``save_to_disk=False``)
    and the silhouette score of each result is stored in a table indexed by
    ``k``. The best ``k`` per algorithm is printed and the table is written to
    ``OUT_DIR/<clazz_name>_silhouette.csv``.

    Args:
        path: CSV file of feature vectors; its first column is read as the index.
        clazz_name: Prefix used for the name of the output CSV.
        autorun: When true, both algorithms are run once more with their best
            ``k`` and their default ``save_to_disk`` setting (presumably
            persisting the clustering -- confirm in k_means.py/hierarchical.py).

    Raises:
        ValueError: If there are too few distinct feature vectors to try any
            cluster count.
    """
    df = pd.DataFrame(columns=['k_means', 'hierarchical'], dtype=float)

    # We bound the number of clusters by the number of distinct points in our
    # dataset. To count them, we compute the number of "distinct" feature
    # vectors and we bound to the minimum of K_MAX and this number. The bound
    # is exclusive, so the largest k tried is min(K_MAX, max_distinct) - 1.
    nodup = pd.read_csv(path, index_col=0).drop_duplicates()
    max_distinct = len(nodup)
    k_stop = min(K_MAX, max_distinct)
    if k_stop <= 2:
        # Without this guard the loop below would not run and idxmax() would
        # fail on an empty table with an unhelpful message.
        raise ValueError(
            "Not enough distinct feature vectors in " + path
            + " to evaluate any number of clusters (found "
            + str(max_distinct) + ")")

    for n in range(2, k_stop):
        # Each score goes in the column of the algorithm that produced it.
        X_h, Y_h = cluster_hierarchical(path, n, save_to_disk=False)
        df.loc[n, 'hierarchical'] = silhouette_score(X_h, Y_h)

        X_k, Y_k = cluster_kmeans(path, n, save_to_disk=False)
        df.loc[n, 'k_means'] = silhouette_score(X_k, Y_k)

    # Series.idxmax() returns the index label (the k value) directly, avoiding
    # the deprecated positional ``[0]`` lookup on a label-indexed Series.
    k_kmeans = int(df['k_means'].idxmax())
    k_hierarchical = int(df['hierarchical'].idxmax())

    print("K_means optimal value: " + str(k_kmeans))
    print("Hierarchical optimal value: " + str(k_hierarchical))

    df.to_csv(OUT_DIR + '/' + clazz_name + '_silhouette.csv')

    if autorun:
        cluster_hierarchical(path, k_hierarchical)
        cluster_kmeans(path, k_kmeans)
def compute_silhouette(path: str, clazz_name: str, suffix: str):
    """Print the silhouette score of a clustering previously saved to disk.

    Labels are taken from the second column of
    ``OUT_DIR/<clazz_name>_<suffix>.csv``; the samples are every column of
    ``path`` except the first one.
    """
    labels_file = OUT_DIR + '/' + clazz_name + '_' + suffix + '.csv'
    Y = pd.read_csv(labels_file).iloc[:, 1].values

    features = pd.read_csv(path)
    first_column = features.columns[0]
    X = features.drop(first_column, axis=1).to_numpy()

    score = silhouette_score(X, Y)
    print("Silhouette for " + suffix + ": " + str(score))
def main():
    """Parse the command line and report silhouette scores for every class.

    Each ``*.csv`` file in IN_DIR is treated as one class. With ``--validate``
    the old silhouette tables are removed first and ``validate`` is run on each
    file before the scores of the saved k-means and hierarchical clusterings
    are printed.
    """
    parser = argparse.ArgumentParser(description='Compute silhouette metric.')
    parser.add_argument('--validate', action='store_true',
                        help='compute optimal k for each algorithm')
    parser.add_argument('--autorun', action='store_true',
                        help='if validating, computes CSV for optimal clustering automatically')
    args = parser.parse_args()

    if args.validate:
        clean_output()

    for csv_path in glob.glob(IN_DIR + '/*.csv'):
        # Class name = file name without its final extension.
        clazz_name = os.path.basename(csv_path).rpartition('.')[0]
        print(clazz_name)

        if args.validate:
            validate(csv_path, clazz_name, args.autorun)

        for algorithm in ('kmeans', 'hierarchical'):
            compute_silhouette(csv_path, clazz_name, algorithm)
        print()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()