95 lines
2.9 KiB
Python
Executable file
95 lines
2.9 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
from sklearn.cluster import AgglomerativeClustering
|
|
from sklearn.metrics import silhouette_score
|
|
import numpy as np
|
|
import glob
|
|
import os
|
|
import pandas as pd
|
|
import argparse
|
|
from k_means import cluster_kmeans
|
|
from hierarchical import cluster_hierarchical
|
|
|
|
# Directory that holds this script (symlinks resolved); the paths below are
# built relative to it.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Silhouette tables are written here, and clustering label CSVs are read from here.
OUT_DIR: str = f'{DIR}/clustering'
# Input feature-vector CSVs; each file stem is used as the class name.
IN_DIR: str = f'{DIR}/feature_vectors'

# Exclusive upper bound on the number of clusters tried during validation.
K_MAX: int = 65
|
|
|
|
|
|
def clean_output():
    """Delete every ``*_silhouette.csv`` file located directly in ``OUT_DIR``."""
    for stale_report in glob.glob(OUT_DIR + '/*_silhouette.csv'):
        os.remove(stale_report)
|
|
|
|
|
|
def validate(path: str, clazz_name: str, autorun: bool):
    """Sweep the cluster count and record silhouette scores for both algorithms.

    For every candidate ``n`` the feature vectors in ``path`` are clustered with
    hierarchical clustering and with k-means (without saving to disk), and the
    silhouette score of each result is stored in the matching column of a
    table. The best ``n`` per algorithm is printed and the table is written to
    ``OUT_DIR/<clazz_name>_silhouette.csv``.

    Args:
        path: CSV file of feature vectors; its first column is read as the index.
        clazz_name: Name used to build the output CSV file name.
        autorun: When true, both algorithms are run once more with their best
            ``n`` using the helpers' default ``save_to_disk`` setting
            (presumably persisting the clustering -- confirm in the helpers).

    If there are too few distinct feature vectors to try any ``n`` (the sweep
    range is empty), a message is printed and nothing is written.
    """
    # We bound the number of clusters by the number of distinct points in our
    # dataset: count the distinct feature vectors and cap the sweep at the
    # minimum of K_MAX and that count (the upper bound itself is excluded).
    distinct_vectors = pd.read_csv(path, index_col=0).drop_duplicates()
    max_distinct = len(distinct_vectors)
    upper_bound = min(K_MAX, max_distinct)

    # The sweep starts at 2 clusters; with an upper bound of 2 or less there is
    # no candidate to score and idxmax() below would fail on an empty table.
    if upper_bound <= 2:
        print("Not enough distinct feature vectors to validate: "
              + str(max_distinct))
        return

    scores = pd.DataFrame(columns=['k_means', 'hierarchical'], dtype=float)

    for n in range(2, upper_bound):
        # Each score goes into the column named after the algorithm that
        # produced the labels.
        X_h, Y_h = cluster_hierarchical(path, n, save_to_disk=False)
        scores.loc[n, 'hierarchical'] = silhouette_score(X_h, Y_h)

        X_k, Y_k = cluster_kmeans(path, n, save_to_disk=False)
        scores.loc[n, 'k_means'] = silhouette_score(X_k, Y_k)

    # The table is indexed by n, so idxmax() yields the best cluster count.
    k_kmeans = scores['k_means'].idxmax()
    k_hierarchical = scores['hierarchical'].idxmax()

    print("K_means optimal value: " + str(k_kmeans))
    print("Hierarchical optimal value: " + str(k_hierarchical))

    scores.to_csv(OUT_DIR + '/' + clazz_name + '_silhouette.csv')

    if autorun:
        cluster_hierarchical(path, k_hierarchical)
        cluster_kmeans(path, k_kmeans)
|
|
|
|
|
|
|
|
def compute_silhouette(path: str, clazz_name: str, suffix: str):
    """Print the silhouette score of a clustering previously stored on disk.

    Labels are taken from the second column of
    ``OUT_DIR/<clazz_name>_<suffix>.csv``; the samples are every column of the
    CSV at ``path`` except the first (presumably an identifier column --
    confirm against the feature-vector files).

    Args:
        path: CSV file holding the feature vectors.
        clazz_name: Name used to locate the label CSV.
        suffix: Algorithm tag in the label file name; also echoed in the output.
    """
    labels_csv = f'{OUT_DIR}/{clazz_name}_{suffix}.csv'
    Y = pd.read_csv(labels_csv).iloc[:, 1].values

    features = pd.read_csv(path)
    X = features.drop(columns=features.columns[0]).to_numpy()

    score = silhouette_score(X, Y)
    print(f"Silhouette for {suffix}: {score!s}")
|
|
|
|
|
|
def main():
    """Command-line entry point: score the clusterings of every input CSV.

    With ``--validate`` the old silhouette tables are removed first and each
    input file is validated (passing ``--autorun`` through) before its stored
    k-means and hierarchical clusterings are scored.
    """
    parser = argparse.ArgumentParser(description='Compute silhouette metric.')
    parser.add_argument(
        '--validate',
        action='store_true',
        help='compute optimal k for each algorithm',
    )
    parser.add_argument(
        '--autorun',
        action='store_true',
        help='if validating, computes CSV for optimal clustering automatically',
    )
    args = parser.parse_args()
    validating = args.validate

    if validating:
        clean_output()

    for csv_path in glob.glob(IN_DIR + '/*.csv'):
        # Class name is the file name with its last extension removed.
        file_name = os.path.basename(csv_path)
        clazz_name = file_name[:file_name.rfind('.')]
        print(clazz_name)

        if validating:
            validate(csv_path, clazz_name, args.autorun)

        for algorithm in ('kmeans', 'hierarchical'):
            compute_silhouette(csv_path, clazz_name, algorithm)

        print()
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|