#!/usr/bin/env python3
"""Grid-search several scikit-learn classifiers over a labeled feature-vector CSV.

Reads ``IN_DIR/feature_vectors_labeled.csv`` (last column = binary label,
``class_name`` column dropped), standardizes the features, runs a 5-fold
cross-validated grid search per classifier, and writes the combined
``cv_results_`` tables to ``OUT_DIR/models.csv``.
"""
import glob
import os
from typing import Any

import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

DIR: str = os.path.dirname(os.path.realpath(__file__))
IN_DIR: str = DIR + '/metrics'
OUT_DIR: str = DIR + '/models'


def clean_output() -> None:
    """Delete any previously generated ``models.csv`` from OUT_DIR.

    Uses glob so a missing file is simply a no-op rather than an error.
    """
    filelist = glob.glob(OUT_DIR + '/models.csv')
    for f in filelist:
        os.remove(f)


def get_classifiers() -> list[tuple[Any, dict[str, list[Any]]]]:
    """Return (estimator, hyper-parameter grid) pairs to evaluate.

    Grid values are heterogeneous (strings, ints, tuples), hence
    ``list[Any]`` rather than ``list[str]``. GaussianNB has no tunable
    grid and gets an empty dict.
    """
    return [
        (GaussianNB(), {}),
        (SVC(), {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto']
        }),
        (DecisionTreeClassifier(), {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random']
        }),
        (RandomForestClassifier(), {
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'class_weight': ['balanced', 'balanced_subsample']
        }),
        (MLPClassifier(), {
            'max_iter': [10000],
            'hidden_layer_sizes': [(10,), (50,), (100, 20,)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd', 'adam'],
            'learning_rate': ['constant', 'invscaling', 'adaptive']
        })
    ]


def perform_grid_search(X_scaled, y, random_state: int) -> pd.DataFrame:
    """Run a cross-validated grid search for every classifier.

    The data is shuffled once up front with the given ``random_state`` so
    repeated runs with the same seed produce deterministic output.

    :param X_scaled: standardized feature matrix
    :param y: binary label vector
    :param random_state: seed controlling the shuffle (and thus the CV splits)
    :return: concatenation of each classifier's ``cv_results_`` table, with a
        ``classifier`` column naming the estimator class.
    """
    X_shuffled, y_shuffled = shuffle(X_scaled, y, random_state=random_state)
    dfs: list[pd.DataFrame] = []
    for classifier, grid in get_classifiers():
        # Cross-validation splits are identical across calls because
        # GridSearchCV does not reshuffle the (already shuffled) data; see:
        # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        clf = GridSearchCV(classifier, grid, cv=5, n_jobs=-1, scoring={
            # zero_division=0 keeps degenerate folds (no predicted/true
            # positives) at score 0.0 without emitting warnings.
            'precision': make_scorer(precision_score, average='binary', zero_division=0),
            'recall': make_scorer(recall_score, average='binary', zero_division=0),
            'f1': make_scorer(f1_score, average='binary')
        }, refit='f1')
        clf.fit(X_shuffled, y_shuffled)
        df_classifier = pd.DataFrame(clf.cv_results_)
        df_classifier['classifier'] = type(classifier).__name__
        dfs.append(df_classifier)
    return pd.concat(dfs, ignore_index=True)


def load_dataset() -> tuple[Any, Any]:
    """Load and standardize the labeled feature vectors.

    Drops the non-numeric ``class_name`` column, treats the last remaining
    column as the label, and z-scales the features.

    :return: ``(X_scaled, y)`` as numpy arrays.
    """
    df = pd.read_csv(IN_DIR + '/feature_vectors_labeled.csv')
    df = df.drop('class_name', axis=1)
    X = df.iloc[:, :-1].to_numpy()
    y = df.iloc[:, -1].to_numpy()
    scaler = preprocessing.StandardScaler().fit(X)
    X_scaled = scaler.transform(X)
    return (X_scaled, y)


def main() -> None:
    """Entry point: clean old output, run the search, write models.csv."""
    clean_output()
    X, y = load_dataset()
    df = perform_grid_search(X, y, 0xDEADB017)
    # Bug fix: OUT_DIR was never created, so to_csv failed on a fresh checkout.
    os.makedirs(OUT_DIR, exist_ok=True)
    df.to_csv(OUT_DIR + '/models.csv', index=False)


if __name__ == '__main__':
    main()