#!/usr/bin/env python3
"""Grid-search several scikit-learn classifiers and dump the CV results.

Reads ``metrics/feature_vectors_labeled.csv`` (relative to this script),
standardizes the feature columns, runs a 5-fold grid search for every
classifier returned by :func:`get_classifiers`, and writes the concatenated
``cv_results_`` tables to ``models/models.csv``.
"""
import glob
import os

import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

DIR: str = os.path.dirname(os.path.realpath(__file__))
IN_DIR: str = DIR + '/metrics'
OUT_DIR: str = DIR + '/models'
# Seed handed to every estimator that has a stochastic component, so that
# repeated runs of this script produce the same models.csv.
RANDOM: int = 42


def clean_output():
    """Delete a previously generated ``models.csv`` from ``OUT_DIR``, if any."""
    # The directory part is escaped so that glob metacharacters in the
    # script's own path ('[', '*', '?') are matched literally.
    pattern = os.path.join(glob.escape(OUT_DIR), 'models.csv')
    for stale_file in glob.glob(pattern):
        os.remove(stale_file)


def get_classifiers() -> list:
    """Return the estimators to evaluate together with their parameter grids.

    Returns:
        A list of ``(estimator, param_grid)`` tuples. ``param_grid`` maps
        constructor parameter names to the candidate values that
        ``GridSearchCV`` should try; an empty dict means "defaults only".
    """
    return [
        (GaussianNB(), {}),
        # SVC only consumes random_state when probability=True, so it is
        # deterministic here without a seed.
        (SVC(), {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
        }),
        (DecisionTreeClassifier(random_state=RANDOM), {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
        }),
        (RandomForestClassifier(random_state=RANDOM), {
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'class_weight': ['balanced', 'balanced_subsample'],
        }),
        (MLPClassifier(random_state=RANDOM), {
            'max_iter': [2000],
            'hidden_layer_sizes': [(10,), (50,), (100, 20,)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd', 'adam'],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
        }),
    ]


def main():
    """Run the grid search for every classifier and write ``models.csv``."""
    clean_output()

    # Load the dataset: after dropping 'class_name', every column but the
    # last is a feature and the last column is the label.
    dataset = pd.read_csv(os.path.join(IN_DIR, 'feature_vectors_labeled.csv'))
    dataset = dataset.drop('class_name', axis=1)
    X = dataset.iloc[:, :-1].to_numpy()
    y = dataset.iloc[:, -1].to_numpy()

    # NOTE(review): the scaler is fit on the full dataset before the
    # cross-validation split, so held-out folds influence the scaling.
    # Wrapping scaler + estimator in a Pipeline would avoid that, but it
    # would also rename the param_* columns in models.csv; left unchanged.
    scaler = preprocessing.StandardScaler().fit(X)
    X_scaled = scaler.transform(X)

    # average='binary' presumes a two-class label column — TODO confirm.
    # zero_division=0 is set on all three scorers so that folds without
    # positive predictions/labels score 0 without emitting warnings.
    scoring = {
        'precision': make_scorer(precision_score, average='binary',
                                 zero_division=0),
        'recall': make_scorer(recall_score, average='binary',
                              zero_division=0),
        'f1': make_scorer(f1_score, average='binary', zero_division=0),
    }

    results = []
    for classifier, grid in get_classifiers():
        # cross-validation splits are same across calls as data is not shuffled
        # see: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        search = GridSearchCV(classifier, grid, cv=5, scoring=scoring,
                              refit='f1')
        search.fit(X_scaled, y)

        classifier_results = pd.DataFrame(search.cv_results_)
        classifier_results['classifier'] = type(classifier).__name__
        results.append(classifier_results)

    # A fresh checkout may not have the output directory yet.
    os.makedirs(OUT_DIR, exist_ok=True)
    combined = pd.concat(results, ignore_index=True)
    combined.to_csv(os.path.join(OUT_DIR, 'models.csv'), index=False)


if __name__ == '__main__':
    main()