This repository has been archived on 2024-10-22. You can view files and clone it, but cannot push or open issues or pull requests.
ima-preparation/bug-2022/train_classifiers.py

90 lines
2.7 KiB
Python
Executable file

#!/usr/bin/env python3
import os
import pandas as pd
import glob
from sklearn import preprocessing
from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# Absolute directory that contains this script (symlinks resolved).
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Directory the labeled feature-vector CSV is read from.
IN_DIR: str = f'{DIR}/metrics'
# Directory the grid-search results CSV is written to.
OUT_DIR: str = f'{DIR}/models'
# Fixed seed value; presumably meant as the random_state for reproducible runs.
RANDOM: int = 42
def clean_output():
    """Delete the ``models.csv`` left in ``OUT_DIR`` by an earlier run, if any."""
    # glob yields an empty list when the file (or the directory) is absent,
    # so nothing is removed and no error is raised in that case.
    for stale_path in glob.glob(OUT_DIR + '/models.csv'):
        os.remove(stale_path)
def get_classifiers() -> list:
    """Return the estimators to tune, each paired with its hyper-parameter grid.

    Returns:
        A list of ``(estimator, param_grid)`` tuples. ``param_grid`` maps a
        constructor parameter name to the list of values to try; an empty
        dict means the estimator is evaluated with its defaults only.

    Estimators that draw random numbers during fitting (decision tree,
    random forest, MLP) are seeded with ``RANDOM`` so that repeated runs
    produce the same scores. ``GaussianNB`` has no random state, and ``SVC``
    only uses one for probability estimates, which are not enabled here.
    """
    return [
        (GaussianNB(), {}),
        (SVC(), {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
        }),
        (DecisionTreeClassifier(random_state=RANDOM), {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
        }),
        (RandomForestClassifier(random_state=RANDOM), {
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'class_weight': ['balanced', 'balanced_subsample'],
        }),
        (MLPClassifier(random_state=RANDOM), {
            'max_iter': [2000],
            'hidden_layer_sizes': [(10,), (50,), (100, 20,)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd', 'adam'],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
        }),
    ]
def main():
    """Grid-search every classifier on the labeled data and save the CV results.

    Reads ``feature_vectors_labeled.csv`` from ``IN_DIR``, drops its
    ``class_name`` column, and treats the last remaining column as the label
    and all other columns as features. The features are standardized, each
    estimator from ``get_classifiers()`` is tuned with a 5-fold
    ``GridSearchCV`` (precision, recall and F1 are recorded; the best model
    is refit on F1), and the ``cv_results_`` tables of all estimators are
    written to ``OUT_DIR/models.csv`` with an extra ``classifier`` column
    holding the estimator's class name.
    """
    clean_output()
    # Make sure the destination exists before the (long) searches start;
    # otherwise the final to_csv would fail on a fresh checkout.
    os.makedirs(OUT_DIR, exist_ok=True)

    # load dataset
    dataset = pd.read_csv(IN_DIR + '/feature_vectors_labeled.csv')
    dataset = dataset.drop('class_name', axis=1)
    X = dataset.iloc[:, :-1].to_numpy()
    y = dataset.iloc[:, -1].to_numpy()

    # NOTE(review): the scaler is fit on the full dataset before the
    # cross-validation split, so validation folds influence the scaling.
    # Kept as-is because moving it into a Pipeline would rename the
    # ``param_*`` columns of the output.
    scaler = preprocessing.StandardScaler().fit(X)
    X_scaled = scaler.transform(X)

    # Same scorers for every estimator. zero_division=0 is set on all three
    # so an undefined metric is consistently scored as 0 without warnings.
    scoring = {
        'precision': make_scorer(precision_score, average='binary', zero_division=0),
        'recall': make_scorer(recall_score, average='binary', zero_division=0),
        'f1': make_scorer(f1_score, average='binary', zero_division=0),
    }

    results = []
    for classifier, grid in get_classifiers():
        # cross-validation splits are same across calls as data is not shuffled
        # see: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        clf = GridSearchCV(classifier, grid, cv=5, scoring=scoring, refit='f1')
        clf.fit(X_scaled, y)

        df_classifier = pd.DataFrame(clf.cv_results_)
        df_classifier['classifier'] = type(classifier).__name__
        results.append(df_classifier)

        # Rewrite the file after every estimator so finished searches are
        # not lost if a later one fails; the final file content is the same.
        pd.concat(results, ignore_index=True).to_csv(
            OUT_DIR + '/models.csv', index=False)
# Run the training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()