This repository has been archived on 2024-10-22. You can view files and clone it, but cannot push or open issues or pull requests.
ima-preparation/bug-2022/train_classifiers.py

104 lines
3.2 KiB
Python
Raw Normal View History

2023-02-19 17:18:51 +00:00
#!/usr/bin/env python3
import glob
import os
from typing import Any

import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
# Directory containing this script, with symlinks resolved.
DIR: str = os.path.dirname(os.path.realpath(__file__))
# Input folder holding the labeled feature vectors.
IN_DIR: str = f'{DIR}/metrics'
# Output folder receiving the grid-search results.
OUT_DIR: str = f'{DIR}/models'
def clean_output():
    """Delete the ``models.csv`` left in ``OUT_DIR`` by a previous run, if any.

    The file is removed directly instead of through ``glob``: the path is
    absolute and derived from the script location, so any glob
    metacharacter (``[``, ``*``, ``?``) in a parent directory name would
    make the pattern miss the file.
    """
    try:
        os.remove(OUT_DIR + '/models.csv')
    except FileNotFoundError:
        # Nothing to clean: first run, or the output folder does not exist.
        pass
2023-02-20 15:11:57 +00:00
def get_classifiers() -> list[tuple[Any, dict[str, list[Any]]]]:
    """Return the candidate estimators paired with their hyper-parameter grids.

    Returns:
        A list of ``(estimator, param_grid)`` tuples. Each estimator is a
        freshly constructed, unfitted scikit-learn classifier; each grid
        maps a constructor parameter name to the list of values to try
        (strings, ints, or tuples depending on the parameter). An empty
        grid means the estimator is evaluated with its defaults only.
    """
    return [
        (GaussianNB(), {}),
        (SVC(), {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto'],
        }),
        (DecisionTreeClassifier(), {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
        }),
        (RandomForestClassifier(), {
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'class_weight': ['balanced', 'balanced_subsample'],
        }),
        (MLPClassifier(), {
            'max_iter': [2000],
            'hidden_layer_sizes': [(10,), (50,), (100, 20,)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd', 'adam'],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
        }),
    ]
2023-02-20 15:11:57 +00:00
def perform_grid_search(X_scaled, y, random_state: int) -> pd.DataFrame:
    """Grid-search every candidate classifier with 5-fold cross-validation.

    Args:
        X_scaled: Feature matrix, passed to the estimators as given.
        y: Binary class labels aligned with the rows of ``X_scaled``.
        random_state: Seed used both to shuffle the data and to seed every
            estimator that exposes a ``random_state`` parameter.

    Returns:
        The concatenated ``cv_results_`` tables of all grid searches, with
        an extra ``classifier`` column holding the estimator class name.
    """
    # Shuffle according to given random state. This allows for multiple runs
    # of grid search that have a deterministic output for a given random_state
    X_shuffled, y_shuffled = shuffle(X_scaled, y, random_state=random_state)

    # zero_division=0 on all three metrics: an undefined score in a fold is
    # recorded as 0 consistently, without a warning per candidate.
    scoring = {
        'precision': make_scorer(precision_score, average='binary', zero_division=0),
        'recall': make_scorer(recall_score, average='binary', zero_division=0),
        'f1': make_scorer(f1_score, average='binary', zero_division=0),
    }

    dfs: list[pd.DataFrame] = []
    for classifier, grid in get_classifiers():
        # Shuffling alone does not make the run reproducible: estimators with
        # internal randomness must be seeded too. Estimators without a
        # random_state parameter are left untouched.
        if 'random_state' in classifier.get_params():
            classifier.set_params(random_state=random_state)

        # cross-validation splits are same across calls as data is not shuffled
        # see: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        clf = GridSearchCV(classifier, grid, cv=5, scoring=scoring, refit='f1')
        clf.fit(X_shuffled, y_shuffled)

        df_classifier = pd.DataFrame(clf.cv_results_)
        df_classifier['classifier'] = type(classifier).__name__
        dfs.append(df_classifier)

    return pd.concat(dfs, ignore_index=True)
def load_dataset() -> tuple[Any, Any]:
    """Load the labeled feature vectors and standardize the features.

    Reads ``feature_vectors_labeled.csv`` from ``IN_DIR``, drops the
    ``class_name`` column, and treats the last remaining column as the
    label and every other column as a feature.

    Returns:
        ``(X_scaled, y)``: the standardized feature matrix and the label
        vector, both as NumPy arrays.
    """
    df = pd.read_csv(IN_DIR + '/feature_vectors_labeled.csv')
    df = df.drop('class_name', axis=1)

    X = df.iloc[:, :-1].to_numpy()
    y = df.iloc[:, -1].to_numpy()

    # NOTE(review): the scaler is fitted on the full dataset before
    # cross-validation, so validation folds influence the scaling
    # statistics. Confirm this is acceptable for the reported scores.
    X_scaled = preprocessing.StandardScaler().fit_transform(X)
    return (X_scaled, y)
def main():
    """Run the grid search and write the results to ``OUT_DIR/models.csv``."""
    # Create the output folder before the grid search so that a missing
    # directory cannot make the final write fail after all the work is done.
    os.makedirs(OUT_DIR, exist_ok=True)
    clean_output()

    X, y = load_dataset()
    # Fixed seed so repeated runs shuffle the data identically.
    df = perform_grid_search(X, y, 0xDEADB017)
    df.to_csv(OUT_DIR + '/models.csv', index=False)
# Run the training pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()