#!/usr/bin/env python3

import glob
import os
from typing import Any

import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
# Directory containing this script; all input/output paths are anchored here.
DIR: str = os.path.dirname(os.path.realpath(__file__))

# Input directory: expected to hold feature_vectors_labeled.csv.
IN_DIR: str = DIR + '/metrics'

# Output directory: grid-search results (models.csv) are written here.
OUT_DIR: str = DIR + '/models'
def clean_output():
    """Delete a previously generated models.csv from OUT_DIR, if present.

    glob returns an empty list when the file does not exist, so this is a
    no-op on a clean tree.
    """
    for stale in glob.glob(OUT_DIR + '/models.csv'):
        os.remove(stale)
def get_classifiers() -> list[tuple[Any, dict[str, list]]]:
    """Return the (estimator, hyper-parameter grid) pairs to grid-search.

    Each grid maps a constructor-argument name to the list of candidate
    values GridSearchCV should try; an empty grid means the estimator is
    evaluated with its defaults only.

    Returns:
        List of (unfitted sklearn estimator, parameter grid) tuples.
    """
    # NOTE: the original annotation used the builtin `any` (a function) in
    # place of typing.Any, and declared grid values as list[str] although
    # the MLP grid contains ints and tuples; both are corrected here.
    return [
        # Gaussian Naive Bayes has no hyper-parameters worth searching.
        (GaussianNB(), {}),
        (SVC(), {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'gamma': ['scale', 'auto']
        }),
        (DecisionTreeClassifier(), {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random']
        }),
        (RandomForestClassifier(), {
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'class_weight': ['balanced', 'balanced_subsample']
        }),
        (MLPClassifier(), {
            # Fixed (single-value) setting: raise the iteration cap so the
            # optimizer can converge for every configuration.
            'max_iter': [2000],
            'hidden_layer_sizes': [(10,), (50,), (100, 20,)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd', 'adam'],
            'learning_rate': ['constant', 'invscaling', 'adaptive']
        })
    ]
def perform_grid_search(X_scaled, y, random_state: int) -> pd.DataFrame:
    """Grid-search every classifier from get_classifiers() on (X_scaled, y).

    The data is shuffled once with the given random_state, so repeated runs
    with the same seed produce deterministic output.

    Returns:
        One DataFrame concatenating GridSearchCV.cv_results_ for all
        classifiers, with an extra 'classifier' column naming each estimator.
    """
    # Deterministic shuffle up front: for a fixed random_state, every run
    # sees the identical ordering.
    X_shuffled, y_shuffled = shuffle(X_scaled, y, random_state=random_state)

    results: list[pd.DataFrame] = []

    for estimator, param_grid in get_classifiers():
        # Cross-validation splits are the same for every estimator because
        # GridSearchCV does not reshuffle the data it receives.
        # see: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
        scoring = {
            'precision': make_scorer(precision_score, average='binary', zero_division=0),
            'recall': make_scorer(recall_score, average='binary'),
            'f1': make_scorer(f1_score, average='binary')
        }
        search = GridSearchCV(estimator, param_grid, cv=5, scoring=scoring, refit='f1')

        search.fit(X_shuffled, y_shuffled)

        # Tag every result row with the estimator's class name so the
        # concatenated frame stays attributable.
        per_estimator = pd.DataFrame(search.cv_results_)
        per_estimator['classifier'] = type(estimator).__name__

        results.append(per_estimator)

    return pd.concat(results, ignore_index=True)
def load_dataset() -> tuple[Any, Any]:
    """Load the labeled feature vectors and return (X_scaled, y).

    Reads IN_DIR/feature_vectors_labeled.csv, drops the non-numeric
    'class_name' column, takes the last remaining column as the label and
    everything before it as features, then standardizes the features to
    zero mean and unit variance.

    Returns:
        Tuple of (standardized feature matrix, label vector), both numpy
        arrays.
    """
    # NOTE: the original annotation used the builtin `any` (a function)
    # where typing.Any was intended; corrected here.
    df = pd.read_csv(IN_DIR + '/feature_vectors_labeled.csv')
    df = df.drop('class_name', axis=1)

    # Last column is the label; the rest are features.
    X = df.iloc[:, :-1].to_numpy()
    y = df.iloc[:, -1].to_numpy()

    # fit_transform is equivalent to fit(X) followed by transform(X).
    X_scaled = preprocessing.StandardScaler().fit_transform(X)

    return (X_scaled, y)
def main():
    """Entry point: clear stale output, run the grid search, save results."""
    clean_output()

    # Fixed seed keeps the shuffle (and therefore the results) reproducible.
    features, labels = load_dataset()
    results = perform_grid_search(features, labels, 0xDEADB017)

    results.to_csv(OUT_DIR + '/models.csv', index=False)
# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()