140 lines
4.0 KiB
Python
Executable File
140 lines
4.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import os
|
|
import pandas as pd
|
|
import glob
|
|
import re
|
|
import itertools
|
|
import numpy as np
|
|
from train_classifiers import perform_grid_search, load_dataset
|
|
|
|
from sklearn.neural_network import MLPClassifier
|
|
from sklearn.naive_bayes import GaussianNB
|
|
from sklearn.svm import SVC
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
|
|
|
from scipy.stats import wilcoxon
|
|
|
|
# All paths are resolved relative to this script's own directory.
DIR: str = os.path.dirname(os.path.realpath(__file__))
OUT_DIR: str = f'{DIR}/models'

# Seeds for the repeated evaluation runs: each seed drives one independent
# cross-validated grid search, so the per-split scores can be compared
# statistically across classifiers.
RANDOM_STATES: list[int] = [
    0xDEADB017,
    0xDEADBEEF,
    4,  # chosen by a fair dice roll
    0xDECAFBAD,
    0x000FF1CE,
    8451,
    42,
    2056,
    25,
    3,  # originally written as 6 // 2
    91,
    7,  # Where is Cherapunji?
    115,
    65,
    76,
    85,
    83,
    111,
    110,
    101,
]
|
|
|
|
|
|
class BiasedClassifier(BaseEstimator, ClassifierMixin):
    """Degenerate baseline that predicts a fixed value for every sample.

    Serves as a control in the statistical comparison: any real classifier
    should significantly outperform it.
    """

    def __init__(self, to_return=None):
        # Stored under the same name as the __init__ parameter so that
        # BaseEstimator.get_params()/set_params() work (sklearn convention).
        self.to_return = to_return

    def fit(self, X, y=None):
        """No-op fit.

        Returns self, as required by the sklearn estimator contract
        (the original returned None, which breaks Pipeline/clone chaining).
        """
        return self

    def predict(self, X, y=None):
        """Return the constant ``to_return`` label for every row of X."""
        return np.array([self.to_return] * len(X))

    def predict_proba(self, X, y=None):
        # NOTE(review): returns the raw label per sample rather than an
        # (n_samples, n_classes) probability matrix — confirm that no caller
        # relies on proper class probabilities.
        return np.array([self.to_return] * len(X))
|
|
|
|
|
|
def clean_output():
    """Delete the cached models.csv from the output directory, if present."""
    # NOTE(review): the glob pattern names a single literal file; confirm
    # whether a wildcard such as '*.csv' was intended instead.
    for stale_path in glob.glob(OUT_DIR + '/models.csv'):
        os.remove(stale_path)
|
|
|
|
|
|
# Look-up table from the classifier name stored in best.csv to a fresh,
# default-constructed estimator instance (keys equal each class's __name__).
string_to_classifier = {
    cls.__name__: cls()
    for cls in (
        MLPClassifier,
        GaussianNB,
        SVC,
        DecisionTreeClassifier,
        RandomForestClassifier,
    )
}
|
|
|
|
|
|
def unit_grid(params: dict) -> dict:
    """Wrap every value of *params* in a one-element list.

    Turns a concrete parameter assignment (e.g. the best parameters found by
    a previous grid search) into a degenerate grid-search space containing
    exactly that single combination.
    """
    # Dict comprehension instead of dict([(k, [v]) ...]) — same result,
    # idiomatic form (ruff C404).
    return {key: [value] for key, value in params.items()}
|
|
|
|
|
|
def main():
    """Run the evaluation pipeline and the pairwise Wilcoxon comparison.

    If models/evaluation.csv is missing, re-evaluate the best classifier
    configurations from models/best.csv (plus a constant-prediction
    baseline) over all RANDOM_STATES and cache the per-split metrics there;
    otherwise reuse the cached results.  Then run a Wilcoxon signed-rank
    test for every unordered classifier pair and every metric (accuracy
    excluded) and write the p-values to models/model_stats.csv.
    """
    if not os.path.exists(OUT_DIR + '/evaluation.csv'):
        X, y = load_dataset()

        df_best = pd.read_csv(OUT_DIR + '/best.csv')
        df_best = df_best.loc[:, ['classifier', 'params']]
        df_best.loc[:, 'classifier'] = df_best['classifier'].apply(lambda x: string_to_classifier[x])
        # SECURITY(review): eval() on CSV content executes arbitrary code if
        # best.csv is tampered with; ast.literal_eval would be safer if the
        # stored params are guaranteed to be plain Python literals — confirm.
        df_best.loc[:, 'params'] = df_best['params'].apply(lambda x: eval(x))
        classifiers = [(e['classifier'], unit_grid(e['params'])) for e in df_best.to_dict('records')]
        # Baseline control that always predicts 1.
        classifiers.append((BiasedClassifier(), {'to_return': [1]}))

        dfs: list[pd.DataFrame] = []

        for i, state in enumerate(RANDOM_STATES):
            print("Iteration " + str(i + 1) + " of " + str(len(RANDOM_STATES)) + "...")
            dfs.append(perform_grid_search(X, y, classifiers, 5, state))

        # Concatenate all runs into a single dataframe and cache it.
        df = pd.concat(dfs, ignore_index=True)
        df.to_csv(OUT_DIR + '/evaluation.csv', index=False)
    else:
        df = pd.read_csv(OUT_DIR + '/evaluation.csv')

    # Reshape to long format: one row per (classifier, metric, split value).
    # Raw strings fix the invalid '\d' escape sequences of the original
    # (a SyntaxWarning on modern Python, slated to become an error).
    metrics_columns = list(df.filter(regex=r'^split\d_test'))
    df = pd.melt(df, id_vars=['classifier'], value_vars=metrics_columns, var_name="metric")
    df.loc[:, 'metric'] = df['metric'].apply(lambda x: re.sub(r'split\d_test_', '', x))

    classifier_list = df['classifier'].unique()
    metric_list = df['metric'].unique()

    df_stats = pd.DataFrame(columns=['classifier_a', 'classifier_b', 'metric', 'pvalue'])

    i = 1
    for classifier_a in classifier_list:
        for classifier_b in classifier_list:
            # Visit each unordered pair exactly once (a < b lexicographically).
            if classifier_a >= classifier_b:
                continue

            for metric in metric_list:
                if metric == 'accuracy':
                    continue

                series_a = list(df.loc[(df['classifier'] == classifier_a) &
                                       (df['metric'] == metric), 'value'])
                series_b = list(df.loc[(df['classifier'] == classifier_b) &
                                       (df['metric'] == metric), 'value'])

                df_stats.loc[i, 'classifier_a'] = classifier_a
                df_stats.loc[i, 'classifier_b'] = classifier_b
                df_stats.loc[i, 'metric'] = metric
                df_stats.loc[i, 'pvalue'] = wilcoxon(series_a, series_b).pvalue

                i += 1

    df_stats.to_csv(OUT_DIR + '/model_stats.csv')
    print(df_stats)
|
|
|
|
|
|
|
|
# Script entry point: run the full evaluation pipeline when executed directly.
if __name__ == '__main__':
    main()
|