""" Statistical Student's T-test between the linear and non-linear model """

import joblib

import numpy as np

from keras import models

from sklearn.model_selection import train_test_split


def _t_statistic(e_a, e_b):
    """Welch's T statistic between two equally sized noise samples.

    Returns (T, var_a, var_b), where var_a / var_b are the unbiased
    (ddof=1) sample variances of the respective noise samples, i.e.
    sum((e - mean(e))^2) / (n - 1).
    """
    n = e_a.shape[0]
    var_a = np.var(e_a, ddof=1)
    var_b = np.var(e_b, ddof=1)
    t = (np.mean(e_a) - np.mean(e_b)) / np.sqrt(var_a / n + var_b / n)
    return t, var_a, var_b


# Load test dataset (which was saved when building models)
test = np.load('test.npz')
X_test = test["x"]
y_test = test["y"]

# Load both models and the normalizers for the non-linear model
lr = joblib.load("../deliverable/linear_regression.pickle")
bm = joblib.load('../deliverable/baseline_model.pickle')
nonlinear_model = models.load_model('../deliverable/nonlinear_model')
normalizers = joblib.load('../deliverable/nonlinear_model_normalizers.pickle')

# Split (without shuffling, the test set has already been shuffled) the test set
# in two evenly sized test datasets for student's T-test comparison
X_a, X_b, Y_a, Y_b = train_test_split(X_test, y_test, test_size=0.5,
                                      shuffle=False)

# Make sure both sets have the same size
assert X_a.shape == X_b.shape

# Compute the noise (squared-error) values for the linear model
e_a = np.square(Y_a - lr.predict(X_a))

# Drop the additional parameters for the linear model and normalize Xs with
# the normalizing factors saved at training time
X_b = X_b[:, 1:3]
X_b -= normalizers["mean"]
X_b /= normalizers["std"]

# Compute the noise values for the feedforward NN
e_b = np.square(Y_b - nonlinear_model.predict(X_b).flatten())

# Compute Student's T-test between the linear model and the FFNN
T, var_e_a, var_e_b = _t_statistic(e_a, e_b)

# Print results
print("# T-test between linear and FFNN")
print("T-test result: %g" % T)
print("Linear model variance: %g" % var_e_a)
print("Feedforward NN variance: %g" % var_e_b)

# Now perform another test using the (a) test set for the baseline model;
# the FFNN noise sample e_b is reused unchanged.
X_a = X_a[:, 1:3]
e_a = np.square(Y_a - bm.predict(X_a))
T, var_e_a, var_e_b = _t_statistic(e_a, e_b)

print("# T-test between baseline and FFNN")
print("T-test result: %g" % T)
print("Baseline model variance: %g" % var_e_a)
print("Feedforward NN variance: %g" % var_e_b)

# Compute MSE on entire test set for all models
print("MSE on entire test set for linear regression model: %g" %
      (np.mean(np.square(y_test - lr.predict(X_test)))))

# The baseline model takes only the two central features
X_test = X_test[:, 1:3]
print("MSE on entire test set for baseline model: %g" %
      (np.mean(np.square(y_test - bm.predict(X_test)))))

# Normalize Xs first for running my FFNN model
X_test -= normalizers["mean"]
X_test /= normalizers["std"]
print("MSE on entire test set for feedforward NN model: %g" %
      (np.mean(np.square(y_test - nonlinear_model.predict(X_test).flatten()))))

# vim: set ts=4 sw=4 et tw=80:
|