hw1: almost done, recheck for safety
parent ccc8da0405
commit 315d736e11
11 changed files with 250 additions and 80 deletions
@@ -61,51 +61,39 @@ if __name__ == '__main__':
    # EDITABLE SECTION OF THE SCRIPT: if you need to edit the script, do it here
    ############################################################################

    ms = {"baseline", "linear", "nonlinear"}

    if len(sys.argv) != 2:
        print("%s {model_name}" % sys.argv[0])
        sys.exit(1)

    if sys.argv[1] not in ms:
        print("model name must be one of " + str(ms))
        sys.exit(2)

    y_pred = None
    if sys.argv[1] == "baseline":
        # Load the trained model
        baseline_model_path = './baseline_model.pickle'
        baseline_model = load_model(baseline_model_path)

        # Predict on the given samples
        y_pred = baseline_model.predict(x)
    else:
        X = np.zeros([x.shape[0], 5])
        X[:, 0] = 1
        X[:, 1:3] = x
        X[:, 3] = x[:, 0] * x[:, 1]
        X[:, 4] = np.sin(x[:, 0])

        if sys.argv[1] == "linear":
            # Load the linear model
            linear_model_path = './linear_regression.pickle'
            linear_model = load_model(linear_model_path)

            # Predict on the given samples
            y_pred = linear_model.predict(X)
        else:
            # Load saved Keras model
            nonlinear_model = models.load_model('./nonlinear_model')
            # Load saved normalizing factors for Xs
            normalizers = load_model('./nonlinear_model_normalizers.pickle')

            # Normalize Xs with normalizing factors
            X = X[:, 1:3]
            X -= normalizers["mean"]
            X /= normalizers["std"]

            # Run model on normalized data
            y_pred = nonlinear_model.predict(X)
            y_pred = y_pred[:, 0]

    ############################################################################
    # STOP EDITABLE SECTION: do not modify anything below this point.
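For reference, the editable section above selects which deliverable model to run from a single command-line argument; a hypothetical invocation (the script's filename is not shown in this view and is assumed here) would be:

    python run_model.py nonlinear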
@@ -62,6 +62,21 @@ of your choice. Limit the assignment to 2500 words (formulas, tables,
figures, etc., do not count as words) and do not include any code in the
report.

## Premise on data splitting

In order to perform a correct statistical analysis, I split the given
datapoints into a training set, a validation set and a test set. The test set
has 200 points (10% of the given datapoints), the validation set has 180 points
(10% of the remaining 90% of the given datapoints) and the training set has the
remaining 1620 datapoints.

For the linear model, I performed the linear regression using both the training
and the validation datapoints, since this family of models has no
hyperparameters to choose. For the nonlinear model, which is a feedforward NN
in my case, I fitted the model on the training set and used the validation set
to choose the architecture of the neural network.
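The split itself takes two calls to scikit-learn; a minimal sketch of the scheme described above, mirroring the training script later in this commit (`X` and `y` stand for the design matrix and the targets):

```python
from sklearn.model_selection import train_test_split

# 2000 points -> 1800/200, then 1800 -> 1620/180:
# 1620 training, 180 validation and 200 test datapoints.
X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=0.1)
X_train, X_val, y_train, y_val = train_test_split(X_t, y_t, test_size=0.1)
```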
## Task 1

Use the family of models
@@ -71,11 +86,16 @@ $f(\mathbf{x}, \boldsymbol{\theta}) = \theta_0 + \theta_1 \cdot x_1 +
to fit the data. Write in the report the formula of the model
substituting parameters $\theta_0, \ldots, \theta_4$ with the estimates
you've found:

$$f(\mathbf{x}, \boldsymbol{\theta}) = \_ + \_ \cdot x_1 + \_
\cdot x_2 + \_ \cdot x_1 \cdot x_2 + \_ \cdot \sin(x_1)$$

$$f(\mathbf{x}, \boldsymbol{\theta}) = 3.13696 - 0.468921 \cdot x_1
- 0.766447 \cdot x_2 + 0.0390513 \cdot x_1 x_2 - 0.918346 \cdot \sin(x_1)$$

Evaluate the test performance of your model using the mean squared error
as performance measure.

On the 200-datapoint test set described in the premise, the MSE of this linear
model is $1.22265$. This result can be reproduced by running
`src/statistical_test.py`.
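In essence, that script reloads the saved test split and the pickled regressor and averages the squared residuals; a minimal sketch of the computation:

```python
import joblib
import numpy as np

# Reload the test split saved at training time and the fitted linear model,
# then compute the test MSE (mirrors src/statistical_test.py).
test = np.load("test.npz")
lr = joblib.load("../deliverable/linear_regression.pickle")
mse = np.mean(np.square(test["y"] - lr.predict(test["x"])))
print("MSE: %g" % mse)  # ~1.22265
```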
## Task 2

Consider any family of non-linear models of your choice to address the
@@ -84,6 +104,41 @@ using the mean squared error as performance measure. Compare your model
with the linear regression of Task 1. Which one is **statistically**
better?

The model I've chosen is a feed-forward neural network with 2 hidden layers:

- one 22-neuron dense layer with hyperbolic tangent activation function;
- one 15-neuron dense layer with sigmoidal activation function.

Finally, the output neuron has a linear activation function. As described
before, the validation set was used to manually tailor the NN's architecture.
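A minimal Keras sketch of this architecture, matching the training script in this commit (the optimizer is Adam with an MSE loss):

```python
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Two hidden layers (tanh, then sigmoid) and one linear output neuron.
network = Sequential()
network.add(Dense(22, activation='tanh'))
network.add(Dense(15, activation='sigmoid'))
network.add(Dense(1, activation='linear'))
network.compile(optimizer='adam', loss='mse')
```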
The fitted model has the following final performance figures:

- Final MSE on the validation set $\approx 0.0143344$
- Final MSE on the test set $\approx 0.0107676$

I conclude that the NN non-linear model is better than the linear regression
model because of its lower test-set MSE. Since the test set contains 200
elements, we can safely assume that the loss function evaluated on the test
set is a reasonable approximation of the expected risk of both models.
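This is the usual law-of-large-numbers argument: for a model $f$ fitted independently of the test data, the test MSE

$$\frac{1}{N} \sum_{i=1}^{N} \big(y_i - f(\mathbf{x}_i)\big)^2 \;\xrightarrow{N \to \infty}\; \mathbb{E}\big[(y - f(\mathbf{x}))^2\big],$$

so with $N = 200$ the test loss is an unbiased estimate of the expected squared error with already modest fluctuations.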
To motivate this intuitive argument, I performed a Student's T-test to show
that the non-linear model is indeed statistically better with 95% confidence.
The test is implemented in `src/statistical_test.py`; it was performed by
splitting the test set in two halves, assuming that the noise on both resulting
sets has zero expectation, and by computing the mean, variance and T-score of
the squared residuals of each model on its half.
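Concretely, with $e_i$ denoting the squared residuals of a model on its half of the test set ($L = 100$ points each), the script computes

$$T = \frac{\bar{e}_A - \bar{e}_B}{\sqrt{\hat{\sigma}_A^2 / L + \hat{\sigma}_B^2 / L}}, \qquad \hat{\sigma}^2 = \frac{1}{L - 1} \sum_{i=1}^{L} (e_i - \bar{e})^2,$$

where $A$ is the linear model and $B$ the feedforward NN.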
The results of the test are the following:

- T-test result: 5.37324
- Linear model noise variance: 6.69426
- Feedforward NN noise variance: $0.348799 \cdot 10^{-3}$

The T statistic falls clearly outside the 95% confidence interval
$[-1.96, 1.96]$, so we reject the null hypothesis. Since $T$ is positive,
i.e. the mean squared residual of the linear model is the higher one, we
conclude that the NN model is indeed the statistically better one.
## Task 3 (Bonus)

In the [**Github repository of the
@@ -95,6 +150,33 @@ performance of your model is better (i.e., the MSE is lower) than ours.
Of course, you also have to tell us why you think that your model is
better.

To understand whether my model is better than the baseline model, I performed
another Student's T-test, re-using the half of the test set that I had used for
my linear model. The results of the new test are the following:
- T-test result: 1.1777 (for a $p$ value of 0.2417396)
- Baseline model noise variance: $0.366316 \cdot 10^{-3}$
- Feedforward NN noise variance: $0.348799 \cdot 10^{-3}$

This test would accept my model as the better one (its noise mean and variance
are the lower ones) only at roughly 75% confidence, which is far from ideal but
still suggests a decent chance of having the better model on the hidden test
set.
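For reference, a p-value like the one reported above can be recovered from the T statistic with SciPy; a sketch, not part of the assignment code, and the degrees of freedom are assumed here to be those of a pooled two-sample test:

```python
from scipy import stats

# Two-sided p-value for T = 1.1777 with two samples of L = 100 points each
# (df assumed as 2 * L - 2; the exact value depends on the test variant).
p = 2 * stats.t.sf(1.1777, df=2 * 100 - 2)
print(p)  # ~0.24
```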
To further analyze the performance of my NN w.r.t. the baseline model, I
computed the MSE over my entire 200-datapoint test set for both models. Here
are the results:

- MSE on entire test set for the baseline model: 0.0151954
- MSE on entire test set for the feedforward NN model: 0.0107676

My model shows the lower MSE on this particular test set. However, even though
this set was not used to train either model, the performance may of course
change on a different test set, such as the hidden one. The T-test performed
above suggests that my model has a decent chance of performing better than the
baseline model, but I cannot claim that with a reasonable (90% or 95%)
confidence.
# Questions

## Q1. Training versus Validation
@@ -238,7 +320,7 @@ Comment and compare how the (a.) training error, (b.) test error and
problem was proved impossible to solve by a perceptron model by Minsky and
Papert, then that would be quite a motivation in front of my boss.

On a more general (and more serious) note, the perceptron model would be
unable to solve the problem in the picture since a perceptron can solve only
linearly-separable classification problems, and even through a simple
graphical argument we would be unable to find a line able to separate
@@ -248,11 +330,22 @@ Comment and compare how the (a.) training error, (b.) test error and
2. **Would you expect to have better luck with a neural network with
   activation function $h(x) = - x \cdot e^{-2}$ for the hidden units?**

   First of all, we notice that this activation function is still linear, even
   if a $-e^{-2}$ scaling factor is applied to its output. Since the problem is
   not linearly separable, we would still have bad luck: the new family of
   models would still be inadequate for solving the problem.
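   To see this, note that with $h(\mathbf{z}) = -e^{-2}\,\mathbf{z}$ on the
   hidden units the whole network collapses to an affine map of the input:

   $$\mathbf{w}_2^{\top} h(W_1 \mathbf{x} + \mathbf{b}_1) + b_2 =
   -e^{-2}\, \mathbf{w}_2^{\top} W_1 \mathbf{x}
   - e^{-2}\, \mathbf{w}_2^{\top} \mathbf{b}_1 + b_2,$$

   so the decision boundary is still a single line in the input plane.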
3. **What are the main differences and similarities between the
   perceptron and the logistic regression neuron?**

   The perceptron neuron and the logistic regression neuron are both dense
   neurons that receive all the values from the previous layer as single
   scalars. Both require a number of parameters equal to the number of inputs,
   which are used as scaling factors for the received inputs. The main
   difference between the two neurons is the activation function: the
   perceptron uses a Heaviside step function, while the logistic regression
   neuron uses a sigmoidal function, providing a probabilistic interpretation
   of the output w.r.t. the classification problem (i.e. instead of providing
   just an "in class X" or "not in class X" indication, the logistic
   regression neuron outputs, in the continuous $[0,1]$ range, the probability
   of the element belonging to the class).
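   The contrast is easy to see side by side; a small NumPy sketch
   (illustrative, not from the assignment code):

   ```python
   import numpy as np

   def perceptron(x, w, b):
       # Heaviside activation: a hard 0/1 class decision
       return np.heaviside(w @ x + b, 1)

   def logistic_neuron(x, w, b):
       # Sigmoid activation: a class probability in (0, 1)
       return 1.0 / (1.0 + np.exp(-(w @ x + b)))
   ```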
@@ -1,22 +1,42 @@
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential, save_model
from tensorflow.keras.layers import Dense
from tensorflow.keras import losses
from tensorflow import keras
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from utils import save_sklearn_model, save_keras_model
import os
import random
from keras import backend as K

#
# FIX THE RANDOM GENERATOR SEEDS
#
seed_value = 0
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
os.environ['PYTHONHASHSEED'] = str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# 4. Set the `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)
# 5. Configure a new global `tensorflow` session
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,
                                        inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(),
                            config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

data = np.load("../data/data.npz")
xs = data["x"]  # 2000x2
y = data["y"]   # 2000x1
points = np.shape(xs)[0]

# We manually include in the feature vectors a '1' column corresponding to theta_0,
# so disable
@@ -30,13 +50,13 @@ X[:, 3] = xs[:, 0] * xs[:, 1]
X[:, 4] = np.sin(xs[:, 0])

# Shuffle and split the data into training, validation and test sets
np.random.seed(0)  # seed the generator for reproducibility purposes

test_ratio = 0.1
validation_ratio = 0.1

X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=test_ratio)
X_train, X_val, y_train, y_val = train_test_split(X_t, y_t, test_size=validation_ratio)
np.savez('test', x=X_test, y=y_test, allow_pickle=True)

# Fit with train data
reg = lr.fit(X_t, y_t)
@@ -46,14 +66,9 @@ print("# Linear regression:")
# Print the resulting parameters
print("f(x) = %g + %g * x_1 + %g * x_2 + %g * x_1 * x_2 + %g * sin(x_1)" % tuple(reg.coef_))

save_sklearn_model(reg, "../deliverable/linear_regression.pickle")

# Non-linear regression:
print("\n# Feed-forward NN:")

A = X_val
@@ -61,9 +76,6 @@ A = X_val
X_train = X_train[:, 1:3]
X_val = X_val[:, 1:3]

mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
@@ -74,26 +86,20 @@ X_val -= mean
X_val /= std

network = Sequential()
network.add(Dense(22, activation='tanh'))
network.add(Dense(15, activation='sigmoid'))
network.add(Dense(1, activation='linear'))
network.compile(optimizer='adam', loss='mse')

epochs = 5000
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=120)
network.fit(X_train, y_train, epochs=epochs, verbose=1,
            validation_data=(X_val, y_val), callbacks=[callback])

network.save("../deliverable/nonlinear_model")
save_sklearn_model({"mean": mean, "std": std}, "../deliverable/nonlinear_model_normalizers.pickle")

# Print the final validation set MSE, which was used to tailor the NN
# architecture after several manual trials
msq = mean_squared_error(network.predict(X_val), y_val)
print(msq)

X_test = X_test[:, 1:3]
X_test -= mean
X_test /= std
msq = mean_squared_error(network.predict(X_test), y_test)
#print(msq)
assignment_1/src/statistical_test.py (new file, 83 lines)
@@ -0,0 +1,83 @@
""" Statistical Student's T-test between the linear and non-linear model """

import joblib
import numpy as np
from keras import models
from sklearn.model_selection import train_test_split

# Load the test dataset (which was saved when building the models)
test = np.load('test.npz')
X_test = test["x"]
y_test = test["y"]

# Load all models and the normalizers for the non-linear model
lr = joblib.load("../deliverable/linear_regression.pickle")
bm = joblib.load('../deliverable/baseline_model.pickle')
nonlinear_model = models.load_model('../deliverable/nonlinear_model')
normalizers = joblib.load('../deliverable/nonlinear_model_normalizers.pickle')

# Split (without shuffling, the test set has already been shuffled) the test set
# in two evenly sized test datasets for Student's T-test comparison
X_a, X_b, Y_a, Y_b = train_test_split(X_test, y_test, test_size=0.5,
                                      shuffle=False)

# Make sure both sets have the same size
assert X_a.shape == X_b.shape

# Compute the noise values for the linear model
e_a = np.square(Y_a - lr.predict(X_a))

# Drop the additional features used only by the linear model and normalize Xs
# with the normalizing factors
X_b = X_b[:, 1:3]
X_b -= normalizers["mean"]
X_b /= normalizers["std"]

# Compute the noise values for the feedforward NN
e_b = np.square(Y_b - nonlinear_model.predict(X_b).flatten())

# Number of data points in each test set
L = X_a.shape[0]

# Compute mean and variance of the respective noise datasets for both models
e_mean_a = np.mean(e_a)
var_e_a = np.sum(np.square(e_a - e_mean_a)) / (L - 1)
e_mean_b = np.mean(e_b)
var_e_b = np.sum(np.square(e_b - e_mean_b)) / (L - 1)

# Compute the Student's T-test statistic
T = (e_mean_a - e_mean_b) / np.sqrt(var_e_a / L + var_e_b / L)

# Print results
print("# T-test between linear and FFNN")
print("T-test result: %g" % T)
print("Linear model variance: %g" % var_e_a)
print("Feedforward NN variance: %g" % var_e_b)

# Now perform another test using the (a) test set for the baseline model
X_a = X_a[:, 1:3]
e_a = np.square(Y_a - bm.predict(X_a))
e_mean_a = np.mean(e_a)
var_e_a = np.sum(np.square(e_a - e_mean_a)) / (L - 1)
T = (e_mean_a - e_mean_b) / np.sqrt(var_e_a / L + var_e_b / L)

print("# T-test between baseline and FFNN")
print("T-test result: %g" % T)
print("Baseline model variance: %g" % var_e_a)
print("Feedforward NN variance: %g" % var_e_b)

# Compute the MSE on the entire test set for all models
print("MSE on entire test set for linear regression model: %g" %
      (np.mean(np.square(y_test - lr.predict(X_test)))))

X_test = X_test[:, 1:3]
print("MSE on entire test set for baseline model: %g" %
      (np.mean(np.square(y_test - bm.predict(X_test)))))

# Normalize Xs first for running my FFNN model
X_test -= normalizers["mean"]
X_test /= normalizers["std"]
print("MSE on entire test set for feedforward NN model: %g" %
      (np.mean(np.square(y_test - nonlinear_model.predict(X_test).flatten()))))

# vim: set ts=4 sw=4 et tw=80:
BIN assignment_1/src/test.npz (new binary file)