diff --git a/assignment_1/deliverable/linear_regression.pickle b/assignment_1/deliverable/linear_regression.pickle
index ec34b34..e42bf7a 100644
Binary files a/assignment_1/deliverable/linear_regression.pickle and b/assignment_1/deliverable/linear_regression.pickle differ
diff --git a/assignment_1/deliverable/nonlinear_model/saved_model.pb b/assignment_1/deliverable/nonlinear_model/saved_model.pb
index 22064e7..f71850e 100644
Binary files a/assignment_1/deliverable/nonlinear_model/saved_model.pb and b/assignment_1/deliverable/nonlinear_model/saved_model.pb differ
diff --git a/assignment_1/deliverable/nonlinear_model/variables/variables.data-00000-of-00001 b/assignment_1/deliverable/nonlinear_model/variables/variables.data-00000-of-00001
index 5432cfe..c7dce55 100644
Binary files a/assignment_1/deliverable/nonlinear_model/variables/variables.data-00000-of-00001 and b/assignment_1/deliverable/nonlinear_model/variables/variables.data-00000-of-00001 differ
diff --git a/assignment_1/deliverable/nonlinear_model/variables/variables.index b/assignment_1/deliverable/nonlinear_model/variables/variables.index
index 20b3447..c5d43a1 100644
Binary files a/assignment_1/deliverable/nonlinear_model/variables/variables.index and b/assignment_1/deliverable/nonlinear_model/variables/variables.index differ
diff --git a/assignment_1/deliverable/nonlinear_model_normalizers.pickle b/assignment_1/deliverable/nonlinear_model_normalizers.pickle
index 4e9916f..54ad903 100644
Binary files a/assignment_1/deliverable/nonlinear_model_normalizers.pickle and b/assignment_1/deliverable/nonlinear_model_normalizers.pickle differ
diff --git a/assignment_1/deliverable/run_model.py b/assignment_1/deliverable/run_model.py
index d4808f3..af2ae23 100755
--- a/assignment_1/deliverable/run_model.py
+++ b/assignment_1/deliverable/run_model.py
@@ -61,51 +61,39 @@ if __name__ == '__main__':
     # EDITABLE SECTION OF THE SCRIPT: if you need to edit the script, do it here
     ############################################################################
 
-    ms = {"baseline", "linear", "nonlinear"}
-
-    if len(sys.argv) != 2:
-        print("%s {model_name}" % sys.argv[0])
-        sys.exit(1)
-
-    if sys.argv[1] not in ms:
-        print("model name must be one of " + str(ms))
-        sys.exit(2)
-
     y_pred = None
-    if sys.argv[1] == "baseline":
-        # Load the trained model
-        baseline_model_path = './baseline_model.pickle'
-        baseline_model = load_model(baseline_model_path)
-        # Predict on the given samples
-        y_pred = baseline_model.predict(x)
-    else:
-        X = np.zeros([x.shape[0], 5])
-        X[:, 0] = 1
-        X[:, 1:3] = x
-        X[:, 3] = x[:, 0] * x[:, 1]
-        X[:, 4] = np.sin(x[:, 0])
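+    # Build the design matrix for the Task 1 model: its columns are the model
+    # terms [1, x_1, x_2, x_1 * x_2, sin(x_1)]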
+    X = np.zeros([x.shape[0], 5])
+    X[:, 0] = 1
+    X[:, 1:3] = x
+    X[:, 3] = x[:, 0] * x[:, 1]
+    X[:, 4] = np.sin(x[:, 0])
 
-        if sys.argv[1] == "linear":
-            # Load the linear model
-            linear_model_path = './linear_regression.pickle'
-            linear_model = load_model(linear_model_path)
+    # Load the linear model
+    linear_model_path = './linear_regression.pickle'
+    linear_model = load_model(linear_model_path)
 
-            # Predict on the given samples
-            y_pred = linear_model.predict(X)
-        else:
-            # Load saved Keras model
-            nonlinear_model = models.load_model('./nonlinear_model')
-            # Load saved normalizing factors for Xs
-            normalizers = load_model('./nonlinear_model_normalizers.pickle')
-            # Normalize Xs with normalizing factors
-            X = X[:, 1:3]
-            X -= normalizers["mean"]
-            X /= normalizers["std"]
+    # Predict on the given samples
+    y_pred = linear_model.predict(X)
+    # Load saved Keras model
+    nonlinear_model = models.load_model('./nonlinear_model')
+    # Load saved normalizing factors for Xs
+    normalizers = load_model('./nonlinear_model_normalizers.pickle')
 
-        # Run model on normalized data
-        y_pred = nonlinear_model.predict(X)
-        y_pred = y_pred[:, 0]
+    print("Linear model:")
+    mse = evaluate_predictions(y_pred, y)
+    print('MSE: {}'.format(mse))
+
+    # Normalize Xs with normalizing factors
+    X = X[:, 1:3]
+    X -= normalizers["mean"]
+    X /= normalizers["std"]
+
+    # Run model on normalized data
+    y_pred = nonlinear_model.predict(X)
+    y_pred = y_pred[:, 0]
+
+    print("Non-linear model:")
 
     ############################################################################
     # STOP EDITABLE SECTION: do not modify anything below this point.
diff --git a/assignment_1/report_Maggioni_Claudio.md b/assignment_1/report_Maggioni_Claudio.md
index 2bd754b..c20b507 100644
--- a/assignment_1/report_Maggioni_Claudio.md
+++ b/assignment_1/report_Maggioni_Claudio.md
@@ -62,6 +62,21 @@ of your choice.
 Limit the assignment to 2500 words (formulas, tables, figures,
 etc., do not count as words) and do not include any code in the report.
 
+## Premise on data splitting
+
+In order to perform a correct statistical analysis, I have split the given
+datapoints into a training set, a validation set and a test set. The test set
+has 200 datapoints (10% of the given datapoints), the validation set has 180
+datapoints (10% of the remaining 90%), while the training set has 1620
+datapoints (all the remaining ones).
+
+For the linear model, I will perform the linear regression using both the
+training and the validation datapoints, since the given family of models has
+no hyperparameters to choose. For the nonlinear model, which in my case is a
+feedforward NN, I used the training set to fit the model and the validation
+set to choose the architecture of the neural network.
+
 ## Task 1
 
 Use the family of models
@@ -71,11 +86,16 @@ $f(\mathbf{x}, \boldsymbol{\theta}) = \theta_0 + \theta_1 \cdot x_1 +
 to fit the data. Write in the report the formula of the model substituting
 parameters $\theta_0, \ldots, \theta_4$ with the estimates you've found:
 
-$$f(\mathbf{x}, \boldsymbol{\theta}) = \_ + \_ \cdot x_1 + \_
-\cdot x_2 + \_ \cdot x_1 \cdot x_2 + \_ \cdot \sin(x_1)$$
+
+$$f(\mathbf{x}, \boldsymbol{\theta}) = 3.13696 - 0.468921 \cdot x_1
+- 0.766447 \cdot x_2 + 0.0390513 \cdot x_1 x_2 - 0.918346 \cdot \sin(x_1)$$
+
 Evaluate the test performance of your model using the mean squared error
 as performance measure.
 
+On the 200-datapoint test set described before, the MSE of this linear model
+is $1.22265$. This result can be reproduced by running
+`src/statistical_test.py`.
+
 ## Task 2
 
 Consider any family of non-linear models of your choice to address the
@@ -84,6 +104,41 @@ using the mean squared error as performance measure. Compare your model
 with the linear regression of Task 1. Which one is **statistically**
 better?
 
+The model I've chosen is a feedforward neural network with 2 hidden layers:
+
+- one 22-neuron dense layer with hyperbolic tangent activation function;
+- one 15-neuron dense layer with sigmoidal activation function.
+
+Finally, the output neuron has a linear activation function. As described
+before, the validation set was used to manually tailor the NN's architecture.
+
+The fitted model has the following final performance:
+
+- final MSE on the validation set: $\approx 0.0143344$
+- final MSE on the test set: $\approx 0.0107676$
+
+I conclude that the non-linear NN model is better than the linear regression
+model because of its lower test set MSE. Since the test set contains 200
+elements, we can safely assume that the loss function computed on the test
+set is a reasonable approximation of the structural risk for both models.
+
+In order to motivate this intuitive argument, I will perform a Student's
+t-test to show that the non-linear model is indeed statistically better with
+95% confidence. The test is implemented in `src/statistical_test.py` and is
+performed by splitting the test set in two halves, assuming the noise on both
+halves has an expectation of 0, and computing the mean, variance and t-score
+of the squared residuals.
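+In formulas, denoting by $\bar{e}_a$ and $\bar{e}_b$ the mean squared
+residuals of the two models on their respective halves of the test set, by
+$\hat{\sigma}^2_a$ and $\hat{\sigma}^2_b$ the corresponding sample variances,
+and by $L$ the size of each half, the computed statistic is:
+
+$$T = \frac{\bar{e}_a - \bar{e}_b}{\sqrt{\hat{\sigma}^2_a / L +
+\hat{\sigma}^2_b / L}}$$
+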
+The results of the test are the following:
+
+- T-test result: 5.37324
+- Linear model noise variance: 6.69426
+- Feedforward NN noise variance: $0.348799 \cdot 10^{-3}$
+
+The test statistic is clearly outside of the 95% confidence interval
+$[-1.96, 1.96]$ and thus we reject the null hypothesis. Since the noise
+variance is lower for the NN, we conclude that this model is indeed the
+statistically better one.
+
 ## Task 3 (Bonus)
 
 In the [**Github repository of the
@@ -95,6 +150,33 @@ performance of your model is better (i.e., the MSE is lower) than ours. Of
 course, you also have to tell us why you think that your model is
 better.
 
+In order to understand whether my model is better than the baseline model, I
+performed another Student's t-test, re-using the subset of the test set I had
+used for my linear model. The results of the new test are the following:
+
+- T-test result: 1.1777 (for a $p$ value of 0.2417396)
+- Baseline model noise variance: $0.366316 \cdot 10^{-3}$
+- Feedforward NN noise variance: $0.348799 \cdot 10^{-3}$
+
+This test would accept my model as the better one (as its noise variance is
+lower) only at roughly 75% confidence, which is far from ideal but may still
+be meaningful, and suggests a decent chance of having a better model on the
+hidden test set.
+
+To further analyze the performance of my NN w.r.t. the baseline model, I have
+computed the MSE over my entire 200-datapoint test set for both models. Here
+are the results:
+
+- MSE on the entire test set for the baseline model: 0.0151954
+- MSE on the entire test set for the feedforward NN model: 0.0107676
+
+My model shows a lower MSE on this particular test set, but even though this
+test set was not used to train either model, the performance may of course
+change on a different test set such as the hidden one. The t-test performed
+before suggests that my model might have a decent chance at performing better
+than the baseline model, but I cannot say that for sure or with a reasonable
+(like 90% or 95%) confidence.
+
 # Questions
 
 ## Q1. Training versus Validation
@@ -238,7 +320,7 @@ Comment and compare how the (a.) training error, (b.) test error and
    problem was proved impossible to solve by a perceptron model by Minsky and
    Papert, then that would be quite a motivation in front of my boss.
 
-   On a morev general (and more serious) note, the perceptron model would be
+   On a more general (and more serious) note, the perceptron model would be
    unable to solve the problem in the picture since a perceptron can solve
    only linearly-separable classification problems, and even through a simple
    graphical argument we would be unable to find a line able to separate
@@ -248,11 +330,22 @@ Comment and compare how the (a.) training error, (b.) test error and
 2. **Would you expect to have better luck with a neural network with
    activation function $h(x) = - x \cdot e^{-2}$ for the hidden units?**
 
-   The activation function is still linear and data is not linearly separable
+   First of all, we notice that this activation function is still linear,
+   even if a $-e^{-2}$ scaling factor is applied to the output. Since the
+   problem is not linearly separable, we would still have bad luck: this new
+   family of models is still inadequate for solving the problem.
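+   To see this formally: a hidden layer with this activation computes
+   $h(W_1 \mathbf{x} + \mathbf{b}_1) = -e^{-2} (W_1 \mathbf{x} +
+   \mathbf{b}_1)$, so composing it with the output layer still yields an
+   affine map $W' \mathbf{x} + \mathbf{b}'$, i.e. the same family of linear
+   decision boundaries as before.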
 
 3. **What are the main differences and similarities between the perceptron
    and the logistic regression neuron?**
 
-   -
-   -
+   The perceptron neuron and the logistic regression neuron are both dense
+   neurons that receive as inputs all the values from the previous layer as
+   single scalars. Both require a number of parameters equal to the number of
+   inputs, which are used as scaling factors for the received inputs. The
+   main difference between the two neurons is the activation function: the
+   perceptron uses a Heaviside step activation function, while the logistic
+   regression neuron uses a sigmoidal function, providing a probabilistic
+   interpretation of the output w.r.t. the classification problem (i.e.
+   instead of providing just an "in class X" or "not in class X" indication,
+   the logistic regression neuron can output a probability in the continuous
+   range $[0,1]$ of said element belonging to the class).
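+   In formulas, the perceptron computes
+   $y = H(\mathbf{w} \cdot \mathbf{x} + b)$, where $H$ is the Heaviside step
+   function, while the logistic regression neuron computes
+   $y = \sigma(\mathbf{w} \cdot \mathbf{x} + b)$, where
+   $\sigma(z) = 1 / (1 + e^{-z})$ is the sigmoid.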
diff --git a/assignment_1/report_Maggioni_Claudio.pdf b/assignment_1/report_Maggioni_Claudio.pdf
index 42c7f58..fd88acd 100644
Binary files a/assignment_1/report_Maggioni_Claudio.pdf and b/assignment_1/report_Maggioni_Claudio.pdf differ
diff --git a/assignment_1/src/build_models.py b/assignment_1/src/build_models.py
index 8625def..3fa2543 100644
--- a/assignment_1/src/build_models.py
+++ b/assignment_1/src/build_models.py
@@ -1,22 +1,42 @@
-import numpy as np
 from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import train_test_split
 from tensorflow.keras import Input, Model
 from tensorflow.keras.models import Sequential, save_model
 from tensorflow.keras.layers import Dense
 from tensorflow.keras import losses
-from tensorflow import keras
-import tensorflow as tf
 from sklearn.metrics import mean_squared_error
 from utils import save_sklearn_model, save_keras_model
 import os
+import random
+import numpy as np
+import tensorflow as tf
+from keras import backend as K
+
+#
+# FIX THE RANDOM GENERATOR SEEDS
+#
+seed_value = 0
+# 1. Set the `PYTHONHASHSEED` environment variable to a fixed value
+os.environ['PYTHONHASHSEED'] = str(seed_value)
+# 2. Seed the `python` built-in pseudo-random generator
+random.seed(seed_value)
+# 3. Seed the `numpy` pseudo-random generator
+np.random.seed(seed_value)
+# 4. Seed the `tensorflow` pseudo-random generator
+tf.random.set_seed(seed_value)
+# 5. Configure a new global `tensorflow` session
+session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,
+                                        inter_op_parallelism_threads=1)
+sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(),
+                            config=session_conf)
+tf.compat.v1.keras.backend.set_session(sess)
 
 d = os.path.dirname(__file__)
-data = np.load(d + "/../data/data.npz")
+data = np.load("../data/data.npz")
 xs = data["x"]  # 2000x2
 y = data["y"]  # 2000x1
-points = 2000
+points = np.shape(xs)[0]
 
 # We manually include in the feature vectors a '1' column corresponding to theta_0,
 # so disable
@@ -30,13 +50,13 @@ X[:, 3] = xs[:, 0] * xs[:, 1]
 X[:, 4] = np.sin(xs[:, 0])
 
 # Shuffle our data for division into training and test set
-np.random.seed(0)  # seed the generation for reproducibility purposes
 train_ratio = 0.1
 validation_ratio = 0.1
 X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=train_ratio)
 X_train, X_val, y_train, y_val = train_test_split(X_t, y_t,
                                                   test_size=validation_ratio)
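+# Save the held-out test set to src/test.npz, so that statistical_test.py can
+# later evaluate all models on exactly the same split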
+np.savez('test', x=X_test, y=y_test, allow_pickle=True)
 
 # Fit with train data
 reg = lr.fit(X_t, y_t)
@@ -46,14 +66,9 @@ print("# Linear regression:")
 
 # Print the resulting parameters
 print("f(x) = %g + %g * x_1 + %g * x_2 + %g * x_1 * x_2 + %g * sin(x_1)" %
       tuple(reg.coef_))
-save_sklearn_model(reg, d + "/../deliverable/linear_regression.pickle")
-
-# Test using MSQ on test set
-score = reg.score(X_test, y_test)
-print("MSQ error on test set is: %g" % (score))
-
-### Non-linear regression:
+save_sklearn_model(reg, "../deliverable/linear_regression.pickle")
 
+# Non-linear regression:
 print("\n# Feed-forward NN:")
 
 A = X_val
@@ -61,9 +76,6 @@ A = X_val
 X_train = X_train[:, 1:3]
 X_val = X_val[:, 1:3]
 
-# X_train = X_train[:, 1:3]
-# X_val = X_val[:, 1:3]
-
 mean = np.mean(X_train, axis=0)
 std = np.std(X_train, axis=0)
 
@@ -74,26 +86,20 @@ X_val -= mean
 X_val /= std
 
 network = Sequential()
-network.add(Dense(20, activation='tanh'))
-network.add(Dense(10, activation='relu'))
-network.add(Dense(7, activation='sigmoid'))
-network.add(Dense(5, activation='relu'))
+network.add(Dense(22, activation='tanh'))
+network.add(Dense(15, activation='sigmoid'))
 network.add(Dense(1, activation='linear'))
-network.compile(optimizer='rmsprop', loss='mse', metrics=['mse'])
+network.compile(optimizer='adam', loss='mse')
 
-epochs = 1000
-callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=40)
-network.fit(X_train, y_train, epochs=epochs, verbose=1, batch_size=15,
-            validation_data=(X_val, y_val), callbacks=[callback])
+epochs = 5000
+callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=120)
+network.fit(X_train, y_train, epochs=epochs, verbose=1,
+            validation_data=(X_val, y_val), callbacks=[callback])
 
-network.save(d + "/../deliverable/nonlinear_model")
-save_sklearn_model({"mean": mean, "std": std}, d + "/../deliverable/nonlinear_model_normalizers.pickle")
+network.save("../deliverable/nonlinear_model")
+save_sklearn_model({"mean": mean, "std": std},
+                   "../deliverable/nonlinear_model_normalizers.pickle")
 
+# Print the final validation set MSE, which was used to tailor the NN
+# architecture after several manual trials
 msq = mean_squared_error(network.predict(X_val), y_val)
 print(msq)
-
-X_test = X_test[:, 1:3]
-X_test -= mean
-X_test /= std
-msq = mean_squared_error(network.predict(X_test), y_test)
-#print(msq)
diff --git a/assignment_1/src/statistical_test.py b/assignment_1/src/statistical_test.py
new file mode 100644
index 0000000..422558e
--- /dev/null
+++ b/assignment_1/src/statistical_test.py
@@ -0,0 +1,83 @@
+"""Statistical Student's t-test between the linear and the non-linear model."""
+
+import joblib
+import numpy as np
+from keras import models
+from sklearn.model_selection import train_test_split
+
+# Load the test dataset (which was saved when building the models)
+test = np.load('test.npz')
+X_test = test["x"]
+y_test = test["y"]
+
+# Load all models and the normalizers for the non-linear model
+lr = joblib.load("../deliverable/linear_regression.pickle")
+bm = joblib.load('../deliverable/baseline_model.pickle')
+nonlinear_model = models.load_model('../deliverable/nonlinear_model')
+normalizers = joblib.load('../deliverable/nonlinear_model_normalizers.pickle')
+
+# Split the test set (without shuffling, since it has already been shuffled)
+# into two evenly sized test datasets for the Student's t-test comparison
+X_a, X_b, Y_a, Y_b = train_test_split(X_test, y_test, test_size=0.5,
+                                      shuffle=False)
+
+# Make sure both sets have the same size
+assert X_a.shape == X_b.shape
+
+# Compute the noise values (squared residuals) for the linear model
+e_a = np.square(Y_a - lr.predict(X_a))
+
+# Drop the additional features used only by the linear model and normalize
+# the Xs with the saved normalizing factors
+X_b = X_b[:, 1:3]
+X_b -= normalizers["mean"]
+X_b /= normalizers["std"]
+
+# Compute the noise values (squared residuals) for the feedforward NN
+e_b = np.square(Y_b - nonlinear_model.predict(X_b).flatten())
+
+# Number of datapoints in both test sets
+L = X_a.shape[0]
+
+# Compute mean and variance of the respective noise datasets for both models
+e_mean_a = np.mean(e_a)
+var_e_a = np.sum(np.square(e_a - e_mean_a)) / (L - 1)
+e_mean_b = np.mean(e_b)
+var_e_b = np.sum(np.square(e_b - e_mean_b)) / (L - 1)
+
+# Compute the Student's t-test statistic
+T = (e_mean_a - e_mean_b) / np.sqrt(var_e_a / L + var_e_b / L)
+
+# Print results
+print("# T-test between linear and FFNN")
+print("T-test result: %g" % T)
+print("Linear model variance: %g" % var_e_a)
+print("Feedforward NN variance: %g" % var_e_b)
+
+# Now perform another test using the (a) test set for the baseline model
+X_a = X_a[:, 1:3]
+e_a = np.square(Y_a - bm.predict(X_a))
+e_mean_a = np.mean(e_a)
+var_e_a = np.sum(np.square(e_a - e_mean_a)) / (L - 1)
+T = (e_mean_a - e_mean_b) / np.sqrt(var_e_a / L + var_e_b / L)
+
+print("# T-test between baseline and FFNN")
+print("T-test result: %g" % T)
+print("Baseline model variance: %g" % var_e_a)
+print("Feedforward NN variance: %g" % var_e_b)
+
+# Compute the MSE on the entire test set for all models
+print("MSE on entire test set for linear regression model: %g" %
+      (np.mean(np.square(y_test - lr.predict(X_test)))))
+
+X_test = X_test[:, 1:3]
+print("MSE on entire test set for baseline model: %g" %
+      (np.mean(np.square(y_test - bm.predict(X_test)))))
+
+# Normalize the Xs first for running my FFNN model
+X_test -= normalizers["mean"]
+X_test /= normalizers["std"]
+print("MSE on entire test set for feedforward NN model: %g" %
+      (np.mean(np.square(y_test - nonlinear_model.predict(X_test).flatten()))))
+
+# vim: set ts=4 sw=4 et tw=80:
diff --git a/assignment_1/src/test.npz b/assignment_1/src/test.npz
new file mode 100644
index 0000000..ca09226
Binary files /dev/null and b/assignment_1/src/test.npz differ