The goal of this tutorial is to build a simple image classification pipeline using a two-layer fully connected neural network with a softmax classifier.
You may download the CIFAR-10 dataset here and run the code on your local machine. Note that this implementation neither requires nor uses a GPU.
import pathlib
import pickle
import numpy as np
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
Define functions for later use below.
def error(x, y):
"""Relative error."""
return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))
def gradient(f, x, df=None, h=1e-5):
"""Numerical gradient of f at x."""
grad = np.zeros_like(x)
with np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) as it:
while not it.finished:
# evaluate function at x +/- h
ix = it.multi_index
value = x[ix]
x[ix] = value + h
pos = f(x)
x[ix] = value - h
neg = f(x)
x[ix] = value
# compute the partial derivative with centered formula
grad[ix] = (pos - neg) if df is None else np.sum((pos - neg) * df)
grad[ix] /= 2 * h
it.iternext()
return grad
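As a quick standalone sanity check (mirroring what the helper above does), the centered-difference gradient of $f(x) = \sum x^2$ can be compared against its analytic gradient $2x$:

```python
import numpy as np

h = 1e-5
x = np.linspace(-1.0, 1.0, 6).reshape(2, 3)
num_grad = np.zeros_like(x)
for ix in np.ndindex(x.shape):
    value = x[ix]
    x[ix] = value + h
    pos = np.sum(x ** 2)
    x[ix] = value - h
    neg = np.sum(x ** 2)
    x[ix] = value
    num_grad[ix] = (pos - neg) / (2 * h)  # centered difference
print(np.max(np.abs(num_grad - 2 * x)))  # tiny, limited only by floating point
```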
Load the provided CIFAR-10 dataset.
database = 'cifar10.npz'
data = dict(np.load(database))
for k, v in list(data.items()):
print(f'{k}: {v.shape}')
Fully connected neural network
Single neuron
Here, we create a function that passes the input through the neuron (not yet through the activation function).
example: $N = 1, D=2, M=3$
def affine_forward(x, w, b):
"""Forward pass for a fully-connected affine layer.
Input x has shape (N, d_1, ..., d_k) and contains a minibatch of N samples, where each sample x[i] has shape
(d_1, ..., d_k). We will reshape each input into a vector of dimension D = d_1 * ... * d_k, and then transform it
to an output vector of dimension M.
Input:
- x: An array containing input data, of shape (N, d_1, ..., d_k)
- w: An array of weights, of shape (D, M)
- b: An array of biases, of shape (M,)
Return:
- out: Output, of shape (N, M)
- cache: Tuple (x, w, b), stored for the backward pass
"""
out = None
###########################################################################
# 1) Implement the affine forward pass. Store the result in 'out'. Note #
# that you will need to reshape the input into rows. #
# ----------------------------------------------------------------------- #
# START OF YOUR CODE #
###########################################################################
###########################################################################
# END OF YOUR CODE #
###########################################################################
cache = (x, w, b)
return out, cache
Test your implementation below.
num_inputs = 2
input_shape = 4, 5, 6
output_dim = 3
input_size = num_inputs * np.prod(input_shape)
weight_size = output_dim * np.prod(input_shape)
x = np.linspace(-0.1, 0.5, input_size).reshape(num_inputs, *input_shape)
w = np.linspace(-0.2, 0.3, weight_size).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, output_dim)
out, _ = affine_forward(x, w, b)
out_expected = np.array([
[1.49834967, 1.70660132, 1.91485297],
[3.25553199, 3.5141327, 3.77273342]])
print('Testing affine_forward...')
print(f'Output difference (expected <1e-5): {error(out, out_expected):.2e}')
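Independently of your implementation, the expected output above can be reproduced by flattening each sample into a row and applying the affine transform directly (one possible vectorized approach, shown here as a standalone computation):

```python
import numpy as np

num_inputs, input_shape, output_dim = 2, (4, 5, 6), 3
x = np.linspace(-0.1, 0.5, num_inputs * np.prod(input_shape)).reshape(num_inputs, *input_shape)
w = np.linspace(-0.2, 0.3, output_dim * np.prod(input_shape)).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, output_dim)
out = x.reshape(x.shape[0], -1) @ w + b  # flatten each sample to a row of length D, then x @ w + b
print(out)
```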
The aim of network training is to find the set of parameters $\hat{w}$ that minimizes the loss function over all members of the training data set $\{(x^1, l^1),(x^2, l^2),...,(x^n, l^n)\}$, with $x^i$ the input data and $l^i$ the corresponding label. The size of the training data set is $n$:
$\hat{w} = \underset{w}{\operatorname{argmin}}\sum_{i=1}^{n}L_i((x^i, w),l^i)$
This minimum cannot, in general, be found analytically. Therefore, we use the gradient descent method $\rightarrow$ we need to compute the gradient of the loss function with respect to $w$:
$\nabla L(w) = \left(\frac{\partial L(w)}{\partial w_1}, \frac{\partial L(w)}{\partial w_2},...,\frac{\partial L(w)}{\partial w_m}\right)$, where $m$ is the number of parameters
simple example: assume $L(w) = y_3(y_2(y_1(w_1)))$; we are looking for $\frac{\partial L(w)}{\partial w_1}$
apply the chain rule: $\frac{\partial L(w)}{\partial w_1} = \frac{\partial y_3}{\partial w_1}= \frac{\partial y_3}{\partial y_2} \frac{\partial y_2}{\partial y_1} \frac{\partial y_1}{\partial w_1}$
$\rightarrow$ in a backward pass: multiply the upstream gradient with the gradient at the current node, pass to next layer.
example: $N = 1, D=2, M=3$ (derivation here)
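For reference (the standard result, consistent with the dimensions above): writing the forward pass as $y = xW + b$ with $x$ reshaped to shape $(N, D)$, the chain rule gives

$$\frac{\partial L}{\partial x} = \frac{\partial L}{\partial y} W^\top, \qquad \frac{\partial L}{\partial W} = x^\top \frac{\partial L}{\partial y}, \qquad \frac{\partial L}{\partial b} = \sum_{i=1}^{N} \left(\frac{\partial L}{\partial y}\right)_i,$$

where $\frac{\partial L}{\partial y}$ is the upstream gradient `dout` of shape $(N, M)$, and the resulting $\frac{\partial L}{\partial x}$ must be reshaped back to the original input shape.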
def affine_backward(dout, cache):
"""Backward pass for an affine layer.
Input:
- dout: Upstream derivative, of shape (N, M)
- cache: Tuple of:
- x: Input data, of shape (N, d_1, ... d_k)
- w: Weights, of shape (D, M)
- b: Biases, of shape (M,)
Return:
- dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
- dw: Gradient with respect to w, of shape (D, M)
- db: Gradient with respect to b, of shape (M,)
"""
x, w, b = cache
dx, dw, db = None, None, None
###########################################################################
# 2) Implement the affine backward pass. #
# ----------------------------------------------------------------------- #
# START OF YOUR CODE #
###########################################################################
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dw, db
Test your implementation below.
np.random.seed(231)
x = np.random.randn(10, 2, 3)
w = np.random.randn(6, 5)
b = np.random.randn(5)
dout = np.random.randn(10, 5)
dx_expected = gradient(lambda x: affine_forward(x, w, b)[0], x, dout)
dw_expected = gradient(lambda w: affine_forward(x, w, b)[0], w, dout)
db_expected = gradient(lambda b: affine_forward(x, w, b)[0], b, dout)
_, cache = affine_forward(x, w, b)
dx, dw, db = affine_backward(dout, cache)
print('Testing affine_backward...')
print(f'dx difference (expected <1e-5): {error(dx, dx_expected):.2e}')
print(f'dw difference (expected <1e-5): {error(dw, dw_expected):.2e}')
print(f'db difference (expected <1e-5): {error(db, db_expected):.2e}')
def relu_forward(x):
"""Forward pass for a layer of rectified linear units (ReLUs).
Input:
- x: Inputs, of any shape
Return:
- out: Output, of the same shape as x
- cache: x
"""
out = None
###########################################################################
# 3) Implement the ReLU forward pass. #
# ----------------------------------------------------------------------- #
# START OF YOUR CODE #
###########################################################################
out = np.zeros_like(x)
out[x > 0] = x[x > 0]
###########################################################################
# END OF YOUR CODE #
###########################################################################
cache = x
return out, cache
Test your implementation below.
x = np.linspace(-0.5, 0.5, 12).reshape(3, 4)
out, _ = relu_forward(x)
out_expected = np.array([
[0.0, 0.0, 0.0, 0.0 ],
[0.0, 0.0, 0.04545455, 0.13636364],
[0.22727273, 0.31818182, 0.40909091, 0.5 ]])
print('Testing relu_forward...')
print(f'Output difference (expected <1e-5): {error(out, out_expected):.2e}')
def relu_backward(dout, cache):
"""Backward pass for a layer of rectified linear units (ReLUs).
Input:
- dout: Upstream derivatives, of any shape
- cache: Input x, of same shape as dout
Return:
- dx: Gradient with respect to x
"""
dx, x = None, cache
###########################################################################
# 4) Implement the ReLU backward pass. #
# ----------------------------------------------------------------------- #
# START OF YOUR CODE #
###########################################################################
dx = np.zeros_like(x)
dx[x > 0] = dout[x > 0]
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx
Test your implementation below.
np.random.seed(231)
x = np.random.randn(10, 10)
dout = np.random.randn(*x.shape)
dx_expected = gradient(lambda x: relu_forward(x)[0], x, dout)
_, cache = relu_forward(x)
dx = relu_backward(dout, cache)
print('Testing relu_backward...')
print(f'dx difference (expected <1e-5): {error(dx, dx_expected):.2e}')
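A practical hint for the next step (a standard trick, not specific to this assignment): exponentiating large scores overflows, but the softmax is invariant to subtracting a constant from each row, so shift by the per-row maximum before exponentiating:

```python
import numpy as np

x = np.array([[1000.0, 1001.0, 1002.0]])        # np.exp(x) alone would overflow
shifted = x - np.max(x, axis=1, keepdims=True)  # same softmax, now safe
probs = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)
print(probs)  # finite probabilities summing to 1
```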
def softmax_loss(x, y):
"""Loss and gradient for softmax classification.
Input:
- x: Input data, of shape (N, C) where x[i,j] is the score for the jth class for the ith input
- y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and 0 <= y[i] < C
Return:
- loss: Scalar giving the loss
- dx: Gradient of the loss with respect to x
"""
loss, dx = None, None
###########################################################################
# 5) Implement the loss and gradient for softmax classification. #
# ----------------------------------------------------------------------- #
# START OF YOUR CODE #
###########################################################################
###########################################################################
# END OF YOUR CODE #
###########################################################################
return loss, dx
Test your implementation below.
np.random.seed(231)
num_classes, num_inputs = 10, 50
x = 0.001 * np.random.randn(num_inputs, num_classes)
y = np.random.randint(num_classes, size=num_inputs)
dx_expected = gradient(lambda x: softmax_loss(x, y)[0], x)
loss, dx = softmax_loss(x, y)
print('Testing softmax_loss...')
print(f'dx difference (expected <1e-5): {error(dx, dx_expected):.2e}')
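A useful sanity check (a well-known rule of thumb): with small random scores as above, the softmax distribution is nearly uniform, so the loss returned by `softmax_loss` should be close to $\log(C)$:

```python
import numpy as np

# near-uniform scores give each class probability ~1/C,
# so the cross-entropy loss is about -log(1/C) = log(C)
C = 10
print(np.log(C))  # ~2.3026 for C = 10
```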
def affine_relu_forward(x, w, b):
"""Perform affine transform followed by ReLU."""
a, fc_cache = affine_forward(x, w, b)
out, relu_cache = relu_forward(a)
cache = (fc_cache, relu_cache)
return out, cache
def affine_relu_backward(dout, cache):
"""Backward pass for the affine-relu layer."""
fc_cache, relu_cache = cache
da = relu_backward(dout, relu_cache)
dx, dw, db = affine_backward(da, fc_cache)
return dx, dw, db
Coding hints:
L2 regularization:
add a penalty term to the loss function that penalizes spiky weights. For every weight $w_{ij}$ in the network, we add the term $0.5 \lambda w_{ij}^2$, where $\lambda$ is the regularization strength:
$L(z,l)=-\log(\sigma(z)_l) + 0.5 \lambda \sum_{i,j} (W_1)_{ij}^2 + 0.5 \lambda \sum_{i,j} (W_2)_{ij}^2$,
where $W_1$ and $W_2$ are the weights of the first and second layer, respectively.
$\rightarrow$ gradient: the penalty adds $\lambda W_k$ to the gradient with respect to each weight matrix:
$\frac{\partial L(z,l)}{\partial W_k} = \frac{\partial L_{no\,reg}(z,l)}{\partial W_k} + \lambda W_k, \quad k = 1, 2$
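The regularization contribution can be sketched in isolation; with the factor of $0.5$, its gradient with respect to each weight matrix is simply $\lambda W$:

```python
import numpy as np

lam = 0.1                                # regularization strength lambda
W = np.arange(6, dtype=float).reshape(2, 3)
reg_loss = 0.5 * lam * np.sum(W ** 2)    # added to the data loss
reg_grad = lam * W                       # added to the data-loss gradient dW
print(reg_loss)
```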
class Network:
"""A fully-connected neural network.
A two-layer fully-connected network with ReLU nonlinearity and softmax loss that uses a modular layer design. We
assume an input dimension of D, a hidden dimension of H, and perform classification over C classes.
The architecture is of the form: affine -> relu -> affine -> softmax.
Note that this class does not implement gradient descent; instead, it will interact with a separate Solver object
that is responsible for performing the optimization.
Learnable parameters of the model are stored in self.params dictionary, which maps parameter names to numpy arrays.
"""
def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0):
"""Initialize a new network.
Input:
- input_dim: Size of the input
- hidden_dim: Size of the hidden layer
- num_classes: Number of classes to classify
- weight_scale: Standard deviation for random initialization of the weights
- reg: L2 regularization strength
"""
self.params = {}
self.reg = reg
#######################################################################
# 6) Initialize the weights and biases. Weights should be initialized #
# from a Gaussian centered at 0 with standard deviation equal to #
# 'weight_scale', and biases should be initialized to 0. All weights #
# and biases should be stored in the dictionary 'self.params', with #
# first layer weights and biases using the keys 'W1' and 'b1' and #
# second layer weights and biases using the keys 'W2' and 'b2'. #
# ------------------------------------------------------------------- #
# START OF YOUR CODE #
#######################################################################
#######################################################################
# END OF YOUR CODE #
#######################################################################
def loss(self, X, y=None):
"""Loss and gradient for a minibatch of data.
Inputs:
- X: Array of input data, of shape (N, d_1, ..., d_k)
- y: Array of labels, of shape (N,); y[i] gives the label for X[i]
Returns:
If y is None, then run a test-time forward pass of the model, and return:
- scores: Array of shape (N, C), where scores[i,c] is the classification score for X[i] and class c
If y is not None, then run a training-time forward and backward pass, and return:
- loss: Scalar loss
- grads: Dictionary with the same keys as self.params, mapping parameter names to gradients of the loss
"""
scores = None
#######################################################################
# 7) Implement the forward pass, computing the class scores for 'X' #
# and storing them in the scores variable. #
# ------------------------------------------------------------------- #
# START OF YOUR CODE #
#######################################################################
#######################################################################
# END OF YOUR CODE #
#######################################################################
if y is None: # then assume we are in test mode
return scores
loss, grads = 0, {}
#######################################################################
# 8) Implement the backward pass for the two-layer net. Store the #
# loss in the loss variable and gradients in the 'grads' dictionary. #
# Compute data loss using softmax, and make sure that 'grads[k]' #
# holds the gradients for 'self.params[k]'. Don't forget to add L2 #
# regularization! #
# #
# NOTE: To ensure that your implementation matches ours and you pass #
# the automated tests, make sure that your L2 regularization includes #
# a factor of 0.5 to simplify the expression for the gradient. #
# ------------------------------------------------------------------- #
# START OF YOUR CODE #
#######################################################################
#######################################################################
# END OF YOUR CODE #
#######################################################################
return loss, grads
Test your implementation below.
np.random.seed(231)
N, D, H, C = 3, 5, 50, 7
X = np.random.randn(N, D)
y = np.random.randint(C, size=N)
std = 1e-3
model = Network(input_dim=D, hidden_dim=H, num_classes=C, weight_scale=std)
print('Testing initialization...')
W1_std = np.abs(model.params['W1'].std() - std)
b1 = model.params['b1']
W2_std = np.abs(model.params['W2'].std() - std)
b2 = model.params['b2']
assert W1_std < std / 10, 'Problem with first layer weights'
assert np.all(b1 == 0), 'Problem with first layer biases'
assert W2_std < std / 10, 'Problem with second layer weights'
assert np.all(b2 == 0), 'Problem with second layer biases'
print('Testing test-time forward pass...')
model.params['W1'] = np.linspace(-0.7, 0.3, D * H).reshape(D, H)
model.params['b1'] = np.linspace(-0.1, 0.9, H)
model.params['W2'] = np.linspace(-0.3, 0.4, H * C).reshape(H, C)
model.params['b2'] = np.linspace(-0.9, 0.1, C)
X = np.linspace(-5.5, 4.5, N * D).reshape(D, N).T
scores = model.loss(X)
scores_expected = np.array([
[11.53165108, 12.2917344, 13.05181771, 13.81190102, 14.57198434, 15.33206765, 16.09215096],
[12.05769098, 12.74614105, 13.43459113, 14.1230412, 14.81149128, 15.49994135, 16.18839143],
[12.58373087, 13.20054771, 13.81736455, 14.43418138, 15.05099822, 15.66781506, 16.2846319 ]])
scores_diff = np.abs(scores - scores_expected).sum()
assert scores_diff < 1e-6, 'Problem with test-time forward pass'
print('Testing training loss...')
y = np.array([0, 5, 1])
loss, grads = model.loss(X, y)
loss_expected = 3.4702243556
assert np.abs(loss - loss_expected) < 1e-10, 'Problem with training-time loss'
model.reg = 1.0
loss, grads = model.loss(X, y)
loss_expected = 26.5948426952
assert np.abs(loss - loss_expected) < 1e-10, 'Problem with regularization loss'
for reg in [0.0, 0.7]:
print('Running numeric gradient check with reg =', reg)
model.reg = reg
loss, grads = model.loss(X, y)
for name in sorted(grads):
f = lambda _: model.loss(X, y)[0]
grad_expected = gradient(f, model.params[name])
print(f'{name} difference (expected <1e-5): {error(grads[name], grad_expected):.2e}')
Stochastic gradient descent update formula: $w_{\text{new}} = w_{\text{old}} - lr \cdot \frac{\partial L}{\partial w}$, where $lr$ is the learning rate.
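The update can be checked on a toy loss $L(w) = \frac{1}{2}\lVert w \rVert^2$, whose gradient is $w$; repeated updates shrink $w$ toward the minimum at zero:

```python
import numpy as np

w = np.array([1.0, -2.0])
lr = 0.1
for _ in range(100):
    dw = w            # gradient of 0.5 * ||w||^2
    w = w - lr * dw   # vanilla SGD update
print(w)              # both entries close to 0
```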
def sgd(w, dw, config=None):
"""Stochastic gradient descent."""
if config is None:
config = {}
config.setdefault('learning_rate', 1e-2)
###########################################################################
# 9) Implement the vanilla stochastic gradient descent update formula. #
# ----------------------------------------------------------------------- #
# START OF YOUR CODE #
###########################################################################
###########################################################################
# END OF YOUR CODE #
###########################################################################
return w, config
class Solver:
"""Solver for training classification models.
Accepts both training data and validation labels so it can periodically check classification accuracy to watch out
for overfitting.
To train a model, you will first construct a Solver instance, passing the model, dataset, and various options
(learning rate, batch size, etc) to the constructor. You will then call the train() method to run the optimization
procedure and train the model.
After the train() method returns, model.params will contain the parameters that performed best on the validation
set over the course of training. In addition, the instance variable solver.loss_history will contain a list of all
losses encountered during training and the instance variables solver.train_acc_history and solver.val_acc_history
will be lists of the accuracies of the model on the training and validation set at each epoch.
Example usage might look something like this:
data = {
'X_train': # training data
'y_train': # training labels
'X_val': # validation data
'y_val': # validation labels
}
model = MyAwesomeModel(hidden_size=100, reg=10)
solver = Solver(
model, data,
optim_config={'learning_rate': 1e-3},
lr_decay=0.95,
num_epochs=10,
batch_size=100,
print_every=100)
solver.train()
A Solver works on a model object that must conform to the following API:
- model.params must be a dictionary mapping string parameter names to numpy arrays containing parameter values.
- model.loss(X, y) must be a function that computes training-time loss and gradients, and test-time classification
scores, with the following inputs and outputs:
Input:
- X: Array giving a minibatch of input data of shape (N, d_1, ..., d_k)
- y: Array of labels, of shape (N,) giving labels for X where y[i] is the label for X[i].
Return:
If y is None, run a test-time forward pass, and return:
- scores: Array of shape (N, C), where scores[i, c] is the classification score for X[i] and class c
If y is not None, run a training-time forward and backward pass, and return:
- loss: Scalar giving the loss
- grads: Dictionary with the same keys as self.params mapping parameter names to gradients of the loss
"""
def __init__(self, model, data, **kwargs):
"""Solver constructor.
Required arguments:
- model: A model object conforming to the API described above
- data: A dictionary of training and validation data containing:
'X_train': Array, shape (N_train, d_1, ..., d_k) of training images
'X_val': Array, shape (N_val, d_1, ..., d_k) of validation images
'y_train': Array, shape (N_train,) of labels for training images
'y_val': Array, shape (N_val,) of labels for validation images
Optional arguments:
- optim_config: A dictionary containing hyperparameters that will be passed to the chosen update rule. Each
update rule requires different hyperparameters (see optim.py) but all update rules require a 'learning_rate'
parameter so that should always be present
- lr_decay: A scalar for learning rate decay; after each epoch the learning rate is multiplied by this value
- batch_size: Size of minibatches used to compute loss and gradient during training
- num_epochs: The number of epochs to run for during training
- print_every: Integer; training losses will be printed every print_every iterations
- verbose: Boolean; if set to false then no output will be printed during training
- num_train_samples: Number of training samples used to check training accuracy; default is 1000; set to None
to use entire training set
- num_val_samples: Number of validation samples to use to check val accuracy; default is None, which uses the
entire validation set
- checkpoint_name: If not None, then save model checkpoints here every epoch
"""
self.model = model
self.X_train = data['X_train']
self.y_train = data['y_train']
self.X_val = data['X_val']
self.y_val = data['y_val']
# unpack keyword arguments
self.optim_config = kwargs.pop('optim_config', {})
self.lr_decay = kwargs.pop('lr_decay', 1.0)
self.batch_size = kwargs.pop('batch_size', 100)
self.num_epochs = kwargs.pop('num_epochs', 10)
self.num_train_samples = kwargs.pop('num_train_samples', 1000)
self.num_val_samples = kwargs.pop('num_val_samples', None)
self.checkpoint_name = kwargs.pop('checkpoint_name', None)
self.print_every = kwargs.pop('print_every', 10)
self.verbose = kwargs.pop('verbose', True)
# throw an error if there are extra keyword arguments
if len(kwargs) > 0:
extra = ', '.join(f'"{k}"' for k in list(kwargs.keys()))
raise ValueError(f'Unrecognized arguments {extra}')
# make sure the update rule exists, then replace the string name with the actual function
self.update_rule = sgd
self._reset()
def _reset(self):
"""Book-keeping variables for optimization; Do not call manually."""
# set up some variables for book-keeping
self.epoch = 0
self.best_val_acc = 0
self.best_params = {}
self.loss_history = []
self.train_acc_history = []
self.val_acc_history = []
# make a deep copy of the optim_config for each parameter
self.optim_configs = {}
for p in self.model.params:
d = {k: v for k, v in self.optim_config.items()}
self.optim_configs[p] = d
def _step(self):
"""Perform a single gradient update; called by "train" method."""
# make a minibatch of training data
num_train = self.X_train.shape[0]
batch_mask = np.random.choice(num_train, self.batch_size)
X_batch = self.X_train[batch_mask]
y_batch = self.y_train[batch_mask]
# compute loss and gradient
loss, grads = self.model.loss(X_batch, y_batch)
self.loss_history.append(loss)
# perform a parameter update
for p, w in self.model.params.items():
dw = grads[p]
config = self.optim_configs[p]
next_w, next_config = self.update_rule(w, dw, config)
self.model.params[p] = next_w
self.optim_configs[p] = next_config
def _save_checkpoint(self):
if self.checkpoint_name is None:
return
checkpoint = {
'model': self.model,
'update_rule': self.update_rule,
'lr_decay': self.lr_decay,
'optim_config': self.optim_config,
'batch_size': self.batch_size,
'num_train_samples': self.num_train_samples,
'num_val_samples': self.num_val_samples,
'epoch': self.epoch,
'loss_history': self.loss_history,
'train_acc_history': self.train_acc_history,
'val_acc_history': self.val_acc_history,
}
filename = f'{self.checkpoint_name}_epoch_{self.epoch}.pkl'
if self.verbose:
print(f'Saving checkpoint to "{filename}"')
with open(filename, 'wb') as f:
pickle.dump(checkpoint, f)
def check_accuracy(self, X, y, num_samples=None, batch_size=100):
"""Check accuracy of the model on the provided data.
Input:
- X: Array of data, of shape (N, d_1, ..., d_k)
- y: Array of labels, of shape (N,)
- num_samples: If not None, subsample the data and only test the model on num_samples datapoints
- batch_size: Split X and y into batches of this size to avoid using too much memory
Return:
- acc: Scalar giving the fraction of instances that were correctly classified by the model
"""
# subsample the data
N = X.shape[0]
if num_samples is not None and N > num_samples:
mask = np.random.choice(N, num_samples)
N = num_samples
X = X[mask]
y = y[mask]
# compute predictions in batches
num_batches = N // batch_size
if N % batch_size != 0:
num_batches += 1
y_pred = []
for i in range(num_batches):
start = i * batch_size
end = (i + 1) * batch_size
scores = self.model.loss(X[start:end])
y_pred.append(np.argmax(scores, axis=1))
y_pred = np.hstack(y_pred)
acc = np.mean(y_pred == y)
return acc
def train(self):
"""Perform optimization to train the model."""
num_train = self.X_train.shape[0]
iterations_per_epoch = max(num_train // self.batch_size, 1)
num_iterations = self.num_epochs * iterations_per_epoch
for t in tqdm(range(num_iterations)):
self._step()
# print training loss
if self.verbose and t % self.print_every == 0:
print(f'(Iteration {t + 1} / {num_iterations}) loss: {self.loss_history[-1]}')
# at the end of every epoch, increment the epoch counter and decay the learning rate
epoch_end = (t + 1) % iterations_per_epoch == 0
if epoch_end:
self.epoch += 1
for k in self.optim_configs:
self.optim_configs[k]['learning_rate'] *= self.lr_decay
# check train and val accuracy on the first iteration, the last iteration, and at the end of each epoch
first_it = t == 0
last_it = t == num_iterations - 1
if first_it or last_it or epoch_end:
train_acc = self.check_accuracy(self.X_train, self.y_train, num_samples=self.num_train_samples)
val_acc = self.check_accuracy(self.X_val, self.y_val, num_samples=self.num_val_samples)
self.train_acc_history.append(train_acc)
self.val_acc_history.append(val_acc)
self._save_checkpoint()
if self.verbose:
print(f'(Epoch {self.epoch} / {self.num_epochs}) train acc: {train_acc}; val_acc: {val_acc}')
# keep track of the best model
if val_acc > self.best_val_acc:
self.best_val_acc = val_acc
self.best_params = {}
for k, v in self.model.params.items():
self.best_params[k] = v.copy()
# at the end of training swap the best params into the model
self.model.params = self.best_params
input_size = 32 * 32 * 3
hidden_size = 50
num_classes = 10
model = Network(input_size, hidden_size, num_classes)
solver = None
###############################################################################
# 10) Use a 'Solver' instance to train the network. #
# --------------------------------------------------------------------------- #
# START OF YOUR CODE #
###############################################################################
###############################################################################
# END OF YOUR CODE #
###############################################################################
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(solver.loss_history, '.', alpha=0.5)
ax.set_xlabel('Iteration')
ax.set_ylabel('Training Loss')
pass
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(solver.train_acc_history, '-o', label='train')
ax.plot(solver.val_acc_history, '-o', label='val')
ax.plot([0.5] * len(solver.val_acc_history), 'k--')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
pass
weights = model.params['W1'].reshape(32, 32, 3, -1).transpose(3, 0, 1, 2)
# visualize W1 as a grid of images
ncols = 10
nrows = -(-len(weights) // ncols)  # ceiling division
fig, axs = plt.subplots(nrows, ncols, figsize=(ncols,nrows))
plt.subplots_adjust(wspace=0.1, hspace=0.1)
for i, ax in enumerate(axs.ravel()):
if i < len(weights):
ax.imshow(np.interp(weights[i], (weights[i].min(), weights[i].max()), (0, 255)).astype(int))
else:
ax.set_frame_on(False)
ax.set_xticks([])
ax.set_yticks([])
Hyperparameters:
Choose two or three hyperparameters, pick a small set of values for each, and loop over all combinations. Caution: the network must be trained once per combination, so limit the number of combinations to a reasonable amount.
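A sweep over all combinations can be organized with `itertools.product`. Here `train_and_eval` is a hypothetical placeholder: in the notebook it would build a `Network` with the given hyperparameters, train it with a `Solver`, and return the best validation accuracy (it is replaced below by a dummy deterministic score so the sketch runs on its own):

```python
from itertools import product

grid = {
    'hidden_dim': [50, 100],
    'learning_rate': [1e-3, 1e-4],
    'reg': [0.0, 0.1],
}

def train_and_eval(params):
    # placeholder: build Network(hidden_dim=..., reg=...), train with a
    # Solver using params['learning_rate'], return solver.best_val_acc
    return params['hidden_dim'] / 1000 + params['learning_rate'] - params['reg']

best_acc, best_params = -float('inf'), None
for values in product(*grid.values()):
    params = dict(zip(grid.keys(), values))
    acc = train_and_eval(params)
    if acc > best_acc:
        best_acc, best_params = acc, params
print(best_params)
```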
best_model = model
###############################################################################
# 11) Tune hyperparameters using the validation set. Store your best trained #
# model in 'best_model'. #
# #
# To help debug your network, it may help to use visualizations similar to #
# the ones we used above; these visualizations will have significant #
# qualitative differences from the ones we saw above for the poorly tuned #
# network. Please also feel free to produce output plots that demonstrate the #
# impact of pertinent parameters. #
# #
# Tweaking hyperparameters by hand can be fun, but you might find it useful #
# to write code to sweep through possible combinations of hyperparameters. #
# --------------------------------------------------------------------------- #
# START OF YOUR CODE #
###############################################################################
###############################################################################
# END OF YOUR CODE #
###############################################################################
y_val_pred = np.argmax(best_model.loss(data['X_val']), axis=1)
print('Validation accuracy:', (y_val_pred == data['y_val']).mean())
y_test_pred = np.argmax(best_model.loss(data['X_test']), axis=1)
print('Test accuracy:', (y_test_pred == data['y_test']).mean())
Adapted from https://github.com/cs231n/cs231n.github.io and the course 'Deep Learning for Medical Imaging' at Johns Hopkins University by Ali Uneri and Alex Sisniega (Spring 2022).
Ines Butz, 2022