#!/usr/bin/env python
# coding: utf-8

# In[121]:

import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F
from fastcore.test import test_close

torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])


# ## Initial setup

# ### Data

# In[122]:

n,m = x_train.shape
c = y_train.max()+1
nh = 50


# In[123]:

class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)]

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x


# In[124]:

model = Model(m, nh, 10)
pred = model(x_train)
pred.shape


# ### Cross entropy loss

# First, we will need to compute the softmax of our activations. This is defined by:
#
# $$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{e^{x_{0}} + e^{x_{1}} + \cdots + e^{x_{n-1}}}$$
#
# or more concisely:
#
# $$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum_{0 \leq j \leq n-1} e^{x_{j}}}$$
#
# In practice, we will need the log of the softmax when we calculate the loss.

# In[125]:

def log_softmax(x): return (x.exp()/(x.exp().sum(-1,keepdim=True))).log()


# In[126]:

log_softmax(pred)


# Note that the formula
#
# $$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$
#
# gives a simplification when we compute the log softmax, which was previously defined as `(x.exp()/(x.exp().sum(-1,keepdim=True))).log()`

# In[127]:

def log_softmax(x): return x - x.exp().sum(-1,keepdim=True).log()


# Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the [LogSumExp trick](https://en.wikipedia.org/wiki/LogSumExp). The idea is to use the following formula:
#
# $$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$
#
# where $a$ is the maximum of the $x_{j}$.

# In[128]:

def logsumexp(x):
    m = x.max(-1)[0]
    return m + (x-m[:,None]).exp().sum(-1).log()


# This way, we will avoid an overflow when taking the exponential of a big activation. In PyTorch, this is already implemented for us.

# In[129]:

def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)


# In[130]:

test_close(logsumexp(pred), pred.logsumexp(-1))
sm_pred = log_softmax(pred)
sm_pred


# The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:
#
# $$ -\sum x\, \log p(x) $$
#
# But since our $x$s are 1-hot encoded, this can be rewritten as $-\log(p_{i})$ where $i$ is the index of the desired target.
#
# This can be done using numpy-style [integer array indexing](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#integer-array-indexing). Note that PyTorch supports all the tricks in the advanced indexing methods discussed in that link.

# In[131]:

y_train[:3]


# In[132]:

sm_pred[0,5],sm_pred[1,0],sm_pred[2,4]


# In[133]:

sm_pred[[0,1,2], y_train[:3]]


# In[134]:

def nll(input, target): return -input[range(target.shape[0]), target].mean()


# In[135]:

loss = nll(sm_pred, y_train)
loss
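# As a quick sanity check (a sketch, not one of the original cells), we can confirm that the integer-indexing version of `nll` matches the full one-hot formulation $-\sum x\, \log p(x)$; `F.one_hot` builds the encoding for us:
#
# ```python
# # sketch: the one-hot dot-product form should give the same loss as integer indexing
# one_hot = F.one_hot(y_train, num_classes=10).float()
# test_close(-(one_hot * sm_pred).sum(-1).mean(), loss, 1e-3)
# ```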
# Then use PyTorch's implementation.

# In[136]:

test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)


# In PyTorch, `F.log_softmax` and `F.nll_loss` are combined in one optimized function, `F.cross_entropy`.

# In[137]:

test_close(F.cross_entropy(pred, y_train), loss, 1e-3)


# ## Basic training loop

# Basically the training loop repeats the following steps:
# - get the output of the model on a batch of inputs
# - compare the output to the labels we have and compute a loss
# - calculate the gradients of the loss with respect to every parameter of the model
# - update said parameters with those gradients to make them a little bit better

# In[138]:

loss_func = F.cross_entropy


# In[139]:

bs=64                  # batch size

xb = x_train[0:bs]     # a mini-batch from x
preds = model(xb)      # predictions
preds[0], preds.shape


# In[140]:

yb = y_train[0:bs]
loss_func(preds, yb)


# In[141]:

torch.argmax(preds, dim=1)


# In[142]:

def accuracy(out, yb): return (torch.argmax(out, dim=1)==yb).float().mean()


# In[143]:

accuracy(preds, yb)


# In[144]:

lr = 0.5   # learning rate
epochs = 3 # how many epochs to train for


# In[145]:

for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n,i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        if i==0: print(loss.item(), accuracy(preds, yb).item())
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias   -= l.bias.grad   * lr
                    l.weight.grad.zero_()
                    l.bias  .grad.zero_()


# ## Using parameters and optim

# ### Parameters

# Use `nn.Module.__setattr__`:

# In[146]:

class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in,nh)
        self.l2 = nn.Linear(nh,n_out)
        self.relu = nn.ReLU()

    def __call__(self, x): return self.l2(self.relu(self.l1(x)))


# In[147]:

model = Model(m, nh, 10)


# In[148]:

for name,l in model.named_children(): print(f"{name}: {l}")


# In[149]:

model


# In[150]:

model.l1


# In[151]:

def fit():
    for epoch in range(epochs):
        for i in range(0, n, bs):
            s = slice(i, min(n,i+bs))
            xb,yb = x_train[s],y_train[s]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            if i==0: print(loss.item(), accuracy(preds, yb).item())
            with torch.no_grad():
                for p in model.parameters(): p -= p.grad * lr
                model.zero_grad()


# In[152]:

fit()


# Behind the scenes, PyTorch overrides the `__setattr__` function in `nn.Module` so that the submodules you define are properly registered as parameters of the model.

# In[153]:

class DummyModule():
    def __init__(self, n_in, nh, n_out):
        self._modules = {}
        self.l1 = nn.Linear(n_in,nh)
        self.l2 = nn.Linear(nh,n_out)

    def __setattr__(self,k,v):
        if not k.startswith("_"): self._modules[k] = v
        super().__setattr__(k,v)

    def __repr__(self): return f'{self._modules}'

    def parameters(self):
        for l in self._modules.values():
            for p in l.parameters(): yield p


# In[154]:

mdl = DummyModule(m,nh,10)
mdl


# In[155]:

[o.shape for o in mdl.parameters()]


# ### Registering modules

# We can use the original `layers` approach, but we have to register the modules.

# In[156]:

layers = [nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10)]


# In[157]:

class Model(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        for i,l in enumerate(self.layers): self.add_module(f'layer_{i}', l)

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x


# In[158]:

model = Model(layers)


# In[159]:

model
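# As a quick check (a sketch, not one of the original cells), the `add_module` calls mean the layers in the plain Python list are now visible to `nn.Module`, so `model.parameters()` can find their weights and biases:
#
# ```python
# # sketch: the registered layers' parameters are now discoverable
# [p.shape for p in model.parameters()]
# # expected: [torch.Size([50, 784]), torch.Size([50]), torch.Size([10, 50]), torch.Size([10])]
# ```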
# ### nn.ModuleList

# `nn.ModuleList` does this for us.

# In[160]:

class SequentialModel(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x


# In[161]:

model = SequentialModel(layers)


# In[162]:

model


# In[163]:

fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# ### nn.Sequential

# `nn.Sequential` is a convenient class which does the same as the above:

# In[164]:

model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))


# In[165]:

fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# In[166]:

model


# ### optim

# Let's replace our previous manually coded optimization step:
#
# ```python
# with torch.no_grad():
#     for p in model.parameters(): p -= p.grad * lr
#     model.zero_grad()
# ```
#
# and instead use just:
#
# ```python
# opt.step()
# opt.zero_grad()
# ```

# In[167]:

class Optimizer():
    def __init__(self, params, lr=0.5): self.params,self.lr = list(params),lr

    def step(self):
        with torch.no_grad():
            for p in self.params: p -= p.grad * self.lr

    def zero_grad(self):
        for p in self.params: p.grad.data.zero_()


# In[168]:

model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))


# In[169]:

opt = Optimizer(model.parameters())


# In[170]:

for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n,i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        if i==0: print(loss.item(), accuracy(preds, yb).item())
        opt.step()
        opt.zero_grad()


# PyTorch already provides this exact functionality in `optim.SGD` (it also handles stuff like momentum, which we'll look at later).

# In[171]:

from torch import optim


# In[172]:

def get_model():
    model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))
    return model, optim.SGD(model.parameters(), lr=lr)


# In[173]:

model,opt = get_model()
loss_func(model(xb), yb)


# In[174]:

for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n,i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        if i==0: print(loss.item(), accuracy(preds, yb).item())
        opt.step()
        opt.zero_grad()


# ## Dataset and DataLoader

# ### Dataset

# It's clunky to iterate through minibatches of x and y values separately:
#
# ```python
# xb = x_train[s]
# yb = y_train[s]
# ```
#
# Instead, let's do these two steps together, by introducing a `Dataset` class:
#
# ```python
# xb,yb = train_ds[s]
# ```

# In[175]:

class Dataset():
    def __init__(self, x, y): self.x,self.y = x,y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i],self.y[i]


# In[176]:

train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
assert len(train_ds)==len(x_train)
assert len(valid_ds)==len(x_valid)


# In[177]:

xb,yb = train_ds[0:5]
assert xb.shape==(5,28*28)
assert yb.shape==(5,)
xb,yb


# In[178]:

model,opt = get_model()


# In[179]:

for epoch in range(epochs):
    for i in range(0, n, bs):
        xb,yb = train_ds[i:min(n,i+bs)]
        pred = model(xb)
        loss = loss_func(pred, yb)
        if i==0: print(loss.item(), accuracy(pred, yb).item())
        loss.backward()
        opt.step()
        opt.zero_grad()
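# As an aside (a sketch, not one of the original cells), PyTorch ships an equivalent class, `torch.utils.data.TensorDataset`, which wraps tensors and indexes them together in just the same way as our `Dataset`:
#
# ```python
# # sketch: TensorDataset behaves like our Dataset for tensor inputs
# from torch.utils.data import TensorDataset
# train_ds_pt = TensorDataset(x_train, y_train)
# xb,yb = train_ds_pt[0:5]          # a tuple of slices, just like train_ds[0:5]
# assert xb.shape==(5,28*28) and yb.shape==(5,)
# ```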
# ### DataLoader

# Previously, our loop iterated over batches (xb, yb) like this:
#
# ```python
# for i in range(0, n, bs):
#     xb,yb = train_ds[i:min(n,i+bs)]
#     ...
# ```
#
# Let's make our loop much cleaner, using a data loader:
#
# ```python
# for xb,yb in train_dl:
#     ...
# ```

# In[180]:

class DataLoader():
    def __init__(self, ds, bs): self.ds,self.bs = ds,bs
    def __iter__(self):
        for i in range(0, len(self.ds), self.bs): yield self.ds[i:i+self.bs]


# In[181]:

train_dl = DataLoader(train_ds, bs)
valid_dl = DataLoader(valid_ds, bs)


# In[182]:

xb,yb = next(iter(valid_dl))
xb.shape


# In[183]:

yb


# In[184]:

plt.imshow(xb[0].view(28,28))
yb[0]


# In[185]:

model,opt = get_model()


# In[186]:

def fit():
    for epoch in range(epochs):
        for xb,yb in train_dl:
            pred = model(xb)
            loss = loss_func(pred, yb)
            loss.backward()
            opt.step()
            opt.zero_grad()


# In[187]:

fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# ### Random sampling

# We want our training set to be in a random order, and that order should differ each iteration. But the validation set shouldn't be randomized.

# In[188]:

import random


# In[189]:

class Sampler():
    def __init__(self, ds, shuffle=False): self.n,self.shuffle = len(ds),shuffle
    def __iter__(self):
        res = list(range(self.n))
        if self.shuffle: random.shuffle(res)
        return iter(res)


# In[190]:

from itertools import islice


# In[191]:

ss = Sampler(train_ds)


# In[192]:

it = iter(ss)
for o in range(5): print(next(it))


# In[193]:

list(islice(ss, 5))


# In[194]:

ss = Sampler(train_ds, shuffle=True)
list(islice(ss, 5))


# In[195]:

import fastcore.all as fc


# In[196]:

class BatchSampler():
    def __init__(self, sampler, bs, drop_last=False): fc.store_attr()
    def __iter__(self): yield from fc.chunked(iter(self.sampler), self.bs, drop_last=self.drop_last)


# In[197]:

batchs = BatchSampler(ss, 4)
list(islice(batchs, 5))


# In[198]:

def collate(b):
    xs,ys = zip(*b)
    return torch.stack(xs),torch.stack(ys)


# In[199]:

class DataLoader():
    def __init__(self, ds, batchs, collate_fn=collate): fc.store_attr()
    def __iter__(self): yield from (self.collate_fn(self.ds[i] for i in b) for b in self.batchs)


# In[200]:

train_samp = BatchSampler(Sampler(train_ds, shuffle=True ), bs)
valid_samp = BatchSampler(Sampler(valid_ds, shuffle=False), bs)


# In[201]:

train_dl = DataLoader(train_ds, batchs=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batchs=valid_samp, collate_fn=collate)


# In[202]:

xb,yb = next(iter(valid_dl))
plt.imshow(xb[0].view(28,28))
yb[0]


# In[203]:

xb.shape,yb.shape


# In[204]:

model,opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# ### Multiprocessing DataLoader

# In[205]:

import torch.multiprocessing as mp
from fastcore.basics import store_attr


# In[206]:

class DataLoader():
    def __init__(self, ds, batchs, n_workers=1, collate_fn=collate): fc.store_attr()
    def __iter__(self):
        with mp.Pool(self.n_workers) as ex:
            yield from ex.map(self.ds.__getitem__, iter(self.batchs))


# In[207]:

train_dl = DataLoader(train_ds, batchs=train_samp, collate_fn=collate, n_workers=2)
it = iter(train_dl)


# In[208]:

xb,yb = next(it)
xb.shape,yb.shape


# ### PyTorch DataLoader

# In[209]:

from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, BatchSampler


# In[210]:

train_samp = BatchSampler(RandomSampler(train_ds),     bs, drop_last=False)
valid_samp = BatchSampler(SequentialSampler(valid_ds), bs, drop_last=False)


# In[211]:

train_dl = DataLoader(train_ds, batch_sampler=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batch_sampler=valid_samp, collate_fn=collate)


# In[212]:

model,opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# PyTorch can auto-generate the BatchSampler for us:

# In[213]:

train_dl = DataLoader(train_ds, bs, sampler=RandomSampler(train_ds),     collate_fn=collate)
valid_dl = DataLoader(valid_ds, bs, sampler=SequentialSampler(valid_ds), collate_fn=collate)
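# A quick check (a sketch, not one of the original cells) that these auto-generated loaders yield batches of the shape we expect:
#
# ```python
# # sketch: the first training batch should be a full mini-batch of flattened images
# xb,yb = next(iter(train_dl))
# assert xb.shape==(bs,28*28) and yb.shape==(bs,)
# ```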
# PyTorch can also generate the Sequential/RandomSamplers too:

# In[214]:

train_dl = DataLoader(train_ds, bs, shuffle=True, drop_last=True, num_workers=2)
valid_dl = DataLoader(valid_ds, bs, shuffle=False, num_workers=2)


# In[215]:

model,opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# Our dataset actually already knows how to sample a batch of indices all at once:

# In[216]:

train_ds[[4,6,7]]


# ...that means that we can actually skip the batch_sampler and collate_fn entirely:

# In[217]:

train_dl = DataLoader(train_ds, sampler=train_samp)
valid_dl = DataLoader(valid_ds, sampler=valid_samp)


# In[218]:

xb,yb = next(iter(train_dl))
xb.shape,yb.shape


# ## Validation

# You should **always** also have a [validation set](http://www.fast.ai/2017/11/13/validation-sets/), in order to identify if you are overfitting.
#
# We will calculate and print the validation loss at the end of each epoch.
#
# (Note that we always call `model.train()` before training, and `model.eval()` before inference, because these are used by layers such as `nn.BatchNorm2d` and `nn.Dropout` to ensure appropriate behaviour for these different phases.)

# In[105]:

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb,yb in train_dl:
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()

        model.eval()
        with torch.no_grad():
            tot_loss,tot_acc,count = 0.,0.,0
            for xb,yb in valid_dl:
                pred = model(xb)
                n = len(xb)
                count += n
                tot_loss += loss_func(pred, yb).item()*n
                tot_acc  += accuracy (pred,yb).item()*n
        print(epoch, tot_loss/count, tot_acc/count)
    return tot_loss/count, tot_acc/count


# In[106]:

def get_dls(train_ds, valid_ds, bs, **kwargs):
    return (DataLoader(train_ds, batch_size=bs,   shuffle=True, **kwargs),
            DataLoader(valid_ds, batch_size=bs*2, **kwargs))  # validation can use a larger batch: no gradients are stored


# Now, our whole process of obtaining the data loaders and fitting the model can be run in 3 lines of code:

# In[107]:

train_dl,valid_dl = get_dls(train_ds, valid_ds, bs)
model,opt = get_model()
loss,acc = fit(5, model, loss_func, opt, train_dl, valid_dl)
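# To see why the `model.train()`/`model.eval()` calls in `fit` matter, here is a small illustration (a sketch, not one of the original cells) with `nn.Dropout`, one of the layers whose behaviour depends on the mode:
#
# ```python
# # sketch: Dropout zeroes activations only in training mode
# drop = nn.Dropout(0.5)
# x = torch.ones(8)
# drop.train(); print(drop(x))   # roughly half the values zeroed, the rest scaled by 2
# drop.eval();  print(drop(x))   # identity: all ones
# ```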