#!/usr/bin/env python
# coding: utf-8

# In[121]:

import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F
from fastcore.test import test_close

torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])


# ## Initial setup

# ### Data

# In[122]:

n,m = x_train.shape
c = y_train.max()+1
nh = 50


# In[123]:

class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)]

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x


# In[124]:

model = Model(m, nh, 10)
pred = model(x_train)
pred.shape


# ### Cross entropy loss

# First, we will need to compute the softmax of our activations. This is defined by:
#
# $$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{e^{x_{0}} + e^{x_{1}} + \cdots + e^{x_{n-1}}}$$
#
# or more concisely:
#
# $$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum_{0 \leq j \leq n-1} e^{x_{j}}}$$
#
# In practice, we will need the log of the softmax when we calculate the loss.

# In[125]:

def log_softmax(x): return (x.exp()/(x.exp().sum(-1,keepdim=True))).log()


# In[126]:

log_softmax(pred)


# Note that the formula
#
# $$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$
#
# gives a simplification when we compute the log softmax, which was previously defined as `(x.exp()/(x.exp().sum(-1,keepdim=True))).log()`

# In[127]:

def log_softmax(x): return x - x.exp().sum(-1,keepdim=True).log()


# Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the [LogSumExp trick](https://en.wikipedia.org/wiki/LogSumExp). The idea is to use the following formula:
#
# $$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$
#
# where $a$ is the maximum of the $x_{j}$.

# In[128]:

def logsumexp(x):
    m = x.max(-1)[0]
    return m + (x-m[:,None]).exp().sum(-1).log()


# This way, we will avoid an overflow when taking the exponential of a big activation. In PyTorch, this is already implemented for us.

# In[129]:

def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)


# In[130]:

test_close(logsumexp(pred), pred.logsumexp(-1))
sm_pred = log_softmax(pred)
sm_pred


# The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:
#
# $$ -\sum x\, \log p(x) $$
#
# But since our $x$s are 1-hot encoded, this can be rewritten as $-\log(p_{i})$ where $i$ is the index of the desired target.
#
# This can be done using numpy-style [integer array indexing](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#integer-array-indexing). Note that PyTorch supports all the tricks in the advanced indexing methods discussed in that link.

# In[131]:

y_train[:3]


# In[132]:

sm_pred[0,5],sm_pred[1,0],sm_pred[2,4]


# In[133]:

sm_pred[[0,1,2], y_train[:3]]


# In[134]:

def nll(input, target): return -input[range(target.shape[0]), target].mean()


# In[135]:

loss = nll(sm_pred, y_train)
loss
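# As a quick sanity check (a sketch, not one of the original cells), we can confirm that the integer-indexing version of `nll` matches the full one-hot formulation $-\sum x\, \log p(x)$; `F.one_hot` builds the encoding for us:
#
# ```python
# # sketch: the one-hot dot-product form should give the same loss as integer indexing
# one_hot = F.one_hot(y_train, num_classes=10).float()
# test_close(-(one_hot * sm_pred).sum(-1).mean(), loss, 1e-3)
# ```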
# Then use PyTorch's implementation.

# In[136]:

test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)


# In PyTorch, `F.log_softmax` and `F.nll_loss` are combined in one optimized function, `F.cross_entropy`.

# In[137]:

test_close(F.cross_entropy(pred, y_train), loss, 1e-3)


# ## Basic training loop

# Basically the training loop repeats the following steps:
# - get the output of the model on a batch of inputs
# - compare the output to the labels we have and compute a loss
# - calculate the gradients of the loss with respect to every parameter of the model
# - update said parameters with those gradients to make them a little bit better

# In[138]:

loss_func = F.cross_entropy


# In[139]:

bs=64                  # batch size

xb = x_train[0:bs]     # a mini-batch from x
preds = model(xb)      # predictions
preds[0], preds.shape


# In[140]:

yb = y_train[0:bs]
loss_func(preds, yb)


# In[141]:

torch.argmax(preds, dim=1)


# In[142]:

def accuracy(out, yb): return (torch.argmax(out, dim=1)==yb).float().mean()


# In[143]:

accuracy(preds, yb)


# In[144]:

lr = 0.5   # learning rate
epochs = 3 # how many epochs to train for


# In[145]:

for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n,i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        if i==0: print(loss.item(), accuracy(preds, yb).item())
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias   -= l.bias.grad   * lr
                    l.weight.grad.zero_()
                    l.bias  .grad.zero_()


# ## Using parameters and optim

# ### Parameters

# Use `nn.Module.__setattr__`:

# In[146]:

class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in,nh)
        self.l2 = nn.Linear(nh,n_out)
        self.relu = nn.ReLU()

    def __call__(self, x): return self.l2(self.relu(self.l1(x)))


# In[147]:

model = Model(m, nh, 10)


# In[148]:

for name,l in model.named_children(): print(f"{name}: {l}")


# In[149]:

model


# In[150]:

model.l1


# In[151]:

def fit():
    for epoch in range(epochs):
        for i in range(0, n, bs):
            s = slice(i, min(n,i+bs))
            xb,yb = x_train[s],y_train[s]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            if i==0: print(loss.item(), accuracy(preds, yb).item())
            with torch.no_grad():
                for p in model.parameters(): p -= p.grad * lr
                model.zero_grad()


# In[152]:

fit()


# Behind the scenes, PyTorch overrides the `__setattr__` function in `nn.Module` so that the submodules you define are properly registered as parameters of the model.

# In[153]:

class DummyModule():
    def __init__(self, n_in, nh, n_out):
        self._modules = {}
        self.l1 = nn.Linear(n_in,nh)
        self.l2 = nn.Linear(nh,n_out)

    def __setattr__(self,k,v):
        if not k.startswith("_"): self._modules[k] = v
        super().__setattr__(k,v)

    def __repr__(self): return f'{self._modules}'

    def parameters(self):
        for l in self._modules.values():
            for p in l.parameters(): yield p


# In[154]:

mdl = DummyModule(m,nh,10)
mdl


# In[155]:

[o.shape for o in mdl.parameters()]


# ### Registering modules

# We can use the original `layers` approach, but we have to register the modules.

# In[156]:

layers = [nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10)]


# In[157]:

class Model(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        for i,l in enumerate(self.layers): self.add_module(f'layer_{i}', l)

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x


# In[158]:

model = Model(layers)


# In[159]:

model
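# As a quick check (a sketch, not one of the original cells), the `add_module` calls mean the layers in the plain Python list are now visible to `nn.Module`, so `model.parameters()` can find their weights and biases:
#
# ```python
# # sketch: the registered layers' parameters are now discoverable
# [p.shape for p in model.parameters()]
# # expected: [torch.Size([50, 784]), torch.Size([50]), torch.Size([10, 50]), torch.Size([10])]
# ```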
# ### nn.ModuleList

# `nn.ModuleList` does this for us.

# In[160]:

class SequentialModel(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x


# In[161]:

model = SequentialModel(layers)


# In[162]:

model


# In[163]:

fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# ### nn.Sequential

# `nn.Sequential` is a convenient class which does the same as the above:

# In[164]:

model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))


# In[165]:

fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# In[166]:

model


# ### optim

# Let's replace our previous manually coded optimization step:
#
# ```python
# with torch.no_grad():
#     for p in model.parameters(): p -= p.grad * lr
#     model.zero_grad()
# ```
#
# and instead use just:
#
# ```python
# opt.step()
# opt.zero_grad()
# ```

# In[167]:

class Optimizer():
    def __init__(self, params, lr=0.5): self.params,self.lr = list(params),lr

    def step(self):
        with torch.no_grad():
            for p in self.params: p -= p.grad * self.lr

    def zero_grad(self):
        for p in self.params: p.grad.data.zero_()


# In[168]:

model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))


# In[169]:

opt = Optimizer(model.parameters())


# In[170]:

for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n,i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        if i==0: print(loss.item(), accuracy(preds, yb).item())
        opt.step()
        opt.zero_grad()


# PyTorch already provides this exact functionality in `optim.SGD` (it also handles stuff like momentum, which we'll look at later).

# In[171]:

from torch import optim


# In[172]:

def get_model():
    model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))
    return model, optim.SGD(model.parameters(), lr=lr)


# In[173]:

model,opt = get_model()
loss_func(model(xb), yb)


# In[174]:

for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n,i+bs))
        xb,yb = x_train[s],y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        if i==0: print(loss.item(), accuracy(preds, yb).item())
        opt.step()
        opt.zero_grad()


# ## Dataset and DataLoader

# ### Dataset

# It's clunky to iterate through minibatches of x and y values separately:
#
# ```python
# xb = x_train[s]
# yb = y_train[s]
# ```
#
# Instead, let's do these two steps together, by introducing a `Dataset` class:
#
# ```python
# xb,yb = train_ds[s]
# ```

# In[175]:

class Dataset():
    def __init__(self, x, y): self.x,self.y = x,y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i],self.y[i]


# In[176]:

train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
assert len(train_ds)==len(x_train)
assert len(valid_ds)==len(x_valid)


# In[177]:

xb,yb = train_ds[0:5]
assert xb.shape==(5,28*28)
assert yb.shape==(5,)
xb,yb


# In[178]:

model,opt = get_model()


# In[179]:

for epoch in range(epochs):
    for i in range(0, n, bs):
        xb,yb = train_ds[i:min(n,i+bs)]
        pred = model(xb)
        loss = loss_func(pred, yb)
        if i==0: print(loss.item(), accuracy(pred, yb).item())
        loss.backward()
        opt.step()
        opt.zero_grad()
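# As an aside (a sketch, not one of the original cells), PyTorch ships an equivalent class, `torch.utils.data.TensorDataset`, which wraps tensors and indexes them together in just the same way as our `Dataset`:
#
# ```python
# # sketch: TensorDataset behaves like our Dataset for tensor inputs
# from torch.utils.data import TensorDataset
# train_ds_pt = TensorDataset(x_train, y_train)
# xb,yb = train_ds_pt[0:5]          # a tuple of slices, just like train_ds[0:5]
# assert xb.shape==(5,28*28) and yb.shape==(5,)
# ```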
# ### DataLoader

# Previously, our loop iterated over batches (xb, yb) like this:
#
# ```python
# for i in range(0, n, bs):
#     xb,yb = train_ds[i:min(n,i+bs)]
#     ...
# ```
#
# Let's make our loop much cleaner, using a data loader:
#
# ```python
# for xb,yb in train_dl:
#     ...
# ```

# In[180]:

class DataLoader():
    def __init__(self, ds, bs): self.ds,self.bs = ds,bs
    def __iter__(self):
        for i in range(0, len(self.ds), self.bs): yield self.ds[i:i+self.bs]


# In[181]:

train_dl = DataLoader(train_ds, bs)
valid_dl = DataLoader(valid_ds, bs)


# In[182]:

xb,yb = next(iter(valid_dl))
xb.shape


# In[183]:

yb


# In[184]:

plt.imshow(xb[0].view(28,28))
yb[0]


# In[185]:

model,opt = get_model()


# In[186]:

def fit():
    for epoch in range(epochs):
        for xb,yb in train_dl:
            pred = model(xb)
            loss = loss_func(pred, yb)
            loss.backward()
            opt.step()
            opt.zero_grad()


# In[187]:

fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# ### Random sampling

# We want our training set to be in a random order, and that order should differ each iteration. But the validation set shouldn't be randomized.

# In[188]:

import random


# In[189]:

class Sampler():
    def __init__(self, ds, shuffle=False): self.n,self.shuffle = len(ds),shuffle
    def __iter__(self):
        res = list(range(self.n))
        if self.shuffle: random.shuffle(res)
        return iter(res)


# In[190]:

from itertools import islice


# In[191]:

ss = Sampler(train_ds)


# In[192]:

it = iter(ss)
for o in range(5): print(next(it))


# In[193]:

list(islice(ss, 5))


# In[194]:

ss = Sampler(train_ds, shuffle=True)
list(islice(ss, 5))


# In[195]:

import fastcore.all as fc


# In[196]:

class BatchSampler():
    def __init__(self, sampler, bs, drop_last=False): fc.store_attr()
    def __iter__(self): yield from fc.chunked(iter(self.sampler), self.bs, drop_last=self.drop_last)


# In[197]:

batchs = BatchSampler(ss, 4)
list(islice(batchs, 5))


# In[198]:

def collate(b):
    xs,ys = zip(*b)
    return torch.stack(xs),torch.stack(ys)


# In[199]:

class DataLoader():
    def __init__(self, ds, batchs, collate_fn=collate): fc.store_attr()
    def __iter__(self): yield from (self.collate_fn(self.ds[i] for i in b) for b in self.batchs)


# In[200]:

train_samp = BatchSampler(Sampler(train_ds, shuffle=True ), bs)
valid_samp = BatchSampler(Sampler(valid_ds, shuffle=False), bs)


# In[201]:

train_dl = DataLoader(train_ds, batchs=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batchs=valid_samp, collate_fn=collate)


# In[202]:

xb,yb = next(iter(valid_dl))
plt.imshow(xb[0].view(28,28))
yb[0]


# In[203]:

xb.shape,yb.shape


# In[204]:

model,opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# ### Multiprocessing DataLoader

# In[205]:

import torch.multiprocessing as mp
from fastcore.basics import store_attr


# In[206]:

class DataLoader():
    def __init__(self, ds, batchs, n_workers=1, collate_fn=collate): fc.store_attr()
    def __iter__(self):
        with mp.Pool(self.n_workers) as ex:
            yield from ex.map(self.ds.__getitem__, iter(self.batchs))


# In[207]:

train_dl = DataLoader(train_ds, batchs=train_samp, collate_fn=collate, n_workers=2)
it = iter(train_dl)


# In[208]:

xb,yb = next(it)
xb.shape,yb.shape


# ### PyTorch DataLoader

# In[209]:

from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, BatchSampler


# In[210]:

train_samp = BatchSampler(RandomSampler(train_ds),     bs, drop_last=False)
valid_samp = BatchSampler(SequentialSampler(valid_ds), bs, drop_last=False)


# In[211]:

train_dl = DataLoader(train_ds, batch_sampler=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batch_sampler=valid_samp, collate_fn=collate)


# In[212]:

model,opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# PyTorch can auto-generate the BatchSampler for us:

# In[213]:

train_dl = DataLoader(train_ds, bs, sampler=RandomSampler(train_ds),     collate_fn=collate)
valid_dl = DataLoader(valid_ds, bs, sampler=SequentialSampler(valid_ds), collate_fn=collate)
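# A quick check (a sketch, not one of the original cells) that these auto-generated loaders yield batches of the shape we expect:
#
# ```python
# # sketch: the first training batch should be a full mini-batch of flattened images
# xb,yb = next(iter(train_dl))
# assert xb.shape==(bs,28*28) and yb.shape==(bs,)
# ```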
# PyTorch can also generate the Sequential/RandomSamplers too:

# In[214]:

train_dl = DataLoader(train_ds, bs, shuffle=True, drop_last=True, num_workers=2)
valid_dl = DataLoader(valid_ds, bs, shuffle=False, num_workers=2)


# In[215]:

model,opt = get_model()
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)


# Our dataset actually already knows how to sample a batch of indices all at once:

# In[216]:

train_ds[[4,6,7]]


# ...that means that we can actually skip the batch_sampler and collate_fn entirely:

# In[217]:

train_dl = DataLoader(train_ds, sampler=train_samp)
valid_dl = DataLoader(valid_ds, sampler=valid_samp)


# In[218]:

xb,yb = next(iter(train_dl))
xb.shape,yb.shape


# ## Validation

# You should **always** also have a [validation set](http://www.fast.ai/2017/11/13/validation-sets/), in order to identify if you are overfitting.
#
# We will calculate and print the validation loss at the end of each epoch.
#
# (Note that we always call `model.train()` before training, and `model.eval()` before inference, because these are used by layers such as `nn.BatchNorm2d` and `nn.Dropout` to ensure appropriate behaviour for these different phases.)

# In[105]:

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb,yb in train_dl:
            loss = loss_func(model(xb), yb)
            loss.backward()
            opt.step()
            opt.zero_grad()

        model.eval()
        with torch.no_grad():
            tot_loss,tot_acc,count = 0.,0.,0
            for xb,yb in valid_dl:
                pred = model(xb)
                n = len(xb)
                count += n
                tot_loss += loss_func(pred, yb).item()*n
                tot_acc  += accuracy (pred,yb).item()*n
        print(epoch, tot_loss/count, tot_acc/count)
    return tot_loss/count, tot_acc/count


# In[106]:

def get_dls(train_ds, valid_ds, bs, **kwargs):
    return (DataLoader(train_ds, batch_size=bs,   shuffle=True, **kwargs),
            DataLoader(valid_ds, batch_size=bs*2, **kwargs))  # validation can use a larger batch: no gradients are stored


# Now, our whole process of obtaining the data loaders and fitting the model can be run in 3 lines of code:

# In[107]:

train_dl,valid_dl = get_dls(train_ds, valid_ds, bs)
model,opt = get_model()
loss,acc = fit(5, model, loss_func, opt, train_dl, valid_dl)
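# To see why the `model.train()`/`model.eval()` calls in `fit` matter, here is a small illustration (a sketch, not one of the original cells) with `nn.Dropout`, one of the layers whose behaviour depends on the mode:
#
# ```python
# # sketch: Dropout zeroes activations only in training mode
# drop = nn.Dropout(0.5)
# x = torch.ones(8)
# drop.train(); print(drop(x))   # roughly half the values zeroed, the rest scaled by 2
# drop.eval();  print(drop(x))   # identity: all ones
# ```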