%install '.package(path: "$cwd/FastaiNotebook_02a_why_sqrt5")' FastaiNotebook_02a_why_sqrt5
Installing packages:
	.package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_02a_why_sqrt5")
		FastaiNotebook_02a_why_sqrt5
With SwiftPM flags: []
Working in: /tmp/tmph6cxueln
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 2.07s
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'FastaiNotebook_02a_why_sqrt5' (5 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Loading library...
Installation complete!
import FastaiNotebook_02a_why_sqrt5
// export
import Path
import TensorFlow
var (xTrain,yTrain,xValid,yValid) = loadMNIST(path: Path.home/".fastai"/"data"/"mnist_tst", flat: true)
let trainMean = xTrain.mean()
let trainStd = xTrain.standardDeviation()
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)
let (n,m) = (Int(xTrain.shape[0]),Int(xTrain.shape[1]))
let c = yTrain.max()+1
print(n,m,c)
60000 784 10
Those can't be used to define a model because they aren't Ints, though...
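If we wanted to avoid hard-coding them, a sketch like the following (hypothetical names, and assuming scalarized() is available in this version of the API) converts the tensor statistics back into plain Ints:

// Hypothetical alternative to the hard-coded constants below:
// pull the values back out of the tensors as Ints.
let nInt = Int(xTrain.shape[0])
let mInt = Int(xTrain.shape[1])
let cInt = Int(yTrain.max().scalarized()) + 1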
let (n,m) = (60000,784)
let c = 10
let nHid = 50
public struct MyModel: Layer {
    public var layer1: FADense<Float>
    public var layer2: FADense<Float>
    
    public init(nIn: Int, nHid: Int, nOut: Int) {
        layer1 = FADense(inputSize: nIn, outputSize: nHid, activation: relu)
        layer2 = FADense(inputSize: nHid, outputSize: nOut)
    }
    
    @differentiable
    public func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {
        return input.sequenced(in: context, through: layer1, layer2)
    }
}
var model = MyModel(nIn: m, nHid: nHid, nOut: c)
let pred = model.applied(to: xTrain)
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {
    let exped = exp(activations)
    return log(exped / exped.sum(alongAxes: -1))
}
let smPred = logSoftmax(pred)
yTrain[0..<3]
[5, 0, 4]
(smPred[0][5],smPred[1][0],smPred[2][4])
▿ 3 elements
  - .0 : -2.0715566
  - .1 : -2.028722
  - .2 : -2.1230843
There is no fancy indexing yet, so we have to use gatherNd to pull the values we want out of our log-softmaxed predictions.
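As a tiny illustration with made-up numbers (not part of the notebook), gatherNd applied to a 2-D tensor with (row, column) index pairs picks out one element per pair:

// Hypothetical mini-example: pick params[0, 2] and params[1, 0].
let tinyParams: Tensor<Float> = [[10, 11, 12], [20, 21, 22]]
let tinyIndices: Tensor<Int32> = [[0, 2], [1, 0]]
print(Raw.gatherNd(params: tinyParams, indices: tinyIndices))  // [12.0, 20.0]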
func nll<Scalar>(_ input: Tensor<Scalar>, _ target: Tensor<Int32>) -> Tensor<Scalar>
    where Scalar: TensorFlowFloatingPoint {
    // Build (row, class) index pairs; the range is hard-coded to the
    // 60,000 training examples, so this only works on the full training set.
    let idx: Tensor<Int32> = Raw.range(start: Tensor(0), limit: Tensor(60000), delta: Tensor(1))
    let indices = Raw.concat(concatDim: Tensor(1), [idx.expandingShape(at: 1), target.expandingShape(at: 1)])
    let losses = Raw.gatherNd(params: input, indices: indices)
    return -losses.mean()
}
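Since the range above is hard-coded to the 60,000 training rows, a batch-size-agnostic variant might look like this sketch (hypothetical name nllAnyBatch, assuming input.shape[0] gives the number of rows):

func nllAnyBatch<Scalar>(_ input: Tensor<Scalar>, _ target: Tensor<Int32>) -> Tensor<Scalar>
    where Scalar: TensorFlowFloatingPoint {
    // Same as nll above, but the row range follows the actual batch size.
    let rows = Int32(input.shape[0])
    let idx: Tensor<Int32> = Raw.range(start: Tensor(0), limit: Tensor(rows), delta: Tensor(1))
    let indices = Raw.concat(concatDim: Tensor(1), [idx.expandingShape(at: 1), target.expandingShape(at: 1)])
    return -Raw.gatherNd(params: input, indices: indices).mean()
}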
nll(smPred, yTrain)
2.6719995
time(repeating: 100){ let _ = nll(smPred, yTrain) }
1.0019715 ms
Simplify logSoftmax using the rules of logarithms.
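Spelled out (with $x$ denoting a row of activations, notation mine), the identity the simplified version below relies on is:

$$\log\big(\mathrm{softmax}(x)_i\big) \;=\; \log\frac{e^{x_i}}{\sum_j e^{x_j}} \;=\; x_i - \log\sum_j e^{x_j}$$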
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {
    return activations - log(exp(activations).sum(alongAxes: -1))
}
let smPred = logSoftmax(pred)
nll(smPred, yTrain)
2.6719995
Use the LogSumExp trick for numerical stability.
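Concretely, with $a = \max_j x_j$ (notation mine), the identity the logSumExp function below implements is:

$$\log\sum_{j} e^{x_j} \;=\; \log\Big(e^{a}\sum_{j} e^{x_j-a}\Big) \;=\; a + \log\sum_{j} e^{x_j-a}$$

Subtracting the maximum before exponentiating keeps the exponentials from overflowing.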
smPred.max(alongAxes: -1).shape
▿ TensorShape
▿ dimensions : 2 elements
- 0 : 60000
- 1 : 1
func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {
    let m = x.max(alongAxes: -1)
    return m + log(exp(x - m).sum(alongAxes: -1))
}

func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {
    return activations - logSumExp(activations)
}
let smPred = logSoftmax(pred)
nll(smPred, yTrain)
2.6719997
In S4TF, the nll loss is combined with log softmax in softmaxCrossEntropy:
let loss = softmaxCrossEntropy(logits: pred, labels: yTrain)
loss
2.6719995
time(repeating: 100){ let _ = nll(logSoftmax(pred), yTrain)}
1.33812186 ms
time(repeating: 100){ let _ = softmaxCrossEntropy(logits: pred, labels: yTrain)}
1.0861205000000003 ms
Basically the training loop repeats the following steps: grab a mini-batch of inputs, run the model on it to get predictions, compare those predictions to the labels to compute a loss, compute the gradients of the loss with respect to every model parameter, and update the parameters with those gradients.
// export
public func accuracy(_ output: Tensor<Float>, _ target: Tensor<Int32>) -> Tensor<Float> {
    let corrects = Tensor<Float>(output.argmax(squeezingAxis: 1) .== target)
    return corrects.mean()
}
print(accuracy(pred, yTrain))
0.14516667
let bs: Int32 = 64                  // batch size
let xb = xTrain[0..<bs]             // a mini-batch from x
let preds = model.applied(to: xb)   // predictions
print(preds[0], preds.shape)
[ -1.7508852, -0.42652333, 0.015924871, 1.145769, -2.150978, 0.233792, 0.027202263, 0.7222841, 0.66757137, 2.54584] TensorShape(dimensions: [64, 10])
let yb = yTrain[0..<bs]
let loss = softmaxCrossEntropy(logits: preds, labels: yb)
print(accuracy(preds, yb))
0.125
let lr: Float = 0.5  // learning rate
let epochs = 1 // how many epochs to train for
To get all the gradients we need a training context.
let trainingContext = Context(learningPhase: .training)
Then we can compute the loss and the gradients:
let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
    let preds = model.applied(to: xb, in: trainingContext)
    return softmaxCrossEntropy(logits: preds, labels: yb)
}
Now let's write the training loop by hand:
for epoch in 1...epochs {
    for i in 0..<((n-1)/Int(bs)) {
        let startIdx = Int32(i) * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        model.layer1.weight -= lr * grads.layer1.weight
        model.layer1.bias   -= lr * grads.layer1.bias
        model.layer2.weight -= lr * grads.layer2.weight
        model.layer2.bias   -= lr * grads.layer2.bias
    }
}
let preds = model.applied(to: xValid)
accuracy(preds, yValid)
0.9305
93% in one epoch, not too bad!
Naming every parameter by hand is tedious. We can use the model's AllDifferentiableVariables to access them all through key paths.
for epoch in 1...epochs {
    for i in 0..<((n-1)/Int(bs)) {
        let startIdx = Int32(i) * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        // allDifferentiableVariables is a value, so we mutate a local copy
        // through its key paths and write it back to the model.
        var parameters = model.allDifferentiableVariables
        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
            parameters[keyPath: kp] -= lr * grads[keyPath: kp]
        }
        model.allDifferentiableVariables = parameters
    }
}
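To see exactly which tensors that key-path loop visits, a quick sketch (using the same recursivelyAllWritableKeyPaths API as above) prints their shapes:

// Expect the four parameter tensors: [784, 50], [50], [50, 10], [10].
for kp in model.allDifferentiableVariables.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
    print(model.allDifferentiableVariables[keyPath: kp].shape)
}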
Then we can use an S4TF optimizer to do the update step for us.
let optimizer = SGD<MyModel, Float>(learningRate: lr)

for epoch in 1...epochs {
    for i in 0..<((n-1)/Int(bs)) {
        let startIdx = Int32(i) * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}
We can create a Swift Dataset from our tensors. It will automatically batch things for us.
// export
public struct DataBatch<Inputs: Differentiable & TensorGroup, Labels: TensorGroup>: TensorGroup {
    public var xb: Inputs
    public var yb: Labels
    
    public init(xb: Inputs, yb: Labels) {
        self.xb = xb
        self.yb = yb
    }
}
let train_ds: Dataset<DataBatch<Tensor<Float>, Tensor<Int32>>> = Dataset(elements: DataBatch(xb: xTrain, yb: yTrain)).batched(Int64(bs))
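As a quick sanity check (a sketch, assuming the Dataset above built as expected), we can peek at the first batch's shapes:

// Expect [64, 784] for the inputs and [64] for the labels.
for batch in train_ds {
    print(batch.xb.shape, batch.yb.shape)
    break
}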
for epoch in 1...epochs {
    for batch in train_ds {
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: batch.xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}
This Dataset can also do the shuffling for us:
for epoch in 1...epochs {
    for batch in train_ds.shuffled() {
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: batch.xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}
Finally, let's wrap all of this in a generic train function.

public func train<Opt: Optimizer, Labels: TensorGroup>(
    _ model: inout Opt.Model,
    on dataset: Dataset<DataBatch<Opt.Model.Input, Labels>>,
    using optimizer: inout Opt,
    lossFunc: @escaping @differentiable (Opt.Model.Output, @nondiff Labels) -> Tensor<Opt.Scalar>
) where Opt.Model.Input: TensorGroup,
        Opt.Model.CotangentVector == Opt.Model.AllDifferentiableVariables,
        Opt.Scalar: TensorFlowFloatingPoint
{
    let context = Context(learningPhase: .training)
    for batch in dataset {
        let (loss, 𝛁model) = model.valueWithGradient { model -> Tensor<Opt.Scalar> in
            let pred = model.applied(to: batch.xb, in: context)
            return lossFunc(pred, batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: 𝛁model)
    }
}
var optimizer = SGD<MyModel, Float>(learningRate: lr)
train(&model, on: train_ds, using: &optimizer, lossFunc: softmaxCrossEntropy)
error: Execution was interrupted, reason: signal SIGSEGV: address access protected (fault address: 0x2b10cd0). The process has been left at the point where it was interrupted, use "thread return -x" to return to the state before expression evaluation.
notebookToScript(fname: (Path.cwd / "03_minibatch_training.ipynb").string)