%install '.package(path: "$cwd/FastaiNotebook_01_matmul")' FastaiNotebook_01_matmul
Installing packages:
  .package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_01_matmul")
    FastaiNotebook_01_matmul
With SwiftPM flags: []
Working in: /tmp/tmp20_q4psv
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 1.41s
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'FastaiNotebook_01_matmul' (2 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Loading library...
Installation complete!
import FastaiNotebook_01_matmul
// export
import Path
import TensorFlow
// export
public func normalize(_ x:Tensor<Float>, mean:Tensor<Float>, std:Tensor<Float>) -> Tensor<Float> {
    return (x-mean)/std
}
var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)
Normalize the training and validation sets. Note that we use the training set statistics to normalize the validation set too, so that both sets go through exactly the same transformation.
let trainMean = xTrain.mean()
let trainStd = xTrain.standardDeviation()
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)
//export
public func testNearZero(_ a:Tensor<Float>, tolerance:Float=1e-3) {
    assert(abs(a)<tolerance, "Not near zero: \(a)")
}
testNearZero(xTrain.mean())
testNearZero(xTrain.standardDeviation() - 1.0)
let (n,m) = (xTrain.shape[0],xTrain.shape[1])
let c = yTrain.max()+1
print(n,m,c)
60000 784 10
//num hidden
let nh:Int32 = 50
// simplified kaiming init / he init
let w1:Tensor<Float> = Tensor(randomNormal: [m,nh]) / sqrt(Float(m))
let b1:Tensor<Float> = Tensor(repeating: 0.0, shape: [nh])
let w2:Tensor<Float> = Tensor(randomNormal: [nh,1]) / sqrt(Float(nh))
let b2:Tensor<Float> = Tensor(repeating: 0.0, shape: [1])
testNearZero(w1.mean())
testNearZero(w1.standardDeviation()-1/sqrt(Float(m)))
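Why divide by sqrt(m)? A quick back-of-the-envelope justification (spelled out here; it is the reasoning behind this simplified Kaiming/He init): for one output unit of the linear layer, with inputs of roughly zero mean and unit variance and independent weights $w_i \sim \mathcal{N}(0, \sigma^2)$,

$$\operatorname{Var}\Big(\sum_{i=1}^{m} x_i\,w_i\Big) \;=\; \sum_{i=1}^{m}\operatorname{Var}(x_i)\,\operatorname{Var}(w_i) \;=\; m\,\sigma^2,$$

so taking $\sigma = 1/\sqrt{m}$ (a standard normal divided by $\sqrt{m}$) keeps the pre-activations at roughly unit variance, which is what the cells just below verify on xValid.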
// This should be ~ (0,1) (mean,std)...
(xValid.mean(),xValid.standardDeviation())
▿ 2 elements
  - .0 : 0.006017743
  - .1 : [[1.0076997]]
func lin(_ x:Tensor<Float>, _ w:Tensor<Float>, _ b:Tensor<Float>) -> Tensor<Float> {return matmul(x, w) + b}
let t = lin(xValid, w1, b1)
//...so should this, because we used kaiming init, which is designed to do this
(t.mean(),t.standardDeviation())
▿ 2 elements
  - .0 : -0.02412962
  - .1 : [[1.0055103]]
func myRelu(_ x:Tensor<Float>) -> Tensor<Float> {return max(x, 0)}
let t = myRelu(lin(xValid, w1, b1))
//...actually it really should be this!
(t.mean(),t.standardDeviation())
▿ 2 elements
  - .0 : 0.38528448
  - .1 : [[0.5647878]]
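That drop in the standard deviation (from about 1 down to about 0.56) is the relu at work: it zeroes out the negative half of the activations. For a zero-mean, roughly symmetric pre-activation $y$ with variance $\sigma^2$ (a rough argument, spelled out here rather than in the original cell),

$$\mathbb{E}\big[\max(y, 0)^2\big] \;=\; \tfrac{1}{2}\,\mathbb{E}\big[y^2\big] \;=\; \tfrac{\sigma^2}{2},$$

which is why the Kaiming init for relu in the next cell scales the weights by $\sqrt{2/m}$ instead of $1/\sqrt{m}$, compensating for the halved second moment.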
// kaiming init / he init for relu
let w1:Tensor<Float> = Tensor(randomNormal: [m,nh]) * sqrt(2.0/Float(m))
(w1.mean(),w1.standardDeviation())
▿ 2 elements
  - .0 : 0.00026910927
  - .1 : [[0.050386585]]
let t = myRelu(lin(xValid, w1, b1))
(t.mean(),t.standardDeviation())
▿ 2 elements
  - .0 : 0.5735824
  - .1 : [[0.8429618]]
func model(_ xb: Tensor<Float>) -> Tensor<Float>{
    let l1 = lin(xb, w1, b1)
    let l2 = myRelu(l1)
    let l3 = lin(l2, w2, b2)
    return l3
}
time(repeating: 10) {let _ = model(xValid)}
0.9224796 ms
let preds = model(xTrain)
// export
public func mse(_ out:Tensor<Float>, _ targ:Tensor<Float>) -> Tensor<Float> {
    return (out.squeezingShape(at: -1) - targ).squared().mean()
}
var yTrainF = Tensor<Float>(yTrain)
var yValidF = Tensor<Float>(yValid)
mse(preds, yTrainF)
25.17043
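A note on the squeezingShape(at: -1) in mse (spelled out here for clarity): the model output has shape [n, 1] while the targets have shape [n], and without the squeeze the subtraction would broadcast into an [n, n] matrix instead of an elementwise difference. A quick way to check the shapes:

// The trailing unit axis comes from w2/b2 having a single output unit.
print(preds.shape)   // expected: [60000, 1]
print(yTrainF.shape) // expected: [60000]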
To store the gradients a bit like in PyTorch, we introduce a TensorWithGrad class that has two attributes: the original tensor and its gradient. We choose a class to easily replicate the Python notebook: classes are reference types, so every reference to an instance shares the same mutable storage, while structures are value types that are copied on assignment.
class TensorWithGrad {
    var inner: Tensor<Float>
    var grad: Tensor<Float>
    init(_ x: Tensor<Float>) {
        inner = x
        grad = Tensor(repeating: 0.0, shape:x.shape)
    }
}
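As a quick illustration of that reference semantics (a minimal sketch, not a cell from the original notebook): two names bound to the same TensorWithGrad see each other's mutations, which is exactly what the backward functions below rely on when they write into .grad.

// With a class, assignment copies the reference, not the data:
let a = TensorWithGrad(Tensor(repeating: 1.0, shape: [3]))
let b = a
b.grad = Tensor(repeating: 0.5, shape: [3])
print(a.grad) // same storage as b.grad, so this now holds 0.5s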
func lin(_ x:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad) -> TensorWithGrad {
    return TensorWithGrad(matmul(x.inner, w.inner) + b.inner)
}
func myRelu(_ x:TensorWithGrad) -> TensorWithGrad {return TensorWithGrad(max(x.inner, 0))}
func mse(_ inp: TensorWithGrad, _ targ : Tensor<Float>) -> Tensor<Float>{
    //loss between the output of the model and the target
    return (inp.inner.squeezingShape(at: -1) - targ).squared().mean()
}
func mseGrad(_ inp: TensorWithGrad, _ targ : Tensor<Float>){
    //grad of loss with respect to output of previous layer
    inp.grad = 2.0 * (inp.inner.squeezingShape(at: -1) - targ).expandingShape(at: -1) / Float(inp.inner.shape[0])
}
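As a check on mseGrad (the derivation is written out here for reference): with $\hat{y}_i$ the squeezed output and $y_i$ the target over a batch of size $n$,

$$\frac{\partial}{\partial \hat{y}_i}\;\frac{1}{n}\sum_{j=1}^{n}(\hat{y}_j - y_j)^2 \;=\; \frac{2\,(\hat{y}_i - y_i)}{n},$$

which is exactly the 2.0 * (...) / Float(inp.inner.shape[0]) above; the expandingShape(at: -1) only restores the [n, 1] shape of the layer output so the gradient lines up with inp.inner.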
func reluGrad(_ inp:TensorWithGrad, _ out:TensorWithGrad){
    //grad of relu with respect to input activations
    inp.grad = (inp.inner .> 0).selecting(out.grad, Tensor<Float>(repeating:0.0, shape:inp.inner.shape))
}
func linGrad(_ inp:TensorWithGrad, _ out:TensorWithGrad, _ w:TensorWithGrad, _ b:TensorWithGrad){
    //grad of matmul with respect to input activations, weights and bias
    inp.grad = matmul(out.grad, w.inner.transposed())
    w.grad = matmul(inp.inner.transposed(), out.grad)
    b.grad = out.grad.sum(squeezingAxes: 0)
}
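The three assignments in linGrad are the standard matrix-calculus identities for a linear layer $\text{out} = \text{inp}\,W + b$ (written out here for reference):

$$\frac{\partial L}{\partial\,\text{inp}} = \frac{\partial L}{\partial\,\text{out}}\,W^{\top},\qquad \frac{\partial L}{\partial W} = \text{inp}^{\top}\,\frac{\partial L}{\partial\,\text{out}},\qquad \frac{\partial L}{\partial b} = \sum_{\text{batch}}\frac{\partial L}{\partial\,\text{out}},$$

which map line by line onto the three assignments above.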
let w1a = TensorWithGrad(w1)
let b1a = TensorWithGrad(b1)
let w2a = TensorWithGrad(w2)
let b2a = TensorWithGrad(b2)
func forwardAndBackward(_ inp:TensorWithGrad, _ targ:Tensor<Float>){
    //forward pass:
    let l1 = lin(inp, w1a, b1a)
    let l2 = myRelu(l1)
    let out = lin(l2, w2a, b2a)
    //we don't actually need the loss in backward!
    let loss = mse(out, targ)
    //backward pass:
    mseGrad(out, targ)
    linGrad(l2, out, w2a, b2a)
    reluGrad(l1, l2)
    linGrad(inp, l1, w1a, b1a)
}
let inp = TensorWithGrad(xTrain)
forwardAndBackward(inp, yTrainF)
Now let's compare with Swift's autodiff. We have to mark the function as @differentiable.
@differentiable
func forward(_ inp:Tensor<Float>, _ targ:Tensor<Float>, w1:Tensor<Float>, b1:Tensor<Float>,
             w2:Tensor<Float>, b2:Tensor<Float>) -> Tensor<Float>{
    let l1 = matmul(inp, w1) + b1
    let l2 = relu(l1)
    let l3 = matmul(l2, w2) + b2
    return (l3.squeezingShape(at: -1) - targ).squared().mean()
}
Then we can ask for the gradients of anything like this:
let xGrad = gradient(at: xTrain) {xTrain in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let w1Grad = gradient(at: w1) {w1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let b1Grad = gradient(at: b1) {b1 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let w2Grad = gradient(at: w2) {w2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
let b2Grad = gradient(at: b2) {b2 in forward(xTrain, yTrainF, w1:w1, b1:b1, w2:w2, b2:b2)}
Note the big difference with PyTorch: in PyTorch, tensors with requires_grad=True remember how they were created, so that when you arrive at a final number and call the backward pass, they can compute their respective gradients.
In Swift for TensorFlow, Tensors don't store anything, so you have to specify the whole function you want differentiated when computing the gradients.
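For instance, on a plain scalar function (a minimal illustration of this point, not a cell from the original notebook), the closure you pass to gradient(at:) is the whole computation to differentiate:

// Nothing is recorded on the values themselves: gradient(at:) differentiates the closure as a whole.
let g = gradient(at: Float(3)) { x in x * x }
print(g) // 6.0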
testNearZero(xGrad - inp.grad)
testNearZero(w1Grad - w1a.grad)
testNearZero(b1Grad - b1a.grad)
testNearZero(w2Grad - w2a.grad)
testNearZero(b2Grad - b2a.grad)
time(repeating: 10) { forwardAndBackward(inp, yTrainF) }
23.150008500000002 ms
It's a bit inefficient to have to ask for the gradient of every parameter with a separate function call. The Swifty way of doing this is to group all our parameters in a structure (which will be our model later on). As long as all its stored properties conform to the Differentiable protocol, we can make the structure conform to Differentiable without having to implement anything, and it will just work.
public struct MyParams: Differentiable {
    public var x, w1, b1, w2, b2: Tensor<Float>
}
let allParams = MyParams(x: xTrain, w1: w1, b1: b1, w2: w2, b2: b2)
let grads = gradient(at: allParams) { allParams in
    forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)
}
testNearZero(xGrad - grads.x)
testNearZero(w1Grad - grads.w1)
testNearZero(b1Grad - grads.b1)
testNearZero(w2Grad - grads.w2)
testNearZero(b2Grad - grads.b2)
If you want the value of your loss as well as the gradients, you just have to use valueWithGradient.
let (loss,grads) = valueWithGradient(at: allParams) { allParams in
    forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)
}
testNearZero(xGrad - grads.x)
testNearZero(w1Grad - grads.w1)
testNearZero(b1Grad - grads.b1)
testNearZero(w2Grad - grads.w2)
testNearZero(b2Grad - grads.b2)
In terms of timing, our implementation gives:
time(repeating: 10) { forwardAndBackward(inp, yTrainF) }
23.5071069 ms
time(repeating: 10) {
    let _ = valueWithGradient(at: allParams) { allParams in
        forward(allParams.x, yTrainF, w1: allParams.w1, b1: allParams.b1, w2: allParams.w2, b2: allParams.b2)
    }
}
21.7817148 ms
notebookToScript(fname: (Path.cwd / "02_fully_connected.ipynb").string)