%install '.package(path: "$cwd/FastaiNotebook_02a_why_sqrt5")' FastaiNotebook_02a_why_sqrt5
Installing packages:
	.package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_02a_why_sqrt5")
		FastaiNotebook_02a_why_sqrt5
With SwiftPM flags: []
Working in: /tmp/tmph6cxueln
Fetching https://github.com/mxcl/Path.swift
Fetching https://github.com/JustHTTP/Just
Completed resolution in 2.07s
Cloning https://github.com/JustHTTP/Just
Resolving https://github.com/JustHTTP/Just at 0.7.1
Cloning https://github.com/mxcl/Path.swift
Resolving https://github.com/mxcl/Path.swift at 0.16.2
Compile Swift Module 'Just' (1 sources)
Compile Swift Module 'Path' (9 sources)
Compile Swift Module 'FastaiNotebook_02a_why_sqrt5' (5 sources)
Compile Swift Module 'jupyterInstalledPackages' (1 sources)
Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so
Initializing Swift...
Loading library...
Installation complete!
import FastaiNotebook_02a_why_sqrt5
// export
import Path
import TensorFlow
var (xTrain,yTrain,xValid,yValid) = loadMNIST(path: Path.home/".fastai"/"data"/"mnist_tst", flat: true)
let trainMean = xTrain.mean()
let trainStd = xTrain.standardDeviation()
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)
let (n,m) = (Int(xTrain.shape[0]),Int(xTrain.shape[1]))
let c = yTrain.max()+1
print(n,m,c)
60000 784 10
Those can't be used to define a model because they aren't Ints, though...
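If we wanted to avoid hard-coding them, a sketch like the following (hypothetical names, and assuming scalarized() is available in this version of the API) converts the tensor statistics back into plain Ints:

// Hypothetical alternative to the hard-coded constants below:
// pull the values back out of the tensors as Ints.
let nInt = Int(xTrain.shape[0])
let mInt = Int(xTrain.shape[1])
let cInt = Int(yTrain.max().scalarized()) + 1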
let (n,m) = (60000,784)
let c = 10
let nHid = 50
public struct MyModel: Layer {
    public var layer1: FADense<Float>
    public var layer2: FADense<Float>
    
    public init(nIn: Int, nHid: Int, nOut: Int) {
        layer1 = FADense(inputSize: nIn, outputSize: nHid, activation: relu)
        layer2 = FADense(inputSize: nHid, outputSize: nOut)
    }
    
    @differentiable
    public func applied(to input: Tensor<Float>, in context: Context) -> Tensor<Float> {
        return input.sequenced(in: context, through: layer1, layer2)
    }
}
var model = MyModel(nIn: m, nHid: nHid, nOut: c)
let pred = model.applied(to: xTrain)
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {
    let exped = exp(activations)
    return log(exped / exped.sum(alongAxes: -1))
}
let smPred = logSoftmax(pred)
yTrain[0..<3]
[5, 0, 4]
(smPred[0][5],smPred[1][0],smPred[2][4])
▿ 3 elements
  - .0 : -2.0715566
  - .1 : -2.028722
  - .2 : -2.1230843
There is no fancy indexing yet, so we have to use gatherNd to pull the values we want out of our log-softmaxed predictions.
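As a tiny illustration with made-up numbers (not part of the notebook), gatherNd applied to a 2-D tensor with (row, column) index pairs picks out one element per pair:

// Hypothetical mini-example: pick params[0, 2] and params[1, 0].
let tinyParams: Tensor<Float> = [[10, 11, 12], [20, 21, 22]]
let tinyIndices: Tensor<Int32> = [[0, 2], [1, 0]]
print(Raw.gatherNd(params: tinyParams, indices: tinyIndices))  // [12.0, 20.0]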
func nll<Scalar>(_ input: Tensor<Scalar>, _ target: Tensor<Int32>) -> Tensor<Scalar>
    where Scalar: TensorFlowFloatingPoint {
    // Build (row, class) index pairs; the range is hard-coded to the
    // 60,000 training examples, so this only works on the full training set.
    let idx: Tensor<Int32> = Raw.range(start: Tensor(0), limit: Tensor(60000), delta: Tensor(1))
    let indices = Raw.concat(concatDim: Tensor(1), [idx.expandingShape(at: 1), target.expandingShape(at: 1)])
    let losses = Raw.gatherNd(params: input, indices: indices)
    return -losses.mean()
}
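Since the range above is hard-coded to the 60,000 training rows, a batch-size-agnostic variant might look like this sketch (hypothetical name nllAnyBatch, assuming input.shape[0] gives the number of rows):

func nllAnyBatch<Scalar>(_ input: Tensor<Scalar>, _ target: Tensor<Int32>) -> Tensor<Scalar>
    where Scalar: TensorFlowFloatingPoint {
    // Same as nll above, but the row range follows the actual batch size.
    let rows = Int32(input.shape[0])
    let idx: Tensor<Int32> = Raw.range(start: Tensor(0), limit: Tensor(rows), delta: Tensor(1))
    let indices = Raw.concat(concatDim: Tensor(1), [idx.expandingShape(at: 1), target.expandingShape(at: 1)])
    return -Raw.gatherNd(params: input, indices: indices).mean()
}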
nll(smPred, yTrain)
2.6719995
time(repeating: 100){ let _ = nll(smPred, yTrain) }
1.0019715 ms
Simplify logSoftmax using the rules of logarithms.
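Spelled out (with $x$ denoting a row of activations, notation mine), the identity the simplified version below relies on is:

$$\log\big(\mathrm{softmax}(x)_i\big) \;=\; \log\frac{e^{x_i}}{\sum_j e^{x_j}} \;=\; x_i - \log\sum_j e^{x_j}$$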
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {
    return activations - log(exp(activations).sum(alongAxes: -1))
}
let smPred = logSoftmax(pred)
nll(smPred, yTrain)
2.6719995
Use the LogSumExp trick for numerical stability.
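Concretely, with $a = \max_j x_j$ (notation mine), the identity the logSumExp function below implements is:

$$\log\sum_{j} e^{x_j} \;=\; \log\Big(e^{a}\sum_{j} e^{x_j-a}\Big) \;=\; a + \log\sum_{j} e^{x_j-a}$$

Subtracting the maximum before exponentiating keeps the exponentials from overflowing.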
smPred.max(alongAxes: -1).shape
▿ TensorShape
▿ dimensions : 2 elements
- 0 : 60000
- 1 : 1
func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {
    let m = x.max(alongAxes: -1)
    return m + log(exp(x - m).sum(alongAxes: -1))
}

func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar: TensorFlowFloatingPoint {
    return activations - logSumExp(activations)
}
let smPred = logSoftmax(pred)
nll(smPred, yTrain)
2.6719997
In S4TF, the nll loss is combined with log softmax in softmaxCrossEntropy:
let loss = softmaxCrossEntropy(logits: pred, labels: yTrain)
loss
2.6719995
time(repeating: 100){ let _ = nll(logSoftmax(pred), yTrain)}
1.33812186 ms
time(repeating: 100){ let _ = softmaxCrossEntropy(logits: pred, labels: yTrain)}
1.0861205000000003 ms
Basically the training loop repeats the following steps: grab a mini-batch of inputs, run the model on it to get predictions, compare those predictions to the labels to compute a loss, compute the gradients of the loss with respect to every model parameter, and update the parameters with those gradients.
// export
public func accuracy(_ output: Tensor<Float>, _ target: Tensor<Int32>) -> Tensor<Float> {
    let corrects = Tensor<Float>(output.argmax(squeezingAxis: 1) .== target)
    return corrects.mean()
}
print(accuracy(pred, yTrain))
0.14516667
let bs: Int32 = 64                  // batch size
let xb = xTrain[0..<bs]             // a mini-batch from x
let preds = model.applied(to: xb)   // predictions
print(preds[0], preds.shape)
[ -1.7508852, -0.42652333, 0.015924871, 1.145769, -2.150978, 0.233792, 0.027202263, 0.7222841, 0.66757137, 2.54584] TensorShape(dimensions: [64, 10])
let yb = yTrain[0..<bs]
let loss = softmaxCrossEntropy(logits: preds, labels: yb)
print(accuracy(preds, yb))
0.125
let lr: Float = 0.5  // learning rate
let epochs = 1 // how many epochs to train for
To get all the gradients we need a training context.
let trainingContext = Context(learningPhase: .training)
Then we can compute the loss and the gradients:
let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
    let preds = model.applied(to: xb, in: trainingContext)
    return softmaxCrossEntropy(logits: preds, labels: yb)
}
Now let's write the training loop by hand:
for epoch in 1...epochs {
    for i in 0..<((n-1)/Int(bs)) {
        let startIdx = Int32(i) * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        model.layer1.weight -= lr * grads.layer1.weight
        model.layer1.bias   -= lr * grads.layer1.bias
        model.layer2.weight -= lr * grads.layer2.weight
        model.layer2.bias   -= lr * grads.layer2.bias
    }
}
let preds = model.applied(to: xValid)
accuracy(preds, yValid)
0.9305
93% in one epoch, not too bad!
Naming every parameter by hand is tedious. We can use the model's AllDifferentiableVariables to access them all through key paths.
for epoch in 1...epochs {
    for i in 0..<((n-1)/Int(bs)) {
        let startIdx = Int32(i) * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        // allDifferentiableVariables is a value, so we mutate a local copy
        // through its key paths and write it back to the model.
        var parameters = model.allDifferentiableVariables
        for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
            parameters[keyPath: kp] -= lr * grads[keyPath: kp]
        }
        model.allDifferentiableVariables = parameters
    }
}
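To see exactly which tensors that key-path loop visits, a quick sketch (using the same recursivelyAllWritableKeyPaths API as above) prints their shapes:

// Expect the four parameter tensors: [784, 50], [50], [50, 10], [10].
for kp in model.allDifferentiableVariables.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
    print(model.allDifferentiableVariables[keyPath: kp].shape)
}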
Then we can use an S4TF optimizer to do the update step for us.
let optimizer = SGD<MyModel, Float>(learningRate: lr)

for epoch in 1...epochs {
    for i in 0..<((n-1)/Int(bs)) {
        let startIdx = Int32(i) * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}
We can create a Swift Dataset from our tensors. It will automatically batch things for us.
// export
public struct DataBatch<Inputs: Differentiable & TensorGroup, Labels: TensorGroup>: TensorGroup {
    public var xb: Inputs
    public var yb: Labels
    
    public init(xb: Inputs, yb: Labels) {
        self.xb = xb
        self.yb = yb
    }
}
let train_ds: Dataset<DataBatch<Tensor<Float>, Tensor<Int32>>> = Dataset(elements: DataBatch(xb: xTrain, yb: yTrain)).batched(Int64(bs))
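As a quick sanity check (a sketch, assuming the Dataset above built as expected), we can peek at the first batch's shapes:

// Expect [64, 784] for the inputs and [64] for the labels.
for batch in train_ds {
    print(batch.xb.shape, batch.yb.shape)
    break
}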
for epoch in 1...epochs {
    for batch in train_ds {
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: batch.xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}
This Dataset can also do the shuffling for us:
for epoch in 1...epochs {
    for batch in train_ds.shuffled() {
        let (loss, grads) = model.valueWithGradient { model -> Tensor<Float> in
            let preds = model.applied(to: batch.xb, in: trainingContext)
            return softmaxCrossEntropy(logits: preds, labels: batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: grads)
    }
}
Finally, let's wrap all of this in a generic train function.

public func train<Opt: Optimizer, Labels: TensorGroup>(
    _ model: inout Opt.Model,
    on dataset: Dataset<DataBatch<Opt.Model.Input, Labels>>,
    using optimizer: inout Opt,
    lossFunc: @escaping @differentiable (Opt.Model.Output, @nondiff Labels) -> Tensor<Opt.Scalar>
) where Opt.Model.Input: TensorGroup,
        Opt.Model.CotangentVector == Opt.Model.AllDifferentiableVariables,
        Opt.Scalar: TensorFlowFloatingPoint
{
    let context = Context(learningPhase: .training)
    for batch in dataset {
        let (loss, 𝛁model) = model.valueWithGradient { model -> Tensor<Opt.Scalar> in
            let pred = model.applied(to: batch.xb, in: context)
            return lossFunc(pred, batch.yb)
        }
        optimizer.update(&model.allDifferentiableVariables, along: 𝛁model)
    }
}
var optimizer = SGD<MyModel, Float>(learningRate: lr)
train(&model, on: train_ds, using: &optimizer, lossFunc: softmaxCrossEntropy)
error: Execution was interrupted, reason: signal SIGSEGV: address access protected (fault address: 0x2b10cd0). The process has been left at the point where it was interrupted, use "thread return -x" to return to the state before expression evaluation.
notebookToScript(fname: (Path.cwd / "03_minibatch_training.ipynb").string)