%install '.package(path: "$cwd/FastaiNotebook_06_cuda")' FastaiNotebook_06_cuda
Installing packages: .package(path: "/home/ubuntu/fastai_docs/dev_swift/FastaiNotebook_06_cuda") FastaiNotebook_06_cuda With SwiftPM flags: [] Working in: /tmp/tmpgocy_x9t/swift-install Fetching https://github.com/mxcl/Path.swift Fetching https://github.com/JustHTTP/Just Completed resolution in 3.43s Cloning https://github.com/JustHTTP/Just Resolving https://github.com/JustHTTP/Just at 0.7.1 Cloning https://github.com/mxcl/Path.swift Resolving https://github.com/mxcl/Path.swift at 0.16.2 Compile Swift Module 'Path' (9 sources) Compile Swift Module 'Just' (1 sources) Compile Swift Module 'FastaiNotebook_06_cuda' (11 sources) Compile Swift Module 'jupyterInstalledPackages' (1 sources) Linking ./.build/x86_64-unknown-linux/debug/libjupyterInstalledPackages.so Initializing Swift... Installation complete!
import FastaiNotebook_06_cuda
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")
('inline', 'module://ipykernel.pylab.backend_inline')
// export
import Path
import TensorFlow
import Python
// Python interop: matplotlib, used by the notebook's inline display helpers.
let plt = Python.import("matplotlib.pyplot")
// Load MNIST as 2-D images (flat: false) with batch size 512.
let data = mnistDataBunch(flat: false, bs: 512)
// Plain SGD at learning rate 0.4 for the baseline CNN (no norm layers yet).
let opt = SimpleSGD<CnnModel, Float>(learningRate: 0.4)
// Fresh-model factory so the Learner can (re)initialize the model.
func modelInit() -> CnnModel { return CnnModel(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32]) }
let learner = Learner(data: data, lossFunction: softmaxCrossEntropy, optimizer: opt, initializingWith: modelInit)
let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
// Normalize with dataset statistics and add the channel axis the convs expect.
learner.addDelegates([learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
learner.makeAddChannel()])
// Baseline: one epoch without batchnorm, timed for comparison with later runs.
time { try! learner.fit(1) }
Epoch 0: [0.6109115, 0.7906] 7393.339826 ms
Let's start by building our own BatchNorm layer from scratch. Eventually we intend for this code to do the trick:
/// A straightforward BatchNorm sketch: batch statistics over the (batch, height,
/// width) axes during training, accumulated running statistics at inference.
/// Not usable as a `Layer` yet — `applied` is mutating, and the phase switch is
/// not differentiable (note the lack of `@differentiable`).
struct AlmostBatchNorm<Scalar: TensorFlowFloatingPoint>: Differentiable {
    // Configuration hyperparameters
    @noDerivative let momentum: Scalar
    @noDerivative let epsilon: Scalar
    // Running statistics (updated only while training)
    @noDerivative var runningMean: Tensor<Scalar>
    @noDerivative var runningVariance: Tensor<Scalar>
    // Trainable parameters
    var scale: Tensor<Scalar>
    var offset: Tensor<Scalar>

    init(featureCount: Int, momentum: Scalar = 0.9, epsilon: Scalar = 1e-5) {
        self.momentum = momentum
        self.epsilon = epsilon
        scale = Tensor(ones: [featureCount])
        offset = Tensor(zeros: [featureCount])
        runningMean = Tensor(0)
        runningVariance = Tensor(1)
    }

    /// Normalizes `input` with the phase-appropriate statistics and applies the
    /// learnable affine transform (`scale`, `offset`).
    mutating func applied(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let currentMean: Tensor<Scalar>
        let currentVariance: Tensor<Scalar>
        switch Context.local.learningPhase {
        case .training:
            currentMean = input.mean(alongAxes: [0, 1, 2])
            currentVariance = input.variance(alongAxes: [0, 1, 2])
            // Linear interpolation toward the batch statistics.
            runningMean += (currentMean - runningMean) * (1 - momentum)
            runningVariance += (currentVariance - runningVariance) * (1 - momentum)
        case .inference:
            currentMean = runningMean
            currentVariance = runningVariance
        }
        let invStd = rsqrt(currentVariance + epsilon) * scale
        return (input - currentMean) * invStd + offset
    }
}
But there are some automatic differentiation limitations (control flow support) and Layer protocol constraints (mutating applied) that make this impossible for now (note the lack of @differentiable or a Layer conformance), so we'll need a few workarounds. A Reference will let us update running statistics without declaring the applied method mutating:
//export
/// A mutable box with reference semantics: lets value-semantic layers share and
/// update state (e.g. running statistics) from non-mutating methods.
class Reference<T> {
    var value: T
    init(_ initialValue: T) {
        value = initialValue
    }
}
The following snippet will let us differentiate a layer's applied method if it's composed of training and inference implementations that are each differentiable:
//export
/// A `Layer` whose forward pass depends on `Context.local.learningPhase`:
/// conforming types supply separate differentiable training and inference
/// implementations, and the extension below stitches them together with a
/// manually registered derivative.
protocol LearningPhaseDependent: Layer {
// Notified of each activation produced by `applied` (hook mechanism).
var delegate: LayerDelegate<Output> { get set }
@differentiable func forwardTraining(to input: Input) -> Output
@differentiable func forwardInference(to input: Input) -> Output
}
/// Dispatches on the current learning phase and registers a manual derivative
/// for `applied`, since control flow is not yet differentiable by the compiler.
extension LearningPhaseDependent {
// Phase switch: not differentiable on its own, hence the custom VJP below.
func forward(_ input: Input) -> Output {
switch Context.local.learningPhase {
case .training: return forwardTraining(to: input)
case .inference: return forwardInference(to: input)
}
}
// Manual derivative of `applied`: returns the pullback of whichever
// phase-specific implementation is active, mirroring the switch in `forward`.
@differentiating(applied)
func gradForward(_ input: Input) ->
(value: Output, pullback: (Output.CotangentVector) ->
(Self.CotangentVector, Input.CotangentVector)) {
switch Context.local.learningPhase {
case .training:
return valueWithPullback(at: input) { $0.forwardTraining(to: $1) }
case .inference:
return valueWithPullback(at: input) { $0.forwardInference(to: $1) }
}
}
// Runs the phase-dependent forward pass, then notifies the delegate hook.
@differentiable
public func applied(to input: Input) -> Output {
let activation = forward(input)
delegate.didProduceActivation(activation)
return activation
}
}
Now we can implement a BatchNorm that we can use in our models:
//export
/// A normalization layer over `Tensor<Scalar>` constructible from a feature
/// count and an epsilon, so conv blocks can be written generically over the
/// kind of normalization used.
protocol Norm: Layer where Input == Tensor<Scalar>, Output == Tensor<Scalar>{
associatedtype Scalar
init(featureCount: Int, epsilon: Scalar)
}
/// BatchNorm usable in models today: the phase switch lives in
/// `LearningPhaseDependent`, and the running statistics are boxed in
/// `Reference` cells so the non-mutating forward methods can update them.
public struct FABatchNorm<Scalar: TensorFlowFloatingPoint>: LearningPhaseDependent, Norm {
    // Configuration hyperparameters
    @noDerivative var momentum: Scalar
    @noDerivative var epsilon: Scalar
    // Running statistics (mutable through the reference boxes)
    @noDerivative let runningMean: Reference<Tensor<Scalar>>
    @noDerivative let runningVariance: Reference<Tensor<Scalar>>
    @noDerivative public var delegate: LayerDelegate<Output> = LayerDelegate()
    // Trainable parameters
    public var scale: Tensor<Scalar>
    public var offset: Tensor<Scalar>
    // TODO: check why these aren't being synthesized
    public typealias Input = Tensor<Scalar>
    public typealias Output = Tensor<Scalar>

    public init(featureCount: Int, momentum: Scalar, epsilon: Scalar = 1e-5) {
        self.momentum = momentum
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
        self.runningMean = Reference(Tensor(0))
        self.runningVariance = Reference(Tensor(1))
    }

    /// Convenience initializer with the default momentum of 0.9.
    public init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.init(featureCount: featureCount, momentum: 0.9, epsilon: epsilon)
    }

    /// Training path: normalize with batch statistics over (batch, H, W) and
    /// fold them into the running statistics by linear interpolation.
    @differentiable
    public func forwardTraining(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let batchMean = input.mean(alongAxes: [0, 1, 2])
        let batchVariance = input.variance(alongAxes: [0, 1, 2])
        runningMean.value += (batchMean - runningMean.value) * (1 - momentum)
        runningVariance.value += (batchVariance - runningVariance.value) * (1 - momentum)
        let invStd = rsqrt(batchVariance + epsilon) * scale
        return (input - batchMean) * invStd + offset
    }

    /// Inference path: normalize with the accumulated running statistics.
    @differentiable
    public func forwardInference(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let invStd = rsqrt(runningVariance.value + epsilon) * scale
        return (input - runningMean.value) * invStd + offset
    }
}
//export
/// Bias-free convolution (relu baked into the conv) followed by batchnorm.
public struct ConvBN<Scalar: TensorFlowFloatingPoint>: FALayer {
    public var conv: FANoBiasConv2D<Scalar>
    public var norm: FABatchNorm<Scalar>
    @noDerivative public var delegate: LayerDelegate<Output> = LayerDelegate()

    public init(_ cIn: Int, _ cOut: Int, ks: Int = 3, stride: Int = 2){
        conv = FANoBiasConv2D(filterShape: (ks, ks, cIn, cOut),
                              strides: (stride, stride),
                              padding: .same,
                              activation: relu)
        norm = FABatchNorm(featureCount: cOut, epsilon: 1e-5)
    }

    /// conv -> batchnorm.
    @differentiable
    public func forward(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        return norm.applied(to: conv.applied(to: input))
    }

    @differentiable
    public func applied(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let output = forward(input)
        delegate.didProduceActivation(output)
        return output
    }
}
// Would be great if this generic could work
// struct ConvNorm<NormType: Norm, Scalar: TensorFlowFloatingPoint>: Layer
// where NormType.Scalar == Scalar {
// var conv: Conv2D<Scalar>
// var norm: NormType
// init(
// filterShape: (Int, Int, Int, Int),
// strides: (Int, Int) = (1, 1),
// padding: Padding = .valid,
// activation: @escaping Conv2D<Scalar>.Activation = identity
// ) {
// // TODO (when control flow AD works): use Conv2D without bias
// self.conv = Conv2D(
// filterShape: filterShape,
// strides: strides,
// padding: padding,
// activation: activation)
// self.norm = NormType.init(featureCount: filterShape.3, epsilon: 1e-5)
// }
// @differentiable
// func applied(to input: Tensor<Scalar>) -> Tensor<Scalar> {
// return norm.applied(to: conv.applied(to: input))
// }
// }
//typealias ConvBN = ConvNorm<BatchNorm<Float>, Float>
//export
/// CNN backbone: a stack of `ConvBN` blocks, global average pooling, flatten,
/// and a final dense classifier.
public struct CnnModelBN: Layer {
    public var convs: [ConvBN<Float>]
    public var pool = FAAdaptiveAvgPool2D<Float>()
    public var flatten = Flatten<Float>()
    public var linear: FADense<Float>

    /// - Parameters:
    ///   - channelIn: number of input channels (1 for MNIST).
    ///   - nOut: number of output classes.
    ///   - filters: output channel count of each conv block; must be non-empty.
    public init(channelIn: Int, nOut: Int, filters: [Int]){
        // Fail loudly instead of crashing on `filters.last!` below.
        precondition(!filters.isEmpty, "CnnModelBN needs at least one filter size")
        let allFilters = [channelIn] + filters
        // Pair consecutive channel counts (allFilters[i], allFilters[i+1])
        // instead of the original index loop.
        convs = zip(allFilters, allFilters.dropFirst()).map { ConvBN($0, $1) }
        linear = FADense<Float>(inputSize: filters.last!, outputSize: nOut)
    }

    @differentiable
    public func applied(to input: TF) -> TF {
        return input.sequenced(through: convs, pool, flatten, linear)
    }
}
// Same training setup as the baseline above, but with the batchnorm model.
let opt = SimpleSGD<CnnModelBN, Float>(learningRate: 0.4)
func modelInit() -> CnnModelBN { return CnnModelBN(channelIn: 1, nOut: 10, filters: [8, 16, 32, 32]) }
let learner = Learner(data: data, lossFunction: softmaxCrossEntropy, optimizer: opt, initializingWith: modelInit)
let recorder = learner.makeDefaultDelegates(metrics: [accuracy])
learner.addDelegates([learner.makeNormalize(mean: mnistStats.mean, std: mnistStats.std),
learner.makeAddChannel()])
// One epoch with batchnorm; compare accuracy and time against the baseline.
time { try! learner.fit(1) }
Epoch 0: [0.14512135, 0.9556] 12103.279594 ms
TODO: hooks/LayerDelegates
From the paper: "batch normalization cannot be applied to online learning tasks or to extremely large distributed models where the minibatches have to be small".
General equation for a norm layer with learnable affine:
$$y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} \cdot \gamma + \beta$$

The difference between the various norm layers is which axes the statistics $\mathrm{E}[x]$ and $\mathrm{Var}[x]$ are computed over, and whether running statistics are kept. LayerNorm computes them per sample over all the remaining axes and keeps no running statistics.
/// Layer normalization for NHWC activations: statistics are computed per
/// sample over the spatial and channel axes [1, 2, 3]; unlike batchnorm,
/// no running statistics are kept, so training and inference are identical.
struct LayerNorm2D<Scalar: TensorFlowFloatingPoint>: Norm {
    // Configuration hyperparameters
    @noDerivative let epsilon: Scalar
    // Trainable parameters
    var scale: Tensor<Scalar>
    var offset: Tensor<Scalar>

    init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.epsilon = epsilon
        scale = Tensor(ones: [featureCount])
        offset = Tensor(zeros: [featureCount])
    }

    @differentiable
    func applied(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let sampleMean = input.mean(alongAxes: [1, 2, 3])
        let sampleVariance = input.variance(alongAxes: [1, 2, 3])
        let invStd = rsqrt(sampleVariance + epsilon) * scale
        return (input - sampleMean) * invStd + offset
    }
}
/// Bias-free convolution followed by 2-D layer normalization.
struct ConvLN<Scalar: TensorFlowFloatingPoint>: FALayer {
    var conv: FANoBiasConv2D<Scalar>
    var norm: LayerNorm2D<Scalar>
    @noDerivative public var delegate: LayerDelegate<Output> = LayerDelegate()

    init(_ cIn: Int, _ cOut: Int, ks: Int = 3, stride: Int = 2){
        conv = FANoBiasConv2D(filterShape: (ks, ks, cIn, cOut),
                              strides: (stride, stride),
                              padding: .same,
                              activation: relu)
        norm = LayerNorm2D(featureCount: cOut, epsilon: 1e-5)
    }

    /// conv -> layernorm.
    @differentiable
    func forward(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        return norm.applied(to: conv.applied(to: input))
    }

    @differentiable
    public func applied(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let output = forward(input)
        delegate.didProduceActivation(output)
        return output
    }
}
/// CNN backbone using `ConvLN` (layer-norm) blocks instead of batchnorm.
public struct CnnModelLN: Layer {
    public var convs: [ConvLN<Float>]
    public var pool = FAAdaptiveAvgPool2D<Float>()
    public var flatten = Flatten<Float>()
    public var linear: FADense<Float>

    /// - Parameter filters: output channels per conv block; must be non-empty.
    public init(channelIn: Int, nOut: Int, filters: [Int]){
        // Fail loudly instead of crashing on `filters.last!` below.
        precondition(!filters.isEmpty, "CnnModelLN needs at least one filter size")
        let allFilters = [channelIn] + filters
        // Pair consecutive channel counts (allFilters[i], allFilters[i+1])
        // instead of the original index loop.
        convs = zip(allFilters, allFilters.dropFirst()).map { ConvLN($0, $1) }
        linear = FADense<Float>(inputSize: filters.last!, outputSize: nOut)
    }

    @differentiable
    public func applied(to input: TF) -> TF {
        return input.sequenced(through: convs, pool, flatten, linear)
    }
}
/// Instance normalization: per-sample, per-channel statistics.
/// FIX: activations in this file are NHWC (BatchNorm reduces over [0, 1, 2],
/// LayerNorm over [1, 2, 3], and `scale`/`offset` broadcast over the trailing
/// channel axis), so the spatial axes are [1, 2]. The original reduced over
/// [2, 3], the NCHW convention, which wrongly mixed width with channels.
struct InstanceNorm<Scalar: TensorFlowFloatingPoint>: Norm {
    // Configuration hyperparameters
    @noDerivative let epsilon: Scalar
    // Trainable parameters
    var scale: Tensor<Scalar>
    var offset: Tensor<Scalar>

    init(featureCount: Int, epsilon: Scalar = 1e-5) {
        self.epsilon = epsilon
        self.scale = Tensor(ones: [featureCount])
        self.offset = Tensor(zeros: [featureCount])
    }

    @differentiable
    func applied(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        // Reduce over the spatial axes only, keeping batch and channel.
        let mean = input.mean(alongAxes: [1, 2])
        let variance = input.variance(alongAxes: [1, 2])
        let normalizer = rsqrt(variance + epsilon) * scale
        return (input - mean) * normalizer + offset
    }
}
/// Bias-free convolution followed by instance normalization.
struct ConvIN<Scalar: TensorFlowFloatingPoint>: FALayer {
    var conv: FANoBiasConv2D<Scalar>
    var norm: InstanceNorm<Scalar>
    @noDerivative public var delegate: LayerDelegate<Output> = LayerDelegate()

    init(_ cIn: Int, _ cOut: Int, ks: Int = 3, stride: Int = 2){
        conv = FANoBiasConv2D(filterShape: (ks, ks, cIn, cOut),
                              strides: (stride, stride),
                              padding: .same,
                              activation: relu)
        norm = InstanceNorm(featureCount: cOut, epsilon: 1e-5)
    }

    /// conv -> instance norm.
    @differentiable
    func forward(_ input: Tensor<Scalar>) -> Tensor<Scalar> {
        return norm.applied(to: conv.applied(to: input))
    }

    @differentiable
    public func applied(to input: Tensor<Scalar>) -> Tensor<Scalar> {
        let output = forward(input)
        delegate.didProduceActivation(output)
        return output
    }
}
Lost in all those norms? The authors of the Group Norm paper have you covered:

TODO/skipping GroupNorm
/// fastai-style "Running BatchNorm": instead of lerping mean/variance directly,
/// keep running sums, sums of squares, and an element count, and normalize
/// every batch (training and inference) with statistics derived from those
/// accumulators (mean = sum/count, var = E[x^2] - E[x]^2).
/// NOTE(review): unlike FABatchNorm, this type declares no `delegate`, even
/// though `LearningPhaseDependent` requires one — confirm it still conforms.
struct RunningBatchNorm<Scalar: TensorFlowFloatingPoint>: LearningPhaseDependent, Norm {
// Configuration hyperparameters
@noDerivative let momentum: Scalar
@noDerivative let epsilon: Scalar
// Running statistics, boxed in References so the non-mutating forward
// methods can update them during training.
@noDerivative let runningSum: Reference<Tensor<Scalar>>
@noDerivative let runningSumOfSquares: Reference<Tensor<Scalar>>
@noDerivative let runningCount: Reference<Scalar>
// Total samples processed; written below but not used in the math.
@noDerivative let samplesSeen: Reference<Int>
// Trainable parameters
var scale: Tensor<Scalar>
var offset: Tensor<Scalar>
// TODO: check why these aren't being synthesized
typealias Input = Tensor<Scalar>
typealias Output = Tensor<Scalar>
init(featureCount: Int, momentum: Scalar, epsilon: Scalar = 1e-5) {
self.momentum = momentum
self.epsilon = epsilon
self.scale = Tensor(ones: [featureCount])
self.offset = Tensor(zeros: [featureCount])
self.runningSum = Reference(Tensor(0))
self.runningSumOfSquares = Reference(Tensor(0))
self.runningCount = Reference(Scalar(0))
self.samplesSeen = Reference(0)
}
// Convenience initializer with the default momentum of 0.9.
init(featureCount: Int, epsilon: Scalar = 1e-5) {
self.init(featureCount: featureCount, momentum: 0.9, epsilon: epsilon)
}
@differentiable
func forwardTraining(to input: Tensor<Scalar>) -> Tensor<Scalar> {
// Input is NHWC: batch size from axis 0, channel count from axis 3.
let (batch, channels) = (input.shape[0], Scalar(input.shape[3]))
// Per-channel sum and sum of squares over the batch and spatial axes.
let sum = input.sum(alongAxes: [0, 1, 2])
let sumOfSquares = (input * input).sum(alongAxes: [0, 1, 2])
// Elements contributing to each channel's statistics (batch * H * W).
let count = Scalar(input.scalarCount).withoutDerivative() / channels
// Batch-size-adjusted momentum: larger batches move the accumulators more.
// NOTE(review): divides by sqrt(batch - 1), so batch == 1 yields a division
// by zero — confirm a minimum batch size of 2 is guaranteed upstream.
let mom = momentum / sqrt(Scalar(batch) - 1)
// Exponential-moving-average update of the three accumulators.
let runningSum = mom * self.runningSum.value + (1 - mom) * sum
let runningSumOfSquares = mom * self.runningSumOfSquares.value + (
1 - mom) * sumOfSquares
let runningCount = mom * self.runningCount.value + (1 - mom) * count
self.runningSum.value = runningSum
self.runningSumOfSquares.value = runningSumOfSquares
self.runningCount.value = runningCount
self.samplesSeen.value += batch
// Normalize with statistics derived from the updated accumulators.
let mean = runningSum / runningCount
let variance = runningSumOfSquares / runningCount - mean * mean
let normalizer = rsqrt(variance + epsilon) * scale
return (input - mean) * normalizer + offset
}
@differentiable
func forwardInference(to input: Tensor<Scalar>) -> Tensor<Scalar> {
// Same derivation as training, but from the frozen accumulators.
// NOTE(review): runningCount starts at 0, so inference before any training
// step divides by zero — confirm this cannot happen in practice.
let mean = runningSum.value / runningCount.value
let variance = runningSumOfSquares.value / runningCount.value - mean * mean
let normalizer = rsqrt(variance + epsilon) * scale
return (input - mean) * normalizer + offset
}
}
TODO: XLA compilation + test RBN
notebookToScript(fname: (Path.cwd / "07_batchnorm.ipynb").string)