This notebook walks through how to build a sequential layer type, allowing you to chain an arbitrary number of layers of the same type together.
%install '.package(path: "$cwd/FastaiNotebook_00_load_data")' FastaiNotebook_00_load_data
import FastaiNotebook_00_load_data
// export
/// Define a new Differentiable data type that will serve as the AllDifferentiableVariables,
/// CotangentVector, and TangentVector types for our sequential layer type.
public struct DiffList<U: Differentiable & AdditiveArithmetic & Equatable & VectorNumeric>: KeyPathIterable {
public var u: [U] = []
}
// export
extension DiffList: Equatable {
public static func == (lhs: DiffList, rhs: DiffList) -> Bool {
if lhs.u.count != rhs.u.count { return false }
for i in 0..<lhs.u.count {
if lhs.u[i] != rhs.u[i] { return false }
}
return true
}
}
// export
extension DiffList: AdditiveArithmetic {
public static var zero: DiffList {
get { return DiffList() }
}
@differentiable(vjp: _vjpAdd(lhs:rhs:))
public static func + (lhs: DiffList, rhs: DiffList) -> DiffList {
precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,
"DiffList size mis-match: lhs: \(lhs.u.count), rhs: \(rhs.u.count)")
if lhs.u.count == 0 { return rhs }
if rhs.u.count == 0 { return lhs }
var output: [U] = []
for i in 0..<lhs.u.count { output.append(lhs.u[i] + rhs.u[i]) }
return DiffList(u: output)
}
public static func _vjpAdd(lhs: DiffList, rhs: DiffList) -> (DiffList, (DiffList) -> (DiffList, DiffList)) {
return (lhs + rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in
precondition(v.u.count == lhsCount || lhsCount == 0,
"DiffList gradient size mis-match: v: \(v.u.count), lhs: \(lhsCount)")
precondition(v.u.count == rhsCount || rhsCount == 0,
"DiffList gradient size mis-match: v: \(v.u.count), rhs: \(rhsCount)")
var lhsOutput: [U] = []
var rhsOutput: [U] = []
// Unbroadcasting
if lhsCount != 0 { lhsOutput = v.u }
if rhsCount != 0 { rhsOutput = v.u }
return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))
})
}
@differentiable(vjp: _vjpSubtract(lhs:rhs:))
public static func - (lhs: DiffList, rhs: DiffList) -> DiffList {
precondition(lhs.u.count == 0 || rhs.u.count == 0 || lhs.u.count == rhs.u.count,
"DiffList size mis-match: lhs: \(lhs.u.count), rhs: \(rhs.u.count)")
if lhs.u.count == 0 { return DiffList(u: rhs.u.map({ U.zero - $0 })) }
if rhs.u.count == 0 { return lhs }
var output: [U] = []
for i in 0..<lhs.u.count { output.append(lhs.u[i] - rhs.u[i]) }
return DiffList(u: output)
}
public static func _vjpSubtract(lhs: DiffList, rhs: DiffList) -> (DiffList, (DiffList) -> (DiffList, DiffList)) {
return (lhs - rhs, { [lhsCount = lhs.u.count, rhsCount = rhs.u.count] v in
precondition(v.u.count == lhsCount || lhsCount == 0,
"DiffList gradient size mis-match: v: \(v.u.count), lhs: \(lhsCount)")
precondition(v.u.count == rhsCount || rhsCount == 0,
"DiffList gradient size mis-match: v: \(v.u.count), rhs: \(rhsCount)")
var lhsOutput: [U] = []
var rhsOutput: [U] = []
// Unbroadcasting
if lhsCount != 0 { lhsOutput = v.u }
if rhsCount != 0 { rhsOutput = v.u.map({ U.zero - $0 }) }
return (DiffList(u: lhsOutput), DiffList(u: rhsOutput))
})
}
}
extension DiffList: VectorNumeric {
public typealias Scalar = U.Scalar
public static func * (lhs: Scalar, rhs: DiffList) -> DiffList {
return DiffList(u: rhs.u.map( { $0 * lhs } ))
}
}
extension DiffList: Differentiable {
public typealias TangentVector = DiffList
public typealias CotangentVector = DiffList
public typealias AllDifferentiableVariables = DiffList
public func tangentVector(from cotangent: CotangentVector) -> TangentVector {
return cotangent
}
}
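As a quick sanity check (a sketch, assuming Tensor<Float> satisfies DiffList's generic constraints the same way the layer gradient types used later do), note that the empty list acts as zero for a DiffList of any length, which is exactly what the "unbroadcasting" in the pullbacks above relies on.
import TensorFlow  // for Tensor in this quick demo
let demoA = DiffList(u: [Tensor<Float>(1), Tensor<Float>(2)])
let demoB = DiffList(u: [Tensor<Float>(10), Tensor<Float>(20)])
print(demoA + demoB)  // elementwise sum: u = [11.0, 22.0]
print(DiffList<Tensor<Float>>.zero + demoA == demoA)  // true: the empty list behaves as zero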
// export
import TensorFlow // Defines Layer.
// export
/// A struct that holds an ordered list of layers of the same type, applied in sequence.
public struct SequentialLayer<U: Layer>: KeyPathIterable where
U.Input == U.Output,
U.AllDifferentiableVariables: VectorNumeric,
U.AllDifferentiableVariables == U.CotangentVector {
public var layers: [U]
public init(layers: [U]) {
self.layers = layers
}
}
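For example (a sketch; the real model below does the same with MNIST-sized layers), a SequentialLayer is built from an array of layers that all map one input type to the same output type.
let demoStack = SequentialLayer(layers: [
    Dense<Float>(inputSize: 8, outputSize: 8),
    Dense<Float>(inputSize: 8, outputSize: 8),
    Dense<Float>(inputSize: 8, outputSize: 8)
])
print(demoStack.layers.count)  // 3 identically-typed layers chained together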
// export
// Mark SequentialLayer as Differentiable
extension SequentialLayer: Differentiable {
public typealias AllDifferentiableVariables = DiffList<U.AllDifferentiableVariables>
public typealias TangentVector = DiffList<U.TangentVector>
public typealias CotangentVector = DiffList<U.CotangentVector>
public func tangentVector(from cotangent: CotangentVector) -> TangentVector {
precondition(cotangent.u.count == layers.count, "Differing # of layers: \(cotangent.u.count) and \(layers.count)")
return DiffList(u: zip(layers, cotangent.u).map({ $0.0.tangentVector(from: $0.1) }))
}
public func moved(along direction: TangentVector) -> SequentialLayer {
precondition(direction.u.count == layers.count, "Differing # of layers: \(direction.u.count) and \(layers.count)")
return SequentialLayer(layers: zip(layers, direction.u).map({ $0.0.moved(along: $0.1) }))
}
public var allDifferentiableVariables: AllDifferentiableVariables {
get { return DiffList(u: layers.map({ $0.allDifferentiableVariables })) }
set {
precondition(newValue.u.count == layers.count, "Differing # of layers: \(newValue.u.count) and \(layers.count)")
for i in 0..<layers.count { layers[i].allDifferentiableVariables = newValue.u[i] }
}
}
}
// export
extension SequentialLayer: Layer {
public typealias Input = U.Input
public typealias Output = U.Output
@differentiable(vjp: _appliedDifferentiating(to:))
public func applied(to input: Input) -> Output {
var tmp = input
for layer in layers { tmp = layer.applied(to: tmp) }
return tmp
}
public func _appliedDifferentiating(to input: Input) -> (
Output, (Output.CotangentVector) -> (CotangentVector, Input.CotangentVector)) {
var pullbacks: [(U.Output.CotangentVector) -> (U.AllDifferentiableVariables, U.Input.CotangentVector)] = []
var tmp = input
for layer in layers {
let (output, pullback) = Swift.valueWithPullback(at: layer, tmp) { layer, input in
return layer.applied(to: input)
}
tmp = output
pullbacks.append(pullback)
}
return (tmp, { input in
var allDiffVars: [U.AllDifferentiableVariables] = []
var tmp = input
for pb in pullbacks.reversed() {
let (diffVars, input) = pb(tmp)
tmp = input
allDiffVars.append(diffVars)
}
return (DiffList(u: Array(allDiffVars.reversed())), tmp)
})
}
}
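To see the chained pullback in action, here is a small sketch (names and shapes are purely illustrative) that differentiates through a two-layer SequentialLayer; the gradient comes back as a DiffList with one entry per layer, in forward order.
let demoChain = SequentialLayer(layers: [
    Dense<Float>(inputSize: 4, outputSize: 3),
    Dense<Float>(inputSize: 3, outputSize: 2)
])
let demoInput = Tensor<Float>(zeros: [1, 4])
let (demoValue, demoGrads) = demoChain.valueWithGradient { chain -> Tensor<Float> in
    chain.applied(to: demoInput).sum()
}
print(demoValue)          // scalar produced by the two chained Dense layers
print(demoGrads.u.count)  // 2: one gradient entry per layer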
struct MyModel: Layer {
var layers: SequentialLayer<Dense<Float>>
init(inputSize: Int, hiddenUnits: [Int], outputSize: Int) {
// Chain dense layers: inputSize -> hiddenUnits... -> outputSize.
var layers: [Dense<Float>] = []
var input = inputSize
for hidden in hiddenUnits {
print("Making Dense<Float>(inputSize: \(input), outputSize: \(hidden))")
layers.append(Dense<Float>(inputSize: input, outputSize: hidden))
input = hidden
}
print("Making Dense<Float>(inputSize: \(input), outputSize: \(outputSize))")
layers.append(Dense<Float>(inputSize: input, outputSize: outputSize))
self.layers = SequentialLayer(layers: layers)
}
@differentiable
func applied(to input: Tensor<Float>) -> Tensor<Float> {
return layers.applied(to: input)
}
}
struct FixedModel: Layer {
var d1 = Dense<Float>(inputSize: 784, outputSize: 30)
var d2 = Dense<Float>(inputSize: 30, outputSize: 10)
@differentiable
func applied(to input: Tensor<Float>) -> Tensor<Float> {
return input.sequenced(through: d1, d2)
}
}
var (xTrain, yTrain, xValid, yValid) = loadMNIST(path: mnistPath, flat: true)
func logSumExp<Scalar>(_ x: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{
let m = x.max(alongAxes: -1)
return m + log(exp(x-m).sum(alongAxes: -1))
}
func logSoftmax<Scalar>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> where Scalar:TensorFlowFloatingPoint{
return activations - logSumExp(activations)
}
Context.local.learningPhase = .training
let lr:Float = 0.5 // learning rate
let epochs = 1 // how many epochs to train for
let bs=64 // batch size
let (n,m) = (60000,784) // MNIST dataset size
var modelFixed = FixedModel()
let modelFixedStart = modelFixed
modelFixedStart.d1.weight == modelFixed.d1.weight
var modelFlex = MyModel(inputSize: 784, hiddenUnits: [30], outputSize: 10)
let modelFlexStart = modelFlex
modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight
public func accuracy<Model: Layer>(_ model: Model, inputs: Tensor<Float>, target: Tensor<Int32>) -> Tensor<Float> where Model.Input == Tensor<Float>, Model.Output == Tensor<Float> {
return withLearningPhase(.inference) {
let output = model.applied(to: inputs)
let corrects = Tensor<Float>(output.argmax(squeezingAxis: 1) .== target)
return corrects.mean()
}
}
accuracy(modelFlex, inputs: xValid, target: yValid)
accuracy(modelFixed, inputs: xValid, target: yValid)
The training loops below are copied from 03_minibatch. Neither loop actually trains its model: parameters is a local copy of allDifferentiableVariables, so the key-path updates mutate the copy and never write back into the model (see the sketch after the weight checks below). :-(
for epoch in 1...epochs{
for i in 0..<((n-1)/bs){
let startIdx = i * bs
let endIdx = startIdx + bs
let xb = xTrain[startIdx..<endIdx]
let yb = yTrain[startIdx..<endIdx]
let (loss, grads) = modelFixed.valueWithGradient { model -> Tensor<Float> in
let preds = model.applied(to: xb)
return softmaxCrossEntropy(logits: preds, labels: yb)
}
var parameters = modelFixed.allDifferentiableVariables
for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){
parameters[keyPath: kp] -= lr * grads[keyPath:kp]
}
}
}
for epoch in 1...epochs{
for i in 0..<((n-1)/bs){
let startIdx = i * bs
let endIdx = startIdx + bs
let xb = xTrain[startIdx..<endIdx]
let yb = yTrain[startIdx..<endIdx]
let (loss, grads) = modelFlex.valueWithGradient { model -> Tensor<Float> in
let preds = model.applied(to: xb)
return softmaxCrossEntropy(logits: preds, labels: yb)
}
var parameters = modelFlex.allDifferentiableVariables
for kp in parameters.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self){
parameters[keyPath: kp] -= lr * grads[keyPath: kp]
}
}
}
accuracy(modelFlex, inputs: xValid, target: yValid)
accuracy(modelFixed, inputs: xValid, target: yValid)
modelFixedStart.d1.weight == modelFixed.d1.weight
modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight
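Because parameters in each loop above is a local value copy of allDifferentiableVariables, mutating it never reaches the model, so the weights stay at their initial values. One direct fix is to update the model through its own allDifferentiableVariables property. Here is a sketch of that variant on a fresh FixedModel (modelFixedDirect is just an illustrative name); the rest of this notebook instead takes the optimizer route, starting with the library SGD below.
var modelFixedDirect = FixedModel()
for epoch in 1...epochs {
    for i in 0..<((n-1)/bs) {
        let startIdx = i * bs
        let endIdx = startIdx + bs
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        let (_, grads) = modelFixedDirect.valueWithGradient { model -> Tensor<Float> in
            softmaxCrossEntropy(logits: model.applied(to: xb), labels: yb)
        }
        // Write through the model's own allDifferentiableVariables so the update sticks.
        for kp in modelFixedDirect.allDifferentiableVariables.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
            modelFixedDirect.allDifferentiableVariables[keyPath: kp] -= lr * grads[keyPath: kp]
        }
    }
}
accuracy(modelFixedDirect, inputs: xValid, target: yValid)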
let optimizerFixed = SGD<FixedModel, Float>(learningRate: lr)
// let optimizerFlex = SGD<MyModel, Float>(learningRate: lr) // SGD doesn't work for the new flex-style models due to the interaction between how .zero is defined and key-path iteration.
/// A minimal stochastic gradient descent (SGD) optimizer.
///
/// Unlike the library SGD, this has no momentum, learning rate decay, or Nesterov momentum;
/// it simply walks every writable Tensor key path and applies a plain gradient step.
public class SimpleSGD<Model: Layer, Scalar: TensorFlowFloatingPoint>: Optimizer
where Model.AllDifferentiableVariables == Model.CotangentVector {
/// The learning rate.
public var learningRate: Scalar
public init(
learningRate: Scalar = 0.01
) {
precondition(learningRate >= 0, "Learning rate must be non-negative")
self.learningRate = learningRate
}
public func update(_ model: inout Model.AllDifferentiableVariables,
along direction: Model.CotangentVector) {
for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Scalar>.self) {
model[keyPath: kp] -= learningRate * direction[keyPath: kp]
}
}
}
let simpleOptFlex = SimpleSGD<MyModel, Float>(learningRate: lr)
for epoch in 1...epochs{
for i in 0..<((n-1)/bs){
let startIdx = i * bs
let endIdx = startIdx + bs
let xb = xTrain[startIdx..<endIdx]
let yb = yTrain[startIdx..<endIdx]
let (loss, grads) = modelFixed.valueWithGradient { model -> Tensor<Float> in
let preds = model.applied(to: xb)
return softmaxCrossEntropy(logits: preds, labels: yb)
}
optimizerFixed.update(&modelFixed.allDifferentiableVariables, along: grads)
}
}
accuracy(modelFixed, inputs: xValid, target: yValid)
modelFixedStart.d1.weight == modelFixed.d1.weight
for epoch in 1...epochs{
for i in 0..<((n-1)/bs){
let startIdx = i * bs
let endIdx = startIdx + bs
let xb = xTrain[startIdx..<endIdx]
let yb = yTrain[startIdx..<endIdx]
let (loss, grads) = modelFlex.valueWithGradient { model -> Tensor<Float> in
let preds = model.applied(to: xb)
return softmaxCrossEntropy(logits: preds, labels: yb)
}
// optimizerFlex.update(&modelFlex.allDifferentiableVariables, along: grads)
simpleOptFlex.update(&modelFlex.allDifferentiableVariables, along: grads)
}
}
accuracy(modelFlex, inputs: xValid, target: yValid)
modelFlex.layers.layers[0].weight == modelFlexStart.layers.layers[0].weight