#@title ##### Licensed under the Apache License, Version 2.0 (the "License"); { display-mode: "form" }
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()

import tensorflow_datasets as tfds
import tensorflow_probability as tfp

from discussion import nn as tfp_nn

# Globally enable XLA.
# tf.config.optimizer.set_jit(True)

try:
  physical_devices = tf.config.list_physical_devices('GPU')
  tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
  # Invalid device or cannot modify virtual devices once initialized.
  pass

tfb = tfp.bijectors
tfd = tfp.distributions

[train_dataset, eval_dataset], datasets_info = tfds.load(
    name='mnist',
    split=['train', 'test'],
    with_info=True,
    as_supervised=True,
    shuffle_files=True)


def _preprocess(image, label):
  image = tf.cast(image, dtype=tf.float32) / 255.
  label = tf.cast(label, dtype=tf.int32)
  return image, label


train_size = datasets_info.splits['train'].num_examples
batch_size = 32

train_dataset = tfp_nn.util.tune_dataset(
    train_dataset,
    batch_size=batch_size,
    shuffle_size=int(train_size / 7),
    preprocess_fn=_preprocess)

eval_dataset = tfp_nn.util.tune_dataset(
    eval_dataset,
    repeat_count=None,
    preprocess_fn=_preprocess)

x, y = next(iter(eval_dataset.batch(10)))
tfp_nn.util.display_imgs(x, y);

max_pool = tf.keras.layers.MaxPooling2D(  # Has no tf.Variables.
    pool_size=(2, 2),
    strides=(2, 2),
    padding='SAME',
    data_format='channels_last')

bnn = tfp_nn.Sequential([
    tfp_nn.ConvolutionVariationalFlipout(
        input_size=1,
        output_size=8,
        filter_shape=5,
        padding='SAME',
        penalty_weight=1. / train_size,
        name='conv1'),
    tf.nn.leaky_relu,
    max_pool,  # [28, 28, 8] -> [14, 14, 8]
    tfp_nn.ConvolutionVariationalFlipout(
        input_size=8,
        output_size=16,
        filter_shape=5,
        padding='SAME',
        penalty_weight=1. / train_size,
        name='conv2'),
    tf.nn.leaky_relu,
    max_pool,  # [14, 14, 16] -> [7, 7, 16]
    tfp_nn.util.flatten_rightmost,
    tfp_nn.AffineVariationalReparameterizationLocal(
        input_size=7 * 7 * 16,
        output_size=10,
        penalty_weight=1. / train_size,
        name='affine1'),
    lambda x: tfd.Categorical(logits=x, dtype=tf.int32),
], name='BNN')

print(bnn.summary())
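# Quick sanity check (an illustrative addition, not part of the original
# example). Each `penalty_weight=1. / train_size` above rescales that layer's
# KL(q || prior) penalty to a per-example quantity, so the training loss
# below (minibatch-mean NLL plus the summed penalties in `extra_loss`) is a
# negative per-example ELBO. One forward pass through the untrained BNN
# should yield a tfd.Categorical over the 10 digit classes:
x_check, y_check = next(iter(eval_dataset.batch(3)))
dist_check = bnn(x_check)
print('Forward pass yields:', dist_check)
print('Sampled class predictions:', dist_check.sample().numpy())
print('Accumulated KL penalty:', bnn.extra_loss)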
train_iter = iter(train_dataset)
eval_iter = iter(eval_dataset.batch(2000).repeat())


def loss():
  x, y = next(train_iter)
  nll = -tf.reduce_mean(bnn(x).log_prob(y), axis=-1)
  kl = bnn.extra_loss
  return nll + kl, (nll, kl)


opt = tf.optimizers.Adam(learning_rate=1e-2)

fit = tfp_nn.util.make_fit_op(
    loss,
    opt,
    bnn.trainable_variables,
    grad_summary_fn=lambda gs: tf.nest.map_structure(tf.norm, gs))


@tf.function(autograph=False)
def eval():
  with tf.xla.experimental.jit_scope(compile_ops=True):
    x, y = next(eval_iter)
    yhat = bnn(x)
    nll = -tf.reduce_mean(yhat.log_prob(y))
    kl = bnn.extra_loss
    loss = nll + kl
    acc = tf.reduce_mean(tf.cast(tf.equal(y, yhat.mode()), tf.float32),
                         axis=-1)
    return loss, acc, nll, kl


num_train_steps = 20e3  # @param { isTemplate: true}
num_train_steps = int(num_train_steps)  # Enforce correct type when overridden.

dur_sec = dur_num = 0
for i in range(num_train_steps):
  start = time.time()
  trn_loss, (trn_nll, trn_kl), g = fit()
  stop = time.time()
  dur_sec += stop - start
  dur_num += 1
  if i % 100 == 0 or i == num_train_steps - 1:
    tst_loss, tst_acc, tst_nll, tst_kl = eval()
    f, x = zip(*[
        ('it:{:5}', opt.iterations),
        ('ms/it:{:6.4f}', dur_sec / max(1., dur_num) * 1000.),
        ('tst_acc:{:6.4f}', tst_acc),
        ('trn_loss:{:6.4f}', trn_loss),
        ('tst_loss:{:6.4f}', tst_loss),
        ('tst_nll:{:6.4f}', tst_nll),
        ('tst_kl:{:6.4f}', tst_kl),
        ('sum_norm_grad:{:6.4f}', sum(g)),
    ])
    print(' '.join(f).format(*[getattr(x_, 'numpy', lambda: x_)()
                               for x_ in x]))
    sys.stdout.flush()
    dur_sec = dur_num = 0
  # if i % 1000 == 0 or i == num_train_steps - 1:
  #   bnn.save('/tmp/bnn.npz')

# Run inference multiple times...
num_inferences = 10  # @param { isTemplate: true}


@tf.function(autograph=False)
def predicted_log_prob(x):
  with tf.xla.experimental.jit_scope(compile_ops=True):
    return tf.math.log_softmax(bnn(x).logits, axis=-1)


eval_iter = iter(eval_dataset.batch(2000).repeat(int(num_inferences)))
before_avg_predicted_log_probs = tf.reshape(
    tf.stack([predicted_log_prob(x) for x, _ in eval_iter], axis=0),
    shape=[int(num_inferences),
           datasets_info.splits['test'].num_examples,
           -1])
bnn_predicted_log_probs = tfp.math.reduce_logmeanexp(
    before_avg_predicted_log_probs, axis=0)

decision = tf.argmax(bnn_predicted_log_probs, axis=-1, output_type=tf.int32)
confidence = tf.reduce_max(bnn_predicted_log_probs, axis=-1)

threshold = 0.95
decided_idx = tf.where(confidence > np.log(threshold))
ordered = tf.argsort(confidence)

n = datasets_info.splits['test'].num_examples
x_final, y_final = next(iter(eval_dataset.batch(n)))

print('Number of examples undecided: {}'.format(n - tf.size(decided_idx)))
accuracy = tf.reduce_mean(
    tf.cast(tf.equal(tf.gather(y_final, decided_idx),
                     tf.gather(decision, decided_idx)),
            tf.float32))
print('Accuracy after excluding undecided ones: {}'.format(accuracy))

tfp_nn.util.display_imgs(
    tf.gather(x_final, ordered[0:50]),
    tf.gather(y_final, ordered[0:50]));

from sklearn import metrics

bnn_auc = np.array([
    metrics.roc_auc_score(tf.equal(y_final, i), bnn_predicted_log_probs[:, i])
    for i in range(10)])
print('Per class AUC:\n{}'.format(bnn_auc[:, np.newaxis]))
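# `reduce_logmeanexp` above marginalizes over the weight posterior in log
# space: log p(y|x) ~= log (1/S) sum_s p(y|x, w_s), where the S =
# num_inferences samples come from re-running the stochastic forward pass.
# An illustrative equivalence check (not part of the original example),
# using logmeanexp(a) = logsumexp(a) - log(S):
manual_log_probs = (
    tf.reduce_logsumexp(before_avg_predicted_log_probs, axis=0)
    - np.log(float(num_inferences)))
print('max |manual - logmeanexp|:',
      tf.reduce_max(tf.abs(manual_log_probs - bnn_predicted_log_probs)).numpy())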
max_pool = tf.keras.layers.MaxPooling2D(  # Has no tf.Variables.
    pool_size=(2, 2),
    strides=(2, 2),
    padding='SAME',
    data_format='channels_last')

dnn = tfp_nn.Sequential([
    tfp_nn.Convolution(
        input_size=1,
        output_size=8,
        filter_shape=5,
        padding='SAME',
        name='conv1'),
    tf.nn.leaky_relu,
    max_pool,  # [28, 28, 8] -> [14, 14, 8]
    tfp_nn.Convolution(
        input_size=8,
        output_size=16,
        filter_shape=5,
        padding='SAME',
        name='conv2'),
    tf.nn.leaky_relu,
    max_pool,  # [14, 14, 16] -> [7, 7, 16]
    tfp_nn.util.flatten_rightmost,
    tfp_nn.Affine(
        input_size=7 * 7 * 16,
        output_size=10,
        name='affine1'),
    lambda x: tfd.Categorical(logits=x, dtype=tf.int32),
], name='DNN')

print(dnn.summary())

train_iter = iter(train_dataset)
eval_iter = iter(eval_dataset.batch(2000).repeat())


def loss():
  x, y = next(train_iter)
  return -tf.reduce_mean(dnn(x).log_prob(y), axis=-1), None


opt = tf.optimizers.Adam(learning_rate=1e-2)

fit = tfp_nn.util.make_fit_op(
    loss,
    opt,
    dnn.trainable_variables,
    grad_summary_fn=lambda gs: tf.nest.map_structure(tf.norm, gs))


@tf.function(autograph=False)
def eval():
  with tf.xla.experimental.jit_scope(compile_ops=True):
    x, y = next(eval_iter)
    yhat = dnn(x)
    nll = -tf.reduce_mean(yhat.log_prob(y), axis=-1)
    acc = tf.reduce_mean(tf.cast(tf.equal(y, yhat.mode()), tf.float32),
                         axis=-1)
    return nll, acc


num_train_steps = 20e3  # @param { isTemplate: true}
num_train_steps = int(num_train_steps)  # Enforce correct type when overridden.

dur_sec = dur_num = 0
for i in range(num_train_steps):
  start = time.time()
  trn_loss, _, g = fit()
  stop = time.time()
  dur_sec += stop - start
  dur_num += 1
  if i % 100 == 0 or i == num_train_steps - 1:
    tst_loss, tst_acc = eval()
    f, x = zip(*[
        ('it:{:5}', opt.iterations),
        ('ms/it:{:6.4f}', dur_sec / max(1., dur_num) * 1000.),
        ('tst_acc:{:6.4f}', tst_acc),
        ('trn_loss:{:6.4f}', trn_loss),
        ('tst_loss:{:6.4f}', tst_loss),
        ('sum_norm_grad:{:6.4f}', sum(g)),
    ])
    print(' '.join(f).format(*[getattr(x_, 'numpy', lambda: x_)()
                               for x_ in x]))
    sys.stdout.flush()
    dur_sec = dur_num = 0
  # if i % 1000 == 0 or i == num_train_steps - 1:
  #   dnn.save('/tmp/dnn.npz')


@tf.function(autograph=False)
def dnn_predicted_log_prob(x):
  with tf.xla.experimental.jit_scope(compile_ops=True):
    return tf.math.log_softmax(dnn(x).logits, axis=-1)


eval_iter = iter(eval_dataset.batch(2000))
dnn_predicted_log_probs = tf.reshape(
    tf.stack([dnn_predicted_log_prob(x) for x, _ in eval_iter], axis=0),
    shape=[datasets_info.splits['test'].num_examples, -1])

decision = tf.argmax(dnn_predicted_log_probs, axis=-1, output_type=tf.int32)
confidence = tf.reduce_max(dnn_predicted_log_probs, axis=-1)

threshold = 0.95
decided_idx = tf.where(confidence > np.log(threshold))
ordered = tf.argsort(confidence)

n = datasets_info.splits['test'].num_examples
x_final, y_final = next(iter(eval_dataset.batch(n)))

print('Number of examples undecided: {}'.format(n - tf.size(decided_idx)))
accuracy = tf.reduce_mean(
    tf.cast(tf.equal(tf.gather(y_final, decided_idx),
                     tf.gather(decision, decided_idx)),
            tf.float32))
print('Accuracy after excluding undecided ones: {}'.format(accuracy))

tfp_nn.util.display_imgs(
    tf.gather(x_final, ordered[0:50]),
    tf.gather(y_final, ordered[0:50]));

from sklearn import metrics

dnn_auc = np.array([
    metrics.roc_auc_score(tf.equal(y_final, i), dnn_predicted_log_probs[:, i])
    for i in range(10)])
print('Per class AUC:\n{}'.format(dnn_auc[:, np.newaxis]))
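# Side-by-side comparison of the per-class AUCs computed above for the two
# models (an illustrative addition; exact values depend on the training runs).
print('Mean AUC, BNN: {:.4f}, DNN: {:.4f}'.format(
    bnn_auc.mean(), dnn_auc.mean()))
for digit, (b_auc, d_auc) in enumerate(zip(bnn_auc, dnn_auc)):
  print('class {}: BNN {:.4f}  DNN {:.4f}'.format(digit, b_auc, d_auc))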