#!/usr/bin/env python
# coding: utf-8

# # Enter State Farm

# In[1]:

from theano.sandbox import cuda
cuda.use('gpu0')


# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')
from __future__ import print_function, division
path = "data/state/"
#path = "data/state/sample/"
import utils; reload(utils)
from utils import *
from IPython.display import FileLink


# In[3]:

batch_size=64


# ## Setup batches

# In[4]:

batches = get_batches(path+'train', batch_size=batch_size)
val_batches = get_batches(path+'valid', batch_size=batch_size*2, shuffle=False)


# In[5]:

(val_classes, trn_classes, val_labels, trn_labels,
    val_filenames, filenames, test_filenames) = get_classes(path)


# Rather than using batches, we could just import all the data into an array to save some processing time. (In most examples I'm using the batches, however - just because that's how I happened to start out.)

# In[53]:

trn = get_data(path+'train')
val = get_data(path+'valid')


# In[ ]:

save_array(path+'results/val.dat', val)
save_array(path+'results/trn.dat', trn)


# In[7]:

val = load_array(path+'results/val.dat')
trn = load_array(path+'results/trn.dat')


# ## Re-run sample experiments on full dataset

# We should find that everything that worked on the sample (see statefarm-sample.ipynb) works on the full dataset too - only better, because now we have more data. So let's see how they go; the models in this section are exact copies of the sample notebook models.

# ### Single conv layer

# In[19]:

def conv1(batches):
    model = Sequential([
            BatchNormalization(axis=1, input_shape=(3,224,224)),
            Convolution2D(32,3,3, activation='relu'),
            BatchNormalization(axis=1),
            MaxPooling2D((3,3)),
            Convolution2D(64,3,3, activation='relu'),
            BatchNormalization(axis=1),
            MaxPooling2D((3,3)),
            Flatten(),
            Dense(200, activation='relu'),
            BatchNormalization(),
            Dense(10, activation='softmax')
        ])

    model.compile(Adam(lr=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit_generator(batches, batches.nb_sample, nb_epoch=2, validation_data=val_batches,
                        nb_val_samples=val_batches.nb_sample)
    model.optimizer.lr = 0.001
    model.fit_generator(batches, batches.nb_sample, nb_epoch=4, validation_data=val_batches,
                        nb_val_samples=val_batches.nb_sample)
    return model


# In[20]:

model = conv1(batches)


# Interestingly, with no regularization or augmentation we're getting some reasonable results from our simple convolutional model. So with augmentation we will hopefully see some very good results.

# ### Data augmentation

# In[6]:

gen_t = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.05,
                shear_range=0.1, channel_shift_range=20, width_shift_range=0.1)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)


# In[22]:

model = conv1(batches)


# In[23]:

model.optimizer.lr = 0.0001
model.fit_generator(batches, batches.nb_sample, nb_epoch=15, validation_data=val_batches,
                    nb_val_samples=val_batches.nb_sample)


# I'm shocked by *how* good these results are! We're regularly seeing 75-80% accuracy on the validation set, which puts us into the top third or better of the competition. With such a simple model and no dropout or semi-supervised learning, this really speaks to the power of this approach to data augmentation.
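# Before training on the augmented batches for longer, it's worth eyeballing what `gen_t` actually produces. The cell below is a minimal sketch (not part of the original run): it pulls one training image and plots eight randomly augmented variants, assuming the `plots` helper from `utils` is available.

# In[ ]:

# Sanity-check the augmentation settings: generate a few augmented copies of a
# single training image and plot them. Assumes `trn` (loaded above) and the
# `plots` helper from utils.
img = np.expand_dims(trn[0], 0)            # shape (1, 3, 224, 224)
aug_iter = gen_t.flow(img, batch_size=1)   # yields randomly augmented copies of img
aug_imgs = np.concatenate([next(aug_iter) for i in range(8)])
plots(aug_imgs, rows=2)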
# ### Four conv/pooling pairs + dropout

# Unfortunately, the results are still very unstable - the validation accuracy jumps from epoch to epoch. Perhaps a deeper model with some dropout would help.

# In[20]:

gen_t = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.05,
                shear_range=0.1, channel_shift_range=20, width_shift_range=0.1)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)


# In[21]:

model = Sequential([
        BatchNormalization(axis=1, input_shape=(3,224,224)),
        Convolution2D(32,3,3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D(),
        Convolution2D(64,3,3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D(),
        Convolution2D(128,3,3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D(),
        Flatten(),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(10, activation='softmax')
    ])


# In[22]:

model.compile(Adam(lr=10e-5), loss='categorical_crossentropy', metrics=['accuracy'])


# In[23]:

model.fit_generator(batches, batches.nb_sample, nb_epoch=2, validation_data=val_batches,
                    nb_val_samples=val_batches.nb_sample)


# In[24]:

model.optimizer.lr = 0.001


# In[25]:

model.fit_generator(batches, batches.nb_sample, nb_epoch=10, validation_data=val_batches,
                    nb_val_samples=val_batches.nb_sample)


# In[26]:

model.optimizer.lr = 0.00001


# In[27]:

model.fit_generator(batches, batches.nb_sample, nb_epoch=10, validation_data=val_batches,
                    nb_val_samples=val_batches.nb_sample)


# This is looking quite a bit better - the accuracy is similar, but the stability is higher. There's still some way to go however...

# ### Imagenet conv features

# Since we have so little data, and it is similar to imagenet images (full color photos), using pre-trained VGG weights is likely to be helpful - in fact it seems likely that we won't need to fine-tune the convolutional layer weights much, if at all. So we can pre-compute the output of the last convolutional layer, as we did in lesson 3 when we experimented with dropout. (However this means that we can't use full data augmentation, since we can't pre-compute something that changes every image.)

# In[14]:

vgg = Vgg16()
model = vgg.model
last_conv_idx = [i for i,l in enumerate(model.layers) if type(l) is Convolution2D][-1]
conv_layers = model.layers[:last_conv_idx+1]


# In[15]:

conv_model = Sequential(conv_layers)


# In[ ]:

# batches shuffle must be set to False when pre-computing features
batches = get_batches(path+'train', batch_size=batch_size, shuffle=False)


# In[16]:

(val_classes, trn_classes, val_labels, trn_labels,
    val_filenames, filenames, test_filenames) = get_classes(path)


# In[ ]:

conv_feat = conv_model.predict_generator(batches, batches.nb_sample)
conv_val_feat = conv_model.predict_generator(val_batches, val_batches.nb_sample)
conv_test_feat = conv_model.predict_generator(test_batches, test_batches.nb_sample)


# In[ ]:

save_array(path+'results/conv_val_feat.dat', conv_val_feat)
save_array(path+'results/conv_test_feat.dat', conv_test_feat)
save_array(path+'results/conv_feat.dat', conv_feat)


# In[10]:

conv_feat = load_array(path+'results/conv_feat.dat')
conv_val_feat = load_array(path+'results/conv_val_feat.dat')
conv_val_feat.shape
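# A note on the pre-computed features: because these generators were created with shuffle=False, row i of each feature array corresponds to the i-th file in the directory listing, so the labels from get_classes line up with them. The test generator isn't created anywhere in this notebook; the cell below is a sketch of how it is assumed to have been built (mirroring the other get_batches calls), along with a quick alignment check - none of this is part of the original run.

# In[ ]:

# Assumed construction of test_batches, used when pre-computing conv_test_feat above:
test_batches = get_batches(path+'test', batch_size=batch_size, shuffle=False)

# With shuffle=False, labels rebuilt from the generator's class indices should
# line up row-for-row with the pre-computed training features.
from keras.utils.np_utils import to_categorical
assert conv_feat.shape[0] == batches.nb_sample
assert np.allclose(to_categorical(batches.classes), trn_labels)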
# ### Batchnorm dense layers on pretrained conv layers

# Since we've pre-computed the output of the last convolutional layer, we need to create a network that takes that as input, and predicts our 10 classes. Let's try using a simplified version of VGG's dense layers.

# In[71]:

def get_bn_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dropout(p/2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(p/2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(10, activation='softmax')
        ]


# In[72]:

p = 0.8


# In[73]:

bn_model = Sequential(get_bn_layers(p))
bn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])


# In[74]:

bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, nb_epoch=1,
             validation_data=(conv_val_feat, val_labels))


# In[75]:

bn_model.optimizer.lr = 0.01


# In[76]:

bn_model.fit(conv_feat, trn_labels, batch_size=batch_size, nb_epoch=2,
             validation_data=(conv_val_feat, val_labels))


# In[77]:

bn_model.save_weights(path+'models/conv8.h5')


# Looking good! Let's try pre-computing 5 epochs' worth of augmented data, so we can experiment with combining dropout and augmentation on the pre-trained model.

# ### Pre-computed data augmentation + dropout

# We'll use our usual data augmentation parameters:

# In[107]:

gen_t = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.05,
                shear_range=0.1, channel_shift_range=20, width_shift_range=0.1)
da_batches = get_batches(path+'train', gen_t, batch_size=batch_size, shuffle=False)


# We use those to create a dataset of convolutional features 5x bigger than the training set.

# In[108]:

da_conv_feat = conv_model.predict_generator(da_batches, da_batches.nb_sample*5)


# In[109]:

save_array(path+'results/da_conv_feat2.dat', da_conv_feat)


# In[78]:

da_conv_feat = load_array(path+'results/da_conv_feat2.dat')


# Let's include the real training data as well, in its non-augmented form.

# In[131]:

da_conv_feat = np.concatenate([da_conv_feat, conv_feat])


# Since we've now got a dataset 6x bigger than before, we'll need to copy our labels 6 times too.

# In[132]:

da_trn_labels = np.concatenate([trn_labels]*6)


# Based on some experiments, the previous model works well with bigger dense layers.

# In[210]:

def get_bn_da_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dropout(p),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(10, activation='softmax')
        ]


# In[216]:

p = 0.8


# In[240]:

bn_model = Sequential(get_bn_da_layers(p))
bn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])


# Now we can train the model as usual, with pre-computed augmented data.

# In[241]:

bn_model.fit(da_conv_feat, da_trn_labels, batch_size=batch_size, nb_epoch=1,
             validation_data=(conv_val_feat, val_labels))


# In[242]:

bn_model.optimizer.lr = 0.01


# In[243]:

bn_model.fit(da_conv_feat, da_trn_labels, batch_size=batch_size, nb_epoch=4,
             validation_data=(conv_val_feat, val_labels))


# In[244]:

bn_model.optimizer.lr = 0.0001


# In[245]:

bn_model.fit(da_conv_feat, da_trn_labels, batch_size=batch_size, nb_epoch=4,
             validation_data=(conv_val_feat, val_labels))


# Looks good - let's save those weights.

# In[246]:

bn_model.save_weights(path+'models/da_conv8_1.h5')
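# One thing worth double-checking with this approach: predict_generator just loops over the (unshuffled) augmented generator, so the 5x feature array is five passes over the training set in the same file order, and the tiled labels only line up if that holds. The cell below is a small sanity check along those lines (an addition, not part of the original run).

# In[ ]:

# da_conv_feat is five augmented passes plus one plain pass over the training
# set, all in the same unshuffled file order, so the tiled labels should match
# the features row-for-row.
assert da_conv_feat.shape[0] == 6 * conv_feat.shape[0]
assert da_trn_labels.shape[0] == da_conv_feat.shape[0]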
# ### Pseudo labeling

# We're going to try using a combination of [pseudo labeling](http://deeplearning.net/wp-content/uploads/2013/03/pseudo_label_final.pdf) and [knowledge distillation](https://arxiv.org/abs/1503.02531) to allow us to use unlabeled data (i.e. do semi-supervised learning). For our initial experiment we'll use the validation set as the unlabeled data, so that we can see that it is working without touching the test set. At a later date we'll try using the test set.

# To do this, we simply calculate the predictions of our model...

# In[247]:

val_pseudo = bn_model.predict(conv_val_feat, batch_size=batch_size)


# ...concatenate them with our training labels...

# In[255]:

comb_pseudo = np.concatenate([da_trn_labels, val_pseudo])


# In[256]:

comb_feat = np.concatenate([da_conv_feat, conv_val_feat])


# ...and fine-tune our model using that data.

# In[257]:

bn_model.load_weights(path+'models/da_conv8_1.h5')


# In[258]:

bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=1,
             validation_data=(conv_val_feat, val_labels))


# In[259]:

bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=4,
             validation_data=(conv_val_feat, val_labels))


# In[260]:

bn_model.optimizer.lr = 0.00001


# In[261]:

bn_model.fit(comb_feat, comb_pseudo, batch_size=batch_size, nb_epoch=4,
             validation_data=(conv_val_feat, val_labels))


# That's a distinct improvement - even though the validation set isn't very big. This looks encouraging for when we try this on the test set.

# In[262]:

bn_model.save_weights(path+'models/bn-ps8.h5')


# ### Submit

# We'll find a good clipping amount using the validation set, prior to submitting.

# In[271]:

def do_clip(arr, mx):
    return np.clip(arr, (1-mx)/9, mx)


# In[282]:

# val_preds is assumed to be bn_model's predictions on conv_val_feat
# (e.g. bn_model.predict(conv_val_feat, batch_size=batch_size)); the cell that
# computed it isn't included above.
keras.metrics.categorical_crossentropy(val_labels, do_clip(val_preds, 0.93)).eval()


# In[283]:

conv_test_feat = load_array(path+'results/conv_test_feat.dat')


# In[284]:

preds = bn_model.predict(conv_test_feat, batch_size=batch_size*2)


# In[285]:

subm = do_clip(preds, 0.93)


# In[305]:

subm_name = path+'results/subm.gz'


# In[296]:

classes = sorted(batches.class_indices, key=batches.class_indices.get)


# In[301]:

submission = pd.DataFrame(subm, columns=classes)
submission.insert(0, 'img', [a[4:] for a in test_filenames])
submission.head()


# In[307]:

submission.to_csv(subm_name, index=False, compression='gzip')


# In[308]:

FileLink(subm_name)


# This gets 0.534 on the leaderboard.
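# For reference, here's a minimal sketch (not part of the original run) of how the clipping amount could be chosen: predict on the pre-computed validation features and compare the clipped log loss for a few candidate values, then use the best one for the submission.

# In[ ]:

# Sweep a few clipping values on the validation set and report the mean log
# loss for each; val_preds is simply the model's validation-set predictions.
val_preds = bn_model.predict(conv_val_feat, batch_size=batch_size)
for mx in [0.90, 0.93, 0.95, 0.98]:
    clipped = do_clip(val_preds, mx)
    loss = keras.metrics.categorical_crossentropy(val_labels, clipped).eval()
    print(mx, loss.mean())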
# ## The "things that didn't really work" section

# You can safely ignore everything from here on, because these experiments didn't really help.

# ### Finetune some conv layers too

# In[28]:

for l in get_bn_layers(p):
    conv_model.add(l)


# In[29]:

for l1, l2 in zip(bn_model.layers, conv_model.layers[last_conv_idx+1:]):
    l2.set_weights(l1.get_weights())


# In[30]:

for l in conv_model.layers:
    l.trainable = False


# In[31]:

for l in conv_model.layers[last_conv_idx+1:]:
    l.trainable = True


# In[36]:

comb = np.concatenate([trn, val])


# In[37]:

gen_t = image.ImageDataGenerator(rotation_range=8, height_shift_range=0.04,
                shear_range=0.03, channel_shift_range=10, width_shift_range=0.08)


# In[38]:

batches = gen_t.flow(comb, comb_pseudo, batch_size=batch_size)


# In[176]:

val_batches = get_batches(path+'valid', batch_size=batch_size*2, shuffle=False)


# In[177]:

conv_model.compile(Adam(lr=0.00001), loss='categorical_crossentropy', metrics=['accuracy'])


# In[178]:

conv_model.fit_generator(batches, batches.N, nb_epoch=1, validation_data=val_batches,
                         nb_val_samples=val_batches.N)


# In[ ]:

conv_model.optimizer.lr = 0.0001


# In[ ]:

conv_model.fit_generator(batches, batches.N, nb_epoch=3, validation_data=val_batches,
                         nb_val_samples=val_batches.N)


# In[ ]:

for l in conv_model.layers[16:]:
    l.trainable = True


# In[ ]:

conv_model.optimizer.lr = 0.00001


# In[ ]:

conv_model.fit_generator(batches, batches.N, nb_epoch=8, validation_data=val_batches,
                         nb_val_samples=val_batches.N)


# In[ ]:

conv_model.save_weights(path+'models/conv8_ps.h5')


# In[77]:

conv_model.load_weights(path+'models/conv8_da.h5')


# In[135]:

val_pseudo = conv_model.predict(val, batch_size=batch_size*2)


# In[159]:

save_array(path+'models/pseudo8_da.dat', val_pseudo)


# ### Ensembling

# In[14]:

drivers_ds = pd.read_csv(path+'driver_imgs_list.csv')
drivers_ds.head()


# In[15]:

img2driver = drivers_ds.set_index('img')['subject'].to_dict()


# In[16]:

driver2imgs = {k: g["img"].tolist() for k,g in drivers_ds[['subject', 'img']].groupby("subject")}


# In[56]:

def get_idx(driver_list):
    return [i for i,f in enumerate(filenames) if img2driver[f[3:]] in driver_list]


# In[17]:

drivers = driver2imgs.keys()


# In[94]:

rnd_drivers = np.random.permutation(drivers)


# In[95]:

ds1 = rnd_drivers[:len(rnd_drivers)//2]
ds2 = rnd_drivers[len(rnd_drivers)//2:]


# In[68]:

# fit_conv isn't defined in this notebook - see the sketch at the end of this section.
models = [fit_conv([d]) for d in drivers]
models = [m for m in models if m is not None]


# In[77]:

all_preds = np.stack([m.predict(conv_test_feat, batch_size=128) for m in models])
avg_preds = all_preds.mean(axis=0)
avg_preds = avg_preds/np.expand_dims(avg_preds.sum(axis=1), 1)


# In[102]:

# avg_val_preds is assumed to be the corresponding ensemble average over conv_val_feat.
keras.metrics.categorical_crossentropy(val_labels, np.clip(avg_val_preds, 0.01, 0.99)).eval()


# In[103]:

keras.metrics.categorical_accuracy(val_labels, np.clip(avg_val_preds, 0.01, 0.99)).eval()
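# `fit_conv`, used in the ensembling cells above, isn't defined anywhere in this notebook. The sketch below is a guess at its shape based on the driver-split helpers above (hold out one driver's images as validation, train a fresh batchnorm-dense model on the remaining drivers' pre-computed conv features, and discard models that don't reach a useful accuracy); it's an illustration, not the author's original code.

# In[ ]:

# Hypothetical sketch of fit_conv: one model per held-out driver, trained on the
# pre-computed (non-augmented) conv features.
def fit_conv(val_drivers):
    val_idx = get_idx(val_drivers)            # indices of images by the held-out driver(s)
    val_set = set(val_idx)
    trn_idx = [i for i in range(len(filenames)) if i not in val_set]
    m = Sequential(get_bn_layers(p))
    m.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    hist = m.fit(conv_feat[trn_idx], trn_labels[trn_idx], batch_size=batch_size, nb_epoch=3,
                 validation_data=(conv_feat[val_idx], trn_labels[val_idx]))
    # Drop models that fail to generalise to the unseen driver at all
    # (the 0.5 threshold is an arbitrary choice for this sketch).
    return m if hist.history['val_acc'][-1] > 0.5 else None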