#!/usr/bin/env python
# coding: utf-8

# Notebook export: loads the IMDB review dataset and its word index, then
# starts preparing the data. Bare expressions (e.g. ``idx_arr[:10]``) are
# notebook cell outputs and are kept as they were.

# Future statements must precede every other statement in a module, so this
# import lives here rather than in the cell where it was originally typed.
from __future__ import division, print_function

# In[1]:

from theano.sandbox import cuda

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')
import utils
reload(utils)
# The wildcard import is what brings np, pickle and get_file into scope below.
from utils import *

# In[3]:

model_path = 'data/imdb/models/'
get_ipython().run_line_magic('mkdir', '-p $model_path')

# ## Setup data

# We're going to look at the IMDB dataset, which contains movie reviews from
# IMDB, along with their sentiment. Keras comes with some helpers for this
# dataset.

# In[4]:

from keras.datasets import imdb
idx = imdb.get_word_index()

# This is the word list:

# In[5]:

idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]

# ...and this is the mapping from id to word

# In[6]:

idx2word = {v: k for k, v in idx.iteritems()}

# We download the reviews using code copied from keras.datasets:

# In[ ]:

path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
# NOTE(review): unpickling a downloaded file can execute arbitrary code.
# get_file is given an md5_hash, presumably to validate the download -- confirm
# that check is enforced before trusting this file.
# The context manager closes the handle even if unpickling raises.
with open(path, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

# In[ ]:

len(x_train)

# Here's the 1st review. As you see, the words have been replaced by ids. The
# ids can be looked up in idx2word.

# In[ ]:

', '.join(map(str, x_train[0]))

# The first word of the first review is 23022. Let's see what that is.

# In[ ]:

idx2word[23022]

# Here's the whole review, mapped from ids to words.

# In[ ]:

' '.join([idx2word[o] for o in x_train[0]])

# The labels are 1 for positive, 0 for negative.

# In[26]:

labels_train[:10]

# Reduce vocab size by setting rare words to max index.

# In[27]:

vocab_size = 5000

# NOTE(review): the statement that follows is cut off in the reviewed excerpt.
# Its visible prefix is preserved verbatim below instead of guessing at the
# missing remainder; restore the full statement from the original notebook.
# trn = [np.array([i if i