#!/usr/bin/env python
# coding: utf-8

# # Language Modeling & Sentiment Analysis of IMDB movie reviews
#
# Script export of a notebook. It first trains an AWD_LSTM language model on
# the IMDB texts and saves its encoder, then trains a neg/pos classifier that
# loads that saved encoder.

# In[1]:

# The notebook magics only exist inside an IPython session. Guard them so that
# running this file with a plain `python` interpreter does not stop with a
# NameError before any work is done. Inside IPython the behaviour is unchanged.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None

if _ipython is not None:
    _ipython.run_line_magic('reload_ext', 'autoreload')
    _ipython.run_line_magic('autoreload', '2')
    _ipython.run_line_magic('matplotlib', 'inline')

from fastai import *
from fastai.text import *


# In[2]:

# Batch size used for every databunch below. The learning rates further down
# are multiplied by bs/48, i.e. scaled relative to the alternative value kept
# in the comment on the next line.
# bs=48
bs = 128


# In[3]:

path = untar_data(URLs.IMDB)


# ## Language model

# In[7]:

# Build the language-model databunch from the 'train', 'test' and 'unsup'
# folders, holding out a random 10% (fixed seed) for validation.
data_lm = (
    TextList.from_folder(path)
    .filter_by_folder(include=['train', 'test', 'unsup'])
    .split_by_rand_pct(0.1, seed=42)
    .label_for_lm()
    .databunch(bs=bs, num_workers=1)
)
# Notebook cell output (vocabulary size, training-set size); as a plain
# statement in a script this value is computed and discarded.
len(data_lm.vocab.itos), len(data_lm.train_ds)


# In[14]:

data_lm.save('lm_databunch')


# In[7]:

# Reload the databunch saved above. In the notebook this cell allowed skipping
# the build step in later sessions; in a top-to-bottom run it simply replaces
# `data_lm` with the copy read back from disk.
data_lm = load_data(path, 'lm_databunch', bs=bs)


# In[9]:

learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=1.).to_fp16()


# In[10]:

# Base learning rate, scaled by the ratio of the batch size to 48.
lr = 1e-2
lr *= bs / 48


# In[12]:

learn_lm.fit_one_cycle(1, lr, moms=(0.8, 0.7))


# In[13]:

# Unfreeze the whole model and continue for 10 epochs at a tenth of the rate.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(10, lr / 10, moms=(0.8, 0.7))


# In[16]:

# Save the full learner and, separately, the encoder that the classifier
# section loads by the name 'fine_tuned_enc_10'.
learn_lm.save('fine_tuned_10')
learn_lm.save_encoder('fine_tuned_enc_10')


# ## Classifier

# In[9]:

# The classifier databunch reuses the language model's vocabulary, validates
# on the 'test' folder and labels each text by its 'neg'/'pos' folder.
data_clas = (
    TextList.from_folder(path, vocab=data_lm.vocab)
    .split_by_folder(valid='test')
    .label_from_folder(classes=['neg', 'pos'])
    .databunch(bs=bs, num_workers=1)
)


# In[11]:

data_clas.save('imdb_textlist_class')


# In[5]:

# Reload the classifier databunch saved above (same checkpoint pattern as the
# language-model databunch).
data_clas = load_data(path, 'imdb_textlist_class', bs=bs, num_workers=1)


# In[20]:

learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5).to_fp16()
learn_c.load_encoder('fine_tuned_enc_10')
learn_c.freeze()


# In[21]:

# Classifier base learning rate, scaled by bs/48 like the language-model rate.
# NOTE: this rebinds the module-level `lr` used in the section above.
lr = 2e-2
lr *= bs / 48


# The stages below train in four steps: freeze() (done above), freeze_to(-2),
# freeze_to(-3), then unfreeze(), with one fit and one checkpoint per stage.
# From the second stage on, the rate is passed as slice(high / 2.6**4, high).
# NOTE(review): presumably this spreads rates across layer groups; confirm
# against the fastai `fit_one_cycle` documentation.

# In[22]:

learn_c.fit_one_cycle(1, lr, moms=(0.8, 0.7))


# In[23]:

learn_c.save('1')


# In[37]:

learn_c.freeze_to(-2)
learn_c.fit_one_cycle(1, slice(lr / (2.6 ** 4), lr), moms=(0.8, 0.7))


# In[13]:

learn_c.save('2nd')


# In[38]:

learn_c.freeze_to(-3)
learn_c.fit_one_cycle(1, slice(lr / 2 / (2.6 ** 4), lr / 2), moms=(0.8, 0.7))


# In[15]:

learn_c.save('3rd')


# In[39]:

learn_c.unfreeze()
learn_c.fit_one_cycle(2, slice(lr / 10 / (2.6 ** 4), lr / 10), moms=(0.8, 0.7))


# In[41]:

learn_c.save('clas')


# In[ ]: