# IMDB sentiment notebook (fastai v1): fine-tune a WT103 language model on IMDB
# reviews, then use its encoder to train a pos/neg text classifier (ULMFiT recipe).
# NOTE(review): this is a Jupyter-notebook export — the %-magics below are
# IPython-only and will not run as a plain Python script.

%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

# --- Part 1: quick tour on the small IMDB sample ---------------------------
path = untar_data(URLs.IMDB_SAMPLE)  # download/unpack the sample dataset; returns its Path
path.ls()
df = pd.read_csv(path/'texts.csv')  # columns include at least 'label' and 'text' (read_csv below uses cols='text')
df.head()
df['text'][1]

# Build a DataBunch straight from the CSV, then save/reload it.
data_lm = TextDataBunch.from_csv(path, 'texts.csv')
data_lm.save()  # presumably caches the preprocessed (tokenized/numericalized) data — TODO confirm in fastai v1 docs
data = TextDataBunch.load(path)
data = TextClasDataBunch.load(path)  # NOTE(review): immediately overwrites the previous load — only the classifier bunch is kept
data.show_batch()
data.vocab.itos[:10]        # first tokens of the vocabulary (index -> string)
data.train_ds[0][0]         # first training text
data.train_ds[0][0].data[:10]  # its numericalized form (token ids)

# Same DataBunch built via the data-block API instead of from_csv.
data = (TextList.from_csv(path, 'texts.csv', cols='text')  # inputs come from the 'text' column
        .split_from_df(col=2)   # train/valid split read from column index 2 of the CSV
        .label_from_df(cols=0)  # labels read from column index 0
        .databunch())

# --- Part 2: language model on the full IMDB dataset -----------------------
path = untar_data(URLs.IMDB)
path.ls()
(path/'train').ls()

bs=48  # batch size; chosen to fit GPU memory — adjust if you run out

data_lm = (TextList.from_folder(path)
           # Inputs: all the text files in path
           .filter_by_folder(include=['train', 'test'])
           # We may have other temp folders that contain text files so we only keep what's in train and test
           .random_split_by_pct(0.1)
           # We randomly split and keep 10% (10,000 reviews) for validation
           .label_for_lm()
           # We want to do a language model so we label accordingly (next-token targets)
           .databunch(bs=bs))
data_lm.save('tmp_lm')  # cache the processed LM data so the tokenization isn't redone

data_lm = TextLMDataBunch.load(path, 'tmp_lm', bs=bs)
data_lm.show_batch()

# LM learner initialized from the WikiText-103 pretrained weights.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.3)
learn.lr_find()
learn.recorder.plot(skip_end=15)

# Train the new head first (body frozen), then unfreeze and fine-tune everything.
learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))
learn.save('fit_head')
learn.load('fit_head');   # trailing ';' suppresses the notebook cell output
learn.unfreeze()
learn.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))
learn.save('fine_tuned')
learn.load('fine_tuned');

# Sample 100 tokens from the fine-tuned LM as a sanity check.
learn.predict('I liked this movie because ', 100, temperature=1.1, min_p=0.001)

# Keep only the encoder (the part reused by the classifier below).
learn.save_encoder('fine_tuned_enc')

# --- Part 3: sentiment classifier reusing the LM encoder -------------------
path = untar_data(URLs.IMDB)
path.ls()
(path/'train').ls()

# IMPORTANT: reuse the LM's vocab so token ids line up with the saved encoder.
data_clas = (TextList.from_folder(path, vocab=data_lm.vocab)
             # grab all the text files in path
             .split_by_folder(valid='test')
             # split by train and valid folder (that only keeps 'train' and 'test' so no need to filter)
             .label_from_folder(classes=['neg', 'pos'])
             # label each document with its folder name, restricted to neg/pos
             .filter_missing_y()
             # remove docs whose label is not in the list above (i.e. 'unsup')
             .databunch(bs=bs))
data_clas.save('tmp_clas')

len(data_clas.train_ds)
data_clas = TextClasDataBunch.load(path, 'tmp_clas', bs=bs)
data_clas.show_batch()

learn = text_classifier_learner(data_clas, drop_mult=0.5)
learn.load_encoder('fine_tuned_enc')  # must match the vocab used above
learn.freeze()

learn.lr_find()
learn.recorder.plot()

# Gradual unfreezing: train the head, then progressively unfreeze deeper layer
# groups, with discriminative LRs (the 2.6**4 divisor spreads the LR across groups).
learn.fit_one_cycle(1, 2e-2, moms=(0.8,0.7))
learn.save('first')
learn.load('first');

learn.freeze_to(-2)  # unfreeze the last two layer groups
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))
learn.save('second')
learn.load('second');

learn.freeze_to(-3)  # unfreeze one more layer group
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))
learn.save('third')
learn.load('third');

learn.unfreeze()     # finally train the whole network
learn.fit_one_cycle(2, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7))

learn.predict("I really loved that movie, it was awesome!")