#hide
from utils import *
from IPython.display import display,HTML

from fastai2.text.all import *
path = untar_data(URLs.IMDB)

# Tokenize the IMDb reviews with the default word tokenizer, wrapped in a
# TfmdLists so the transform is applied lazily.
tfm = Tokenizer.from_folder(path)
files = get_text_files(path, folders=['train', 'test', 'unsup'])
tls = TfmdLists(files, tfm)
tls[0]

# The preprocessing rules applied around tokenization:
defaults.text_proc_rules

# Chain Numericalize after the tokenizer to map tokens to vocabulary indices.
tfm = Tokenizer.from_folder(path)
files = get_text_files(path, folders=['train', 'test', 'unsup'])
tls = TfmdLists(files, [tfm, Numericalize])
tls[0][:10]

tls.vocab[8:15]

stream = "In this chapter, we will go back over the example of classifying movie reviews we studied in chapter 1 and dig deeper under the surface. First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the PreProcessor used in the data block API.\nThen we will study how we build a language model and train it. We will have another theoretical part and explain the different variants of Stochastic Gradient Descent and the default the library uses, Adam.\nThen, we'll go back to classification and learn about padding."
tokens = tfm(stream)[:90]

# Lay out the first 90 tokens as a batch of 6 contiguous streams of 15 tokens.
bs,seq_len = 6,15
d_tokens = np.array([tokens[i*seq_len:(i+1)*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

# With seq_len=5, each stream is read 5 tokens at a time: the first mini-batch
# takes tokens 0-4 of each of the 6 streams...
bs,seq_len = 6,5
d_tokens = np.array([tokens[i*15:i*15+seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

# ...the second mini-batch takes tokens 5-9 of each stream...
bs,seq_len = 6,5
d_tokens = np.array([tokens[i*15+seq_len:i*15+2*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

# ...and the third takes tokens 10-14, so each row continues exactly where the
# previous mini-batch left off.
bs,seq_len = 6,5
d_tokens = np.array([tokens[i*15+2*seq_len:i*15+3*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))

# LMDataLoader does this batching (plus the one-token target offset) for us.
dl = LMDataLoader(tls)

# Language-model data with the mid-level API: tokenize, numericalize, split.
tfms = [Tokenizer.from_folder(path), Numericalize]
files = get_text_files(path, folders=['train', 'test', 'unsup'])
splits = RandomSplitter(valid_pct=0.1, seed=42)(files)
tls = TfmdLists(files, tfms, splits=splits)
dls = tls.dataloaders(dl_type=LMDataLoader)

# Classification data with Datasets: one pipeline for the texts, one for the
# labels. SortedDL batches texts of similar lengths together, and
# pad_input_chunk pads each batch to a common length.
tfms = [[Tokenizer.from_folder(path), Numericalize], [parent_label, Categorize]]
files = get_text_files(path, folders=['train', 'test'])
splits = GrandparentSplitter(valid_name='test')(files)
dsets = Datasets(files, tfms, splits=splits)
dls = dsets.dataloaders(dl_type=SortedDL, before_batch=pad_input_chunk)

# The same result with the high-level factory method...
dls = TextDataLoaders.from_folder(path, valid='test')

# ...or with the data block API.
imdb = DataBlock(blocks=(TextBlock.from_folder(path), CategoryBlock),
                 get_y=parent_label,
                 get_items=partial(get_text_files, folders=['train', 'test']),
                 splitter=GrandparentSplitter(valid_name='test'))
dls = imdb.dataloaders(path)

# Data block for the language model: is_lm=True makes TextBlock build
# (input, shifted target) pairs, so the unlabeled 'unsup' reviews can be used.
imdb_lm = DataBlock(blocks=(TextBlock.from_folder(path, is_lm=True),),
                    get_items=partial(get_text_files,
                                      folders=['train', 'test', 'unsup']),
                    splitter=RandomSplitter(0.1))
dls_lm = imdb_lm.dataloaders(path, path=path, bs=128, seq_len=80)

dls_lm.show_batch(max_n=3)

# Fine-tune a pretrained AWD-LSTM language model on the IMDb corpus, in mixed
# precision to save GPU memory.
learn = language_model_learner(dls_lm, AWD_LSTM, drop_mult=0.3,
                               metrics=[accuracy, Perplexity()]).to_fp16()

# Train with the body frozen for one epoch...
learn.fit_one_cycle(1, 2e-2)
learn.save('1epoch')
learn = learn.load('1epoch')

# ...then unfreeze and fine-tune the whole model.
learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)

# Save everything except the final layer that predicts the next token, to use
# as the encoder of the classifier.
learn.save_encoder('finetuned')
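# --- Not in the original notebook: a minimal sanity check, assuming `dls_lm`
# from above is still in scope. For language modeling the targets are the
# inputs shifted one token to the left, which is exactly the layout the
# mini-batch walkthrough earlier in this notebook illustrated.
x, y = dls_lm.one_batch()
print(x.shape, y.shape)                 # both (bs, seq_len) = (128, 80)
assert (x[:, 1:] == y[:, :-1]).all()    # each target is the next input token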
# Generate text with the fine-tuned language model: two random completions of
# the same prompt, sampling with temperature 0.75.
TEXT = "I liked this movie because"
N_WORDS = 40
N_SENTENCES = 2
print("\n".join(learn.predict(TEXT, N_WORDS, temperature=0.75)
                for _ in range(N_SENTENCES)))

# Data for the classifier: pass the language model's vocab so the fine-tuned
# embeddings line up with the token indices.
imdb_clas = DataBlock(
    blocks=(TextBlock.from_folder(path, vocab=dls_lm.vocab), CategoryBlock),
    get_y=parent_label,
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test'))
dls_clas = imdb_clas.dataloaders(path, path=path, bs=128, seq_len=80)

dls_clas.show_batch(max_n=3)

# Build the classifier and load the fine-tuned encoder saved above.
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5,
                                metrics=accuracy).to_fp16()
learn = learn.load_encoder('finetuned')

# Gradual unfreezing with discriminative learning rates: train the head, then
# unfreeze the last two parameter groups, then the last three, then the whole
# model, lowering the learning rate at each stage.
learn.fit_one_cycle(1, 2e-2)

learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4), 1e-2))

learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3), moms=(0.8,0.7,0.8))

learn.unfreeze()
learn.fit_one_cycle(2, slice(1e-3/(2.6**4), 1e-3), moms=(0.8,0.7,0.8))
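# --- Not in the original notebook: a hypothetical usage sketch. After the
# final unfreeze, Learner.predict runs the whole pipeline on a raw string and
# returns the decoded label, the label index, and the class probabilities;
# validate() reports the final validation loss and accuracy.
learn.predict("I really liked this movie, the acting was superb!")
learn.validate()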