#!/usr/bin/env python
# coding: utf-8

# # Vietnamese ULMFiT from scratch (backwards)

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

from fastai import *
from fastai.text import *


# In[2]:


bs = 128


# In[3]:


data_path = Config.data_path()
lang = 'vi'
name = f'{lang}wiki'
path = data_path/name
dest = path/'docs'
lm_fns = [f'{lang}_wt_bwd', f'{lang}_wt_vocab_bwd']


# ## Vietnamese Wikipedia model

# In[5]:


# Build the language-model databunch from the tokenized Wikipedia docs.
# `backwards=True` feeds each text reversed, so the model learns to predict
# the *previous* token instead of the next one.
data = (TextList.from_folder(dest)
            .split_by_rand_pct(0.1, seed=42)
            .label_for_lm()
            .databunch(bs=bs, num_workers=1, backwards=True))

data.save(f'{lang}_databunch_bwd')


# In[4]:


data = load_data(dest, f'{lang}_databunch_bwd', bs=bs, backwards=True)


# In[5]:


learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False).to_fp16()


# In[6]:


lr = 3e-3
lr *= bs/48  # Scale the learning rate linearly with batch size (base lr assumes bs=48)


# In[12]:


learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8,0.7))


# In[14]:


# Save the weights in fp32 (without optimizer state) plus the vocab, so the
# model can be reloaded later via `pretrained_fnames`.
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
learn.to_fp32().save(mdl_path/lm_fns[0], with_opt=False)
learn.data.vocab.save(mdl_path/(lm_fns[1] + '.pkl'))


# ## Vietnamese sentiment analysis

# ### Language model

# In[9]:


# Replace missing comments with the literal string 'NA' so the tokenizer never
# sees NaN, and give the test set a dummy label so the two frames concatenate
# cleanly. Test texts can safely be used for language-model fine-tuning since
# no sentiment labels are involved at this stage.
train_df = pd.read_csv(path/'train.csv')
train_df.loc[pd.isna(train_df.comment), 'comment'] = 'NA'

test_df = pd.read_csv(path/'test.csv')
test_df.loc[pd.isna(test_df.comment), 'comment'] = 'NA'
test_df['label'] = 0

df = pd.concat([train_df, test_df])


# In[10]:


data_lm = (TextList.from_df(df, path, cols='comment')
               .split_by_rand_pct(0.1, seed=42)
               .label_for_lm()
               .databunch(bs=bs, num_workers=1, backwards=True))

learn_lm = language_model_learner(data_lm, AWD_LSTM,
                                  config={**awd_lstm_lm_config, 'n_hid': 1152},
                                  pretrained_fnames=lm_fns, drop_mult=1.0)


# In[11]:


lr = 1e-3
lr *= bs/48  # Same linear batch-size scaling as above


# In[21]:


learn_lm.fit_one_cycle(2, lr*10, moms=(0.8,0.7))


# In[22]:


learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8,0.7))


# In[23]:


learn_lm.save(f'{lang}fine_tuned_bwd')
learn_lm.save_encoder(f'{lang}fine_tuned_enc_bwd')


# ### Classifier

# In[12]:


# Reuse the language model's vocab so the fine-tuned encoder weights line up
# with the classifier's embedding matrix.
data_clas = (TextList.from_df(train_df, path, vocab=data_lm.vocab, cols='comment')
                 .split_by_rand_pct(0.1, seed=42)
                 .label_from_df(cols='label')
                 .databunch(bs=bs, num_workers=1, backwards=True))

data_clas.save(f'{lang}_textlist_class_bwd')


# In[13]:


data_clas = load_data(path, f'{lang}_textlist_class_bwd', bs=bs, num_workers=1, backwards=True)


# In[14]:


from sklearn.metrics import f1_score

@np_func
def f1(inp, targ): return f1_score(targ, np.argmax(inp, axis=-1))


# In[15]:


learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy, f1]).to_fp16()
learn_c.load_encoder(f'{lang}fine_tuned_enc_bwd')
learn_c.freeze()


# In[16]:


lr = 2e-2
lr *= bs/48


# In[17]:


learn_c.fit_one_cycle(2, lr, moms=(0.8,0.7))


# In[18]:


# Gradual unfreezing with discriminative learning rates: lower layers train
# at lr/2.6**4, the head at lr, halving lr at each unfreezing step.
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2, slice(lr/(2.6**4), lr), moms=(0.8,0.7))


# In[27]:


learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4), lr/2), moms=(0.8,0.7))


# In[20]:


learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(lr/10/(2.6**4), lr/10), moms=(0.8,0.7))


# In[21]:


learn_c.save(f'{lang}clas_bwd')


# In[ ]:
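

# ### Sanity check: batches really run right-to-left

# A quick way to confirm what `backwards=True` does: textify one sequence from
# a language-model batch and it should read reversed relative to normal
# Vietnamese. A minimal sketch, assuming the Wikipedia databunch `data` from
# above is still in memory.

# In[ ]:


x, y = data.one_batch()                         # x: (bs, bptt) token ids
print(data.vocab.textify(x[0][:30].tolist()))   # first 30 tokens, reversed order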
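

# ### Ensembling with a forward classifier

# The point of training a backwards model is to ensemble it with a forward one:
# averaging the class probabilities of the two directions usually beats either
# alone. A minimal sketch, assuming a companion forward notebook saved its
# databunch as f'{lang}_textlist_class' and its classifier as f'{lang}clas',
# and used the same seed=42 validation split (all three are assumptions, not
# artifacts of this notebook).

# In[ ]:


data_clas_fwd = load_data(path, f'{lang}_textlist_class', bs=bs, num_workers=1)
learn_c_fwd = text_classifier_learner(data_clas_fwd, AWD_LSTM, drop_mult=0.5)
learn_c_fwd.load(f'{lang}clas')

preds_fwd, targs = learn_c_fwd.get_preds(ordered=True)  # forward probabilities, dataset order
preds_bwd, _     = learn_c.get_preds(ordered=True)      # backward probabilities, same order

preds_avg = (preds_fwd + preds_bwd) / 2                 # simple probability average
print('ensemble accuracy:', accuracy(preds_avg, targs).item())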