#!/usr/bin/env python
# coding: utf-8

# # Turkish ULMFiT from scratch
#
# Notebook export. The script runs three stages in order:
#   1. train a language model on the Turkish wikipedia dump (no pretrained
#      weights),
#   2. fine-tune that language model on the sentiment polarity files,
#   3. train a sentiment classifier on top of the fine-tuned encoder.
# It relies on `get_ipython()`, so it must be executed under IPython/Jupyter.

# In[19]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

from fastai import *
from fastai.text import *


# In[20]:


bs = 128
# NOTE(review): GPU index 2 is hard-coded; adjust for the machine this runs on.
torch.cuda.set_device(2)
data_path = Config.data_path()

lang = 'tr'
name = f'{lang}wiki'
path = data_path/name
path.mkdir(exist_ok=True, parents=True)


# In[21]:


mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)
# [weights stem, vocab stem]; passed as `pretrained_fnames` when fine-tuning below.
lm_fns = [mdl_path/f'{lang}_wt', mdl_path/f'{lang}_wt_vocab']


# ## Turkish wikipedia model

# In[22]:


from nlputils import split_wiki, get_wiki

get_wiki(path, lang)
# `{path}` and `{name}` are expanded by IPython before the shell command runs.
get_ipython().system('head -n4 {path}/{name}')


# In[23]:


dest = split_wiki(path, lang)


# Turkish is an [Agglutinative_language](https://en.wikipedia.org/wiki/Agglutinative_language) so it needs special care!
#
# ![Turkish morphemes example](images/turkish.jpg)

# In[5]:


# Build the language-model databunch from the split wiki files, holding out
# 10% of them (fixed seed) for validation.
data = (TextList.from_folder(dest, processor=[OpenFileProcessor(), SPProcessor()])
        .split_by_rand_pct(0.1, seed=42)
        .label_for_lm()
        .databunch(bs=bs, num_workers=1))

data.save(f'{lang}_databunch')
len(data.vocab.itos), len(data.train_ds)


# In[ ]:


# Reload the saved databunch (lets a later session skip the cell above).
data = load_data(dest, f'{lang}_databunch', bs=bs)


# In[12]:


data.show_batch()


# In[13]:


# pretrained=False: the wikipedia language model is trained from scratch.
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.1, wd=0.1, pretrained=False).to_fp16()


# In[14]:


lr = 3e-3
lr *= bs/48  # Scale learning rate by batch size


# In[15]:


learn.unfreeze()
learn.fit_one_cycle(10, lr, moms=(0.8, 0.7))


# In[32]:


# Save weights (without optimizer state) and the vocab under the `lm_fns` names.
learn.to_fp32().save(lm_fns[0], with_opt=False)
learn.data.vocab.save(lm_fns[1].with_suffix('.pkl'))


# ## Turkish sentiment analysis

# https://www.win.tue.nl/~mpechen/projects/smm/

# ### Language model

# In[24]:


path_clas = path/'movies'
path_clas.ls()


# In[25]:


# Read the positive examples, one per line. The `with` block closes the file
# handle as soon as the lines are read instead of leaking it.
with (path_clas/'tr_polarity.pos').open(encoding='iso-8859-9') as pos_file:
    pos = pos_file.readlines()
pos_df = pd.DataFrame({'text': pos})
pos_df['pos'] = 1
pos_df.head()


# In[26]:


# Same for the negative examples, labelled 0.
with (path_clas/'tr_polarity.neg').open(encoding='iso-8859-9') as neg_file:
    neg = neg_file.readlines()
neg_df = pd.DataFrame({'text': neg})
neg_df['pos'] = 0
neg_df.head()


# In[27]:


df = pd.concat([pos_df, neg_df], sort=False)


# In[28]:


# Reuse the processor stored under `dest` by the wikipedia step so this corpus
# is processed the same way as the wikipedia text.
data_lm = (TextList.from_df(df, path_clas, cols='text', processor=SPProcessor.load(dest))
           .split_by_rand_pct(0.1, seed=42)
           .label_for_lm()
           .databunch(bs=bs, num_workers=1))

data_lm.save(f'{lang}_clas_databunch')


# In[29]:


data_lm = load_data(path_clas, f'{lang}_clas_databunch', bs=bs)


# In[30]:


data_lm.show_batch()


# In[31]:


# Start from the wikipedia language model saved above.
learn_lm = language_model_learner(data_lm, AWD_LSTM, pretrained_fnames=lm_fns, drop_mult=1.0, wd=0.1)


# In[32]:


lr = 1e-3
lr *= bs/48


# In[33]:


# One epoch before unfreezing, then five epochs with the whole model unfrozen.
learn_lm.fit_one_cycle(1, lr*10, moms=(0.8, 0.7))


# In[34]:


learn_lm.unfreeze()
learn_lm.fit_one_cycle(5, slice(lr/10, lr*10), moms=(0.8, 0.7))


# In[35]:


learn_lm.save(f'{lang}fine_tuned')
# The encoder saved here is the one loaded by the classifier below.
learn_lm.save_encoder(f'{lang}fine_tuned_enc')


# ### Classifier

# In[37]:


# Same dataframe, split fraction and seed as the language-model databunch, but
# labelled with the 'pos' column.
data_clas = (TextList.from_df(df, path_clas, cols='text', processor=SPProcessor.load(dest))
             .split_by_rand_pct(0.1, seed=42)
             .label_from_df(cols='pos')
             .databunch(bs=bs, num_workers=1))


# In[38]:


learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, pretrained=False, wd=0.1).to_fp16()
learn_c.load_encoder(f'{lang}fine_tuned_enc')
learn_c.freeze()


# In[39]:


lr = 2e-2
lr *= bs/48


# In[40]:


learn_c.fit_one_cycle(2, lr, moms=(0.8, 0.7))


# In[41]:


learn_c.fit_one_cycle(2, lr, moms=(0.8, 0.7))


# In[42]:


# Gradual unfreezing: widen the trainable part step by step while lowering the
# upper learning rate (lr, then lr/2, then lr/10 for the full model).
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(2, slice(lr/(2.6**4), lr), moms=(0.8, 0.7))


# In[43]:


learn_c.freeze_to(-3)
learn_c.fit_one_cycle(2, slice(lr/2/(2.6**4), lr/2), moms=(0.8, 0.7))


# In[ ]:


learn_c.unfreeze()
learn_c.fit_one_cycle(4, slice(lr/10/(2.6**4), lr/10), moms=(0.8, 0.7))


# Accuracy in Gezici (2018), *Sentiment Analysis in Turkish* is: `75.16%`.

# In[158]:


learn_c.save(f'{lang}clas')


# ## fin

# In[ ]: