"""Prepare the WikiText-103 corpus and train/fine-tune a fastai language model.

The raw token files mark each article with a " = Title = " header line that
is preceded by a " \n" separator line.  We split each token file into one
string per article, build a fastai LM databunch from the concatenated
splits, train a QRNN language model from scratch, then fine-tune a
previously saved checkpoint.
"""
from fastai.text import *
from fastai import *

path = Path('data/wikitext-103')

# A top-level article title looks like " = Some Title = " (single '=' fences,
# no '=' inside).  Compiled once instead of on every call — istitle runs for
# (almost) every line of a very large corpus.
_TITLE_RE = re.compile(r'^ = [^=]* = $')


def istitle(line):
    """Return True if *line* is a top-level WikiText article title line."""
    return _TITLE_RE.match(line) is not None


def process_unk(s):
    """Map the empty token to fastai's UNK placeholder; pass others through."""
    return UNK if s == '' else s


def read_file(filename):
    """Split one WikiText token file into per-article strings.

    An article ends when the *next* line is the ' \\n' separator and the line
    after it is a title, so the current accumulation is flushed there.
    Returns a numpy object array of article strings (the trailing partial
    article is always appended; an empty file yields one empty article).
    """
    articles = []
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    # Accumulate lines in a list and join at article boundaries — repeated
    # `str += line` over the whole corpus is quadratic.
    current_article = []
    for i, line in enumerate(lines):
        current_article.append(line)
        if i < len(lines) - 2 and lines[i + 1] == ' \n' and istitle(lines[i + 2]):
            articles.append(''.join(current_article))
            current_article = []
    articles.append(''.join(current_article))
    return np.array(articles)


train = read_file(path/'wiki.train.tokens')
valid = read_file(path/'wiki.valid.tokens')
test = read_file(path/'wiki.test.tokens')
# Bare notebook-cell expression made explicit for script use.
print(len(train), len(valid), len(test))

# Validation articles first so the LM valid set can be taken by index below.
all_texts = np.concatenate([valid, train, test])
df = pd.DataFrame({'texts': all_texts})
df.head()

# The concatenated copy is all we need from here on; free the split arrays.
del train, valid, test

# First 60 rows (the validation articles placed first above) become the
# language model's validation split.
data = (TextList.from_df(df, path, col='texts')
        .split_by_idx(range(0, 60))
        .label_for_lm()
        .databunch())
data.save()

data = TextLMDataBunch.load(path, bs=80, max_len=15)
data.show_batch()

# Train a 4-layer QRNN language model from scratch.
learn = language_model_learner(data, drop_mult=0., emb_sz=400, nh=1550, nl=4,
                               qrnn=True, clip=0.12)
learn.fit_one_cycle(10, 5e-3, moms=(0.8, 0.7))
learn.save('qrnn_maj')

# Fine-tune a previously trained checkpoint.
# NOTE(review): this learner uses the default (AWD-LSTM) architecture and
# loads 'lstm_maj', yet saves under 'qrnn_maj1' — confirm both the
# architecture and the checkpoint names are intended.
learn = language_model_learner(data, drop_mult=0.1, clip=0.12)
learn.load('lstm_maj')

from fastai.callbacks.tracker import SaveModelCallback
# Save the best model seen during fine-tuning.
cb = SaveModelCallback(learn)
learn.fit_one_cycle(5, 1e-3, moms=(0.8, 0.7), callbacks=[cb], pct_start=0.1)
learn.save('qrnn_maj1')
learn.validate(learn.data.valid_dl)