"""Pretrain an AWD-LSTM language model on WikiText-103 with fastai v1."""
from fastai.text import *

path = Config().data_path()/'wikitext-103'


def istitle(line):
    """Return True if `line` is a top-level WikiText article title (`` = Title = ``)."""
    return len(re.findall(r'^ = [^=]* = $', line)) != 0


def read_file(filename):
    """Split a WikiText-103 raw file into one string per article.

    An article boundary is detected when the next line is ' \n' and the line
    after that is a top-level title.  The dataset's literal '<unk>' tokens are
    normalized to fastai's UNK token.  Returns a numpy array of article strings.
    """
    articles = []
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    current_article = ''
    for i, line in enumerate(lines):
        current_article += line
        if i < len(lines)-2 and lines[i+1] == ' \n' and istitle(lines[i+2]):
            # BUGFIX: was .replace('', UNK) — replacing the empty string
            # inserts UNK between every character.  The intent is to map the
            # dataset's literal '<unk>' marker to fastai's UNK token.
            current_article = current_article.replace('<unk>', UNK)
            articles.append(current_article)
            current_article = ''
    # Flush the final article (no trailing boundary after it).
    current_article = current_article.replace('<unk>', UNK)
    articles.append(current_article)
    return np.array(articles)


train = read_file(path/'train.txt')
valid = read_file(path/'valid.txt')
test = read_file(path/'test.txt')
len(train), len(valid), len(test)

# Validation articles go first so split_by_idx(range(0, 60)) below picks them
# as the validation set.
all_texts = np.concatenate([valid, train, test])
df = pd.DataFrame({'texts': all_texts})
df.head()

# Free the per-split arrays; only `df` is needed from here on.
del train
del valid
del test  # BUGFIX: was `del text` — NameError, the variable is `test`

data = (TextList.from_df(df, path, cols='texts')
                .split_by_idx(range(0, 60))
                .label_for_lm()
                .databunch(bs=100, bptt=70))
data.show_batch()
data.save()

# Reload from the saved databunch (verifies the save round-trips).
data = load_data(path)
data.show_batch()

learn = language_model_learner(data, AWD_LSTM, drop_mult=0.1, pretrained=False,
                               clip=0.1, metrics=[accuracy, Perplexity()])
learn.fit_one_cycle(10, 5e-3, moms=(0.8, 0.7), div_factor=10, wd=1e-3)
# Save weights without optimizer state, plus the vocab needed to reuse them.
learn.save('lstm', with_opt=False)
learn.data.vocab.save(path/'vocab.pkl')