from fastai2.text.all import * chunked?? path = untar_data(URLs.IMDB_SAMPLE) df = pd.read_csv(path/'texts.csv') df.head(2) ss = L(list(df.text)) ss[0] def delim_tok(s, delim=' '): return L(s.split(delim)) s = ss[0] delim_tok(s) def apply(func, items): return list(map(func, items)) %%timeit -n 2 -r 3 global t t = apply(delim_tok, ss) %%timeit -n 2 -r 3 parallel(delim_tok, ss, n_workers=2, progress=False) batches32 = [L(list(o)).map(str) for o in np.array_split(ss, 32)] batches8 = [L(list(o)).map(str) for o in np.array_split(ss, 8 )] batches = [L(list(o)).map(str) for o in np.array_split(ss, 2 )] %%timeit -n 2 -r 3 parallel(partial(apply, delim_tok), batches, progress=False, n_workers=2) %%timeit -n 2 -r 3 global t t = parallel(noop, batches, progress=False, n_workers=2) def f(x): return 1 %%timeit -n 2 -r 3 global t t = parallel(f, batches, progress=False, n_workers=2) def f(items): o = [s.split(' ') for s in items] return [s for s in items] %%timeit -n 2 -r 3 global t t = parallel(f, batches, progress=False, n_workers=2) sarr = np.array(ss) %%timeit -n 2 -r 3 global t t = np.char.split(sarr) from spacy.lang.en import English def conv_sp(doc): return L(doc).map(str) class SpTok: def __init__(self): nlp = English() self.tok = nlp.Defaults.create_tokenizer(nlp) def __call__(self, x): return L(self.tok(str(x))).map(conv_sp) %%timeit -n 2 -r 3 SpTok() nlp = English() sp_tokenizer = nlp.Defaults.create_tokenizer(nlp) def spacy_tok(s): return L(sp_tokenizer(str(s))).map(str) %%timeit -r 3 global t t = apply(spacy_tok, ss) %%timeit -r 3 global t t = parallel(partial(apply, spacy_tok), batches, progress=False, n_workers=2) %%timeit -r 3 global t t = parallel(partial(apply, spacy_tok), batches8, progress=False, n_workers=8) def f(its): tok = SpTok() return [[str(o) for o in tok(p)] for p in its] %%timeit -r 3 global t t = parallel(f, batches8, progress=False, n_workers=8) %%timeit -r 3 global t t = L(nlp.tokenizer.pipe(ss)).map(conv_sp) def f(its): return L(nlp.tokenizer.pipe(its)).map(conv_sp) %%timeit -r 3 global t t = parallel(f, batches8, progress=False, n_workers=8) test_eq(chunked(range(12),n_chunks=4), [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]) test_eq(chunked(range(11),n_chunks=4), [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10]]) test_eq(chunked(range(10),n_chunks=4), [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]) test_eq(chunked(range( 9),n_chunks=3), [[0, 1, 2], [3, 4, 5], [6, 7, 8]]) %%timeit -r 3 global t t = parallel_chunks(f, ss, n_workers=8, progress=False) def array_split(arr, n): return chunked(arr, math.floor(len(arr)/n))