from fastai2.text.all import *

# IPython `??` introspection: display the source of `chunked`.
chunked??

# Fetch the IMDB sample dataset via `untar_data` and read its `texts.csv` into a DataFrame.
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head(2)

# Collect the `text` column into an `L`; `ss` is the input for the benchmarks in this file.
ss = L(list(df.text))
ss[0]

def delim_tok(s, delim=' '):
    "Split `s` on `delim` and return the pieces wrapped in an `L`."
    pieces = s.split(delim)
    return L(pieces)
# Try `delim_tok` on the first text.
s = ss[0]
delim_tok(s)

def apply(func, items):
    "Call `func` on every element of `items` and return the results as a list."
    return [func(item) for item in items]

%%timeit -n 2 -r 3
# Baseline: tokenize every text serially with `apply`.
global t
t = apply(delim_tok, ss)

%%timeit -n 2 -r 3
# One `delim_tok` call per text, dispatched through `parallel` with 2 workers.
parallel(delim_tok, ss, n_workers=2, progress=False)

# Pre-split the texts into 32, 8 and 2 batches; each batch is an `L` of `str` items.
batches32 = [L(list(o)).map(str) for o in np.array_split(ss, 32)]
batches8  = [L(list(o)).map(str) for o in np.array_split(ss, 8 )]
batches   = [L(list(o)).map(str) for o in np.array_split(ss, 2 )]

%%timeit -n 2 -r 3
# One task per batch: each call tokenizes a whole batch via `apply`.
parallel(partial(apply, delim_tok), batches, progress=False, n_workers=2)

%%timeit -n 2 -r 3
# Send the batches through `noop` only -- presumably to measure the overhead of
# `parallel` itself without any tokenization work (TODO confirm intent).
global t
t = parallel(noop, batches, progress=False, n_workers=2)

def f(x):
    "Ignore `x` and return the constant 1."
    return 1

%%timeit -n 2 -r 3
# Run the constant-returning `f` over the 2 batches with 2 workers.
global t
t = parallel(f, batches, progress=False, n_workers=2)

def f(items):
    "Split every string in `items` on single spaces, then return the items themselves as a list."
    # The split result is computed but not returned -- presumably so the benchmark
    # includes the tokenization work without returning the tokens (TODO confirm intent).
    _split = [text.split(' ') for text in items]
    return list(items)

%%timeit -n 2 -r 3
# Run the batch-level `f` (defined just above) over the 2 batches with 2 workers.
global t
t = parallel(f, batches, progress=False, n_workers=2)

# NumPy array version of the texts, used by the `np.char.split` timing below.
sarr = np.array(ss)

%%timeit -n 2 -r 3
# Split every text with NumPy's `np.char.split` (no separator argument is passed).
global t
t = np.char.split(sarr)

from spacy.lang.en import English

def conv_sp(doc):
    "Wrap `doc` in an `L` and convert each of its elements to `str`."
    elements = L(doc)
    return elements.map(str)

class SpTok:
    "Callable that tokenizes a text with its own spaCy English tokenizer and returns the tokens as strings."
    def __init__(self):
        # Each instance builds its own tokenizer from a fresh `English()` object.
        nlp = English()
        self.tok = nlp.Defaults.create_tokenizer(nlp)

    def __call__(self, x):
        "Tokenize `str(x)` and return an `L` holding one `str` per token."
        # `L(...)` over the tokenizer output holds the individual tokens, so map `str`
        # over them (as `spacy_tok` does). `conv_sp` takes a whole doc; applying it per
        # token wrapped every token in its own `L` instead of yielding plain strings.
        return L(self.tok(str(x))).map(str)

%%timeit -n 2 -r 3
# Time the construction of a `SpTok` instance.
SpTok()

# Module-level English object and tokenizer: `sp_tokenizer` is used by `spacy_tok`,
# and `nlp.tokenizer.pipe` is used by the later benchmarks.
nlp = English()
sp_tokenizer = nlp.Defaults.create_tokenizer(nlp)
def spacy_tok(s):
    "Tokenize `str(s)` with the module-level `sp_tokenizer` and return the tokens as an `L` of `str`."
    doc = sp_tokenizer(str(s))
    return L(doc).map(str)

%%timeit -r 3
# Baseline: spaCy-tokenize every text serially with `apply`.
global t
t = apply(spacy_tok, ss)

%%timeit -r 3
# Same work, one task per batch: 2 batches on 2 workers.
global t
t = parallel(partial(apply, spacy_tok), batches, progress=False, n_workers=2)

%%timeit -r 3
# Same work: 8 batches on 8 workers.
global t
t = parallel(partial(apply, spacy_tok), batches8, progress=False, n_workers=8)

def f(its):
    "Tokenize each element of `its` with a newly constructed `SpTok`; return a list of lists of `str`."
    # The tokenizer is built inside the function, once per call.
    tokenizer = SpTok()
    results = []
    for text in its:
        results.append([str(piece) for piece in tokenizer(text)])
    return results

%%timeit -r 3
# Run the `SpTok`-based `f` (defined just above) over 8 batches with 8 workers.
global t
t = parallel(f, batches8, progress=False, n_workers=8)

%%timeit -r 3
# Serial run using `nlp.tokenizer.pipe` over all texts, converting each result with `conv_sp`.
global t
t = L(nlp.tokenizer.pipe(ss)).map(conv_sp)

def f(its):
    "Run `nlp.tokenizer.pipe` over `its` and convert each result with `conv_sp`, returning an `L`."
    docs = L(nlp.tokenizer.pipe(its))
    return docs.map(conv_sp)

%%timeit -r 3
# Run the `pipe`-based `f` (defined just above) over 8 batches with 8 workers.
global t
t = parallel(f, batches8, progress=False, n_workers=8)

# Checks for `chunked(..., n_chunks=...)`: chunks are filled in order, so when the
# length is not a multiple of the chunk size the last chunk is the short one.
test_eq(chunked(range(12),n_chunks=4), [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
test_eq(chunked(range(11),n_chunks=4), [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10]])
test_eq(chunked(range(10),n_chunks=4), [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]])
test_eq(chunked(range( 9),n_chunks=3), [[0, 1, 2], [3, 4, 5], [6, 7, 8]])

%%timeit -r 3
# Hand the unsplit texts to `parallel_chunks` with 8 workers, using the same `f`.
global t
t = parallel_chunks(f, ss, n_workers=8, progress=False)


def array_split(arr, n):
    "Split `arr` into chunks by asking `chunked` for `n` chunks."
    # Pass `n` as `n_chunks` instead of a floored chunk size: `floor(len(arr)/n)` as the
    # chunk size yields more than `n` chunks whenever `len(arr)` is not a multiple of `n`,
    # and a chunk size of 0 when `len(arr) < n`.
    return chunked(arr, n_chunks=n)