#default_exp tabular.core
#export
from fastai2.torch_basics import *
from fastai2.data.all import *
#hide
from nbdev.showdoc import *
#export
pd.set_option('mode.chained_assignment','raise')
Basic functions to preprocess tabular data before assembling it in a `DataLoaders`.
#export
def make_date(df, date_field):
"Make sure `df[date_field]` is of the right date type."
field_dtype = df[date_field].dtype
if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
field_dtype = np.datetime64
if not np.issubdtype(field_dtype, np.datetime64):
df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[ns]'))
#export
def add_datepart(df, field_name, prefix=None, drop=True, time=False):
"Helper function that adds columns relevant to a date in the column `field_name` of `df`."
make_date(df, field_name)
field = df[field_name]
prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
if time: attr = attr + ['Hour', 'Minute', 'Second']
for n in attr: df[prefix + n] = getattr(field.dt, n.lower())
df[prefix + 'Elapsed'] = field.astype(np.int64) // 10 ** 9
if drop: df.drop(field_name, axis=1, inplace=True)
return df
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])
df.head()
| | Year | Month | Week | Day | Dayofweek | Dayofyear | Is_month_end | Is_month_start | Is_quarter_end | Is_quarter_start | Is_year_end | Is_year_start | Elapsed |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019 | 12 | 49 | 4 | 2 | 338 | False | False | False | False | False | False | 1575417600 |
| 1 | 2019 | 11 | 48 | 29 | 4 | 333 | False | False | False | False | False | False | 1574985600 |
| 2 | 2019 | 11 | 46 | 15 | 4 | 319 | False | False | False | False | False | False | 1573776000 |
| 3 | 2019 | 10 | 43 | 24 | 3 | 297 | False | False | False | False | False | False | 1571875200 |
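With `time=True`, `add_datepart` also extracts `Hour`, `Minute` and `Second`. A small sketch of this (an added example, not from the original notebook):
df = pd.DataFrame({'date': ['2019-12-04 13:30:07', '2019-11-29 23:59:59']})
df = add_datepart(df, 'date', time=True)
test_eq(df['Hour'].tolist(), [13, 23])
test_eq(df['Minute'].tolist(), [30, 59])
test_eq(df['Second'].tolist(), [7, 59])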
#export
def _get_elapsed(df,field_names, date_field, base_field, prefix):
for f in field_names:
day1 = np.timedelta64(1, 'D')
last_date,last_base,res = np.datetime64(),None,[]
for b,v,d in zip(df[base_field].values, df[f].values, df[date_field].values):
if last_base is None or b != last_base:
last_date,last_base = np.datetime64(),b
if v: last_date = d
res.append(((d-last_date).astype('timedelta64[D]') / day1))
df[prefix + f] = res
return df
#export
def add_elapsed_times(df, field_names, date_field, base_field):
"Add in `df` for each event in `field_names` the elapsed time according to `date_field` grouped by `base_field`"
field_names = list(L(field_names))
#Make sure date_field is a date and the event columns are bools
df[field_names] = df[field_names].astype('bool')
make_date(df, date_field)
work_df = df[field_names + [date_field, base_field]]
work_df = work_df.sort_values([base_field, date_field])
work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'After')
work_df = work_df.sort_values([base_field, date_field], ascending=[True, False])
work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'Before')
for a in ['After' + f for f in field_names] + ['Before' + f for f in field_names]:
work_df[a] = work_df[a].fillna(0).astype(int)
for a,s in zip([True, False], ['_bw', '_fw']):
work_df = work_df.set_index(date_field)
tmp = (work_df[[base_field] + field_names].sort_index(ascending=a)
.groupby(base_field).rolling(7, min_periods=1).sum())
tmp.drop(base_field, axis=1, inplace=True)
tmp.reset_index(inplace=True)
work_df.reset_index(inplace=True)
work_df = work_df.merge(tmp, 'left', [date_field, base_field], suffixes=['', s])
work_df.drop(field_names, axis=1, inplace=True)
return df.merge(work_df, 'left', [date_field, base_field])
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df
| | date | event | base | Afterevent | Beforeevent | event_bw | event_fw |
|---|---|---|---|---|---|---|---|
| 0 | 2019-12-04 | False | 1 | 5 | 0 | 1.0 | 0.0 |
| 1 | 2019-11-29 | True | 1 | 0 | 0 | 1.0 | 1.0 |
| 2 | 2019-11-15 | False | 2 | 22 | 0 | 1.0 | 0.0 |
| 3 | 2019-10-24 | True | 2 | 0 | 0 | 1.0 | 1.0 |
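Here `Afterevent` counts the days elapsed since the most recent event within each `base` group, `Beforeevent` is the same computation run over the reversed ordering (time relative to the next event), and `event_bw`/`event_fw` hold rolling sums of the event column over a window of seven rows, looking backward and forward respectively.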
#export
def cont_cat_split(df, max_card=20, dep_var=None):
"Helper function that returns column names of cont and cat variables from given `df`."
cont_names, cat_names = [], []
for label in df:
if label == dep_var: continue
if (df[label].dtype == int and df[label].unique().shape[0] > max_card) or df[label].dtype == float:
cont_names.append(label)
else: cat_names.append(label)
return cont_names, cat_names
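A quick check of the splitting logic (an added example, not from the original notebook): the dependent variable is excluded, low-cardinality ints count as categorical, and floats as continuous:
df = pd.DataFrame({'c1': ['a','b','a','c'], 'i1': [0,1,2,3], 'f1': [0.,1.,2.,3.], 'i2': [0,1,0,1], 'y': [0,1,0,1]})
cont_names, cat_names = cont_cat_split(df, max_card=3, dep_var='y')
test_eq(cont_names, ['i1', 'f1'])
test_eq(cat_names, ['c1', 'i2'])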
#export
def df_shrink_dtypes(df, skip=[], obj2cat=True, int2uint=False):
"Return any possible smaller data types for DataFrame columns. Allows `object`->`category`, `int`->`uint`, and exclusion."
# 1: Build column filter and typemap
excl_types, skip = {'category','datetime64[ns]','bool'}, set(skip)
typemap = {'int' : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.int8, np.int16, np.int32, np.int64)],
'uint' : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.uint8, np.uint16, np.uint32, np.uint64)],
'float' : [(np.dtype(x), np.finfo(x).min, np.finfo(x).max) for x in (np.float32, np.float64, np.longdouble)]
}
if obj2cat: typemap['object'] = 'category' # User wants to categorify dtype('Object'), which may not always save space
else: excl_types.add('object')
new_dtypes = {}
exclude = lambda dt: dt[1].name not in excl_types and dt[0] not in skip
for c, old_t in filter(exclude, df.dtypes.items()):
t = next((v for k,v in typemap.items() if old_t.name.startswith(k)), None)
if isinstance(t, list): # Find the smallest type that fits
if int2uint and t==typemap['int'] and df[c].min() >= 0: t=typemap['uint']
new_t = next((r[0] for r in t if r[1]<=df[c].min() and r[2]>=df[c].max()), None)
if new_t and new_t == old_t: new_t = None
else: new_t = t if isinstance(t, str) else None
if new_t: new_dtypes[c] = new_t
return new_dtypes
show_doc(df_shrink_dtypes, title_level=3)
df_shrink_dtypes[source]
df_shrink_dtypes(df,skip=[],obj2cat=True,int2uint=False)
Return any possible smaller data types for DataFrame columns. Allows object->category, int->uint, and exclusion.
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
'date':['2019-12-04','2019-11-29','2019-11-15',]})
dt = df_shrink_dtypes(df)
test_eq(df['i'].dtype, 'int64')
test_eq(dt['i'], 'int8')
test_eq(df['f'].dtype, 'float64')
test_eq(dt['f'], 'float32')
# 'bool' columns are ignored; 'object' columns are converted to 'category' by default
test_eq(df['date'].dtype, 'object')
test_eq(dt['date'], 'category')
# Test disabling the 'object' -> 'category' conversion
dt2 = df_shrink_dtypes(df, obj2cat=False)
test_eq('date' not in dt2, True)
#export
def df_shrink(df, skip=[], obj2cat=True, int2uint=False):
"Reduce DataFrame memory usage, by casting to smaller types returned by `df_shrink_dtypes()`."
dt = df_shrink_dtypes(df, skip, obj2cat=obj2cat, int2uint=int2uint)
return df.astype(dt)
show_doc(df_shrink, title_level=3)
df_shrink[source]
df_shrink(df,skip=[],obj2cat=True,int2uint=False)
Reduce DataFrame memory usage, by casting to smaller types returned by df_shrink_dtypes().
`df_shrink(df)` attempts to make a DataFrame use less memory, by fitting numeric columns into their smallest possible datatypes. In addition:

- `boolean`, `category` and `datetime64[ns]` dtype columns are ignored.
- 'object' type columns are converted to `category`, which can save a lot of memory on large datasets; turn this off with `obj2cat=False`.
- `int2uint=True` fits `int` types into `uint` types whenever all data in the column is >= 0.
- columns can be excluded by name using `skip=['col1','col2']`.

To get only the new column data types without actually casting the DataFrame, use `df_shrink_dtypes()` with the same parameters as for `df_shrink()`.
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10,254],
'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])
test_eq(df['i'].dtype=='int64' and df2['i'].dtype=='int8', True)
test_eq(df['f'].dtype=='float64' and df2['f'].dtype=='float32', True)
test_eq(df['u'].dtype=='int64' and df2['u'].dtype=='int16', True)
test_eq(df2['date'].dtype, 'object')
test_eq(df2.memory_usage().sum() < df.memory_usage().sum(), True)
# Test int => uint (when col.min() >= 0)
df3 = df_shrink(df, int2uint=True)
test_eq(df3['u'].dtype, 'uint8') # int64 -> uint8 instead of int16
# Test excluding columns
df4 = df_shrink(df, skip=['i','u'])
test_eq(df['i'].dtype, df4['i'].dtype)
test_eq(df4['u'].dtype, 'int64')
Here's an example using the ADULT_SAMPLE dataset:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)
print(f"Memory usage: {df.memory_usage().sum()} --> {new_df.memory_usage().sum()}")
Memory usage: 3907448 --> 818665
#export
class _TabIloc:
"Get/set rows by iloc and cols by name"
def __init__(self,to): self.to = to
def __getitem__(self, idxs):
df = self.to.items
if isinstance(idxs,tuple):
rows,cols = idxs
cols = df.columns.isin(cols) if is_listy(cols) else df.columns.get_loc(cols)
else: rows,cols = idxs,slice(None)
return self.to.new(df.iloc[rows, cols])
#export
class Tabular(CollBase, GetAttr, FilteredBase):
"A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
_default,with_cont='procs',True
def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
do_setup=True, device=None, inplace=False, reduce_memory=True):
if inplace and splits is not None and pd.options.mode.chained_assignment is not None:
warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
if not inplace: df = df.copy()
if reduce_memory: df = df_shrink(df)
if splits is not None: df = df.iloc[sum(splits, [])]
self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
super().__init__(df)
self.y_names,self.device = L(y_names),device
if y_block is None and self.y_names:
# Make ys categorical if they're not numeric
ys = df[self.y_names]
if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): y_block = CategoryBlock()
else: y_block = RegressionBlock()
if y_block is not None and do_setup:
if callable(y_block): y_block = y_block()
procs = L(procs) + y_block.type_tfms
self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
self.split = len(df) if splits is None else len(splits[0])
if do_setup: self.setup()
def new(self, df):
return type(self)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(),
**attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))
def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
def copy(self): self.items = self.items.copy(); return self
def decode(self): return self.procs.decode(self)
def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
def setup(self): self.procs.setup(self)
def process(self): self.procs(self)
def loc(self): return self.items.loc
def iloc(self): return _TabIloc(self)
def targ(self): return self.items[self.y_names]
def x_names (self): return self.cat_names + self.cont_names
def n_subsets(self): return 2
def y(self): return self[self.y_names[0]]
def new_empty(self): return self.new(pd.DataFrame({}, columns=self.items.columns))
def to_device(self, d=None):
self.device = d
return self
def all_col_names (self):
ys = [n for n in self.y_names if n in self.items.columns]
return self.x_names + self.y_names if len(ys) == len(self.y_names) else self.x_names
properties(Tabular,'loc','iloc','targ','all_col_names','n_subsets','x_names','y')
- `df`: A DataFrame of your data
- `cat_names`: Your categorical x variables
- `cont_names`: Your continuous x variables
- `y_names`: Your dependent y variables
- `y_block`: How to sub-categorize the type of `y_names` (`CategoryBlock` or `RegressionBlock`)
- `splits`: How to split your data
- `do_setup`: A parameter for if `Tabular` will run the data through the `procs` upon initialization
- `device`: `cuda` or `cpu`
- `inplace`: If `True`, `Tabular` will not keep a separate copy of your original DataFrame in memory. You should ensure `pd.options.mode.chained_assignment` is `None` before setting this
- `reduce_memory`: fastai will attempt to reduce the overall memory usage of the input DataFrame with `df_shrink`
#export
class TabularPandas(Tabular):
"A `Tabular` object with transforms"
def transform(self, cols, f, all_col=True):
if not all_col: cols = [c for c in cols if c in self.items.columns]
if len(cols) > 0: self[cols] = self[cols].transform(f)
#export
def _add_prop(cls, nm):
@property
def f(o): return o[list(getattr(o,nm+'_names'))]
@f.setter
def fset(o, v): o[getattr(o,nm+'_names')] = v
setattr(cls, nm+'s', f)
setattr(cls, nm+'s', fset)
_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'y')
_add_prop(Tabular, 'x')
_add_prop(Tabular, 'all_col')
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
to = TabularPandas(df, cat_names='a')
t = pickle.loads(pickle.dumps(to))
test_eq(t.items,to.items)
test_eq(to.all_cols,to[['a']])
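As a quick illustration of the accessors defined above (an added cell, reusing the `to` from the previous test): `cats` returns the categorical columns, `x_names` the input column names, and `iloc` indexes rows by position:
test_eq(to.cats['a'].tolist(), [0,1,2,0,2])
test_eq(to.x_names, ['a'])
test_eq(len(to.iloc[[0,1]]), 2)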
#export
class TabularProc(InplaceTransform):
"Base class to write a non-lazy tabular processor for dataframes"
store_attrs=''
def setup(self, items=None, train_setup=False): #TODO: properly deal with train_setup
super().setup(getattr(items,'train',items), train_setup=False)
# Procs are called as soon as data is available
return self(items.items if isinstance(items,Datasets) else items)
@property
def name(self):
if self.store_attrs: attrs = self.store_attrs.split(',')
else: attrs = ''
return f"{super().name} -- {attrdict(self, *attrs)}"
#export
def _apply_cats (voc, add, c):
if not is_categorical_dtype(c):
return pd.Categorical(c, categories=voc[c.name][add:]).codes+add
return c.cat.codes+add #if is_categorical_dtype(c) else c.map(voc[c.name].o2i)
def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))
#export
class Categorify(TabularProc):
"Transform the categorical variables to something similar to `pd.Categorical`"
order,store_attrs = 1,'classes'
def setups(self, to):
self.classes = {n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.cat_names}
def encodes(self, to): to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
def decodes(self, to): to.transform(to.cat_names, partial(_decode_cats, self.classes))
def __getitem__(self,k): return self.classes[k]
#export
@Categorize
def setups(self, to:Tabular):
if len(to.y_names) > 0:
if self.vocab is None:
self.vocab = CategoryMap(getattr(to, 'train', to).iloc[:,to.y_names[0]].items, strict=True)
else:
self.vocab = CategoryMap(self.vocab, sort=False, add_na=self.add_na)
self.c = len(self.vocab)
return self(to)
@Categorize
def encodes(self, to:Tabular):
to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0), all_col=False)
return to
@Categorize
def decodes(self, to:Tabular):
to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}), all_col=False)
return to
show_doc(Categorify, title_level=3)
class Categorify[source]
Categorify(enc=None,dec=None,split_idx=None,order=None) ::TabularProc
Transform the categorical variables to something similar to pd.Categorical
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_eq(cat['a'], ['#na#',0,1,2])
test_eq(to['a'], [1,2,3,1,3])
to.show()
| | a |
|---|---|
| 0 | 0 |
| 1 | 1 |
| 2 | 2 |
| 3 | 0 |
| 4 | 2 |
df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = to.new(df1)
to1.process()
#Values that weren't in the training df are sent to 0 (na)
test_eq(to1['a'], [2,1,0,0,3])
to2 = cat.decode(to1)
test_eq(to2['a'], [1,0,'#na#','#na#',2])
#test with splits
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
test_eq(cat['a'], ['#na#',0,1,2])
test_eq(to['a'], [1,2,3,0,3])
df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_eq(cat['a'], ['#na#','H','M','L'])
test_eq(to.items.a, [2,1,3,2])
to2 = cat.decode(to)
test_eq(to2['a'], ['M','H','L','M'])
#test with targets
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])
test_eq(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_eq(to2['b'], ['a', 'b', 'a', 'b', 'b'])
#test with targets and train
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'c', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])
#export
@Normalize
def setups(self, to:Tabular):
self.means,self.stds = dict(getattr(to, 'train', to).conts.mean()),dict(getattr(to, 'train', to).conts.std(ddof=0)+1e-7)
self.store_attrs = 'means,stds'
return self(to)
@Normalize
def encodes(self, to:Tabular):
to.conts = (to.conts-self.means) / self.stds
return to
@Normalize
def decodes(self, to:Tabular):
to.conts = (to.conts*self.stds ) + self.means
return to
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (x-m)/s)
df1 = pd.DataFrame({'a':[5,6,7]})
to1 = to.new(df1)
to1.process()
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
to2 = norm.decode(to1)
test_close(to2['a'].values, [5,6,7])
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)
#export
class FillStrategy:
"Namespace containing the various filling strategies."
def median (c,fill): return c.median()
def constant(c,fill): return fill
def mode (c,fill): return c.dropna().value_counts().idxmax()
Currently, filling with the median, a constant, and the mode are supported.
#export
class FillMissing(TabularProc):
"Fill the missing values in continuous columns."
store_attrs = 'fill_strategy,add_col,fill_vals'
def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
if fill_vals is None: fill_vals = defaultdict(int)
store_attr(self,self.store_attrs)
def setups(self, dsets):
missing = pd.isnull(dsets.conts).any()
self.na_dict = {n:self.fill_strategy(dsets[n], self.fill_vals[n])
for n in missing[missing].keys()}
self.store_attrs += ',na_dict'
self.fill_strategy = self.fill_strategy.__name__
def encodes(self, to):
missing = pd.isnull(to.conts)
for n in missing.any()[missing.any()].keys():
assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
for n in self.na_dict.keys():
to[n].fillna(self.na_dict[n], inplace=True)
if self.add_col:
to.loc[:,n+'_na'] = missing[n]
if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')
show_doc(FillMissing, title_level=3)
class FillMissing[source]
FillMissing(fill_strategy='median',add_col=True,fill_vals=None) ::TabularProc
Fill the missing values in continuous columns.
fill1,fill2,fill3 = (FillMissing(fill_strategy=s)
for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
tos = (TabularPandas(df, fill1, cont_names='a'),
TabularPandas(df1, fill2, cont_names='a'),
TabularPandas(df2, fill3, cont_names='a'))
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})
for t in tos: test_eq(t.cat_names, ['a_na'])
for to_,v in zip(tos, [1.5, 0., 1.]):
test_eq(to_['a'].values, np.array([0, 1, v, 1, 2, 3, 4]))
test_eq(to_['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
fill = FillMissing()
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(fill.na_dict, {'a': 1.5})
test_eq(to.cat_names, ['a_na'])
test_eq(to['a'].values, np.array([0, 1, 1.5, 1, 2, 3, 4]))
test_eq(to['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
test_eq(to['b'].values, np.array([0,1,2,3,4,5,6]))
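Any function taking `(column, fill_value)` can serve as a `fill_strategy`. For instance, filling with the column mean (a hypothetical strategy, not from the original notebook):
def fill_mean(c, fill): return c.mean()  # hypothetical: fill missing values with the column mean
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, FillMissing(fill_strategy=fill_mean), cont_names='a')
test_close(to['a'].values[2], df['a'].mean())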
procs = [Normalize, Categorify, FillMissing, noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')
#Test setup and apply on df_main
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
#Test apply on y_names
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
test_eq(to.vocab, ['a','b'])
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [1,0,1,0,0,1,0])
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,2,1,0,2,0])
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,2,1,1,1,1,1])
test_eq(to['c'], [1,0,0,0,1,0,1])
#export
def _maybe_expand(o): return o[:,None] if o.ndim==1 else o
#export
class ReadTabBatch(ItemTransform):
def __init__(self, to): self.to = to
def encodes(self, to):
if not to.with_cont: res = (tensor(to.cats).long(),)
else: res = (tensor(to.cats).long(),tensor(to.conts).float())
ys = [n for n in to.y_names if n in to.items.columns]
if len(ys) == len(to.y_names): res = res + (tensor(to.targ),)
if to.device is not None: res = to_device(res, to.device)
return res
def decodes(self, o):
o = [_maybe_expand(o_) for o_ in to_np(o) if o_.size != 0]
vals = np.concatenate(o, axis=1)
try: df = pd.DataFrame(vals, columns=self.to.all_col_names)
except: df = pd.DataFrame(vals, columns=self.to.x_names)
to = self.to.new(df)
return to
#export
@typedispatch
def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
x.show()
from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter,_DatasetKind
_loaders = (_MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter)
#export
@delegates()
class TabDataLoader(TfmdDL):
"A transformed `DataLoader` for Tabular data"
do_item = noops
def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset)
super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)
def create_batch(self, b): return self.dataset.iloc[b]
TabularPandas._dl_type = TabDataLoader
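To sketch how the pieces fit together (an added illustration, not in the original notebook): `create_batch` gathers the rows of a batch with `Tabular.iloc`, then `ReadTabBatch` converts them into `(cats, conts, targs)` tensors:
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0.,1.,2.,3.,4.], 'c':['x','y','x','y','x']})
to = TabularPandas(df, [Categorify, Normalize], 'a', 'b', y_names='c')
dl = TabDataLoader(to, bs=3)
cats,conts,targs = first(dl)
test_eq(cats.shape, (3,1))   # one categorical column, as a long tensor
test_eq(conts.shape, (3,1))  # one continuous column, as a float tensor
test_eq(targs.shape, (3,1))  # encoded targets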
For a more in-depth explanation, see the tabular tutorial.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_test.drop('salary', axis=1, inplace=True)
df_main.head()
| | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)
dls = to.dataloaders()
dls.valid.show_batch()
| | workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | Some-college | Married-spouse-absent | Other-service | Not-in-family | White | False | 22.999999 | 54472.005407 | 10.0 | <50k |
| 1 | Private | Some-college | Never-married | Other-service | Other-relative | Black | False | 21.000001 | 236683.999905 | 10.0 | <50k |
| 2 | Private | Some-college | Never-married | Sales | Own-child | White | False | 18.000001 | 163786.998406 | 10.0 | <50k |
| 3 | Local-gov | Masters | Divorced | #na# | Unmarried | White | False | 44.000000 | 135055.998622 | 14.0 | <50k |
| 4 | Self-emp-inc | HS-grad | Married-civ-spouse | Adm-clerical | Husband | White | False | 40.000000 | 207577.999886 | 9.0 | >=50k |
| 5 | State-gov | Masters | Married-civ-spouse | Exec-managerial | Husband | White | False | 37.000000 | 210451.999548 | 14.0 | <50k |
| 6 | ? | Bachelors | Never-married | ? | Not-in-family | White | False | 32.000000 | 169885.999453 | 13.0 | <50k |
| 7 | Private | HS-grad | Never-married | Adm-clerical | Not-in-family | White | False | 20.000000 | 236804.000495 | 9.0 | <50k |
| 8 | Private | Some-college | Married-civ-spouse | Other-service | Husband | White | False | 31.000000 | 137680.998667 | 10.0 | <50k |
| 9 | Self-emp-inc | Some-college | Married-civ-spouse | Sales | Husband | White | False | 46.000000 | 284798.997462 | 10.0 | <50k |
to.show()
| | workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 3380 | Private | Some-college | Married-civ-spouse | Other-service | Husband | White | False | 33.0 | 248584.0 | 10.0 | <50k |
| 3158 | Local-gov | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 51.0 | 110327.0 | 13.0 | >=50k |
| 8904 | Private | Some-college | Never-married | Exec-managerial | Not-in-family | White | False | 27.0 | 133937.0 | 10.0 | <50k |
| 5912 | Self-emp-not-inc | Some-college | Married-civ-spouse | Farming-fishing | Husband | White | False | 48.0 | 164582.0 | 10.0 | >=50k |
| 3583 | Private | Masters | Never-married | Exec-managerial | Not-in-family | White | False | 39.0 | 49020.0 | 14.0 | <50k |
| 2945 | Private | Bachelors | Never-married | Adm-clerical | Own-child | White | False | 26.0 | 166051.0 | 13.0 | <50k |
| 204 | ? | HS-grad | Married-civ-spouse | #na# | Husband | White | True | 60.0 | 174073.0 | 10.0 | <50k |
| 3196 | Private | Some-college | Never-married | Adm-clerical | Own-child | White | False | 21.0 | 241367.0 | 10.0 | <50k |
| 1183 | ? | Some-college | Married-civ-spouse | ? | Husband | White | False | 65.0 | 52728.0 | 10.0 | <50k |
| 2829 | Private | Masters | Married-civ-spouse | Prof-specialty | Husband | White | False | 46.0 | 261059.0 | 14.0 | >=50k |
We can decode any set of transformed data by calling `to.decode_row` with our raw data:
row = to.items.iloc[0]
to.decode_row(row)
age                                33
workclass                     Private
fnlwgt                         248584
education                Some-college
education-num                      10
marital-status     Married-civ-spouse
occupation              Other-service
relationship                  Husband
race                            White
sex                              Male
capital-gain                        0
capital-loss                        0
hours-per-week                     50
native-country          United-States
salary                           <50k
education-num_na                False
Name: 3380, dtype: object
to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()
| | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | education-num_na |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10000 | 0.466910 | 5 | 1.359596 | 10 | 1.170520 | 3 | 2 | 1 | 2 | Male | 0 | 0 | 40 | Philippines | 1 |
| 10001 | -0.932292 | 5 | 1.271990 | 12 | -0.425893 | 3 | 15 | 1 | 4 | Male | 0 | 0 | 40 | United-States | 1 |
| 10002 | 1.056047 | 5 | 0.161911 | 2 | -1.224099 | 1 | 9 | 2 | 5 | Female | 0 | 0 | 37 | United-States | 1 |
| 10003 | 0.540552 | 5 | -0.274100 | 12 | -0.425893 | 7 | 2 | 5 | 5 | Female | 0 | 0 | 43 | United-States | 1 |
| 10004 | 0.761479 | 6 | 1.462819 | 9 | 0.372313 | 3 | 5 | 1 | 5 | Male | 0 | 0 | 60 | United-States | 1 |
tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()
| | workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | Bachelors | Married-civ-spouse | Adm-clerical | Husband | Asian-Pac-Islander | False | 45.000000 | 338105.000172 | 13.0 |
| 1 | Private | HS-grad | Married-civ-spouse | Transport-moving | Husband | Other | False | 26.000000 | 328662.996625 | 9.0 |
| 2 | Private | 11th | Divorced | Other-service | Not-in-family | White | False | 52.999999 | 209021.999484 | 7.0 |
| 3 | Private | HS-grad | Widowed | Adm-clerical | Unmarried | White | False | 46.000000 | 162030.001554 | 9.0 |
| 4 | Self-emp-inc | Assoc-voc | Married-civ-spouse | Exec-managerial | Husband | White | False | 49.000000 | 349230.005561 | 11.0 |
| 5 | Local-gov | Some-college | Married-civ-spouse | Exec-managerial | Husband | White | False | 34.000000 | 124827.001916 | 10.0 |
| 6 | Self-emp-inc | Some-college | Married-civ-spouse | Sales | Husband | White | False | 52.999999 | 290640.000454 | 10.0 |
| 7 | Private | Some-college | Never-married | Sales | Own-child | White | False | 19.000000 | 106273.002866 | 10.0 |
| 8 | Private | Some-college | Married-civ-spouse | Protective-serv | Husband | Black | False | 71.999999 | 53683.997254 | 10.0 |
| 9 | Private | Some-college | Never-married | Sales | Own-child | White | False | 20.000000 | 505980.004555 | 10.0 |
def _mock_multi_label(df):
sal,sex,white = [],[],[]
for row in df.itertuples():
sal.append(row.salary == '>=50k')
sex.append(row.sex == ' Male')
white.append(row.race == ' White')
df['salary'] = np.array(sal)
df['male'] = np.array(sex)
df['white'] = np.array(white)
return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
| | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | male | white |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | True | False | True |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | True | True | True |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | False | False | False |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | True | True | False |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | False | False | False |
#export
@EncodedMultiCategorize
def setups(self, to:Tabular):
self.c = len(self.vocab)
return self(to)
@EncodedMultiCategorize
def encodes(self, to:Tabular): return to
@EncodedMultiCategorize
def decodes(self, to:Tabular):
to.transform(to.y_names, lambda c: c==1)
return to
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, y_block=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)
CPU times: user 77.2 ms, sys: 238 µs, total: 77.4 ms
Wall time: 76.7 ms
dls = to.dataloaders()
dls.valid.show_batch()
| | workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | male | white |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | HS-grad | Married-civ-spouse | #na# | Husband | Amer-Indian-Eskimo | True | 30.000000 | 216811.000739 | 10.000000 | False | True | False |
| 1 | Private | Bachelors | Married-civ-spouse | #na# | Husband | White | False | 53.000000 | 96061.998009 | 13.000000 | False | True | True |
| 2 | Private | HS-grad | Married-civ-spouse | Adm-clerical | Wife | White | False | 31.000000 | 196787.999901 | 9.000000 | False | False | True |
| 3 | ? | Bachelors | Married-civ-spouse | ? | Husband | White | True | 65.999999 | 177351.000226 | 10.000000 | True | True | True |
| 4 | Private | 10th | Separated | Sales | Unmarried | Black | False | 21.000000 | 353628.005662 | 5.999999 | False | False | False |
| 5 | Private | Bachelors | Never-married | Prof-specialty | Not-in-family | White | False | 40.000000 | 143045.999229 | 13.000000 | False | False | True |
| 6 | Private | Masters | Married-civ-spouse | Prof-specialty | Husband | White | False | 37.000000 | 117381.002561 | 14.000000 | False | True | True |
| 7 | Private | HS-grad | Never-married | Sales | Not-in-family | White | False | 29.000000 | 183854.000291 | 9.000000 | False | False | True |
| 8 | Private | HS-grad | Divorced | Priv-house-serv | Not-in-family | White | False | 54.999999 | 175942.000053 | 9.000000 | False | False | True |
| 9 | Private | Some-college | Widowed | Tech-support | Unmarried | White | False | 64.000000 | 91342.999448 | 10.000000 | False | False | True |
def _mock_multi_label(df):
targ = []
for row in df.itertuples():
labels = []
if row.salary == '>=50k': labels.append('>50k')
if row.sex == ' Male': labels.append('male')
if row.race == ' White': labels.append('white')
targ.append(' '.join(labels))
df['target'] = np.array(targ)
return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
| | age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k | >50k white |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k | >50k male white |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k | |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k | >50k male |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k | |
@MultiCategorize
def encodes(self, to:Tabular):
#to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
return to
@MultiCategorize
def decodes(self, to:Tabular):
#to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
return to
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", y_block=MultiCategoryBlock(), splits=splits)
CPU times: user 81 ms, sys: 178 µs, total: 81.2 ms
Wall time: 80.1 ms
to.procs[2].vocab
(#24) ['-','_','a','c','d','e','f','g','h','i'...]
#export
@RegressionSetup
def setups(self, to:Tabular):
if self.c is not None: return
self.c = len(to.y_names)
return to
@RegressionSetup
def encodes(self, to:Tabular): return to
@RegressionSetup
def decodes(self, to:Tabular): return to
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', splits=splits)
CPU times: user 82.2 ms, sys: 508 µs, total: 82.7 ms
Wall time: 81.8 ms
to.procs[-1].means
{'fnlwgt': 193046.84475, 'education-num': 10.08025}
dls = to.dataloaders()
dls.valid.show_batch()
| | workclass | education | marital-status | occupation | relationship | race | education-num_na | fnlwgt | education-num | age |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | State-gov | Masters | Never-married | #na# | Not-in-family | White | False | 47569.994748 | 14.0 | 36.0 |
| 1 | Federal-gov | 11th | Never-married | Sales | Not-in-family | Black | False | 166418.999287 | 7.0 | 50.0 |
| 2 | Private | 9th | Divorced | Farming-fishing | Not-in-family | Black | False | 225603.000537 | 5.0 | 58.0 |
| 3 | Local-gov | 12th | Widowed | Adm-clerical | Not-in-family | White | False | 48055.004282 | 8.0 | 55.0 |
| 4 | Federal-gov | Prof-school | Divorced | Prof-specialty | Not-in-family | White | False | 66504.003988 | 15.0 | 57.0 |
| 5 | Private | Some-college | Never-married | Adm-clerical | Unmarried | Asian-Pac-Islander | False | 91274.998927 | 10.0 | 36.0 |
| 6 | State-gov | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 391584.996528 | 13.0 | 49.0 |
| 7 | Self-emp-not-inc | 1st-4th | Divorced | Craft-repair | Not-in-family | White | False | 130435.999390 | 2.0 | 71.0 |
| 8 | Private | Bachelors | Never-married | Prof-specialty | Own-child | White | False | 62507.003940 | 13.0 | 22.0 |
| 9 | Private | HS-grad | Married-civ-spouse | Handlers-cleaners | Own-child | White | False | 236696.000903 | 9.0 | 24.0 |
class TensorTabular(fastuple):
def get_ctxs(self, max_n=10, **kwargs):
n_samples = min(self[0].shape[0], max_n)
df = pd.DataFrame(index = range(n_samples))
return [df.iloc[i] for i in range(n_samples)]
def display(self, ctxs): display_df(pd.DataFrame(ctxs))
class TabularLine(pd.Series):
"A line of a dataframe that knows how to show itself"
def show(self, ctx=None, **kwargs): return self if ctx is None else ctx.append(self)
class ReadTabLine(ItemTransform):
def __init__(self, proc): self.proc = proc
def encodes(self, row):
cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
return TensorTabular(tensor(cats).long(),tensor(conts).float())
def decodes(self, o):
to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
to = self.proc.decode(to)
return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))
class ReadTabTarget(ItemTransform):
def __init__(self, proc): self.proc = proc
def encodes(self, row): return row[self.proc.y_names].astype(np.int64)
def decodes(self, o): return Category(self.proc.classes[self.proc.y_names][o])
# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)
# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')
# test_stdout(lambda: print(show_at(tds, 1)), """a 1
# b_na False
# b 1
# category a
# dtype: object""")
#hide
from nbdev.export import notebook2script
notebook2script()
Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.imagenette.ipynb.
Converted 23_tutorial.vision.ipynb.
Converted 24_tutorial.siamese.ipynb.
Converted 24_vision.gan.ipynb.
Converted 30_text.core.ipynb.
Converted 31_text.data.ipynb.
Converted 32_text.models.awdlstm.ipynb.
Converted 33_text.models.core.ipynb.
Converted 34_callback.rnn.ipynb.
Converted 35_tutorial.wikitext.ipynb.
Converted 36_text.models.qrnn.ipynb.
Converted 37_text.learner.ipynb.
Converted 38_tutorial.text.ipynb.
Converted 39_tutorial.transformers.ipynb.
Converted 40_tabular.core.ipynb.
Converted 41_tabular.data.ipynb.
Converted 42_tabular.model.ipynb.
Converted 43_tabular.learner.ipynb.
Converted 44_tutorial.tabular.ipynb.
Converted 45_collab.ipynb.
Converted 46_tutorial.collab.ipynb.
Converted 50_tutorial.datablock.ipynb.
Converted 60_medical.imaging.ipynb.
Converted 61_tutorial.medical_imaging.ipynb.
Converted 65_medical.text.ipynb.
Converted 70_callback.wandb.ipynb.
Converted 71_callback.tensorboard.ipynb.
Converted 72_callback.neptune.ipynb.
Converted 73_callback.captum.ipynb.
Converted 74_callback.cutmix.ipynb.
Converted 97_test_utils.ipynb.
Converted 99_pytorch_doc.ipynb.
Converted index.ipynb.
Converted tutorial.ipynb.