#default_exp tabular.core
#export
from fastai2.torch_basics import *
from fastai2.test import *
from fastai2.core import *
from fastai2.data.all import *
from nbdev.showdoc import *
#export
# Turn pandas' chained-assignment warning into an error so accidental writes to a copy fail loudly.
pd.set_option('mode.chained_assignment','raise')
Basic functions to preprocess tabular data before assembling it in a
`DataBunch`.
#export
class _TabIloc:
"Get/set rows by iloc and cols by name"
def __init__(self,to): self.to = to
def __getitem__(self, idxs):
df = self.to.items
if isinstance(idxs,tuple):
rows,cols = idxs
cols = df.columns.isin(cols) if is_listy(cols) else df.columns.get_loc(cols)
else: rows,cols = idxs,slice(None)
return self.to.new(df.iloc[rows, cols])
#export
class Tabular(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
    _default = 'items'  # presumably the attribute `GetAttr` delegates unknown lookups to — confirm in fastcore

    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, block_y=CategoryBlock, splits=None, do_setup=True):
        if splits is None:
            splits = [range_of(df)]
        # Reorder the rows so that each split is contiguous (first split first) and work on a copy.
        ordered = df.iloc[sum(splits, [])].copy()
        super().__init__(ordered)
        self.y_names = L(y_names)
        if block_y is not None:
            if callable(block_y):
                block_y = block_y()
            # The target block's type transforms run after the user-supplied procs.
            procs = L(procs) + block_y.type_tfms
        self.cat_names = L(cat_names)
        self.cont_names = L(cont_names)
        self.procs = Pipeline(procs, as_item=True)
        # Row count of the first split; `subset` slices on this boundary.
        self.split = len(splits[0])
        if do_setup:
            self.setup()

    def subset(self, i):
        "Rows of the first split when `i==0`, every remaining row otherwise"
        rows = slice(0, self.split) if i == 0 else slice(self.split, len(self))
        return self.new(self.items[rows])

    def copy(self):
        "Replace the wrapped frame by a copy of itself and return `self`"
        self.items = self.items.copy()
        return self

    def new(self, df):
        "Wrap `df` in the same type, reusing procs and column names, without running setup"
        shared = attrdict(self, 'procs', 'cat_names', 'cont_names', 'y_names')
        return type(self)(df, do_setup=False, block_y=None, **shared)

    def show(self, max_n=10, **kwargs):
        "Display the first `max_n` rows of the declared columns"
        display_df(self.all_cols[:max_n])

    def setup(self):
        self.procs.setup(self)

    def process(self):
        self.procs(self)

    # The four methods below are converted to properties right after the class body.
    def iloc(self):
        return _TabIloc(self)

    def targ(self):
        return self.items[self.y_names]

    def all_col_names(self):
        return self.cat_names + self.cont_names + self.y_names

    def n_subsets(self):
        return 2

properties(Tabular,'iloc','targ','all_col_names','n_subsets')
#export
class TabularPandas(Tabular):
    "A `Tabular` that rewrites columns with the result of their `transform(f)`"
    def transform(self, cols, f):
        "Apply `f` to the columns `cols` and store the result back under the same names"
        updated = self[cols].transform(f)
        self[cols] = updated
#export
def _add_prop(cls, nm):
@property
def f(o): return o[list(getattr(o,nm+'_names'))]
@f.setter
def fset(o, v): o[getattr(o,nm+'_names')] = v
setattr(cls, nm+'s', f)
setattr(cls, nm+'s', fset)
# Give `Tabular` the `cats`, `conts`, `ys` and `all_cols` properties, each backed by the matching `*_names` attribute.
_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'y')
_add_prop(Tabular, 'all_col')
# Smoke test: only `a` is declared (as a categorical column); `b` exists in the frame but is not registered.
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
to = TabularPandas(df, cat_names='a')
# The wrapper must survive a pickle round trip with its data intact.
t = pickle.loads(pickle.dumps(to))
test_eq(t.items,to.items)
test_eq(to.all_cols,to[['a']])
to.show() # only shows 'a' since that's the only col in `TabularPandas`
| a | |
|---|---|
| 0 | 0 |
| 1 | 1 |
| 2 | 2 |
| 3 | 0 |
| 4 | 2 |
#export
class TabularProc(InplaceTransform):
    "Base class to write a non-lazy tabular processor for dataframes"
    def setup(self, items=None):
        # Fit on the `train` attribute when `items` has one, otherwise on `items` itself.
        fit_on = getattr(items, 'train', items)
        super().setup(fit_on)
        # Procs are called as soon as data is available
        if isinstance(items, DataSource): target = items.items
        else: target = items
        return self(target)
#export
def _apply_cats (voc, add, c): return c.cat.codes+add if is_categorical_dtype(c) else c.map(voc[c.name].o2i)
def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))
#export
class Categorify(TabularProc):
    "Transform the categorical variables to that type."
    order = 1

    def setups(self, to):
        # One `CategoryMap` per categorical column, built from that column's values.
        maps = {}
        for n in to.cat_names:
            maps[n] = CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names))
        self.classes = maps

    def encodes(self, to):
        # Codes are shifted by 1 (the `add` argument of `_apply_cats`).
        encoder = partial(_apply_cats, self.classes, 1)
        to.transform(to.cat_names, encoder)

    def decodes(self, to):
        decoder = partial(_decode_cats, self.classes)
        to.transform(to.cat_names, decoder)

    def __getitem__(self, k):
        "The `CategoryMap` built for column `k`"
        return self.classes[k]
#export
@Categorize
def setups(self, to:Tabular):
    "Build `self.vocab` from the first target column of `to` (when there is one), then apply `self` to `to`"
    if len(to.y_names) > 0:
        first_y = to.y_names[0]
        self.vocab = CategoryMap(to.iloc[:,first_y].items)
    return self(to)
@Categorize
def encodes(self, to:Tabular):
    "Encode every target column of `to` with `_apply_cats`, sharing `self.vocab` across columns (no offset)"
    vocab_by_col = {n: self.vocab for n in to.y_names}
    to.transform(to.y_names, partial(_apply_cats, vocab_by_col, 0))
    return to
@Categorize
def decodes(self, to:Tabular):
    "Decode every target column of `to` with `_decode_cats`, sharing `self.vocab` across columns"
    vocab_by_col = {n: self.vocab for n in to.y_names}
    to.transform(to.y_names, partial(_decode_cats, vocab_by_col))
    return to
show_doc(Categorify, title_level=3)
class Categorify[source]
Categorify(enc=None,dec=None,split_idx=None,as_item=False,order=None) ::TabularProc
Transform the categorical variables to that type.
# Basic case: codes start at 1 because index 0 of the vocab is the '#na#' entry.
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_eq(cat['a'], ['#na#',0,1,2])
test_eq(to.a, [1,2,3,1,3])
# Re-use the fitted procs on a new frame.
df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = to.new(df1)
to1.process()
#Values that weren't in the training df are sent to 0 (na)
test_eq(to1.a, [2,1,0,0,3])
to2 = cat.decode(to1)
test_eq(to2.a, [1,0,'#na#','#na#',2])
#test with splits
# The vocab is expected to come from the first split only, so the value 3 (second split) encodes to 0.
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
test_eq(cat['a'], ['#na#',0,1,2])
test_eq(to['a'], [1,2,3,0,3])
# A column that is already a pandas Categorical keeps its declared category order (H, M, L).
df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_eq(cat['a'], ['#na#','H','M','L'])
test_eq(to.a, [2,1,3,2])
to2 = cat.decode(to)
test_eq(to2.a, ['M','H','L','M'])
#test with targets
# The target column `b` is expected to be encoded from 0, without an '#na#' entry.
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.procs.vocab, ['a', 'b'])
test_eq(to.b, [0,1,0,1,1])
to2 = to.procs.decode(to)
test_eq(to2.b, ['a', 'b', 'a', 'b', 'b'])
#export
class Normalize(TabularProc):
    "Normalize the continuous variables."
    order = 2

    def setups(self, dsrc):
        self.means = dsrc.conts.mean()
        # Population std (ddof=0) plus a small epsilon, since `encodes` divides by it.
        self.stds = dsrc.conts.std(ddof=0) + 1e-7

    def encodes(self, to):
        centered = to.conts - self.means
        to.conts = centered / self.stds

    def decodes(self, to):
        rescaled = to.conts * self.stds
        to.conts = rescaled + self.means
show_doc(Normalize, title_level=3)
class Normalize[source]
Normalize(enc=None,dec=None,split_idx=None,as_item=False,order=None) ::TabularProc
Normalize the continuous variables.
# Without splits the statistics are expected to match numpy's mean/std over the whole column.
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to.a.values, (x-m)/s)
# New data is normalized with the statistics fitted above, and decoding restores the raw values.
df1 = pd.DataFrame({'a':[5,6,7]})
to1 = to.new(df1)
to1.process()
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
to2 = norm.decode(to1)
test_close(to2.a.values, [5,6,7])
# With splits the statistics are expected to come from the first split only ([0,1,2]) but apply to every row.
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)
#export
class FillStrategy:
    "Namespace containing the various filling strategies."
    def median(c, fill):
        "Fill value is the median of column `c`; `fill` is ignored"
        return c.median()

    def constant(c, fill):
        "Fill value is `fill` itself; `c` is ignored"
        return fill

    def mode(c, fill):
        "Fill value is the most frequent non-null entry of `c`; `fill` is ignored"
        counts = c.dropna().value_counts()
        return counts.idxmax()
#export
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        # `fill_strategy(column, fill_val)` returns the replacement value for a column;
        # `fill_vals` maps column name -> value handed to the strategy (0 for unlisted names by default);
        # `add_col` adds a `<name>_na` indicator column and registers it as categorical.
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr(self, 'fill_strategy,add_col,fill_vals')

    def setups(self, dsrc):
        # `.any()` yields one boolean per continuous column; indexing it by itself keeps only
        # the columns that really contain a missing value (iterating its keys would keep them all).
        has_na = pd.isnull(dsrc.conts).any()
        self.na_dict = {n: self.fill_strategy(dsrc[n], self.fill_vals[n])
                        for n in has_na[has_na].keys()}

    def encodes(self, to):
        missing = pd.isnull(to.conts)
        has_na = missing.any()
        # A column with NaNs here but none at setup time has no fill value: fail loudly.
        for n in has_na[has_na].keys():
            assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
        # Process exactly the columns fitted at setup so every dataset ends up with the same `_na` columns.
        for n in self.na_dict:
            # Assign the filled column back instead of `fillna(inplace=True)` on a selection,
            # which is not guaranteed to write through to the underlying frame.
            to[n] = to[n].fillna(self.na_dict[n])
            if self.add_col:
                to.loc[:,n+'_na'] = missing[n]
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')
show_doc(FillMissing, title_level=3)
class FillMissing[source]
FillMissing(fill_strategy='median',add_col=True,fill_vals=None) ::TabularProc
Fill the missing values in continuous columns.
# One processor per strategy; each gets its own copy of the frame.
fill1,fill2,fill3 = (FillMissing(fill_strategy=s)
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
tos = TabularPandas(df, fill1, cont_names='a'),TabularPandas(df1, fill2, cont_names='a'),TabularPandas(df2, fill3, cont_names='a')
# Expected fill values: median 1.5, constant 0 (default `fill_vals`), mode 1.0.
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})
# The `_na` indicator column is registered as a categorical column.
for t in tos: test_eq(t.cat_names, ['a_na'])
for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_.a.values, np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(to_.a_na.values, np.array([0, 0, 1, 0, 0, 0, 0]))
# New data is filled with the values computed at setup time.
dfa = pd.DataFrame({'a':[np.nan,0,np.nan]})
tos = [t.new(o) for t,o in zip(tos,(dfa,dfa.copy(),dfa.copy()))]
for t in tos: t.process()
for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_.a.values, np.array([v, 0, v]))
    test_eq(to_.a_na.values, np.array([1, 0, 1]))
# Procs are listed out of order on purpose; the expected values below assume
# fill -> categorify -> normalize, i.e. the pipeline reorders them.
procs = [Normalize, Categorify, FillMissing, noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')
#Test setup and apply on df_main
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a, [1,2,3,2,2,3,1])
test_eq(to.b_na, [1,1,2,1,1,1,1])
# The NaN in `b` is expected to be replaced by the median (1.5) before normalization.
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to.b.values, (x-m)/s)
test_eq(to.procs.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
#Test apply on y_names
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a, [1,2,3,2,2,3,1])
test_eq(to.b_na, [1,1,2,1,1,1,1])
test_eq(to.c, [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to.b.values, (x-m)/s)
test_eq(to.procs.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
test_eq(to.procs.vocab, ['a','b'])
# Same data again: additionally check that the caller's frame is untouched (`Tabular` works on a copy).
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a, [1,2,3,2,2,3,1])
test_eq(df.a.dtype,int)
test_eq(to.b_na, [1,1,2,1,1,1,1])
test_eq(to.c, [1,0,1,0,0,1,0])
# With splits: rows are reordered to [0,1,4,6,2,3,5], so the expected values follow that order.
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a, [1,2,2,1,0,2,0])
test_eq(df.a.dtype,int)
test_eq(to.b_na, [1,2,1,1,1,1,1])
test_eq(to.c, [1,0,0,0,1,0,1])
#export
class ReadTabBatch(ItemTransform):
    "Turn a `Tabular` batch into a (cats, conts, targ) tuple of tensors, and back into a decoded `Tabular`"
    order = -1 #run before cuda

    def __init__(self, to):
        self.to = to

    # TODO: use float for cont targ
    def encodes(self, to):
        cats = tensor(to.cats).long()
        conts = tensor(to.conts).float()
        return cats, conts, tensor(to.targ)

    def decodes(self, o):
        cats, conts, targs = to_np(o)
        # Columns are laid out in the same order as `all_col_names`: cats, then conts, then targets.
        stacked = np.concatenate([cats, conts, targs], axis=1)
        frame = pd.DataFrame(stacked, columns=self.to.all_col_names)
        return self.to.procs.decode(self.to.new(frame))
#export
@typedispatch
def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
    "Show a `Tabular` batch through `x.show()`; `y`, `its`, `max_n` and `ctxs` are accepted but not used"
    x.show()
#export
@delegates()
class TabDataLoader(TfmdDL):
    "`TfmdDL` for tabular data: a batch is one `iloc` lookup on the dataset, followed by `ReadTabBatch`"
    do_item = noops

    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # `ReadTabBatch` is appended after any user-supplied batch transforms.
        batch_tfms = L(after_batch) + ReadTabBatch(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=batch_tfms, num_workers=num_workers, **kwargs)

    def create_batch(self, b):
        "Fetch the rows `b` from the dataset in a single lookup"
        return self.dataset.iloc[b]

# Make `TabDataLoader` the loader type associated with `TabularPandas`.
TabularPandas._dl_type = TabDataLoader
# Load the adult sample; the first 10,000 rows are the working set, the remainder is held out.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main.head()
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
# Single categorical target (`salary`) with the default `block_y`.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
# `%time` is an IPython magic: this line only runs inside a notebook/IPython session.
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)
CPU times: user 192 ms, sys: 276 µs, total: 192 ms Wall time: 191 ms
dbch = to.databunch()
dbch.valid_dl.show_batch()
| workclass | education | marital-status | occupation | relationship | race | age_na | fnlwgt_na | education-num_na | age | fnlwgt | education-num | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | State-gov | HS-grad | Married-civ-spouse | Craft-repair | Husband | Black | False | False | False | 50.0 | 229271.999663 | 9.0 | >=50k |
| 1 | Private | Assoc-voc | Never-married | Craft-repair | Own-child | White | False | False | False | 30.0 | 160633.999723 | 11.0 | >=50k |
| 2 | Private | HS-grad | Never-married | Handlers-cleaners | Not-in-family | White | False | False | False | 32.0 | 164507.001425 | 9.0 | <50k |
| 3 | Private | HS-grad | Married-civ-spouse | Sales | Husband | White | False | False | False | 48.0 | 320421.005237 | 9.0 | <50k |
| 4 | Private | HS-grad | Married-civ-spouse | Adm-clerical | Wife | White | False | False | False | 46.0 | 243189.999445 | 9.0 | >=50k |
| 5 | Private | HS-grad | Divorced | Sales | Not-in-family | White | False | False | False | 31.0 | 217802.999944 | 9.0 | <50k |
| 6 | Private | HS-grad | Divorced | Sales | Not-in-family | White | False | False | False | 34.0 | 245172.999308 | 9.0 | <50k |
| 7 | Private | HS-grad | Never-married | Other-service | Unmarried | White | False | False | False | 52.0 | 195638.000066 | 9.0 | <50k |
| 8 | Private | Masters | Never-married | Prof-specialty | Not-in-family | White | False | False | False | 28.0 | 274679.000327 | 14.0 | <50k |
| 9 | Private | Some-college | Never-married | Sales | Unmarried | Black | False | False | False | 38.0 | 363394.997929 | 10.0 | <50k |
# Wrap the held-out frame with the procs carried over from `to` (no setup is run by `new`), then apply them.
to_tst = to.new(df_test)
to_tst.process()
to_tst.all_cols.head()
| workclass | education | marital-status | occupation | relationship | race | age_na | fnlwgt_na | education-num_na | age | fnlwgt | education-num | salary | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10000 | 5 | 10 | 3 | 2 | 1 | 2 | 1 | 1 | 1 | 0.459514 | 1.345251 | 1.183611 | 0 |
| 10001 | 5 | 12 | 3 | 15 | 1 | 4 | 1 | 1 | 1 | -0.935650 | 1.257914 | -0.427498 | 0 |
| 10002 | 5 | 2 | 1 | 9 | 2 | 5 | 1 | 1 | 1 | 1.046951 | 0.151258 | -1.233052 | 0 |
| 10003 | 5 | 12 | 7 | 2 | 5 | 5 | 1 | 1 | 1 | 0.532943 | -0.283410 | -0.427498 | 0 |
| 10004 | 6 | 9 | 3 | 5 | 1 | 5 | 1 | 1 | 1 | 0.753232 | 1.448155 | 0.378057 | 1 |
def _mock_multi_label(df):
sal,sex,white = [],[],[]
for row in df.itertuples():
sal.append(row.salary == '>=50k')
sex.append(row.sex == ' Male')
white.append(row.race == ' White')
df['salary'] = np.array(sal)
df['male'] = np.array(sex)
df['white'] = np.array(white)
return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | male | white | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | True | False | True |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | True | True | True |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | False | False | False |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | True | True | False |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | False | False | False |
#export
@EncodedMultiCategorize
def encodes(self, to:Tabular):
    "Targets are already encoded: return `to` unchanged"
    return to
@EncodedMultiCategorize
def decodes(self, to:Tabular):
    "Turn every target column of `to` into booleans by comparing it with 1"
    is_one = lambda col: col==1
    to.transform(to.y_names, is_one)
    return to
# Multi-label case with one boolean column per label.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]
# `block_y` is passed as an already-built block here rather than as a class.
# `%time` is an IPython magic: this line only runs inside a notebook/IPython session.
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, block_y=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)
CPU times: user 162 ms, sys: 0 ns, total: 162 ms Wall time: 160 ms
dbch = to.databunch()
dbch.valid_dl.show_batch()
| workclass | education | marital-status | occupation | relationship | race | age_na | fnlwgt_na | education-num_na | age | fnlwgt | education-num | salary | male | white | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | Some-college | Never-married | #na# | Own-child | White | False | False | True | 19.000000 | 226912.999574 | 10.0 | False | False | True |
| 1 | ? | Some-college | Divorced | ? | Unmarried | White | False | False | True | 51.000000 | 76437.000728 | 10.0 | False | False | True |
| 2 | Private | Some-college | Never-married | Handlers-cleaners | Not-in-family | White | False | False | False | 24.000000 | 165064.000614 | 10.0 | False | True | True |
| 3 | Private | HS-grad | Never-married | Machine-op-inspct | Unmarried | White | False | False | False | 26.000000 | 211435.000638 | 9.0 | False | False | True |
| 4 | Federal-gov | 11th | Never-married | Other-service | Own-child | Asian-Pac-Islander | False | False | False | 17.999999 | 101709.002120 | 7.0 | False | True | False |
| 5 | Federal-gov | Some-college | Married-civ-spouse | Adm-clerical | Husband | Black | False | False | False | 39.000000 | 314822.002568 | 10.0 | False | True | False |
| 6 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | Black | False | False | False | 45.000000 | 256649.002323 | 9.0 | False | True | False |
| 7 | Private | HS-grad | Married-civ-spouse | Exec-managerial | Husband | White | False | False | False | 50.000000 | 161437.998631 | 9.0 | True | True | True |
| 8 | Private | Assoc-acdm | Never-married | #na# | Unmarried | White | False | False | False | 26.000000 | 159602.998851 | 12.0 | False | False | True |
| 9 | Private | Bachelors | Never-married | Prof-specialty | Not-in-family | White | False | False | False | 27.000000 | 660870.007785 | 13.0 | False | False | True |
def _mock_multi_label(df):
targ = []
for row in df.itertuples():
labels = []
if row.salary == '>=50k': labels.append('>50k')
if row.sex == ' Male': labels.append('male')
if row.race == ' White': labels.append('white')
targ.append(' '.join(labels))
df['target'] = np.array(targ)
return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k | >50k white |
| 1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k | >50k male white |
| 2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k | |
| 3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k | >50k male |
| 4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
@MultiCategorize
def encodes(self, to:Tabular):
    "Placeholder: returns `to` untouched; the vocab-based encoding below is still disabled"
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to
@MultiCategorize
def decodes(self, to:Tabular):
    "Placeholder: returns `to` untouched; the vocab-based decoding below is still disabled"
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to
# Multi-label case with a single space-separated `target` column.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
# NOTE(review): `Tabular.__init__` takes `block_y` and has no `type_y` parameter or `**kwargs`, so this
# call raises TypeError against the class defined in this file — confirm which block was intended.
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", type_y=MultiCategory, splits=splits)
CPU times: user 214 ms, sys: 3.95 ms, total: 218 ms Wall time: 217 ms
to.procs[2].vocab
(#24) [-,_,a,c,d,e,f,g,h,i...]
# Regression-style example: `age` is moved from the continuous inputs to the target.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
# NOTE(review): `Tabular.__init__` takes `block_y` and has no `type_y` parameter or `**kwargs`, so this
# call raises TypeError against the class defined in this file — confirm which block was intended.
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', type_y=Float, splits=splits)
CPU times: user 148 ms, sys: 7.58 ms, total: 155 ms Wall time: 154 ms
to.procs[-1].means
fnlwgt 192859.69200 education-num 10.08125 dtype: float64
dbch = to.databunch()
dbch.valid_dl.show_batch()
| workclass | education | marital-status | occupation | relationship | race | fnlwgt_na | education-num_na | fnlwgt | education-num | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Private | Some-college | Married-civ-spouse | Craft-repair | Husband | White | False | False | 213840.999630 | 10.0 | 37.0 |
| 1 | Federal-gov | Assoc-acdm | Never-married | Adm-clerical | Not-in-family | White | False | False | 235890.999234 | 12.0 | 45.0 |
| 2 | Private | HS-grad | Never-married | Sales | Not-in-family | White | False | False | 234108.000241 | 9.0 | 21.0 |
| 3 | Self-emp-not-inc | HS-grad | Married-civ-spouse | Prof-specialty | Husband | White | False | False | 287037.000237 | 9.0 | 41.0 |
| 4 | ? | HS-grad | Separated | ? | Unmarried | Black | False | False | 427965.002375 | 9.0 | 29.0 |
| 5 | Private | Some-college | Married-civ-spouse | Handlers-cleaners | Husband | White | False | False | 368949.005576 | 10.0 | 29.0 |
| 6 | Private | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | False | 85341.001781 | 9.0 | 48.0 |
| 7 | Private | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | False | 110243.002830 | 13.0 | 49.0 |
| 8 | ? | 10th | Divorced | ? | Not-in-family | White | False | False | 124647.999756 | 6.0 | 61.0 |
| 9 | ? | Masters | Never-married | ? | Unmarried | Asian-Pac-Islander | False | False | 173799.999432 | 14.0 | 27.0 |
class TensorTabular(Tuple):
    "Tuple of tabular tensors that can create and display show-contexts"
    def get_ctxs(self, max_n=10, **kwargs):
        "One empty row per sample, capped at `max_n`, to be filled in when showing"
        count = min(self[0].shape[0], max_n)
        blank = pd.DataFrame(index=range(count))
        return [blank.iloc[row] for row in range(count)]

    def display(self, ctxs):
        "Display the contexts `ctxs` as a single dataframe"
        display_df(pd.DataFrame(ctxs))
class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        # Without a context the line is its own result; otherwise it is appended to the context.
        if ctx is None:
            return self
        return ctx.append(self)
class ReadTabLine(ItemTransform):
    "Convert one row into a `TensorTabular` of (cats, conts), and decode it back into a `TabularLine`"
    def __init__(self, proc):
        # `proc` must expose `cat_names`, `cont_names`, `y_names` and a `decode` method.
        self.proc = proc

    def encodes(self, row):
        # Look up the row's value for every categorical name, then for every continuous name.
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        return TensorTabular(tensor(cats).long(),tensor(conts).float())

    def decodes(self, o):
        # Column names are passed by keyword: the second positional parameter of the constructor is
        # `procs`, so passing them positionally shifted each list into the wrong parameter.
        to = TabularPandas(o, cat_names=self.proc.cat_names, cont_names=self.proc.cont_names, y_names=self.proc.y_names)
        to = self.proc.decode(to)
        # NOTE(review): `to.items[0]+to.items[1]` assumes `items` is a pair of sequences rather than a
        # dataframe — this path looks like it predates the current `Tabular`; confirm before relying on it.
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))
class ReadTabTarget(ItemTransform):
    "Read the target of a row as int64, and decode a target index back into a `Category`"
    def __init__(self, proc):
        self.proc = proc

    def encodes(self, row):
        target = row[self.proc.y_names]
        return target.astype(np.int64)

    def decodes(self, o):
        vocab = self.proc.classes[self.proc.y_names]
        return Category(vocab[o])
# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)
# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')
# test_stdout(lambda: print(show_at(tds, 1)), """a 1
# b_na False
# b 1
# category a
# dtype: object""")
#hide
from nbdev.export import notebook2script
notebook2script()
Converted 00_test.ipynb. Converted 01_core_foundation.ipynb. Converted 01a_core_utils.ipynb. Converted 01b_core_dispatch.ipynb. Converted 01c_core_transform.ipynb. Converted 02_core_script.ipynb. Converted 03_torchcore.ipynb. Converted 03a_layers.ipynb. Converted 04_data_load.ipynb. Converted 05_data_core.ipynb. Converted 06_data_transforms.ipynb. Converted 07_data_block.ipynb. Converted 08_vision_core.ipynb. Converted 09_vision_augment.ipynb. Converted 09a_vision_data.ipynb. Converted 09b_vision_utils.ipynb. Converted 10_pets_tutorial.ipynb. Converted 11_vision_models_xresnet.ipynb. Converted 12_optimizer.ipynb. Converted 13_learner.ipynb. Converted 13a_metrics.ipynb. Converted 14_callback_schedule.ipynb. Converted 14a_callback_data.ipynb. Converted 15_callback_hook.ipynb. Converted 15a_vision_models_unet.ipynb. Converted 16_callback_progress.ipynb. Converted 17_callback_tracker.ipynb. Converted 18_callback_fp16.ipynb. Converted 19_callback_mixup.ipynb. Converted 20_interpret.ipynb. Converted 20a_distributed.ipynb. Converted 21_vision_learner.ipynb. Converted 22_tutorial_imagenette.ipynb. Converted 23_tutorial_transfer_learning.ipynb. Converted 30_text_core.ipynb. Converted 31_text_data.ipynb. Converted 32_text_models_awdlstm.ipynb. Converted 33_text_models_core.ipynb. Converted 34_callback_rnn.ipynb. Converted 35_tutorial_wikitext.ipynb. Converted 36_text_models_qrnn.ipynb. Converted 37_text_learner.ipynb. Converted 38_tutorial_ulmfit.ipynb. Converted 40_tabular_core.ipynb. Converted 41_tabular_model.ipynb. Converted 42_tabular_rapids.ipynb. Converted 50_data_block_examples.ipynb. Converted 60_medical_imaging.ipynb. Converted 65_medical_text.ipynb. Converted 70_callback_wandb.ipynb. Converted 71_callback_tensorboard.ipynb. Converted 90_notebook_core.ipynb. Converted 91_notebook_export.ipynb. Converted 92_notebook_showdoc.ipynb. Converted 93_notebook_export2html.ipynb. Converted 94_notebook_test.ipynb. Converted 95_index.ipynb. Converted 96_data_external.ipynb. 
Converted 97_utils_test.ipynb. Converted notebook2jekyll.ipynb. Converted xse_resnext.ipynb.