from fastai.gen_doc.nbdoc import *
from fastai.tabular import *
from fastai import *

# Obtain the ADULT_SAMPLE dataset location via `untar_data` and read its CSV.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

# Independent copies of the first 800 rows (training) and the following 200
# rows (validation), so later operations on them do not touch `df`.
train_df = df.iloc[:800].copy()
valid_df = df.iloc[800:1000].copy()

# Preview the first rows of the training frame.
train_df.head()

# Columns passed to the transforms below as the categorical variables.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Columns passed to the transforms below as the continuous variables.
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Documentation entries for `TabularProc`, its `__call__`, and its
# `apply_train` / `apply_test` methods.
# NOTE(review): `doc_string=False` presumably suppresses the object's own
# docstring so the notebook prose can describe it instead — confirm.
show_doc(TabularProc, doc_string=False)

# The only entry in this group rendered with show_doc's default arguments.
show_doc(TabularProc.__call__)

show_doc(TabularProc.apply_train, doc_string=False)

show_doc(TabularProc.apply_test, doc_string=False)

# Documentation entries for `Categorify` and its `apply_train` / `apply_test`
# methods, all rendered with `doc_string=False`.
show_doc(Categorify, doc_string=False)

show_doc(Categorify.apply_train, doc_string=False)

show_doc(Categorify.apply_test, doc_string=False)

# Demo: build a Categorify transform from the two column-name lists, then call
# it on the training frame and on the validation frame (the latter with
# test=True).
# NOTE(review): the return values are discarded, so the frames are presumably
# modified in place — confirm against Categorify's implementation.
tfm = Categorify(cat_names, cont_names)
tfm(train_df)
tfm(valid_df, test=True)

# Inspect the categories of 'workclass' in each frame; the `.cat` accessor
# requires the column to hold pandas categorical data at this point.
train_df['workclass'].cat.categories

valid_df['workclass'].cat.categories

# Documentation entries for `FillMissing` and its `apply_train` / `apply_test`
# methods, all rendered with `doc_string=False`.
show_doc(FillMissing, doc_string=False)

show_doc(FillMissing.apply_train, doc_string=False)

show_doc(FillMissing.apply_test, doc_string=False)

# Demo: view the continuous columns of the training frame before FillMissing
# is applied.
train_df[cont_names].head()

# Rebinds `tfm` to a FillMissing transform, then calls it on the training
# frame and on the validation frame (the latter with test=True).
# NOTE(review): return values are discarded, so the frames are presumably
# modified in place — confirm against FillMissing's implementation.
tfm = FillMissing(cat_names, cont_names)
tfm(train_df)
tfm(valid_df, test=True)
# Same view as at the top of this demo, now after the transform calls.
train_df[cont_names].head()

valid_df[cont_names].head()

# IPython line magics (not plain Python; they only run under IPython/Jupyter):
# load the `autoreload` extension, set it to mode 2, and select matplotlib's
# inline backend.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Documentation entry for the `FillStrategy` enum: `alt_doc_string` supplies
# the description text and `arg_comments` gives one explanation per member.
show_doc(
    FillStrategy,
    alt_doc_string='Enum flag that determines how `FillMissing` should handle missing/nan values',
    arg_comments={
        'MEDIAN': 'nans are replaced by the median value of the column',
        'COMMON': 'nans are replaced by the most common value of the column',
        'CONSTANT': 'nans are replaced by `fill_val`',
    },
)

# Documentation entries for `Normalize` and its `apply_train` / `apply_test`
# methods, all rendered with `doc_string=False`.
show_doc(Normalize, doc_string=False)

show_doc(Normalize.apply_train, doc_string=False)

show_doc(Normalize.apply_test, doc_string=False)