# Notebook-style walkthrough of the fastai tabular preprocessing transforms
# (TabularProc, Categorify, FillMissing, FillStrategy, Normalize).
# Bare expressions such as `train_df.head()` only display output when run
# cell-by-cell in a notebook; as a plain script they are evaluated and dropped.
from fastai.gen_doc.nbdoc import *
from fastai.tabular import *
from fastai import *

# --- Sample data -----------------------------------------------------------
# NOTE(review): presumably fetches/unpacks the ADULT_SAMPLE dataset and returns
# its local path -- confirm against fastai's `untar_data`.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

# Small slices keep the examples fast: rows [0, 800) for training and
# rows [800, 1000) for validation. `.copy()` gives each frame its own data
# before the transforms below are called on them.
train_df, valid_df = df.iloc[:800].copy(), df.iloc[800:1000].copy()
train_df.head()

# Column names passed to every transform below.
cat_names = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
cont_names = [
    'age', 'fnlwgt', 'education-num', 'capital-gain',
    'capital-loss', 'hours-per-week',
]

# --- TabularProc -----------------------------------------------------------
show_doc(TabularProc, doc_string=False)
show_doc(TabularProc.__call__)
show_doc(TabularProc.apply_train, doc_string=False)
show_doc(TabularProc.apply_test, doc_string=False)

# --- Categorify ------------------------------------------------------------
show_doc(Categorify, doc_string=False)
show_doc(Categorify.apply_train, doc_string=False)
show_doc(Categorify.apply_test, doc_string=False)

# Call the transform on the training frame first, then on the validation
# frame with `test=True`.
tfm = Categorify(cat_names, cont_names)
tfm(train_df)
tfm(valid_df, test=True)

# Inspect the categories of the same column on both frames.
train_df['workclass'].cat.categories
valid_df['workclass'].cat.categories

# --- FillMissing -----------------------------------------------------------
show_doc(FillMissing, doc_string=False)
show_doc(FillMissing.apply_train, doc_string=False)
show_doc(FillMissing.apply_test, doc_string=False)

# Continuous columns before the transform ...
train_df[cont_names].head()

tfm = FillMissing(cat_names, cont_names)
tfm(train_df)
tfm(valid_df, test=True)

# ... and after it, for both frames.
train_df[cont_names].head()
valid_df[cont_names].head()

# --- Notebook setup --------------------------------------------------------
# Originally the IPython line magics `%reload_ext autoreload`, `%autoreload 2`
# and `%matplotlib inline`; spelled as function calls so the file parses as
# plain Python. `get_ipython` is only defined when running under IPython.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

# --- FillStrategy ----------------------------------------------------------
show_doc(
    FillStrategy,
    alt_doc_string='Enum flag that determines how `FillMissing` should handle missing/nan values',
    arg_comments={
        'MEDIAN': 'nans are replaced by the median value of the column',
        'COMMON': 'nans are replaced by the most common value of the column',
        'CONSTANT': 'nans are replaced by `fill_val`',
    },
)

# --- Normalize -------------------------------------------------------------
show_doc(Normalize, doc_string=False)
show_doc(Normalize.apply_train, doc_string=False)
show_doc(Normalize.apply_test, doc_string=False)