#!/usr/bin/env python
# coding: utf-8

# # The data block API

# In[1]:

from fastai.gen_doc.nbdoc import *
from fastai.vision import *
from fastai import *

# The data block API lets you customize how to create a [`DataBunch`](/basic_data.html#DataBunch) by isolating the underlying parts of that process in separate blocks, mainly:
# - where are the inputs
# - how to label them
# - how to split the data into a training and validation set
# - what type of [`Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) to create
# - possible transforms to apply
# - how to wrap in dataloaders and create the [`DataBunch`](/basic_data.html#DataBunch)
#
# This is a bit longer than using the factory methods but is way more flexible. As usual, we'll begin with end-to-end examples, then switch to the details of each of those parts.

# ## Examples of use

# In [`vision.data`](/vision.data.html#vision.data), we create an easy [`DataBunch`](/basic_data.html#DataBunch) suitable for classification by simply typing:

# In[2]:

path = untar_data(URLs.MNIST_TINY)
tfms = get_transforms(do_flip=False)
data = ImageDataBunch.from_folder(path, ds_tfms=tfms, size=24)

# This is aimed at data that is in folders following an ImageNet style, with a train and valid directory each containing one subdirectory per class, where all the pictures are. With the data block API, the same thing is achieved like this:

# In[3]:

path = untar_data(URLs.MNIST_TINY)
tfms = get_transforms(do_flip=False)

# In[4]:

data = (ImageFileList.from_folder(path)            #Where to find the data? -> in path and its subfolders
        .label_from_folder()                       #How to label? -> depending on the folder of the filenames
        .split_by_folder()                         #How to split in train/valid? -> use the folders
        .add_test_folder()                         #Optionally add a test set
        .datasets(ImageClassificationDataset)      #How to convert to datasets? -> use ImageClassificationDataset
        .transform(tfms, size=224)                 #Data augmentation? -> use tfms with a size of 224
        .databunch())                              #Finally? -> use the defaults for conversion to ImageDataBunch

# In[5]:

data.test_ds[0]

# In[6]:

data.show_batch(rows=3, figsize=(5,5))

# In[7]:

data.valid_ds.classes

# Let's look at another example from [`vision.data`](/vision.data.html#vision.data) with the planet dataset. This time it's a multi-label classification problem, with the labels in a csv file and no given split between valid and train data, so we use a random split. The factory method is:

# In[8]:

planet = untar_data(URLs.PLANET_TINY)
planet_tfms = get_transforms(flip_vert=True, max_lighting=0.1, max_zoom=1.05, max_warp=0.)
data = ImageDataBunch.from_csv(planet, folder='train', size=128, suffix='.jpg', sep=' ', ds_tfms=planet_tfms)

# With the data block API we can rewrite this like that:

# In[9]:

data = (ImageFileList.from_folder(planet)          #Where to find the data? -> in planet and its subfolders
        .label_from_csv('labels.csv', sep=' ', folder='train', suffix='.jpg')
                                                   #How to label? -> use the csv file labels.csv in path,
                                                   #add .jpg to the names and take them in the folder train
        .random_split_by_pct()                     #How to split in train/valid? -> randomly with the default 20% in valid
        .datasets(ImageMultiDataset)               #How to convert to datasets? -> use ImageMultiDataset
        .transform(planet_tfms, size=128)          #Data augmentation? -> use tfms with a size of 128
        .databunch())                              #Finally? -> use the defaults for conversion to databunch

# In[11]:

data.show_batch(rows=3, figsize=(10,8), ds_type=DatasetType.Valid)
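# If you want a different holdout fraction, `random_split_by_pct` takes the validation percentage as an argument. Here is a minimal variation of the pipeline above (the 10% figure is just an illustrative choice, everything else is unchanged):

# In[ ]:

data = (ImageFileList.from_folder(planet)
        .label_from_csv('labels.csv', sep=' ', folder='train', suffix='.jpg')
        .random_split_by_pct(0.1)                  #Hold out 10% for validation instead of the default 20%
        .datasets(ImageMultiDataset)
        .transform(planet_tfms, size=128)
        .databunch())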
# This new API also allows you to use dataset types for which there is no direct [`ImageDataBunch`](/vision.data.html#ImageDataBunch) factory method. For a segmentation task, for instance, we can use it to quickly get a [`DataBunch`](/basic_data.html#DataBunch). Let's take the example of the [camvid dataset](http://mi.eng.cam.ac.uk/research/projects/VideoRec/CamVid/). The images are in an 'images' folder and their corresponding mask is in a 'labels' folder.

# In[12]:

camvid = untar_data(URLs.CAMVID_TINY)
path_lbl = camvid/'labels'
path_img = camvid/'images'

# We have a file that gives us the names of the classes (what each code inside the masks corresponds to: a pedestrian, a tree, a road...)

# In[13]:

codes = np.loadtxt(camvid/'codes.txt', dtype=str); codes

# And we define the following function that infers the mask filename from the image filename.

# In[14]:

get_y_fn = lambda x: path_lbl/f'{x.stem}_P{x.suffix}'
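# Before using it, it's worth sanity-checking that this mapping points at an existing mask file. A quick check on the first image, using the library's `get_image_files` to list the images:

# In[ ]:

img_f = get_image_files(path_img)[0]
img_f, get_y_fn(img_f)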
# Then we can easily define a [`DataBunch`](/basic_data.html#DataBunch) using the data block API. Here we need to use `tfm_y=True` in the transform call, because we need the same transforms to be applied to the target mask as were applied to the image.

# In[15]:

data = (ImageFileList.from_folder(path_img)        #Where are the input files? -> in path_img
        .label_from_func(get_y_fn)                 #How to label? -> use get_y_fn
        .random_split_by_pct()                     #How to split between train and valid? -> randomly
        .datasets(SegmentationDataset, classes=codes) #How to create a dataset? -> use SegmentationDataset
        .transform(get_transforms(), size=96, tfm_y=True) #Data aug -> use standard tfms with tfm_y=True
        .databunch(bs=64))                         #Lastly, convert to a databunch.

# In[16]:

data.show_batch(rows=2, figsize=(5,5))

# One last example, for object detection. We use our tiny sample of the [COCO dataset](http://cocodataset.org/#home) here. There is a helper function in the library that reads the annotation file and returns the list of image names with the list of labelled bboxes associated to each one. We convert it to a dictionary that maps image names to their bboxes, and then write the function that will give us the target for each image filename.

# In[17]:

coco = untar_data(URLs.COCO_TINY)
images, lbl_bbox = get_annotations(coco/'train.json')
img2bbox = {img:bb for img, bb in zip(images, lbl_bbox)}
get_y_func = lambda o: img2bbox[o.name]

# The following code is very similar to what we saw before. The only new addition is the use of a special function to collate the samples in batches: since our images may have different numbers of bounding boxes, we need to pad them to the largest number of bounding boxes in the batch.

# In[18]:

data = (ImageFileList.from_folder(coco)            #Where are the images? -> in coco
        .label_from_func(get_y_func)               #How to find the labels? -> use get_y_func
        .random_split_by_pct()                     #How to split in train/valid? -> randomly with the default 20% in valid
        .datasets(ObjectDetectDataset)             #How to create datasets? -> with ObjectDetectDataset
        .transform(get_transforms(), tfm_y=True)   #Data augmentation? -> standard transforms with tfm_y=True
        .databunch(bs=16, collate_fn=bb_pad_collate)) #Finally we convert to a DataBunch and use bb_pad_collate

# In[21]:

data.show_batch(rows=3, ds_type=DatasetType.Valid, figsize=(8,7))

# ## Provide inputs

# The inputs we want to feed our model are regrouped in the following class. The class contains methods to get the corresponding labels.

# In[2]:

show_doc(InputList, title_level=3, doc_string=False)

# This class regroups the inputs for our model in `items` and saves a `path` attribute, which is where it will look for any files (image files, csv file with labels...).

# In[3]:

show_doc(InputList.from_folder)

# Note that [`InputList`](/data_block.html#InputList) is subclassed in vision by [`ImageFileList`](/vision.data.html#ImageFileList), which changes the default of `extensions` to image file extensions (which is why we used [`ImageFileList`](/vision.data.html#ImageFileList) in our previous examples).

# ## Labelling the inputs

# All of the following are methods of [`InputList`](/data_block.html#InputList). Note that some of them are primarily intended for inputs that are filenames and might not work in general situations.

# In[4]:

show_doc(InputList.label_from_csv)

# If a `folder` is specified, filenames are taken in `self.path/folder`. `suffix` is added. If `sep` is specified, splits the values in `label_col` accordingly. This method is intended for inputs that are filenames.

# In[25]:

jekyll_note("This method will only keep the filenames that are both present in the csv file and in `self.items`.")

# In[5]:

show_doc(InputList.label_from_df)

# In[27]:

jekyll_note("This method will only keep the filenames that are both present in the dataframe and in `self.items`.")

# In[6]:

show_doc(InputList.label_from_folder)

# In[29]:

jekyll_note("This method looks at the last subfolder in the path to determine the classes.")

# In[7]:

show_doc(InputList.label_from_func)

# This method is primarily intended for inputs that are filenames, but could work in other settings.

# In[8]:

show_doc(InputList.label_from_re)
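# As an illustration of `label_from_re` (not tied to this notebook's datasets): if the class is encoded in the filename itself, say `british_shorthair_105.jpg`, a pattern with a single capture group extracts it. A hypothetical sketch, where `path_to_pets` stands for a folder of such files:

# In[ ]:

pat = r'([^/]+)_\d+.jpg$'   #Captures the part of the name before the trailing _<digits>.jpg
# ImageFileList.from_folder(path_to_pets).label_from_re(pat)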
# In[9]:

show_doc(LabelList, title_level=3, doc_string=False)

# A list of labelled inputs in `items` (expected to be tuples of input, label) with a `path` attribute. This class contains methods to create `SplitDataset`.

# ## Split the data between train and validation

# The following functions are methods of [`LabelList`](/data_block.html#LabelList), used to create a [`SplitData`](/data_block.html#SplitData) in different ways.

# In[10]:

show_doc(LabelList.random_split_by_pct)

# In[11]:

show_doc(LabelList.split_by_files)

# In[12]:

show_doc(LabelList.split_by_fname_file)

# In[13]:

show_doc(LabelList.split_by_folder)

# In[37]:

jekyll_note("This method looks at the folder immediately after `self.path` for `valid` and `train`.")

# In[14]:

show_doc(LabelList.split_by_idx)

# In[15]:

show_doc(SplitData, title_level=3)

# You won't normally construct a [`SplitData`](/data_block.html#SplitData) yourself, but instead will use one of the `split*` methods in [`LabelList`](/data_block.html#LabelList).

# In[16]:

show_doc(SplitData.datasets)

# In[17]:

show_doc(SplitData.add_test)

# In[18]:

show_doc(SplitData.add_test_folder)

# ## Create datasets

# To create the datasets from [`SplitData`](/data_block.html#SplitData) we have the following class method.

# In[19]:

show_doc(SplitData.datasets)

# In[20]:

show_doc(SplitDatasets, title_level=3)

# This class can be constructed directly from one of the following factory methods.

# In[21]:

show_doc(SplitDatasets.from_single)

# In[22]:

show_doc(SplitDatasets.single_from_c)

# In[23]:

show_doc(SplitDatasets.single_from_classes)

# Then we can build the [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) around our [`Dataset`](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) like this.

# In[24]:

show_doc(SplitDatasets.dataloaders)

# The methods `img_transform` and `img_databunch` used earlier are documented in [`vision.data`](/vision.data.html#vision.data).

# ## Utility classes

# In[25]:

show_doc(ItemList, title_level=3)

# In[26]:

show_doc(PathItemList, title_level=3)
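# Both are small building blocks for everything above: `ItemList` holds a collection in `items` with list-like indexing, and `PathItemList` additionally carries a `path` attribute. A minimal illustration, assuming the constructor simply takes the sequence of items (check the signature above for the exact arguments):

# In[ ]:

il = ItemList(['a', 'b', 'c'])
len(il), il[0]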