%pylab inline
import pandas as pd
import numpy as np
from pathlib import Path
import random
import shutil
from fastprogress import progress_bar
import PIL.Image
from functools import partial
# Dataset root plus per-split image and annotation directories for the
# Kaggle ImageNet localization challenge layout (ILSVRC CLS-LOC).
IMGNET = Path('/DATA/kaggle/imgnetloc/')
IMAGES_TRAIN = Path('/DATA/kaggle/imgnetloc/ILSVRC/Data/CLS-LOC/train/')
IMAGES_VAL = Path('/DATA/kaggle/imgnetloc/ILSVRC/Data/CLS-LOC/val/')
# Ground-truth CSVs: one row per image with class + bounding-box tuples.
TRAIN_SOLUTION_CSV = IMGNET/'LOC_train_solution.csv'
VALID_SOLUTION_CSV = IMGNET/'LOC_val_solution.csv'
# Pascal-VOC-style XML bounding-box annotations per split.
ANNO_TRAIN = Path('/DATA/kaggle/imgnetloc/ILSVRC/Annotations/CLS-LOC/train/')
ANNO_VAL = Path('/DATA/kaggle/imgnetloc/ILSVRC/Annotations/CLS-LOC/val/')
# parse one line of class file, just going to grab first descriptions
def parse_class_line(l):
    """Split one LOC_synset_mapping.txt line into (class_id, description).

    A line looks like "n01443537 goldfish, Carassius auratus"; only the
    first comma-separated description is kept.
    """
    # renamed local: the original used `id`, shadowing the builtin
    class_id = l.split(' ')[0]
    descriptions = l[len(class_id):].strip().split(',')
    return class_id, descriptions[0].strip()
# read in mapping of class id to text description
def read_classes(fn):
    """Read the synset mapping file into a dict {class_id: description}.

    Each line is parsed by ``parse_class_line``; later duplicate ids
    (none are expected) would overwrite earlier ones.
    """
    # context manager closes the file deterministically (the original
    # left the handle open); iterating the file yields lines lazily
    with open(fn, 'r') as f:
        return dict(map(parse_class_line, f))
# Module-level mapping: synset id -> first human-readable description.
classes = read_classes(IMGNET/'LOC_synset_mapping.txt')
def get_img_fns(img_train_path, class_id):
    """Return every file path under ``img_train_path/class_id``.

    Note: ``Path.iterdir`` yields entries in arbitrary (OS-dependent) order.
    """
    # list() over the iterator replaces the original manual append loop
    return list((img_train_path/class_id).iterdir())
def plot_samples(clsid):
    """Display three randomly chosen training images for class ``clsid``."""
    # pick three random files from this class's training directory
    chosen = np.random.choice(get_img_fns(IMAGES_TRAIN, clsid), 3)
    samples = [PIL.Image.open(fn) for fn in chosen]
    _, axes = plt.subplots(1, 3, figsize=(12, 3))
    # one image per axis (exactly three of each)
    for ax, img in zip(axes.flat, samples):
        ax.imshow(img)
# Hand-picked subset of 28 synset ids that make up the small dataset.
pull_classes = [
    'n01443537', 'n01669191', 'n01774750', 'n01641577', 'n01882714',
    'n01983481', 'n02114367', 'n02115641', 'n02317335', 'n01806143',
    'n01484850', 'n03063689', 'n03272010', 'n03124170', 'n02799071',
    'n03400231', 'n03452741', 'n02802426', 'n02692877', 'n02787622',
    'n03785016', 'n04252077', 'n02088466', 'n04254680', 'n02504458',
    'n03345487', 'n03642806', 'n03063599'
]
# Visual sanity check: show three sample images for each selected class.
for k in pull_classes:
    plot_samples(k)
# Report per-class training-image counts and the grand total.
total_images = 0
for clsid in pull_classes:
    count = len(get_img_fns(IMAGES_TRAIN, clsid))
    print(classes[clsid], count)
    total_images += count
print('total images:', total_images)
# Load the ground-truth solution CSVs for both splits.
valid_df = pd.read_csv(VALID_SOLUTION_CSV)
train_df = pd.read_csv(TRAIN_SOLUTION_CSV)
# Notebook-style display of the split sizes (no effect when run as a script).
len(train_df), len(valid_df)
# Training ImageIds look like "<synset>_<number>"; the prefix is the class id.
train_df['classid'] = train_df.ImageId.apply(lambda x: x.split('_')[0])
def parse_prediction_string(s):
    """Return the class id of the FIRST prediction in a PredictionString.

    A PredictionString is a space-separated run of 5-tuples:
    "<class_id> <xmin> <ymin> <xmax> <ymax> ...". The original loop
    collected every prediction's class id and then discarded all but
    the first, so we take the first token directly. (Unlike the
    original, this also tolerates strings shorter than five tokens
    instead of raising IndexError.)
    """
    return s.split(' ')[0]
# Extract the (first) ground-truth class id for each validation image.
valid_df['classid'] = valid_df.PredictionString.apply(parse_prediction_string)
# Keep only rows belonging to the selected classes.
small_train_df = train_df.loc[train_df.classid.isin(pull_classes)]
small_valid_df = valid_df.loc[valid_df.classid.isin(pull_classes)]
# Notebook-style display of the filtered sizes.
len(pull_classes), small_train_df.shape, small_valid_df.shape
# Destination root; the layout mirrors the original ILSVRC directory tree.
IMGNET_SMALL = Path('/DATA/kaggle/imgnetloc_small/')
SMALL_DATA = IMGNET_SMALL/'ILSVRC/Data/CLS-LOC'
SMALL_ANNO = IMGNET_SMALL/'ILSVRC/Annotations/CLS-LOC'
SMALL_DATA.mkdir(parents=True, exist_ok=True)
SMALL_ANNO.mkdir(parents=True, exist_ok=True)
(SMALL_DATA/'train').mkdir(parents=True, exist_ok=True)
# NOTE(review): only Annotations/val is pre-created here; Annotations/train
# subdirs are created by copytree below and Data/val by the val-copy step.
(SMALL_ANNO/'val').mkdir(parents=True, exist_ok=True)
# copy training directories
def _copy_tree_fresh(src, dest):
    """Copy directory ``src`` to ``dest``, replacing any existing copy first."""
    # copytree refuses to write into an existing directory, so clear it
    if dest.exists():
        shutil.rmtree(dest)
    shutil.copytree(src, dest)

# One pass per class: images and XML annotations share the same pattern,
# which the original duplicated inline.
for k in progress_bar(pull_classes):
    _copy_tree_fresh(IMAGES_TRAIN/k, SMALL_DATA/'train'/k)
    _copy_tree_fresh(ANNO_TRAIN/k, SMALL_ANNO/'train'/k)
# copy validation directories
dest_val_data = SMALL_DATA/'val'
dest_val_anno = SMALL_ANNO/'val'
# start from clean val directories so stale files never survive a re-run
if dest_val_data.exists():
    shutil.rmtree(dest_val_data)
if dest_val_anno.exists():
    shutil.rmtree(dest_val_anno)
dest_val_data.mkdir(parents=True, exist_ok=True)
dest_val_anno.mkdir(parents=True, exist_ok=True)
# Validation files live flat (not grouped per class), so copy file-by-file.
# Iterate the ImageId values directly: the original's index variable was
# unused and its `row` was really just the image id string.
for image_id in progress_bar(list(small_valid_df.ImageId)):
    shutil.copyfile(IMAGES_VAL/f'{image_id}.JPEG', dest_val_data/f'{image_id}.JPEG')
    shutil.copyfile(ANNO_VAL/f'{image_id}.xml', dest_val_anno/f'{image_id}.xml')
# copy text files, filtering out classes we don't want
def copy_file_with_filter(src_file, dst_file, filter_func, has_header=True):
    """Copy a text file, keeping only lines accepted by ``filter_func``.

    When ``has_header`` is true the first line is copied unconditionally
    and ``filter_func`` is only applied to the remaining lines.
    """
    with open(src_file, 'r') as rf:
        src_lines = rf.readlines()
    with open(dst_file, 'w') as wf:
        # guard src_lines: the original raised IndexError on an empty file
        if has_header and src_lines:
            wf.write(src_lines[0])
        body = src_lines[1:] if has_header else src_lines
        # writelines batches the output instead of one write per line
        wf.writelines(line for line in body if filter_func(line))
def valid_is_desired_class(line, classes):
    """True if a LOC_val_solution.csv row's first class id is in ``classes``.

    Row format: "<ImageId>,<class_id> x y x y ..." — the class id is the
    first space-separated token of the second CSV field.
    ``classes`` is any container of synset-id strings (e.g. pull_classes).
    """
    # removed the commented-out debug print from the original
    clsid = line.split(',')[1].split(' ')[0]
    return clsid in classes
def is_desired_class(line, classes):
    """True when ``line`` starts with a synset id contained in ``classes``.

    Works for both LOC_train_solution.csv rows ("n01443537_10007,...") and
    LOC_synset_mapping.txt lines ("n01443537 goldfish, ...") because a
    synset id is always exactly 9 characters ('n' + 8 digits).
    """
    return line[:9] in classes
def copy_filtered_csv(src_path, dst_path, fn, classes, has_header, check_func):
    """Copy ``fn`` from ``src_path`` to ``dst_path``, keeping only lines for ``classes``."""
    # bind the class set into the predicate, then delegate the line copy
    keep_line = partial(check_func, classes=classes)
    copy_file_with_filter(src_path/fn, dst_path/fn, keep_line, has_header=has_header)
# (filename, has_header, per-line predicate) for each metadata file to filter.
text_files = [
    ('LOC_val_solution.csv', True, valid_is_desired_class),
    ('LOC_train_solution.csv', True, is_desired_class),
    # the synset mapping has no header; its lines start with the synset id
    ('LOC_synset_mapping.txt', False, is_desired_class)
]
# Write filtered copies of the metadata files into the small-dataset root.
for fn,has_header,func in text_files: copy_filtered_csv(IMGNET, IMGNET_SMALL, fn, pull_classes, has_header, func)