%matplotlib inline
%reload_ext autoreload
%autoreload 2
import argparse
import os
import shutil
import time
from fastai.transforms import *
from fastai.dataset import *
from fastai.fp16 import *
from fastai.conv_learner import *
from pathlib import *
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import models
import models.cifar10 as cifar10models
from distributed import DistributedDataParallel as DDP
# print(models.cifar10.__dict__)
# Discover the lowercase, callable model factories exported by each package;
# cifar10-specific names are listed first.
def _model_factory_names(module):
    # A "factory" here is any public, lowercase callable in the module namespace.
    return sorted(name for name, obj in module.__dict__.items()
                  if name.islower() and not name.startswith("__") and callable(obj))

cifar10_names = _model_factory_names(cifar10models)
model_names = cifar10_names + _model_factory_names(models)
# print(model_names)
# Example usage: python run_fastai.py /home/paperspace/ILSVRC/Data/CLS-LOC/ -a resnext_50_32x4d --epochs 1 -j 4 -b 64 --fp16
parser = argparse.ArgumentParser(description='PyTorch Cifar10 Training')
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('--save-dir', type=str, default=Path.home()/'imagenet_training',
                    help='Directory to save logs and models.')
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet56',
                    choices=model_names,
                    help='model architecture: ' +
                         ' | '.join(model_names) +
                         ' (default: resnet56)')
# Help text below is kept in sync with the actual defaults (was "(default: 4)").
parser.add_argument('-j', '--workers', default=7, type=int, metavar='N',
                    help='number of data loading workers (default: 7)')
parser.add_argument('--epochs', default=1, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--cycle-len', default=95, type=float, metavar='N',
                    help='Length of cycle to run')
# Was "(default: 256)" while the default is actually 512.
parser.add_argument('-b', '--batch-size', default=512, type=int,
                    metavar='N', help='mini-batch size (default: 512)')
parser.add_argument('--lr', '--learning-rate', default=0.8, type=float,
                    metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model')
parser.add_argument('--fp16', action='store_true', help='Run model fp16 mode.')
# Bug fix: `type=bool` treats any non-empty string (including "False") as True;
# parse the string explicitly so "--use-tta false" actually disables TTA.
parser.add_argument('--use-tta', default=True,
                    type=lambda s: s.lower() in ('1', 'true', 'yes'),
                    help='Validate model with TTA at the end of training.')
parser.add_argument('--train-half', action='store_true', help='Train model on half images. TODO: allow custom epochs and LR')
parser.add_argument('--sz', default=32, type=int, help='Size of transformed image.')
parser.add_argument('--use-clr', default='10,13.68,0.95,0.85', type=str,
                    help='div,pct,max_mom,min_mom. Pass in a string delimited by commas. Ex: "20,2,0.95,0.85"')
parser.add_argument('--loss-scale', type=float, default=128,
                    help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--prof', dest='prof', action='store_true', help='Only run a few iters for profiling.')
parser.add_argument('--dist-url', default='file://sync.file', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend')
parser.add_argument('--world-size', default=1, type=int,
                    help='Number of GPUs to use. Can either be manually set ' +
                         'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
                    help='Used for multi-process training. Can either be manually set ' +
                         'or automatically set by using \'python -m multiproc\'.')
class TorchModelData(ModelData):
    """fastai ModelData extended with an optional augmentation (TTA) loader."""
    def __init__(self, path, trn_dl, val_dl, aug_dl=None):
        super().__init__(path, trn_dl, val_dl)
        # Extra loader that applies train-time transforms to the val set,
        # used for test-time augmentation at the end of training.
        self.aug_dl = aug_dl
def torch_loader(data_path, size):
    """Build prefetching train/val/aug DataLoaders for a cifar10-style folder.

    Expects `data_path` to contain `train/` and `test/` ImageFolder trees.
    Reads batch size, worker count, distributed flag and `prof` from the
    module-level `args`. NOTE(review): `size` is currently unused — the
    resize/crop transforms are commented out upstream; confirm intent.

    Returns (TorchModelData, train_sampler).
    """
    train_dir = os.path.join(data_path, 'train')
    val_dir = os.path.join(data_path, 'test')
    # Per-channel cifar10 statistics.
    normalize = transforms.Normalize(mean=[0.4914 , 0.48216, 0.44653],
                                     std=[0.24703, 0.24349, 0.26159])
    train_tfms = transforms.Compose([
        transforms.ColorJitter(.3, .3, .3),
        transforms.RandomRotation(3),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    val_tfms = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    train_dataset = datasets.ImageFolder(train_dir, train_tfms)
    train_sampler = (torch.utils.data.distributed.DistributedSampler(train_dataset)
                     if args.distributed else None)

    def _prefetching_loader(dataset, shuffle, sampler=None):
        # Shared DataLoader settings; DataPrefetcher overlaps the
        # host->device copy of the next batch with compute.
        return DataPrefetcher(torch.utils.data.DataLoader(
            dataset, batch_size=args.batch_size, shuffle=shuffle,
            num_workers=args.workers, pin_memory=True, sampler=sampler))

    train_loader = _prefetching_loader(train_dataset, train_sampler is None, train_sampler)
    val_loader = _prefetching_loader(datasets.ImageFolder(val_dir, val_tfms), False)
    # TTA loader: validation images with the *training* transforms.
    aug_loader = _prefetching_loader(datasets.ImageFolder(val_dir, train_tfms), False)

    if args.prof:
        # Profiling run: cap training iterations, skip validation.
        train_loader.stop_after = 200
        val_loader.stop_after = 0

    return TorchModelData(data_path, train_loader, val_loader, aug_loader), train_sampler
# Seems to speed up training by ~2%
class DataPrefetcher():
    """Wraps a DataLoader and stages the next batch on the GPU ahead of time.

    Uses a dedicated CUDA stream so the host->device copy of batch N+1
    overlaps with compute on batch N. Set `stop_after` to cap how many
    batches are yielded (used for profiling).
    """
    def __init__(self, loader, stop_after=None):
        self.loader = loader
        self.dataset = loader.dataset
        self.stream = torch.cuda.Stream()
        self.stop_after = stop_after
        self.next_input = None
        self.next_target = None

    def __len__(self):
        return len(self.loader)

    def preload(self):
        """Fetch the next batch and kick off its async copy to the GPU."""
        try:
            self.next_input, self.next_target = next(self.loaditer)
        except StopIteration:
            # Loader exhausted; signal __iter__ to stop.
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            # Bug fix: `async=True` is a syntax error on Python >= 3.7
            # (`async` became a keyword); PyTorch renamed the kwarg to
            # `non_blocking` in 0.4. Behavior is otherwise identical.
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def __iter__(self):
        count = 0
        self.loaditer = iter(self.loader)
        self.preload()
        while self.next_input is not None:
            # Make sure the staged copy has finished before handing it out.
            torch.cuda.current_stream().wait_stream(self.stream)
            batch_input = self.next_input
            batch_target = self.next_target
            self.preload()
            count += 1
            yield batch_input, batch_target
            if type(self.stop_after) is int and (count > self.stop_after):
                break
def top5(output, target, k=5):
    """Fraction of samples whose true class is within the top-k predictions.

    Args:
        output: (batch, classes) score tensor.
        target: (batch,) tensor of true class indices.
        k: how many top predictions count as a hit (default 5, preserving
           the original hard-coded behavior).

    Returns a 1-element FloatTensor with the top-k accuracy.
    """
    batch_size = target.size(0)
    _, pred = output.topk(k, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    # reshape (not view): the transposed slice is non-contiguous, and
    # view() raises on non-contiguous tensors in modern PyTorch.
    correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
    return correct_k.mul_(1.0 / batch_size)
class ImagenetLoggingCallback(Callback):
    """fastai Callback that appends timestamped training metrics to a file.

    Writes one line per epoch and one line every `print_every` batches.
    The file is opened line-buffered so log lines survive a crash.
    """
    def __init__(self, save_path, print_every=50):
        super().__init__()
        # Destination log file and batch-logging interval.
        self.save_path=save_path
        self.print_every=print_every
    def on_train_begin(self):
        self.batch = 0
        self.epoch = 0
        # buffering=1 -> line-buffered append, flushed after every line.
        self.f = open(self.save_path, "a", 1)
        self.log("\ton_train_begin")
    def on_epoch_end(self, metrics):
        # `metrics` is fastai's per-epoch validation metric list; the labels
        # assume the order val_loss, acc, top5 — TODO confirm it matches
        # whatever learner.metrics is set to for this run.
        log_str = f'\tEpoch:{self.epoch}\ttrn_loss:{self.last_loss}'
        for (k,v) in zip(['val_loss', 'acc', 'top5', ''], metrics): log_str += f'\t{k}:{v}'
        self.log(log_str)
        self.epoch += 1
    def on_batch_end(self, metrics):
        # fastai passes the running training loss here; remembered for
        # the epoch summary above.
        self.last_loss = metrics
        self.batch += 1
        if self.batch % self.print_every == 0:
            self.log(f'Epoch: {self.epoch} Batch: {self.batch} Metrics: {metrics}')
    def on_train_end(self):
        self.log("\ton_train_end")
        self.f.close()
    def log(self, string):
        # Prefix every line with an ISO-8601-style timestamp.
        self.f.write(time.strftime("%Y-%m-%dT%H:%M:%S")+"\t"+string+"\n")
# Logging + saving models
def save_args(name, save_dir):
    """Build fit() kwargs enabling checkpointing and file logging.

    Only the rank-0 process (with a save dir configured) logs; every
    other process gets an empty kwargs dict.
    """
    if args.rank != 0 or not args.save_dir:
        return {}
    log_dir = f'{save_dir}/training_logs'
    os.makedirs(log_dir, exist_ok=True)
    logging_cb = ImagenetLoggingCallback(f'{log_dir}/{name}_log.txt')
    return dict(best_save_name=f'{name}_best_model',
                cycle_save_name=f'{name}',
                callbacks=[logging_cb])
def save_sched(sched, save_dir):
    """Save loss/LR plots from a finished scheduler (rank-0 only)."""
    if args.rank != 0 or not args.save_dir:
        return {}
    # Plots land next to the text logs.
    sched.save_path = f'{save_dir}/training_logs'
    sched.plot_loss()
    sched.plot_lr()
def update_model_dir(learner, base_dir):
    """Point the learner's tmp and model dirs under base_dir, creating both."""
    for attr, subdir in (('tmp_path', 'tmp'), ('models_path', 'models')):
        path = f'{base_dir}/{subdir}'
        os.makedirs(path, exist_ok=True)
        setattr(learner, attr, path)
# Argument vector for this in-notebook run (stands in for CLI argv);
# commented entries document other knobs that were tried.
args_input = [
    '/home/paperspace/imagenet-fast/fp16/data/cifar10',
    '--save-dir', '/home/paperspace/data/cifar_training/test1',
    # '-a', 'resnet56',
    # '-j', '6',
    # '--prof',
    # '-b', '512',
    # '--sz', '32',
    # '--loss-scale', '128',
    '--fp16',
    # '--cycle-len', '95',
    # '--epochs', '1',
    # '--use-clr', '10,13.68,0.95,0.85',
    '--wd', '2e-4',
    '--lr', '1',
    # '--train-half' # With fp16, iterations are so fast this doesn't matter
]
# This is important for speed: cudnn autotunes conv algorithms for the
# fixed input sizes used here.
cudnn.benchmark = True
# (removed a stray module-level `global arg` no-op and pasted cell outputs
# that were not valid Python)
args = parser.parse_args(args_input)
# cycle_len > 1 means whole epochs; fractional values (profiling) stay floats.
if args.cycle_len > 1:
    args.cycle_len = int(args.cycle_len)
args.distributed = args.world_size > 1
args.gpu = 0
if args.distributed:
    args.gpu = args.rank % torch.cuda.device_count()
    torch.cuda.set_device(args.gpu)
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                            world_size=args.world_size)
if args.fp16:
    assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

# Create model: cifar10-specific architectures take priority over the
# generic model zoo when the name exists in both.
model = cifar10models.__dict__[args.arch] if args.arch in cifar10_names else models.__dict__[args.arch]
if args.pretrained:
    print("=> using pre-trained model '{}'".format(args.arch))
    model = model(pretrained=True)
else:
    print("=> creating model '{}'".format(args.arch))
    model = model()
model = model.cuda()
if args.distributed:
    model = DDP(model)

# --train-half warms up on 16px images; otherwise use the configured size.
if args.train_half:
    data, train_sampler = torch_loader(args.data, 16)
else:
    data, train_sampler = torch_loader(args.data, args.sz)

learner = Learner.from_model_data(model, data)
learner.crit = F.cross_entropy
learner.metrics = [accuracy]
if args.fp16:
    learner.half()
if args.prof:
    # Profiling: one tiny cycle.
    args.epochs = 1
    args.cycle_len = .01
if args.use_clr:
    # "div,pct,max_mom,min_mom" string -> float tuple for use_clr_beta.
    args.use_clr = tuple(map(float, args.use_clr.split(',')))

# Visual sanity check of one training batch.
x, y = next(iter(data.trn_dl))
plt.imshow(np.transpose(x[50], (1, 2, 0)))
# %pdb off
# Optional warm-up phase on half-size images before the full-size run.
if args.train_half:
    save_dir = args.save_dir + '/128'
    update_model_dir(learner, save_dir)
    sargs = save_args('first_run_128', save_dir)
    learner.fit(args.lr, args.epochs, cycle_len=45,
                # NOTE(review): was `train_sampler=`; the other fit calls in
                # this file pass `sampler=` — made consistent, confirm against
                # the fastai fork's fit() signature.
                sampler=train_sampler,
                wds=args.weight_decay,
                use_clr_beta=args.use_clr,
                loss_scale=args.loss_scale,
                **sargs
                )
    save_sched(learner.sched, save_dir)
    # Bug fix: was `torch(args.data, args.sz)` — that calls the torch MODULE
    # (TypeError at runtime); the intent is to rebuild full-size loaders.
    data, train_sampler = torch_loader(args.data, args.sz)
    learner.set_data(data)

# Full size training run.
update_model_dir(learner, args.save_dir)
sargs = save_args('first_run', args.save_dir)
learner.fit(args.lr, args.epochs, cycle_len=args.cycle_len,
            sampler=train_sampler,
            wds=args.weight_decay,
            use_clr_beta=args.use_clr,
            loss_scale=args.loss_scale,
            **sargs
            )
save_sched(learner.sched, args.save_dir)
print('Finished!')
HBox(children=(IntProgress(value=0, description='Epoch', max=95), HTML(value='')))
epoch trn_loss val_loss accuracy
0 1.964649 2.29585 0.2413
1 1.629196 1.670711 0.4153
2 1.279893 1.212244 0.5794
3 1.022324 1.081357 0.621
4 0.87207 0.947416 0.6629
5 0.7623 1.424813 0.5834
6 0.711834 0.880024 0.6867
7 0.673645 0.882425 0.7036
8 0.624824 0.75577 0.7406
9 0.591317 0.977343 0.6912
10 0.576992 0.738988 0.7422
11 0.54849 0.760248 0.7389
12 0.537716 0.806278 0.7491
13 0.521323 0.857931 0.7177
14 0.520798 0.709312 0.7678
15 0.501003 1.247033 0.6533
16 0.488959 0.970011 0.7197
17 0.474377 0.710652 0.7681
18 0.476366 0.686354 0.7693
19 0.480246 1.048503 0.6718
20 0.460859 0.705917 0.7693
21 0.4575 0.928597 0.7142
22 0.451474 1.079432 0.6969
23 0.457522 0.714242 0.7713
24 0.454815 0.958381 0.7166
25 0.454017 0.721306 0.7626
26 0.440028 0.671276 0.783
27 0.432509 0.704762 0.7709
28 0.434946 0.848601 0.7411
29 0.436465 0.878166 0.7304
30 0.427199 0.884603 0.7291
31 0.434507 1.089957 0.6863
32 0.417676 0.861309 0.7223
33 0.421785 0.723549 0.7686
34 0.422014 0.76699 0.7494
35 0.414088 0.737529 0.7734
36 0.42199 0.867438 0.7532
37 0.410439 0.731212 0.7665
38 0.419615 0.893294 0.722
39 0.410767 0.783612 0.7611
40 0.417583 0.834628 0.7361
41 0.412891 0.63415 0.7936
42 0.409541 0.940022 0.7131
43 0.406647 0.683275 0.7813
44 0.404456 0.787411 0.7438
45 0.396082 0.752332 0.7505
46 0.393867 0.762795 0.7638
47 0.399886 0.699477 0.7797
48 0.39515 0.909923 0.71
49 0.393156 0.672227 0.7759
50 0.378291 0.74518 0.7671
51 0.379188 1.026341 0.7154
52 0.384426 0.720828 0.7656
53 0.376072 0.84426 0.7349
54 0.367509 0.724659 0.771
55 0.376669 0.787541 0.7382
56 0.366169 0.679759 0.7823
57 0.366301 0.766295 0.7787
58 0.347275 0.569886 0.8152
59 0.360935 0.670394 0.7923
60 0.347934 0.603666 0.8008
61 0.336586 0.581869 0.8179
62 0.342905 0.801929 0.7625
63 0.343652 0.529446 0.8312
64 0.335459 0.678714 0.7988
65 0.329838 0.817373 0.7575
66 0.322143 0.718153 0.7865
67 0.324445 0.632962 0.7979
68 0.312053 0.57795 0.8145
69 0.301993 0.502879 0.8408
70 0.30651 0.496823 0.8404
71 0.29968 0.694079 0.7911
72 0.289324 0.570044 0.8077
73 0.284152 0.791094 0.758
74 0.27619 0.637103 0.8159
75 0.262262 0.5558 0.823
76 0.250873 0.483756 0.8489
77 0.231588 0.470876 0.8502
78 0.226033 0.506693 0.8422
79 0.208242 0.403554 0.8717
80 0.177001 0.499644 0.8528
81 0.171563 0.455609 0.8591
82 0.146025 0.644057 0.8225
83 0.13099 0.407798 0.8785
84 0.127202 0.411514 0.879
85 0.109014 0.399722 0.8855
86 0.102309 0.395437 0.8841
87 0.082008 0.409852 0.8869
88 0.072423 0.396025 0.8919
89 0.059635 0.389407 0.8953
90 0.044886 0.398666 0.8958
91 0.036124 0.377657 0.9034
92 0.027008 0.373316 0.9056
93 0.02094 0.370854 0.9058
94 0.018163 0.369598 0.9068
Finished!
learner.save('cifar10-wd4e4-lr1')
learner.sched.plot()

# Quick LR range test before fine-tuning (pasted notebook progress-bar
# output removed — it was not valid Python).
learner.lr_find()
learner.sched.plot()

# Fine-tune at a tiny LR for a few epochs.
learner.fit(1e-5, 1, cycle_len=15,
            sampler=train_sampler,
            wds=args.weight_decay,
            loss_scale=args.loss_scale,
            **sargs
            )
HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))
epoch trn_loss val_loss accuracy
0 0.006505 0.322454 0.911
1 0.006266 0.323212 0.9128
2 0.007029 0.323482 0.9125
3 0.006095 0.321533 0.9118
4 0.005952 0.323129 0.9117
5 0.005498 0.322257 0.9112
6 0.005841 0.323728 0.9117
7 0.005956 0.322007 0.9119
8 0.006588 0.321131 0.9119
64%|██████▍ | 63/98 [00:05<00:02, 12.17it/s, loss=0.00652]
Process Process-1468:
Process Process-1465:
Process Process-1467:
Process Process-1470:
Process Process-1469:
Process Process-1464:
Process Process-1466:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
Traceback (most recent call last):
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
Traceback (most recent call last):
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 50, in _worker_loop
r = index_queue.get()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 50, in _worker_loop
r = index_queue.get()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
return self._semlock.__enter__()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 55, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 55, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 50, in _worker_loop
r = index_queue.get()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 50, in _worker_loop
r = index_queue.get()
KeyboardInterrupt
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 55, in <listcomp>
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 55, in <listcomp>
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
return self._semlock.__enter__()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/queues.py", line 335, in get
res = self._reader.recv_bytes()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/datasets/folder.py", line 124, in __getitem__
img = self.transform(img)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/datasets/folder.py", line 124, in __getitem__
img = self.transform(img)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 50, in _worker_loop
r = index_queue.get()
KeyboardInterrupt
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 42, in __call__
img = t(img)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 42, in __call__
img = t(img)
KeyboardInterrupt
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 61, in __call__
return F.to_tensor(pic)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/queues.py", line 334, in get
with self._rlock:
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/transforms.py", line 61, in __call__
return F.to_tensor(pic)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/functional.py", line 71, in to_tensor
img = img.view(pic.size[1], pic.size[0], nchannel)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torchvision/transforms/functional.py", line 63, in to_tensor
img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
return self._semlock.__enter__()
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/PIL/Image.py", line 721, in tobytes
e = _getencoder(self.mode, encoder_name, args)
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.
KeyboardInterrupt
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/PIL/Image.py", line 453, in _getencoder
encoder = getattr(core, encoder_name + "_encoder")
KeyboardInterrupt
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
Traceback (most recent call last):
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-20-c99c0c2bcae9>", line 5, in <module>
**sargs
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/fastai/learner.py", line 251, in fit
return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/fastai/learner.py", line 198, in fit_gen
metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16, **kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/fastai/model.py", line 115, in fit
loss = stepper.step(V(x),V(y), epoch)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/fastai/model.py", line 47, in step
output = self.m(*xs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/fastai/fp16.py", line 11, in forward
return self.module(input.half())
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/paperspace/imagenet-fast/cifar10/models/cifar10/clr_resnet.py", line 48, in forward
out = self.layer2(out)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/container.py", line 67, in forward
input = module(input)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/paperspace/imagenet-fast/cifar10/models/cifar10/clr_resnet.py", line 24, in forward
out = self.bn2(self.conv2(F.relu(self.bn1(out))))
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/nn/functional.py", line 583, in relu
return threshold(input, 0, 0, inplace)
KeyboardInterrupt
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 1863, in showtraceback
stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/ultratb.py", line 1095, in get_records
return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/ultratb.py", line 311, in wrapped
return f(*args, **kwargs)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/IPython/core/ultratb.py", line 345, in _fixed_getinnerframes
records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/inspect.py", line 1483, in getinnerframes
frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/inspect.py", line 1441, in getframeinfo
filename = getsourcefile(frame) or getfile(frame)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/inspect.py", line 696, in getsourcefile
if getattr(getmodule(object, filename), '__loader__', None) is not None:
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/inspect.py", line 742, in getmodule
os.path.realpath(f)] = module.__name__
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/posixpath.py", line 388, in realpath
path, ok = _joinrealpath(filename[:0], filename, {})
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/posixpath.py", line 422, in _joinrealpath
if not islink(newpath):
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/posixpath.py", line 171, in islink
st = os.lstat(path)
File "/home/paperspace/anaconda3/envs/fastai/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 175, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 17315) exited unexpectedly with exit code 1.
KeyboardInterrupt
---------------------------------------------------------------------------
64%|██████▍ | 63/98 [00:19<00:11, 3.15it/s, loss=0.00652]
if args.use_tta:
    # Test-time augmentation: average class probabilities over augmented
    # copies of the validation set, then score accuracy.
    log_preds, y = learner.TTA()
    preds = np.mean(np.exp(log_preds), 0)
    acc = accuracy(torch.FloatTensor(preds), torch.LongTensor(y))
    print('TTA acc:', acc)
    # Append the result to a persistent, line-buffered log.
    # (fixed typo in the logged message: "accuracty" -> "accuracy")
    with open(args.save_dir + '/tta_accuracy.txt', "a", 1) as f:
        f.write(time.strftime("%Y-%m-%dT%H:%M:%S") + f"\tTTA accuracy: {acc}\n")
TTA acc: 0.9226