#!/usr/bin/env python
# coding: utf-8

# ### Downloads and resizes imagenet
# 1. Create spot instance
# 2. Mount EFS
# 3. Download imagenet from kaggle and untar
# 4. Resize images to 80, 160, 320, 375
#
# NOTE: this is a notebook export; it relies on `get_ipython()` and therefore
# has to be run under IPython/Jupyter, not plain `python`.

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:


# Imported explicitly (and before the star import, so any `Path` exported by
# aws_setup still takes precedence) instead of relying on the star import alone.
from pathlib import Path

from aws_setup import *


# #### Define parameters

# In[3]:


vpc_name = 'fast-ai'


# #### Get Existing VPC by tag name

# In[4]:


vpc = get_vpc(vpc_name)
vpc


# #### Create EFS (if you haven't already)

# In[6]:


efs_tag = f'{vpc_name}-efs'


# In[ ]:


efs = create_efs(efs_tag, vpc, performance_mode='maxIO')


# #### Request Spot instance

# In[7]:


instance_name = f'{vpc_name}-instance'
# Recommend a high compute instance as we need to do multi-threaded resizing later on
instance_type = 'c5.4xlarge'


# In[8]:


# Bid a multiple of the current spot price for this instance type.
bid_multiplier = 3
spot_price = get_spot_prices()[instance_type]
bid_price = f'{float(spot_price) * bid_multiplier:.4f}'
print(f'Spot price: {spot_price}, Bid price: {bid_price}')


# In[9]:


launch_specs = LaunchSpecs(vpc, instance_type=instance_type).build()


# In[10]:


# Enlarge the first block device to 1000 (GiB in the EC2 API) so the dataset
# and its resized copies have room -- presumably this is the root volume;
# confirm against LaunchSpecs.
launch_specs['BlockDeviceMappings'][0]['Ebs']['VolumeSize'] = 1000


# In[11]:


launch_specs


# In[12]:


instance = create_spot_instance(instance_name, launch_specs, spot_price=bid_price)
instance


# In[ ]:


# To reuse an instance from an earlier run (presumably looked up by name),
# uncomment the next line instead of creating a new one above:
# instance = get_instance(instance_name); instance
get_ssh_command(instance)


# ### SSH

# In[35]:


client = connect_to_instance(instance)


# #### Mount EFS

# In[16]:


# Look the address up via efs_tag (not a hard-coded name) so it always matches
# the EFS created above, even if vpc_name changes.
efs_addr = get_efs_address(efs_tag)
efs_addr


# In[17]:


# -p: do not fail if the mount point already exists (e.g. when re-running this cell).
_ = run_command(client, 'mkdir -p ~/efs_mount')


# In[18]:


nfs_opts = 'nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2'
efs_mount_cmd = f'sudo mount -t nfs -o {nfs_opts} {efs_addr}:/ ~/efs_mount'
_ = run_command(client, efs_mount_cmd)


# In[34]:


_ = run_command(client, 'ls efs_mount')  # no reformatting


# ## Tmux

# In[37]:


# Long-running commands below are sent through this session -- presumably a
# remote tmux session, so they survive an SSH disconnect; confirm in aws_setup.
tsess = TmuxSession(client, 'sess')


# ### Download dataset from kaggle

# In[19]:


# -p: do not fail if the directory already exists.
_ = run_command(client, 'mkdir -p ~/.kaggle')


# In[21]:


# Copy the local Kaggle API credentials to the instance.
kaggle_file = Path.home()/'.kaggle/kaggle.json'
upload_file(client, str(kaggle_file), '.kaggle/kaggle.json')


# In[32]:


download_kaggle_file = Path.cwd()/'upload_scripts/download_kaggle_imagenet.sh'
upload_file(client, str(download_kaggle_file), 'download_kaggle_imagenet.sh')


# In[33]:


tsess.run_cmd('bash download_kaggle_imagenet.sh')


# ### Upload image resize

# In[40]:


# imagenet_formatting.sh uses this for multithreaded resizing
# resize_images.py methods are taken from fast.ai dataset.py
upload_path = Path.cwd()/'upload_scripts/resize_images.py'
upload_file(client, str(upload_path), 'resize_images.py')


# In[47]:


# creates sizes 80, 160, 320, 375 and stores files in EFS
upload_path = Path.cwd()/'upload_scripts/imagenet_formatting.sh'
upload_file(client, str(upload_path), 'imagenet_formatting.sh')


# In[ ]:


tsess.run_cmd('bash imagenet_formatting.sh')