#default_exp data.external

#export
from fastai2.torch_basics import *

#export
class Config:
    "Setup config at `~/.fastai` unless it exists already."
    config_path = Path(os.getenv('FASTAI_HOME', '~/.fastai')).expanduser()
    config_file = config_path/'config.yml'

    def __init__(self):
        self.config_path.mkdir(parents=True, exist_ok=True)
        if not self.config_file.exists(): self.create_config()
        self.d = self.load_config()

    def __getitem__(self,k):
        k = k.lower()
        if k not in self.d: k = k+'_path'
        return Path(self.d[k])

    def __getattr__(self,k):
        if k=='d': raise AttributeError
        return self[k]

    def __setitem__(self,k,v): self.d[k] = str(v)
    def __contains__(self,k): return k in self.d

    def load_config(self):
        "load and return config if version equals 2 in existing, else create new config."
        with open(self.config_file, 'r') as f:
            config = yaml.safe_load(f)
            if 'version' in config and config['version'] == 2: return config
            elif 'version' in config: self.create_config(config)
            else: self.create_config()
        return self.load_config()

    def create_config(self, cfg=None):
        "create new config with default paths and set `version` to 2."
        config = {'data_path':    str(self.config_path/'data'),
                  'archive_path': str(self.config_path/'archive'),
                  'storage_path': '/tmp',
                  'model_path':   str(self.config_path/'models'),
                  'version':      2}
        if cfg is not None:
            cfg['version'] = 2
            config = merge(config, cfg)
        self.save_file(config)

    def save(self): self.save_file(self.d)
    def save_file(self, config):
        "save config file at default config location `~/.fastai/config.yml`."
        with self.config_file.open('w') as f: yaml.dump(config, f, default_flow_style=False)

config_file = Path("~/.fastai/config.yml").expanduser()
if config_file.exists(): os.remove(config_file)
assert not config_file.exists()

config = Config()
assert config_file.exists()
config.d

#hide
config = Config()
config_path = config.config_path
config_file,config_bak = config_path/'config.yml',config_path/'config.yml.bak'
config_file,config_bak

#hide
#This cell is just to make the config file compatible with current fastai
# TODO: make this a method that auto-runs as needed
if 'data_archive_path' not in config: config['data_archive_path'] = config.data_path
config.save()

if config_file.exists(): shutil.move(config_file, config_bak)
config['archive_path'] = Path(".")
config.save()

config = Config()
config.d

if config_bak.exists(): shutil.move(config_bak, config_file)
config = Config()
config.d
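#A minimal usage sketch (illustration only, relying on the default entries written by
#`create_config` above): a short key such as 'data' falls back to the matching
#'<key>_path' entry and is returned as a `Path`; attribute access delegates to the same lookup.
cfg = Config()
test_eq(cfg['data'], Path(cfg.d['data_path']))
test_eq(cfg.data, cfg['data'])    #`__getattr__` delegates to `__getitem__`
assert 'data_path' in cfg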
#export
class URLs():
    "Global constants for dataset and model URLs."
    LOCAL_PATH = Path.cwd()
    URL = 'http://files.fast.ai/data/examples/'
    MDL = 'http://files.fast.ai/models/'
    S3 = 'https://s3.amazonaws.com/fast-ai-'

    S3_IMAGE    = f'{S3}imageclas/'
    S3_IMAGELOC = f'{S3}imagelocal/'
    S3_AUDI     = f'{S3}audio/'
    S3_NLP      = f'{S3}nlp/'
    S3_COCO     = f'{S3}coco/'
    S3_MODEL    = f'{S3}modelzoo/'

    # main datasets
    ADULT_SAMPLE        = f'{URL}adult_sample.tgz'
    BIWI_SAMPLE         = f'{URL}biwi_sample.tgz'
    CIFAR               = f'{URL}cifar10.tgz'
    COCO_SAMPLE         = f'{S3_COCO}coco_sample.tgz'
    COCO_TINY           = f'{URL}coco_tiny.tgz'
    HUMAN_NUMBERS       = f'{URL}human_numbers.tgz'
    IMDB                = f'{S3_NLP}imdb.tgz'
    IMDB_SAMPLE         = f'{URL}imdb_sample.tgz'
    ML_SAMPLE           = f'{URL}movie_lens_sample.tgz'
    ML_100k             = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    MNIST_SAMPLE        = f'{URL}mnist_sample.tgz'
    MNIST_TINY          = f'{URL}mnist_tiny.tgz'
    MNIST_VAR_SIZE_TINY = f'{S3_IMAGE}mnist_var_size_tiny.tgz'
    PLANET_SAMPLE       = f'{URL}planet_sample.tgz'
    PLANET_TINY         = f'{URL}planet_tiny.tgz'
    IMAGENETTE          = f'{S3_IMAGE}imagenette2.tgz'
    IMAGENETTE_160      = f'{S3_IMAGE}imagenette2-160.tgz'
    IMAGENETTE_320      = f'{S3_IMAGE}imagenette2-320.tgz'
    IMAGEWOOF           = f'{S3_IMAGE}imagewoof2.tgz'
    IMAGEWOOF_160       = f'{S3_IMAGE}imagewoof2-160.tgz'
    IMAGEWOOF_320       = f'{S3_IMAGE}imagewoof2-320.tgz'
    IMAGEWANG           = f'{S3_IMAGE}imagewang.tgz'
    IMAGEWANG_160       = f'{S3_IMAGE}imagewang-160.tgz'
    IMAGEWANG_320       = f'{S3_IMAGE}imagewang-320.tgz'

    # kaggle competitions download dogs-vs-cats -p {DOGS.absolute()}
    DOGS = f'{URL}dogscats.tgz'

    # image classification datasets
    CALTECH_101  = f'{S3_IMAGE}caltech_101.tgz'
    CARS         = f'{S3_IMAGE}stanford-cars.tgz'
    CIFAR_100    = f'{S3_IMAGE}cifar100.tgz'
    CUB_200_2011 = f'{S3_IMAGE}CUB_200_2011.tgz'
    FLOWERS      = f'{S3_IMAGE}oxford-102-flowers.tgz'
    FOOD         = f'{S3_IMAGE}food-101.tgz'
    MNIST        = f'{S3_IMAGE}mnist_png.tgz'
    PETS         = f'{S3_IMAGE}oxford-iiit-pet.tgz'

    # NLP datasets
    AG_NEWS                 = f'{S3_NLP}ag_news_csv.tgz'
    AMAZON_REVIEWS          = f'{S3_NLP}amazon_review_full_csv.tgz'
    AMAZON_REVIEWS_POLARITY = f'{S3_NLP}amazon_review_polarity_csv.tgz'
    DBPEDIA                 = f'{S3_NLP}dbpedia_csv.tgz'
    MT_ENG_FRA              = f'{S3_NLP}giga-fren.tgz'
    SOGOU_NEWS              = f'{S3_NLP}sogou_news_csv.tgz'
    WIKITEXT                = f'{S3_NLP}wikitext-103.tgz'
    WIKITEXT_TINY           = f'{S3_NLP}wikitext-2.tgz'
    YAHOO_ANSWERS           = f'{S3_NLP}yahoo_answers_csv.tgz'
    YELP_REVIEWS            = f'{S3_NLP}yelp_review_full_csv.tgz'
    YELP_REVIEWS_POLARITY   = f'{S3_NLP}yelp_review_polarity_csv.tgz'

    # Image localization datasets
    BIWI_HEAD_POSE = f"{S3_IMAGELOC}biwi_head_pose.tgz"
    CAMVID         = f'{S3_IMAGELOC}camvid.tgz'
    CAMVID_TINY    = f'{URL}camvid_tiny.tgz'
    LSUN_BEDROOMS  = f'{S3_IMAGE}bedroom.tgz'
    PASCAL_2007    = f'{S3_IMAGELOC}pascal_2007.tgz'
    PASCAL_2012    = f'{S3_IMAGELOC}pascal_2012.tgz'

    # Audio classification datasets
    MACAQUES    = 'https://storage.googleapis.com/ml-animal-sounds-datasets/macaques.zip'
    ZEBRA_FINCH = 'https://storage.googleapis.com/ml-animal-sounds-datasets/zebra_finch.zip'

    # Medical Imaging datasets
    #SKIN_LESION = f'{S3_IMAGELOC}skin_lesion.tgz'
    SIIM_SMALL = f'{S3_IMAGELOC}siim_small.tgz'

    #Pretrained models
    OPENAI_TRANSFORMER = f'{S3_MODEL}transformer.tgz'
    WT103_FWD          = f'{S3_MODEL}wt103-fwd.tgz'
    WT103_BWD          = f'{S3_MODEL}wt103-bwd.tgz'

    def path(url='.', c_key='archive'):
        "Return local path where to download based on `c_key`"
        fname = url.split('/')[-1]
        local_path = URLs.LOCAL_PATH/('models' if c_key=='models' else 'data')/fname
        if local_path.exists(): return local_path
        return Config()[c_key]/fname

url = URLs.PETS
local_path = URLs.path(url)
test_eq(local_path.parent, Config()['archive'])
local_path

local_path = URLs.path(url, c_key='model')
test_eq(local_path.parent, Config()['model'])
local_path
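#Illustration only: `URLs.path` prefers an archive already present under the working
#directory (`URLs.LOCAL_PATH/'data'`, or `.../'models'` for `c_key='models'`) and
#otherwise falls back to the folder given by the config entry for `c_key`.
fname = URLs.MNIST_SAMPLE.split('/')[-1]
local = URLs.LOCAL_PATH/'data'/fname
expected = local if local.exists() else Config()['archive']/fname
test_eq(URLs.path(URLs.MNIST_SAMPLE), expected)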
#export
def download_url(url, dest, overwrite=False, pbar=None, show_progress=True, chunk_size=1024*1024,
                 timeout=4, retries=5):
    "Download `url` to `dest` unless it exists and not `overwrite`"
    if os.path.exists(dest) and not overwrite: return

    s = requests.Session()
    s.mount('http://',requests.adapters.HTTPAdapter(max_retries=retries))
    # additional line to identify as a firefox browser, see fastai/#2438
    s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'})
    u = s.get(url, stream=True, timeout=timeout)
    try: file_size = int(u.headers["Content-Length"])
    except: show_progress = False

    with open(dest, 'wb') as f:
        nbytes = 0
        if show_progress: pbar = progress_bar(range(file_size), leave=False, parent=pbar)
        try:
            if show_progress: pbar.update(0)
            for chunk in u.iter_content(chunk_size=chunk_size):
                nbytes += len(chunk)
                if show_progress: pbar.update(nbytes)
                f.write(chunk)
        except requests.exceptions.ConnectionError as e:
            fname = url.split('/')[-1]
            data_dir = dest.parent
            print(f'\n Download of {url} has failed after {retries} retries\n'
                  f' Fix the download manually:\n'
                  f'$ mkdir -p {data_dir}\n'
                  f'$ cd {data_dir}\n'
                  f'$ wget -c {url}\n'
                  f'$ tar xf {fname}\n'
                  f' And re-run your code once the download is successful\n')

fname = Path("./dog.jpg")
if fname.exists(): os.remove(fname)
url = "https://i.insider.com/569fdd9ac08a80bd448b7138?width=1100&format=jpeg&auto=webp"
download_url(url, fname)
assert fname.exists()

from PIL import Image
im = Image.open(fname)
plt.imshow(im);

#Check that an existing file is not downloaded (and overwritten) a second time
if fname.exists():
    last_modified_time = os.path.getmtime(fname)
    download_url(url, fname)
    test_eq(os.path.getmtime(fname), last_modified_time)

if fname.exists(): os.remove(fname)

#export
def download_data(url, fname=None, c_key='archive', force_download=False):
    "Download `url` to `fname`."
    fname = Path(fname or URLs.path(url, c_key=c_key))
    fname.parent.mkdir(parents=True, exist_ok=True)
    if not fname.exists() or force_download: download_url(url, fname, overwrite=force_download)
    return fname

#hide
try:
    test_eq(download_data(URLs.MNIST_SAMPLE), config.archive/'mnist_sample.tgz')
    test_eq(download_data(URLs.MNIST_TINY, fname=Path('mnist.tgz')), Path('mnist.tgz'))
finally: Path('mnist.tgz').unlink()

try:
    tst_model = config.model/'mnist_tiny.tgz'
    test_eq(download_data(URLs.MNIST_TINY, c_key='model'), tst_model)
    os.remove(tst_model)
finally:
    if tst_model.exists(): tst_model.unlink()

#hide
from nbdev.imports import Config as NbdevConfig
__file__ = NbdevConfig().lib_path/'data'/'external.py'

#export
def _get_check(url):
    "internal function to get the hash of the file at `url`."
    checks = json.load(open(Path(__file__).parent/'checks.txt', 'r'))
    return checks.get(url, None)

def _check_file(fname):
    "internal function to get the hash of the local file at `fname`."
    size = os.path.getsize(fname)
    with open(fname, "rb") as f: hash_nb = hashlib.md5(f.read(2**20)).hexdigest()
    return [size,hash_nb]

#hide
test_eq(_get_check(URLs.MNIST_SAMPLE), _check_file(URLs.path(URLs.MNIST_SAMPLE)))
_get_check(URLs.MNIST_SAMPLE), _check_file(URLs.path(URLs.MNIST_SAMPLE))

_get_check(URLs.PASCAL_2007),_get_check(URLs.PASCAL_2012)
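#A quick sketch (illustration only, uses a throwaway temporary file): `_check_file`
#hashes only the first 2**20 bytes, so the md5 part ignores changes past the first
#megabyte while the size part still catches them.
import os, tempfile
with tempfile.NamedTemporaryFile(delete=False) as tmp: tmp.write(bytes(2**20 + 10))
size1, hash1 = _check_file(tmp.name)
with open(tmp.name, 'ab') as f: f.write(b'tail')  #append past the hashed prefix
size2, hash2 = _check_file(tmp.name)
assert hash1 == hash2 and size1 != size2
os.remove(tmp.name)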
#export
def _add_check(url, fname):
    "Internal function to update the internal check file with `url` and check on `fname`."
    checks = json.load(open(Path(__file__).parent/'checks.txt', 'r'))
    checks[url] = _check_file(fname)
    json.dump(checks, open(Path(__file__).parent/'checks.txt', 'w'), indent=2)

#export
def file_extract(fname, dest=None):
    "Extract `fname` to `dest` using `tarfile` or `zipfile`."
    if dest is None: dest = Path(fname).parent
    fname = str(fname)
    if fname.endswith('gz'): tarfile.open(fname, 'r:gz').extractall(dest)
    elif fname.endswith('zip'): zipfile.ZipFile(fname).extractall(dest)
    else: raise Exception(f'Unrecognized archive: {fname}')

#export
def _try_from_storage(dest, storage):
    "Internal function: symlink the contents of `storage` into `dest` if `storage` exists."
    if not storage.exists(): return
    os.makedirs(dest, exist_ok=True)
    for f in storage.glob('*'): os.symlink(f, dest/f.name, target_is_directory=f.is_dir())

#hide
with tempfile.TemporaryDirectory() as d:
    with tempfile.TemporaryDirectory() as d2:
        d,d2 = Path(d),Path(d2)
        for k in ['a', 'b', 'c']: os.makedirs(d/k)
        for k in ['d', 'e', 'f']: (d/k).touch()
        _try_from_storage(d2, d)
        for k in ['a', 'b', 'c']:
            assert (d2/k).exists()
            assert (d2/k).is_dir()
        for k in ['d', 'e', 'f']:
            assert (d2/k).exists()
            assert (d2/k).is_file()

#export
def newest_folder(path):
    "Return the most recently created entry in `path`."
    list_of_paths = path.glob('*')
    return max(list_of_paths, key=lambda p: p.stat().st_ctime)

#export
def rename_extracted(dest):
    "Rename the most recently extracted file or folder to `dest` if its name differs."
    extracted = newest_folder(dest.parent)
    if not (extracted.name == dest.name): extracted.rename(dest)

#export
def untar_data(url, fname=None, dest=None, c_key='data', force_download=False, extract_func=file_extract):
    "Download `url` to `fname` if `dest` doesn't exist, and un-tgz or unzip to folder `dest`."
    default_dest = URLs.path(url, c_key=c_key).with_suffix('')
    dest = default_dest if dest is None else Path(dest)/default_dest.name
    fname = Path(fname or URLs.path(url))
    if fname.exists() and _get_check(url) and _check_file(fname) != _get_check(url):
        print("A new version of this dataset is available, downloading...")
        force_download = True
    if force_download:
        if fname.exists(): os.remove(fname)
        if dest.exists(): shutil.rmtree(dest)
    if not dest.exists(): _try_from_storage(dest, URLs.path(url, c_key='storage').with_suffix(''))
    if not dest.exists():
        fname = download_data(url, fname=fname, c_key=c_key)
        if _get_check(url) and _check_file(fname) != _get_check(url):
            print(f"File downloaded is broken. Remove {fname} and try again.")
        extract_func(fname, dest.parent)
        rename_extracted(dest)
    return dest
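#Illustration only (no side effects): before downloading, `untar_data` calls
#`_try_from_storage` with the path below, i.e. an already-extracted copy under the
#'storage' location from the config (default '/tmp' in `create_config`), and symlinks
#it into place when it exists.
URLs.path(URLs.MNIST_SAMPLE, c_key='storage').with_suffix('')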
from tempfile import TemporaryDirectory

test_eq(untar_data(URLs.MNIST_SAMPLE), config.data/'mnist_sample')

with TemporaryDirectory() as d:
    d = Path(d)
    dest = untar_data(URLs.MNIST_TINY, fname='mnist_tiny.tgz', dest=d, force_download=True)
    assert Path('mnist_tiny.tgz').exists()
    assert (d/'mnist_tiny').exists()
    os.unlink('mnist_tiny.tgz')

#Test c_key
tst_model = config.model/'mnist_sample'
test_eq(untar_data(URLs.MNIST_SAMPLE, c_key='model'), tst_model)
assert not tst_model.with_suffix('.tgz').exists() #Archive wasn't downloaded in the models path
assert (config.archive/'mnist_sample.tgz').exists() #Archive was downloaded there
shutil.rmtree(tst_model)

#test fname != dest
with TemporaryDirectory() as d:
    d = Path(d)
    untar_data(URLs.MNIST_TINY, fname='mnist_tiny.tgz', dest=d, force_download=True)
    Path('mnist_tiny.tgz').rename('nims_tini.tgz')
    p = Path('nims_tini.tgz')
    dest = Path('nims_tini')
    assert p.exists()
    file_extract(p, dest.parent)
    rename_extracted(dest)
    p.unlink()
    shutil.rmtree(dest)

#hide
#Check all URLs are in the checks.txt file and match for downloaded archives
_whitelist = "MDL LOCAL_PATH URL WT103_BWD WT103_FWD".split()
checks = json.load(open(Path(__file__).parent/'checks.txt', 'r'))

for d in dir(URLs):
    if d.upper() == d and not d.startswith("S3") and d not in _whitelist:
        url = getattr(URLs, d)
        assert url in checks,f"""{d} is not in the check file for all URLs.
To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):
url = URLs.{d}
untar_data(url, force_download=True)
_add_check(url, URLs.path(url))
"""
        f = URLs.path(url)
        if f.exists():
            assert checks[url] == _check_file(f),f"""The log we have for {d} in checks does not match the actual archive.
To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):
url = URLs.{d}
_add_check(url, URLs.path(url))
"""

#The commented cell referenced above: uncomment, fill in the dataset name, and run to add or refresh a checksum entry.
# url = URLs.<dataset name>
# untar_data(url, force_download=True)
# _add_check(url, URLs.path(url))

#hide
from nbdev.export import notebook2script
notebook2script()