# default_exp core

# export
from fastprogress.fastprogress import progress_bar
from fastcore.imports import *
from fastcore.basics import *
from fastcore.foundation import *
from fastcore.net import *
from fastcore.xtras import untar_dir
import hashlib
import shutil
from pprint import pformat

#hide
from nbdev.showdoc import show_doc
import tempfile
import fastdownload

dest = Path('tmp')
url = 'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz'

#hide
shutil.rmtree(dest, ignore_errors=True)
Path.BASE_PATH = Path.home()

#export
def download_url(url, dest=None):
    "Download `url` to `dest` and show progress; returns the result of `urlsave`"
    pbar = progress_bar([])

    def progress(count=1, bsize=1, tsize=None):
        # Report hook handed to `urlsave`: `count` blocks of `bsize` bytes so far, `tsize` total
        pbar.total = tsize
        pbar.update(count*bsize)

    return urlsave(url, dest, reporthook=progress)

dest.mkdir()
fpath = download_url(url, dest)
fpath

# export
def path_stats(fpath):
    "Return `(size, md5 hex digest)` of `fpath`; the digest covers only the first 1MB"
    size = os.path.getsize(fpath)
    # Just use first 1MB of file for performance
    with open(fpath, "rb") as f:
        hashed = hashlib.md5(f.read(2**20)).hexdigest()
    return size, hashed

path_stats(fpath)

#export
def checks_module(module):
    "Location of `download_checks.py` next to `module`, or an empty dict if no `module` is given"
    # The falsy `{}` placeholder is kept for backward compatibility; `read_checks` accepts it.
    if not module: return {}
    return Path(module.__file__).parent/'download_checks.py'

mod = checks_module(fastdownload)
mod

#export
def read_checks(fmod):
    "Evaluated contents of `download_checks.py`, or `{}` if `fmod` is unset, missing or empty"
    # `checks_module` returns a falsy `{}` when no module was supplied (the `FastDownload`
    # default); `Path` objects are always truthy, so `not fmod` only catches that placeholder.
    if not fmod or not fmod.exists(): return {}
    txt = fmod.read_text()
    # NOTE(review): `eval` executes whatever is in the checks file. That file is written by
    # `update_checks` via `pformat`, so it should be a plain dict literal -- confirm it can
    # never come from an untrusted source, or consider `ast.literal_eval`.
    return eval(txt) if txt else {}

#export
def check(fmod, url, fpath):
    "Check whether size and hash of `fpath` matches stored data for `url` or data is missing"
    checks = read_checks(fmod).get(url)
    # No stored entry for `url` counts as a pass
    return not checks or path_stats(fpath)==checks

# export
def update_checks(fpath, url, fmod):
    "Store the hash and size of `fpath` for `url` in `download_checks.py` (`fmod` must be a real path)"
    checks = read_checks(fmod)
    checks[url] = path_stats(fpath)
    # `pformat` writes a dict literal that `read_checks` can evaluate back
    fmod.write_text(pformat(checks))

if mod.exists(): mod.unlink()
update_checks(fpath, url, mod)
read_checks(mod)

#export
def download_and_check(url, fpath, fmod, force):
    """Download `url` to `fpath`, unless it already exists, passes `check`, and not `force`

    Raises `Exception` if the freshly downloaded file does not pass `check`."""
    if not force and fpath.exists():
        # An existing file that still matches the stored stats is reused as-is
        if check(fmod, url, fpath): return fpath
        print("Downloading a new version of this dataset...")
    res = download_url(url, fpath)
    if not check(fmod, url, fpath):
        raise Exception("Downloaded file is corrupt or not latest version")
    return res

# export
class FastDownload:
    "Download archives to a configured archive dir, verify them, and extract to a configured data dir"
    def __init__(self, cfg=None, base='~/.fastdownload', archive=None, data=None, module=None):
        base = Path(base).expanduser().absolute()
        default = {'data':(data or 'data'), 'archive':(archive or 'archive')}
        # A caller-supplied `cfg` is used as-is; otherwise a `Config` is built under `base`
        self.cfg = Config(base, 'config.ini', create=default) if cfg is None else cfg
        self.module = checks_module(module)
        # Explicit `data`/`archive` arguments override whatever the config holds
        if data is not None: self.cfg['data'] = data
        if archive is not None: self.cfg['archive'] = archive

    def arch_path(self):
        "Path to archives"
        return self.cfg.path('archive')

    def data_path(self, extract_key='data'):
        "Path to extracted data"
        return self.cfg.path(extract_key)

    def check(self, url, fpath):
        "Check whether size and hash of `fpath` matches stored data for `url` or data is missing"
        # Inside a method the bare name `check` resolves to the module-level function
        return check(self.module, url, fpath)

    def download(self, url, force=False):
        "Download `url` to archive path, unless it exists, passes `self.check`, and not `force`"
        self.arch_path().mkdir(exist_ok=True, parents=True)
        return download_and_check(url, urldest(url, self.arch_path()), self.module, force)

    def rm(self, url, rm_arch=True, rm_data=True, extract_key='data'):
        "Delete downloaded archive and extracted data for `url`"
        arch = urldest(url, self.arch_path())
        if rm_arch: arch.delete()
        if rm_data:
            dest = self.data_path(extract_key)
            # Extracted dir is named after the archive stem with any trailing `.tar` removed
            (dest/remove_suffix(arch.stem, '.tar')).delete()

    def update(self, url):
        "Store the hash and size in `download_checks.py`"
        update_checks(urldest(url, self.arch_path()), url, self.module)

    def extract(self, url, extract_key='data', force=False):
        "Extract archive already downloaded from `url`, overwriting existing if `force`"
        arch = urldest(url, self.arch_path())
        if not arch.exists(): raise Exception(f'{arch} does not exist')
        dest = self.data_path(extract_key)
        dest.mkdir(exist_ok=True, parents=True)
        return untar_dir(arch, dest, rename=True, overwrite=force)

    def get(self, url, extract_key='data', force=False):
        "Download and extract `url`, overwriting existing if `force`"
        self.download(url, force=force)
        return self.extract(url, extract_key=extract_key, force=force)

d = FastDownload(module=fastdownload)
d.module
d.cfg.config_file
print(d.cfg.config_file.read_text())
show_doc(FastDownload.download)
if d.module.exists(): d.module.unlink()
arch = d.download(url)
arch
show_doc(FastDownload.update)
d.update(url)
eval(d.module.read_text())
d.download(url)
show_doc(FastDownload.extract)
extr = d.extract(url, force=True)
extr
extr.ls()
d.cfg['model_path'] = 'models'
d.extract(url, extract_key='model_path')
show_doc(FastDownload.rm)
d.rm(url)
extr.exists(),arch.exists()
show_doc(FastDownload.get)
res = d.get(url)
res,extr.exists()
res = d.get(url, extract_key='model_path')
res,res.exists()

#hide
from nbdev.export import notebook2script
notebook2script()