# default_exp core
# export
from fastprogress.fastprogress import progress_bar
from fastcore.imports import *
from fastcore.basics import *
from fastcore.foundation import *
from fastcore.net import *
from fastcore.xtras import untar_dir
import hashlib,shutil
from pprint import pformat
#hide
from nbdev.showdoc import show_doc
import tempfile,fastdownload
These helper functions provide the functionality that FastDownload relies on. Most users should use FastDownload rather than calling these helpers directly.
dest = Path('tmp')
url = 'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz'
#hide
shutil.rmtree(dest, ignore_errors=True)
Path.BASE_PATH = Path.home()
#export
def download_url(url, dest=None):
    "Save `url` to `dest` via `urlsave`, reporting progress on a `progress_bar`"
    bar = progress_bar([])
    def _report(count=1, bsize=1, tsize=None):
        # Passed to `urlsave` as `reporthook`: refresh the total, then update the bar with `count*bsize`
        bar.total = tsize
        bar.update(count*bsize)
    return urlsave(url, dest, reporthook=_report)
dest.mkdir()
fpath = download_url(url, dest)
fpath
Path('tmp/mnist_tiny.tgz')
# export
def path_stats(fpath):
    "Return `(size, hash)` for `fpath`: its size in bytes and the MD5 hex digest of its first 1MB"
    n_bytes = os.path.getsize(fpath)
    # Only the first 1MB is hashed, to keep this fast on large files
    with open(fpath, "rb") as fh:
        head = fh.read(2**20)
    return n_bytes, hashlib.md5(head).hexdigest()
path_stats(fpath)
(342207, '56143e8f24db90d925d82a5a74141875')
#export
def checks_module(module):
    "Path of the `download_checks.py` file next to `module`, or `{}` if `module` is falsy"
    if module:
        module_dir = Path(module.__file__).parent
        return module_dir/'download_checks.py'
    return {}
The download_checks.py file containing sizes and hashes will be located next to module:
mod = checks_module(fastdownload)
mod
Path('git/fastdownload/fastdownload/download_checks.py')
#export
def read_checks(fmod):
    "Evaluated contents of the checks file `fmod`, or `{}` if there is no usable checks file"
    # `checks_module` returns `{}` (not a `Path`) when no module is given, so a falsy
    # `fmod` means "no checks file" and must be handled before calling `Path` methods.
    if not fmod or not fmod.exists(): return {}
    # A whitespace-only file is treated like an empty one; `eval` would raise `SyntaxError` on it
    txt = fmod.read_text().strip()
    # NOTE(security): `eval` runs whatever expression the file contains, so the checks
    # file must be trusted (it normally sits next to the package's own source).
    return eval(txt) if txt else {}
#export
def check(fmod, url, fpath):
    "`True` if nothing is stored for `url` in `fmod`, otherwise whether `path_stats(fpath)` equals the stored value"
    expected = read_checks(fmod).get(url)
    # No (or empty) stored entry means there is nothing to verify against
    if not expected: return True
    return path_stats(fpath) == expected
# export
def update_checks(fpath, url, fmod):
    "Record `path_stats(fpath)` under `url` in the checks file `fmod`, rewriting the whole file"
    known = read_checks(fmod)
    known.update({url: path_stats(fpath)})
    # The file is stored as a pretty-printed literal that `read_checks` evaluates back
    fmod.write_text(pformat(known))
if mod.exists(): mod.unlink()
update_checks(fpath, url, mod)
read_checks(mod)
{'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207,
'56143e8f24db90d925d82a5a74141875')}
#export
def download_and_check(url, fpath, fmod, force):
    "Download `url` to `fpath` unless it already exists, passes `check`, and `force` is not set"
    reuse_candidate = (not force) and fpath.exists()
    if reuse_candidate:
        # An existing file that still matches the stored checks is returned as-is
        if check(fmod, url, fpath): return fpath
        print("Downloading a new version of this dataset...")
    downloaded = download_url(url, fpath)
    # Verify the fresh download against the stored checks as well
    if check(fmod, url, fpath): return downloaded
    raise Exception("Downloaded file is corrupt or not latest version")
# export
class FastDownload:
    "Download, verify and extract archives into the locations named in `cfg`"
    def __init__(self, cfg=None, base='~/.fastdownload', archive=None, data=None, module=None):
        root = Path(base).expanduser().absolute()
        defaults = dict(data=(data or 'data'), archive=(archive or 'archive'))
        # A caller-supplied `cfg` is used as-is; otherwise `config.ini` under `base` is used
        self.cfg = cfg if cfg is not None else Config(root, 'config.ini', create=defaults)
        self.module = checks_module(module)
        # Explicit `data`/`archive` arguments override whatever the config holds
        for key, value in (('data', data), ('archive', archive)):
            if value is not None: self.cfg[key] = value

    def arch_path(self):
        "Path to archives"
        return self.cfg.path('archive')

    def data_path(self, extract_key='data'):
        "Path to extracted data, looked up in the config under `extract_key`"
        return self.cfg.path(extract_key)

    def check(self, url, fpath):
        "`True` if nothing is stored for `url`, otherwise whether `path_stats(fpath)` equals the stored value"
        expected = read_checks(self.module).get(url)
        if not expected: return True
        return path_stats(fpath) == expected

    def download(self, url, force=False):
        "Download `url` to the archive path unless it already exists, passes the checks, and `force` is not set"
        self.arch_path().mkdir(parents=True, exist_ok=True)
        target = urldest(url, self.arch_path())
        return download_and_check(url, target, self.module, force)

    def rm(self, url, rm_arch=True, rm_data=True, extract_key='data'):
        "Delete downloaded archive and extracted data for `url`"
        archive = urldest(url, self.arch_path())
        if rm_arch: archive.delete()
        if not rm_data: return
        data_dir = self.data_path(extract_key)
        # The extracted folder is named after the archive stem with any `.tar` suffix removed
        extracted = data_dir/remove_suffix(archive.stem, '.tar')
        extracted.delete()

    def update(self, url):
        "Store the hash and size of the archive for `url` in `download_checks.py`"
        archive = urldest(url, self.arch_path())
        update_checks(archive, url, self.module)

    def extract(self, url, extract_key='data', force=False):
        "Extract archive already downloaded from `url`, overwriting existing if `force`"
        archive = urldest(url, self.arch_path())
        if not archive.exists(): raise Exception(f'{archive} does not exist')
        out_dir = self.data_path(extract_key)
        out_dir.mkdir(parents=True, exist_ok=True)
        return untar_dir(archive, out_dir, rename=True, overwrite=force)

    def get(self, url, extract_key='data', force=False):
        "Download and extract `url`, overwriting existing if `force`"
        self.download(url, force=force)
        return self.extract(url, extract_key=extract_key, force=force)
d = FastDownload(module=fastdownload)
d.module
Path('git/fastdownload/fastdownload/download_checks.py')
The config.ini file will be created (if it doesn't exist) in {base}/config.ini:
d.cfg.config_file
Path('.fastdownload/config.ini')
print(d.cfg.config_file.read_text())
[DEFAULT] data = /home/jhoward/.fastdownload/data archive = /home/jhoward/.fastdownload/archive
show_doc(FastDownload.download)
FastDownload.download[source]
FastDownload.download(url,force=False)
Download url to archive path, unless exists and self.check fails and not force
If there is no stored hash and size for url, or the size and hash match the stored checks, then download will only download the URL if the destination file does not exist. The destination path will be returned.
if d.module.exists(): d.module.unlink()
arch = d.download(url)
arch
Path('.fastdownload/archive/mnist_tiny.tgz')
show_doc(FastDownload.update)
d.update(url)
eval(d.module.read_text())
{'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207,
'56143e8f24db90d925d82a5a74141875')}
Calling download will now just return the existing file, since the checks match:
d.download(url)
Path('.fastdownload/archive/mnist_tiny.tgz')
If the checks file doesn't match the size or hash of the archive, then a new copy of the file will be downloaded.
show_doc(FastDownload.extract)
FastDownload.extract[source]
FastDownload.extract(url,extract_key='data',force=False)
Extract archive already downloaded from url, overwriting existing if force
extr = d.extract(url, force=True)
extr
Path('.fastdownload/data/mnist_tiny')
extr.ls()
(#5) [Path('.fastdownload/data/mnist_tiny/models'),Path('.fastdownload/data/mnist_tiny/train'),Path('.fastdownload/data/mnist_tiny/labels.csv'),Path('.fastdownload/data/mnist_tiny/valid'),Path('.fastdownload/data/mnist_tiny/test')]
Pass extract_key to use a key other than data from your config file when selecting an archive extraction location:
d.cfg['model_path'] = 'models'
d.extract(url, extract_key='model_path')
Path('.fastdownload/models/mnist_tiny')
show_doc(FastDownload.rm)
FastDownload.rm[source]
FastDownload.rm(url,rm_arch=True,rm_data=True,extract_key='data')
Delete downloaded archive and extracted data for url
d.rm(url)
extr.exists(),arch.exists()
(False, False)
show_doc(FastDownload.get)
FastDownload.get[source]
FastDownload.get(url,extract_key='data',force=False)
Download and extract url, overwriting existing if force
res = d.get(url)
res,extr.exists()
(Path('.fastdownload/data/mnist_tiny'), True)
extract_key works the same way as in FastDownload.extract:
res = d.get(url, extract_key='model_path')
res,res.exists()
(Path('.fastdownload/models/mnist_tiny'), True)
#hide
from nbdev.export import notebook2script
notebook2script()
Converted 00_core.ipynb. Converted index.ipynb.