#default_exp data.external

#export
from fastai2.torch_basics import *

#export
class Config:
    "Setup config at `~/.fastai` unless it exists already."
    config_path = Path(os.getenv('FASTAI_HOME', '~/.fastai')).expanduser()
    config_file = config_path/'config.yml'

    def __init__(self):
        self.config_path.mkdir(parents=True, exist_ok=True)
        if not self.config_file.exists(): self.create_config()
        self.d = self.load_config()

    def __getitem__(self,k):
        k = k.lower()
        if k not in self.d: k = k+'_path'
        return Path(self.d[k])

    def __getattr__(self,k):
        if k=='d': raise AttributeError
        return self[k]

    def __setitem__(self,k,v): self.d[k] = str(v)
    def __contains__(self,k): return k in self.d

    def load_config(self):
        "load and return config if version equals 2 in existing, else create new config."
        with open(self.config_file, 'r') as f:
            config = yaml.safe_load(f)
            if 'version' in config and config['version'] == 2: return config
            elif 'version' in config: self.create_config(config)
            else: self.create_config()
        return self.load_config()

    def create_config(self, cfg=None):
        "create new config with default paths and set `version` to 2."
        config = {'data_path':    str(self.config_path/'data'),
                  'archive_path': str(self.config_path/'archive'),
                  'storage_path': '/tmp',
                  'model_path':   str(self.config_path/'models'),
                  'version':      2}
        if cfg is not None:
            cfg['version'] = 2
            config = merge(config, cfg)
        self.save_file(config)

    def save(self): self.save_file(self.d)
    def save_file(self, config):
        "save config file at default config location `~/.fastai/config.yml`."
        with self.config_file.open('w') as f: yaml.dump(config, f, default_flow_style=False)

config_file = Path("~/.fastai/config.yml").expanduser()
if config_file.exists(): os.remove(config_file)
assert not config_file.exists()

config = Config()
assert config_file.exists()
config.d

#hide
config = Config()
config_path = config.config_path
config_file,config_bak = config_path/'config.yml',config_path/'config.yml.bak'
config_file,config_bak

#hide
#This cell is just to make the config file compatible with current fastai
# TODO: make this a method that auto-runs as needed
if 'data_archive_path' not in config: config['data_archive_path'] = config.data_path
config.save()

if config_file.exists(): shutil.move(config_file, config_bak)
config['archive_path'] = Path(".")
config.save()

config = Config()
config.d

if config_bak.exists(): shutil.move(config_bak, config_file)
config = Config()
config.d
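#A minimal usage sketch (illustration only, relying on the default entries written by
#`create_config` above): a short key such as 'data' falls back to the matching
#'<key>_path' entry and is returned as a `Path`; attribute access delegates to the same lookup.
cfg = Config()
test_eq(cfg['data'], Path(cfg.d['data_path']))
test_eq(cfg.data, cfg['data'])    #`__getattr__` delegates to `__getitem__`
assert 'data_path' in cfg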
#export
class URLs():
    "Global constants for dataset and model URLs."
    LOCAL_PATH = Path.cwd()
    URL = 'http://files.fast.ai/data/examples/'
    MDL = 'http://files.fast.ai/models/'
    S3 = 'https://s3.amazonaws.com/fast-ai-'

    S3_IMAGE    = f'{S3}imageclas/'
    S3_IMAGELOC = f'{S3}imagelocal/'
    S3_AUDI     = f'{S3}audio/'
    S3_NLP      = f'{S3}nlp/'
    S3_COCO     = f'{S3}coco/'
    S3_MODEL    = f'{S3}modelzoo/'

    # main datasets
    ADULT_SAMPLE        = f'{URL}adult_sample.tgz'
    BIWI_SAMPLE         = f'{URL}biwi_sample.tgz'
    CIFAR               = f'{URL}cifar10.tgz'
    COCO_SAMPLE         = f'{S3_COCO}coco_sample.tgz'
    COCO_TINY           = f'{URL}coco_tiny.tgz'
    HUMAN_NUMBERS       = f'{URL}human_numbers.tgz'
    IMDB                = f'{S3_NLP}imdb.tgz'
    IMDB_SAMPLE         = f'{URL}imdb_sample.tgz'
    ML_SAMPLE           = f'{URL}movie_lens_sample.tgz'
    ML_100k             = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    MNIST_SAMPLE        = f'{URL}mnist_sample.tgz'
    MNIST_TINY          = f'{URL}mnist_tiny.tgz'
    MNIST_VAR_SIZE_TINY = f'{S3_IMAGE}mnist_var_size_tiny.tgz'
    PLANET_SAMPLE       = f'{URL}planet_sample.tgz'
    PLANET_TINY         = f'{URL}planet_tiny.tgz'
    IMAGENETTE          = f'{S3_IMAGE}imagenette2.tgz'
    IMAGENETTE_160      = f'{S3_IMAGE}imagenette2-160.tgz'
    IMAGENETTE_320      = f'{S3_IMAGE}imagenette2-320.tgz'
    IMAGEWOOF           = f'{S3_IMAGE}imagewoof2.tgz'
    IMAGEWOOF_160       = f'{S3_IMAGE}imagewoof2-160.tgz'
    IMAGEWOOF_320       = f'{S3_IMAGE}imagewoof2-320.tgz'
    IMAGEWANG           = f'{S3_IMAGE}imagewang.tgz'
    IMAGEWANG_160       = f'{S3_IMAGE}imagewang-160.tgz'
    IMAGEWANG_320       = f'{S3_IMAGE}imagewang-320.tgz'

    # kaggle competitions download dogs-vs-cats -p {DOGS.absolute()}
    DOGS = f'{URL}dogscats.tgz'

    # image classification datasets
    CALTECH_101  = f'{S3_IMAGE}caltech_101.tgz'
    CARS         = f'{S3_IMAGE}stanford-cars.tgz'
    CIFAR_100    = f'{S3_IMAGE}cifar100.tgz'
    CUB_200_2011 = f'{S3_IMAGE}CUB_200_2011.tgz'
    FLOWERS      = f'{S3_IMAGE}oxford-102-flowers.tgz'
    FOOD         = f'{S3_IMAGE}food-101.tgz'
    MNIST        = f'{S3_IMAGE}mnist_png.tgz'
    PETS         = f'{S3_IMAGE}oxford-iiit-pet.tgz'

    # NLP datasets
    AG_NEWS                 = f'{S3_NLP}ag_news_csv.tgz'
    AMAZON_REVIEWS          = f'{S3_NLP}amazon_review_full_csv.tgz'
    AMAZON_REVIEWS_POLARITY = f'{S3_NLP}amazon_review_polarity_csv.tgz'
    DBPEDIA                 = f'{S3_NLP}dbpedia_csv.tgz'
    MT_ENG_FRA              = f'{S3_NLP}giga-fren.tgz'
    SOGOU_NEWS              = f'{S3_NLP}sogou_news_csv.tgz'
    WIKITEXT                = f'{S3_NLP}wikitext-103.tgz'
    WIKITEXT_TINY           = f'{S3_NLP}wikitext-2.tgz'
    YAHOO_ANSWERS           = f'{S3_NLP}yahoo_answers_csv.tgz'
    YELP_REVIEWS            = f'{S3_NLP}yelp_review_full_csv.tgz'
    YELP_REVIEWS_POLARITY   = f'{S3_NLP}yelp_review_polarity_csv.tgz'

    # Image localization datasets
    BIWI_HEAD_POSE = f"{S3_IMAGELOC}biwi_head_pose.tgz"
    CAMVID         = f'{S3_IMAGELOC}camvid.tgz'
    CAMVID_TINY    = f'{URL}camvid_tiny.tgz'
    LSUN_BEDROOMS  = f'{S3_IMAGE}bedroom.tgz'
    PASCAL_2007    = f'{S3_IMAGELOC}pascal_2007.tgz'
    PASCAL_2012    = f'{S3_IMAGELOC}pascal_2012.tgz'

    # Audio classification datasets
    MACAQUES    = 'https://storage.googleapis.com/ml-animal-sounds-datasets/macaques.zip'
    ZEBRA_FINCH = 'https://storage.googleapis.com/ml-animal-sounds-datasets/zebra_finch.zip'

    # Medical Imaging datasets
    #SKIN_LESION = f'{S3_IMAGELOC}skin_lesion.tgz'
    SIIM_SMALL = f'{S3_IMAGELOC}siim_small.tgz'

    #Pretrained models
    OPENAI_TRANSFORMER = f'{S3_MODEL}transformer.tgz'
    WT103_FWD          = f'{S3_MODEL}wt103-fwd.tgz'
    WT103_BWD          = f'{S3_MODEL}wt103-bwd.tgz'

    def path(url='.', c_key='archive'):
        "Return local path where to download based on `c_key`"
        fname = url.split('/')[-1]
        local_path = URLs.LOCAL_PATH/('models' if c_key=='models' else 'data')/fname
        if local_path.exists(): return local_path
        return Config()[c_key]/fname

url = URLs.PETS
local_path = URLs.path(url)
test_eq(local_path.parent, Config()['archive'])
local_path

local_path = URLs.path(url, c_key='model')
test_eq(local_path.parent, Config()['model'])
local_path
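#Illustration only: `URLs.path` prefers an archive already present under the working
#directory (`URLs.LOCAL_PATH/'data'`, or `.../'models'` for `c_key='models'`) and
#otherwise falls back to the folder given by the config entry for `c_key`.
fname = URLs.MNIST_SAMPLE.split('/')[-1]
local = URLs.LOCAL_PATH/'data'/fname
expected = local if local.exists() else Config()['archive']/fname
test_eq(URLs.path(URLs.MNIST_SAMPLE), expected)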
#export
def download_url(url, dest, overwrite=False, pbar=None, show_progress=True, chunk_size=1024*1024,
                 timeout=4, retries=5):
    "Download `url` to `dest` unless it exists and not `overwrite`"
    if os.path.exists(dest) and not overwrite: return

    s = requests.Session()
    s.mount('http://',requests.adapters.HTTPAdapter(max_retries=retries))
    # additional line to identify as a firefox browser, see fastai/#2438
    s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'})
    u = s.get(url, stream=True, timeout=timeout)
    try: file_size = int(u.headers["Content-Length"])
    except: show_progress = False

    with open(dest, 'wb') as f:
        nbytes = 0
        if show_progress: pbar = progress_bar(range(file_size), leave=False, parent=pbar)
        try:
            if show_progress: pbar.update(0)
            for chunk in u.iter_content(chunk_size=chunk_size):
                nbytes += len(chunk)
                if show_progress: pbar.update(nbytes)
                f.write(chunk)
        except requests.exceptions.ConnectionError as e:
            fname = url.split('/')[-1]
            data_dir = dest.parent
            print(f'\n Download of {url} has failed after {retries} retries\n'
                  f' Fix the download manually:\n'
                  f'$ mkdir -p {data_dir}\n'
                  f'$ cd {data_dir}\n'
                  f'$ wget -c {url}\n'
                  f'$ tar xf {fname}\n'
                  f' And re-run your code once the download is successful\n')

fname = Path("./dog.jpg")
if fname.exists(): os.remove(fname)
url = "https://i.insider.com/569fdd9ac08a80bd448b7138?width=1100&format=jpeg&auto=webp"
download_url(url, fname)
assert fname.exists()

from PIL import Image
im = Image.open(fname)
plt.imshow(im);

#Check that an existing file is not downloaded (and overwritten) a second time
if fname.exists():
    last_modified_time = os.path.getmtime(fname)
    download_url(url, fname)
    test_eq(os.path.getmtime(fname), last_modified_time)

if fname.exists(): os.remove(fname)

#export
def download_data(url, fname=None, c_key='archive', force_download=False):
    "Download `url` to `fname`."
    fname = Path(fname or URLs.path(url, c_key=c_key))
    fname.parent.mkdir(parents=True, exist_ok=True)
    if not fname.exists() or force_download: download_url(url, fname, overwrite=force_download)
    return fname

#hide
try:
    test_eq(download_data(URLs.MNIST_SAMPLE), config.archive/'mnist_sample.tgz')
    test_eq(download_data(URLs.MNIST_TINY, fname=Path('mnist.tgz')), Path('mnist.tgz'))
finally: Path('mnist.tgz').unlink()

try:
    tst_model = config.model/'mnist_tiny.tgz'
    test_eq(download_data(URLs.MNIST_TINY, c_key='model'), tst_model)
    os.remove(tst_model)
finally:
    if tst_model.exists(): tst_model.unlink()

#hide
from nbdev.imports import Config as NbdevConfig
__file__ = NbdevConfig().lib_path/'data'/'external.py'

#export
def _get_check(url):
    "internal function to get the hash of the file at `url`."
    checks = json.load(open(Path(__file__).parent/'checks.txt', 'r'))
    return checks.get(url, None)

def _check_file(fname):
    "internal function to get the hash of the local file at `fname`."
    size = os.path.getsize(fname)
    with open(fname, "rb") as f: hash_nb = hashlib.md5(f.read(2**20)).hexdigest()
    return [size,hash_nb]

#hide
test_eq(_get_check(URLs.MNIST_SAMPLE), _check_file(URLs.path(URLs.MNIST_SAMPLE)))
_get_check(URLs.MNIST_SAMPLE), _check_file(URLs.path(URLs.MNIST_SAMPLE))

_get_check(URLs.PASCAL_2007),_get_check(URLs.PASCAL_2012)
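#A quick sketch (illustration only, uses a throwaway temporary file): `_check_file`
#hashes only the first 2**20 bytes, so the md5 part ignores changes past the first
#megabyte while the size part still catches them.
import os, tempfile
with tempfile.NamedTemporaryFile(delete=False) as tmp: tmp.write(bytes(2**20 + 10))
size1, hash1 = _check_file(tmp.name)
with open(tmp.name, 'ab') as f: f.write(b'tail')  #append past the hashed prefix
size2, hash2 = _check_file(tmp.name)
assert hash1 == hash2 and size1 != size2
os.remove(tmp.name)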
#export
def _add_check(url, fname):
    "Internal function to update the internal check file with `url` and check on `fname`."
    checks = json.load(open(Path(__file__).parent/'checks.txt', 'r'))
    checks[url] = _check_file(fname)
    json.dump(checks, open(Path(__file__).parent/'checks.txt', 'w'), indent=2)

#export
def file_extract(fname, dest=None):
    "Extract `fname` to `dest` using `tarfile` or `zipfile`."
    if dest is None: dest = Path(fname).parent
    fname = str(fname)
    if fname.endswith('gz'): tarfile.open(fname, 'r:gz').extractall(dest)
    elif fname.endswith('zip'): zipfile.ZipFile(fname).extractall(dest)
    else: raise Exception(f'Unrecognized archive: {fname}')

#export
def _try_from_storage(dest, storage):
    "Internal function: symlink the contents of `storage` into `dest` if `storage` exists."
    if not storage.exists(): return
    os.makedirs(dest, exist_ok=True)
    for f in storage.glob('*'): os.symlink(f, dest/f.name, target_is_directory=f.is_dir())

#hide
with tempfile.TemporaryDirectory() as d:
    with tempfile.TemporaryDirectory() as d2:
        d,d2 = Path(d),Path(d2)
        for k in ['a', 'b', 'c']: os.makedirs(d/k)
        for k in ['d', 'e', 'f']: (d/k).touch()
        _try_from_storage(d2, d)
        for k in ['a', 'b', 'c']:
            assert (d2/k).exists()
            assert (d2/k).is_dir()
        for k in ['d', 'e', 'f']:
            assert (d2/k).exists()
            assert (d2/k).is_file()

#export
def newest_folder(path):
    "Return the most recently created entry in `path`."
    list_of_paths = path.glob('*')
    return max(list_of_paths, key=lambda p: p.stat().st_ctime)

#export
def rename_extracted(dest):
    "Rename the most recently extracted file or folder to `dest` if its name differs."
    extracted = newest_folder(dest.parent)
    if not (extracted.name == dest.name): extracted.rename(dest)

#export
def untar_data(url, fname=None, dest=None, c_key='data', force_download=False, extract_func=file_extract):
    "Download `url` to `fname` if `dest` doesn't exist, and un-tgz or unzip to folder `dest`."
    default_dest = URLs.path(url, c_key=c_key).with_suffix('')
    dest = default_dest if dest is None else Path(dest)/default_dest.name
    fname = Path(fname or URLs.path(url))
    if fname.exists() and _get_check(url) and _check_file(fname) != _get_check(url):
        print("A new version of this dataset is available, downloading...")
        force_download = True
    if force_download:
        if fname.exists(): os.remove(fname)
        if dest.exists(): shutil.rmtree(dest)
    if not dest.exists(): _try_from_storage(dest, URLs.path(url, c_key='storage').with_suffix(''))
    if not dest.exists():
        fname = download_data(url, fname=fname, c_key=c_key)
        if _get_check(url) and _check_file(fname) != _get_check(url):
            print(f"File downloaded is broken. Remove {fname} and try again.")
        extract_func(fname, dest.parent)
        rename_extracted(dest)
    return dest
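#Illustration only (no side effects): before downloading, `untar_data` calls
#`_try_from_storage` with the path below, i.e. an already-extracted copy under the
#'storage' location from the config (default '/tmp' in `create_config`), and symlinks
#it into place when it exists.
URLs.path(URLs.MNIST_SAMPLE, c_key='storage').with_suffix('')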
from tempfile import TemporaryDirectory

test_eq(untar_data(URLs.MNIST_SAMPLE), config.data/'mnist_sample')

with TemporaryDirectory() as d:
    d = Path(d)
    dest = untar_data(URLs.MNIST_TINY, fname='mnist_tiny.tgz', dest=d, force_download=True)
    assert Path('mnist_tiny.tgz').exists()
    assert (d/'mnist_tiny').exists()
    os.unlink('mnist_tiny.tgz')

#Test c_key
tst_model = config.model/'mnist_sample'
test_eq(untar_data(URLs.MNIST_SAMPLE, c_key='model'), tst_model)
assert not tst_model.with_suffix('.tgz').exists() #Archive wasn't downloaded in the models path
assert (config.archive/'mnist_sample.tgz').exists() #Archive was downloaded there
shutil.rmtree(tst_model)

#test fname != dest
with TemporaryDirectory() as d:
    d = Path(d)
    untar_data(URLs.MNIST_TINY, fname='mnist_tiny.tgz', dest=d, force_download=True)
    Path('mnist_tiny.tgz').rename('nims_tini.tgz')
    p = Path('nims_tini.tgz')
    dest = Path('nims_tini')
    assert p.exists()
    file_extract(p, dest.parent)
    rename_extracted(dest)
    p.unlink()
    shutil.rmtree(dest)

#hide
#Check all URLs are in the checks.txt file and match for downloaded archives
_whitelist = "MDL LOCAL_PATH URL WT103_BWD WT103_FWD".split()
checks = json.load(open(Path(__file__).parent/'checks.txt', 'r'))

for d in dir(URLs):
    if d.upper() == d and not d.startswith("S3") and d not in _whitelist:
        url = getattr(URLs, d)
        assert url in checks,f"""{d} is not in the check file for all URLs.
To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):
url = URLs.{d}
untar_data(url, force_download=True)
_add_check(url, URLs.path(url))
"""
        f = URLs.path(url)
        if f.exists():
            assert checks[url] == _check_file(f),f"""The log we have for {d} in checks does not match the actual archive.
To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):
url = URLs.{d}
_add_check(url, URLs.path(url))
"""

#The commented cell referenced above: uncomment, fill in the dataset name, and run to add or refresh a checksum entry.
# url = URLs.<dataset name>
# untar_data(url, force_download=True)
# _add_check(url, URLs.path(url))

#hide
from nbdev.export import notebook2script
notebook2script()