#!/usr/bin/env python
# coding: utf-8

# In[9]:

# pip install -U --user diffusers transformers huggingface_hub


# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[8]:

from PIL import Image
from fastcore.all import concat
import torch, logging
from pathlib import Path
from huggingface_hub import notebook_login
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
from transformers import CLIPTextModel, CLIPTokenizer
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from miniai.stability import *

logging.disable(logging.WARNING)
torch.manual_seed(1)
if not (Path.home()/'.huggingface'/'token').exists(): notebook_login()


# In[3]:

guidance_scale = 7.5
num_inference_steps = 50
width = height = 512


# In[4]:

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16)
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.float16).to("cuda")
# Here we use a different VAE to the original release, which has been fine-tuned for more steps
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema", torch_dtype=torch.float16).to("cuda")
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet", torch_dtype=torch.float16).to("cuda")
scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)


# In[5]:

from urllib.request import urlretrieve
img_url = 'https://huggingface.co/blog/assets/98_stable_diffusion/stable_diffusion_12_1.png'
img_path = Path('horse.png')
if not img_path.exists(): urlretrieve(img_url, img_path)


# In[44]:

def preprocess(image):
    w, h = image.size
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
    image = image.resize((w, h), resample=Image.Resampling.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image)
    return 2.0 * image - 1.0


# In[46]:

img = preprocess(Image.open(img_path).convert('RGB'))
show_image(img[0]);


# In[146]:

def encode(x):
    latents = vae.encode(x.to("cuda", dtype=torch.float16)).latent_dist.sample()
    return latents * 0.18215


# In[147]:

latents = encode(img)
show_images(latents[0].detach());  # TODO: detach in `show_images`?
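
# In[ ]:

# A minimal sanity-check sketch: the SD v1 VAE downsamples each spatial dimension by 8
# and produces 4 latent channels, so for a (1, 3, h, w) input image we expect the
# encoded latent to have shape (1, 4, h/8, w/8).
img.shape, latents.shape
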
# In[149]:

# TODO: try different `r`s
r = len(scheduler.timesteps)//2
timesteps = scheduler.timesteps[[r]]
noise = torch.randn_like(latents, device='cuda')
noisy_latents = scheduler.add_noise(latents, noise, timesteps)
show_images(noisy_latents[0].detach());


# In[150]:

inp = scheduler.scale_model_input(torch.cat([noisy_latents] * 2), timesteps).cuda()


# In[151]:

show_images(decode(inp).detach().to(dtype=torch.float32))


# In[119]:

def embed(prompts):
    tokens = tokenizer(prompts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    return text_encoder(tokens.input_ids.to("cuda"))[0].half()


# In[132]:

prompts = ['horse', 'zebra']


# In[134]:

t = embed(prompts)
u = embed([""] * len(prompts))
emb = torch.cat([u, t])
emb.shape


# In[135]:

torch.manual_seed(100)
g = guidance_scale
ts = timesteps


# In[137]:

scheduler.set_timesteps(num_inference_steps)
with torch.no_grad(): u,t = unet(inp, ts.cuda(), encoder_hidden_states=emb.cuda()).sample.chunk(2)
# classifier-free guidance: push the conditional prediction away from the unconditional one
pred = u + g*(t-u)
latents = scheduler.step(pred, ts, noisy_latents).prev_sample


# In[131]:

show_images(decode(torch.concat([u,t,pred,latents])).detach().to(dtype=torch.float32))


# In[ ]:

for i,ts in enumerate(tqdm(scheduler.timesteps)):
    inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
    with torch.no_grad(): u,t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
    pred = u + g*(t-u)
    latents = scheduler.step(pred, ts, latents).prev_sample


# In[80]:

def decode(x):
    # undo the 0.18215 latent scaling, then map the decoded image from [-1,1] to [0,1]
    with torch.no_grad(): res = vae.decode(1 / 0.18215 * x).sample
    return (res / 2 + 0.5).clamp(0, 1)


# In[96]:

res = decode(latents)
show_images(res.detach().to(dtype=torch.float32));


# In[155]:

def text_enc(prompts, maxlen=None):
    if maxlen is None: maxlen = tokenizer.model_max_length
    inp = tokenizer(prompts, padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
    return text_encoder(inp.input_ids.to("cuda"))[0].half()

def mk_img(t):
    image = (t/2+0.5).clamp(0,1).detach().cpu().permute(1, 2, 0).numpy()
    return Image.fromarray((image*255).round().astype("uint8"))


# In[166]:

prompts = ['a photograph of an astronaut riding a horse']
seed = 100
steps = 50


# In[167]:

bs = len(prompts)
text = text_enc(prompts)
uncond = text_enc([""] * bs, text.shape[1])
emb = torch.cat([uncond, text])
if seed: torch.manual_seed(seed)

latents = torch.randn((bs, unet.in_channels, height//8, width//8))
scheduler.set_timesteps(steps)
latents = latents.to("cuda").half() * scheduler.init_noise_sigma

for i,ts in enumerate(tqdm(scheduler.timesteps)):
    inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)
    with torch.no_grad(): u,t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)
    pred = u + g*(t-u)
    latents = scheduler.step(pred, ts, latents).prev_sample

with torch.no_grad(): res = vae.decode(1 / 0.18215 * latents).sample


# In[172]:

ts = scheduler.timesteps[0]


# In[174]:

inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)


# In[175]:

with torch.no_grad(): x = unet(inp, ts, encoder_hidden_states=emb).sample


# In[171]:

show_images(res.to(dtype=torch.float32), figsize=(10,10));
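
# In[ ]:

# A minimal sketch of using `mk_img` (defined above but not otherwise used) to turn the
# first decoded sample into a PIL image; assumes `res` from the generation loop is still
# in scope.
mk_img(res[0])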