I am trying to use a DIY sampling loop to generate an image from a base image (img2img-style). For some reason the result is a blurry image that doesn't look like anything. Can you help me understand where I went wrong?
import torch
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from diffusers import DDIMScheduler
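# `pipe` and `device` are used further down but are not defined in this
# snippet -- they presumably come from an earlier notebook cell. A minimal
# sketch of that setup (the model ID here is an assumption, not taken from
# the original):
from diffusers import StableDiffusionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(device)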
# Load and preprocess the image into a (1, 3, H, W) float tensor in [0, 1]
def preprocess_image(image_path, target_size=(256, 256)):
    image = Image.open(image_path).convert('RGB')
    image = image.resize(target_size)
    image = np.array(image) / 255.0
    image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).float()
    return image
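# Example usage (the path is illustrative):
# x = preprocess_image("photo.jpg")  # -> tensor of shape (1, 3, 256, 256)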
# Initialize the DDIMScheduler
scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
)
# Set the number of inference steps
num_inference_steps = 50
# Load and preprocess your image
image_path = "/kaggle/input/imageforlatent/Untitled.jpg"
image = preprocess_image(image_path)
# Subsample the scheduler's 1000 training timesteps down to num_inference_steps
timesteps = scheduler.timesteps[::len(scheduler.timesteps) // num_inference_steps]
timesteps = timesteps[:num_inference_steps]  # ensure exactly num_inference_steps entries
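# An equivalent, less error-prone way to get the same spacing is the
# scheduler's own API (assuming a recent diffusers version):
# scheduler.set_timesteps(num_inference_steps)
# timesteps = scheduler.timesteps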
# Use the scheduler to add noise to the initial image
# Generate Gaussian noise with the same shape as the image
noise = torch.randn_like(image)
# Add noise in a loop so we can visualise the progression
# (flipping the timesteps so they run from least to most noisy)
for step, t in enumerate(tqdm(timesteps.flip(0))):
    # Add noise at timestep t (always starting from the clean image)
    noisy_image = scheduler.add_noise(image, noise, t)
    # Plot the noisy image every 10 steps
    if step % 10 == 0:
        noisy_image_np = noisy_image.squeeze().permute(1, 2, 0).clamp(0, 1).numpy()
        plt.imshow(noisy_image_np)
        plt.show()
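# Note: only the final iteration's result survives the loop, so `noisy_image`
# ends up as the image noised at the largest timestep (close to pure noise).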
noisy_image = noisy_image.to(device)
# Encode the noisy image into latent space with the pipeline's VAE
with torch.no_grad():
    latents = 0.18215 * pipe.vae.encode(noisy_image).latent_dist.mean
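# (0.18215 is Stable Diffusion v1's latent scaling factor; in diffusers it
# is also exposed as pipe.vae.config.scaling_factor)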
# DIY loop to generate an image from the noisy latents
guidance_scale = 8  # @param
num_inference_steps = 50  # @param
prompt = "a man and a moon"  # @param
negative_prompt = "zoomed in, blurry, oversaturated, warped"  # @param
# Encode the prompt and negative prompt into text embeddings
text_embeddings = pipe._encode_prompt(prompt, device, 1, True, negative_prompt)
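# NOTE: _encode_prompt is a private pipeline helper; newer diffusers
# releases expose pipe.encode_prompt instead (which version applies here
# is an assumption).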
##### Using our own latents as the starting point
# (random starting point kept commented out, since we start from the VAE latents above)
# latents = torch.randn((1, 4, 64, 64), device=device, generator=generator)
# latents *= pipe.scheduler.init_noise_sigma
# Prepare the scheduler
pipe.scheduler.set_timesteps(num_inference_steps, device=device)
# Loop through the sampling timesteps
for i, t in enumerate(pipe.scheduler.timesteps):
    # Expand the latents since we are doing classifier-free guidance
    latent_model_input = torch.cat([latents] * 2)
    # Apply any scaling required by the scheduler
    latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)
    # Predict the noise residual with the UNet
    with torch.no_grad():
        noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
    # Perform classifier-free guidance
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    # Compute the previous noisy sample x_t -> x_{t-1}
    latents = pipe.scheduler.step(noise_pred, t, latents).prev_sample
# Decode the resulting latents into an image
with torch.no_grad():
    image = pipe.decode_latents(latents.detach())
# View the result
pipe.numpy_to_pil(image)[0]
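# To save the result instead of just displaying it (filename illustrative):
# pipe.numpy_to_pil(image)[0].save("output.png")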
I tried the code above, but it only ever generates a blurry image.