@@ -181,7 +181,7 @@ Then we load the [v1-5 checkpoint](https://huggingface.co/stable-diffusion-v1-5/

```python
model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)
+sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=torch.float16).to("cuda")

images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
```
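For context, the updated line is exercised roughly as follows. This is a minimal sketch: `prompts` and `generator` are defined earlier in the doc, so the values shown here are placeholders only.

```python
import torch
from diffusers import StableDiffusionPipeline

# Placeholder setup; the doc defines its own prompts and seed earlier.
prompts = ["a photo of an astronaut riding a horse on mars"]
generator = torch.manual_seed(0)

model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=torch.float16).to("cuda")

images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
```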
@@ -280,7 +280,7 @@ from diffusers import StableDiffusionInstructPix2PixPipeline

instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
-).to(device)
+).to("cuda")
```

Now, we perform the edits:
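The edit loop itself falls outside this hunk. For reference, a typical InstructPix2Pix call with the pipeline on `"cuda"` looks like the sketch below; the input image, instruction, and seed are placeholders rather than the doc's own values.

```python
import torch
from PIL import Image

# Stand-in inputs; replace with a real photo and a real edit instruction.
input_image = Image.new("RGB", (512, 512), color="white")
edit_instruction = "make it look like a watercolor painting"
generator = torch.manual_seed(0)

edited_image = instruct_pix2pix_pipeline(
    edit_instruction,
    image=input_image,
    output_type="np",
    generator=generator,
).images[0]
```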
@@ -326,9 +326,9 @@ from transformers import (

clip_id = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
-text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
+text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to("cuda")
image_processor = CLIPImageProcessor.from_pretrained(clip_id)
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to("cuda")
```

Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/clip).
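Since the encoders now live on `"cuda"`, their inputs have to be moved to the same device before the forward pass. A rough sketch of that interaction (the real doc wraps this logic in the `DirectionalSimilarity` class shown in the hunks below; the text and dummy image here are stand-ins):

```python
import torch
from PIL import Image

text_inputs = tokenizer(
    ["a photo of a cat"],
    max_length=tokenizer.model_max_length,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
with torch.no_grad():
    text_embeds = text_encoder(text_inputs.input_ids.to("cuda")).text_embeds

dummy_image = Image.new("RGB", (512, 512))
image_inputs = image_processor(dummy_image, return_tensors="pt")
with torch.no_grad():
    image_embeds = image_encoder(image_inputs.pixel_values.to("cuda")).image_embeds
```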
@@ -350,7 +350,7 @@ class DirectionalSimilarity(nn.Module):

    def preprocess_image(self, image):
        image = self.image_processor(image, return_tensors="pt")["pixel_values"]
-        return {"pixel_values": image.to(device)}
+        return {"pixel_values": image.to("cuda")}

    def tokenize_text(self, text):
        inputs = self.tokenizer(
@@ -360,7 +360,7 @@ class DirectionalSimilarity(nn.Module):
            truncation=True,
            return_tensors="pt",
        )
-        return {"input_ids": inputs.input_ids.to(device)}
+        return {"input_ids": inputs.input_ids.to("cuda")}

    def encode_image(self, image):
        preprocessed_image = self.preprocess_image(image)
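The remaining `.to(device)` call sites in the class get the same treatment. An alternative worth noting (not what this commit does) is to resolve the device once with a CPU fallback and reuse it, so the snippets also run on machines without a GPU:

```python
import torch

# Resolve the device once and pass it around instead of hard-coding "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
```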
@@ -459,6 +459,7 @@ with ZipFile(local_filepath, "r") as zipper:
```python
from PIL import Image
import os
+import numpy as np

dataset_path = "sample-imagenet-images"
image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])
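Presumably the new `numpy` import supports turning the loaded PIL images into an array batch for the metric computed downstream. A minimal sketch building on the imports and `image_paths` from the hunk above (not the doc's exact code):

```python
# Load every image as an RGB array and stack them into one (N, H, W, C) batch.
real_images = [np.array(Image.open(path).convert("RGB")) for path in image_paths]
real_images = np.stack(real_images)
```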
@@ -477,6 +478,7 @@ Now that the images are loaded, let's apply some lightweight pre-processing on t

```python
from torchvision.transforms import functional as F
+import torch


def preprocess_image(image):
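The body of `preprocess_image` falls outside this hunk; the added `torch` import suggests the numpy image is converted to a tensor there. The following is a plausible sketch only, under that assumption:

```python
import torch
from torchvision.transforms import functional as F


def preprocess_image(image):
    image = torch.tensor(image).unsqueeze(0)    # HWC uint8 array -> 1 x H x W x C tensor
    image = image.permute(0, 3, 1, 2) / 255.0   # -> 1 x C x H x W, scaled to [0, 1]
    return F.center_crop(image, (256, 256))     # 256x256 matches DiT-XL-2-256
```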
@@ -498,6 +500,10 @@ dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=
dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config)
dit_pipeline = dit_pipeline.to("cuda")

+seed = 0
+generator = torch.manual_seed(seed)
+
+
words = [
    "cassette player",
    "chainsaw",