diff --git a/docs/source/en/conceptual/evaluation.md b/docs/source/en/conceptual/evaluation.md
index 8dfbc8f2ac80..90e072bbf2ba 100644
--- a/docs/source/en/conceptual/evaluation.md
+++ b/docs/source/en/conceptual/evaluation.md
@@ -181,7 +181,7 @@ Then we load the [v1-5 checkpoint](https://huggingface.co/stable-diffusion-v1-5/
 
 ```python
 model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)
+sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=torch.float16).to("cuda")
 
 images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
 ```
@@ -280,7 +280,7 @@ from diffusers import StableDiffusionInstructPix2PixPipeline
 
 instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
     "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
-).to(device)
+).to("cuda")
 ```
 
 Now, we perform the edits:
@@ -326,9 +326,9 @@ from transformers import (
 
 clip_id = "openai/clip-vit-large-patch14"
 tokenizer = CLIPTokenizer.from_pretrained(clip_id)
-text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
+text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to("cuda")
 image_processor = CLIPImageProcessor.from_pretrained(clip_id)
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to("cuda")
 ```
 
 Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/clip).
@@ -350,7 +350,7 @@ class DirectionalSimilarity(nn.Module):
 
     def preprocess_image(self, image):
         image = self.image_processor(image, return_tensors="pt")["pixel_values"]
-        return {"pixel_values": image.to(device)}
+        return {"pixel_values": image.to("cuda")}
 
     def tokenize_text(self, text):
         inputs = self.tokenizer(
@@ -360,7 +360,7 @@ class DirectionalSimilarity(nn.Module):
             truncation=True,
             return_tensors="pt",
         )
-        return {"input_ids": inputs.input_ids.to(device)}
+        return {"input_ids": inputs.input_ids.to("cuda")}
 
     def encode_image(self, image):
         preprocessed_image = self.preprocess_image(image)
@@ -459,6 +459,7 @@ with ZipFile(local_filepath, "r") as zipper:
 ```python
 from PIL import Image
 import os
+import numpy as np
 
 dataset_path = "sample-imagenet-images"
 image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])
@@ -477,6 +478,7 @@ Now that the images are loaded, let's apply some lightweight pre-processing on t
 
 ```python
 from torchvision.transforms import functional as F
+import torch
 
 
 def preprocess_image(image):
@@ -498,6 +500,10 @@ dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=
 dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config)
 dit_pipeline = dit_pipeline.to("cuda")
 
+seed = 0
+generator = torch.manual_seed(seed)
+
+
 words = [
     "cassette player",
     "chainsaw",
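
Taken together, these hunks converge on one pattern: the doc's snippets replace the previously undefined `weight_dtype` and `device` names with an explicit `torch.float16` dtype and a hard-coded `"cuda"` device, and define the `seed`/`generator` the snippets pass for reproducibility. A minimal self-contained sketch of that setup, assuming a CUDA GPU is available; the prompt is a placeholder for illustration, not taken from the docs:

```python
import torch
from diffusers import StableDiffusionPipeline

# Reproducibility: torch.manual_seed returns a torch.Generator,
# which the pipeline accepts for deterministic sampling.
seed = 0
generator = torch.manual_seed(seed)

# Explicit dtype and device instead of undefined `weight_dtype` / `device`.
sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

prompts = ["a photo of an astronaut riding a horse on mars"]  # placeholder prompt
images_1_5 = sd_pipeline_1_5(
    prompts, num_images_per_prompt=1, generator=generator, output_type="np"
).images
```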