
Commit 922c5f5

Fixed Nits in Evaluation Docs (huggingface#10063)
Minor fixes and script improvements in the evaluation docs.
1 parent 8d386f7 commit 922c5f5


docs/source/en/conceptual/evaluation.md

Lines changed: 12 additions & 6 deletions
@@ -181,7 +181,7 @@ Then we load the [v1-5 checkpoint](https://huggingface.co/stable-diffusion-v1-5/
 
 ```python
 model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device)
+sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=torch.float16).to("cuda")
 
 images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images
 ```
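
With this change the snippet no longer depends on the previously undefined `weight_dtype` and `device` variables. A minimal, self-contained sketch of the resulting pattern; the prompt list and seed below are illustrative placeholders, not part of the commit:

```python
import torch
from diffusers import StableDiffusionPipeline

model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5"
sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(
    model_ckpt_1_5, torch_dtype=torch.float16
).to("cuda")

# Placeholder prompt list and seeded generator; the doc defines its own earlier on.
prompts = ["a photograph of an astronaut riding a horse"]
generator = torch.manual_seed(0)

images_1_5 = sd_pipeline_1_5(
    prompts, num_images_per_prompt=1, generator=generator, output_type="np"
).images
print(images_1_5.shape)  # e.g. (1, 512, 512, 3)
```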
@@ -280,7 +280,7 @@ from diffusers import StableDiffusionInstructPix2PixPipeline
 
 instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
     "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
-).to(device)
+).to("cuda")
 ```
 
 Now, we perform the edits:
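
For reference, with the pipeline moved to `"cuda"` the edits are produced along these lines; the edit instruction and input image below are placeholders for illustration, not part of the commit:

```python
import numpy as np
import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline

instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")

# Placeholder input image and edit instruction purely for illustration.
input_image = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))
edited_image = instruct_pix2pix_pipeline(
    "make it snowy",
    image=input_image,
    generator=torch.manual_seed(0),
    output_type="np",
).images[0]
```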
@@ -326,9 +326,9 @@ from transformers import (
 
 clip_id = "openai/clip-vit-large-patch14"
 tokenizer = CLIPTokenizer.from_pretrained(clip_id)
-text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device)
+text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to("cuda")
 image_processor = CLIPImageProcessor.from_pretrained(clip_id)
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device)
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to("cuda")
 ```
 
 Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. For more details, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/clip).
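
As a quick reference, these components project text and images into the same CLIP embedding space; a minimal sketch with a placeholder caption and image (neither is from the commit):

```python
import numpy as np
import torch
from PIL import Image
from transformers import (
    CLIPTokenizer,
    CLIPTextModelWithProjection,
    CLIPImageProcessor,
    CLIPVisionModelWithProjection,
)

clip_id = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to("cuda")
image_processor = CLIPImageProcessor.from_pretrained(clip_id)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to("cuda")

# Placeholder caption and image purely for illustration.
caption = "a photo of a dog"
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

text_inputs = tokenizer(caption, padding="max_length", truncation=True, return_tensors="pt")
with torch.no_grad():
    text_embeds = text_encoder(text_inputs.input_ids.to("cuda")).text_embeds
    pixel_values = image_processor(image, return_tensors="pt").pixel_values.to("cuda")
    image_embeds = image_encoder(pixel_values).image_embeds

print(text_embeds.shape, image_embeds.shape)  # both live in the shared projection space
```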
@@ -350,7 +350,7 @@ class DirectionalSimilarity(nn.Module):
 
     def preprocess_image(self, image):
         image = self.image_processor(image, return_tensors="pt")["pixel_values"]
-        return {"pixel_values": image.to(device)}
+        return {"pixel_values": image.to("cuda")}
 
     def tokenize_text(self, text):
         inputs = self.tokenizer(
@@ -360,7 +360,7 @@ class DirectionalSimilarity(nn.Module):
             truncation=True,
             return_tensors="pt",
         )
-        return {"input_ids": inputs.input_ids.to(device)}
+        return {"input_ids": inputs.input_ids.to("cuda")}
 
     def encode_image(self, image):
         preprocessed_image = self.preprocess_image(image)
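
These helpers feed the class's final comparison between the image edit and the caption edit; a hedged sketch of that step, assuming the encoded features are projected CLIP embeddings as in the doc:

```python
import torch.nn.functional as F


def compute_directional_similarity(img_feat_one, img_feat_two, text_feat_one, text_feat_two):
    # Cosine similarity between the image-edit direction and the caption-edit direction.
    sim_direction = F.cosine_similarity(img_feat_two - img_feat_one, text_feat_two - text_feat_one)
    return sim_direction
```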
@@ -459,6 +459,7 @@ with ZipFile(local_filepath, "r") as zipper:
 ```python
 from PIL import Image
 import os
+import numpy as np
 
 dataset_path = "sample-imagenet-images"
 image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])
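
The new `numpy` import is what the following lines of the doc rely on when converting the loaded PIL images into arrays; roughly:

```python
import os
import numpy as np
from PIL import Image

dataset_path = "sample-imagenet-images"
image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)])

# Each PIL image becomes an (H, W, 3) uint8 array, hence the `np` import.
real_images = [np.array(Image.open(path).convert("RGB")) for path in image_paths]
```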
@@ -477,6 +478,7 @@ Now that the images are loaded, let's apply some lightweight pre-processing on t
 
 ```python
 from torchvision.transforms import functional as F
+import torch
 
 
 def preprocess_image(image):
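
The added `torch` import is used by the doc's `preprocess_image` helper, which turns each NumPy image into a normalized tensor; a sketch of such a helper (the 256x256 crop size matches the DiT checkpoint used later):

```python
import torch
from torchvision.transforms import functional as F


def preprocess_image(image):
    # `image` is an (H, W, 3) uint8 NumPy array.
    image = torch.tensor(image).unsqueeze(0)   # (1, H, W, 3)
    image = image.permute(0, 3, 1, 2) / 255.0  # (1, 3, H, W) in [0, 1]
    return F.center_crop(image, (256, 256))
```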
@@ -498,6 +500,10 @@ dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=
 dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config)
 dit_pipeline = dit_pipeline.to("cuda")
 
+seed = 0
+generator = torch.manual_seed(seed)
+
+
 words = [
     "cassette player",
     "chainsaw",
