From 5393fdde9cc7d9db06bb2741c4d948aa321907ea Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 19 Dec 2024 02:04:28 +0100 Subject: [PATCH 01/10] Add Intel Gaudi HPU device usage --- clip/clip.py | 52 ++++++++++++++++++++++++++++++++++++++++----------- clip/utils.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 11 deletions(-) create mode 100644 clip/utils.py diff --git a/clip/clip.py b/clip/clip.py index 398a6282c..334f06490 100644 --- a/clip/clip.py +++ b/clip/clip.py @@ -12,9 +12,11 @@ from .model import build_model from .simple_tokenizer import SimpleTokenizer as _Tokenizer +from .utils import get_device_initial try: from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC except ImportError: BICUBIC = Image.BICUBIC @@ -51,13 +53,24 @@ def _download(url: str, root: str): raise RuntimeError(f"{download_target} exists and is not a regular file") if os.path.isfile(download_target): - if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: + if ( + hashlib.sha256(open(download_target, "rb").read()).hexdigest() + == expected_sha256 + ): return download_target else: - warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") + warnings.warn( + f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file" + ) with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: - with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop: + with tqdm( + total=int(source.info().get("Content-Length")), + ncols=80, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as loop: while True: buffer = source.read(8192) if not buffer: @@ -91,7 +104,12 @@ def available_models() -> List[str]: return list(_MODELS.keys()) -def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None): +def load( + name: str, + device: Union[str, torch.device] = get_device_initial(), + jit: bool = False, + download_root: str = None, +): """Load a CLIP model Parameters @@ -100,7 +118,7 @@ def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_a A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict device : Union[str, torch.device] - The device to put the loaded model + The device to put the loaded model, by default it uses the device returned by `clip.get_device_initial()` jit : bool Whether to load the optimized JIT model or more hackable non-JIT model (default). 
@@ -123,10 +141,12 @@ def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_a else: raise RuntimeError(f"Model {name} not found; available models = {available_models()}") - with open(model_path, 'rb') as opened_file: + with open(model_path, "rb") as opened_file: try: # loading JIT archive - model = torch.jit.load(opened_file, map_location=device if jit else "cpu").eval() + model = torch.jit.load( + opened_file, map_location=device if jit else "cpu" + ).eval() state_dict = None except RuntimeError: # loading saved state dict @@ -171,9 +191,11 @@ def patch_device(module): patch_device(model.encode_image) patch_device(model.encode_text) - # patch dtype to float32 on CPU - if str(device) == "cpu": - float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) + # patch dtype to float32 on CPU, HPU + if str(device) in ["cpu", "hpu"]: + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[] + ) float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] float_node = float_input.node() @@ -199,10 +221,18 @@ def patch_float(module): model.float() + if str(device) == "hpu": + if torch.hpu.is_available(): + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + model = wrap_in_hpu_graph(model) + model = model.eval().to(torch.device(device)) return model, _transform(model.input_resolution.item()) -def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]: +def tokenize( + texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False +) -> Union[torch.IntTensor, torch.LongTensor]: """ Returns the tokenized representation of given input string(s) diff --git a/clip/utils.py b/clip/utils.py new file mode 100644 index 000000000..738489549 --- /dev/null +++ b/clip/utils.py @@ -0,0 +1,30 @@ +import importlib.util + +import torch + + +def get_device_initial(preferred_device=None): + """ + Determine the appropriate device to use (cuda, hpu, or cpu). + Args: + preferred_device (str): User-preferred device ('cuda', 'hpu', or 'cpu'). + + Returns: + str: Device string ('cuda', 'hpu', or 'cpu'). 
+ """ + # Check for HPU support + if importlib.util.find_spec("habana_frameworks") is not None: + from habana_frameworks.torch.utils.library_loader import load_habana_module + + load_habana_module() + if torch.hpu.is_available(): + if preferred_device == "hpu" or preferred_device is None: + return "hpu" + + # Check for CUDA (GPU support) + if torch.cuda.is_available(): + if preferred_device == "cuda" or preferred_device is None: + return "cuda" + + # Default to CPU + return "cpu" From deb2964c2639c02644e724781ef231e4541780ec Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 19 Dec 2024 02:05:11 +0100 Subject: [PATCH 02/10] Add test - `test_hpu_support` --- tests/test_consistency.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/test_consistency.py b/tests/test_consistency.py index f2c6fd4fe..f1b6fae22 100644 --- a/tests/test_consistency.py +++ b/tests/test_consistency.py @@ -6,7 +6,7 @@ import clip -@pytest.mark.parametrize('model_name', clip.available_models()) +@pytest.mark.parametrize("model_name", clip.available_models()) def test_consistency(model_name): device = "cpu" jit_model, transform = clip.load(model_name, device=device, jit=True) @@ -23,3 +23,22 @@ def test_consistency(model_name): py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) + + +@pytest.mark.parametrize("model_name", clip.available_models()) +def test_hpu_support(model_name): + device = "hpu" + jit_model, transform = clip.load(model_name, device=device, jit=True) + py_model, _ = clip.load(model_name, device=device, jit=False) + + image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) + text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) + + with torch.no_grad(): + logits_per_image, _ = jit_model(image, text) + jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + + logits_per_image, _ = py_model(image, text) + py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + + assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) From e8d6206c164bf1f064cd3d36e06426aba9d845bd Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 19 Dec 2024 02:06:10 +0100 Subject: [PATCH 03/10] Add Dockerfile.hpu, requirements_hpu.txt and update README.md with HPU support information --- Dockerfile.hpu | 25 +++++++++++++++++++ README.md | 59 ++++++++++++++++++++++++++++++++++++++++++++ requirements_hpu.txt | 7 ++++++ 3 files changed, 91 insertions(+) create mode 100644 Dockerfile.hpu create mode 100644 requirements_hpu.txt diff --git a/Dockerfile.hpu b/Dockerfile.hpu new file mode 100644 index 000000000..fe729c15a --- /dev/null +++ b/Dockerfile.hpu @@ -0,0 +1,25 @@ +# Use the official Gaudi Docker image with PyTorch +FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +# Set environment variables for Habana +ENV HABANA_VISIBLE_DEVICES=all +ENV OMPI_MCA_btl_vader_single_copy_mechanism=none +ENV PT_HPU_LAZY_ACC_PAR_MODE=0 +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=1 + +# Set timezone to UTC and install essential packages +ENV DEBIAN_FRONTEND="noninteractive" TZ=Etc/UTC +RUN apt-get update && apt-get install -y \ + tzdata \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* + +COPY . 
/workspace/clip
+WORKDIR /workspace/clip
+
+# Copy HPU requirements
+COPY requirements_hpu.txt /workspace/requirements_hpu.txt
+
+# Install Python packages
+RUN pip install --upgrade pip \
+    && pip install -r requirements_hpu.txt
diff --git a/README.md b/README.md
index db56b56e2..51264f8e1 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,65 @@ print(f"Accuracy = {accuracy:.3f}")
 
 Note that the `C` value should be determined via a hyperparameter sweep using a validation split.
 
+## Intel® Gaudi® HPU Usage
+
+### Build the Docker Image
+To use Intel® Gaudi® HPU for running this project, start by building a Docker image with the appropriate environment setup.
+
+```bash
+docker build -t clip_hpu:latest -f Dockerfile.hpu .
+```
+
+In the `Dockerfile.hpu`, we use the `vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest` base image. Ensure that the version matches your setup.
+See the [PyTorch Docker Images for the Intel® Gaudi® Accelerator](https://developer.habana.ai/catalog/pytorch-container/) for more information.
+
+### Run the Container
+
+```bash
+docker run -it --runtime=habana clip_hpu:latest
+```
+
+Optionally, you can add a mapping volume (`-v`) to access your project directory inside the container. Add the flag `-v /path/to/your/project:/workspace/project` to the `docker run` command.
+Replace `/path/to/your/project` with the path to your project directory on your local machine.
+
+### Command-line Usage with Intel® Gaudi® HPU
+
+To run the notebook with Intel® Gaudi® HPU, use the `--device hpu` option when specifying the device in the code.
+
+For example, modify the device assignment as follows:
+
+```python
+device = 'hpu' if torch.device('hpu').is_available() else 'cuda' if torch.cuda.is_available() else 'cpu'
+model.to(device)
+image_input = image_input.to(device)
+text_tokens = text_tokens.to(device)
+```
+
+### Python Usage with Intel® Gaudi® HPU
+
+To leverage Intel® Gaudi® HPU in Python, ensure that the device is specified as `hpu` during model initialization and tensor manipulation.
+ +```python +import clip +import torch + +# Load the model on HPU +device = "hpu" +model, preprocess = clip.load("ViT-B/32", device=device) + +# Prepare data and move to HPU +image_input = preprocess(image).unsqueeze(0).to(device) +text_tokens = clip.tokenize("a sample text").to(device) + +# Run inference +with torch.no_grad(): + image_features = model.encode_image(image_input) + text_features = model.encode_text(text_tokens) + +print("Inference completed on HPU") +``` + + ## See Also * [OpenCLIP](https://github.com/mlfoundations/open_clip): includes larger and independently trained CLIP models up to ViT-G/14 diff --git a/requirements_hpu.txt b/requirements_hpu.txt new file mode 100644 index 000000000..3eefe7c1a --- /dev/null +++ b/requirements_hpu.txt @@ -0,0 +1,7 @@ +-r requirements.txt +optimum-habana==1.14.1 +transformers==4.45.2 +huggingface-hub==0.26.2 +tiktoken==0.8.0 +torch-geometric==2.6.1 +numba==0.60.0 From d17b83174d11214edef78ef5bba5b0350394123c Mon Sep 17 00:00:00 2001 From: PiotrBLL Date: Thu, 19 Dec 2024 15:24:38 +0100 Subject: [PATCH 04/10] Fix JIT error --- clip/clip.py | 16 ++++++++++++++-- tests/test_consistency.py | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/clip/clip.py b/clip/clip.py index 334f06490..7565b8930 100644 --- a/clip/clip.py +++ b/clip/clip.py @@ -156,13 +156,25 @@ def load( state_dict = torch.load(opened_file, map_location="cpu") if not jit: - model = build_model(state_dict or model.state_dict()).to(device) + model = build_model(state_dict or model.state_dict()) + + if str(device) == "hpu": + from habana_frameworks.torch.utils.library_loader import load_habana_module + + load_habana_module() + if torch.hpu.is_available(): + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + model = wrap_in_hpu_graph(model) + model = model.eval().to(torch.device(device)) + else: + model = model.to(device) if str(device) == "cpu": model.float() return model, _transform(model.visual.input_resolution) # patch the device names - device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device("cpu" if device == "hpu" else device)), example_inputs=[]) device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] def _node_get(node: torch._C.Node, key: str): diff --git a/tests/test_consistency.py b/tests/test_consistency.py index f1b6fae22..18000f61b 100644 --- a/tests/test_consistency.py +++ b/tests/test_consistency.py @@ -28,7 +28,7 @@ def test_consistency(model_name): @pytest.mark.parametrize("model_name", clip.available_models()) def test_hpu_support(model_name): device = "hpu" - jit_model, transform = clip.load(model_name, device=device, jit=True) + jit_model, transform = clip.load(model_name, device="cpu", jit=True) py_model, _ = clip.load(model_name, device=device, jit=False) image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) From 859a140c046fd6b6858317398688cdfd65cb9779 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Tue, 28 Jan 2025 18:50:41 +0100 Subject: [PATCH 05/10] fix: Remove incorrect code snippet from README-HPU section. 
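
The README now defers to `clip.utils.get_device_initial()` instead of
hard-coding an HPU device string. A short sketch of the intended behaviour
(not part of this diff; the HPU branch assumes a Gaudi host with
`habana_frameworks` installed):

    import clip
    from clip.utils import get_device_initial

    # No argument: pick the best available backend ("hpu" > "cuda" > "cpu").
    device = get_device_initial()

    # A preferred device is honoured only when that backend is available;
    # e.g. get_device_initial("hpu") falls back to "cpu" on a non-Gaudi machine.
    device = get_device_initial("hpu")

    model, preprocess = clip.load("ViT-B/32", device=device)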
--- README.md | 52 +++++++++------------------------------ tests/test_consistency.py | 24 +++++++++--------- 2 files changed, 23 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 51264f8e1..e2f4206f6 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,9 @@ import torch import clip from PIL import Image -device = "cuda" if torch.cuda.is_available() else "cpu" +from clip.utils import get_device_initial + +device = get_device_initial() # "HPU" if using Intel® Gaudi® HPU, "cuda" if using CUDA GPU, "cpu" otherwise model, preprocess = clip.load("ViT-B/32", device=device) image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device) @@ -94,8 +96,10 @@ import clip import torch from torchvision.datasets import CIFAR100 +from clip.utils import get_device_initial + # Load the model -device = "cuda" if torch.cuda.is_available() else "cpu" +device = get_device_initial() model, preprocess = clip.load('ViT-B/32', device) # Download the dataset @@ -153,8 +157,10 @@ from torch.utils.data import DataLoader from torchvision.datasets import CIFAR100 from tqdm import tqdm +from clip.utils import get_device_initial + # Load the model -device = "cuda" if torch.cuda.is_available() else "cpu" +device = get_device_initial() model, preprocess = clip.load('ViT-B/32', device) # Load the dataset @@ -209,47 +215,11 @@ See the [PyTorch Docker Images for the Intel® Gaudi® Accelerator](https://deve ```bash docker run -it --runtime=habana clip_hpu:latest -``` - -Optionally, you can add a mapping volume (`-v`) to access your project directory inside the container. Add the flag `-v /path/to/your/project:/workspace/project` to the `docker run` command. -Replace `/path/to/your/project` with the path to your project directory on your local machine. - -### Command-line Usage with Intel® Gaudi® HPU - -To run the notebook with Intel® Gaudi® HPU, use the `--device hpu` option when specifying the device in the code. - -For example, modify the device assignment as follows: - -```python -device = 'hpu' if torch.device('hpu').is_available() else 'cuda' if torch.cuda.is_available() else 'cpu' -model.to(device) -image_input = image_input.to(device) -text_tokens = text_tokens.to(device) -``` +``` ### Python Usage with Intel® Gaudi® HPU -To leverage Intel® Gaudi® HPU in Python, ensure that the device is specified as `hpu` during model initialization and tensor manipulation. - -```python -import clip -import torch - -# Load the model on HPU -device = "hpu" -model, preprocess = clip.load("ViT-B/32", device=device) - -# Prepare data and move to HPU -image_input = preprocess(image).unsqueeze(0).to(device) -text_tokens = clip.tokenize("a sample text").to(device) - -# Run inference -with torch.no_grad(): - image_features = model.encode_image(image_input) - text_features = model.encode_text(text_tokens) - -print("Inference completed on HPU") -``` +You do not need to change the code to leverage Intel® Gaudi® HPU. The `get_device_initial()` function will automatically detect the correct device and return the appropriate device name. So no changes are required. 
## See Also diff --git a/tests/test_consistency.py b/tests/test_consistency.py index 18000f61b..371725031 100644 --- a/tests/test_consistency.py +++ b/tests/test_consistency.py @@ -27,18 +27,18 @@ def test_consistency(model_name): @pytest.mark.parametrize("model_name", clip.available_models()) def test_hpu_support(model_name): - device = "hpu" - jit_model, transform = clip.load(model_name, device="cpu", jit=True) - py_model, _ = clip.load(model_name, device=device, jit=False) + devices = ["hpu", "cpu"] + all_probs = [] + for device in devices: + print(f"=== Testing {model_name} on {device} ===") + model, transform = clip.load(model_name, device=device, jit=False) - image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) - text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) - - with torch.no_grad(): - logits_per_image, _ = jit_model(image, text) - jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) + text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) - logits_per_image, _ = py_model(image, text) - py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + with torch.no_grad(): + logits_per_image, _ = model(image, text) + probs = logits_per_image.softmax(dim=-1).cpu().numpy() + all_probs.append(probs) - assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) + assert np.allclose(all_probs[0], all_probs[1], atol=0.01, rtol=0.1) From fc198e5172fb72d2e045348cdd674521606bdbc5 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Wed, 5 Feb 2025 00:06:35 +0100 Subject: [PATCH 06/10] Add time execution on HPU vs CPU benchmark --- benchmark.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 benchmark.py diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 000000000..c31bbeff5 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,37 @@ +import logging +import time +import numpy as np +import habana_frameworks.torch.core as ht +import torch +from PIL import Image + +import clip +from clip.utils import get_device_initial + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def run_model(model_name, device): + model, transform = clip.load(model_name, device=get_device_initial(device), jit=False) + + image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) + text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) + + with torch.no_grad(): + logits_per_image, _ = model(image, text) + probs = logits_per_image.softmax(dim=-1).cpu().numpy() + return probs + + +if __name__ == "__main__": + logger.info("Running on HPU") + start_time = time.time() + run_model("RN50", "hpu") + end_time = time.time() + logger.info(f"HPU execution time: {end_time - start_time:.4f} seconds") + + logger.info("Running on CPU") + start_time = time.time() + run_model("RN50", "cpu") + end_time = time.time() + logger.info(f"CPU execution time: {end_time - start_time:.4f} seconds") From af0f80c1e647b2ece79ea8cf99b2b3a3343ff410 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Wed, 12 Feb 2025 13:38:36 +0100 Subject: [PATCH 07/10] Add CLIP installation instruction to Dockerfile `pip install -e .` --- Dockerfile.hpu | 3 ++- requirements_hpu.txt | 6 +----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Dockerfile.hpu b/Dockerfile.hpu index fe729c15a..2fa8d3aa2 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -22,4 +22,5 @@ COPY requirements_hpu.txt /workspace/requirements_hpu.txt # Install Python packages 
RUN pip install --upgrade pip \ - && pip install -r requirements_hpu.txt + && pip install -r requirements_hpu.txt \ + && pip install -e . \ No newline at end of file diff --git a/requirements_hpu.txt b/requirements_hpu.txt index 3eefe7c1a..5aba6057c 100644 --- a/requirements_hpu.txt +++ b/requirements_hpu.txt @@ -1,7 +1,3 @@ -r requirements.txt optimum-habana==1.14.1 -transformers==4.45.2 -huggingface-hub==0.26.2 -tiktoken==0.8.0 -torch-geometric==2.6.1 -numba==0.60.0 +pytest From 6d3570d0e2d9aa6af3d88a2f0640daed6fe6b433 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Wed, 12 Feb 2025 14:06:40 +0100 Subject: [PATCH 08/10] Add info on how to run tests to README.md --- README.md | 6 ++++++ tests/test_consistency.py | 1 + 2 files changed, 7 insertions(+) diff --git a/README.md b/README.md index e2f4206f6..abda08854 100644 --- a/README.md +++ b/README.md @@ -221,6 +221,12 @@ docker run -it --runtime=habana clip_hpu:latest You do not need to change the code to leverage Intel® Gaudi® HPU. The `get_device_initial()` function will automatically detect the correct device and return the appropriate device name. So no changes are required. +### Run the Tests + +```bash +pytest +``` +This will run the tests and verify that the model is working correctly. ## See Also diff --git a/tests/test_consistency.py b/tests/test_consistency.py index 371725031..9b72208bf 100644 --- a/tests/test_consistency.py +++ b/tests/test_consistency.py @@ -2,6 +2,7 @@ import pytest import torch from PIL import Image +import habana_frameworks.torch import clip From 07c9771a30d70171a29357d9e959115f92b1749e Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Wed, 12 Feb 2025 14:08:35 +0100 Subject: [PATCH 09/10] Move image IO ops out of time measurements Run model n times and average the runtime --- benchmark.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/benchmark.py b/benchmark.py index c31bbeff5..0b944775e 100644 --- a/benchmark.py +++ b/benchmark.py @@ -1,7 +1,6 @@ import logging import time import numpy as np -import habana_frameworks.torch.core as ht import torch from PIL import Image @@ -11,27 +10,39 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) + def run_model(model_name, device): - model, transform = clip.load(model_name, device=get_device_initial(device), jit=False) + model, transform = clip.load( + model_name, device=get_device_initial(device), jit=False + ) image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) with torch.no_grad(): + start_time = time.perf_counter() + logits_per_image, _ = model(image, text) probs = logits_per_image.softmax(dim=-1).cpu().numpy() - return probs + + end_time = time.perf_counter() + logger.info(f"Execution time: {end_time - start_time:.4f} seconds") + return probs, end_time - start_time + + +def run_n_times(model_name, device, n): + times = [] + logger.info(f"Running {model_name} on {device} {n} times") + for _ in range(n): + logger.info(f"Run {_ + 1} of {n}") + _, time = run_model(model_name, device) + times.append(time) + return np.mean(times) if __name__ == "__main__": - logger.info("Running on HPU") - start_time = time.time() - run_model("RN50", "hpu") - end_time = time.time() - logger.info(f"HPU execution time: {end_time - start_time:.4f} seconds") - - logger.info("Running on CPU") - start_time = time.time() - run_model("RN50", "cpu") - end_time = time.time() - logger.info(f"CPU execution time: {end_time 
- start_time:.4f} seconds") + hpu_time = run_n_times("RN50", "hpu", 10) + cpu_time = run_n_times("RN50", "cpu", 10) + + logger.info(f"HPU time: {hpu_time:.4f} seconds") + logger.info(f"CPU time: {cpu_time:.4f} seconds") From 91532816085ba91ffe111d9bc9083304b60f41d1 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Mon, 24 Feb 2025 17:32:29 +0100 Subject: [PATCH 10/10] fix: Remove benchmark.py --- benchmark.py | 48 ------------------------------------------------ 1 file changed, 48 deletions(-) delete mode 100644 benchmark.py diff --git a/benchmark.py b/benchmark.py deleted file mode 100644 index 0b944775e..000000000 --- a/benchmark.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging -import time -import numpy as np -import torch -from PIL import Image - -import clip -from clip.utils import get_device_initial - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def run_model(model_name, device): - model, transform = clip.load( - model_name, device=get_device_initial(device), jit=False - ) - - image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) - text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) - - with torch.no_grad(): - start_time = time.perf_counter() - - logits_per_image, _ = model(image, text) - probs = logits_per_image.softmax(dim=-1).cpu().numpy() - - end_time = time.perf_counter() - logger.info(f"Execution time: {end_time - start_time:.4f} seconds") - return probs, end_time - start_time - - -def run_n_times(model_name, device, n): - times = [] - logger.info(f"Running {model_name} on {device} {n} times") - for _ in range(n): - logger.info(f"Run {_ + 1} of {n}") - _, time = run_model(model_name, device) - times.append(time) - return np.mean(times) - - -if __name__ == "__main__": - hpu_time = run_n_times("RN50", "hpu", 10) - cpu_time = run_n_times("RN50", "cpu", 10) - - logger.info(f"HPU time: {hpu_time:.4f} seconds") - logger.info(f"CPU time: {cpu_time:.4f} seconds")
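
With benchmark.py removed, an HPU-vs-CPU timing comparison can still be run
ad hoc. A minimal sketch along the lines of the removed script (assumptions:
executed from the repository root so `CLIP.png` is present, `habana_frameworks`
installed for the HPU run, and `RN50` chosen only as an example model):

    import time

    import numpy as np
    import torch
    from PIL import Image

    import clip
    from clip.utils import get_device_initial


    def mean_runtime(model_name: str, preferred_device: str, n: int = 10) -> float:
        # Resolve the device once so the model and inputs end up on the same
        # backend even when the preferred one is unavailable.
        device = get_device_initial(preferred_device)
        model, transform = clip.load(model_name, device=device, jit=False)

        image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device)
        text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

        durations = []
        with torch.no_grad():
            for _ in range(n):
                start = time.perf_counter()
                logits_per_image, _ = model(image, text)
                logits_per_image.softmax(dim=-1).cpu().numpy()  # pull to host so the run is finished
                durations.append(time.perf_counter() - start)
        return float(np.mean(durations))


    if __name__ == "__main__":
        print(f"HPU mean over 10 runs: {mean_runtime('RN50', 'hpu'):.4f} s")
        print(f"CPU mean over 10 runs: {mean_runtime('RN50', 'cpu'):.4f} s")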