
Commit 0e97f10

feat: add support for Metal (#120)
1 parent a595bdb commit 0e97f10

6 files changed, 65 insertions(+), 10 deletions(-)

README.md

Lines changed: 2 additions & 1 deletion

@@ -45,6 +45,7 @@ classification models. TEI enables high-performance extraction for the most popu
 Ember, GTE and E5. TEI implements many features such as:

 * No model graph compilation step
+* Metal support for local execution on Macs
 * Small docker images and fast boot times. Get ready for true serverless!
 * Token based dynamic batching
 * Optimized transformers code for inference using [Flash Attention](https://github.com/HazyResearch/flash-attention),

@@ -372,7 +373,7 @@ Then run:
 # On x86
 cargo install --path router -F candle -F mkl
 # On M1 or M2
-cargo install --path router -F candle -F accelerate
+cargo install --path router -F candle -F metal
 ```

 You can now launch Text Embeddings Inference on CPU with:

backends/candle/src/alibi.rs

Lines changed: 10 additions & 5 deletions
@@ -52,8 +52,9 @@ pub fn build_alibi_tensor(
     device: &Device,
     dtype: DType,
 ) -> Result<Tensor> {
-    let context_positions = Tensor::arange(0.0, num_positions as f64, device)?.unsqueeze(1)?;
-    let memory_positions = Tensor::arange(0.0, num_positions as f64, device)?.unsqueeze(0)?;
+    let context_positions =
+        Tensor::arange(0.0, num_positions as f64, &Device::Cpu)?.unsqueeze(1)?;
+    let memory_positions = Tensor::arange(0.0, num_positions as f64, &Device::Cpu)?.unsqueeze(0)?;

     let relative_positions = memory_positions.broadcast_sub(&context_positions)?.abs()?;
     // [num_heads, num_positions, num_positions]

@@ -63,13 +64,17 @@ pub fn build_alibi_tensor(
         .expand((num_heads, num_positions, num_positions))?;

     // [num_heads, 1, 1]
-    let slopes =
-        (Tensor::from_vec(alibi_head_slopes(num_heads), (num_heads, 1, 1), device)? * -1_f64)?;
+    let slopes = (Tensor::from_vec(
+        alibi_head_slopes(num_heads),
+        (num_heads, 1, 1),
+        &Device::Cpu,
+    )? * -1_f64)?;

     // [num_heads, num_positions, num_positions]
     let alibi = relative_positions.broadcast_mul(&slopes)?;

     alibi
         .reshape((1, num_heads, num_positions, num_positions))?
-        .to_dtype(dtype)
+        .to_dtype(dtype)?
+        .to_device(device)
 }
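
The recurring pattern in this hunk is to build the helper tensors (positions and ALiBi slopes) on the CPU and only move the finished bias to the requested device at the end, presumably because the Metal backend does not yet implement every op used during construction. A minimal sketch of that pattern, assuming the upstream `candle_core` crate name; the function below is illustrative only, not part of this commit:

```rust
use candle_core::{DType, Device, Result, Tensor};

/// Build an |i - j| position-difference matrix on the CPU, then transfer it
/// to the target device (e.g. Metal) once it is fully constructed.
fn relative_positions(n: usize, device: &Device, dtype: DType) -> Result<Tensor> {
    // arange and the broadcast ops all run on the CPU here
    let context = Tensor::arange(0.0, n as f64, &Device::Cpu)?.unsqueeze(1)?;
    let memory = Tensor::arange(0.0, n as f64, &Device::Cpu)?.unsqueeze(0)?;
    memory
        .broadcast_sub(&context)?
        .abs()?
        // only the finished tensor is cast and moved to the requested device
        .to_dtype(dtype)?
        .to_device(device)
}
```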

backends/candle/src/models/bert.rs

Lines changed: 2 additions & 2 deletions
@@ -506,7 +506,7 @@ impl BertModel {
                     input_ids.push(batch.input_ids[j]);
                     type_ids.push(batch.token_type_ids[j]);
                     position_ids.push(batch.position_ids[j]);
-                    attention_mask.push(1.0);
+                    attention_mask.push(1.0_f32);
                     attention_bias.push(0.0);
                 }

@@ -519,7 +519,7 @@ impl BertModel {
                         input_ids.push(0);
                         type_ids.push(0);
                         position_ids.push(0);
-                        attention_mask.push(0.0);
+                        attention_mask.push(0.0_f32);
                         attention_bias.push(f32::NEG_INFINITY);
                     }
                 }

backends/candle/src/models/jina.rs

Lines changed: 2 additions & 2 deletions
@@ -440,7 +440,7 @@ impl JinaBertModel {
                     input_ids.push(batch.input_ids[j]);
                     type_ids.push(batch.token_type_ids[j]);
                     position_ids.push(batch.position_ids[j]);
-                    attention_mask.push(1.0);
+                    attention_mask.push(1.0_f32);
                     attention_bias.push(0.0);
                 }

@@ -453,7 +453,7 @@ impl JinaBertModel {
                         input_ids.push(0);
                         type_ids.push(0);
                         position_ids.push(0);
-                        attention_mask.push(0.0);
+                        attention_mask.push(0.0_f32);
                         attention_bias.push(f32::NEG_INFINITY);
                     }
                 }
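
The `bert.rs` and `jina.rs` hunks make the same small change: the attention-mask literals get an explicit `_f32` suffix. A plausible reading (not stated in the commit) is that without the suffix Rust's inference defaults the mask vector to `f64`, and the resulting F64 tensor is not handled by the Metal backend; the suffix pins the dtype to F32 up front. A tiny, hypothetical illustration of that inference difference:

```rust
fn main() {
    // With no suffix and no other constraint, Rust defaults float literals to f64.
    let default_mask = vec![1.0, 0.0];
    // The `_f32` suffix pins the element type to f32 instead.
    let pinned_mask = vec![1.0_f32, 0.0_f32];

    // Prints 8 vs 4: bytes per element for f64 vs f32.
    println!(
        "{} vs {}",
        std::mem::size_of_val(&default_mask[0]),
        std::mem::size_of_val(&pinned_mask[0])
    );
}
```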

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,8 @@
 - sections:
   - local: local_cpu
     title: Using TEI locally with CPU
+  - local: local_metal
+    title: Using TEI locally with Metal
   - local: local_gpu
     title: Using TEI locally with GPU
   - local: private_models

docs/source/en/local_metal.md

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Using TEI locally with Metal
+
+You can install `text-embeddings-inference` locally to run it on your own Mac with Metal support.
+Here are the step-by-step instructions for installation:
+
+## Step 1: Install Rust
+
+[Install Rust](https://rustup.rs/) on your machine by running the following in your terminal, then following the instructions:
+
+```shell
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+```
+
+## Step 2: Install with Metal support
+
+```shell
+cargo install --path router -F candle -F metal
+```
+
+## Step 3: Launch Text Embeddings Inference
+
+Once the installation has successfully completed, you can launch Text Embeddings Inference with Metal with the following command:
+
+```shell
+model=BAAI/bge-large-en-v1.5
+revision=refs/pr/5
+
+text-embeddings-router --model-id $model --revision $revision --port 8080
+```
+
+Now you are ready to use `text-embeddings-inference` locally on your machine.
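
Once the router is up, a quick way to check the Metal build end-to-end is to call the `/embed` route documented in the TEI README; a minimal sketch, assuming the server is listening on port 8080 as launched above:

```shell
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs": "What is Deep Learning?"}' \
    -H 'Content-Type: application/json'
```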
