Commit 549eacb

Add initial support for imatrix quantization (#3193)

1 parent: ab56dfe
13 files changed: +1426, -45 lines

candle-core/src/cuda_backend/device.rs (12 additions, 0 deletions)

```diff
@@ -94,6 +94,18 @@ impl CudaDevice {
         self.stream.memcpy_dtod(src, dst).w()
     }

+    pub fn memcpy_dtoh<
+        T: cudarc::driver::DeviceRepr,
+        Src: cudarc::driver::DevicePtr<T>,
+        Dst: cudarc::driver::HostSlice<T>,
+    >(
+        &self,
+        src: &Src,
+        dst: &mut Dst,
+    ) -> Result<()> {
+        self.stream.memcpy_dtoh(src, dst).w()
+    }
+
     pub fn memcpy_stod<
         T: cudarc::driver::DeviceRepr,
         Src: cudarc::driver::HostSlice<T> + ?Sized,
```
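The new `memcpy_dtoh` mirrors the existing `memcpy_stod` helper in the opposite direction. A minimal usage sketch, assuming a `CudaDevice` binding named `dev` inside a function returning `Result<()>` (the bindings are illustrative, not part of the commit):

```rust
// Round-trip a host buffer through the device using both helpers.
let host: Vec<f32> = vec![1.0, 2.0, 3.0];
let device_buf = dev.memcpy_stod(&host)?; // host -> device (pre-existing helper)
let mut back = vec![0f32; host.len()];
dev.memcpy_dtoh(&device_buf, &mut back)?; // device -> host (added in this commit)
assert_eq!(host, back);
```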

candle-core/src/metal_backend/device.rs (3 additions, 3 deletions)

```diff
@@ -125,7 +125,7 @@ impl MetalDevice {
     }

     pub fn command_encoder(&self) -> Result<ComputeCommandEncoder> {
-        let mut commands = self.commands.write().map_err(MetalError::from)?;
+        let commands = self.commands.write().map_err(MetalError::from)?;
         let (flush, command_encoder) = commands.command_encoder().map_err(MetalError::from)?;
         if flush {
             self.drop_unused_buffers()?
@@ -134,7 +134,7 @@ impl MetalDevice {
     }

     pub fn blit_command_encoder(&self) -> Result<BlitCommandEncoder> {
-        let mut commands = self.commands.write().map_err(MetalError::from)?;
+        let commands = self.commands.write().map_err(MetalError::from)?;
         let (flush, command_encoder) = commands.blit_command_encoder().map_err(MetalError::from)?;
         if flush {
             self.drop_unused_buffers()?
@@ -143,7 +143,7 @@ impl MetalDevice {
     }

     pub fn wait_until_completed(&self) -> Result<()> {
-        let mut commands = self.commands.write().map_err(MetalError::from)?;
+        let commands = self.commands.write().map_err(MetalError::from)?;
         commands.wait_until_completed().map_err(MetalError::from)?;
         Ok(())
     }
```
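The only change here is dropping `mut` from the write-guard bindings. In Rust that is purely about how the binding is used: a guard needs `mut` only if something calls `&mut self` methods through it. A standalone sketch of the pattern with stand-in types (not the Metal backend's actual ones):

```rust
use std::sync::RwLock;

struct Commands(u32);

impl Commands {
    // Takes &self, so a non-mut guard binding suffices at the call site.
    fn command_encoder(&self) -> u32 {
        self.0
    }
}

fn demo(lock: &RwLock<Commands>) -> u32 {
    // No `mut` needed: the guard is never used through `&mut self`.
    let commands = lock.write().expect("lock poisoned");
    commands.command_encoder()
}
```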

candle-core/src/quantized/cuda.rs (143 additions, 22 deletions)

```diff
@@ -46,24 +46,57 @@ fn pad(p: usize, q: usize) -> usize {
 fn quantize_q8_1(
     src: &CudaView<f32>,
     dst: &mut CudaSlice<u8>,
-    elem_count: usize,
+    k: usize,
     ky: usize,
     dev: &CudaDevice,
 ) -> Result<()> {
-    let kx = elem_count;
-    let kx_padded = pad(kx, MATRIX_ROW_PADDING);
+    let kx_padded = pad(k, MATRIX_ROW_PADDING);
     let num_blocks = ceil_div(kx_padded, CUDA_QUANTIZE_BLOCK_SIZE);
+
+    let total_rows = ky;
+    // Get Q8_1 metadata.
+    let q8_1_block_size = GgmlDType::Q8_1.block_size();
+    let q8_1_type_size = GgmlDType::Q8_1.type_size();
+
+    // Calculate the size of the output buffer in bytes.
+    let num_blocks_per_row = kx_padded / q8_1_block_size;
+    let dst_row_size_bytes = num_blocks_per_row * q8_1_type_size;
+
+    const CHUNK_SIZE: usize = 65535; // gridDim.y limit
     let func = dev.get_or_load_func("quantize_q8_1", &candle_kernels::QUANTIZED)?;
-    let cfg = cudarc::driver::LaunchConfig {
-        grid_dim: (num_blocks as u32, ky as u32, 1),
-        block_dim: (CUDA_QUANTIZE_BLOCK_SIZE as u32, 1, 1),
-        shared_mem_bytes: 0,
-    };
-    let mut builder = func.builder();
-    builder.arg(src);
-    builder.arg(dst);
-    barg!(builder, kx as i32, kx_padded as i32);
-    unsafe { builder.launch(cfg) }.w()?;
+
+    let mut rows_processed = 0;
+    while rows_processed < total_rows {
+        // --- calculate the number of rows for this chunk ---
+        let remaining_rows = total_rows - rows_processed;
+        // This is our gridDim.y, now <= 65535
+        let rows_in_chunk = std::cmp::min(CHUNK_SIZE, remaining_rows);
+
+        // --- slice the source (f32) tensor by elements ---
+        let src_start_elem = rows_processed * k;
+        let src_num_elems = rows_in_chunk * k;
+        let src_chunk = src.slice(src_start_elem..(src_start_elem + src_num_elems));
+
+        // --- slice the destination (u8) tensor by bytes ---
+        let dst_start_byte = rows_processed * dst_row_size_bytes;
+        let dst_num_bytes = rows_in_chunk * dst_row_size_bytes;
+        let dst_chunk = dst.slice(dst_start_byte..(dst_start_byte + dst_num_bytes));
+
+        let cfg = cudarc::driver::LaunchConfig {
+            grid_dim: (num_blocks as u32, rows_in_chunk as u32, 1),
+            block_dim: (CUDA_QUANTIZE_BLOCK_SIZE as u32, 1, 1),
+            shared_mem_bytes: 0,
+        };
+
+        let mut builder = func.builder();
+        builder.arg(&src_chunk);
+        builder.arg(&dst_chunk);
+        barg!(builder, k as i32, kx_padded as i32);
+        unsafe { builder.launch(cfg) }.w()?;
+
+        rows_processed += rows_in_chunk;
+    }
+
     Ok(())
 }
```
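The rewrite above exists because CUDA caps `gridDim.y` at 65535, so quantizing a tensor with more rows than that in a single launch would fail. A standalone sketch of just the chunking arithmetic (names are illustrative; `row_chunks` is not a function in the commit):

```rust
const CHUNK_SIZE: usize = 65535; // CUDA gridDim.y limit

/// Split `total_rows` into (first_row, row_count) chunks of at most CHUNK_SIZE rows.
fn row_chunks(total_rows: usize) -> Vec<(usize, usize)> {
    let mut chunks = Vec::new();
    let mut rows_processed = 0;
    while rows_processed < total_rows {
        let rows_in_chunk = CHUNK_SIZE.min(total_rows - rows_processed);
        chunks.push((rows_processed, rows_in_chunk));
        rows_processed += rows_in_chunk;
    }
    chunks
}

fn main() {
    // 150_000 rows need three launches: 65535 + 65535 + 18930 rows.
    assert_eq!(
        row_chunks(150_000),
        vec![(0, 65535), (65535, 65535), (131070, 18930)]
    );
}
```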

```diff
@@ -477,6 +510,87 @@ impl QCudaStorage {
         Ok(())
     }

+    pub fn quantize_imatrix(
+        &mut self,
+        src: &CudaStorage,
+        imatrix_weights: &[f32],
+        n_per_row: usize,
+    ) -> Result<()> {
+        // Run the quantization on cpu.
+        let src = match &src.slice {
+            crate::cuda_backend::CudaStorageSlice::F32(data) => self.device.memcpy_dtov(data)?,
+            _ => crate::bail!("only f32 can be quantized"),
+        };
+        let src_len = src.len();
+        let src = crate::Storage::Cpu(crate::CpuStorage::F32(src));
+        let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
+        qcpu_storage.quantize_imatrix(&src, imatrix_weights, n_per_row)?;
+        let data = qcpu_storage.data()?;
+        let padded_len =
+            data.len() + MATRIX_ROW_PADDING * self.dtype.type_size() / self.dtype.block_size();
+        let mut inner = unsafe { self.device.alloc::<u8>(padded_len)? };
+        self.device
+            .memcpy_htod(data.as_ref(), &mut inner.slice_mut(..data.len()))?;
+        self.data = PaddedCudaSlice {
+            inner,
+            len: data.len(),
+        };
+        Ok(())
+    }
+
+    pub fn quantize_imatrix_onto(
+        &mut self,
+        src: &crate::CpuStorage,
+        imatrix_weights: &[f32],
+        n_per_row: usize,
+    ) -> Result<()> {
+        // Run the quantization on cpu.
+        let src_len = src.as_slice::<f32>()?.len();
+        let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
+
+        if let QStorage::Cpu(storage) = &mut qcpu_storage {
+            storage.from_float_imatrix(src.as_slice::<f32>()?, imatrix_weights, n_per_row);
+        } else {
+            unreachable!()
+        }
+
+        let data = qcpu_storage.data()?;
+        let padded_len =
+            data.len() + MATRIX_ROW_PADDING * self.dtype.type_size() / self.dtype.block_size();
+        let mut inner = unsafe { self.device.alloc::<u8>(padded_len)? };
+        self.device
+            .memcpy_htod(data.as_ref(), &mut inner.slice_mut(..data.len()))?;
+        self.data = PaddedCudaSlice {
+            inner,
+            len: data.len(),
+        };
+        Ok(())
+    }
+
+    pub fn quantize_onto(&mut self, src: &crate::CpuStorage) -> Result<()> {
+        // Run the quantization on cpu.
+        let src_len = src.as_slice::<f32>()?.len();
+        let mut qcpu_storage = crate::Device::Cpu.qzeros(src_len, self.dtype)?;
+
+        if let QStorage::Cpu(storage) = &mut qcpu_storage {
+            storage.from_float(src.as_slice::<f32>()?);
+        } else {
+            unreachable!()
+        }
+
+        let data = qcpu_storage.data()?;
+        let padded_len =
+            data.len() + MATRIX_ROW_PADDING * self.dtype.type_size() / self.dtype.block_size();
+        let mut inner = unsafe { self.device.alloc::<u8>(padded_len)? };
+        self.device
+            .memcpy_htod(data.as_ref(), &mut inner.slice_mut(..data.len()))?;
+        self.data = PaddedCudaSlice {
+            inner,
+            len: data.len(),
+        };
+        Ok(())
+    }
+
     pub fn storage_size_in_bytes(&self) -> usize {
         self.data.len
     }
```
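All three new methods share the same `padded_len` computation: the device buffer reserves room for `MATRIX_ROW_PADDING` extra elements, expressed as extra quantized blocks. A worked sketch of that arithmetic with Q4_0 numbers plugged in (block_size = 32 elements, type_size = 18 bytes; the `MATRIX_ROW_PADDING` value of 512 is assumed here for illustration):

```rust
const MATRIX_ROW_PADDING: usize = 512; // assumed value, for illustration

fn main() {
    // Q4_0: 32 elements per block, 18 bytes per block (f16 scale + 16 bytes of nibbles).
    let (block_size, type_size) = (32usize, 18usize);
    let data_len = 1_000_000usize; // quantized payload in bytes
    // 512 padding elements -> 512 / 32 = 16 blocks -> 16 * 18 = 288 extra bytes.
    let padded_len = data_len + MATRIX_ROW_PADDING * type_size / block_size;
    assert_eq!(padded_len, 1_000_288);
}
```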
```diff
@@ -503,6 +617,13 @@ impl QCudaStorage {
             self.dequantize_matmul(self_shape, storage, layout)
         }
     }
+
+    pub fn data(&self) -> Result<Vec<u8>> {
+        let mut out = vec![0u8; self.data.len];
+        self.device
+            .memcpy_dtoh(&self.data.inner.slice(..self.data.len), &mut out)?;
+        Ok(out)
+    }
 }

 impl QCudaStorage {
```
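A quick sketch of what the new accessor makes possible, e.g. pulling the quantized bytes back to the host for serialization (`qstorage` is an illustrative `QCudaStorage` binding, not from the diff):

```rust
// Copy the quantized payload (without the row padding) back to host memory.
let bytes: Vec<u8> = qstorage.data()?;
// data() returns exactly self.data.len bytes, matching storage_size_in_bytes().
assert_eq!(bytes.len(), qstorage.storage_size_in_bytes());
```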
```diff
@@ -629,7 +750,7 @@ mod test {
         let mut y_q8_1 = unsafe { dev.alloc::<u8>(y_size_in_bytes)? };
         let vs: Vec<f32> = (0..el).map(|v| v as f32).collect();
         let y = dev.memcpy_stod(&vs)?;
-        quantize_q8_1(&y.slice(..), &mut y_q8_1, el, 1, &dev)?;
+        quantize_q8_1(&y.as_view(), &mut y_q8_1, el, 1, &dev)?;
         Ok(())
     }
@@ -643,30 +764,30 @@ mod test {
         xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
         let cuda_storage = mul_mat_vec_via_q8_1(
             &xs.data,
-            &y.slice(..),
+            &y.as_view(),
             /* dtype */ GgmlDType::Q4_0,
             /* ncols */ ncols,
             /* nrows */ 1,
             /* b_size */ 1,
             &dev,
         )?;
         let vs = cuda_storage.as_cuda_slice::<f32>()?;
-        let vs = dev.memcpy_dtov(&vs.slice(..))?;
+        let vs = dev.memcpy_dtov(&vs.as_view())?;
         assert_eq!(vs.len(), 1);
         // for n = 255, n.(n+1).(2n+1) / 6 = 5559680
         // Q8 means 1/256 precision.
         assert_eq!(vs[0], 5561664.5);

         let cuda_storage = dequantize_mul_mat_vec(
             &xs.data,
-            &y.slice(..),
+            &y.as_view(),
             /* dtype */ GgmlDType::Q4_0,
             /* ncols */ ncols,
             /* nrows */ 1,
             &dev,
         )?;
         let vs = cuda_storage.as_cuda_slice::<f32>()?;
-        let vs = dev.memcpy_dtov(&vs.slice(..))?;
+        let vs = dev.memcpy_dtov(&vs.as_view())?;
         assert_eq!(vs.len(), 1);
         assert_eq!(vs[0], 5561851.0);
         Ok(())
@@ -682,7 +803,7 @@ mod test {
         xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
         let cuda_storage = mul_mat_via_q8_1(
             &xs.data,
-            &y.slice(..),
+            &y.as_view(),
             /* dtype */ GgmlDType::Q4_0,
             /* x_rows */ 4,
             /* x_cols */ ncols,
@@ -691,7 +812,7 @@ mod test {
             &dev,
         )?;
         let vs = cuda_storage.as_cuda_slice::<f32>()?;
-        let vs = dev.memcpy_dtov(&vs.slice(..))?;
+        let vs = dev.memcpy_dtov(&vs.as_view())?;

         /*
         x = torch.tensor([float(v) for v in range(1024)]).reshape(4, 256)
@@ -723,7 +844,7 @@ mod test {
         xs.quantize(&CudaStorage::wrap_cuda_slice(y.clone(), dev.clone()))?;
         let cuda_storage = mul_mat_via_q8_1(
             &xs.data,
-            &y.slice(..),
+            &y.as_view(),
             /* dtype */ GgmlDType::Q4_0,
             /* x_rows */ x_rows,
             /* x_cols */ ncols,
@@ -732,7 +853,7 @@ mod test {
             &dev,
        )?;
         let vs = cuda_storage.as_cuda_slice::<f32>()?;
-        let _vs = dev.memcpy_dtov(&vs.slice(..))?;
+        let _vs = dev.memcpy_dtov(&vs.as_view())?;
         Ok(())
     }
 }
```
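The test churn above is a mechanical API migration: full-range `slice(..)` borrows become `as_view()` calls. A minimal before/after sketch, assuming the cudarc helpers already used in these tests:

```rust
// Before: dev.memcpy_dtov(&y.slice(..))?
// After:  dev.memcpy_dtov(&y.as_view())?
let y = dev.memcpy_stod(&vec![0f32; 256])?; // CudaSlice<f32>
let host: Vec<f32> = dev.memcpy_dtov(&y.as_view())?; // borrow the whole slice as a view
assert_eq!(host.len(), 256);
```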

candle-core/src/quantized/dummy_cuda.rs (26 additions, 0 deletions)

```diff
@@ -32,6 +32,28 @@ impl QCudaStorage {
         Err(Error::NotCompiledWithCudaSupport)
     }

+    pub fn quantize_imatrix(
+        &mut self,
+        _src: &CudaStorage,
+        _imatrix_weights: &[f32],
+        _n_per_row: usize,
+    ) -> Result<()> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub fn quantize_imatrix_onto(
+        &mut self,
+        _src: &crate::CpuStorage,
+        _imatrix_weights: &[f32],
+        _n_per_row: usize,
+    ) -> Result<()> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
+    pub fn quantize_onto(&mut self, _src: &crate::CpuStorage) -> Result<()> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+
     pub fn storage_size_in_bytes(&self) -> usize {
         0
     }
@@ -44,6 +66,10 @@ impl QCudaStorage {
     ) -> Result<(CudaStorage, crate::Shape)> {
         Err(Error::NotCompiledWithCudaSupport)
     }
+
+    pub fn data(&self) -> Result<Vec<u8>> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
 }

 pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
```

candle-core/src/quantized/dummy_metal.rs (26 additions, 0 deletions)

```diff
@@ -28,6 +28,28 @@ impl QMetalStorage {
         Err(Error::NotCompiledWithMetalSupport)
     }

+    pub fn quantize_imatrix(
+        &mut self,
+        _src: &MetalStorage,
+        _imatrix_weights: &[f32],
+        _n_per_row: usize,
+    ) -> Result<()> {
+        Err(Error::NotCompiledWithMetalSupport)
+    }
+
+    pub fn quantize_imatrix_onto(
+        &mut self,
+        _src: &crate::CpuStorage,
+        _imatrix_weights: &[f32],
+        _n_per_row: usize,
+    ) -> Result<()> {
+        Err(Error::NotCompiledWithMetalSupport)
+    }
+
+    pub fn quantize_onto(&mut self, _src: &crate::CpuStorage) -> Result<()> {
+        Err(Error::NotCompiledWithMetalSupport)
+    }
+
     pub fn storage_size_in_bytes(&self) -> usize {
         0
     }
@@ -40,6 +62,10 @@ impl QMetalStorage {
     ) -> Result<(MetalStorage, crate::Shape)> {
         Err(Error::NotCompiledWithMetalSupport)
     }
+
+    pub fn data(&self) -> Result<Vec<u8>> {
+        Err(Error::NotCompiledWithMetalSupport)
+    }
 }

 pub fn load_quantized<T: super::GgmlType + Send + Sync + 'static>(
```

candle-core/src/quantized/ggml_file.rs (1 addition, 1 deletion)

```diff
@@ -134,7 +134,7 @@ fn from_raw_data<T: super::GgmlType + Send + Sync + 'static>(
     super::QTensor::new(data, dims)
 }

-/// Creates a Tensor from a raw GGML tensor.
+/// Creates a [Tensor] from a raw GGML tensor.
 pub fn qtensor_from_ggml(
     ggml_dtype: GgmlDType,
     raw_data: &[u8],
```

candle-core/src/quantized/gguf_file.rs (1 addition, 0 deletions)

```diff
@@ -1,5 +1,6 @@
 //! Support for the [GGUF file format](https://github.com/philpax/ggml/blob/gguf-spec/docs/gguf.md).
 //!
+//! Spec: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md

 use super::{GgmlDType, QTensor};
 use crate::{Context, Device, Result};
```
