3 changes: 3 additions & 0 deletions Cargo.toml
@@ -3,6 +3,7 @@ members = [
"candle-core",
"candle-datasets",
"candle-examples",
"macros/*",
"candle-nn",
"candle-pyo3",
"candle-transformers",
@@ -35,6 +36,8 @@ accelerate-src = { version = "0.3.2" }
anyhow = { version = "1", features = ["backtrace"] }
byteorder = "1.4.3"
candle = { path = "./candle-core", package = "candle-core", version = "0.9.2-alpha.1" }
candle-macros = { path = "./macros/candle-macros", version = "0.9.2-alpha.1" }
candle-macros-types = { path = "./macros/candle-macros-types", version = "0.9.2-alpha.1" }
candle-datasets = { path = "./candle-datasets", version = "0.9.2-alpha.1" }
candle-flash-attn = { path = "./candle-flash-attn", version = "0.9.2-alpha.1" }
candle-flash-attn-v3 = { path = "./candle-flash-attn-v3", version = "0.9.2-alpha.1" }
5 changes: 4 additions & 1 deletion candle-core/Cargo.toml
@@ -14,6 +14,8 @@ accelerate-src = { workspace = true, optional = true }
byteorder = { workspace = true }
candle-kernels = { workspace = true, optional = true }
candle-metal-kernels = { workspace = true, optional = true }
candle-macros = { workspace = true }
candle-macros-types = { workspace = true }
objc2-metal = { workspace = true, optional = true }
objc2-foundation = { workspace = true, optional = true }
cudarc = { workspace = true, optional = true }
@@ -46,7 +48,7 @@ criterion = { workspace = true }

[features]
default = []
-cuda = ["cudarc", "dep:candle-kernels", "dep:ug-cuda", "float8/cuda"]
+cuda = ["cudarc", "dep:candle-kernels", "dep:ug-cuda", "float8/cuda", "candle-macros-types/cuda"]
cudnn = ["cuda", "cudarc/cudnn"]
nccl = ["cuda", "cudarc/nccl"]
mkl = ["dep:libc", "dep:intel-mkl-src"]
@@ -56,6 +58,7 @@ metal = [
"dep:objc2-foundation",
"dep:candle-metal-kernels",
"dep:ug-metal",
"candle-macros-types/metal",
]

[[bench]]
151 changes: 151 additions & 0 deletions candle-core/examples/quantize_basics.rs
@@ -0,0 +1,151 @@
//! Quantization examples: demonstrating GGML quantization types and operations
//!
//! Run: `cargo run --example quantize_basics --release [--features cuda]`
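//! The CUDA section only compiles and runs when the `cuda` feature is enabled.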

use candle_core::{DType, Device, QuantizedDType, Result, Tensor};

fn main() -> Result<()> {
    println!("=== Candle Quantization Demo ===\n");

    #[cfg(feature = "cuda")]
    let cuda_device = Device::new_cuda(0).ok();
    #[cfg(not(feature = "cuda"))]
    let _cuda_device: Option<Device> = None;

    // Create test tensor
    let data: Vec<f32> = (0..256).map(|i| (i as f32) * 0.1).collect();
    let tensor = Tensor::from_slice(&data, 256, &Device::Cpu)?;

    // Test quantization types
    let qtypes = [
        ("Q2K", QuantizedDType::GgmlQ2K, 2.6),
        ("Q4K", QuantizedDType::GgmlQ4K, 4.0),
        ("Q8_0", QuantizedDType::GgmlQ8_0, 8.0),
    ];
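    // The bits-per-weight figures above are nominal; GGML block formats also
    // store per-block scale metadata, so effective storage is slightly higher.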

println!("1. Accuracy Comparison\n");
println!("Type | Bits | Error");
println!("------+------+-------");

let original_data = tensor.to_vec1::<f32>()?;

for (name, qtype, bits) in &qtypes {
let quantized = tensor.to_dtype(DType::Quantized(*qtype))?;
let recovered = quantized.to_dtype(DType::F32)?.to_vec1::<f32>()?;
let error = original_data
.iter()
.zip(&recovered)
.map(|(a, b)| (a - b).abs())
.sum::<f32>()
/ original_data.len() as f32;
println!("{:<5} | {:>4.1} | {:.4}", name, bits, error);
}

    // Quantized operations
    println!("\n2. Quantized Operations (CPU)\n");

    // Addition (Q8_0 block size = 32)
    let a = Tensor::from_slice(
        &(0..32).map(|i| i as f32).collect::<Vec<_>>(),
        32,
        &Device::Cpu,
    )?;
    let b = Tensor::from_slice(
        &(0..32).map(|i| (i as f32) * 0.5).collect::<Vec<_>>(),
        32,
        &Device::Cpu,
    )?;
    let a_q = a.to_dtype(DType::Quantized(QuantizedDType::GgmlQ8_0))?;
    let b_q = b.to_dtype(DType::Quantized(QuantizedDType::GgmlQ8_0))?;
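    // Arithmetic works directly on quantized tensors; how the backend evaluates
    // this (e.g. by dequantizing internally) is an implementation detail.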
    let sum = (&a_q + &b_q)?;
    let sum_result = sum.to_vec1::<f32>()?;
    println!(
        "✓ Add: Q8_0[32] + Q8_0[32] → [{:.1}, {:.1}, {:.1}, ...]",
        sum_result[0], sum_result[1], sum_result[2]
    );

    // Multiplication
    let mul = (&a_q * &b_q)?;
    let mul_result = mul.to_vec1::<f32>()?;
    println!(
        "✓ Mul: Q8_0[32] * Q8_0[32] → [{:.1}, {:.1}, {:.1}, ...]",
        mul_result[0], mul_result[1], mul_result[2]
    );

    // MatMul
    let weights = Tensor::from_slice(
        &(0..256 * 64)
            .map(|i| (i as f32) * 0.001)
            .collect::<Vec<_>>(),
        (256, 64),
        &Device::Cpu,
    )?;
    let activations = Tensor::from_slice(
        &(0..256).map(|i| (i as f32) * 0.01).collect::<Vec<_>>(),
        (1, 256),
        &Device::Cpu,
    )?;
    let weights_q4k = weights.to_dtype(DType::Quantized(QuantizedDType::GgmlQ4K))?;
    let result = activations.matmul(&weights_q4k)?;
    println!("✓ MatMul: F32[1,256] × Q4K[256,64] → {:?}", result.shape());

    // CUDA examples
    #[cfg(feature = "cuda")]
    if let Some(ref dev) = cuda_device {
        if let Err(e) = cuda_examples(dev) {
            println!("⚠ CUDA error: {}", e);
        }
    }

    Ok(())
}

#[cfg(feature = "cuda")]
fn cuda_examples(device: &Device) -> Result<()> {
    println!("\n3. CUDA GPU Acceleration\n");

    // Roundtrip test
    let data: Vec<f32> = (0..256).map(|i| (i as f32) * 0.05).collect();
    let gpu_tensor = Tensor::from_slice(&data, 256, device)?;
    let gpu_q4k = gpu_tensor.to_dtype(DType::Quantized(QuantizedDType::GgmlQ4K))?;
    let gpu_deq = gpu_q4k.to_dtype(DType::F32)?;
    let result = gpu_deq.to_device(&Device::Cpu)?.to_vec1::<f32>()?;
    let error = data
        .iter()
        .zip(&result)
        .map(|(a, b)| (a - b).abs())
        .sum::<f32>()
        / data.len() as f32;
    println!("✓ Q4K roundtrip error: {:.6}", error);

    // GPU matmul (Q8_0 weights)
    let weights = Tensor::from_slice(
        &(0..128 * 64)
            .map(|i| (i as f32) * 0.001)
            .collect::<Vec<_>>(),
        (128, 64),
        device,
    )?;
    let activations = Tensor::from_slice(
        &(0..128).map(|i| (i as f32) * 0.01).collect::<Vec<_>>(),
        (1, 128),
        device,
    )?;
    let weights_q8_0 = weights.to_dtype(DType::Quantized(QuantizedDType::GgmlQ8_0))?;
    let result = activations.matmul(&weights_q8_0)?;

    let result_cpu = result.to_device(&Device::Cpu)?;
    let result_vec = result_cpu.to_vec2::<f32>()?;
    println!(
        "✓ GPU MatMul: {:?} → {:?}",
        activations.shape(),
        result_cpu.shape()
    );
    println!(
        " Result sample: [{:.4}, ..., {:.4}]",
        result_vec[0][0],
        result_vec[0][result_vec[0].len() - 1]
    );

    Ok(())
}
4 changes: 4 additions & 0 deletions candle-core/src/convert.rs
@@ -145,6 +145,10 @@ impl Tensor {
                    f.write_u8(v.to_bits())?
                }
            }
            DType::Quantized(_) => {
                // TODO: Implement write_bytes for quantized types
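                // (Callers can convert to F32 first, e.g. via
                // tensor.to_dtype(DType::F32)?, to get a serializable representation.)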
                crate::bail!("write_bytes not supported for quantized types")
            }
        }
        Ok(())
    }