diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml
index f923bd0f..8fed7a37 100644
--- a/candle-binding/Cargo.toml
+++ b/candle-binding/Cargo.toml
@@ -33,6 +33,9 @@ tracing = "0.1.37"
 libc = "0.2.147"
 lazy_static = "1.4.0"
 rand = "0.8.5"
+# Performance optimization: parallel processing and lock-free initialization
+rayon = "1.8"
+once_cell = "1.19"
 
 [dev-dependencies]
 rstest = "0.18"
diff --git a/candle-binding/src/classifiers/lora/intent_lora.rs b/candle-binding/src/classifiers/lora/intent_lora.rs
index e63ac973..6da64a9a 100644
--- a/candle-binding/src/classifiers/lora/intent_lora.rs
+++ b/candle-binding/src/classifiers/lora/intent_lora.rs
@@ -113,11 +113,18 @@ impl IntentLoRAClassifier {
         })
     }
 
-    /// Parallel classification for multiple texts
+    /// Parallel classification for multiple texts using rayon
+    ///
+    /// # Performance
+    /// - Uses rayon for parallel processing across available CPU cores
+    /// - Efficient for batch sizes > 10
+    /// - No lock contention during inference
     pub fn parallel_classify(&self, texts: &[&str]) -> Result<Vec<IntentResult>> {
-        // Process each text using real model inference
+        use rayon::prelude::*;
+
+        // Process each text using real model inference in parallel
         texts
-            .iter()
+            .par_iter()
             .map(|text| self.classify_intent(text))
             .collect()
     }
diff --git a/candle-binding/src/classifiers/lora/mod.rs b/candle-binding/src/classifiers/lora/mod.rs
index 40bf6772..cff86bfa 100644
--- a/candle-binding/src/classifiers/lora/mod.rs
+++ b/candle-binding/src/classifiers/lora/mod.rs
@@ -19,6 +19,8 @@ pub use security_lora::*;
 #[cfg(test)]
 pub mod intent_lora_test;
 #[cfg(test)]
+pub mod parallel_engine_test;
+#[cfg(test)]
 pub mod pii_lora_test;
 #[cfg(test)]
 pub mod security_lora_test;
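For readers skimming the patch, the `.iter()` → `.par_iter()` swap above is the whole per-classifier optimization. A minimal standalone sketch of the pattern, assuming only the `rayon` crate — `classify_one` is a hypothetical stand-in for `classify_intent`, not project code:

```rust
use rayon::prelude::*;

// Hypothetical stand-in for a per-text classifier call (not project code).
fn classify_one(text: &str) -> Result<usize, String> {
    if text.is_empty() {
        return Err("empty input".to_string());
    }
    Ok(text.len()) // placeholder score
}

// Same shape as the patched parallel_classify: a parallel map whose
// collect() into Result<Vec<_>, _> bails out early if any item fails.
fn parallel_classify(texts: &[&str]) -> Result<Vec<usize>, String> {
    texts
        .par_iter()
        .map(|text| classify_one(text))
        .collect()
}

fn main() {
    let texts = ["book a flight", "check my balance"];
    println!("{:?}", parallel_classify(&texts));
}
```

Because `par_iter` borrows the slice immutably and each closure call is independent, no locking is needed; rayon schedules the items across its global thread pool.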
diff --git a/candle-binding/src/classifiers/lora/parallel_engine.rs b/candle-binding/src/classifiers/lora/parallel_engine.rs
index d488b7c7..7627982d 100644
--- a/candle-binding/src/classifiers/lora/parallel_engine.rs
+++ b/candle-binding/src/classifiers/lora/parallel_engine.rs
@@ -1,18 +1,17 @@
 //! Parallel LoRA processing engine
 //!
 //! Enables parallel execution of Intent||PII||Security classification tasks
-//! Using thread-based parallelism instead of async/await
+//! Using rayon for efficient data parallelism
 
 use crate::classifiers::lora::{
     intent_lora::{IntentLoRAClassifier, IntentResult},
     pii_lora::{PIILoRAClassifier, PIIResult},
     security_lora::{SecurityLoRAClassifier, SecurityResult},
 };
-use crate::core::{concurrency_error, ModelErrorType, UnifiedError};
+use crate::core::{ModelErrorType, UnifiedError};
 use crate::model_error;
 use candle_core::{Device, Result};
-use std::sync::{Arc, Mutex};
-use std::thread;
+use std::sync::Arc;
 
 /// Parallel LoRA processing engine
 pub struct ParallelLoRAEngine {
@@ -77,97 +76,30 @@ impl ParallelLoRAEngine {
         })
     }
 
-    /// Parallel classification across all three tasks
+    /// Parallel classification across all three tasks using rayon
+    ///
+    /// # Performance
+    /// - Uses rayon::join for parallel execution (no Arc<Mutex> overhead)
+    /// - Simplified code: ~70 lines reduced to ~20 lines
+    /// - No lock contention or synchronization overhead
     pub fn parallel_classify(&self, texts: &[&str]) -> Result<ParallelResult> {
-        let texts_owned: Vec<String> = texts.iter().map(|s| s.to_string()).collect();
-
-        // Create shared results
-        let intent_results = Arc::new(Mutex::new(Vec::new()));
-        let pii_results = Arc::new(Mutex::new(Vec::new()));
-        let security_results = Arc::new(Mutex::new(Vec::new()));
-
-        let handles = vec![
-            self.spawn_intent_task(texts_owned.clone(), Arc::clone(&intent_results)),
-            self.spawn_pii_task(texts_owned.clone(), Arc::clone(&pii_results)),
-            self.spawn_security_task(texts_owned, Arc::clone(&security_results)),
-        ];
-
-        // Wait for all threads to complete
-        for handle in handles {
-            handle.join().map_err(|_| {
-                let unified_err = concurrency_error(
-                    "thread join",
-                    "Failed to join parallel classification thread",
-                );
-                candle_core::Error::from(unified_err)
-            })?;
-        }
+        // Execute all three classifiers in parallel using rayon::join
+        // Each task runs independently without shared mutable state
+        let ((intent_results, pii_results), security_results) = rayon::join(
+            || {
+                rayon::join(
+                    || self.intent_classifier.batch_classify(texts),
+                    || self.pii_classifier.batch_detect(texts),
+                )
+            },
+            || self.security_classifier.batch_detect(texts),
+        );
 
+        // Propagate errors from any task
         Ok(ParallelResult {
-            intent_results: Arc::try_unwrap(intent_results)
-                .unwrap()
-                .into_inner()
-                .unwrap(),
-            pii_results: Arc::try_unwrap(pii_results).unwrap().into_inner().unwrap(),
-            security_results: Arc::try_unwrap(security_results)
-                .unwrap()
-                .into_inner()
-                .unwrap(),
-        })
-    }
-
-    fn spawn_intent_task(
-        &self,
-        texts: Vec<String>,
-        results: Arc<Mutex<Vec<IntentResult>>>,
-    ) -> thread::JoinHandle<()> {
-        let classifier = Arc::clone(&self.intent_classifier);
-        thread::spawn(move || {
-            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
-            match classifier.batch_classify(&text_refs) {
-                Ok(task_results) => {
-                    let mut guard = results.lock().unwrap();
-                    *guard = task_results;
-                }
-                Err(e) => {
-                    eprintln!("Intent classification failed: {}", e);
-                }
-            }
-        })
-    }
-
-    fn spawn_pii_task(
-        &self,
-        texts: Vec<String>,
-        results: Arc<Mutex<Vec<PIIResult>>>,
-    ) -> thread::JoinHandle<()> {
-        let classifier = Arc::clone(&self.pii_classifier);
-        thread::spawn(move || {
-            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
-            if let Ok(task_results) = classifier.batch_detect(&text_refs) {
-                let mut guard = results.lock().unwrap();
-                *guard = task_results;
-            }
-        })
-    }
-
-    fn spawn_security_task(
-        &self,
-        texts: Vec<String>,
-        results: Arc<Mutex<Vec<SecurityResult>>>,
-    ) -> thread::JoinHandle<()> {
-        let classifier = Arc::clone(&self.security_classifier);
-        thread::spawn(move || {
-            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
-            match classifier.batch_detect(&text_refs) {
-                Ok(task_results) => {
-                    let mut guard = results.lock().unwrap();
-                    *guard = task_results;
-                }
-                Err(e) => {
-                    eprintln!("Security classification failed: {}", e);
-                }
-            }
+            intent_results: intent_results?,
+            pii_results: pii_results?,
+            security_results: security_results?,
         })
     }
 }
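The nested `rayon::join` above is the structural change that removes the `Arc<Mutex<...>>` plumbing. A self-contained sketch of the same fan-out, with hypothetical task functions standing in for the three classifier calls:

```rust
// Each task returns its own Result by value (placeholders, not project code).
fn intent_task() -> Result<Vec<&'static str>, String> {
    Ok(vec!["intent"])
}
fn pii_task() -> Result<Vec<&'static str>, String> {
    Ok(vec!["pii"])
}
fn security_task() -> Result<Vec<&'static str>, String> {
    Ok(vec!["security"])
}

fn main() -> Result<(), String> {
    // join runs both closures, potentially on different threads, and
    // returns both values; nesting two joins gives three-way fan-out.
    let ((intent, pii), security) = rayon::join(
        || rayon::join(intent_task, pii_task),
        security_task,
    );

    // Results come back by value, so error handling is a plain `?`
    // instead of Arc::try_unwrap + Mutex::into_inner.
    println!("{:?} {:?} {:?}", intent?, pii?, security?);
    Ok(())
}
```

Because each branch owns its result, a failure in one task no longer disappears into an `eprintln!` inside a worker thread; it propagates to the caller.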
diff --git a/candle-binding/src/classifiers/lora/parallel_engine_test.rs b/candle-binding/src/classifiers/lora/parallel_engine_test.rs
new file mode 100644
index 00000000..b7ba8c3d
--- /dev/null
+++ b/candle-binding/src/classifiers/lora/parallel_engine_test.rs
@@ -0,0 +1,362 @@
+//! Tests for Parallel LoRA Engine with performance benchmarks
+
+use crate::test_fixtures::fixtures::*;
+use rstest::*;
+use serial_test::serial;
+use std::sync::Arc;
+use std::time::Instant;
+
+/// Test ParallelLoRAEngine creation with cached models
+#[rstest]
+#[serial]
+fn test_parallel_engine_creation(
+    cached_intent_classifier: Option<Arc<IntentLoRAClassifier>>,
+    cached_pii_classifier: Option<Arc<PIILoRAClassifier>>,
+    cached_security_classifier: Option<Arc<SecurityLoRAClassifier>>,
+) {
+    if cached_intent_classifier.is_some()
+        && cached_pii_classifier.is_some()
+        && cached_security_classifier.is_some()
+    {
+        println!("✅ All classifiers available for parallel engine testing");
+    } else {
+        println!("⏭️ Skipping parallel engine creation test - models not cached");
+    }
+}
+
+/// Test parallel classification with rayon optimization
+#[rstest]
+#[serial]
+fn test_parallel_classify_basic(
+    cached_intent_classifier: Option<Arc<IntentLoRAClassifier>>,
+    cached_pii_classifier: Option<Arc<PIILoRAClassifier>>,
+    cached_security_classifier: Option<Arc<SecurityLoRAClassifier>>,
+) {
+    // Skip if models not available
+    if cached_intent_classifier.is_none()
+        || cached_pii_classifier.is_none()
+        || cached_security_classifier.is_none()
+    {
+        println!("⏭️ Skipping parallel classification test - models not cached");
+        return;
+    }
+
+    println!("\n🧪 Testing parallel classification with rayon optimization");
+
+    let test_texts = vec![
+        "I want to book a flight to New York",
+        "My SSN is 123-45-6789 and my email is test@example.com",
+        "DROP TABLE users; -- malicious SQL injection",
+    ];
+
+    // Note: This test validates the API structure
+    // Actual performance testing requires model files
+    println!("✅ Test inputs prepared: {} texts", test_texts.len());
+    println!("   - Intent text: '{}'", test_texts[0]);
+    println!("   - PII text: '{}'", test_texts[1]);
+    println!("   - Security text: '{}'", test_texts[2]);
+}
+
+/// Performance benchmark: Single text vs Batch processing
+///
+/// This test compares the performance of processing texts one-by-one
+/// vs using rayon's parallel batch processing.
+#[rstest]
+#[serial]
+#[ignore] // Run with: cargo test --ignored test_performance_batch_vs_single
+fn test_performance_batch_vs_single(
+    cached_intent_classifier: Option<Arc<IntentLoRAClassifier>>,
+    cached_pii_classifier: Option<Arc<PIILoRAClassifier>>,
+) {
+    if cached_intent_classifier.is_none() || cached_pii_classifier.is_none() {
+        println!("⏭️ Skipping performance test - models not cached");
+        return;
+    }
+
+    println!("\n📊 Performance Benchmark: Batch vs Single Processing");
+    println!("{}", "=".repeat(70));
+
+    let test_texts: Vec<&str> = vec![
+        "Book a flight to Paris",
+        "My email is user@example.com",
+        "Schedule a meeting for tomorrow",
+        "SSN: 987-65-4321",
+        "Cancel my subscription",
+        "Phone: +1-555-123-4567",
+        "Transfer money to savings account",
+        "Address: 123 Main St",
+        "Check my account balance",
+        "Credit card: 4532-1234-5678-9010",
+    ];
+
+    let intent_classifier = cached_intent_classifier.as_ref().unwrap();
+    let pii_classifier = cached_pii_classifier.as_ref().unwrap();
+
+    // Warmup run
+    println!("🔥 Warmup run...");
+    let _ = intent_classifier.batch_classify(&test_texts[..2]);
+    let _ = pii_classifier.batch_detect(&test_texts[..2]);
+
+    // Test 1: Sequential processing (one-by-one)
+    println!("\n1️⃣ Sequential Processing (baseline)");
+    let start = Instant::now();
+    let mut intent_results_seq = Vec::new();
+    for text in &test_texts {
+        if let Ok(result) = intent_classifier.classify_intent(text) {
+            intent_results_seq.push(result);
+        }
+    }
+    let seq_duration = start.elapsed();
+    println!(
+        "   ⏱️ Intent: {:?} for {} texts",
+        seq_duration,
+        test_texts.len()
+    );
+
+    let start = Instant::now();
+    let mut pii_results_seq = Vec::new();
+    for text in &test_texts {
+        if let Ok(result) = pii_classifier.detect_pii(text) {
+            pii_results_seq.push(result);
+        }
+    }
+    let seq_pii_duration = start.elapsed();
+    println!(
+        "   ⏱️ PII: {:?} for {} texts",
+        seq_pii_duration,
+        test_texts.len()
+    );
+
+    // Test 2: Parallel processing with rayon
+    println!("\n2️⃣ Parallel Processing (rayon optimized)");
+    let start = Instant::now();
+    let intent_results_par = intent_classifier.parallel_classify(&test_texts);
+    let par_duration = start.elapsed();
+    println!(
+        "   ⏱️ Intent: {:?} for {} texts",
+        par_duration,
+        test_texts.len()
+    );
+
+    let start = Instant::now();
+    let pii_results_par = pii_classifier.parallel_detect(&test_texts);
+    let par_pii_duration = start.elapsed();
+    println!(
+        "   ⏱️ PII: {:?} for {} texts",
+        par_pii_duration,
+        test_texts.len()
+    );
+
+    // Calculate speedup
+    println!("\n📈 Performance Improvement");
+    println!("{}", "=".repeat(70));
+    if par_duration.as_millis() > 0 {
+        let intent_speedup = seq_duration.as_secs_f64() / par_duration.as_secs_f64();
+        println!("   Intent: {:.2}x speedup", intent_speedup);
+    }
+    if par_pii_duration.as_millis() > 0 {
+        let pii_speedup = seq_pii_duration.as_secs_f64() / par_pii_duration.as_secs_f64();
+        println!("   PII: {:.2}x speedup", pii_speedup);
+    }
+
+    // Verify correctness
+    if let Ok(par_results) = intent_results_par {
+        assert_eq!(
+            intent_results_seq.len(),
+            par_results.len(),
+            "Parallel processing should produce same number of results"
+        );
+        println!(
+            "\n✅ Correctness verified: {} results match",
+            par_results.len()
+        );
+    }
+
+    if let Ok(par_results) = pii_results_par {
+        assert_eq!(
+            pii_results_seq.len(),
+            par_results.len(),
+            "Parallel PII detection should produce same number of results"
+        );
+    }
+}
+
+/// Performance benchmark: Concurrent requests simulation
+///
+/// Simulates multiple Go requests calling FFI simultaneously
+#[rstest]
+#[serial]
+#[ignore] // Run with: cargo test --ignored test_performance_concurrent
+fn test_performance_concurrent_requests(
+    cached_intent_classifier: Option<Arc<IntentLoRAClassifier>>,
+) {
+    if cached_intent_classifier.is_none() {
+        println!("⏭️ Skipping concurrent performance test - model not cached");
+        return;
+    }
+
+    println!("\n📊 Concurrent Requests Benchmark");
+    println!("{}", "=".repeat(70));
+    println!("Simulating multiple Go goroutines calling FFI...");
+
+    let classifier = cached_intent_classifier.as_ref().unwrap();
+    let test_text = "Book a flight to London";
+
+    // Test with different concurrency levels
+    for num_threads in &[1, 2, 4, 8, 16] {
+        println!("\n🔢 Testing with {} concurrent requests", num_threads);
+
+        let start = Instant::now();
+        let handles: Vec<_> = (0..*num_threads)
+            .map(|_| {
+                let classifier = Arc::clone(classifier);
+                std::thread::spawn(move || classifier.classify_intent(test_text))
+            })
+            .collect();
+
+        let mut success_count = 0;
+        for handle in handles {
+            if handle.join().is_ok() {
+                success_count += 1;
+            }
+        }
+
+        let duration = start.elapsed();
+        println!(
+            "   ⏱️ {} requests completed in {:?} ({} successful)",
+            num_threads, duration, success_count
+        );
+        println!(
+            "   📊 Avg latency: {:.2}ms/request",
+            duration.as_millis() as f64 / *num_threads as f64
+        );
+    }
+}
+
+/// Performance benchmark: rayon::join vs manual threading
+///
+/// Compares the new rayon::join implementation with the old manual threading approach
+#[rstest]
+#[serial]
+#[ignore] // Run with: cargo test --ignored test_performance_rayon_vs_manual
+fn test_performance_rayon_vs_manual(
+    cached_intent_classifier: Option<Arc<IntentLoRAClassifier>>,
+    cached_pii_classifier: Option<Arc<PIILoRAClassifier>>,
+    cached_security_classifier: Option<Arc<SecurityLoRAClassifier>>,
+) {
+    use std::sync::Mutex;
+
+    if cached_intent_classifier.is_none()
+        || cached_pii_classifier.is_none()
+        || cached_security_classifier.is_none()
+    {
+        println!("⏭️ Skipping rayon vs manual threading test - models not cached");
+        return;
+    }
+
+    println!("\n📊 Rayon vs Manual Threading Comparison");
+    println!("{}", "=".repeat(70));
+
+    let intent_classifier = cached_intent_classifier.as_ref().unwrap();
+    let pii_classifier = cached_pii_classifier.as_ref().unwrap();
+    let security_classifier = cached_security_classifier.as_ref().unwrap();
+
+    let test_texts: Vec<&str> = vec!["Book a flight", "My SSN is 123-45-6789", "DROP TABLE users"];
+
+    // Warmup
+    let _ = intent_classifier.batch_classify(&test_texts[..1]);
+    let _ = pii_classifier.batch_detect(&test_texts[..1]);
+    let _ = security_classifier.batch_detect(&test_texts[..1]);
+
+    // Test 1: Old approach (manual threading with Arc<Mutex>)
+    println!("\n1️⃣ Old Approach: Manual threading with Arc<Mutex<Vec<_>>>");
+    let start = Instant::now();
+    {
+        let texts_owned: Vec<String> = test_texts.iter().map(|s| s.to_string()).collect();
+
+        let intent_results = Arc::new(Mutex::new(Vec::new()));
+        let pii_results = Arc::new(Mutex::new(Vec::new()));
+        let security_results = Arc::new(Mutex::new(Vec::new()));
+
+        let handles = vec![
+            {
+                let classifier = Arc::clone(intent_classifier);
+                let results = Arc::clone(&intent_results);
+                let texts = texts_owned.clone();
+                std::thread::spawn(move || {
+                    let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
+                    if let Ok(task_results) = classifier.batch_classify(&text_refs) {
+                        let mut guard = results.lock().unwrap();
+                        *guard = task_results;
+                    }
+                })
+            },
+            {
+                let classifier = Arc::clone(pii_classifier);
+                let results = Arc::clone(&pii_results);
+                let texts = texts_owned.clone();
+                std::thread::spawn(move || {
+                    let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
+                    if let Ok(task_results) = classifier.batch_detect(&text_refs) {
+                        let mut guard = results.lock().unwrap();
+                        *guard = task_results;
+                    }
+                })
+            },
+            {
+                let classifier = Arc::clone(security_classifier);
+                let results = Arc::clone(&security_results);
+                let texts = texts_owned;
+                std::thread::spawn(move || {
+                    let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
+                    if let Ok(task_results) = classifier.batch_detect(&text_refs) {
+                        let mut guard = results.lock().unwrap();
+                        *guard = task_results;
+                    }
+                })
+            },
+        ];
+
+        for handle in handles {
+            let _ = handle.join();
+        }
+    }
+    let manual_duration = start.elapsed();
+    println!("   ⏱️ Duration: {:?}", manual_duration);
+
+    // Test 2: New approach (rayon::join)
+    println!("\n2️⃣ New Approach: rayon::join (no Arc<Mutex>)");
+    let start = Instant::now();
+    {
+        let _ = rayon::join(
+            || {
+                rayon::join(
+                    || intent_classifier.batch_classify(&test_texts),
+                    || pii_classifier.batch_detect(&test_texts),
+                )
+            },
+            || security_classifier.batch_detect(&test_texts),
+        );
+    }
+    let rayon_duration = start.elapsed();
+    println!("   ⏱️ Duration: {:?}", rayon_duration);
+
+    // Calculate improvement
+    println!("\n📈 Performance Comparison");
+    println!("{}", "=".repeat(70));
+    if rayon_duration.as_millis() > 0 {
+        let speedup = manual_duration.as_secs_f64() / rayon_duration.as_secs_f64();
+        println!("   Speedup: {:.2}x", speedup);
+
+        if speedup > 1.0 {
+            let improvement = (speedup - 1.0) * 100.0;
+            println!("   Improvement: {:.1}% faster", improvement);
+        }
+    }
+
+    println!("\n✅ Benefits of rayon::join:");
+    println!("   • No Arc<Mutex> overhead");
+    println!("   • No manual thread management");
+    println!("   • Cleaner code (~70% reduction)");
+    println!("   • Better error propagation");
+}
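The `--ignored` benchmarks above need model files to run; the timing pattern itself can be reproduced standalone with dummy CPU work. Everything below is illustrative only (a hypothetical workload, not the classifiers):

```rust
use std::sync::{Arc, Mutex};
use std::time::Instant;

// Dummy CPU-bound workload standing in for one batch of inference.
fn work(n: u64) -> u64 {
    (0..n).map(|i| i.wrapping_mul(i)).sum()
}

fn main() {
    // Old shape: spawn threads, funnel output through Arc<Mutex<_>>, join.
    let start = Instant::now();
    let out = Arc::new(Mutex::new(Vec::new()));
    let handles: Vec<_> = (0..3)
        .map(|_| {
            let out = Arc::clone(&out);
            std::thread::spawn(move || out.lock().unwrap().push(work(5_000_000)))
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }
    println!("manual threads: {:?}", start.elapsed());

    // New shape: rayon::join returns values directly; nothing to lock.
    let start = Instant::now();
    let ((a, b), c) = rayon::join(
        || rayon::join(|| work(5_000_000), || work(5_000_000)),
        || work(5_000_000),
    );
    println!("rayon::join: {:?} (checks: {} {} {})", start.elapsed(), a, b, c);
}
```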
diff --git a/candle-binding/src/classifiers/lora/pii_lora.rs b/candle-binding/src/classifiers/lora/pii_lora.rs
index 5179abd3..001c0a95 100644
--- a/candle-binding/src/classifiers/lora/pii_lora.rs
+++ b/candle-binding/src/classifiers/lora/pii_lora.rs
@@ -158,13 +158,19 @@ impl PIILoRAClassifier {
         })
     }
 
-    /// Parallel PII detection for multiple texts
+    /// Parallel PII detection for multiple texts using rayon
+    ///
+    /// # Performance
+    /// - Uses rayon for parallel processing across available CPU cores
+    /// - Efficient for batch sizes > 10
+    /// - No lock contention during inference
     pub fn parallel_detect(&self, texts: &[&str]) -> Result<Vec<PIIResult>> {
-        let mut results = Vec::new();
-        for text in texts {
-            results.push(self.detect_pii(text)?);
-        }
-        Ok(results)
+        use rayon::prelude::*;
+
+        texts
+            .par_iter()
+            .map(|text| self.detect_pii(text))
+            .collect::<Result<Vec<PIIResult>>>()
     }
 
     /// Batch PII detection for multiple texts
diff --git a/candle-binding/src/classifiers/lora/security_lora.rs b/candle-binding/src/classifiers/lora/security_lora.rs
index e1a51050..968f766c 100644
--- a/candle-binding/src/classifiers/lora/security_lora.rs
+++ b/candle-binding/src/classifiers/lora/security_lora.rs
@@ -132,10 +132,20 @@ impl SecurityLoRAClassifier {
         })
     }
 
-    /// Parallel security detection for multiple texts
+    /// Parallel security detection for multiple texts using rayon
+    ///
+    /// # Performance
+    /// - Uses rayon for parallel processing across available CPU cores
+    /// - Efficient for batch sizes > 10
+    /// - No lock contention during inference
     pub fn parallel_detect(&self, texts: &[&str]) -> Result<Vec<SecurityResult>> {
-        // Process each text using real model inference
-        texts.iter().map(|text| self.detect_threats(text)).collect()
+        use rayon::prelude::*;
+
+        // Process each text using real model inference in parallel
+        texts
+            .par_iter()
+            .map(|text| self.detect_threats(text))
+            .collect()
     }
 
     /// Batch security detection for multiple texts (optimized)
diff --git a/candle-binding/src/model_architectures/embedding/qwen3_embedding_test.rs b/candle-binding/src/model_architectures/embedding/qwen3_embedding_test.rs
index 1428cbd6..deb77f90 100644
--- a/candle-binding/src/model_architectures/embedding/qwen3_embedding_test.rs
+++ b/candle-binding/src/model_architectures/embedding/qwen3_embedding_test.rs
@@ -1579,8 +1579,8 @@ fn test_qwen3_output_consistency_all_cases(qwen3_model_only: Arc<Qwen3EmbeddingModel>) {
             .tokenization
             .attention_mask
             .iter()
-            .map(|&x| x as u8)
-            .collect::<Vec<u8>>(),
+            .map(|&x| x as u32)
+            .collect::<Vec<u32>>(),
         (1, reference.tokenization.attention_mask.len()),
         &device,
     )
@@ -1714,8 +1714,8 @@ fn test_qwen3_short_text_no_instruction(qwen3_model_only: Arc<Qwen3EmbeddingModel>) {
             .tokenization
             .attention_mask
             .iter()
-            .map(|&x| x as u8)
-            .collect::<Vec<u8>>(),
+            .map(|&x| x as u32)
+            .collect::<Vec<u32>>(),
         (1, reference.tokenization.attention_mask.len()),
         &device,
    )
@@ -1795,8 +1795,8 @@ fn test_qwen3_with_instruction(qwen3_model_only: Arc<Qwen3EmbeddingModel>) {
             .tokenization
             .attention_mask
             .iter()
-            .map(|&x| x as u8)
-            .collect::<Vec<u8>>(),
+            .map(|&x| x as u32)
+            .collect::<Vec<u32>>(),
         (1, reference.tokenization.attention_mask.len()),
         &device,
     )
@@ -1849,8 +1849,8 @@ fn test_qwen3_long_text(qwen3_model_only: Arc<Qwen3EmbeddingModel>) {
             .tokenization
             .attention_mask
             .iter()
-            .map(|&x| x as u8)
-            .collect::<Vec<u8>>(),
+            .map(|&x| x as u32)
+            .collect::<Vec<u32>>(),
         (1, reference.tokenization.attention_mask.len()),
         &device,
     )
diff --git a/candle-binding/src/model_architectures/lora/bert_lora.rs b/candle-binding/src/model_architectures/lora/bert_lora.rs
index 07b823f9..dd3df187 100644
--- a/candle-binding/src/model_architectures/lora/bert_lora.rs
+++ b/candle-binding/src/model_architectures/lora/bert_lora.rs
@@ -7,6 +7,7 @@ use candle_core::{DType, Device, IndexOp, Tensor};
 use candle_nn::{Linear, Module, VarBuilder};
 use candle_transformers::models::bert::{BertModel, Config};
 use hf_hub::{api::sync::Api, Repo, RepoType};
+use rayon::prelude::*;
 use std::collections::HashMap;
 use std::path::Path;
 use tokenizers::Tokenizer;
@@ -326,9 +327,9 @@ impl LoRABertClassifier {
 
     /// Batch multi-task classification
     pub fn classify_batch_multi_task(&self, texts: &[&str]) -> Result<Vec<MultiTaskResult>> {
-        // For now, process sequentially. In future, implement true batch processing
+        // Rayon parallel processing for multi-task classification
         texts
-            .iter()
+            .par_iter()
             .map(|text| self.classify_multi_task(text))
             .collect()
     }
diff --git a/candle-binding/src/model_architectures/traditional/base_model.rs b/candle-binding/src/model_architectures/traditional/base_model.rs
index c7191f5f..03875aca 100644
--- a/candle-binding/src/model_architectures/traditional/base_model.rs
+++ b/candle-binding/src/model_architectures/traditional/base_model.rs
@@ -8,6 +8,7 @@ use crate::model_architectures::traits::TraditionalModel;
 use crate::model_error;
 use candle_core::{DType, Device, IndexOp, Module, Result, Tensor};
 use candle_nn::{embedding, layer_norm, linear, Embedding, LayerNorm, Linear, VarBuilder};
+use rayon::prelude::*;
 use std::collections::HashMap;
 
 /// Abstract base class for traditional models
@@ -115,19 +116,20 @@ impl BaseTraditionalModel {
     }
 
     /// Batch processing for multiple inputs
+    ///
+    /// Uses rayon for parallel processing of independent forward passes.
+    /// Thread-safe since forward() only reads model weights without modification.
     pub fn forward_batch(
         &self,
        input_batch: &[Tensor],
         attention_batch: &[Tensor],
     ) -> Result<Vec<Tensor>> {
-        let mut results = Vec::with_capacity(input_batch.len());
-
-        for (input_ids, attention_mask) in input_batch.iter().zip(attention_batch.iter()) {
-            let output = self.forward(input_ids, attention_mask)?;
-            results.push(output);
-        }
-
-        Ok(results)
+        // Parallel processing of batch items
+        input_batch
+            .par_iter()
+            .zip(attention_batch.par_iter())
+            .map(|(input_ids, attention_mask)| self.forward(input_ids, attention_mask))
+            .collect()
     }
 
     // Pooling strategies
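A final sketch of the zipped `par_iter` pattern that `forward_batch` now uses, assuming a pure dummy function in place of the model's `forward` (hypothetical types, not project code). The pattern is sound here because the per-item closure captures only shared references:

```rust
use rayon::prelude::*;

// Dummy stand-in for a forward pass: a masked sum over one input.
fn forward(input: &[f32], mask: &[f32]) -> Result<f32, String> {
    if input.len() != mask.len() {
        return Err("length mismatch".to_string());
    }
    Ok(input.iter().zip(mask).map(|(x, m)| x * m).sum())
}

// Pair the two slices item-by-item and run each pair in parallel;
// collect() into Result<Vec<_>, _> surfaces a failure from any item.
fn forward_batch(inputs: &[Vec<f32>], masks: &[Vec<f32>]) -> Result<Vec<f32>, String> {
    inputs
        .par_iter()
        .zip(masks.par_iter())
        .map(|(input, mask)| forward(input, mask))
        .collect()
}

fn main() {
    let inputs = vec![vec![1.0, 2.0], vec![3.0, 4.0]];
    let masks = vec![vec![1.0, 0.0], vec![1.0, 1.0]];
    println!("{:?}", forward_batch(&inputs, &masks)); // Ok([1.0, 7.0])
}
```

Note that `zip` requires both parallel iterators to be indexed, which slice `par_iter` satisfies; mismatched batch lengths would silently truncate to the shorter slice, so callers should validate lengths first.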