grandinetech
diff --git a/.DS_Store b/.DS_Store
10 KB
diff --git a/kzg/src/msm/strauss.rs b/kzg/src/msm/strauss.rs
Lines changed: 94 additions & 58 deletions
@@ -27,15 +27,15 @@ where
     TG1Affine: G1Affine<TG1, TG1Fp>,
     TG1ProjAddAffine: G1ProjAddAffine<TG1, TG1Fp, TG1Affine>,
 {
-    /// precomputed per-chunk tables (each chunk table is a Vec of projective points
-    /// holding sums for each mask in 0..(1<<chunk_len)). Storing projective
-    /// points avoids repeated affine->projective conversions during the inner loop.
-    chunk_tables: Vec<Vec<TG1>>,
-
+    /// Precomputed per-chunk tables, stored as affine points.
+    /// Affine needs 2 field elements per point vs 3 for projective (~33% less memory),
+    /// and mixed (projective + affine) addition is cheaper than projective + projective.
+    chunk_tables: Vec<Vec<TG1Affine>>, // Projective -> Affine change
     numpoints: usize,
 
     batch_numpoints: usize,
-    batch_points: Vec<Vec<TG1>>,
+    batch_points: Vec<Vec<TG1Affine>>, // Projective -> Affine change
+    batch_chunk_tables: Vec<Vec<Vec<TG1Affine>>>, // precomputed chunk tables, one set per batch row
 
     g1_marker: PhantomData<TG1>,
     g1_fp_marker: PhantomData<TG1Fp>,
@@ -76,26 +76,35 @@ where
                         let end = core::cmp::min(start + chunk_size, n);
                         let chunk_len = end - start;
                         let table_size = (1usize << chunk_len) - 1;
-
-                        let chunk: Vec<TG1> = cache.table[offset..offset + table_size]
-                            .iter()
-                            .map(|affine| affine.to_proj())
-                            .collect();
+                        
+                        // Store directly as affine
+                        let chunk: Vec<TG1Affine> = cache.table[offset..offset + table_size]
+                            .to_vec();
                         chunk_tables.push(chunk);
                         offset += table_size;
                     }
 
-                    let batch_points = cache
-                        .batch_table
+                    // Store batch points as affine too
+                    let batch_points = cache.batch_table.clone();
+
+                    // Rebuild batch_chunk_tables from batch_points
+                    let batch_chunk_tables: Vec<Vec<Vec<TG1Affine>>> = batch_points
                         .iter()
-                        .map(|row| row.iter().map(|affine| affine.to_proj()).collect())
-                        .collect::<Vec<_>>();
+                        .map(|point_row| {
+                            let proj_row: Vec<TG1> = point_row
+                                .iter()
+                                .map(|affine| affine.to_proj())
+                                .collect();
+                            Self::build_chunk_tables(&proj_row, chunk_size)
+                        })
+                        .collect();
 
                     Self {
                         chunk_tables,
                         numpoints: cache.numpoints,
                         batch_numpoints: cache.batch_numpoints,
                         batch_points,
+                        batch_chunk_tables,
                         g1_marker: PhantomData,
                         g1_fp_marker: PhantomData,
                         fr_marker: PhantomData,
@@ -112,30 +121,23 @@ where
     fn try_write_cache(
         points: &[TG1],
         matrix: &[Vec<TG1>],
-        chunk_tables: &[Vec<TG1>],
+        chunk_tables: &[Vec<TG1Affine>], // now takes affine
         numpoints: usize,
-        batch_points: &[Vec<TG1>],
+        batch_points: &[Vec<TG1Affine>], // now takes affine
         batch_numpoints: usize,
         contenthash: Option<[u8; 32]>,
     ) -> Result<(), String> {
         #[cfg(feature = "diskcache")]
         {
-            // Flatten chunk_tables to a single vector and convert to affine
+            // Flatten chunk_tables into a single vector (entries are already affine)
             let table_affine: Vec<TG1Affine> = chunk_tables
                 .iter()
                 .flat_map(|chunk| chunk.iter())
-                .map(|proj| TG1Affine::into_affine(proj))
+                .cloned()
                 .collect();
 
-            // Convert batch_points to affine
-            let batch_table_affine: Vec<Vec<TG1Affine>> = batch_points
-                .iter()
-                .map(|row| {
-                    row.iter()
-                        .map(|proj| TG1Affine::into_affine(proj))
-                        .collect()
-                })
-                .collect();
+            // Batch points are already affine, just clone
+            let batch_table_affine = batch_points.to_vec();
 
             DiskCache::<TG1, TG1Fp, TG1Affine>::save(
                 "strauss",
@@ -174,6 +176,7 @@ where
                     numpoints: 0,
                     batch_numpoints: 0,
                     batch_points: Vec::new(),
+                    batch_chunk_tables: Vec::new(),
                     g1_marker: PhantomData,
                     g1_fp_marker: PhantomData,
                     fr_marker: PhantomData,
@@ -183,7 +186,7 @@ where
                 return Ok(Some(table));
             }
 
-            // Build chunk tables directly from projective points
+            // Build chunk tables as affine
             let chunk_tables = Self::build_chunk_tables(points, strauss_chunk_size);
 
             Self::try_write_cache(points, matrix, &chunk_tables, n, &[], 0, contenthash)?;
@@ -193,6 +196,7 @@ where
                 numpoints: n,
                 batch_numpoints: 0,
                 batch_points: Vec::new(),
+                batch_chunk_tables: Vec::new(),
                 g1_marker: PhantomData,
                 g1_fp_marker: PhantomData,
                 fr_marker: PhantomData,
@@ -202,7 +206,7 @@ where
             return Ok(Some(table));
         }
 
-        // Batch case: store projective points directly
+        // Batch case: convert projective input to affine for storage
         let batch_numpoints = matrix[0].len();
         let mut batch_points = Vec::new();
 
@@ -211,9 +215,29 @@ where
             .map_err(|_| "Strauss precomputation table is too large".to_owned())?;
 
         for row in matrix {
-            batch_points.push(row.to_vec());
+            // Convert projective to affine for storage
+            let affine_row: Vec<TG1Affine> = row
+                .iter()
+                .map(|proj| TG1Affine::into_affine(proj))
+                .collect();
+            batch_points.push(affine_row);
         }
 
+        // Precompute the per-row chunk tables once, at table-creation time
+        let batch_chunk_tables: Vec<Vec<Vec<TG1Affine>>> = batch_points
+            .iter()
+            .map(|point_row| {
+                // Convert affine back to projective for table building
+                let proj_row: Vec<TG1> = point_row
+                    .iter()
+                    .map(|affine| affine.to_proj())
+                    .collect();
+                // Build chunk tables for this row
+                Self::build_chunk_tables(&proj_row, strauss_chunk_size)
+            })
+            .collect();
+    
+
         // We still need to build the single-point-set table for the main points
         // (though it may not be used for batch operations)
         let n = points.len();
@@ -238,6 +262,7 @@ where
             numpoints: n,
             batch_numpoints,
             batch_points,
+            batch_chunk_tables,
             g1_marker: PhantomData,
             g1_fp_marker: PhantomData,
             fr_marker: PhantomData,
@@ -247,10 +272,10 @@ where
         Ok(Some(table))
     }
 
-    /// Helper to build chunk tables from projective points
-    fn build_chunk_tables(points: &[TG1], chunk_size: usize) -> Vec<Vec<TG1>> {
+    /// Build chunk tables - returns AFFINE for storage efficiency
+    fn build_chunk_tables(points: &[TG1], chunk_size: usize) -> Vec<Vec<TG1Affine>> {
         let n = points.len();
-        let mut chunk_tables: Vec<Vec<TG1>> = Vec::new();
+        let mut chunk_tables: Vec<Vec<TG1Affine>> = Vec::new();
 
         let num_chunks = n.div_ceil(chunk_size);
 
@@ -263,6 +288,7 @@ where
             let table_size = (1usize << chunk_len) - 1;
 
             // Build incremental table in projective space using the lowest-bit trick.
+            // The table is accumulated in projective form and converted to affine only once, below.
             let mut table_proj: Vec<TG1> = Vec::with_capacity(table_size);
 
             for mask in 1..=table_size {
@@ -277,7 +303,14 @@ where
                 }
             }
 
-            chunk_tables.push(table_proj);
+            // Convert to affine once, after the whole table is built, for storage
+            let table_affine: Vec<TG1Affine> = table_proj
+                .iter()
+                .map(|proj| TG1Affine::into_affine(proj))
+                .collect();
+
+
+            chunk_tables.push(table_affine);
         }
 
         chunk_tables
@@ -289,7 +322,7 @@ where
     }
 
     /// Core multiplication logic using provided tables
-    fn multiply_with_tables(scalars: &[TFr], chunk_tables: &[Vec<TG1>]) -> TG1 {
+    fn multiply_with_tables(scalars: &[TFr], chunk_tables: &[Vec<TG1Affine>]) -> TG1 {
         let n = scalars.len();
         if n == 0 || chunk_tables.is_empty() {
             return TG1::zero();
@@ -324,32 +357,35 @@ where
             let mut pt_idx = 0usize;
             for table in chunk_tables.iter() {
                 let table_size = table.len();
-                if table_size == 0 {
-                    continue;
-                }
                 // Derive chunk_len from table size: table_size = 2^chunk_len - 1
                 // So 2^chunk_len = table_size + 1, thus log2(table_size + 1)
                 let chunk_len =
                     (usize::BITS - 1) as usize - (table_size + 1).leading_zeros() as usize;
 
-                // Build mask for this bit across chunk scalars
-                let mut mask = 0usize;
-                for i in 0..chunk_len {
+                // Only process this chunk if we have scalars for it
+                // This handles the case where tables were built for more points than we're using
+                if pt_idx >= scalar_values.len() {
+                    break;
+                }
+
+                // Build table_index for this bit across chunk scalars
+                let mut table_index = 0usize;
+                let actual_chunk_len = core::cmp::min(chunk_len, scalar_values.len() - pt_idx);
+
+                for i in 0..actual_chunk_len {
                     let scalar_idx = pt_idx + i;
-                    if scalar_idx >= scalar_values.len() {
-                        break;
-                    }
 
                     let s = &scalar_values[scalar_idx];
                     // Extract single bit at position 'bit' from scalar
                     if (get_wval_limb(s, bit, 1) & 1) != 0 {
-                        mask |= 1 << i;
+                        table_index |= 1 << i;
                     }
                 }
 
-                if mask != 0 {
-                    let tab_proj = &table[mask - 1];
-                    accumulator.add_or_dbl_assign(tab_proj);
+                if table_index != 0 {
+                    // Mixed addition - Projective + Affine (should be faster than Proj + Proj)
+                    let affine_pt = &table[table_index - 1];
+                    TG1ProjAddAffine::add_or_double_assign_affine(&mut accumulator, affine_pt);
                 }
 
                 pt_idx += chunk_len;
@@ -360,31 +396,31 @@ where
     }
 
     pub fn multiply_batch(&self, scalars: &[Vec<TFr>]) -> Vec<TG1> {
-        if self.batch_points.is_empty() {
+        if self.batch_chunk_tables.is_empty() {
             // Fall back to sequential calls using main chunk_tables
             scalars
                 .iter()
                 .map(|s| self.multiply_sequential(s))
                 .collect()
         } else {
-            // Use batch_points: build temporary tables for each row
+            // Use precomputed batch_chunk_tables
             assert!(
-                scalars.len() == self.batch_points.len(),
-                "Scalars length {} != batch_points length {}",
+                scalars.len() == self.batch_chunk_tables.len(),
+                "Scalars length {} != batch_chunk_tables length {}",
                 scalars.len(),
-                self.batch_points.len()
+                self.batch_chunk_tables.len()
             );
 
             let strauss_chunk_size: usize = get_window_size();
 
             scalars
                 .iter()
-                .zip(self.batch_points.iter())
-                .map(|(scalar_row, point_row)| {
-                    let chunk_tables = Self::build_chunk_tables(point_row, strauss_chunk_size);
-                    Self::multiply_with_tables(scalar_row, &chunk_tables)
+                .zip(self.batch_chunk_tables.iter())
+                .map(|(scalar_row, chunk_tables)| {
+                    Self::multiply_with_tables(scalar_row, chunk_tables)
                 })
                 .collect()
         }
     }
 }
+