perf: optimize range checks for small number of small field ops (#1699)

ivokub · web-flow · commit e45e0a7dda4b · 2026-02-04T10:15:00.000+01:00
diff --git a/std/math/emulated/field.go b/std/math/emulated/field.go
@@ -19,7 +19,14 @@ import (
 )
 
 const (
+	// rangeCheckBaseLengthForSmallField is the base length used for range
+	// checking when using small field optimization. We start enforcing
+	// the base length only when the number of range checks exceeds
+	// thresholdForInexactOverflow.
 	rangeCheckBaseLengthForSmallField = 16
+	// thresholdForInexactOverflow is the number of range checks after
+	// which we start enforcing the base length for small field optimization.
+	thresholdForInexactOverflow = 55000
 )
 
 // Field holds the configuration for non-native field operations. The field
@@ -51,8 +58,11 @@ type Field[T FieldParams] struct {
 
 	log zerolog.Logger
 
-	constrainedLimbs map[[16]byte]struct{}
+	// constrainedLimbs keeps track of already range checked limbs. The map
+	// value indicates the range check width.
+	constrainedLimbs map[[16]byte]int
 	checker          frontend.Rangechecker
+	nbRangeChecks    int
 
 	deferredChecks []deferredChecker
 
@@ -81,7 +91,7 @@ func NewField[T FieldParams](native frontend.API) (*Field[T], error) {
 	f := &Field[T]{
 		api:              native,
 		log:              logger.Logger(),
-		constrainedLimbs: make(map[[16]byte]struct{}),
+		constrainedLimbs: make(map[[16]byte]int),
 		checker:          rangecheck.New(native),
 		fParams:          newStaticFieldParams[T](native.Compiler().Field()),
 	}
@@ -93,15 +103,6 @@ func NewField[T FieldParams](native frontend.API) (*Field[T], error) {
 		}
 		f.extensionApi = extapi
 	}
-	if f.useSmallFieldOptimization() {
-		// in case of emulated small fields we use base length 16 to reduce
-		// needing to range check for [v_lo, v_hi, 2*v_hi].
-		//
-		// But this means that hints could output values which are bigger than
-		// the emulated modulus bitwidth (for example 31 bits). This means we
-		// have to set the overflow of returned elements correctly.
-		f.checker = rangecheck.New(native, rangecheck.WithBaseLength(rangeCheckBaseLengthForSmallField))
-	}
 
 	// ensure prime is correctly set
 	if f.fParams.IsPrime() {
@@ -265,7 +266,7 @@ func (f *Field[T]) enforceWidthConditional(a *Element[T]) (didConstrain bool) {
 				// that we should enforce width for the whole element. But we
 				// still iterate over all limbs just to mark them in the table.
 				didConstrain = true
-				f.constrainedLimbs[h] = struct{}{}
+				break
 			}
 		} else {
 			// we have no way of knowing if the limb has been constrained. To be
@@ -392,3 +393,48 @@ func (f *Field[T]) useSmallFieldOptimization() bool {
 	})
 	return f.smallFieldMode
 }
+
+// rangeCheck range checks v to ensure it fits in nbBits bits. The check is
+// skipped when v exposes a hash code and is already recorded in
+// constrainedLimbs with a width of at most nbBits. It counts the performed
+// checks and, at thresholdForInexactOverflow, swaps in a range checker with
+// base length rangeCheckBaseLengthForSmallField if small field optimization is used.
+//
+// It returns true if the range check was performed and false if it was skipped.
+func (f *Field[T]) rangeCheck(v frontend.Variable, nbBits int) bool {
+	if h, ok := v.(interface{ HashCode() [16]byte }); ok {
+		// only variables exposing a hash code can be looked up in the table
+		// of already range checked limbs; all others are always checked.
+		hc := h.HashCode()
+		if existingWidth, ok := f.constrainedLimbs[hc]; ok {
+			// already checked: an equal or tighter width makes this check redundant
+			if existingWidth <= nbBits {
+				return false
+			}
+		}
+		// record the width enforced by the check performed below
+		f.constrainedLimbs[hc] = nbBits
+	}
+	// count the range checks actually performed. The counter only decides
+	// when to switch from exact width range checks to range checks of a
+	// multiple of the base length. This reduces number of range checks when
+	// emulating small field.
+	f.nbRangeChecks++
+
+	if f.nbRangeChecks == thresholdForInexactOverflow {
+		// the threshold is reached, set the range checker to use base length.
+		// From now on, when constructing non-native elements, we should
+		// set overflow=f.smallAdditionalOverflow()
+		if f.useSmallFieldOptimization() {
+			// in case of emulated small fields we use base length 16 to reduce
+			// needing to range check for [v_lo, v_hi, 2*v_hi].
+			//
+			// But this means that hints could output values which are bigger than
+			// the emulated modulus bitwidth (for example 31 bits). This means we
+			// have to set the overflow of returned elements correctly.
+			f.checker = rangecheck.New(f.api, rangecheck.WithBaseLength(rangeCheckBaseLengthForSmallField))
+		}
+	}
+	f.checker.Check(v, nbBits)
+	return true
+}
diff --git a/std/math/emulated/field_assert.go b/std/math/emulated/field_assert.go
@@ -27,7 +27,7 @@ func (f *Field[T]) enforceWidth(a *Element[T], modWidth bool) {
 			// take only required bits from the most significant limb
 			limbNbBits = ((f.fParams.Modulus().BitLen() - 1) % int(f.fParams.BitsPerLimb())) + 1
 		}
-		f.checker.Check(a.Limbs[i], limbNbBits)
+		f.rangeCheck(a.Limbs[i], limbNbBits)
 	}
 }
 
@@ -37,7 +37,7 @@ func (f *Field[T]) smallEnforceWidth(a *Element[T], modWidth bool) {
 	}
 
 	for i := range a.Limbs {
-		f.checker.Check(a.Limbs[i], f.fParams.Modulus().BitLen()+int(a.overflow))
+		f.rangeCheck(a.Limbs[i], f.fParams.Modulus().BitLen()+int(a.overflow))
 	}
 }
 
diff --git a/std/math/emulated/field_mul.go b/std/math/emulated/field_mul.go
@@ -614,9 +614,16 @@ func mulHint(field *big.Int, inputs, outputs []*big.Int) error {
 	return nil
 }
 
-// Mul computes a*b and reduces it modulo the field order. The returned Element
-// has default number of limbs and zero overflow. If the result wouldn't fit
-// into Element, then locally reduces the inputs first. Doesn't mutate inputs.
+// Mul computes a*b. Depending on the emulated field it either reduces the result
+// modulo the field order or returns the full product.
+//
+// When emulating a large field, it uses reducing multiplication by default.
+//
+// If the field is small (fits into single limb), then it uses non-reducing
+// multiplication by default for efficiency. It only falls back to reducing
+// multiplication when the overflow of the result would be too large.
+//
+// Doesn't mutate inputs.
 //
 // For multiplying by a constant, use [Field[T].MulConst] method which is more
 // efficient.
@@ -625,6 +632,12 @@ func (f *Field[T]) Mul(a, b *Element[T]) *Element[T] {
 	if a.isStrictZero() || b.isStrictZero() {
 		return f.Zero()
 	}
+	if f.useSmallFieldOptimization() {
+		// for small fields, it is more efficient to use non-reducing multiplication by default
+		// we only fall back to reducing multiplication when modular reduction is necessary
+		// to reduce the overflow
+		return f.reduceAndOp(f.mulNoReduce, f.mulPreCondNoReduce, a, b)
+	}
 	return f.reduceAndOp(func(a, b *Element[T], u uint) *Element[T] { return f.mulMod(a, b, u, nil) }, f.mulPreCondReduced, a, b)
 }
 
diff --git a/std/math/emulated/field_smallmul.go b/std/math/emulated/field_smallmul.go
@@ -209,7 +209,7 @@ func (f *Field[T]) smallMulMod(a, b *Element[T]) *Element[T] {
 
 	// Range check the remainder (quotient is range-checked via batched sum in check)
 	modBits := f.fParams.Modulus().BitLen()
-	f.checker.Check(r, modBits+f.smallAdditionalOverflow())
+	f.rangeCheck(r, modBits+f.smallAdditionalOverflow())
 
 	// Compute the number of bits needed for the quotient.
 	// For a*b = q*p + r:
@@ -413,8 +413,17 @@ func (f *Field[T]) toSingleLimbElement(a *Element[T]) *Element[T] {
 // range checking, but define that the non-native small field element can have
 // some additional overflow bits to accommodate this difference.
 func (f *Field[T]) smallAdditionalOverflow() int {
+	// when we emulate large field, then we always construct elements with exact
+	// overflow
 	if !f.useSmallFieldOptimization() {
 		return 0
 	}
+	// when we haven't performed too many range checks, then we still use exact
+	// overflow
+	if f.nbRangeChecks < thresholdForInexactOverflow {
+		return 0
+	}
+	// otherwise, we use the additional overflow which reduces the number of
+	// decompositions during range checking
 	return (rangeCheckBaseLengthForSmallField - (f.fParams.Modulus().BitLen() % rangeCheckBaseLengthForSmallField)) % rangeCheckBaseLengthForSmallField
 }
diff --git a/std/math/emulated/smallfield_test.go b/std/math/emulated/smallfield_test.go
@@ -425,6 +425,15 @@ func BenchmarkSmallFieldMulConstraints(b *testing.B) {
 		b.Run(bc.name, func(b *testing.B) {
 			circuit := &SmallFieldMulBenchCircuit{A: make([]Element[emparams.KoalaBear], bc.nbMuls)}
 
+			for b.Loop() {
+				// without a b.Loop() here the benchmark framework calls this function
+				// several times with increasing b.N until benchtime is reached, so the
+				// circuit would be compiled repeatedly. With b.Loop() the function is
+				// executed only once per measurement, so the compile below runs once.
+				//
+				// this adds overhead as the empty `b.Loop()` will spin for the `benchtime`
+				// period, but by default it is small. Otherwise the benchmark will be very slow.
+			}
 			csr1, err := frontend.Compile(ecc.BLS12_377.ScalarField(), r1cs.NewBuilder, circuit)
 			if err != nil {
 				b.Fatal(err)
@@ -440,6 +449,7 @@ func BenchmarkSmallFieldMulConstraints(b *testing.B) {
 			constraintsSCSPerMul := float64(css.GetNbConstraints()) / float64(bc.nbMuls)
 			b.ReportMetric(constraintsSCSPerMul, "scs_constraints/mul")
 			b.ReportMetric(float64(css.GetNbConstraints()), "scs_total_constraints")
+			b.ReportMetric(0.0, "ns/op") // avoid ns/op reporting as we don't measure time here
 		})
 	}
 }
@@ -483,11 +493,11 @@ func (c *MaliciousMulCircuit) Define(api frontend.API) error {
 	}
 
 	// 5 multiplications: ((A*B) * (C*D)) * (E*F)
-	ab := f.Mul(&c.A, &c.B)
-	cd := f.Mul(&c.C, &c.D)
-	ef := f.Mul(&c.E, &c.F)
-	abcd := f.Mul(ab, cd)
-	result := f.Mul(abcd, ef)
+	ab := f.MulMod(&c.A, &c.B)
+	cd := f.MulMod(&c.C, &c.D)
+	ef := f.MulMod(&c.E, &c.F)
+	abcd := f.MulMod(ab, cd)
+	result := f.MulMod(abcd, ef)
 	f.AssertIsEqual(result, &c.Result)
 	return nil
 }
diff --git a/std/math/emulated/subtraction_padding.go b/std/math/emulated/subtraction_padding.go
@@ -119,7 +119,7 @@ func (f *Field[T]) computeSubPaddingHint(overflow uint, nbLimbs uint, modulus *E
 		// at least native_width-overflow) and should be nbBits+overflow+1 bits
 		// wide (as expected padding is one bit wider than the maximum allowed
 		// subtraction limb).
-		f.checker.Check(f.api.Sub(res[i], maxLimb), int(f.fParams.BitsPerLimb()+overflow+1))
+		f.rangeCheck(f.api.Sub(res[i], maxLimb), int(f.fParams.BitsPerLimb()+overflow+1))
 	}
 
 	// ensure that condition 1 holds

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ func (f Field[T]) enforceWidth(a Element[T], modWidth bool) {`
`27`	`27`	`// take only required bits from the most significant limb`
`28`	`28`	`limbNbBits = ((f.fParams.Modulus().BitLen() - 1) % int(f.fParams.BitsPerLimb())) + 1`
`29`	`29`	`}`
`30`		`- f.checker.Check(a.Limbs[i], limbNbBits)`
	`30`	`+ f.rangeCheck(a.Limbs[i], limbNbBits)`
`31`	`31`	`}`
`32`	`32`	`}`
`33`	`33`
`@@ -37,7 +37,7 @@ func (f Field[T]) smallEnforceWidth(a Element[T], modWidth bool) {`
`37`	`37`	`}`
`38`	`38`
`39`	`39`	`for i := range a.Limbs {`
`40`		`- f.checker.Check(a.Limbs[i], f.fParams.Modulus().BitLen()+int(a.overflow))`
	`40`	`+ f.rangeCheck(a.Limbs[i], f.fParams.Modulus().BitLen()+int(a.overflow))`
`41`	`41`	`}`
`42`	`42`	`}`
`43`	`43`
Original file line number	Diff line number	Diff line change
`@@ -119,7 +119,7 @@ func (f Field[T]) computeSubPaddingHint(overflow uint, nbLimbs uint, modulus E`
`119`	`119`	`// at least native_width-overflow) and should be nbBits+overflow+1 bits`
`120`	`120`	`// wide (as expected padding is one bit wider than the maximum allowed`
`121`	`121`	`// subtraction limb).`
`122`		`- f.checker.Check(f.api.Sub(res[i], maxLimb), int(f.fParams.BitsPerLimb()+overflow+1))`
	`122`	`+ f.rangeCheck(f.api.Sub(res[i], maxLimb), int(f.fParams.BitsPerLimb()+overflow+1))`
`123`	`123`	`}`
`124`	`124`
`125`	`125`	`// ensure that condition 1 holds`