feat internal: weighted random sampler util

oetr · oetr · commit 2c67ff924d42 · 2025-10-28T12:14:56.000+01:00
Enables easy tweaking of probabilities for individual mutation functions
in the future.
diff --git a/src/main/java/com/code_intelligence/jazzer/mutation/combinator/SamplingUtils.java b/src/main/java/com/code_intelligence/jazzer/mutation/combinator/SamplingUtils.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2025 Code Intelligence GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.code_intelligence.jazzer.mutation.combinator;
+
+import static com.code_intelligence.jazzer.mutation.support.Preconditions.require;
+import static java.util.Objects.requireNonNull;
+
+import com.code_intelligence.jazzer.mutation.api.PseudoRandom;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/** Static helpers for drawing values at random according to caller-supplied weights. */
+public final class SamplingUtils {
+
+  private SamplingUtils() {
+    // Static utility class; not meant to be instantiated.
+  }
+
+  /**
+   * Creates a sampler that picks {@code values[i]} with probability proportional to
+   * {@code weights[i]}.
+   *
+   * <p>Preprocessing builds the probability and alias tables of Vose's alias method; each call of
+   * the returned function then draws one column index and one double from the given {@link
+   * PseudoRandom}. The {@code values} array is not copied, so the sampler reads its elements at
+   * call time.
+   *
+   * <p>Invalid arguments are rejected up front via {@code requireNonNull} and {@code require}.
+   *
+   * @param values candidates to sample from; must be non-null and non-empty
+   * @param weights relative weights, one per value; each must be finite and non-negative, and at
+   *     least one must be positive
+   * @return a function that maps a {@link PseudoRandom} to one of {@code values}
+   */
+  public static <T> Function<PseudoRandom, T> weightedSampler(T[] values, double[] weights) {
+    // Use Vose's alias method for O(1) sampling after O(n) preprocessing.
+    requireNonNull(values, "Values must not be null");
+    requireNonNull(weights, "Weights must not be null");
+    require(values.length > 0, "Values must not be empty");
+    require(values.length == weights.length, "Values and weights must have the same length");
+    for (double weight : weights) {
+      // `weight >= 0` is false for NaN, so this rejects NaN, negative and infinite weights. Such
+      // weights would otherwise pass the sum check below and silently distort the distribution.
+      require(weight >= 0 && !Double.isInfinite(weight), "Weights must be finite and non-negative");
+    }
+
+    double sum = Arrays.stream(weights).sum();
+    require(sum > 0, "At least one weight must be positive");
+    // Finite weights can still overflow when added up; scaling by an infinite sum yields zeros.
+    require(!Double.isInfinite(sum), "Sum of weights must be finite");
+
+    int n = values.length;
+    int[] alias = new int[n];
+    double[] probability = new double[n];
+    // Scale the weights so that their average is 1.0: a column with scaled weight < 1.0 has room
+    // left that is filled with part of a column whose scaled weight is >= 1.0.
+    double[] scaledWeights = Arrays.stream(weights).map(w -> w * n / sum).toArray();
+    // Work lists (used as stacks) of column indices below / at-or-above the average.
+    int[] small = new int[n];
+    int[] large = new int[n];
+    int smallCount = 0;
+    int largeCount = 0;
+    for (int i = 0; i < n; i++) {
+      // Default every alias to its own column. Columns that end up with probability 1.0 never get
+      // an alias assigned below; without this they would alias to column 0 whenever the random
+      // draw is not strictly below 1.0 (closedRange presumably includes its upper bound - confirm
+      // against PseudoRandom).
+      alias[i] = i;
+      if (scaledWeights[i] < 1.0) {
+        small[smallCount++] = i;
+      } else {
+        large[largeCount++] = i;
+      }
+    }
+
+    while (smallCount > 0 && largeCount > 0) {
+      int less = small[--smallCount];
+      int more = large[--largeCount];
+
+      // Column `less` keeps its own share and hands the rest of the column to `more`.
+      probability[less] = scaledWeights[less];
+      alias[less] = more;
+      // `more` donated (1.0 - scaledWeights[less]) of its mass to fill up column `less`.
+      scaledWeights[more] = (scaledWeights[more] + scaledWeights[less]) - 1.0;
+
+      if (scaledWeights[more] < 1.0) {
+        small[smallCount++] = more;
+      } else {
+        large[largeCount++] = more;
+      }
+    }
+    // Whatever is left over on either list fills its own column completely.
+    while (largeCount > 0) {
+      probability[large[--largeCount]] = 1.0;
+    }
+
+    // Only reachable through floating-point rounding; treat the leftovers as full columns.
+    while (smallCount > 0) {
+      probability[small[--smallCount]] = 1.0;
+    }
+    return (PseudoRandom random) -> {
+      int column = random.indexIn(n);
+      return values[random.closedRange(0.0, 1.0) < probability[column] ? column : alias[column]];
+    };
+  }
+
+  /**
+   * Creates a sampler over the {@code fn} members of the given entries, weighted by their
+   * {@code weight} members.
+   *
+   * @param weightedFunctions non-null, non-empty list of weighted entries
+   * @return a function that maps a {@link PseudoRandom} to the {@code fn} of one of the entries
+   */
+  public static <T> Function<PseudoRandom, T> weightedSampler(
+      List<WeightedMutationFunction<T>> weightedFunctions) {
+    requireNonNull(weightedFunctions, "Weighted functions must not be null");
+    require(!weightedFunctions.isEmpty(), "Weighted functions must not be empty");
+
+    double[] weights = weightedFunctions.stream().mapToDouble(m -> m.weight).toArray();
+
+    // The array is really an Object[]; the cast is safe because every element comes from a
+    // WeightedMutationFunction<T> and the array is only handed to the generic overload in this
+    // class, which reads single elements from it.
+    @SuppressWarnings("unchecked")
+    T[] fns = (T[]) weightedFunctions.stream().map(m -> m.fn).toArray(Object[]::new);
+
+    return weightedSampler(fns, weights);
+  }
+
+  /**
+   * Creates a sampler over the entries that are present, ignoring empty {@link Optional}s.
+   *
+   * <p>The present entries are collected into a list and passed to {@link #weightedSampler(List)},
+   * so at least one entry has to be present.
+   *
+   * @param values optional weighted entries
+   * @return a function that maps a {@link PseudoRandom} to the {@code fn} of a present entry
+   */
+  @SafeVarargs
+  public static <T> Function<PseudoRandom, T> weightedSampler(
+      Optional<WeightedMutationFunction<T>>... values) {
+    return weightedSampler(
+        Arrays.stream(values)
+            .filter(Optional::isPresent)
+            .map(Optional::get)
+            .collect(Collectors.toList()));
+  }
+
+  /**
+   * A simple struct to hold a mutation function and its weight. It is here just for stylistic
+   * reasons, to make the definitions of weights and functions more readable.
+   */
+  public static class WeightedMutationFunction<T> {
+    /** Relative weight with which {@link #fn} is to be sampled. */
+    public final double weight;
+
+    /** The value (typically a mutation function) returned when this entry is sampled. */
+    public final T fn;
+
+    public WeightedMutationFunction(double weight, T fn) {
+      this.fn = fn;
+      this.weight = weight;
+    }
+
+    /** Creates an entry holding {@code fn} with the given {@code weight}. */
+    public static <T> WeightedMutationFunction<T> of(double weight, T fn) {
+      return new WeightedMutationFunction<>(weight, fn);
+    }
+
+    /** Like {@link #of(double, Object)}, but wraps the new entry in a present {@link Optional}. */
+    public static <T> Optional<WeightedMutationFunction<T>> ofOptional(double weight, T fn) {
+      return Optional.of(new WeightedMutationFunction<>(weight, fn));
+    }
+  }
+}
diff --git a/src/test/java/com/code_intelligence/jazzer/mutation/combinator/BUILD.bazel b/src/test/java/com/code_intelligence/jazzer/mutation/combinator/BUILD.bazel
@@ -8,6 +8,7 @@ java_test_suite(
     deps = [
         "//src/main/java/com/code_intelligence/jazzer/mutation/api",
         "//src/main/java/com/code_intelligence/jazzer/mutation/combinator",
+        "//src/main/java/com/code_intelligence/jazzer/mutation/engine",
         "//src/main/java/com/code_intelligence/jazzer/mutation/support",
         "//src/test/java/com/code_intelligence/jazzer/mutation/support:test_support",
     ],
diff --git a/src/test/java/com/code_intelligence/jazzer/mutation/combinator/SamplingUtilsTest.java b/src/test/java/com/code_intelligence/jazzer/mutation/combinator/SamplingUtilsTest.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2025 Code Intelligence GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.code_intelligence.jazzer.mutation.combinator;
+
+import static org.junit.jupiter.params.provider.Arguments.arguments;
+
+import com.code_intelligence.jazzer.mutation.api.PseudoRandom;
+import com.code_intelligence.jazzer.mutation.engine.SeededPseudoRandom;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/** Statistical checks that {@link SamplingUtils#weightedSampler} follows the given weights. */
+public class SamplingUtilsTest {
+  /**
+   * Supplies {@code (trials, tolerance, weights)} triples for {@link #testWeightedSampler}.
+   *
+   * <p>{@code N} and {@code T} are the default number of samples and the default tolerance; the
+   * larger uniform weight arrays at the end use their own values.
+   */
+  static Stream<?> weightsProvider() {
+    final int N = 1000000;
+    final double T = 0.03;
+    return Stream.of(
+        arguments(N, T, new double[] {1.0, 1.0, 1.0}),
+        arguments(N, T, new double[] {1.0, 2.0, 3.0, 4.0, 5.0}),
+        arguments(N, T, new double[] {0.1, 0.2, 0.3, 0.4}),
+        arguments(N, T, new double[] {10.0, 0.0, 0.1, 0.0, 90.0}),
+        arguments(N, T, new double[] {5.0, 5.0, 0.0, 0.0, 0.01, 5.0, 5.0}),
+        arguments(N, T, new double[] {0.0, 0.0, 0.0, 1.0}),
+        arguments(N, T, new double[] {1.0}),
+        arguments(N, T, new double[] {0.01, 0.01, 0.01, 0.97}),
+        arguments(N, T, new double[] {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}),
+        arguments(N, T, new double[] {0.001, 0.002, 0.003, 0.004, 0.005}),
+        arguments(N, T, new double[] {0.001, 0.002, 0.003, 0.004, 0.000001, 10.0}),
+        arguments(N, T, new double[] {0.001, 1000.0, 0.003, 10000.0, 0.005}),
+        arguments(N, T, IntStream.range(1, 10).mapToDouble(i -> i).toArray()),
+        arguments(N, 0.09, IntStream.range(1, 100).mapToDouble(i -> 1.0).toArray()),
+        arguments(N, 0.15, IntStream.range(1, 1000).mapToDouble(i -> 1.0).toArray()),
+        arguments(10000000, 0.15, IntStream.range(1, 10000).mapToDouble(i -> 1.0).toArray()),
+        arguments(100000000, 0.16, IntStream.range(1, 100000).mapToDouble(i -> 1.0).toArray()));
+  }
+
+  /**
+   * Samples {@code trials} times with a fixed seed and checks that every index is hit about as
+   * often as its weight predicts.
+   *
+   * @param trials number of samples to draw
+   * @param tolerance allowed absolute deviation per index, as a fraction of the count each index
+   *     would get under a uniform distribution ({@code trials / weights.length})
+   * @param weights weights passed to the sampler; the sampled values are the indices into it
+   */
+  @ParameterizedTest
+  @MethodSource("weightsProvider")
+  public void testWeightedSampler(int trials, double tolerance, double[] weights) {
+    Integer[] indices = IntStream.range(0, weights.length).boxed().toArray(Integer[]::new);
+    Function<PseudoRandom, Integer> sampler = SamplingUtils.weightedSampler(indices, weights);
+
+    PseudoRandom random = new SeededPseudoRandom(12345);
+    int[] counts = new int[indices.length];
+    for (int i = 0; i < trials; i++) {
+      counts[sampler.apply(random)]++;
+    }
+
+    // Calculate expected probabilities that are proportional to the weights.
+    double[] pExpected = new double[weights.length];
+    double sum = 0.0;
+    for (double w : weights) {
+      sum += w;
+    }
+    for (int i = 0; i < weights.length; i++) {
+      pExpected[i] = weights[i] / sum;
+    }
+
+    // Absolute tolerance: `tolerance` times the per-index count of a uniform distribution.
+    double tol = (double) trials / weights.length * tolerance;
+    // Ensure that every observed count is within `tol` of its expected count.
+    for (int i = 0; i < weights.length; i++) {
+      double expectedCount = trials * pExpected[i];
+      // Throw explicitly instead of using the `assert` keyword, which is skipped entirely when
+      // the JVM runs with assertions disabled and would let this test pass vacuously.
+      if (!(Math.abs(counts[i] - expectedCount) < tol)) {
+        throw new AssertionError(
+            String.format(
+                "Count for index %d out of tolerance: got %d, expected ~%.2f",
+                i, counts[i], expectedCount));
+      }
+    }
+  }
+}