Skip to content

Commit 22339b3

Browse files
committed
refactor: unify error type to errno mapping for better sorting
- Replace error_type_counts (dict[str, int]) with errno2count (dict[int, int])
- Add get_errno_from_error_type() to map error type strings to errno (1, 2, 3)
- Add get_error_type_from_errno() for reverse mapping when error type strings are needed
- Update calculate_pi() to use errno2count and return dict[int, float]
- Update calculate_all_aggregated_parameters() to use errno2count and errno_tolerance_thresholds
- Update analysis_util.py and verify_aggregated_params.py to use errno2count
- Improve code maintainability by using integer errno for sorting and comparison
1 parent 498f60d commit 22339b3

File tree

3 files changed

+120
-56
lines changed

3 files changed

+120
-56
lines changed

graph_net/analysis_util.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from collections import OrderedDict, defaultdict
77
from graph_net.config.datatype_tolerance_config import get_precision
88
from graph_net import samples_statistics
9+
from graph_net.samples_statistics import get_errno_from_error_type
910

1011

1112
def extract_speedup_data_from_subdirs(benchmark_path: str) -> dict:
@@ -568,7 +569,7 @@ def calculate_s_scores(
568569
def print_stat_info(
569570
t_key,
570571
correct_count,
571-
error_type_counts,
572+
errno2count,
572573
pi,
573574
correct_negative_speedup_count,
574575
correct_speedups,
@@ -584,7 +585,7 @@ def print_stat_info(
584585
aggregated_params = samples_statistics.calculate_all_aggregated_parameters(
585586
total_samples=total_samples,
586587
correct_speedups=correct_speedups,
587-
error_type_counts=error_type_counts,
588+
errno2count=errno2count,
588589
t_key=t_key,
589590
negative_speedup_penalty=negative_speedup_penalty,
590591
fpdb=fpdb,
@@ -626,13 +627,13 @@ def print_stat_info(
626627
final_correct_count = 0
627628
final_correct_negative_speedup_count = 0
628629
final_correct_speedups = []
629-
final_error_type_counts = {} # Store error type counts at t=1
630+
final_errno2count = {} # Store error type counts at t=1 (using errno)
630631

631632
for t_key in t_keys:
632633
rectified_speedups = []
633634
rectified_speedups_fake_degrad = []
634635
correct_count = 0
635-
error_type_counts = {} # Dictionary to count errors by type
636+
errno2count = {} # Dictionary to count errors by errno
636637
correct_negative_speedup_count = 0
637638
correct_speedups = []
638639

@@ -652,9 +653,10 @@ def print_stat_info(
652653
if speedup is not None and speedup < 1:
653654
correct_negative_speedup_count += 1
654655

655-
# Count errors by type
656+
# Count errors by errno (convert error type string to errno)
656657
if fail_type is not None:
657-
error_type_counts[fail_type] = error_type_counts.get(fail_type, 0) + 1
658+
errno = get_errno_from_error_type(fail_type)
659+
errno2count[errno] = errno2count.get(errno, 0) + 1
658660

659661
# Store state at t=1 for ES(t) calculation
660662
if t_key == 1:
@@ -683,12 +685,12 @@ def print_stat_info(
683685
if t_key == 1:
684686
# Calculate pi at t=1 using the dedicated function
685687
pi = samples_statistics.calculate_pi(
686-
error_type_counts, total_samples, correct_speedups
688+
errno2count, total_samples, correct_speedups
687689
)
688690
final_correct_count = correct_count
689691
final_correct_negative_speedup_count = correct_negative_speedup_count
690692
final_correct_speedups = correct_speedups
691-
final_error_type_counts = error_type_counts.copy() # Save for t >= 1
693+
final_errno2count = errno2count.copy() # Save for t >= 1
692694

693695
if rectified_speedups:
694696
s_scores[t_key] = gmean(rectified_speedups)
@@ -700,17 +702,17 @@ def print_stat_info(
700702
expected_s, expected_es = print_stat_info(
701703
t_key,
702704
correct_count,
703-
error_type_counts,
705+
errno2count,
704706
pi,
705707
correct_negative_speedup_count,
706708
correct_speedups,
707709
)
708710
else:
709-
# For t >= 1, use error_type_counts from t=1 (frozen state)
711+
# For t >= 1, use errno2count from t=1 (frozen state)
710712
expected_s, expected_es = print_stat_info(
711713
t_key,
712714
final_correct_count,
713-
final_error_type_counts, # Use the frozen error_type_counts from t=1
715+
final_errno2count, # Use the frozen errno2count from t=1
714716
pi,
715717
final_correct_negative_speedup_count,
716718
final_correct_speedups,
@@ -722,6 +724,6 @@ def print_stat_info(
722724
s_scores._aggregated_results[t_key] = expected_s
723725
s_scores_fake_degrad._aggregated_results[t_key] = expected_es
724726

725-
print(f" - pi: {list(pi)}")
727+
print(f" - pi: {dict(sorted(pi.items()))}")
726728

727729
return s_scores, s_scores_fake_degrad

graph_net/samples_statistics.py

Lines changed: 82 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,56 @@
99
from collections.abc import Callable
1010

1111

12+
def get_errno_from_error_type(error_type: str) -> int:
13+
"""
14+
Map error type string to errno (error number) for sorting.
15+
16+
According to the paper:
17+
- c=1: accuracy errors (精度错误)
18+
- c=2: runtime crashes (运行时崩溃)
19+
- c=3: compilation failures (编译失败)
20+
21+
Args:
22+
error_type: Error type string (e.g., "accuracy", "eager", "compiled")
23+
24+
Returns:
25+
Errno (1, 2, or 3) based on error type
26+
"""
27+
if error_type == "accuracy":
28+
return 1
29+
elif error_type in ("eager", "other", "runtime_fail", "eager_fail"):
30+
return 2
31+
elif error_type in ("compiled", "compile_fail"):
32+
return 3
33+
else:
34+
# Default to 2 for unknown error types (runtime errors)
35+
return 2
36+
37+
38+
def get_error_type_from_errno(errno: int) -> str:
39+
"""
40+
Map errno (error number) back to error type string.
41+
42+
This is the reverse mapping of get_errno_from_error_type.
43+
Used when error type string information is needed.
44+
45+
Args:
46+
errno: Error number (1, 2, or 3)
47+
48+
Returns:
49+
Error type string:
50+
- 1 -> "accuracy"
51+
- 2 -> "runtime_fail"
52+
- 3 -> "compile_fail"
53+
"""
54+
errno_to_error_type = {
55+
1: "accuracy",
56+
2: "runtime_fail",
57+
3: "compile_fail",
58+
}
59+
return errno_to_error_type.get(errno, "runtime_fail")
60+
61+
1262
def calculate_alpha(correct_speedups: list[float]) -> float:
1363
"""
1464
Calculate alpha: geometric mean of correct sample speedups.
@@ -80,30 +130,31 @@ def calculate_eta(correct_speedups: list[float]) -> float:
80130

81131

82132
def calculate_pi(
83-
error_type_counts: dict[str, int], total_samples: int, correct_speedups: list[float]
84-
) -> dict[str, float]:
133+
errno2count: dict[int, int], total_samples: int, correct_speedups: list[float]
134+
) -> dict[int, float]:
85135
"""
86136
Calculate pi: error type proportions for t > 0.
87137
88138
According to Appendix C: pi_c is the proportion of error type c among all error samples.
89139
90140
Args:
91-
error_type_counts: Dictionary mapping error type names to their counts
141+
errno2count: Dictionary mapping errno (error number) to their counts.
142+
Errno values: 1=accuracy, 2=runtime, 3=compilation.
92143
total_samples: Total number of samples
93144
correct_speedups: List of speedup values for correct samples
94145
95146
Returns:
96-
Dictionary mapping error type names to their proportions among error samples.
147+
Dictionary mapping errno to their proportions among error samples.
97148
If error_count is 0, returns a dictionary with all proportions set to 0.0.
98149
"""
99150
correct_count = len(correct_speedups)
100151
error_count = total_samples - correct_count
101152
if error_count == 0:
102-
return {error_type: 0.0 for error_type in error_type_counts.keys()}
153+
return {errno: 0.0 for errno in errno2count.keys()}
103154

104155
pi = {}
105-
for error_type, count in error_type_counts.items():
106-
pi[error_type] = count / error_count
156+
for errno, count in errno2count.items():
157+
pi[errno] = count / error_count
107158
return pi
108159

109160

@@ -210,12 +261,12 @@ def calculate_es_t_from_aggregated(
210261
def calculate_all_aggregated_parameters(
211262
total_samples: int,
212263
correct_speedups: list[float],
213-
error_type_counts: dict[str, int],
264+
errno2count: dict[int, int],
214265
t_key: int,
215266
negative_speedup_penalty: float = 0.0,
216267
fpdb: float = 0.1,
217-
pi: dict[str, float] | None = None,
218-
error_tolerance_thresholds: dict[str, int] | None = None,
268+
pi: dict[int, float] | None = None,
269+
errno_tolerance_thresholds: dict[int, int] | None = None,
219270
) -> dict:
220271
"""
221272
Calculate all aggregated parameters for a given tolerance level.
@@ -225,15 +276,16 @@ def calculate_all_aggregated_parameters(
225276
Args:
226277
total_samples: Total number of samples
227278
correct_speedups: List of speedup values for correct samples
228-
error_type_counts: Dictionary mapping error type names to their counts
279+
errno2count: Dictionary mapping errno (error number) to their counts.
280+
Errno values: 1=accuracy, 2=runtime, 3=compilation.
229281
t_key: Tolerance level
230282
negative_speedup_penalty: Penalty power p for negative speedup
231283
fpdb: Base penalty b for severe errors
232-
pi: Dictionary mapping error type names to their proportions (calculated at t=1).
233-
If None, will be calculated from error_type_counts.
234-
error_tolerance_thresholds: Dictionary mapping error type names to their tolerance thresholds.
284+
pi: Dictionary mapping errno to their proportions (calculated at t=1).
285+
If None, will be calculated from errno2count.
286+
errno_tolerance_thresholds: Dictionary mapping errno to their tolerance thresholds.
235287
An error type is tolerated (not penalized) when t >= threshold.
236-
If None, uses default thresholds: {"accuracy": 1} for accuracy errors, 3 for others.
288+
If None, uses default thresholds: {1: 1} for accuracy errors (errno=1), {2: 3, 3: 3} for others.
237289
238290
Returns:
239291
Dictionary containing all aggregated parameters and calculated scores:
@@ -243,36 +295,34 @@ def calculate_all_aggregated_parameters(
243295
'lambda': float,
244296
'eta': float,
245297
'gamma': float,
246-
'pi': dict[str, float],
298+
'pi': dict[int, float],
247299
's_t': float,
248300
'es_t': float
249301
}
250302
"""
251303
# Use default error tolerance thresholds if not provided
252-
if error_tolerance_thresholds is None:
253-
error_tolerance_thresholds = {}
254-
for error_type in error_type_counts.keys():
255-
if error_type == "accuracy":
256-
error_tolerance_thresholds[error_type] = 1
257-
else:
258-
error_tolerance_thresholds[error_type] = 3
304+
if errno_tolerance_thresholds is None:
305+
errno_tolerance_thresholds = {}
306+
for errno in errno2count.keys():
307+
if errno == 1: # accuracy errors
308+
errno_tolerance_thresholds[errno] = 1
309+
else: # runtime (2) or compilation (3) errors
310+
errno_tolerance_thresholds[errno] = 3
259311

260312
# Calculate pi if not provided
261313
if pi is None:
262-
pi = calculate_pi(error_type_counts, total_samples, correct_speedups)
314+
pi = calculate_pi(errno2count, total_samples, correct_speedups)
263315

264316
# Convert dictionary-based pi and thresholds to indexed format for calculate_gamma
265-
# Create ordered list of error types for consistent indexing
266-
error_types = sorted(error_type_counts.keys())
267-
errno_tolerances = [
268-
error_tolerance_thresholds.get(error_type, 3) for error_type in error_types
269-
]
317+
# Create ordered list of errnos for consistent indexing (sorted by errno)
318+
errnos = sorted(errno2count.keys())
319+
errno_tolerances = [errno_tolerance_thresholds.get(errno, 3) for errno in errnos]
270320

271321
# Create get_pi function that maps error type index to pi value
272322
def get_pi(error_type_index: int) -> float:
273-
if error_type_index < len(error_types):
274-
error_type = error_types[error_type_index]
275-
return pi.get(error_type, 0.0)
323+
if error_type_index < len(errnos):
324+
errno = errnos[error_type_index]
325+
return pi.get(errno, 0.0)
276326
return 0.0
277327

278328
alpha = calculate_alpha(correct_speedups)

graph_net/verify_aggregated_params.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
from collections import OrderedDict, Counter
55
from graph_net import analysis_util
66
from graph_net import samples_statistics
7+
from graph_net.samples_statistics import (
8+
get_errno_from_error_type,
9+
get_error_type_from_errno,
10+
)
711

812

913
def calculate_aggregated_parameters(
@@ -45,7 +49,7 @@ def calculate_aggregated_parameters(
4549
final_correct_negative_speedup_count = 0
4650
final_correct_speedups = []
4751
final_slowdown_speedups = []
48-
final_error_type_counts = {} # Store error type counts at t=1
52+
final_errno2count = {} # Store error type counts at t=1 (using errno)
4953

5054
results = OrderedDict()
5155

@@ -74,10 +78,10 @@ def calculate_aggregated_parameters(
7478
slowdown_speedups = [speedup for speedup in correct_speedups if speedup < 1]
7579
correct_negative_speedup_count = len(slowdown_speedups)
7680

77-
# Count errors by type using Counter
78-
error_type_counts = dict(
81+
# Count errors by errno using Counter (convert error type string to errno)
82+
errno2count = dict(
7983
Counter(
80-
fail_type
84+
get_errno_from_error_type(fail_type)
8185
for _, _, _, _, fail_type in sample_data
8286
if fail_type is not None
8387
)
@@ -101,13 +105,13 @@ def calculate_aggregated_parameters(
101105
# Calculate pi at t=1 using the dedicated function
102106
if t_key == 1:
103107
pi = samples_statistics.calculate_pi(
104-
error_type_counts, total_samples, correct_speedups
108+
errno2count, total_samples, correct_speedups
105109
)
106110
final_correct_count = correct_count
107111
final_correct_negative_speedup_count = correct_negative_speedup_count
108112
final_correct_speedups = correct_speedups
109113
final_slowdown_speedups = slowdown_speedups
110-
final_error_type_counts = error_type_counts.copy() # Save for t >= 1
114+
final_errno2count = errno2count.copy() # Save for t >= 1
111115

112116
# Calculate aggregated parameters
113117
if total_samples > 0:
@@ -127,16 +131,16 @@ def calculate_aggregated_parameters(
127131
stats_slowdown_speedups = final_slowdown_speedups
128132

129133
# Calculate all aggregated parameters using the dedicated module
130-
# For t >= 1, use error_type_counts from t=1 (frozen state)
134+
# For t >= 1, use errno2count from t=1 (frozen state)
131135
if t_key < 1:
132-
stats_error_type_counts = error_type_counts
136+
stats_errno2count = errno2count
133137
else:
134-
stats_error_type_counts = final_error_type_counts # Use frozen from t=1
138+
stats_errno2count = final_errno2count # Use frozen from t=1
135139

136140
aggregated_params = samples_statistics.calculate_all_aggregated_parameters(
137141
total_samples=total_samples,
138142
correct_speedups=stats_correct_speedups,
139-
error_type_counts=stats_error_type_counts,
143+
errno2count=stats_errno2count,
140144
t_key=t_key,
141145
negative_speedup_penalty=negative_speedup_penalty,
142146
fpdb=fpdb,
@@ -184,9 +188,17 @@ def calculate_aggregated_parameters(
184188
)
185189
print(f" gamma (average error penalty): {gamma:.6f}")
186190
if t_key >= 1:
191+
# pi is now dict[int, float], convert to list for display
192+
errnos = sorted(pi.keys())
193+
pi_list = [pi[errno] for errno in errnos]
187194
indicator = [1 if t_key < 1 else 0, 1 if t_key < 3 else 0]
188-
pi_indicator_sum = sum(pi[i] * indicator[i] for i in range(len(pi)))
189-
print(f" - pi: {list(pi)}")
195+
# Calculate pi_indicator_sum using errno-based pi
196+
pi_indicator_sum = sum(
197+
pi.get(errno, 0.0) * indicator[min(i, len(indicator) - 1)]
198+
for i, errno in enumerate(errnos)
199+
)
200+
print(f" - pi (errno -> proportion): {dict(sorted(pi.items()))}")
201+
print(f" - pi (as list): {pi_list}")
190202
print(f" - indicator: {indicator}")
191203
print(
192204
f" - gamma = fpdb^(sum(pi[i] * indicator[i])) = {fpdb}^{pi_indicator_sum:.6f} = {gamma:.6f}"

0 commit comments

Comments (0)