docs(benchmark): add a benchmark section and fix fill command (#2093)

LouisTsai-Csie · marioevz · danceratopz · web-flow · commit b3624b5bfd70 · 2025-09-08T17:05:22.000+02:00
Co-authored-by: Mario Vega <marioevz@gmail.com>
Co-authored-by: danceratopz <danceratopz@gmail.com>
diff --git a/docs/navigation.md b/docs/navigation.md
@@ -15,6 +15,7 @@
       * [Adding a New Test](writing_tests/adding_a_new_test.md)
       * [Types of Test](writing_tests/types_of_tests.md)
       * [Writing a New Test](writing_tests/writing_a_new_test.md)
+      * [Benchmarks](writing_tests/benchmarks.md)
       * [Test Markers](writing_tests/test_markers.md)
       * [Verifying Changes Locally](writing_tests/verifying_changes.md)
       * [Code Standards](writing_tests/code_standards.md)
diff --git a/docs/templates/base.md.j2 b/docs/templates/base.md.j2
@@ -9,7 +9,11 @@ Documentation for [`{{ pytest_node_id }}@{{ short_git_ref }}`]({{ source_code_ur
 !!! example "Generate fixtures for these test cases for {{ target_or_valid_fork }} with:"
 
     ```console
+{% if is_benchmark %}
+    fill -v {{ pytest_node_id }} -m benchmark
+{% else %}
     fill -v {{ pytest_node_id }} --fork {{ target_or_valid_fork }}
+{% endif %}
     ```
 {% endif %}
 
diff --git a/docs/writing_tests/benchmarks.md b/docs/writing_tests/benchmarks.md
@@ -0,0 +1,60 @@
+# Benchmark Test Cases
+
+Benchmark tests aim to maximize the usage of a specific opcode, precompile, or operation within a transaction or block. They are located in the `./tests/benchmark` folder, and the available test cases are documented in the [test case reference](../tests/benchmark/index.md).
+
+To fill a benchmark test, in addition to the usual test flags, you must include the `-m benchmark` flag. This is necessary because benchmark tests are ignored by default; they must be manually selected via the `benchmark` pytest marker (i.e., "tag"). This marker is applied to all tests under `./tests/benchmark/` automatically by the framework.
+
+## Setting the Gas Limit for Benchmarking
+
+To consume the full benchmark gas limit, use the `gas_benchmark_value` fixture as the gas limit:
+
+```py
+def test_benchmark(
+    blockchain_test: BlockchainTestFiller,
+    pre: Alloc,
+    gas_benchmark_value: int
+):
+    ...
+```
+
+You can specify the block gas limit used in benchmark tests by setting the `--gas-benchmark-values` flag. This flag accepts a comma-separated list of values (in millions of gas), e.g. `--gas-benchmark-values 1,10,45,60`. This example would run the test 4 times, using a `gas_benchmark_value` of 1M, 10M, 45M, and 60M respectively.
+
+Do not set the transaction/block gas limit to `env.gas_limit`. When running in benchmark mode, the test framework sets this value to a very large number (e.g., `1_000_000_000_000`); this setup allows the framework to reuse a single genesis file for all specified gas limits. For example, the test below is invalid:
+
+```py
+def test_benchmark(
+    blockchain_test: BlockchainTestFiller,
+    pre: Alloc,
+    env: Environment
+):
+    ...
+    tx = Transaction(
+        to=opcode_address,
+        gas_limit=env.gas_limit,  # Invalid: do not use `env.gas_limit`; use `gas_benchmark_value` instead.
+        sender=pre.fund_eoa(),
+    )
+    ...
+```
+
+## Expected Gas Usage
+
+In benchmark mode, the developer should set the expected gas consumption using the `expected_benchmark_gas_used` field. Benchmark tests do not need to consume the full gas limit; instead, you can calculate and specify the expected usage. If `expected_benchmark_gas_used` is not set, the test will fall back to using `gas_benchmark_value` as the expected value.
+
+```py
+@pytest.mark.valid_from("Prague")
+def test_empty_block(
+    blockchain_test: BlockchainTestFiller,
+    pre: Alloc,
+):
+    """Test running an empty block as a baseline for fixed proving costs."""
+    blockchain_test(
+        pre=pre,
+        post={},
+        blocks=[Block(txs=[])],
+        expected_benchmark_gas_used=0,
+    )
+```
+
+This is a safety check to make sure the benchmark works as expected. For example, if a test uses the `JUMP` instruction but the jump destination is invalid, each transaction will stop early. That means it won't use as much gas as we expected.
+
+This check helps catch such issues. As a result, the post-storage comparison method via `SSTORE` is no longer needed, thereby reducing the additional storage cost.
diff --git a/src/pytest_plugins/filler/gen_test_doc/gen_test_doc.py b/src/pytest_plugins/filler/gen_test_doc/gen_test_doc.py
@@ -423,6 +423,8 @@ def create_function_page_props(self, test_functions: Dict["str", List[Item]]) ->
                 ]
             )
 
+            is_benchmark = items[0].get_closest_marker("benchmark") is not None
+
             self.function_page_props[function_id] = FunctionPageProps(
                 title=get_test_function_name(items[0]),
                 source_code_url=source_url,
@@ -437,6 +439,7 @@ def create_function_page_props(self, test_functions: Dict["str", List[Item]]) ->
                 docstring_one_liner=get_docstring_one_liner(items[0]),
                 html_static_page_target=f"./{get_test_function_name(items[0])}.html",
                 mkdocs_function_page_target=f"./{get_test_function_name(items[0])}/",
+                is_benchmark=is_benchmark,
             )
 
     def create_module_page_props(self) -> None:
@@ -451,6 +454,7 @@ def create_module_page_props(self) -> None:
                     path=module_path,
                     pytest_node_id=str(module_path),
                     package_name=get_import_path(module_path),
+                    is_benchmark=function_page.is_benchmark,
                     test_functions=[
                         TestFunction(
                             name=function_page.title,
@@ -462,6 +466,8 @@ def create_module_page_props(self) -> None:
                 )
             else:
                 existing_module_page = self.module_page_props[str(function_page.path)]
+                if function_page.is_benchmark:
+                    existing_module_page.is_benchmark = True
                 existing_module_page.test_functions.append(
                     TestFunction(
                         name=function_page.title,
@@ -493,15 +499,23 @@ def add_directory_page_props(self) -> None:
                 fork = self.target_fork
             else:
                 fork = directory_fork_name
+
+            is_benchmark = any(
+                module_page.is_benchmark
+                for module_page in self.module_page_props.values()
+                if module_page.path.parent == directory
+            )
+
             self.page_props[str(directory)] = DirectoryPageProps(
                 title=sanitize_string_title(str(directory.name)),
                 path=directory,
                 pytest_node_id=str(directory),
                 source_code_url=generate_github_url(directory, branch_or_commit_or_tag=self.ref),
                 # TODO: This won't work in all cases; should be from the development fork
                 # Currently breaks for `tests/unscheduled/eip7692_eof_v1/index.md`  # noqa: SC100
-                target_or_valid_fork=fork.capitalize(),
+                target_or_valid_fork=fork.capitalize() if fork else "Unknown",
                 package_name=get_import_path(directory),  # init.py will be used for docstrings
+                is_benchmark=is_benchmark,
             )
 
     def find_files_within_collection_scope(self, file_pattern: str) -> List[Path]:
diff --git a/src/pytest_plugins/filler/gen_test_doc/page_props.py b/src/pytest_plugins/filler/gen_test_doc/page_props.py
@@ -12,7 +12,7 @@
 
 import re
 from abc import abstractmethod
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import IO, Any, ContextManager, Dict, List, Protocol
 
@@ -104,6 +104,7 @@ class PagePropsBase:
     path: Path
     pytest_node_id: str
     package_name: str
+    is_benchmark: bool = False
 
     @property
     @abstractmethod
@@ -137,8 +138,8 @@ def write_page(self, file_opener: FileOpener, jinja2_env: Environment):
 class EipChecklistPageProps(PagePropsBase):
     """Properties used to generate the EIP checklist page."""
 
-    eip: int
-    lines: List[str]
+    eip: int = 0
+    lines: List[str] = field(default_factory=list)
 
     @property
     def template(self) -> str:
@@ -174,13 +175,13 @@ class FunctionPageProps(PagePropsBase):
     corresponding static HTML pages.
     """
 
-    test_case_count: int
-    fixture_formats: List[str]
-    test_type: str
-    docstring_one_liner: str
-    html_static_page_target: str
-    mkdocs_function_page_target: str
-    cases: List[TestCase]
+    test_case_count: int = 0
+    fixture_formats: List[str] = field(default_factory=list)
+    test_type: str = ""
+    docstring_one_liner: str = ""
+    html_static_page_target: str = ""
+    mkdocs_function_page_target: str = ""
+    cases: List[TestCase] = field(default_factory=list)
 
     @property
     def template(self) -> str:
@@ -229,7 +230,7 @@ class TestFunction:
 class ModulePageProps(PagePropsBase):
     """Definitions used for test modules, e.g., `tests/berlin/eip2930_access_list/test_acl.py`."""
 
-    test_functions: List[TestFunction]
+    test_functions: List[TestFunction] = field(default_factory=list)
 
     @property
     def template(self) -> str:
diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py
@@ -1 +1,8 @@
-"""abstract: Tests for zkVMs."""
+"""
+abstract: Benchmark tests for EVMs.
+    Benchmark tests aim to maximize the usage of a specific opcode,
+    precompile, or operation within a transaction or block. These can
+    be executed against EVM implementations to ensure they handle
+    pathological cases efficiently and correctly, allowing Ethereum to
+    safely [Scale the L1](https://protocol.ethereum.foundation/).
+"""