33 changes: 33 additions & 0 deletions backends/test/suite/README.md
@@ -4,6 +4,39 @@ This directory contains tests that validate correctness and coverage of backends

These tests are intended to ensure that backends are robust and provide a smooth, "out-of-the-box" experience for users across the full span of input patterns. They are not intended to replace backend-specific tests, as they do not attempt to validate performance or verify that backends delegate the operators they expect to.

## Running Tests and Interpreting Output
Tests can be run from the command line, either through the `runner.py` entry point or the standard Python unittest runner. When run through `runner.py`, the runner reports test statistics, including the number of tests with each result type.
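For unittest-based discovery, a minimal sketch (the exact invocation is an assumption; it relies on the `load_tests` hook in `__init__.py` and on the package being importable as `executorch.backends.test.suite`):
```
python -m unittest executorch.backends.test.suite -v
```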

Backends can be specified with the `ET_TEST_ENABLED_BACKENDS` environment variable. By default, all available backends are enabled. Note that backends such as Core ML or Vulkan may require specific hardware or software to be available. See the documentation for each backend for information on requirements.

Example:
```
ET_TEST_ENABLED_BACKENDS=xnnpack python -m executorch.backends.test.suite.runner
```

```
2465 Passed / 2494
16 Failed
13 Skipped

[Success]
736 Delegated
1729 Undelegated

[Failure]
5 Lowering Fail
3 PTE Run Fail
8 Output Mismatch Fail
```

Outcomes can be interpreted as follows:
* Success (delegated): The test passed and at least one op was delegated by the backend.
* Success (undelegated): The test passed with no ops delegated by the backend. This still counts as a pass, as the partitioner is working as intended.
* Skipped: The test failed in eager mode or during export, which indicates a test or dynamo issue rather than a backend issue.
* Lowering fail: The test failed in `to_edge_transform_and_lower`.
* PTE run fail: The test errored out when loading or running the method.
* Output mismatch fail: The output delta (vs. eager mode) exceeds the configured tolerance.

## Backend Registration

To plug into the test framework, each backend should provide an implementation of the `Tester` class, defined in `backends/test/harness/tester.py`. Backends can provide implementations of each stage or use the default implementations, as appropriate.
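For illustration, a minimal sketch of what a backend integration might look like (the class and factory names are hypothetical, and the exact `Tester` constructor arguments and stage hooks should be taken from `backends/test/harness/tester.py`):
```
# Hypothetical example; the real constructor signature and stage hooks live in
# backends/test/harness/tester.py.
from executorch.backends.test.harness import Tester


class MyBackendTester(Tester):
    def __init__(self, module, example_inputs):
        # Supply backend-specific stage implementations here if the harness
        # defaults are not sufficient; otherwise the defaults are used.
        super().__init__(module, example_inputs)


def my_backend_tester_factory(model, inputs):
    # Matches the (model, inputs) call signature the suite uses for
    # tester_factory (see _test_op in backends/test/suite/__init__.py).
    return MyBackendTester(model, inputs)
```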
104 changes: 68 additions & 36 deletions backends/test/suite/__init__.py
@@ -16,6 +16,9 @@

import torch
from executorch.backends.test.harness import Tester
from executorch.backends.test.suite.context import get_active_test_context, TestContext
from executorch.backends.test.suite.reporting import log_test_summary
from executorch.backends.test.suite.runner import run_test, runner_main

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -60,17 +63,17 @@ def is_backend_enabled(backend):


DTYPES = [
torch.int8,
torch.uint8,
torch.int16,
torch.uint16,
torch.int32,
torch.uint32,
torch.int64,
torch.uint64,
torch.float16,
# torch.int8,
# torch.uint8,
# torch.int16,
# torch.uint16,
# torch.int32,
# torch.uint32,
# torch.int64,
# torch.uint64,
# torch.float16,
torch.float32,
torch.float64,
# torch.float64,
]

FLOAT_DTYPES = [
@@ -117,16 +120,19 @@ def _expand_test(cls, test_name: str):
delattr(cls, test_name)


def _make_wrapped_test(test_func, *args, **kwargs):
def _make_wrapped_test(
test_func: Callable,
test_name: str,
test_flow: str,
tester_factory: Callable,
params: dict | None = None,
):
def wrapped_test(self):
test_func(self, *args, **kwargs)
with TestContext(test_name, test_flow, params):
test_kwargs = params or {}
test_kwargs["tester_factory"] = tester_factory

return wrapped_test


def _make_wrapped_dtype_test(test_func, dtype, tester_factory):
def wrapped_test(self):
test_func(self, dtype, tester_factory)
test_func(self, **test_kwargs)

return wrapped_test

@@ -140,37 +146,63 @@ def _create_test_for_backend(
test_type = getattr(test_func, "test_type", TestType.STANDARD)

if test_type == TestType.STANDARD:
wrapped_test = _make_wrapped_test(test_func, tester_factory)
wrapped_test = _make_wrapped_test(
test_func, test_func.__name__, flow_name, tester_factory
)
test_name = f"{test_func.__name__}_{flow_name}"
setattr(cls, test_name, wrapped_test)
elif test_type == TestType.DTYPE:
for dtype in DTYPES:
# wrapped_test = _make_wrapped_dtype_test(test_func, dtype, tester_factory)
wrapped_test = _make_wrapped_test(test_func, dtype, tester_factory)
wrapped_test = _make_wrapped_test(
test_func,
test_func.__name__,
flow_name,
tester_factory,
{"dtype": dtype},
)
dtype_name = str(dtype)[6:] # strip "torch."
test_name = f"{test_func.__name__}_{dtype_name}_{flow_name}"
setattr(cls, test_name, wrapped_test)
else:
raise NotImplementedError(f"Unknown test type {test_type}.")


def load_tests(loader, suite, pattern):
package_dir = os.path.dirname(__file__)
discovered_suite = loader.discover(
start_dir=package_dir, pattern=pattern or "test_*.py"
)
suite.addTests(discovered_suite)
return suite


class OperatorTest(unittest.TestCase):
def _test_op(self, model, inputs, tester_factory):
tester = (
tester_factory(
model,
inputs,
)
.export()
.to_edge_transform_and_lower()
context = get_active_test_context()

# This should be set in the wrapped test. See _make_wrapped_test above.
assert context is not None, "Missing test context."

run_summary = run_test(
model,
inputs,
tester_factory,
context.test_name,
context.flow_name,
context.params,
)

is_delegated = any(
n.target == torch._higher_order_ops.executorch_call_delegate
for n in tester.stages[tester.cur].graph_module.graph.nodes
if n.op == "call_function"
)
log_test_summary(run_summary)

if not run_summary.result.is_success():
if run_summary.result.is_backend_failure():
raise RuntimeError("Test failure.") from run_summary.error
else:
# Non-backend failure indicates a bad test. Mark as skipped.
raise unittest.SkipTest(
f"Test failed for reasons other than backend failure. Error: {run_summary.error}"
)


# Only run the runtime test if the op was delegated.
if is_delegated:
(tester.to_executorch().serialize().run_method_and_compare_outputs())
if __name__ == "__main__":
runner_main()
28 changes: 28 additions & 0 deletions backends/test/suite/context.py
@@ -0,0 +1,28 @@
# Test run context management. This is used to determine the test context for reporting
# purposes.
class TestContext:
def __init__(self, test_name: str, flow_name: str, params: dict | None):
self.test_name = test_name
self.flow_name = flow_name
self.params = params

def __enter__(self):
global _active_test_context
import sys

if _active_test_context is not None:
print(f"Active context: {_active_test_context.test_name}", file=sys.stderr)
assert _active_test_context is None
_active_test_context = self

def __exit__(self, exc_type, exc_value, traceback):
global _active_test_context
_active_test_context = None


_active_test_context: TestContext | None = None


def get_active_test_context() -> TestContext | None:
global _active_test_context
return _active_test_context
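For reference, a brief usage sketch of the context manager (names are illustrative; this mirrors how `_make_wrapped_test` in `backends/test/suite/__init__.py` uses it):
```
from executorch.backends.test.suite.context import TestContext, get_active_test_context

# Wrap a test body so that reporting code can look up the active test.
with TestContext("test_add", "xnnpack", {"dtype": "float32"}):
    context = get_active_test_context()
    assert context is not None
    print(context.test_name, context.flow_name, context.params)

# Outside the block, no context is active.
assert get_active_test_context() is None
```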
163 changes: 163 additions & 0 deletions backends/test/suite/reporting.py
@@ -0,0 +1,163 @@
from collections import Counter
from dataclasses import dataclass
from enum import IntEnum, nonmember


class TestResult(IntEnum):
"""Represents the result of a test case run, indicating success or a specific failure reason."""

SUCCESS = 0
""" The test succeeded with the backend delegate part or all of the graph. """

SUCCESS_UNDELEGATED = 1
""" The test succeeded without the backend delegating anything. """

EAGER_FAIL = 2
""" The test failed due to the model failing to run in eager mode. """

EXPORT_FAIL = 3
""" The test failed due to the model failing to export. """

LOWER_FAIL = 4
""" The test failed due to a failure in partitioning or lowering. """

PTE_LOAD_FAIL = 5
""" The test failed due to the resulting PTE failing to load. """

PTE_RUN_FAIL = 6
""" The test failed due to the resulting PTE failing to run. """

OUTPUT_MISMATCH_FAIL = 7
""" The test failed due to a mismatch between runtime and reference outputs. """

UNKNOWN_FAIL = 8
""" The test failed in an unknown or unexpected manner. """

@nonmember
def is_success(self):
return self in {TestResult.SUCCESS, TestResult.SUCCESS_UNDELEGATED}

@nonmember
def is_non_backend_failure(self):
return self in {TestResult.EAGER_FAIL, TestResult.EXPORT_FAIL}

@nonmember
def is_backend_failure(self):
return not self.is_success() and not self.is_non_backend_failure()

@nonmember
def display_name(self):
if self == TestResult.SUCCESS:
return "Success (Delegated)"
elif self == TestResult.SUCCESS_UNDELEGATED:
return "Success (Undelegated)"
elif self == TestResult.EAGER_FAIL:
return "Fail (Eager)"
elif self == TestResult.EXPORT_FAIL:
return "Fail (Export)"
elif self == TestResult.LOWER_FAIL:
return "Fail (Lowering)"
elif self == TestResult.PTE_LOAD_FAIL:
return "Fail (PTE Load)"
elif self == TestResult.PTE_RUN_FAIL:
return "Fail (PTE Run)"
elif self == TestResult.OUTPUT_MISMATCH_FAIL:
return "Fail (Output Mismatch)"
elif self == TestResult.UNKNOWN_FAIL:
return "Fail (Other)"
else:
raise ValueError(f"Invalid TestResult value: {self}.")


@dataclass
class TestCaseSummary:
"""
Contains summary results for the execution of a single test case.
"""

name: str
""" The qualified name of the test, not including the flow suffix. """

flow: str
""" The backend-specific flow name. Corresponds to flows registered in backends/test/suite/__init__.py. """

params: dict | None
""" Test-specific parameters, such as dtype. """

result: TestResult
""" The top-level result, such as SUCCESS or LOWER_FAIL. """

error: Exception | None
""" The Python exception object, if any. """


class TestSessionState:
test_case_summaries: list[TestCaseSummary]

def __init__(self):
self.test_case_summaries = []


@dataclass
class RunSummary:
aggregated_results: dict[TestResult, int]
num_test_cases: int
test_case_summaries: list[TestCaseSummary]
total_failed: int
total_passed: int
total_skipped: int

@classmethod
def from_session(cls, session: TestSessionState) -> "RunSummary":
# Total each outcome type.
aggregated_results = dict(
sorted(Counter(s.result for s in session.test_case_summaries).items())
)

total_failed = 0
total_passed = 0
total_skipped = 0

for k, v in aggregated_results.items():
if k.is_success():
total_passed += v
elif k.is_backend_failure():
total_failed += v
else:
total_skipped += v

return cls(
aggregated_results=aggregated_results,
num_test_cases=len(session.test_case_summaries),
test_case_summaries=session.test_case_summaries,
total_failed=total_failed,
total_passed=total_passed,
total_skipped=total_skipped,
)


_active_session: TestSessionState | None = None


def begin_test_session():
global _active_session

assert _active_session is None, "A test session is already active."
_active_session = TestSessionState()


def log_test_summary(summary: TestCaseSummary):
global _active_session

if _active_session is not None:
_active_session.test_case_summaries.append(summary)


def complete_test_session() -> RunSummary:
global _active_session

assert _active_session is not None, "No test session is active."
summary = RunSummary.from_session(_active_session)
_active_session = None

return summary
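For reference, an illustrative sketch of the session lifecycle these helpers support (the summary values are made up; the actual driver is expected to live in `runner.py`):
```
from executorch.backends.test.suite.reporting import (
    TestCaseSummary,
    TestResult,
    begin_test_session,
    complete_test_session,
    log_test_summary,
)

begin_test_session()

# Each test case logs one summary; see _test_op in backends/test/suite/__init__.py.
log_test_summary(
    TestCaseSummary(
        name="test_add",
        flow="xnnpack",
        params=None,
        result=TestResult.SUCCESS,
        error=None,
    )
)

summary = complete_test_session()
print(summary.total_passed, summary.total_failed, summary.total_skipped)
```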