PAWL/integration/ir/profile/runtime_compare.py at main · Protonk/PAWL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
"""Runtime equivalence comparison for the five-point harness.

Generates a probe plan from a Profile IR's explicit slots, executes both
SBPL texts through PolicyWitness, and compares step signatures to determine
behavioral equivalence.

The comparison answers: "does the reversed SBPL produce the same sandbox
decisions as the original source SBPL for every explicit operation?"

When signatures diverge, the first distinguishing step is recorded so the
caller can triage whether the behavioral difference is expected (e.g. an
import that the reverser can't reproduce) or a bug.
"""
from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Any

from runtime.core.unavailability import (
    EXECUTION_ERROR,
    NO_TARGETS,
    RUNNER_MISSING,
    UNSTABLE,
    ProbeUnavailable,
)

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Result type
# ---------------------------------------------------------------------------

@dataclass
class RuntimeCompareResult:
    """Result record for one source-vs-reversed runtime comparison.

    Attributes
    ----------
    equal:
        Tri-state verdict: ``True`` when every step signature matched,
        ``False`` when the signatures diverged, ``None`` when no verdict
        could be reached (runner unavailable, etc.).
    status:
        ``"ok"`` | ``"mismatch"`` | ``"unavailable"`` | ``"unstable"``
    first_divergence:
        Description of the first step at which the two runs differ; only
        populated when ``equal is False``.
    diagnostics:
        Short reason strings explaining why the comparison was degraded
        or could not run.
    sig_a:
        Complete step signature of the source SBPL run, when one exists.
    sig_b:
        Complete step signature of the reversed SBPL run, when one exists.
    unavailable:
        JSON-style dicts describing why probing was unavailable (in this
        module they are produced by ``ProbeUnavailable.to_json()``).
    """
    equal: bool | None
    status: str
    first_divergence: dict[str, Any] | None = None
    diagnostics: list[str] | None = None
    sig_a: list[dict[str, Any]] | None = None
    sig_b: list[dict[str, Any]] | None = None
    unavailable: list[dict[str, Any]] | None = None

    def to_distinguishing(self) -> dict[str, Any]:
        """Build the payload for the harness ``distinguishing`` field.

        ``status`` is always present.  ``first_divergence``, ``diagnostics``
        and ``unavailable`` are included, in that order, only when they are
        truthy (i.e. neither ``None`` nor empty).
        """
        optional_fields = (
            ("first_divergence", self.first_divergence),
            ("diagnostics", self.diagnostics),
            ("unavailable", self.unavailable),
        )
        payload: dict[str, Any] = {"status": self.status}
        payload.update((key, value) for key, value in optional_fields if value)
        return payload


# ---------------------------------------------------------------------------
# Probe plan generator
# ---------------------------------------------------------------------------

# Map operation prefixes to attempt kinds.
_OP_ATTEMPT_MAP: list[tuple[str, str, str]] = [
    # (prefix, attempt_kind, attempt_action)
    ("file-read", "file", "read"),
    ("file-write", "file", "write"),
    ("file-", "file", "access"),
    ("mach-lookup", "mach_lookup", "lookup"),
    ("mach-", "mach", "access"),
    ("iokit-open", "iokit", "open"),
    ("iokit-", "iokit", "access"),
    ("network-", "network", "access"),
    ("process-exec", "process", "exec"),
    ("process-", "process", "access"),
    ("signal", "signal", "send"),
    ("sysctl-", "sysctl", "access"),
    ("ipc-", "ipc", "access"),
    ("socket-", "socket", "access"),
    ("system-", "system", "access"),
    ("user-preference-", "user_preference", "access"),
]


def _attempt_for_op(op_name: str) -> tuple[str, str]:
    """Return (attempt_kind, attempt_action) for an operation name."""
    for prefix, kind, action in _OP_ATTEMPT_MAP:
        if op_name.startswith(prefix):
            return kind, action
    return "generic", "access"


def _filter_from_node(
    node_offset: int,
    nodes: dict[int, dict],
) -> dict[str, str] | None:
    """Extract the first filter predicate from a node's subgraph."""
    visited: set[int] = set()
    stack = [node_offset]
    while stack:
        off = stack.pop()
        if off in visited:
            continue
        visited.add(off)
        node = nodes.get(off)
        if node is None:
            continue
        if node["kind"] == "non_terminal":
            fid = node.get("filter_id")
            if fid is not None:
                return {"kind": "filter_id", "value": str(fid)}
            m = node.get("match_offset")
            u = node.get("unmatch_offset")
            if m is not None:
                stack.append(m)
            if u is not None:
                stack.append(u)
    return None


def _probe_step(
    step_id: str,
    operation: str,
    kind: str,
    action: str,
    target: str,
) -> dict[str, Any]:
    """Build one probe-plan step with an unfiltered sandbox check."""
    return {
        "step_id": step_id,
        "status": "ok",
        "sandbox_check": {
            "operation": operation,
            "filter": {"kind": "none", "value": ""},
        },
        "attempt": {
            "kind": kind,
            "action": action,
            "target": target,
        },
    }


def generate_probe_plan(ir: dict) -> dict[str, Any]:
    """Generate a probe plan from a Profile IR's explicit slots.

    For each slot in ``ir["op_table"]["slots"]`` whose ``assignment_class``
    is ``"explicit"``, emits two steps: an allow-path probe targeting
    ``/tmp/harness_probe_<op>`` and a deny-path probe targeting
    ``/nonexistent/harness_deny_<op>``.  Both steps carry the same
    unfiltered sandbox check; only the attempt target differs.

    Only the op table is consulted; the IR's node graph is not read, so a
    malformed node graph cannot make plan generation fail.

    Parameters
    ----------
    ir:
        Profile IR dict.  A missing ``op_table`` or ``slots`` key is treated
        as "no slots".

    Returns
    -------
    dict
        ``{"status": "ok", "steps": [...]}``, compatible with
        ``run_policy_witness``'s ``probe_plan`` parameter.  ``steps`` is
        empty when there is nothing explicit to probe.

    Raises
    ------
    KeyError
        If an explicit slot has no ``op_name``.
    """
    slots = ir.get("op_table", {}).get("slots", [])

    steps: list[dict[str, Any]] = []
    for slot in slots:
        if slot.get("assignment_class") != "explicit":
            continue
        op_name = slot["op_name"]
        # Skip wildcard operations — the runner can't probe them directly.
        # Sub-ops (e.g. file-read-data) are probed individually.
        if op_name.endswith("*"):
            continue
        kind, action = _attempt_for_op(op_name)

        steps.append(_probe_step(
            f"harness_{op_name}_allow",
            op_name,
            kind,
            action,
            f"/tmp/harness_probe_{op_name}",
        ))
        # Deny-path probe: use a target that should be denied.
        steps.append(_probe_step(
            f"harness_{op_name}_deny",
            op_name,
            kind,
            action,
            f"/nonexistent/harness_deny_{op_name}",
        ))

    return {"status": "ok", "steps": steps}


def generate_mf_probe_plan(ir: dict) -> dict[str, Any]:
    """Extend the base probe plan with message-filter (0x4000) probes.

    Starts from ``generate_probe_plan(ir)``.  When bit ``0x4000`` is set in
    ``ir["profile_type_flags"]``, appends one unfiltered probe step for each
    of ``iokit-open-user-client`` and ``mach-bootstrap`` that the base plan
    does not already probe.  Profiles without the flag get the base plan
    back unchanged.
    """
    plan = generate_probe_plan(ir)
    if not (ir.get("profile_type_flags", 0) & 0x4000):
        return plan

    # Operations the base plan already covers; these are not duplicated.
    covered_ops = {
        step.get("sandbox_check", {}).get("operation")
        for step in plan.get("steps", [])
    }
    for op in ("iokit-open-user-client", "mach-bootstrap"):
        if op in covered_ops:
            continue
        kind, action = _attempt_for_op(op)
        mf_step = {
            "step_id": f"harness_mf_{op}",
            "status": "ok",
            "sandbox_check": {
                "operation": op,
                "filter": {"kind": "none", "value": ""},
            },
            "attempt": {
                "kind": kind,
                "action": action,
                "target": f"/tmp/harness_mf_{op}",
            },
        }
        plan["steps"].append(mf_step)
    return plan


# ---------------------------------------------------------------------------
# Step signature extraction
# ---------------------------------------------------------------------------

def _extract_steps_from_record(record: dict) -> list[dict]:
    """Extract step signatures from a PolicyWitness run record."""
    stdout_json = record.get("stdout_json")
    if not isinstance(stdout_json, dict):
        return []
    data = stdout_json.get("data")
    if not isinstance(data, dict):
        return []
    runner_result = data.get("runner_result")
    if not isinstance(runner_result, dict):
        return []
    steps = runner_result.get("steps")
    if not isinstance(steps, list):
        return []

    signature = []
    for step in steps:
        if not isinstance(step, dict):
            continue
        sandbox_check = (
            step.get("sandbox_check")
            if isinstance(step.get("sandbox_check"), dict)
            else {}
        )
        attempt = (
            step.get("attempt")
            if isinstance(step.get("attempt"), dict)
            else {}
        )
        signature.append({
            "step_id": step.get("step_id"),
            "sandbox_outcome": sandbox_check.get("outcome"),
            "sandbox_errno": sandbox_check.get("errno"),
            "attempt_outcome": attempt.get("outcome"),
            "attempt_errno": attempt.get("errno") or attempt.get("syscall_errno"),
        })
    return signature


def _extract_error_from_record(record: dict) -> str | None:
    """Extract an error message from a PolicyWitness run record."""
    stdout_json = record.get("stdout_json")
    if isinstance(stdout_json, dict):
        result = stdout_json.get("result")
        if isinstance(result, dict):
            return result.get("error")
    return None


# ---------------------------------------------------------------------------
# Runtime comparator
# ---------------------------------------------------------------------------

def _run_sbpl(
    sbpl_text: str,
    probe_plan: dict[str, Any],
    *,
    stability_runs: int,
    runner_requirements: dict[str, Any] | None = None,
    param_bindings: dict[str, str] | None = None,
) -> tuple[list[dict] | None, list[str], list[ProbeUnavailable]]:
    """Run one SBPL text through PolicyWitness and return its step signature.

    The probe plan is executed ``stability_runs`` times (at least once); the
    signatures of all runs must be identical for the result to be trusted.

    Parameters
    ----------
    sbpl_text:
        SBPL source passed to the executor as ``policy_spec["sbpl_source"]``.
    probe_plan:
        Probe plan forwarded unchanged to ``run_policy_witness``.
    stability_runs:
        Number of repeated executions.  Values below 1 are treated as 1 so
        that there is always a signature to return.
    runner_requirements:
        Forwarded unchanged to ``run_policy_witness``.
    param_bindings:
        When non-empty, passed to the executor as ``policy_spec["params"]``.

    Returns
    -------
    tuple
        ``(signature, diagnostics, unavailable)``.  On success this is
        ``(first_run_signature, [], [])``.  On any failure (executor import
        failed, executor raised, run status not ``"ok"``, no steps
        extracted, or runs disagreeing) it is
        ``(None, [reason], [ProbeUnavailable(...)])``.
    """
    try:
        from runtime.core import executor as rt_executor
    except ImportError:
        return None, ["import_failed"], [
            ProbeUnavailable(category=RUNNER_MISSING, reason="import_failed", layer="execution")
        ]

    # A non-positive count would leave no run to report, so run at least once.
    run_count = max(1, stability_runs)

    runs: list[list[dict]] = []
    for run_idx in range(run_count):
        # Fresh spec per run so nothing carries over between executions.
        policy_spec: dict[str, Any] = {
            "format": "sbpl",
            "sbpl_source": sbpl_text,
        }
        if param_bindings:
            policy_spec["params"] = param_bindings
        try:
            execution = rt_executor.run_policy_witness(
                policy_spec=policy_spec,
                probe_plan=probe_plan,
                runner_requirements=runner_requirements,
            )
        except Exception as exc:
            # Boundary with the external runner: any failure is reported as
            # an unavailable comparison rather than propagated.
            reason = f"execution_error_run{run_idx}: {exc}"
            return None, [reason], [
                ProbeUnavailable(category=EXECUTION_ERROR, reason=reason, layer="execution")
            ]

        if execution.summary.status != "ok":
            error = _extract_error_from_record(execution.record)
            diag = f"run_status_{execution.summary.status}"
            if error:
                diag += f": {error}"
            return None, [diag], [
                ProbeUnavailable(category=EXECUTION_ERROR, reason=diag, layer="execution")
            ]

        sig = _extract_steps_from_record(execution.record)
        if not sig:
            reason = f"no_steps_run{run_idx}"
            return None, [reason], [
                ProbeUnavailable(category=NO_TARGETS, reason=reason, layer="execution")
            ]
        runs.append(sig)

    # Stability: all runs must agree with the first one.
    baseline = runs[0]
    if any(later != baseline for later in runs[1:]):
        return None, ["unstable"], [
            ProbeUnavailable(category=UNSTABLE, reason="unstable", layer="execution")
        ]

    return baseline, [], []


def _prefixed_unavailable(
    entries: list[ProbeUnavailable],
    prefix: str,
) -> list[dict[str, Any]] | None:
    """Serialize ``entries`` with ``prefix`` prepended to each reason (``None`` if empty)."""
    serialized = [
        ProbeUnavailable(
            category=entry.category,
            reason=f"{prefix}{entry.reason}",
            layer=entry.layer,
            operation=entry.operation,
        ).to_json()
        for entry in entries
    ]
    return serialized or None


def _first_divergence(
    sig_a: list[dict[str, Any]],
    sig_b: list[dict[str, Any]],
) -> dict[str, Any]:
    """Describe the first position at which two step signatures differ."""
    for step_a, step_b in zip(sig_a, sig_b):
        if step_a != step_b:
            return {
                "step_id": step_a.get("step_id"),
                "source": step_a,
                "reversed": step_b,
            }
    # The common prefix is identical, so the difference is one of length.
    if len(sig_a) < len(sig_b):
        longer, extra = "reversed", sig_b[len(sig_a):]
    else:
        longer, extra = "source", sig_a[len(sig_b):]
    return {
        "step_id": extra[0].get("step_id") if extra else None,
        "reason": f"{longer} has {len(extra)} extra steps",
    }


def compare_runtime(
    source_sbpl: str,
    reversed_sbpl: str,
    probe_plan: dict[str, Any],
    *,
    stability_runs: int = 2,
    runner_requirements: dict[str, Any] | None = None,
    param_bindings: dict[str, str] | None = None,
) -> RuntimeCompareResult:
    """Compare runtime behavior of source and reversed SBPL via PolicyWitness.

    Runs the probe plan against each SBPL text through ``_run_sbpl`` and
    compares the resulting step signatures position by position.  The
    source text is run first; if it cannot produce a signature, the
    reversed text is not run at all.

    Parameters
    ----------
    source_sbpl:
        The original SBPL text.
    reversed_sbpl:
        The SBPL text recovered by the reverser.
    probe_plan:
        Plan dict with a ``steps`` list.  An empty or missing list yields a
        vacuous ``equal=True`` result tagged ``vacuous_no_steps``.
    stability_runs:
        How many times each SBPL text is executed; forwarded to
        ``_run_sbpl``.
    runner_requirements:
        Optional runner requirements.  When ``requires_xpc`` is truthy, a
        runner preflight is performed first and a failing or raising
        preflight yields an ``"unavailable"`` result.
    param_bindings:
        Optional mapping of param names to values. Required for profiles
        that use ``(param "NAME")`` expressions - without bindings, the
        compiler will fail with errors like "string-append: argument 1
        must be: string".

    Returns
    -------
    RuntimeCompareResult
        ``status="ok"`` with ``equal=True`` when signatures match,
        ``status="mismatch"`` with ``first_divergence`` set when they do
        not, and ``status="unavailable"`` with ``equal=None`` when either
        run (or the preflight) could not complete.
    """
    steps = probe_plan.get("steps", [])
    if not steps:
        unavailable = ProbeUnavailable(category=NO_TARGETS, reason="vacuous_no_steps", layer="execution")
        return RuntimeCompareResult(
            equal=True,
            status="ok",
            diagnostics=["vacuous_no_steps"],
            unavailable=[unavailable.to_json()],
        )

    # Check runner availability when external runner is required.
    if runner_requirements and runner_requirements.get("requires_xpc"):
        try:
            from runtime.core import preflight as runtime_preflight

            preflight = runtime_preflight.run_runner_preflight(
                required_entitlements=runner_requirements.get("required_entitlements"),
                requires_xpc=True,
            )
            if preflight.status != "ok":
                unavailable = ProbeUnavailable(category=RUNNER_MISSING, reason="no_external_runner", layer="execution")
                return RuntimeCompareResult(
                    equal=None,
                    status="unavailable",
                    diagnostics=["no_external_runner"],
                    unavailable=[unavailable.to_json()],
                )
        except Exception as exc:
            # Any preflight failure (including a failed import) means the
            # runner cannot be relied on; report it instead of raising.
            reason = f"runner_check_failed: {exc}"
            unavailable = ProbeUnavailable(category=RUNNER_MISSING, reason=reason, layer="execution")
            return RuntimeCompareResult(
                equal=None,
                status="unavailable",
                diagnostics=[reason],
                unavailable=[unavailable.to_json()],
            )

    sig_a, diag_a, unavailable_a = _run_sbpl(
        source_sbpl, probe_plan,
        stability_runs=stability_runs,
        runner_requirements=runner_requirements,
        param_bindings=param_bindings,
    )
    if sig_a is None:
        return RuntimeCompareResult(
            equal=None,
            status="unavailable",
            diagnostics=["source_" + d for d in diag_a],
            unavailable=_prefixed_unavailable(unavailable_a, "source_"),
        )

    sig_b, diag_b, unavailable_b = _run_sbpl(
        reversed_sbpl, probe_plan,
        stability_runs=stability_runs,
        runner_requirements=runner_requirements,
        param_bindings=param_bindings,
    )
    if sig_b is None:
        # Keep the source signature so the caller can still inspect it.
        return RuntimeCompareResult(
            equal=None,
            status="unavailable",
            diagnostics=["reversed_" + d for d in diag_b],
            sig_a=sig_a,
            unavailable=_prefixed_unavailable(unavailable_b, "reversed_"),
        )

    # Compare.
    if sig_a == sig_b:
        return RuntimeCompareResult(
            equal=True,
            status="ok",
            sig_a=sig_a,
            sig_b=sig_b,
        )

    return RuntimeCompareResult(
        equal=False,
        status="mismatch",
        first_divergence=_first_divergence(sig_a, sig_b),
        sig_a=sig_a,
        sig_b=sig_b,
    )