PAWL/pawl/contract/coordinates.py at main · Protonk/PAWL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""Coordinate index types for the IR envelope.

These types bridge Policy IR nodes to their Evidence IR (blob) origins via
byte offsets, node indices, and structural hashes.  The coordinate index
is populated after both lanes produce their outputs, during envelope assembly.

Types:

- ``NodeCoordinate``:  per-node blob location (indices + offsets).
- ``CoordinateIndex``: full node-to-blob bridge + op-table index.
- ``SectionHashes``:   per-section SHA-256 for the three blob regions.
- ``HashBoundaries``:  structural hashes at three granularities.

The digest module (``pawl.structure.inventory.digests.api``) computes
section hashes from raw blobs; this module defines the contract-level
representations that the envelope consumes.  Subgraph hashes come from
normalized ``Policy`` metadata (``structural_hash_entry`` written by
``pawl.forward.layer2.normalize.canonicalize_operation``).
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Mapping, Sequence

from pawl.structure.core.blob_geometry import (
    NODE_BYTE_SIZE,
    node_index_to_absolute_offset,
)


@dataclass(frozen=True)
class NodeCoordinate:
    """Blob-side (Evidence IR) location of one Policy IR node.

    The positional fields are tuples rather than scalars: the compiler
    shares nodes across operations, so a single ``NodeTest`` may
    correspond to several blob offsets.
    """

    node_indices: tuple[int, ...] = ()
    byte_offsets: tuple[int, ...] = ()
    literal_pool_offset: int | None = None
    absence_reason: str | None = None

    def as_dict(self) -> dict[str, object]:
        """Return a plain dict holding only the populated fields.

        Empty tuples and ``None`` values are omitted; tuples are emitted
        as lists.
        """
        tuple_fields = (
            ("node_indices", self.node_indices),
            ("byte_offsets", self.byte_offsets),
        )
        optional_fields = (
            ("literal_pool_offset", self.literal_pool_offset),
            ("absence_reason", self.absence_reason),
        )
        # Tuple-valued fields are dropped when empty ...
        payload: dict[str, object] = {
            key: list(values) for key, values in tuple_fields if values
        }
        # ... scalar fields only when they are None (0 and "" are kept).
        payload.update(
            (key, value) for key, value in optional_fields if value is not None
        )
        return payload


@dataclass(frozen=True)
class CoordinateIndex:
    """Node-to-blob bridge plus the op-table index.

    ``nodes`` is keyed by node_id and holds that node's blob coordinate.
    Every Policy IR node must carry either a populated coordinate or an
    explicit ``absence_reason``.

    ``op_table`` is keyed by operation name and holds the operation's
    op-table entry index in the compiled blob.
    """

    nodes: Mapping[str, NodeCoordinate] = field(default_factory=dict)
    op_table: Mapping[str, int] = field(default_factory=dict)

    def as_dict(self) -> dict[str, object]:
        """Return a plain-dict form with every coordinate serialized."""
        serialized_nodes: dict[str, object] = {}
        for node_id, coordinate in self.nodes.items():
            serialized_nodes[node_id] = coordinate.as_dict()
        # Copy so the result never shares the stored mapping.
        op_table_copy = dict(self.op_table)
        return {"nodes": serialized_nodes, "op_table": op_table_copy}


@dataclass(frozen=True)
class SectionHashes:
    """Per-section SHA-256 hashes of a compiled blob's three regions.

    This is the contract-level representation, with field names that
    match the IR envelope contract.  The digest pipeline keeps a
    structurally identical type in
    ``pawl.structure.inventory.digests.api``.
    """

    op_table: str
    node_stream: str
    literal_pool: str

    def as_dict(self) -> dict[str, object]:
        """Return the three hashes keyed by section name."""
        section_names = ("op_table", "node_stream", "literal_pool")
        return {name: getattr(self, name) for name in section_names}


@dataclass(frozen=True)
class HashBoundaries:
    """Structural hashes at three granularities.

    - ``profile_hash``: SHA-256 of the full compiled blob.  Null when
      no blob is available (Source-Evaluate path).
    - ``section_hashes``: per-section SHA-256 (op_table, node_stream,
      literal_pool).  Null when no blob is available.
    - ``subgraph_hashes``: per-operation SHA-256 of the reachable
      subgraph, computed from normalized ``Policy`` metadata
      (``structural_hash_entry`` written by ``canonicalize_operation``).
    """

    profile_hash: str | None = None
    section_hashes: SectionHashes | None = None
    subgraph_hashes: Mapping[str, str] = field(default_factory=dict)

    def as_dict(self) -> dict[str, object]:
        """Return a plain-dict form; all three keys are always present."""
        sections = self.section_hashes
        # A missing section record stays an explicit None in the output.
        serialized_sections = None if sections is None else sections.as_dict()
        return {
            "profile_hash": self.profile_hash,
            "section_hashes": serialized_sections,
            "subgraph_hashes": dict(self.subgraph_hashes),
        }


# ---------------------------------------------------------------------------
# Builder helpers
# ---------------------------------------------------------------------------


def coordinate_from_provenance(
    origin_offsets: Sequence[int],
    nodes_start_offset: int,
    node_byte_size: int = NODE_BYTE_SIZE,
    *,
    literal_pool_offset: int | None = None,
) -> NodeCoordinate:
    """Build a ``NodeCoordinate`` from provenance origin offsets.

    Parameters
    ----------
    origin_offsets:
        Node-stream indices (not byte offsets).  Consumed exactly once,
        so a one-shot iterable is handled the same as a sequence.
    nodes_start_offset:
        Absolute byte offset where the node stream begins in the blob.
    node_byte_size:
        Forwarded to ``node_index_to_absolute_offset``; defaults to
        ``NODE_BYTE_SIZE``.
    literal_pool_offset:
        Optional value stored unchanged on the resulting coordinate.

    Returns
    -------
    NodeCoordinate
        ``node_indices`` holds the given indices and ``byte_offsets``
        holds each index converted by ``node_index_to_absolute_offset``,
        in the same order.
    """
    # Snapshot the indices first.  Iterating ``origin_offsets`` twice
    # would leave ``node_indices`` empty for a one-shot iterable while
    # ``byte_offsets`` was populated, putting the two tuples out of step.
    node_indices = tuple(origin_offsets)
    byte_offsets = tuple(
        node_index_to_absolute_offset(
            nodes_start_offset,
            index,
            node_byte_size=node_byte_size,
        )
        for index in node_indices
    )
    return NodeCoordinate(
        node_indices=node_indices,
        byte_offsets=byte_offsets,
        literal_pool_offset=literal_pool_offset,
    )


def build_coordinate_index(
    node_origins: Mapping[str, Sequence[int]],
    op_table_indices: Mapping[str, int],
    nodes_start_offset: int,
    node_byte_size: int = NODE_BYTE_SIZE,
    *,
    literal_pool_offsets: Mapping[str, int] | None = None,
    absence_reasons: Mapping[str, str] | None = None,
) -> CoordinateIndex:
    """Assemble a ``CoordinateIndex`` from pre-extracted provenance data.

    Parameters
    ----------
    node_origins:
        ``{node_id: origin_offsets}`` — node-stream indices from
        ``NodeProvenance.origin_offsets`` for each Policy IR node.
    op_table_indices:
        ``{operation_name: index}`` — op-table entry index per operation.
        Copied into the result, so later changes to the caller's mapping
        do not affect the returned index.
    nodes_start_offset:
        Absolute byte offset where the blob's node stream begins
        (from ``SectionOffsets.nodes_start``).
    node_byte_size:
        Forwarded to ``coordinate_from_provenance``; defaults to
        ``NODE_BYTE_SIZE``.
    literal_pool_offsets:
        Optional ``{node_id: offset}`` for nodes that reference a
        literal pool entry.  Only consulted for nodes that have
        non-empty origins.
    absence_reasons:
        Optional ``{node_id: reason}`` for nodes without blob
        coordinates (e.g. ``source_evaluate_no_blob``).  Ignored for
        nodes that have non-empty origins.

    Returns
    -------
    CoordinateIndex
        One entry per node_id found in ``node_origins`` or
        ``absence_reasons``.  A node with empty origins and no listed
        reason gets ``absence_reason="no_provenance"``.
    """
    lpo = literal_pool_offsets or {}
    reasons = absence_reasons or {}
    coords: dict[str, NodeCoordinate] = {}

    for node_id, offsets in node_origins.items():
        if offsets:
            coords[node_id] = coordinate_from_provenance(
                offsets,
                nodes_start_offset,
                node_byte_size,
                literal_pool_offset=lpo.get(node_id),
            )
        else:
            coords[node_id] = NodeCoordinate(
                absence_reason=reasons.get(node_id, "no_provenance"),
            )

    # Nodes mentioned in absence_reasons but not in node_origins.
    for node_id, reason in reasons.items():
        if node_id not in coords:
            coords[node_id] = NodeCoordinate(absence_reason=reason)

    # ``coords`` is already a fresh dict; copy the op-table mapping too so
    # the frozen index does not alias a mapping the caller may mutate.
    return CoordinateIndex(nodes=coords, op_table=dict(op_table_indices))


# Public names exported by this module.  ``NODE_BYTE_SIZE`` is not defined
# here: it is imported from ``pawl.structure.core.blob_geometry`` and
# re-exported alongside the contract types and builder helpers.
__all__ = [
    "NODE_BYTE_SIZE",
    "NodeCoordinate",
    "CoordinateIndex",
    "SectionHashes",
    "HashBoundaries",
    "coordinate_from_provenance",
    "build_coordinate_index",
]