-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcoordinates.py
More file actions
224 lines (183 loc) · 7.33 KB
/
coordinates.py
File metadata and controls
224 lines (183 loc) · 7.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""Coordinate index types for the IR envelope.
These types bridge Policy IR nodes to their Evidence IR (blob) origins via
byte offsets, node indices, and structural hashes. The coordinate index
is populated after both lanes produce their outputs, during envelope assembly.
Types:
- ``NodeCoordinate``: per-node blob location (indices + offsets).
- ``CoordinateIndex``: full node-to-blob bridge + op-table index.
- ``SectionHashes``: per-section SHA-256 for the three blob regions.
- ``HashBoundaries``: structural hashes at three granularities.
The digest module (``pawl.structure.inventory.digests.api``) computes
section hashes from raw blobs; this module defines the contract-level
representations that the envelope consumes. Subgraph hashes come from
normalized ``Policy`` metadata (``structural_hash_entry`` written by
``pawl.forward.layer2.normalize.canonicalize_operation``).
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Mapping, Sequence
from pawl.structure.core.blob_geometry import (
NODE_BYTE_SIZE,
node_index_to_absolute_offset,
)
@dataclass(frozen=True)
class NodeCoordinate:
    """Blob-side location of one Policy IR node.

    The positional fields are tuples rather than scalars because the
    compiler shares nodes across operations, so a single ``NodeTest``
    may correspond to several offsets in the Evidence IR blob.
    """

    # Node-stream indices this Policy IR node originates from.
    node_indices: tuple[int, ...] = ()
    # Byte offsets in the blob, one per entry in ``node_indices``.
    byte_offsets: tuple[int, ...] = ()
    # Literal-pool offset, when the node references a literal pool entry.
    literal_pool_offset: int | None = None
    # Why the node has no blob coordinate, when that is the case.
    absence_reason: str | None = None

    def as_dict(self) -> dict[str, object]:
        """Return a plain-dict view that leaves out every unset field.

        Empty tuples and ``None`` values are omitted entirely; populated
        tuples are emitted as lists. Key order follows field order.
        """
        candidates = (
            (
                "node_indices",
                list(self.node_indices) if self.node_indices else None,
            ),
            (
                "byte_offsets",
                list(self.byte_offsets) if self.byte_offsets else None,
            ),
            ("literal_pool_offset", self.literal_pool_offset),
            ("absence_reason", self.absence_reason),
        )
        # ``None`` marks "nothing to report" for every candidate above, so a
        # single filter reproduces the per-field checks.
        return {key: value for key, value in candidates if value is not None}
@dataclass(frozen=True)
class CoordinateIndex:
    """Node-to-blob bridge plus the op-table index.

    ``nodes`` is keyed by node_id and holds the blob coordinate of each
    Policy IR node; every node is expected to carry either a populated
    coordinate or an explicit ``absence_reason``.

    ``op_table`` is keyed by operation name and holds that operation's
    op-table entry index in the compiled blob.
    """

    nodes: Mapping[str, NodeCoordinate] = field(default_factory=dict)
    op_table: Mapping[str, int] = field(default_factory=dict)

    def as_dict(self) -> dict[str, object]:
        """Return a plain-dict form with every coordinate serialised.

        Each value in ``nodes`` is converted through its own ``as_dict``;
        ``op_table`` is copied into a fresh ``dict``.
        """
        serialised_nodes = {}
        for node_id, coordinate in self.nodes.items():
            serialised_nodes[node_id] = coordinate.as_dict()
        op_table_copy = dict(self.op_table)
        return {"nodes": serialised_nodes, "op_table": op_table_copy}
@dataclass(frozen=True)
class SectionHashes:
    """Per-section SHA-256 hashes for the three regions of a compiled blob.

    The field names follow the IR envelope contract. The digest pipeline
    (``pawl.structure.inventory.digests.api``) has a structurally identical
    type; this one is the contract-level representation.
    """

    op_table: str
    node_stream: str
    literal_pool: str

    def as_dict(self) -> dict[str, object]:
        """Return the three section hashes keyed by section name."""
        # Listed explicitly so the emitted key order is fixed.
        section_names = ("op_table", "node_stream", "literal_pool")
        return {name: getattr(self, name) for name in section_names}
@dataclass(frozen=True)
class HashBoundaries:
    """Structural hashes at three levels of granularity.

    - ``profile_hash``: SHA-256 over the whole compiled blob; ``None``
      when there is no blob (Source-Evaluate path).
    - ``section_hashes``: SHA-256 per section (op_table, node_stream,
      literal_pool); ``None`` when there is no blob.
    - ``subgraph_hashes``: SHA-256 of the reachable subgraph for each
      operation, taken from normalized ``Policy`` metadata
      (``structural_hash_entry`` written by ``canonicalize_operation``).
    """

    profile_hash: str | None = None
    section_hashes: SectionHashes | None = None
    subgraph_hashes: Mapping[str, str] = field(default_factory=dict)

    def as_dict(self) -> dict[str, object]:
        """Return a plain-dict form; missing section hashes stay ``None``.

        All three keys are always present. ``subgraph_hashes`` is copied
        into a fresh ``dict``.
        """
        sections = None
        if self.section_hashes is not None:
            sections = self.section_hashes.as_dict()
        subgraphs = dict(self.subgraph_hashes)
        return {
            "profile_hash": self.profile_hash,
            "section_hashes": sections,
            "subgraph_hashes": subgraphs,
        }
# ---------------------------------------------------------------------------
# Builder helpers
# ---------------------------------------------------------------------------
def coordinate_from_provenance(
    origin_offsets: Sequence[int],
    nodes_start_offset: int,
    node_byte_size: int = NODE_BYTE_SIZE,
    *,
    literal_pool_offset: int | None = None,
) -> NodeCoordinate:
    """Build a ``NodeCoordinate`` from provenance origin offsets.

    Parameters
    ----------
    origin_offsets:
        Node-stream indices (not byte offsets) for one Policy IR node.
    nodes_start_offset:
        Absolute byte offset where the node stream begins in the blob.
    node_byte_size:
        Size of one node-stream entry, forwarded to
        ``node_index_to_absolute_offset``; defaults to ``NODE_BYTE_SIZE``.
    literal_pool_offset:
        Optional literal-pool offset stored unchanged on the result.

    Returns
    -------
    NodeCoordinate
        ``node_indices`` holds the given indices; ``byte_offsets`` holds
        the result of ``node_index_to_absolute_offset`` for each index,
        in the same order.
    """
    # Snapshot the indices once. Both result tuples are derived from this
    # snapshot, so they stay aligned even if a caller passes a one-shot
    # iterable, which would be exhausted after the first pass.
    node_indices = tuple(origin_offsets)
    byte_offsets = tuple(
        node_index_to_absolute_offset(
            nodes_start_offset,
            idx,
            node_byte_size=node_byte_size,
        )
        for idx in node_indices
    )
    return NodeCoordinate(
        node_indices=node_indices,
        byte_offsets=byte_offsets,
        literal_pool_offset=literal_pool_offset,
    )
def build_coordinate_index(
    node_origins: Mapping[str, Sequence[int]],
    op_table_indices: Mapping[str, int],
    nodes_start_offset: int,
    node_byte_size: int = NODE_BYTE_SIZE,
    *,
    literal_pool_offsets: Mapping[str, int] | None = None,
    absence_reasons: Mapping[str, str] | None = None,
) -> CoordinateIndex:
    """Assemble a ``CoordinateIndex`` from pre-extracted provenance data.

    Parameters
    ----------
    node_origins:
        ``{node_id: origin_offsets}`` — node-stream indices from
        ``NodeProvenance.origin_offsets`` for each Policy IR node.
    op_table_indices:
        ``{operation_name: index}`` — op-table entry index per operation.
        Stored on the result as given.
    nodes_start_offset:
        Absolute byte offset where the blob's node stream begins
        (from ``SectionOffsets.nodes_start``).
    node_byte_size:
        Size of one node-stream entry, forwarded to
        ``coordinate_from_provenance``; defaults to ``NODE_BYTE_SIZE``.
    literal_pool_offsets:
        Optional ``{node_id: offset}`` for nodes that reference a
        literal pool entry.
    absence_reasons:
        Optional ``{node_id: reason}`` for nodes without blob
        coordinates (e.g. ``source_evaluate_no_blob``).

    Returns
    -------
    CoordinateIndex
        One coordinate per node_id found in ``node_origins`` or
        ``absence_reasons``.
    """
    pool_offsets = literal_pool_offsets if literal_pool_offsets else {}
    declared_reasons = absence_reasons if absence_reasons else {}

    resolved: dict[str, NodeCoordinate] = {}
    for node_id, origin in node_origins.items():
        if not origin:
            # No origin recorded: store only a reason, using the generic
            # one when the caller did not declare a specific reason.
            resolved[node_id] = NodeCoordinate(
                absence_reason=declared_reasons.get(node_id, "no_provenance"),
            )
            continue
        resolved[node_id] = coordinate_from_provenance(
            origin,
            nodes_start_offset,
            node_byte_size,
            literal_pool_offset=pool_offsets.get(node_id),
        )

    # Pick up nodes that have a declared reason but no entry in
    # ``node_origins``; existing entries are never overwritten.
    for node_id, reason in declared_reasons.items():
        if node_id in resolved:
            continue
        resolved[node_id] = NodeCoordinate(absence_reason=reason)

    return CoordinateIndex(nodes=resolved, op_table=op_table_indices)
# Public names exported by this module. ``NODE_BYTE_SIZE`` is not defined
# here: it is imported from ``pawl.structure.core.blob_geometry`` and listed
# so that it is re-exported alongside the coordinate types and builders.
__all__ = [
"NODE_BYTE_SIZE",
"NodeCoordinate",
"CoordinateIndex",
"SectionHashes",
"HashBoundaries",
"coordinate_from_provenance",
"build_coordinate_index",
]