yhs0602 · yhs0602 · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -72,7 +72,7 @@ set(CRAFTGROUND_PY_SOURCES src/cpp/ipc.cpp src/cpp/ipc_noboost.cpp src/cpp/print
 
 if(APPLE)
     # Add Apple-specific source files
-    list(APPEND CRAFTGROUND_PY_SOURCES src/cpp/ipc_apple.mm src/cpp/ipc_apple_torch.cpp)
+    list(APPEND CRAFTGROUND_PY_SOURCES src/cpp/ipc_apple.mm src/cpp/ipc_apple_ops.mm src/cpp/ipc_apple_torch.cpp)
 
     # Apple-specific compile options
     set(CRAFTGROUND_PY_COMPILE_OPTIONS -fobjc-arc)

diff --git a/docs/superpowers/plans/2026-03-19-zerocopy-raw-normalize.md b/docs/superpowers/plans/2026-03-19-zerocopy-raw-normalize.md
@@ -0,0 +1,164 @@
+# Zerocopy Raw Normalize Implementation Plan
+
+> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Refactor the macOS zerocopy path so raw transport tensors stay internal while the public `ZEROCOPY` path continues to return normalized `RGB + flipped` output.
+
+**Architecture:** Keep the native macOS bridge unchanged for now and move the raw-to-public normalization boundary into `ObservationConverter`. Minimize diff by touching the current macOS zerocopy branches only, adding a small normalization helper and focused tests.
+
+**Tech Stack:** Python, PyTorch, pytest, existing CraftGround environment converter code
+
+---
+
+## Chunk 1: MacOS Zerocopy Boundary
+
+### Task 1: Add failing tests for macOS zerocopy public normalization
+
+**Files:**
+- Modify: `tests/python/unit/test_observation_converter.py` or nearest existing converter test file
+- Modify: `src/craftground/environment/observation_converter.py`
+
+- [ ] **Step 1: Locate or create the nearest unit-test file for `ObservationConverter`**
+
+Run: `rg -n "ObservationConverter|initialize_zerocopy|ScreenEncodingMode.ZEROCOPY" tests src/craftground`
+Expected: identify the closest existing test home; only create a new test file if no suitable one exists.
+
+- [ ] **Step 2: Write a failing test for the macOS path that preserves public output semantics**
+
+Test idea:
+
+```python
+def test_macos_zerocopy_returns_rgb_flipped_from_raw_transport(...):
+    converter = ObservationConverter(...)
+    raw = fake_tensor_with_known_bgra_rows(...)
+    converter.internal_type = ObservationTensorType.APPLE_TENSOR
+    converter.last_observations[0] = raw
+
+    obs, _ = converter.convert(fake_observation(...))
+
+    assert torch.equal(obs, expected_rgb_flipped)
+```
+
+- [ ] **Step 3: Write a failing test that initialization stores raw transport data without normalization**
+
+Test idea:
+
+```python
+def test_initialize_zerocopy_stores_raw_apple_tensor(...):
+    converter = ObservationConverter(...)
+    raw = fake_raw_tensor(...)
+    mock_initialize_from_mach_port.return_value = raw
+
+    converter.initialize_zerocopy(fake_mach_port_bytes)
+
+    assert converter.last_observations[0] is raw
+```
+
+- [ ] **Step 4: Run the targeted tests and verify they fail for the intended reason**
+
+Run: `pytest <targeted-test-file> -k zerocopy -v`
+Expected: FAIL due to missing helper/incorrect state handling, not import or syntax errors.
+
+- [ ] **Step 5: Commit the failing tests**
+
+```bash
+git add <targeted-test-file>
+git commit -m "test: add macos zerocopy normalization coverage"
+```
+
+### Task 2: Implement raw/public separation with minimal diff
+
+**Files:**
+- Modify: `src/craftground/environment/observation_converter.py`
+- Test: `tests/python/unit/test_observation_converter.py` or nearest existing converter test file
+
+- [ ] **Step 1: Add a small helper for macOS raw-to-public normalization**
+
+Implementation target:
+
+```python
+def _normalize_apple_zerocopy_tensor(self, tensor: "TorchArrayType") -> "TorchArrayType":
+    return tensor.clone()[:, :, [2, 1, 0]].flip(0)
+```
+
+Keep this helper private and adjacent to zerocopy handling.
+
+- [ ] **Step 2: Make `initialize_zerocopy()` store the raw Apple tensor only**
+
+Requirements:
+
+- no normalization in initialization
+- keep storing the raw transport tensor
+- ensure type tracking updates the field used by `convert()`
+
+- [ ] **Step 3: Update the public `ZEROCOPY` branch to call the helper at return time**
+
+Requirements:
+
+- preserve current `RGB + flipped` behavior for Apple tensors
+- do not broaden the change into unrelated output types
+- keep non-Apple branches unchanged unless required for correctness
+
+- [ ] **Step 4: Fix the touched zerocopy state variable inconsistency if still present**
+
+The code currently mixes `internal_type` and `observation_tensor_type`. The implementation should ensure the active state variable used by `convert()` is updated consistently in the touched path.
+
+- [ ] **Step 5: Run the targeted tests and verify they pass**
+
+Run: `pytest <targeted-test-file> -k zerocopy -v`
+Expected: PASS
+
+- [ ] **Step 6: Run any neighboring converter tests**
+
+Run: `pytest <targeted-test-file> -v`
+Expected: PASS with no zerocopy regressions in nearby behavior.
+
+- [ ] **Step 7: Commit the implementation**
+
+```bash
+git add src/craftground/environment/observation_converter.py <targeted-test-file>
+git commit -m "refactor: separate raw macos zerocopy transport from public normalization"
+```
+
+## Chunk 2: Verification And Handoff
+
+### Task 3: Verify branch state and prepare for PR
+
+**Files:**
+- Modify: `docs/superpowers/specs/2026-03-19-zerocopy-raw-normalize-design.md`
+- Modify: `docs/superpowers/plans/2026-03-19-zerocopy-raw-normalize.md`
+
+- [ ] **Step 1: Run a focused verification sweep**
+
+Run:
+
+```bash
+pytest <targeted-test-file> -v
+git status --short
+git log --oneline -5
+```
+
+Expected:
+
+- tests passing
+- only intended files modified
+- branch contains the new zerocopy commits
+
+- [ ] **Step 2: Update docs if implementation differs from the planned low-diff shape**
+
+Keep spec and plan aligned with actual code decisions.
+
+- [ ] **Step 3: Prepare PR summary**
+
+Include:
+
+- internal raw/public separation
+- no public `ZEROCOPY` behavior change
+- future raw-mode support enabled
+
+- [ ] **Step 4: Commit any doc adjustments**
+
+```bash
+git add docs/superpowers/specs/2026-03-19-zerocopy-raw-normalize-design.md docs/superpowers/plans/2026-03-19-zerocopy-raw-normalize.md
+git commit -m "docs: record zerocopy raw normalize design and plan"
+```
diff --git a/docs/superpowers/specs/2026-03-19-zerocopy-raw-normalize-design.md b/docs/superpowers/specs/2026-03-19-zerocopy-raw-normalize-design.md
@@ -0,0 +1,118 @@
+# Zerocopy Raw Normalize Design
+
+## Summary
+
+Refactor the macOS zerocopy path so the internal transport tensor remains in its native raw layout, while the public `ZEROCOPY` API continues to expose the current normalized image contract (`RGB + flipped`). This keeps the current user-facing behavior stable, reduces coupling between the custom MPS bridge and presentation logic, and creates a clean path to expose a future raw mode without redesigning the pipeline again.
+
+## Current Problem
+
+The macOS zerocopy path currently mixes three concerns in one flow:
+
+1. Transporting the framebuffer into Python as an MPS-backed tensor.
+2. Representing the native producer layout.
+3. Returning the user-facing normalized layout.
+
+This makes the custom bridge responsible for data semantics it should not own. It also forces the current implementation to rely on `clone()` in the public path as both a correctness workaround and a format-conversion boundary.
+
+## Decision
+
+Use the approved option:
+
+- Keep the internal macOS zerocopy representation as native raw data.
+- Preserve the existing public `ZEROCOPY` output semantics.
+- Move normalization (`RGB + flip`) to the public conversion boundary.
+- Do not expose a new raw public mode in this change.
+
+This is the lowest-diff option that improves maintainability without giving up future extensibility.
+
+## Scope
+
+In scope:
+
+- macOS zerocopy internals in `observation_converter.py`
+- separation between raw transport tensor and normalized public tensor
+- tests for the new internal/public boundary behavior
+- cleanup of the internal zerocopy type-tracking bug if encountered in the touched path
+
+Out of scope:
+
+- changing the public API shape of `ScreenEncodingMode.ZEROCOPY`
+- introducing a new public raw mode
+- redesigning the native C++ bridge
+- changing non-macOS behavior unless required for shared helper safety
+
+## Design
+
+### Internal Representation
+
+`initialize_zerocopy()` should store the macOS tensor exactly as produced by the native bridge. No channel reorder, Y-flip, or user-facing normalization should happen at initialization time.
+
+The raw tensor is the transport artifact. It is allowed to be in producer-native layout as long as the public boundary is responsible for converting it into the documented output format.
+
+### Public Representation
+
+The existing `ScreenEncodingMode.ZEROCOPY` path should continue to return the same logical image as before. The normalization step should happen in a helper near the public return site, not during bridge setup.
+
+For the macOS path this means:
+
+- source: raw Apple tensor
+- public output: normalized `RGB + flipped`
+
+This keeps behavior stable for users while isolating future raw-mode support to a small API addition later.
+
+### State Ownership
+
+The converter should make the distinction between:
+
+- raw zerocopy tensor state
+- normalized public observation state
+
+The implementation may use separate fields or a clearly named helper-based boundary, but the raw/public distinction must be explicit in code. Avoid clever state machines. This should stay easy to follow inside the existing file.
+
+### Future Raw Mode
+
+This refactor should make a later `ZEROCOPY_RAW` or equivalent mode straightforward:
+
+- the raw tensor already exists as a first-class internal concept
+- the later feature would only add a new public branch that returns the raw tensor directly
+- the normalized branch would remain unchanged
+
+## Testing Strategy
+
+Add targeted tests around the converter boundary:
+
+- macOS zerocopy initialization stores the raw transport tensor without premature normalization
+- public `ZEROCOPY` still returns the same normalized semantics as before
+- normalization helper is the only place that performs the macOS format conversion
+
+Tests should focus on layout semantics and call boundaries, not the real native bridge.
+
+## Risk Management
+
+Primary risk:
+
+- accidentally changing the user-visible output of `ZEROCOPY`
+
+Mitigation:
+
+- keep the public return contract unchanged
+- isolate the new normalization helper
+- cover the output semantics with tests
+
+Secondary risk:
+
+- touching shared converter state and unintentionally affecting non-macOS paths
+
+Mitigation:
+
+- keep the diff local to the macOS-specific branches whenever possible
+- avoid broad refactors outside the touched boundary
+
+## Success Criteria
+
+The change is successful when:
+
+- current `ZEROCOPY` callers still receive `RGB + flipped` output
+- the macOS zerocopy path internally preserves a raw transport tensor
+- the code clearly separates raw transport from public normalization
+- the refactor makes a future raw public mode a small additive change
diff --git a/pybind11 b/pybind11
diff --git a/src/cpp/ipc.cpp b/src/cpp/ipc.cpp
@@ -10,6 +10,9 @@ py::object
 initialize_from_mach_port(unsigned int machPort, int width, int height) {
     return mtl_tensor_from_mach_port(machPort, width, height);
 }
+py::object normalize_apple_mtl_tensor(py::object tensor) {
+    return normalize_apple_mtl_tensor_impl(std::move(tensor));
+}
 py::capsule mtl_tensor_from_cuda_mem_handle(
     const char *cuda_ipc_handle, int width, int height
 ) {
@@ -22,6 +25,7 @@ py::object
 initialize_from_mach_port(unsigned int machPort, int width, int height) {
     return py::none();
 }
+py::object normalize_apple_mtl_tensor(py::object tensor) { return py::none(); }
 
 py::capsule mtl_tensor_from_cuda_mem_handle(
     const char *cuda_ipc_handle, int width, int height
@@ -51,6 +55,7 @@ py::object
 initialize_from_mach_port(unsigned int machPort, int width, int height) {
     return py::none();
 }
+py::object normalize_apple_mtl_tensor(py::object tensor) { return py::none(); }
 
 py::capsule mtl_tensor_from_cuda_mem_handle(
     const char *cuda_ipc_handle, int width, int height
@@ -100,6 +105,7 @@ void destroy_shared_memory(const char *memory_name, bool release_semaphores) {
 PYBIND11_MODULE(craftground_native, m) {
     m.doc() = "Craftground Native Module";
     m.def("initialize_from_mach_port", &initialize_from_mach_port);
+    m.def("normalize_apple_mtl_tensor", &normalize_apple_mtl_tensor);
     m.def("mtl_tensor_from_cuda_mem_handle", &mtl_tensor_from_cuda_mem_handle);
     m.def("initialize_shared_memory", &initialize_shared_memory);
     m.def("write_to_shared_memory", &write_to_shared_memory);

diff --git a/src/cpp/ipc.h b/src/cpp/ipc.h
@@ -4,9 +4,12 @@
 #include <stdlib.h>
 #include <string>
 
+#include <pybind11/pybind11.h>
+
 namespace py = pybind11;
 py::object
 initialize_from_mach_port(unsigned int machPort, int width, int height);
+py::object normalize_apple_mtl_tensor(py::object tensor);
 py::capsule mtl_tensor_from_cuda_mem_handle(
     const char *cuda_ipc_handle, int width, int height
 );
@@ -31,4 +34,4 @@ void destroy_shared_memory(const char *memory_name, bool release_semaphores);
 
 bool shared_memory_exists(const std::string &name);
 
-#endif
+#endif
diff --git a/src/cpp/ipc_apple.h b/src/cpp/ipc_apple.h
@@ -1,9 +1,14 @@
 #ifndef __IPC_APPLE_H__
 #define __IPC_APPLE_H__
 
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
 #define USE_CUSTOM_DL_PACK_TENSOR 1
 
 py::object
 mtl_tensor_from_mach_port(unsigned int machPort, int width, int height);
+py::object normalize_apple_mtl_tensor_impl(py::object tensor);
 
-#endif // __IPC_APPLE_H__
+#endif // __IPC_APPLE_H__