Skip to content

Commit 2e18d97

Browse files
committed
churn
1 parent 76ec7ce commit 2e18d97

File tree

2 files changed

+90
-85
lines changed

2 files changed

+90
-85
lines changed

src/mdio/segy/_disaster_recovery_wrapper.py

Lines changed: 87 additions & 83 deletions
Original file line number · Diff line number · Diff line change
@@ -14,28 +14,6 @@
1414
from numpy.typing import NDArray
1515

1616

17-
def debug_compare_raw_vs_processed(segy_file, trace_index=0):
18-
"""Debug function to compare raw filesystem data vs processed data."""
19-
from segy.indexing import HeaderIndexer
20-
21-
# Create a fresh indexer to get raw data
22-
indexer = HeaderIndexer(
23-
segy_file.fs,
24-
segy_file.url,
25-
segy_file.spec.trace,
26-
segy_file.num_traces,
27-
transform_pipeline=None # No transforms = raw data
28-
)
29-
30-
# Get raw data directly from filesystem
31-
raw_data = indexer[trace_index]
32-
33-
# Get processed data with transforms
34-
processed_data = segy_file.header[trace_index]
35-
36-
return raw_data, processed_data
37-
38-
3917
class HeaderRawTransformedAccessor:
4018
"""Utility class to access both raw and transformed header data with single filesystem read.
4119
@@ -57,28 +35,7 @@ def __init__(self, segy_file: SegyFile):
5735
segy_file: The SegyFile instance to work with
5836
"""
5937
self.segy_file = segy_file
60-
self.header_indexer = segy_file.header
61-
self.transform_pipeline = self.header_indexer.transform_pipeline
62-
63-
def get_raw_and_transformed(
64-
self, indices: int | list[int] | np.ndarray | slice
65-
) -> tuple[NDArray, NDArray]:
66-
"""Get both raw and transformed header data with single filesystem read.
67-
68-
Args:
69-
indices: Which headers to retrieve (int, list, ndarray, or slice)
70-
71-
Returns:
72-
Tuple of (raw_headers, transformed_headers)
73-
"""
74-
# Get the transformed data using the normal API
75-
# This reads from filesystem and applies transforms
76-
transformed_data = self.header_indexer[indices]
77-
78-
# Now reverse the transforms to get back to raw data
79-
raw_data = self._reverse_transforms(transformed_data)
80-
81-
return raw_data, transformed_data
38+
self.transform_pipeline = self.segy_file.header.transform_pipeline
8239

8340
def _reverse_transforms(self, transformed_data: NDArray) -> NDArray:
8441
"""Reverse the transform pipeline to get raw data from transformed data.
@@ -95,52 +52,51 @@ def _reverse_transforms(self, transformed_data: NDArray) -> NDArray:
9552

9653
# Apply transforms in reverse order with reversed operations
9754
for i, transform in enumerate(reversed(self.transform_pipeline.transforms)):
98-
raw_data = self._reverse_single_transform(raw_data, transform)
55+
raw_data = _reverse_single_transform(raw_data, transform)
9956

10057
return raw_data
10158

102-
def _reverse_single_transform(self, data: NDArray, transform: Transform) -> NDArray:
103-
"""Reverse a single transform operation.
104-
105-
Args:
106-
data: The data to reverse transform
107-
transform: The transform to reverse
108-
109-
Returns:
110-
Data with the transform reversed
111-
"""
112-
# Import here to avoid circular imports
113-
from segy.transforms import get_endianness
114-
from segy.schema import Endianness
115-
116-
if isinstance(transform, ByteSwapTransform):
117-
# For byte swap, we need to reverse the endianness conversion
118-
# If the transform was converting to little-endian, we need to convert back to big-endian
59+
@profile
60+
def _reverse_single_transform(data: NDArray, transform: Transform) -> NDArray:
61+
"""Reverse a single transform operation.
11962
120-
# Get current data endianness
121-
current_endianness = get_endianness(data)
63+
Args:
64+
data: The data to reverse transform
65+
transform: The transform to reverse
12266
123-
# If transform was converting TO little-endian, we need to convert TO big-endian
124-
if transform.target_order == Endianness.LITTLE:
125-
reverse_target = Endianness.BIG
126-
else:
127-
reverse_target = Endianness.LITTLE
67+
Returns:
68+
Data with the transform reversed
69+
"""
70+
# Import here to avoid circular imports
71+
from segy.transforms import get_endianness
72+
from segy.schema import Endianness
73+
74+
if isinstance(transform, ByteSwapTransform):
75+
# For byte swap, we need to reverse the endianness conversion
76+
# If the transform was converting to little-endian, we need to convert back to big-endian
77+
78+
# If transform was converting TO little-endian, we need to convert TO big-endian
79+
# TODO: I don't think this is correct
80+
if transform.target_order == Endianness.LITTLE:
81+
reverse_target = Endianness.BIG
82+
else:
83+
reverse_target = Endianness.LITTLE
12884

129-
reverse_transform = ByteSwapTransform(reverse_target)
130-
result = reverse_transform.apply(data)
85+
reverse_transform = ByteSwapTransform(reverse_target)
86+
result = reverse_transform.apply(data)
13187

132-
return result
88+
return result
13389

134-
elif isinstance(transform, IbmFloatTransform):
135-
# Reverse IBM float conversion by swapping direction
136-
reverse_direction = "to_ibm" if transform.direction == "to_ieee" else "to_ieee"
137-
reverse_transform = IbmFloatTransform(reverse_direction, transform.keys)
138-
return reverse_transform.apply(data)
90+
elif isinstance(transform, IbmFloatTransform):
91+
# Reverse IBM float conversion by swapping direction
92+
reverse_direction = "to_ibm" if transform.direction == "to_ieee" else "to_ieee"
93+
reverse_transform = IbmFloatTransform(reverse_direction, transform.keys)
94+
return reverse_transform.apply(data)
13995

140-
else:
141-
# For unknown transforms, return data unchanged
142-
# This maintains compatibility if new transforms are added
143-
return data
96+
else:
97+
# For unknown transforms, return data unchanged
98+
# This maintains compatibility if new transforms are added
99+
return data
144100

145101

146102
def get_header_raw_and_transformed(
@@ -171,5 +127,53 @@ def get_header_raw_and_transformed(
171127
# Slice of headers
172128
raw_hdrs, transformed_hdrs = get_header_raw_and_transformed(segy_file, slice(0, 10))
173129
"""
174-
accessor = HeaderRawTransformedAccessor(segy_file)
175-
return accessor.get_raw_and_transformed(indices)
130+
return _get_header_raw_optimized(segy_file, indices)
131+
132+
@profile
133+
def _get_header_raw_optimized(
134+
segy_file: SegyFile,
135+
indices: int | list[int] | np.ndarray | slice
136+
) -> tuple[NDArray, NDArray]:
137+
"""Ultra-optimized function that eliminates double disk reads entirely.
138+
139+
This function:
140+
1. Gets transformed headers using the normal API (single disk read)
141+
2. Reverses the transforms on the already-loaded data (no second disk read)
142+
3. Returns both raw and transformed headers
143+
144+
Args:
145+
segy_file: The SegyFile instance
146+
indices: Which headers to retrieve
147+
148+
Returns:
149+
Tuple of (raw_headers, transformed_headers) where transformed_headers
150+
is the same as what segy_file.header[indices] would return
151+
"""
152+
# Get transformed headers using the normal API (single disk read)
153+
transformed_headers = segy_file.header[indices]
154+
155+
# Reverse the transforms on the already-loaded transformed data
156+
# This eliminates the second disk read entirely!
157+
raw_headers = _reverse_transforms(transformed_headers, segy_file.header.transform_pipeline)
158+
159+
return raw_headers, transformed_headers
160+
161+
@profile
162+
def _reverse_transforms(transformed_data: NDArray, transform_pipeline) -> NDArray:
163+
"""Reverse the transform pipeline to get raw data from transformed data.
164+
165+
Args:
166+
transformed_data: Data that has been processed through the transform pipeline
167+
transform_pipeline: The transform pipeline to reverse
168+
169+
Returns:
170+
Raw data equivalent to what was read directly from filesystem
171+
"""
172+
# Start with the transformed data
173+
raw_data = transformed_data.copy() if hasattr(transformed_data, 'copy') else transformed_data
174+
175+
# Apply transforms in reverse order with reversed operations
176+
for transform in reversed(transform_pipeline.transforms):
177+
raw_data = _reverse_single_transform(raw_data, transform)
178+
179+
return raw_data

src/mdio/segy/_workers.py

Lines changed: 3 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -82,7 +82,7 @@ def header_scan_worker(
8282

8383
return cast("HeaderArray", trace_header)
8484

85-
@profile
85+
# @profile
8686
def trace_worker( # noqa: PLR0913
8787
segy_kw: SegyFileArguments,
8888
output_path: UPath,
@@ -142,6 +142,7 @@ def trace_worker( # noqa: PLR0913
142142
# TODO(BrianMichell): Implement this better so that we can enable fill values without changing the code. #noqa: TD003
143143
tmp_headers = np.zeros_like(dataset[header_key])
144144
tmp_headers[not_null] = transformed_headers
145+
# tmp_headers[not_null] = traces.header
145146
# Create a new Variable object to avoid copying the temporary array
146147
# The ideal solution is to use `ds_to_write[header_key][:] = tmp_headers`
147148
# but Xarray appears to be copying memory instead of doing direct assignment.
@@ -152,7 +153,7 @@ def trace_worker( # noqa: PLR0913
152153
attrs=ds_to_write[header_key].attrs,
153154
encoding=ds_to_write[header_key].encoding, # Not strictly necessary, but safer than not doing it.
154155
)
155-
del transformed_headers # Manage memory
156+
# del transformed_headers # Manage memory
156157
if raw_header_key in worker_variables:
157158
tmp_raw_headers = np.zeros_like(dataset[raw_header_key])
158159
tmp_raw_headers[not_null] = raw_headers.view("|V240")

0 commit comments

Comments (0)