Skip to content

Commit bc1b950

Browse files
committed
Begins adding tests for rechunking
1 parent 80ceb2c commit bc1b950

File tree

1 file changed

+149
-0
lines changed

1 file changed

+149
-0
lines changed

tests/integration/test_rechunk.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
"""Integration test for MDIO rechunking.
2+
3+
This test creates a fake 3D SEG‑Y file with a 3×4 grid (3 inlines and 4 crosslines)
4+
with 100 samples per trace. Each trace header stores its inline and crossline numbers.
5+
It then converts the SEG‑Y file to MDIO, reads the original data arrays, performs a
6+
rechunk operation via the convenience API, and finally validates that the data in the
7+
new rechunked arrays exactly matches the original MDIO data.
8+
"""
9+
10+
import struct
11+
12+
import numpy as np
13+
import pytest
14+
15+
from mdio.api import convenience
16+
from mdio.api.accessor import MDIOAccessor
17+
from mdio.converters import segy_to_mdio
18+
19+
20+
def create_fake_segy_3d(
    file_path,
    num_inlines=3,
    num_crosslines=4,
    samples_per_trace=100,
):
    """Write a small synthetic 3D SEG-Y file for use in tests.

    The file consists of a blank 3200-byte textual header, a 400-byte binary
    header, and one trace per (inline, crossline) pair. Inlines and crosslines
    are numbered from 1 and written inline-major (all crosslines of inline 1
    first). Every multi-byte value is written big-endian.

    Sample ``k`` of the trace at ``(inline, crossline)`` holds
    ``k * 0.002 + (inline * 10 + crossline + 1)`` as a 32-bit float, so each
    trace carries a distinct base value with a small per-sample ramp.

    Args:
        file_path: Destination path; opened with ``open(file_path, "wb")``.
        num_inlines: Number of inlines to write. Defaults to 3.
        num_crosslines: Number of crosslines per inline. Defaults to 4.
        samples_per_trace: Number of samples in every trace. Also written to
            the binary header so the header always matches the trace data.
            Defaults to 100.

    Raises:
        struct.error: If ``samples_per_trace`` does not fit the unsigned
            16-bit binary header field.
    """
    # Build the 400-byte binary header up front so an invalid sample count
    # fails before the output file is created or truncated.
    bin_header = bytearray(400)
    # Sample interval: header bytes 17-18 (0-indexed 16:18), 1000 microseconds.
    struct.pack_into(">H", bin_header, 16, 1000)
    # Samples per trace: header bytes 21-22 (0-indexed 20:22).
    struct.pack_into(">H", bin_header, 20, samples_per_trace)
    # Data sample format code: header bytes 25-26 (0-indexed 24:26);
    # 5 means IEEE floating point.
    struct.pack_into(">H", bin_header, 24, 5)
    # Explicitly zero 0-indexed bytes 96:100 so the explicit endianness code
    # is 0 and the SEG-Y library falls back to the legacy method.
    bin_header[96:100] = b"\x00" * 4

    # The per-sample ramp is identical for every trace; compute it once.
    sample_ramp = np.arange(samples_per_trace, dtype=np.float32) * 0.002

    with open(file_path, "wb") as f:
        # Textual header: 3200 bytes of spaces.
        f.write(b" " * 3200)
        f.write(bin_header)

        for inline in range(1, num_inlines + 1):
            for crossline in range(1, num_crosslines + 1):
                # 240-byte trace header. Inline number goes in bytes 189-192
                # and crossline number in bytes 193-196 (1-based), i.e.
                # 0-indexed offsets 188 and 192.
                header = bytearray(240)
                struct.pack_into(">i", header, 188, inline)
                struct.pack_into(">i", header, 192, crossline)
                f.write(header)

                # Offset the shared ramp by a base value unique to this
                # (inline, crossline) pair, then store as big-endian float32.
                trace_samples = sample_ramp + (inline * 10 + crossline + 1)
                f.write(trace_samples.astype(">f4").tobytes())
68+
69+
70+
@pytest.fixture
def segy_file(tmp_path):
    """Write the fake 3D SEG-Y file under ``tmp_path`` and return its path."""
    target = tmp_path.joinpath("fake3d.sgy")
    create_fake_segy_3d(target)
    return target
76+
77+
78+
@pytest.fixture
def mdio_path(tmp_path):
    """Return the ``test.mdio`` path under ``tmp_path``; nothing is created here."""
    return tmp_path.joinpath("test.mdio")
82+
83+
84+
def test_rechunk_integration(segy_file, mdio_path):
    """Check that rechunking leaves traces and headers unchanged.

    Steps:
        1. Ingest the fake 3D SEG-Y file into MDIO with chunks ``(2, 2, 100)``.
        2. Open the result with access pattern ``"012"`` and read the full
           trace and header arrays.
        3. Call ``convenience.rechunk`` with chunks ``(3, 4, 50)`` and the
           suffix ``"sample"``.
        4. Open the file again with access pattern ``"sample"`` and assert
           that both arrays equal the ones read in step 2.
    """
    store = str(mdio_path)

    # Index the traces by the inline/crossline header values at bytes 189 and 193.
    segy_to_mdio(
        segy_path=str(segy_file),
        mdio_path_or_buffer=store,
        index_bytes=(189, 193),
        index_names=("inline", "crossline"),
        chunksize=(2, 2, 100),
        overwrite=True,
    )

    # Both accessors are opened identically apart from the access pattern.
    accessor_options = {
        "mode": "r+",
        "storage_options": None,
        "return_metadata": False,
        "new_chunks": None,
        "backend": "zarr",
        "memory_cache_size": 0,
        "disk_cache": False,
    }

    # Snapshot the data before rechunking.
    source = MDIOAccessor(store, access_pattern="012", **accessor_options)
    expected_traces = source._traces[:]
    expected_headers = source._headers[:]

    # Request a chunk shape that differs from the ingest chunking.
    target_chunks = (3, 4, 50)
    convenience.rechunk(source, target_chunks, "sample", overwrite=True)

    # A fresh accessor is opened with the access pattern matching the suffix
    # given to rechunk, so the reads below go through the rechunked arrays.
    rechunked = MDIOAccessor(store, access_pattern="sample", **accessor_options)
    actual_traces = rechunked._traces[:]
    actual_headers = rechunked._headers[:]

    # The values must be identical regardless of chunk layout.
    np.testing.assert_array_equal(expected_traces, actual_traces)
    np.testing.assert_array_equal(expected_headers, actual_headers)

0 commit comments

Comments
 (0)