55import os
66from pathlib import Path
77from tempfile import TemporaryDirectory
8+ from typing import TYPE_CHECKING
89
910import numpy as np
1011from psutil import cpu_count
1112from tqdm .dask import TqdmCallback
1213
13- from mdio import MDIOReader
14+ from mdio . api . opener import open_dataset
1415from mdio .segy .blocked_io import to_segy
1516from mdio .segy .creation import concat_files
1617from mdio .segy .creation import mdio_spec_to_segy
2122except ImportError :
2223 distributed = None
2324
25+ if TYPE_CHECKING :
26+ from segy .schema import SegySpec
27+
28+ from mdio .core .storage_location import StorageLocation
2429
2530default_cpus = cpu_count (logical = True )
2631NUM_CPUS = int (os .getenv ("MDIO__EXPORT__CPU_COUNT" , default_cpus ))
2732
2833
29- def mdio_to_segy ( # noqa: PLR0912, PLR0913
30- mdio_path_or_buffer : str ,
31- output_segy_path : str ,
32- endian : str = "big" ,
33- access_pattern : str = "012" ,
34- storage_options : dict = None ,
35- new_chunks : tuple [int , ...] = None ,
34+ def mdio_to_segy ( # noqa: PLR0912, PLR0913, PLR0915
35+ segy_spec : SegySpec ,
36+ input_location : StorageLocation ,
37+ output_location : StorageLocation ,
3638 selection_mask : np .ndarray = None ,
3739 client : distributed .Client = None ,
3840) -> None :
@@ -47,13 +49,9 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913
4749 A `selection_mask` can be provided (same shape as spatial grid) to export a subset.
4850
4951 Args:
50- mdio_path_or_buffer: Input path where the MDIO is located.
51- output_segy_path: Path to the output SEG-Y file.
52- endian: Endianness of the input SEG-Y. Rev.2 allows little endian. Default is 'big'.
53- access_pattern: This specificies the chunk access pattern. Underlying zarr.Array must
54- exist. Examples: '012', '01'
55- storage_options: Storage options for the cloud storage backend. Default: None (anonymous)
56- new_chunks: Set manual chunksize. For development purposes only.
52+ segy_spec: The SEG-Y specification to use for the conversion.
53+ input_location: Store or URL (and cloud options) for MDIO file.
54+ output_location: Path to the output SEG-Y file.
5755 selection_mask: Array that lists the subset of traces
5856 client: Dask client. If `None` we will use local threaded scheduler. If `auto` is used we
5957 will create multiple processes (with 8 threads each).
@@ -64,86 +62,70 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913
6462
6563 Examples:
6664 To export an existing local MDIO file to SEG-Y we use the code snippet below. This will
67- export the full MDIO (without padding) to SEG-Y format using IBM floats and big-endian
68- byte order.
65+ export the full MDIO (without padding) to SEG-Y format.
6966
70- >>> from mdio import mdio_to_segy
71- >>>
67+ >>> from mdio import mdio_to_segy, StorageLocation
7268 >>>
73- >>> mdio_to_segy(
74- ... mdio_path_or_buffer="prefix2/file.mdio",
75- ... output_segy_path="prefix/file.segy",
76- ... )
77-
78- If we want to export this as an IEEE big-endian, using a selection mask, we would run:
79-
80- >>> mdio_to_segy(
81- ... mdio_path_or_buffer="prefix2/file.mdio",
82- ... output_segy_path="prefix/file.segy",
83- ... selection_mask=boolean_mask,
84- ... )
85-
69+ >>> input_location = StorageLocation("prefix2/file.mdio")
70+ >>> output_location = StorageLocation("prefix/file.segy")
71+ >>> mdio_to_segy(input_location, output_location)
8672 """
87- backend = "dask"
88-
89- output_segy_path = Path (output_segy_path )
73+ output_segy_path = Path (output_location .uri )
9074
91- mdio = MDIOReader (
92- mdio_path_or_buffer = mdio_path_or_buffer ,
93- access_pattern = access_pattern ,
94- storage_options = storage_options ,
95- )
75+ # First we open with vanilla zarr backend and then get some info
76+ # We will re-open with `new_chunks` and Dask later in mdio_spec_to_segy
77+ dataset = open_dataset (input_location )
9678
97- if new_chunks is None :
98- new_chunks = segy_export_rechunker (mdio .chunks , mdio .shape , mdio ._traces .dtype )
79+ default_variable_name = dataset .attrs ["attributes" ]["default_variable_name" ]
80+ amplitude = dataset [default_variable_name ]
81+ chunks = amplitude .encoding ["preferred_chunks" ]
82+ sizes = amplitude .sizes
83+ dtype = amplitude .dtype
84+ new_chunks = segy_export_rechunker (chunks , sizes , dtype )
9985
100- creation_args = [
101- mdio_path_or_buffer ,
102- output_segy_path ,
103- access_pattern ,
104- endian ,
105- storage_options ,
106- new_chunks ,
107- backend ,
108- ]
86+ creation_args = [segy_spec , input_location , output_location , new_chunks ]
10987
11088 if client is not None :
11189 if distributed is not None :
11290 # This is in case we work with big data
11391 feature = client .submit (mdio_spec_to_segy , * creation_args )
114- mdio , segy_factory = feature .result ()
92+ dataset , segy_factory = feature .result ()
11593 else :
11694 msg = "Distributed client was provided, but `distributed` is not installed"
11795 raise ImportError (msg )
11896 else :
119- mdio , segy_factory = mdio_spec_to_segy (* creation_args )
97+ dataset , segy_factory = mdio_spec_to_segy (* creation_args )
12098
121- live_mask = mdio . live_mask .compute ()
99+ trace_mask = dataset [ "trace_mask" ] .compute ()
122100
123101 if selection_mask is not None :
124- live_mask = live_mask & selection_mask
102+ if trace_mask .shape != selection_mask .shape :
103+ msg = "Selection mask and trace mask shapes do not match."
104+ raise ValueError (msg )
105+ selection_mask = trace_mask .copy (data = selection_mask ) # make into DataArray
106+ trace_mask = trace_mask & selection_mask
125107
126108 # This handles the case if we are skipping a whole block.
127- if live_mask .sum () == 0 :
109+ if trace_mask .sum () == 0 :
128110 msg = "No traces will be written out. Live mask is empty."
129111 raise ValueError (msg )
130112
131113 # Find rough dim limits, so we don't unnecessarily hit disk / cloud store.
132114 # Typically, gets triggered when there is a selection mask
133- dim_slices = ()
134- live_nonzeros = live_mask .nonzero ()
135- for dim_nonzeros in live_nonzeros :
136- start = np .min (dim_nonzeros )
137- stop = np .max (dim_nonzeros ) + 1
138- dim_slices += ( slice (start , stop ), )
115+ dim_slices = {}
116+ dim_live_indices = np .nonzero (trace_mask . values )
117+ for dim_name , dim_live in zip ( trace_mask . dims , dim_live_indices , strict = True ) :
118+ start = dim_live .min (). item ( )
119+ stop = dim_live .max (). item ( ) + 1
120+ dim_slices [ dim_name ] = slice (start , stop )
139121
140- # Lazily pull the data with limits now, and limit mask so its the same shape .
141- live_mask , headers , samples = mdio [ dim_slices ]
142- live_mask = live_mask . rechunk ( headers . chunks )
122+ # Lazily pull the data with limits now.
123+ # All the variables, metadata, etc. is all sliced to the same range.
124+ dataset = dataset . isel ( dim_slices )
143125
144126 if selection_mask is not None :
145127 selection_mask = selection_mask [dim_slices ]
146- live_mask = live_mask & selection_mask
128+ dataset [ "trace_mask" ] = dataset [ "trace_mask" ] & selection_mask
147129
148130 # tmp file root
149131 out_dir = output_segy_path .parent
@@ -152,9 +134,9 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913
152134 with tmp_dir :
153135 with TqdmCallback (desc = "Unwrapping MDIO Blocks" ):
154136 block_records = to_segy (
155- samples = samples ,
156- headers = headers ,
157- live_mask = live_mask ,
137+ samples = dataset [ default_variable_name ]. data ,
156138+ headers = dataset ["headers" ]. data ,
139+ live_mask = dataset [ "trace_mask" ]. data ,
158140 segy_factory = segy_factory ,
159141 file_root = tmp_dir .name ,
160142 )
0 commit comments