88
99from __future__ import annotations
1010
11+ import json
1112from collections .abc import Iterator
1213from dataclasses import dataclass
14+ from importlib .resources import files
1315from typing import Any
1416
1517from .clib import clibmseed , ffi
1618from .logging import clear_error_messages , get_error_messages
17- from .msrecord import MS3Record
1819from .mstracelist import MS3TraceList
1920
2021# (buf_ptr, absolute_offset, record_length) — or (None, offset, error_code) for detection failure
2122_RecordTuple = tuple [Any , int , int ]
2223
24+ # Maps supported schema IDs to their bundled JSON Schema filenames
25+ _KNOWN_SCHEMAS : dict [str , str ] = {
26+ "FDSN-v1.0" : "ExtraHeaders-FDSN-v1.0.schema-2020-12.json" ,
27+ }
28+
2329
2430@dataclass (frozen = True )
2531class ValidationError :
@@ -82,6 +88,7 @@ def __init__(self, filename: str, chunk_size: int = 1_048_576) -> None:
8288 def __iter__ (self ) -> Iterator [_RecordTuple ]:
8389 format_version = ffi .new ("uint8_t *" )
8490 buf = bytearray ()
91+ buf_offset = 0
8592 file_offset = 0
8693 eof = False
8794
@@ -90,44 +97,48 @@ def __iter__(self) -> Iterator[_RecordTuple]:
9097 if not eof :
9198 chunk = f .read (self ._chunk_size )
9299 if chunk :
100+ if buf_offset > 0 :
101+ del buf [:buf_offset ]
102+ buf_offset = 0
93103 buf .extend (chunk )
94104 else :
95105 eof = True
96106
97- if not buf :
107+ remaining = len (buf ) - buf_offset
108+ if remaining <= 0 :
98109 return
99110
100111 reclen = clibmseed .ms3_detect (
101- ffi .from_buffer (buf ),
102- len ( buf ) ,
112+ ffi .from_buffer (buf ) + buf_offset ,
113+ remaining ,
103114 format_version ,
104115 )
105116
106117 if reclen < 0 :
107118 yield (None , file_offset , reclen )
108119 return
109120
110- if reclen == 0 or reclen > len ( buf ) :
121+ if reclen == 0 or reclen > remaining :
111122 if eof :
112123 return
113124 continue
114125
115- record_bytes = bytes (buf [: reclen ])
126+ record_bytes = bytes (buf [buf_offset : buf_offset + reclen ])
116127 yield (ffi .from_buffer (record_bytes ), file_offset , reclen )
117-
118- buf = buf [reclen :]
128+ buf_offset += reclen
119129 file_offset += reclen
120130
121131
122132class MS3RecordValidator :
123133 """Validate miniSEED records with comprehensive error detection.
124134
125- Processes records from a buffer or file using a 4 -step process:
135+ Processes records from a buffer or file using a 5 -step process:
126136
127137 1. Determine record length (handled by the record source)
128138 2. Parse record metadata without unpacking data
129- 3. Optionally add record to a trace coverage list (with no data samples)
130- 4. Optionally decompress data samples and test for decoding errors
139+ 3. Optionally validate extra headers
140+ 4. Optionally add record to a trace coverage list (with no data samples)
141+ 5. Optionally decompress data samples and test for decoding errors
131142
132143 This approach ensures maximum information recovery — all records with
133144 parseable headers are added to the trace list, with complete error tracking.
@@ -260,6 +271,24 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
260271
261272 msr_ptr = ffi .new ("MS3Record **" )
262273
274+ # Pre-load JSON schema validator once — avoid reloading per-record
275+ _eh_validator : Any = None
276+ _eh_import_error = False
277+ if self ._validate_extra_headers :
278+ if self ._extra_headers_schema not in _KNOWN_SCHEMAS :
279+ raise ValueError (f"Unknown schema_id: { self ._extra_headers_schema } " )
280+ try :
281+ from jsonschema import Draft202012Validator
282+
283+ schema_bytes = (
284+ files ("pymseed.schemas" )
285+ .joinpath (_KNOWN_SCHEMAS [self ._extra_headers_schema ])
286+ .read_bytes ()
287+ )
288+ _eh_validator = Draft202012Validator (json .loads (schema_bytes ))
289+ except ImportError :
290+ _eh_import_error = True
291+
263292 try :
264293 for buf_ptr , offset , record_length in self ._source :
265294 # Detection failure — source signals this with buf_ptr=None
@@ -278,7 +307,7 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
278307
279308 clear_error_messages ()
280309
281- # Step 2: Parse record structure (without unpacking data samples)
310+ # Step 2: Parse record metadata (without unpacking data samples)
282311 status = clibmseed .msr3_parse (
283312 buf_ptr ,
284313 record_length ,
@@ -304,10 +333,11 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
304333 )
305334 continue
306335
307- record = MS3Record (recordptr = msr_ptr [0 ])
336+ # Read metadata directly from C struct
337+ msr = msr_ptr [0 ]
308338 rec_fields = {
309- "sourceid" : record . sourceid ,
310- "starttime" : record .starttime ,
339+ "sourceid" : ffi . string ( msr . sid ). decode ( "utf-8" ) ,
340+ "starttime" : msr .starttime ,
311341 "reclen" : record_length ,
312342 }
313343
@@ -323,41 +353,46 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
323353 )
324354 )
325355
326- if self ._validate_extra_headers and record .extralength > 0 :
327- try :
328- validation_errors = record .validate_extra_headers (
329- schema_id = self ._extra_headers_schema
330- )
331- for validation_error in validation_errors :
332- errors .append (
333- ValidationError (
334- offset = offset ,
335- message = f"Extra headers validation error: { validation_error .message } at { validation_error .json_path } " ,
336- ** rec_fields ,
337- )
338- )
339- except ImportError :
356+ # Step 3: Optionally validate extra headers
357+ if self ._validate_extra_headers and msr .extralength > 0 :
358+ if _eh_import_error :
340359 errors .append (
341360 ValidationError (
342361 offset = offset ,
343362 message = "Extra headers validation skipped: jsonschema not installed" ,
344363 ** rec_fields ,
345364 )
346365 )
347- except Exception as e :
348- errors .append (
349- ValidationError (
350- offset = offset ,
351- message = f"Extra headers validation error: { e } " ,
352- ** rec_fields ,
366+ else :
367+ try :
368+ extra_str = (
369+ ffi .string (msr .extra ).decode ("utf-8" )
370+ if msr .extra != ffi .NULL
371+ else ""
372+ )
373+ if extra_str :
374+ for ve in _eh_validator .iter_errors (json .loads (extra_str )):
375+ errors .append (
376+ ValidationError (
377+ offset = offset ,
378+ message = f"Extra headers validation error: { ve .message } at { ve .json_path } " ,
379+ ** rec_fields ,
380+ )
381+ )
382+ except Exception as e :
383+ errors .append (
384+ ValidationError (
385+ offset = offset ,
386+ message = f"Extra headers validation error: { e } " ,
387+ ** rec_fields ,
388+ )
353389 )
354- )
355390
356- # Step 3 : Add record to trace list
391+ # Step 4 : Add record to trace list
357392 if tracelist is not None :
358393 segptr = clibmseed .mstl3_addmsr_recordptr (
359394 tracelist ._mstl ,
360- record . _msr ,
395+ msr ,
361396 ffi .NULL ,
362397 0 , # splitversion
363398 1 , # autoheal
@@ -375,10 +410,10 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
375410 )
376411 continue
377412
378- # Step 4 : Optionally decompress data samples to detect decoding errors
413+ # Step 5 : Optionally decompress data samples to detect decoding errors
379414 if self ._unpack_data :
380415 clear_error_messages ()
381- status = clibmseed .msr3_unpack_data (msr_ptr [ 0 ] , self ._verbose )
416+ status = clibmseed .msr3_unpack_data (msr , self ._verbose )
382417
383418 error_messages = get_error_messages ()
384419
0 commit comments