Skip to content

Commit b77bccd

Browse files
MS3RecordValidator.from_file() significantly optimized
1 parent 5609ec9 commit b77bccd

File tree

3 files changed

+81
-41
lines changed

3 files changed

+81
-41
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.4.0] - 2026-03-08
11+
12+
### Changed
13+
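- `MS3RecordValidator.from_file()` significantly optimized: the file reader now advances an offset into its read buffer instead of re-slicing the buffer after every record, the extra-headers JSON Schema validator is built once per `validate()` call rather than once per record, and record metadata is read directly from the C struct instead of through an `MS3Record` wrapper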
14+
1015
## [0.3.0] - 2026-03-08
1116

1217
### Added

src/pymseed/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# Package version
2-
__version__ = "0.3.0"
2+
__version__ = "0.4.0"

src/pymseed/msrecord_validator.py

Lines changed: 75 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,24 @@
88

99
from __future__ import annotations
1010

11+
import json
1112
from collections.abc import Iterator
1213
from dataclasses import dataclass
14+
from importlib.resources import files
1315
from typing import Any
1416

1517
from .clib import clibmseed, ffi
1618
from .logging import clear_error_messages, get_error_messages
17-
from .msrecord import MS3Record
1819
from .mstracelist import MS3TraceList
1920

2021
# (buf_ptr, absolute_offset, record_length) — or (None, offset, error_code) for detection failure
2122
_RecordTuple = tuple[Any, int, int]
2223

24+
# Maps supported schema IDs to their bundled JSON Schema filenames
25+
_KNOWN_SCHEMAS: dict[str, str] = {
26+
"FDSN-v1.0": "ExtraHeaders-FDSN-v1.0.schema-2020-12.json",
27+
}
28+
2329

2430
@dataclass(frozen=True)
2531
class ValidationError:
@@ -82,6 +88,7 @@ def __init__(self, filename: str, chunk_size: int = 1_048_576) -> None:
8288
def __iter__(self) -> Iterator[_RecordTuple]:
8389
format_version = ffi.new("uint8_t *")
8490
buf = bytearray()
91+
buf_offset = 0
8592
file_offset = 0
8693
eof = False
8794

@@ -90,44 +97,48 @@ def __iter__(self) -> Iterator[_RecordTuple]:
9097
if not eof:
9198
chunk = f.read(self._chunk_size)
9299
if chunk:
100+
if buf_offset > 0:
101+
del buf[:buf_offset]
102+
buf_offset = 0
93103
buf.extend(chunk)
94104
else:
95105
eof = True
96106

97-
if not buf:
107+
remaining = len(buf) - buf_offset
108+
if remaining <= 0:
98109
return
99110

100111
reclen = clibmseed.ms3_detect(
101-
ffi.from_buffer(buf),
102-
len(buf),
112+
ffi.from_buffer(buf) + buf_offset,
113+
remaining,
103114
format_version,
104115
)
105116

106117
if reclen < 0:
107118
yield (None, file_offset, reclen)
108119
return
109120

110-
if reclen == 0 or reclen > len(buf):
121+
if reclen == 0 or reclen > remaining:
111122
if eof:
112123
return
113124
continue
114125

115-
record_bytes = bytes(buf[:reclen])
126+
record_bytes = bytes(buf[buf_offset : buf_offset + reclen])
116127
yield (ffi.from_buffer(record_bytes), file_offset, reclen)
117-
118-
buf = buf[reclen:]
128+
buf_offset += reclen
119129
file_offset += reclen
120130

121131

122132
class MS3RecordValidator:
123133
"""Validate miniSEED records with comprehensive error detection.
124134
125-
Processes records from a buffer or file using a 4-step process:
135+
Processes records from a buffer or file using a 5-step process:
126136
127137
1. Determine record length (handled by the record source)
128138
2. Parse record metadata without unpacking data
129-
3. Optionally add record to a trace coverage list (with no data samples)
130-
4. Optionally decompress data samples and test for decoding errors
139+
3. Optionally validate extra headers
140+
4. Optionally add record to a trace coverage list (with no data samples)
141+
5. Optionally decompress data samples and test for decoding errors
131142
132143
This approach ensures maximum information recovery — all records with
133144
parseable headers are added to the trace list, with complete error tracking.
@@ -260,6 +271,24 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
260271

261272
msr_ptr = ffi.new("MS3Record **")
262273

274+
# Pre-load JSON schema validator once — avoid reloading per-record
275+
_eh_validator: Any = None
276+
_eh_import_error = False
277+
if self._validate_extra_headers:
278+
if self._extra_headers_schema not in _KNOWN_SCHEMAS:
279+
raise ValueError(f"Unknown schema_id: {self._extra_headers_schema}")
280+
try:
281+
from jsonschema import Draft202012Validator
282+
283+
schema_bytes = (
284+
files("pymseed.schemas")
285+
.joinpath(_KNOWN_SCHEMAS[self._extra_headers_schema])
286+
.read_bytes()
287+
)
288+
_eh_validator = Draft202012Validator(json.loads(schema_bytes))
289+
except ImportError:
290+
_eh_import_error = True
291+
263292
try:
264293
for buf_ptr, offset, record_length in self._source:
265294
# Detection failure — source signals this with buf_ptr=None
@@ -278,7 +307,7 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
278307

279308
clear_error_messages()
280309

281-
# Step 2: Parse record structure (without unpacking data samples)
310+
# Step 2: Parse record metadata (without unpacking data samples)
282311
status = clibmseed.msr3_parse(
283312
buf_ptr,
284313
record_length,
@@ -304,10 +333,11 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
304333
)
305334
continue
306335

307-
record = MS3Record(recordptr=msr_ptr[0])
336+
# Read metadata directly from C struct
337+
msr = msr_ptr[0]
308338
rec_fields = {
309-
"sourceid": record.sourceid,
310-
"starttime": record.starttime,
339+
"sourceid": ffi.string(msr.sid).decode("utf-8"),
340+
"starttime": msr.starttime,
311341
"reclen": record_length,
312342
}
313343

@@ -323,41 +353,46 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
323353
)
324354
)
325355

326-
if self._validate_extra_headers and record.extralength > 0:
327-
try:
328-
validation_errors = record.validate_extra_headers(
329-
schema_id=self._extra_headers_schema
330-
)
331-
for validation_error in validation_errors:
332-
errors.append(
333-
ValidationError(
334-
offset=offset,
335-
message=f"Extra headers validation error: {validation_error.message} at {validation_error.json_path}",
336-
**rec_fields,
337-
)
338-
)
339-
except ImportError:
356+
# Step 3: Optionally validate extra headers
357+
if self._validate_extra_headers and msr.extralength > 0:
358+
if _eh_import_error:
340359
errors.append(
341360
ValidationError(
342361
offset=offset,
343362
message="Extra headers validation skipped: jsonschema not installed",
344363
**rec_fields,
345364
)
346365
)
347-
except Exception as e:
348-
errors.append(
349-
ValidationError(
350-
offset=offset,
351-
message=f"Extra headers validation error: {e}",
352-
**rec_fields,
366+
else:
367+
try:
368+
extra_str = (
369+
ffi.string(msr.extra).decode("utf-8")
370+
if msr.extra != ffi.NULL
371+
else ""
372+
)
373+
if extra_str:
374+
for ve in _eh_validator.iter_errors(json.loads(extra_str)):
375+
errors.append(
376+
ValidationError(
377+
offset=offset,
378+
message=f"Extra headers validation error: {ve.message} at {ve.json_path}",
379+
**rec_fields,
380+
)
381+
)
382+
except Exception as e:
383+
errors.append(
384+
ValidationError(
385+
offset=offset,
386+
message=f"Extra headers validation error: {e}",
387+
**rec_fields,
388+
)
353389
)
354-
)
355390

356-
# Step 3: Add record to trace list
391+
# Step 4: Add record to trace list
357392
if tracelist is not None:
358393
segptr = clibmseed.mstl3_addmsr_recordptr(
359394
tracelist._mstl,
360-
record._msr,
395+
msr,
361396
ffi.NULL,
362397
0, # splitversion
363398
1, # autoheal
@@ -375,10 +410,10 @@ def validate(self) -> tuple[list[ValidationError], MS3TraceList | None]:
375410
)
376411
continue
377412

378-
# Step 4: Optionally decompress data samples to detect decoding errors
413+
# Step 5: Optionally decompress data samples to detect decoding errors
379414
if self._unpack_data:
380415
clear_error_messages()
381-
status = clibmseed.msr3_unpack_data(msr_ptr[0], self._verbose)
416+
status = clibmseed.msr3_unpack_data(msr, self._verbose)
382417

383418
error_messages = get_error_messages()
384419

0 commit comments

Comments
 (0)