diff --git a/doc/changes/dev/13583.bugfix.rst b/doc/changes/dev/13583.bugfix.rst new file mode 100644 index 00000000000..4703387baa3 --- /dev/null +++ b/doc/changes/dev/13583.bugfix.rst @@ -0,0 +1 @@ +Raise a :class:`NotImplementedError` when reading EDF+D or BDF+D files with acquisition gaps instead of silently loading them as continuous data, by `Arnav Kumar`_ (:gh:`13583`). diff --git a/doc/changes/names.inc b/doc/changes/names.inc index 1a9c62b9bea..65cd2c498dd 100644 --- a/doc/changes/names.inc +++ b/doc/changes/names.inc @@ -29,6 +29,7 @@ .. _Antti Rantala: https://github.com/Odingod .. _Apoorva Karekal: https://github.com/apoorva6262 .. _Archit Singhal: https://github.com/architsinghal-mriirs +.. _Arnav Kumar: https://github.com/Arnav1709 .. _Arne Pelzer: https://github.com/aplzr .. _Ashley Drew: https://github.com/ashdrew .. _Asish Panda: https://github.com/kaichogami diff --git a/mne/io/edf/edf.py b/mne/io/edf/edf.py index b1ebe3ce622..e3478257ee7 100644 --- a/mne/io/edf/edf.py +++ b/mne/io/edf/edf.py @@ -235,6 +235,30 @@ def __init__( np.ones((len(idx), 1)), None, ) + + # Check for discontinuous EDF+D files with actual gaps + if edf_info.get("discontinuous", False): + record_times = _get_tal_record_times(tal_data[0], encoding=encoding) + record_length = edf_info["record_length"][0] + n_records = edf_info["n_records"] + has_gaps, gaps = _check_edf_discontinuity( + record_times, record_length, n_records + ) + if has_gaps: + gap_info = ", ".join( + [f"{onset:.3f}s (duration: {dur:.3f}s)" for onset, dur in gaps] + ) + raise NotImplementedError( + "This EDF+D file contains discontinuous data with gaps " + f"between records. Gaps found at: {gap_info}. " + "MNE-Python does not currently support reading EDF+D files " + "with acquisition gaps. The data would be incorrectly " + "treated as continuous, leading to incorrect time alignment. " + "Consider using specialized tools like luna/lunapi for " + "discontinuous EDF+ files, or convert the file to EDF+C " + "format if the gaps are not significant for your analysis." + ) + annotations = _read_annotations_edf( tal_data[0], ch_names=info["ch_names"], @@ -447,6 +471,30 @@ def __init__( np.ones((len(idx), 1)), None, ) + + # Check for discontinuous BDF+D files with actual gaps + if edf_info.get("discontinuous", False): + record_times = _get_tal_record_times(tal_data[0], encoding=encoding) + record_length = edf_info["record_length"][0] + n_records = edf_info["n_records"] + has_gaps, gaps = _check_edf_discontinuity( + record_times, record_length, n_records + ) + if has_gaps: + gap_info = ", ".join( + [f"{onset:.3f}s (duration: {dur:.3f}s)" for onset, dur in gaps] + ) + raise NotImplementedError( + "This BDF+D file contains discontinuous data with gaps " + f"between records. Gaps found at: {gap_info}. " + "MNE-Python does not currently support reading BDF+D files " + "with acquisition gaps. The data would be incorrectly " + "treated as continuous, leading to incorrect time alignment. " + "Consider using specialized tools like luna/lunapi for " + "discontinuous BDF+ files, or convert the file to BDF+C " + "format if the gaps are not significant for your analysis." + ) + annotations = _read_annotations_edf( tal_data[0], ch_names=info["ch_names"], @@ -1159,9 +1207,16 @@ def _read_edf_header( # to determine the subtype (EDF or BDF, which differ in the # number of bytes they use for the data records; EDF uses 2 bytes # whereas BDF uses 3 bytes). - fid.read(44) + # However, we still need to check for EDF+D/BDF+D (discontinuous) files. + reserved = fid.read(44).decode("latin-1").rstrip() subtype = file_type + # Check for discontinuous EDF+D/BDF+D files + if reserved in ("EDF+D", "BDF+D"): + edf_info["discontinuous"] = True + else: + edf_info["discontinuous"] = False + n_records = int(_edf_str(fid.read(8))) record_length = float(_edf_str(fid.read(8))) record_length = np.array([record_length, 1.0]) # in seconds @@ -2005,6 +2060,11 @@ def read_raw_edf( The EDF specification allows storage of subseconds in measurement date. However, this reader currently sets subseconds to 0 by default. + + EDF+D (discontinuous) files with actual gaps between data records are not + supported and will raise a :class:`NotImplementedError`. EDF+D files that + are marked as discontinuous but have no actual gaps (e.g., from some + Nihon Kohden systems) will load normally. """ _check_args(input_fname, preload, "edf") @@ -2144,6 +2204,10 @@ def read_raw_bdf( If channels named 'status' or 'trigger' are present, they are considered as STIM channels by default. Use func:`mne.find_events` to parse events encoded in such analog stim channels. + + BDF+D (discontinuous) files with actual gaps between data records are not + supported and will raise a :class:`NotImplementedError`. BDF+D files that + are marked as discontinuous but have no actual gaps will load normally. """ _check_args(input_fname, preload, "bdf") @@ -2355,3 +2419,137 @@ def _get_annotations_gdf(edf_info, sfreq): desc = events[2] return onset, duration, desc + + +def _get_tal_record_times(annotations, encoding="utf8"): + """Extract TAL record onset times from EDF+ annotation data. + + In EDF+ files, each data record contains a Time-stamped Annotation List (TAL) + that starts with the onset time of that data record. This function extracts + these onset times to detect gaps between records in EDF+D (discontinuous) files. + + Parameters + ---------- + annotations : ndarray (n_chans, n_samples) | str + Channel data in EDF+ TAL format or path to annotation file. + encoding : str + Encoding to use when decoding the TAL data. + + Returns + ------- + record_times : list of float + List of onset times for each data record, in seconds. + """ + pat = "([+-]\\d+\\.?\\d*)(\x15(\\d+\\.?\\d*))?(\x14.*?)\x14\x00" + if isinstance(annotations, str | Path): + with open(annotations, "rb") as annot_file: + triggers = re.findall(pat.encode(), annot_file.read()) + triggers = [tuple(map(lambda x: x.decode(encoding), t)) for t in triggers] + else: + tals = bytearray() + annotations = np.atleast_2d(annotations) + for chan in annotations: + this_chan = chan.ravel() + if this_chan.dtype == INT32: # BDF + this_chan = this_chan.view(dtype=UINT8) + this_chan = this_chan.reshape(-1, 4) + this_chan = this_chan[:, :3].ravel() + tals.extend(this_chan) + else: + this_chan = chan.astype(np.int64) + tals.extend(np.uint8([this_chan % 256, this_chan // 256]).flatten("F")) + try: + triggers = re.findall(pat, tals.decode(encoding)) + except UnicodeDecodeError: + return [] + + # Extract record onset times (first TAL entry of each record has empty description) + record_times = [] + for ev in triggers: + onset = float(ev[0]) + # Check if this is a record timestamp (empty description after \x14) + descriptions = ev[3].split("\x14")[1:] + # The first TAL in each record has the record onset time + # If there's no description, it's the record timestamp + if not any(descriptions): + record_times.append(onset) + + return record_times + + +def _check_edf_discontinuity(record_times, record_length, n_records, tolerance=1e-6): + """Check if an EDF+D file has actual gaps between records. + + Parameters + ---------- + record_times : list of float + List of onset times for each data record, extracted from TAL annotations. + record_length : float + Duration of each data record in seconds. + n_records : int + Expected number of data records. + tolerance : float + Tolerance for comparing times (in seconds). + + Returns + ------- + has_gaps : bool + True if gaps exist between records. + gaps : list of tuple + List of (onset, duration) tuples for each gap. + + Raises + ------ + ValueError + If the number of record times does not match n_records, or if record_times + contains non-numeric values. + """ + # Handle empty/short cases early + if len(record_times) < 2: + # Validate n_records match even for edge cases + if len(record_times) != n_records: + raise ValueError( + f"Number of record times ({len(record_times)}) does not match " + f"expected number of records ({n_records})." + ) + return False, [] + + # Validate n_records matches + if len(record_times) != n_records: + raise ValueError( + f"Number of record times ({len(record_times)}) does not match " + f"expected number of records ({n_records})." + ) + + # Convert to numpy array for validation and ensure numeric + try: + times = np.asarray(record_times, dtype=np.float64) + except (ValueError, TypeError) as e: + raise ValueError( + f"record_times must contain numeric values, got error: {e}" + ) from e + + # Check for NaN or inf values + if not np.all(np.isfinite(times)): + raise ValueError("record_times contains non-finite values (NaN or inf).") + + # Check if times are sorted (monotonically increasing) + if not np.all(np.diff(times) >= 0): + warn( + "record_times are not sorted in monotonically increasing order. " + "Sorting a copy for gap detection." + ) + times = np.sort(times) + + # Detect gaps using validated/sorted times + gaps = [] + for i in range(len(times) - 1): + expected_next = times[i] + record_length + actual_next = times[i + 1] + gap = actual_next - expected_next + + if gap > tolerance: + # Found a gap + gaps.append((expected_next, gap)) + + return len(gaps) > 0, gaps diff --git a/mne/io/edf/tests/test_edf.py b/mne/io/edf/tests/test_edf.py index 1760081bac4..9b1ffdee6f5 100644 --- a/mne/io/edf/tests/test_edf.py +++ b/mne/io/edf/tests/test_edf.py @@ -24,6 +24,7 @@ from mne.datasets import testing from mne.io import edf, read_raw_bdf, read_raw_edf, read_raw_fif, read_raw_gdf from mne.io.edf.edf import ( + _check_edf_discontinuity, _edf_str, _parse_prefilter_string, _prefilter_float, @@ -1261,3 +1262,235 @@ def test_edf_read_from_file_like(): ] assert raw.ch_names == channels + + +def _create_edf_plus_file( + fname, n_records=5, record_duration=1.0, sfreq=256, discontinuous=False, gap_at=2 +): + """Create a minimal EDF+ file for testing. + + Parameters + ---------- + fname : str + Path to save the EDF+ file. + n_records : int + Number of data records. + record_duration : float + Duration of each record in seconds. + sfreq : int + Sampling frequency. + discontinuous : bool + If True, create an EDF+D file with a gap. + gap_at : int + Record index after which to insert a gap (only used if discontinuous=True). + """ + n_samples_per_record = int(sfreq * record_duration) + n_channels = 1 # One signal channel + one annotation channel + + # Calculate header sizes + header_bytes = 256 + (n_channels + 1) * 256 # Main header + channel headers + + with open(fname, "wb") as f: + # === Main Header (256 bytes) === + # Version (8 bytes) + f.write(b"0 ") + + # Patient ID (80 bytes) + patient_id = "X X X X" + f.write(patient_id.ljust(80).encode("latin-1")) + + # Recording ID (80 bytes) + recording_id = "Startdate 01-JAN-2020 X X X" + f.write(recording_id.ljust(80).encode("latin-1")) + + # Start date (8 bytes) + f.write(b"01.01.20") + + # Start time (8 bytes) + f.write(b"00.00.00") + + # Number of bytes in header (8 bytes) + f.write(str(header_bytes).ljust(8).encode("latin-1")) + + # Reserved field (44 bytes) - EDF+C or EDF+D + if discontinuous: + reserved = "EDF+D" + else: + reserved = "EDF+C" + f.write(reserved.ljust(44).encode("latin-1")) + + # Number of data records (8 bytes) + f.write(str(n_records).ljust(8).encode("latin-1")) + + # Duration of data record in seconds (8 bytes) + f.write(str(record_duration).ljust(8).encode("latin-1")) + + # Number of signals (4 bytes) - 1 data channel + 1 annotation channel + f.write(str(n_channels + 1).ljust(4).encode("latin-1")) + + # === Channel Headers === + # Labels (16 bytes each) + f.write("EEG Ch1 ".encode("latin-1")) # Data channel + f.write("EDF Annotations ".encode("latin-1")) # Annotation channel + + # Transducer type (80 bytes each) + f.write((" " * 80).encode("latin-1")) + f.write((" " * 80).encode("latin-1")) + + # Physical dimension (8 bytes each) + f.write("uV ".encode("latin-1")) + f.write(" ".encode("latin-1")) + + # Physical minimum (8 bytes each) + f.write("-3200 ".encode("latin-1")) + f.write("-1 ".encode("latin-1")) + + # Physical maximum (8 bytes each) + f.write("3200 ".encode("latin-1")) + f.write("1 ".encode("latin-1")) + + # Digital minimum (8 bytes each) + f.write("-32768 ".encode("latin-1")) + f.write("-32768 ".encode("latin-1")) + + # Digital maximum (8 bytes each) + f.write("32767 ".encode("latin-1")) + f.write("32767 ".encode("latin-1")) + + # Prefiltering (80 bytes each) + f.write((" " * 80).encode("latin-1")) + f.write((" " * 80).encode("latin-1")) + + # Number of samples in each data record (8 bytes each) + f.write(str(n_samples_per_record).ljust(8).encode("latin-1")) + # Annotation channel - use 60 samples (120 bytes) for TAL + annot_samples = 60 + f.write(str(annot_samples).ljust(8).encode("latin-1")) + + # Reserved (32 bytes each) + f.write((" " * 32).encode("latin-1")) + f.write((" " * 32).encode("latin-1")) + + # === Data Records === + for record_idx in range(n_records): + # Calculate record onset time + if discontinuous and record_idx > gap_at: + # Add 1 second gap after gap_at + onset = record_idx * record_duration + 1.0 + else: + onset = record_idx * record_duration + + # Write signal data (simple sine wave) + for _ in range(n_samples_per_record): + # Write 16-bit integer (2 bytes, little-endian) + f.write((0).to_bytes(2, byteorder="little", signed=True)) + + # Write TAL annotation + # Format: +onset\x14\x14\x00 (onset time with empty annotation) + tal = f"+{onset:.6f}\x14\x14\x00" + tal_bytes = tal.encode("latin-1") + # Pad to fill annot_samples * 2 bytes + tal_bytes = tal_bytes.ljust(annot_samples * 2, b"\x00") + f.write(tal_bytes) + + +def test_edf_plus_discontinuous_detection(tmp_path): + """Test that EDF+D files with gaps raise NotImplementedError.""" + # Create a continuous EDF+C file - should load fine + edf_c_path = tmp_path / "test_edf_c.edf" + _create_edf_plus_file(edf_c_path, discontinuous=False) + raw = read_raw_edf(edf_c_path, preload=True) + assert len(raw.ch_names) == 1 + assert raw.n_times == 5 * 256 # 5 records * 256 samples + + # Create a discontinuous EDF+D file with a gap - should raise + edf_d_path = tmp_path / "test_edf_d.edf" + _create_edf_plus_file(edf_d_path, discontinuous=True, gap_at=2) + + with pytest.raises( + NotImplementedError, match="EDF\\+D file contains discontinuous" + ): + read_raw_edf(edf_d_path, preload=True) + + +def test_check_edf_discontinuity(): + """Test the _check_edf_discontinuity helper function.""" + # Continuous records (no gaps) + record_times = [0.0, 1.0, 2.0, 3.0, 4.0] + record_length = 1.0 + has_gaps, gaps = _check_edf_discontinuity(record_times, record_length, 5) + assert not has_gaps + assert gaps == [] + + # Discontinuous records with one gap + record_times = [0.0, 1.0, 2.0, 4.0, 5.0] # Gap after record 3 + has_gaps, gaps = _check_edf_discontinuity(record_times, record_length, 5) + assert has_gaps + assert len(gaps) == 1 + assert_allclose(gaps[0][0], 3.0) # Gap starts at 3.0 + assert_allclose(gaps[0][1], 1.0) # Gap duration is 1.0 + + # Multiple gaps + record_times = [0.0, 1.0, 3.0, 4.0, 6.0] # Gaps after records 2 and 4 + has_gaps, gaps = _check_edf_discontinuity(record_times, record_length, 5) + assert has_gaps + assert len(gaps) == 2 + + # Edge case: only one record (no gaps possible) + record_times = [0.0] + has_gaps, gaps = _check_edf_discontinuity(record_times, record_length, 1) + assert not has_gaps + assert gaps == [] + + # Edge case: empty record times + record_times = [] + has_gaps, gaps = _check_edf_discontinuity(record_times, record_length, 0) + assert not has_gaps + assert gaps == [] + + # Test n_records mismatch validation + record_times = [0.0, 1.0, 2.0] + with pytest.raises(ValueError, match="does not match expected number of records"): + _check_edf_discontinuity(record_times, record_length, 5) + + # Test n_records mismatch for single record + record_times = [0.0] + with pytest.raises(ValueError, match="does not match expected number of records"): + _check_edf_discontinuity(record_times, record_length, 2) + + # Test non-numeric values + record_times = [0.0, "invalid", 2.0] + with pytest.raises(ValueError, match="must contain numeric values"): + _check_edf_discontinuity(record_times, record_length, 3) + + # Test NaN values + record_times = [0.0, np.nan, 2.0] + with pytest.raises(ValueError, match="non-finite values"): + _check_edf_discontinuity(record_times, record_length, 3) + + # Test inf values + record_times = [0.0, np.inf, 2.0] + with pytest.raises(ValueError, match="non-finite values"): + _check_edf_discontinuity(record_times, record_length, 3) + + # Test unsorted record_times (should warn and sort) + record_times = [2.0, 0.0, 1.0, 3.0, 4.0] # Unsorted + with pytest.warns(RuntimeWarning, match="not sorted"): + has_gaps, gaps = _check_edf_discontinuity(record_times, record_length, 5) + assert not has_gaps # After sorting, should be continuous + assert gaps == [] + + +def test_edf_plus_d_continuous_allowed(tmp_path): + """Test that EDF+D files marked discontinuous but actually continuous load OK.""" + # Some EDF+D files are marked as discontinuous but have no actual gaps + # (e.g., Nihon Kohden systems). These should load fine. + edf_d_no_gap_path = tmp_path / "test_edf_d_no_gap.edf" + + # Create EDF+D file but with continuous timestamps (no actual gaps) + _create_edf_plus_file(edf_d_no_gap_path, discontinuous=True, gap_at=100) + # gap_at=100 means no gap since we only have 5 records + + # This should work because there are no actual gaps + raw = read_raw_edf(edf_d_no_gap_path, preload=True) + assert len(raw.ch_names) == 1