diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ec2434f41c7..6c7a7b37ab3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: # Ruff mne - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.0 + rev: v0.14.1 hooks: - id: ruff-check name: ruff lint mne @@ -82,7 +82,7 @@ repos: # zizmor - repo: https://github.com/woodruffw/zizmor-pre-commit - rev: v1.14.2 + rev: v1.15.2 hooks: - id: zizmor diff --git a/doc/changes/dev/13096.newfeature.rst b/doc/changes/dev/13096.newfeature.rst new file mode 100644 index 00000000000..7c5ad84a98a --- /dev/null +++ b/doc/changes/dev/13096.newfeature.rst @@ -0,0 +1 @@ +Support for preload=False when reading .set files, by `Bruno Aristimunha`_. \ No newline at end of file diff --git a/mne/io/eeglab/_eeglab.py b/mne/io/eeglab/_eeglab.py index 28df469fada..78df0be366b 100644 --- a/mne/io/eeglab/_eeglab.py +++ b/mne/io/eeglab/_eeglab.py @@ -9,9 +9,9 @@ except ImportError: # scipy < 1.8 from scipy.io.matlab.mio5 import MatlabFunction from scipy.io.matlab.mio5_params import MatlabOpaque -from scipy.io import loadmat +from scipy.io import loadmat, whosmat -from ...utils import _import_pymatreader_funcs +from ...utils import _import_pymatreader_funcs, _soft_import, warn def _todict_from_np_struct(data): # taken from pymatreader.utils @@ -71,13 +71,201 @@ def _check_for_scipy_mat_struct(data): # taken from pymatreader.utils return data -def _readmat(fname, uint16_codec=None): +def _scipy_reader(file_name, variable_names=None, uint16_codec=None): + """Load with scipy and then run the check function.""" + mat_data = loadmat( + file_name, + squeeze_me=True, + mat_dtype=False, + variable_names=variable_names, + uint16_codec=uint16_codec, + ) + return _check_for_scipy_mat_struct(mat_data) + + +def _whosmat_hdf5(fname: str): + """List variables in a MATLAB v7.3 (HDF5) .mat file without loading data. 
+ + This function provides similar functionality to :func:`scipy.io.whosmat` but + for MATLAB v7.3 files stored in HDF5 format, which are not supported by SciPy. + + Parameters + ---------- + fname : str | PathLike + Path to the MATLAB v7.3 (.mat) file. + + Returns + ------- + variables : list of tuple + A list of (name, shape, class) tuples for each variable in the file. + The name is a string, shape is a tuple of ints, and class is a string + indicating the MATLAB data type (e.g., 'double', 'int32', 'struct'). + + Notes + ----- + This function only works with MATLAB v7.3 (HDF5) files. For earlier versions, + use :func:`scipy.io.whosmat` instead. + + See Also + -------- + scipy.io.whosmat : List variables in classic MATLAB files. + """ + h5py = _soft_import("h5py", purpose="MATLAB v7.3 I/O", strict=False) + if h5py is False: + raise ModuleNotFoundError( + "h5py is required to inspect MATLAB v7.3 files when preload=False. " + "Please install h5py to use this functionality." + ) + + variables = [] + + with h5py.File(str(fname), "r") as f: + for name in f.keys(): + node = f[name] + + # Extract shape from HDF5 object + if isinstance(node, h5py.Dataset): + shape = tuple(int(x) for x in node.shape) + else: + shape = () + for attr_key in ( + "MATLAB_shape", + "MATLAB_Size", + "MATLAB_size", + "dims", + "MATLAB_dims", + ): + shp = node.attrs.get(attr_key) + if shp is not None: + try: + shape = tuple(int(x) for x in shp) + break + except Exception: + pass + if not shape and "size" in node: + try: + shape = tuple(int(x) for x in node["size"][()]) + except Exception: + pass + + # Infer MATLAB class from HDF5 object + mcls = node.attrs.get("MATLAB_class", "").lower() + if mcls: + matlab_class = "char" if mcls == "string" else mcls + elif isinstance(node, h5py.Dataset): + dt = node.dtype + # Handle complex numbers stored as {real, imag} struct + if getattr(dt, "names", None) and {"real", "imag"} <= set(dt.names): + matlab_class = ( + "double" if dt["real"].base.itemsize == 8 else 
 "single" + ) + # Map NumPy dtype to MATLAB class + elif (kind := dt.kind) == "f": + matlab_class = "double" if dt.itemsize == 8 else "single" + elif kind == "i": + matlab_class = f"int{8 * dt.itemsize}" + elif kind == "u": + matlab_class = f"uint{8 * dt.itemsize}" + elif kind == "b": + matlab_class = "logical" + elif kind in ("S", "U", "O"): + matlab_class = "char" + else: + matlab_class = "unknown" + # Check for sparse matrix structure + elif {"ir", "jc", "data"}.issubset(set(node.keys())): + matlab_class = "sparse" + else: + matlab_class = "unknown" + + variables.append((name, shape, matlab_class)) + + return variables + + +def _readmat(fname, uint16_codec=None, *, preload=False): try: read_mat = _import_pymatreader_funcs("EEGLAB I/O") except RuntimeError: # pymatreader not installed - eeg = loadmat( - fname, squeeze_me=True, mat_dtype=False, uint16_codec=uint16_codec + read_mat = _scipy_reader + + # First handle the preload=False case + if not preload: + # when preload is `False`, we need to be selective about what we load + # and handle the 'data' field specially + + # the files in eeglab are always the same field names + # the fields were taken from the eeglab sample reference + # available at the eeglab github: + # https://github.com/sccn/eeglab/blob/develop/sample_data/eeglab_data.set + # The sample reference is the big reference for the field names + # in eeglab files, and what is used in the eeglab tests. 
+ + info_fields = """ + setname filename filepath subject group condition session comments + nbchan trials pnts srate xmin xmax times icaact icawinv icasphere + icaweights icachansind chanlocs urchanlocs chaninfo ref event + urevent eventdescription epoch epochdescription reject stats + specdata specicaact splinefile icasplinefile dipfit history saved + etc + """.split() + + # We first load only the info fields that are not data + # Then we check if 'data' is present and load it separately if needed + mat_data = read_mat( + fname, + variable_names=info_fields, + uint16_codec=uint16_codec, ) - + + # checking the variables in the .set file + # to decide how to handle 'data' variable + try: + variables = whosmat(str(fname)) + except NotImplementedError: + try: + variables = _whosmat_hdf5(str(fname)) + except ModuleNotFoundError: + warn( + "h5py is required for preload=False with " + "MATLAB v7.3 (HDF5) files. " + "Setting preload=True." + ) + preload = True + return read_mat(fname, uint16_codec=uint16_codec) + + is_possible_not_loaded = False + + numeric_types = """ + int8 int16 int32 + int64 uint8 uint16 + uint32 uint64 single double + """.split() + + for var in variables: + # looking for 'data' variable + if var[0] != "data": + continue + + # checking if 'data' variable is numeric + is_numeric = var[2] in numeric_types + + # if any 'data' variable is numeric, mark as possibly not loaded + if is_numeric: + # set the 'data' field to the filename + mat_data["data"] = str(fname) + + is_possible_not_loaded = is_possible_not_loaded or is_numeric + + if is_possible_not_loaded: + return mat_data + else: + # "The 'data' variable in the .set file appears to be numeric. " + # "In preload=False mode, the data is not loaded into memory. " + # "Instead, the filename is provided in mat_data['data']. " + # "To load the actual data, set preload=True." 
+ # this is case of single file .set with data inside + preload = True + + # here is intended to be if and not else if + if preload: return read_mat(fname, uint16_codec=uint16_codec) diff --git a/mne/io/eeglab/eeglab.py b/mne/io/eeglab/eeglab.py index 83148666ffa..497a7eeaf8f 100644 --- a/mne/io/eeglab/eeglab.py +++ b/mne/io/eeglab/eeglab.py @@ -14,7 +14,7 @@ from ..._fiff.constants import FIFF from ..._fiff.meas_info import create_info from ..._fiff.pick import _PICK_TYPES_KEYS -from ..._fiff.utils import _find_channels, _read_segments_file +from ..._fiff.utils import _find_channels, _mult_cal_one, _read_segments_file from ...annotations import Annotations, read_annotations from ...channels import make_dig_montage from ...defaults import DEFAULTS @@ -39,9 +39,10 @@ def _check_eeglab_fname(fname, dataname): """Check whether the filename is valid. - Check if the file extension is ``.fdt`` (older ``.dat`` being invalid) or - whether the ``EEG.data`` filename exists. If ``EEG.data`` file is absent - the set file name with .set changed to .fdt is checked. + Check if the file extension is ``.fdt`` (older ``.dat`` being invalid) + or ``.set`` (new EEGLAB format) or whether the ``EEG.data`` filename exists. + If ``EEG.data`` file is absent the set file name with + .set changed to .fdt is checked. 
""" fmt = str(op.splitext(dataname)[-1]) if fmt == ".dat": @@ -50,6 +51,8 @@ def _check_eeglab_fname(fname, dataname): "version and resave the data in .fdt format" ) + _check_option("EEGLAB file extension", fmt, (".set", ".fdt")) + basedir = op.dirname(fname) data_fname = op.join(basedir, dataname) if not op.exists(data_fname): @@ -68,10 +71,10 @@ def _check_eeglab_fname(fname, dataname): return data_fname -def _check_load_mat(fname, uint16_codec): +def _check_load_mat(fname, uint16_codec, *, preload=False): """Check if the mat struct contains 'EEG'.""" fname = _check_fname(fname, "read", True) - eeg = _readmat(fname, uint16_codec=uint16_codec) + eeg = _readmat(fname, uint16_codec=uint16_codec, preload=preload) if "ALLEEG" in eeg: raise NotImplementedError( "Loading an ALLEEG array is not supported. Please contact" @@ -81,9 +84,9 @@ def _check_load_mat(fname, uint16_codec): eeg = eeg["EEG"] eeg = eeg.get("EEG", eeg) # handle nested EEG structure eeg = Bunch(**eeg) - eeg.trials = int(eeg.trials) - eeg.nbchan = int(eeg.nbchan) - eeg.pnts = int(eeg.pnts) + eeg.trials = int(eeg.get("trials", 1)) + eeg.nbchan = int(eeg.get("nbchan", 1)) + eeg.pnts = int(eeg.get("pnts", 1)) return eeg @@ -302,8 +305,6 @@ def read_raw_eeglab( If 'auto', the channel names containing ``EOG`` or ``EYE`` are used. Defaults to empty tuple. %(preload)s - Note that ``preload=False`` will be effective only if the data is - stored in a separate binary file. %(uint16_codec)s %(montage_units)s @@ -420,8 +421,6 @@ class RawEEGLAB(BaseRaw): If 'auto', the channel names containing ``EOG`` or ``EYE`` are used. Defaults to empty tuple. %(preload)s - Note that preload=False will be effective only if the data is stored - in a separate binary file. 
%(uint16_codec)s %(montage_units)s %(verbose)s @@ -447,7 +446,7 @@ def __init__( verbose=None, ): input_fname = str(_check_fname(input_fname, "read", True, "input_fname")) - eeg = _check_load_mat(input_fname, uint16_codec) + eeg = _check_load_mat(input_fname, uint16_codec, preload=preload) if eeg.trials != 1: raise TypeError( f"The number of trials is {eeg.trials:d}. It must be 1 for raw" @@ -462,6 +461,8 @@ def __init__( if isinstance(eeg.data, str): data_fname = _check_eeglab_fname(input_fname, eeg.data) logger.info(f"Reading {data_fname}") + # Check if data is embedded in the same .set file + is_embedded = op.realpath(data_fname) == op.realpath(input_fname) super().__init__( info, @@ -470,16 +471,15 @@ def __init__( last_samps=last_samps, orig_format="double", verbose=verbose, + raw_extras=[ + { + "is_embedded": is_embedded, + "input_fname": input_fname, + "uint16_codec": uint16_codec, + } + ], ) else: - if preload is False or isinstance(preload, str): - warn( - "Data will be preloaded. 
preload=False or a string " - "preload is not supported when the data is stored in " - "the .set file" - ) - # can't be done in standard way with preload=True because of - # different reading path (.set file) if eeg.nbchan == 1 and len(eeg.data.shape) == 1: n_chan, n_times = [1, eeg.data.shape[0]] else: @@ -508,6 +508,45 @@ def __init__( def _read_segment_file(self, data, idx, fi, start, stop, cals, mult): """Read a chunk of raw data.""" + # Check if data is embedded in .set file + raw_extra = self._raw_extras[fi] + if raw_extra.get("is_embedded", False): + # Check if we have already loaded and cached the embedded data + if "cached_data" not in raw_extra: + # Load from MATLAB struct on-demand (only once) + input_fname = raw_extra["input_fname"] + uint16_codec = raw_extra["uint16_codec"] + eeg_full = _readmat( + input_fname, uint16_codec=uint16_codec, preload=True + ) + if "EEG" in eeg_full: + eeg_full = eeg_full["EEG"] + eeg_full = eeg_full.get("EEG", eeg_full) + full_data = eeg_full.get("data") + + if full_data is None: + raise ValueError( + f"Could not find 'data' field in embedded EEGLAB file: " + f"{input_fname}. The file may be corrupted or not a valid " + "EEGLAB file." 
+ ) + + # Handle 1D data + if full_data.ndim == 1: + full_data = full_data[np.newaxis, :] + + # Cache for future segment reads + raw_extra["cached_data"] = full_data + + # Extract the requested segment from cached data (don't scale here) + full_data = raw_extra["cached_data"] + block = full_data[:, start:stop].astype(np.float32) + # Apply calibration and projection via _mult_cal_one + data_view = data[:, :] + _mult_cal_one(data_view, block, idx, cals, mult) + return + + # Fall back to reading from file (separate .fdt file) _read_segments_file(self, data, idx, fi, start, stop, cals, mult, dtype=" raw_not_preloaded._size + + @testing.requires_testing_data def test_io_set_raw_more(tmp_path): """Test importing EEGLAB .set files.""" @@ -641,8 +664,6 @@ def test_io_set_raw_2021(): _test_raw_reader( reader=read_raw_eeglab, input_fname=raw_fname_2021, - test_preloading=False, - preload=True, ) @@ -746,3 +767,123 @@ def test_eeglab_drop_nan_annotations(tmp_path): with pytest.warns(RuntimeWarning, match="1 .* have an onset that is NaN.*"): raw = read_raw_eeglab(file_path, preload=True) + + +@testing.requires_testing_data +@pytest.mark.timeout(10) +def test_io_set_preload_false_is_faster(monkeypatch): + """Using preload=False should skip the expensive data read branch.""" + real_loadmat = eeglab_mod.loadmat + call_counts = {"n": 0} + + def counting_loadmat(*args, **kwargs): + call_counts["n"] += 1 + return real_loadmat(*args, **kwargs) + + monkeypatch.setattr(eeglab_mod, "loadmat", counting_loadmat) + + durations = {} + with _record_warnings(): + for preload in (False, True): + start = time.perf_counter() + _ = read_raw_eeglab(raw_fname_mat, preload=preload) + durations[preload] = time.perf_counter() - start + + # preload=True should not be faster than preload=False (timings may vary + # across systems, so avoid strict thresholds) + assert durations[True] > durations[False] + + +@testing.requires_testing_data +def test_lazy_vs_preload_integrity(): + """Test that lazy loading 
produces identical data to preload.""" + raw_lazy = read_raw_eeglab(raw_fname_onefile_mat, preload=False) + raw_preload = read_raw_eeglab(raw_fname_onefile_mat, preload=True) + + # Get data from both modes + data_lazy = raw_lazy.get_data() + data_preload = raw_preload.get_data() + + # Data should be identical + assert_array_almost_equal(data_lazy, data_preload, decimal=5) + + # Verify shape consistency + assert data_lazy.shape == data_preload.shape + assert raw_lazy.n_times == raw_preload.n_times + assert len(raw_lazy.ch_names) == len(raw_preload.ch_names) + + # Verify no NaN/Inf and data is not all zeros + assert np.isfinite(data_lazy).all() + assert not np.all(data_lazy == 0) + + +@testing.requires_testing_data +def test_lazy_loading_segment_reads(): + """Test that lazy loading correctly reads data segments.""" + raw_lazy = read_raw_eeglab(raw_fname_onefile_mat, preload=False) + raw_preload = read_raw_eeglab(raw_fname_onefile_mat, preload=True) + + # Test beginning, middle, and end segments + segments = [ + (0, 100), + (100, 200), + (raw_lazy.n_times - 100, raw_lazy.n_times), + ] + + for start, stop in segments: + data_lazy = raw_lazy[:, start:stop][0] + data_preload = raw_preload[:, start:stop][0] + + # Segments should be identical + assert_array_almost_equal(data_lazy, data_preload, decimal=5) + + # Data should not be all zeros + assert not np.all(data_lazy == 0) + + +@testing.requires_testing_data +def test_lazy_loading_data_consistency(): + """Test that lazy loading maintains consistency across multiple reads.""" + raw_lazy = read_raw_eeglab(raw_fname_onefile_mat, preload=False) + raw_preload = read_raw_eeglab(raw_fname_onefile_mat, preload=True) + + # Get data multiple times from lazy-loaded raw + reads = [raw_lazy.get_data().copy() for _ in range(3)] + + # All reads should be identical + for i in range(1, len(reads)): + assert_array_equal(reads[0], reads[i]) + + # Should match preloaded data + data_preload = raw_preload.get_data() + 
assert_array_almost_equal(reads[0], data_preload, decimal=5) + + # Check numerical stability + lazy_mean = np.mean(reads[0]) + lazy_std = np.std(reads[0]) + preload_mean = np.mean(data_preload) + preload_std = np.std(data_preload) + + assert_allclose(lazy_mean, preload_mean, rtol=1e-10) + assert_allclose(lazy_std, preload_std, rtol=1e-10) + + +@testing.requires_testing_data +@pytest.mark.parametrize("fname", [raw_fname_onefile_mat, raw_fname_mat]) +def test_lazy_vs_preload_all_formats(fname): + """Test lazy loading vs preload for both embedded and separate formats.""" + raw_lazy = read_raw_eeglab(fname, preload=False) + raw_preload = read_raw_eeglab(fname, preload=True) + + # Verify identical data + data_lazy = raw_lazy.get_data() + data_preload = raw_preload.get_data() + assert_array_almost_equal(data_lazy, data_preload, decimal=5) + + # Verify metadata is identical + assert raw_lazy.n_times == raw_preload.n_times + assert raw_lazy.info["sfreq"] == raw_preload.info["sfreq"] + assert len(raw_lazy.ch_names) == len(raw_preload.ch_names) + + # Verify annotations are present + assert len(raw_lazy.annotations) == len(raw_preload.annotations) diff --git a/tools/azure_dependencies.sh b/tools/azure_dependencies.sh index 8880e6478fa..abea6af8e86 100755 --- a/tools/azure_dependencies.sh +++ b/tools/azure_dependencies.sh @@ -9,7 +9,9 @@ if [ "${TEST_MODE}" == "pip" ]; then elif [ "${TEST_MODE}" == "pip-pre" ]; then ${SCRIPT_DIR}/install_pre_requirements.sh python -m pip install $STD_ARGS --pre -e .[test_extra] - echo "##vso[task.setvariable variable=MNE_TEST_ALLOW_SKIP].*(Requires (spm|brainstorm) dataset|Requires MNE-C|CUDA not|Numba not| on Windows|MNE_FORCE_SERIAL|PySide6 causes segfaults).*" + echo "##vso[task.setvariable variable=MNE_TEST_ALLOW_SKIP].*(Requires (spm|brainstorm) dataset|Requires MNE-C|CUDA not|Numba not| on Windows|MNE_FORCE_SERIAL|PySide6 causes segfaults|neo).*" + # https://github.com/python-quantities/python-quantities/issues/262 + python -m pip 
uninstall -yq neo else echo "Unknown run type ${TEST_MODE}" exit 1 diff --git a/tools/github_actions_dependencies.sh b/tools/github_actions_dependencies.sh index 4e7300cd40b..985ed838915 100755 --- a/tools/github_actions_dependencies.sh +++ b/tools/github_actions_dependencies.sh @@ -37,3 +37,7 @@ echo "" echo "::group::Installing test dependencies using pip" python -m pip install $STD_ARGS $INSTALL_ARGS .[$INSTALL_KIND] echo "::endgroup::" +if [[ "${MNE_CI_KIND}" == "pip-pre" ]]; then + # https://github.com/python-quantities/python-quantities/issues/262 + python -m pip uninstall -yq neo +fi diff --git a/tools/github_actions_env_vars.sh b/tools/github_actions_env_vars.sh index 9f424ae5f48..ed116873caf 100755 --- a/tools/github_actions_env_vars.sh +++ b/tools/github_actions_env_vars.sh @@ -9,7 +9,7 @@ if [[ "$MNE_CI_KIND" == "pip"* ]]; then # We should test an eager import somewhere, might as well be here echo "EAGER_IMPORT=true" | tee -a $GITHUB_ENV # Make sure nothing unexpected is skipped - echo "MNE_TEST_ALLOW_SKIP=.*(Requires (spm|brainstorm) dataset|CUDA not|Numba not|PySide6 causes segfaults).*" | tee -a $GITHUB_ENV + echo "MNE_TEST_ALLOW_SKIP=.*(Requires (spm|brainstorm) dataset|CUDA not|Numba not|PySide6 causes segfaults|neo).*" | tee -a $GITHUB_ENV else echo "MNE_QT_BACKEND=PySide6" | tee -a $GITHUB_ENV fi