Commit 4993339

Merge pull request #65 from BAMresearch/csv_iosource
Csv iosource
2 parents: 8b32e24 + f41a4f8

14 files changed: +597 -166 lines

src/modacor/io/csv/__init__.py

Whitespace-only changes.

src/modacor/io/csv/csv_source.py

Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,210 @@
# SPDX-License-Identifier: BSD-3-Clause
# /usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

__coding__ = "utf-8"
__authors__ = ["Brian R. Pauw"]
__copyright__ = "Copyright 2025, The MoDaCor team"
__date__ = "12/12/2025"
__status__ = "Development"  # "Development", "Production"
# end of header and standard imports

__all__ = ["CSVSource"]

from collections.abc import Callable
from pathlib import Path
from typing import Any

import numpy as np
from attrs import define, field, validators

from modacor.dataclasses.messagehandler import MessageHandler
from modacor.io.io_source import ArraySlice

from ..io_source import IoSource


def _is_callable(_, __, value):
    if not callable(value):
        raise TypeError("method must be callable")


@define(kw_only=True)
class CSVSource(IoSource):
    """
    IoSource for loading columnar data from CSV-like text files using NumPy's
    loadtxt or genfromtxt.

    Expected usage
    --------------
    - Data is 1D per column (no multi-dimensional fields).
    - Columns are returned as 1D arrays; each column corresponds to one data_key.
    - For np.loadtxt, column names must be provided via a dtype with field names, e.g.:
          dtype=[("q", float), ("I", float), ("I_sigma", float)]
    - For np.genfromtxt, column names come from the first row or are specified
      explicitly via the `names` parameter, so that the columns can be clearly
      identified later. Typical patterns:
          np.genfromtxt(..., names=True, delimiter=..., ...)    # use first row as names
          np.genfromtxt(..., names=["q", "I", "I_sigma"], ...)  # specify names explicitly

    Configuration
    -------------
    `iosource_method_kwargs` is passed directly to the NumPy function `method`.
    This allows you to use all standard NumPy options, e.g.:

    For np.genfromtxt:
        delimiter=","
        skip_header=3
        max_rows=1000
        usecols=(0, 1, 2)
        names=True or names=["q", "I", "sigma"]
        dtype=None or dtype=float
        encoding="utf-8"
        comments="#"
        ...

    For np.loadtxt:
        delimiter=","
        skiprows=3
        max_rows=1000
        usecols=(0, 1, 2)
        dtype=float
        encoding="utf-8"
        comments="#"
        ...

    Notes
    -----
    - 2D arrays (no field names) are not supported in this implementation.
      If the resulting array does not have `dtype.names`, a ValueError is raised.
    """

    # external API:
    resource_location: Path = field(converter=Path, validator=validators.instance_of(Path))
    method: Callable[..., np.ndarray] = field(
        default=np.genfromtxt, validator=_is_callable
    )  # default to genfromtxt, better for names
    # internal use (type hints; real values set per-instance)
    _data_cache: np.ndarray | None = field(init=False, default=None)
    _data_dict_cache: dict[str, np.ndarray] = field(factory=dict)
    _file_datasets_dtypes: dict[str, np.dtype] = field(init=False)
    _file_datasets_shapes: dict[str, tuple[int, ...]] = field(init=False)
    logger: MessageHandler = field(init=False)

    def __attrs_post_init__(self) -> None:
        # super().__init__(source_reference=self.source_reference, iosource_method_kwargs=self.iosource_method_kwargs)
        self.logger = MessageHandler(level=self.logging_level, name="CSVSource")
        # Check that the file exists
        if not self.resource_location.is_file():
            self.logger.error(f"CSVSource: file {self.resource_location} does not exist.")

        # Bookkeeping structures for the IoSource API
        self._file_datasets_shapes: dict[str, tuple[int, ...]] = {}
        self._file_datasets_dtypes: dict[str, np.dtype] = {}

        # Load and preprocess data immediately
        self._load_data()
        self._preload()

    # ------------------------------------------------------------------ #
    # Internal loading / preprocessing                                   #
    # ------------------------------------------------------------------ #

    def _load_data(self) -> None:
        """
        Load the CSV data into a structured NumPy array using the configured
        method (np.genfromtxt or np.loadtxt).

        iosource_method_kwargs are passed directly to that method.
        """
        self.logger.info(
            f"CSVSource loading data from {self.resource_location} "
            f"using {self.method.__name__} with options: {self.iosource_method_kwargs}"
        )

        try:
            self._data_cache = self.method(self.resource_location, **self.iosource_method_kwargs)
        except Exception as exc:  # noqa: BLE001
            self.logger.error(f"Error while loading CSV data from {self.resource_location}: {exc}")
            raise

        if self._data_cache is None:
            raise ValueError(f"CSVSource: no data loaded from file {self.resource_location}.")
        # Ensure we have a structured array with named fields
        if self._data_cache.dtype.names is None:
            raise ValueError(
                "CSVSource expected a structured array with named fields, "
                "but dtype.names is None.\n"
                "Hint: use np.genfromtxt with 'names=True' or 'names=[...]', "
                "or provide an appropriate 'dtype' with field names."
            )

    def _preload(self) -> None:
        """
        Populate dataset lists, shapes, and dtypes from the structured array.
        """
        assert self._data_cache is not None  # for type checkers

        self._data_dict_cache = {}
        self._file_datasets_shapes.clear()
        self._file_datasets_dtypes.clear()

        for name in self._data_cache.dtype.names:
            column = self._data_cache[name]
            self._data_dict_cache[name] = column
            self._file_datasets_shapes[name] = column.shape
            self._file_datasets_dtypes[name] = column.dtype

        self.logger.info(f"CSVSource loaded datasets: {list(self._file_datasets_shapes.keys())}")

    # ------------------------------------------------------------------ #
    # IoSource API                                                       #
    # ------------------------------------------------------------------ #

    def get_static_metadata(self, data_key: str) -> None:
        """
        CSVSource does not support static metadata; always returns None.
        """
        self.logger.warning(
            f"You asked for static metadata '{data_key}', but CSVSource does not support static metadata."
        )
        return None

    def get_data(self, data_key: str, load_slice: ArraySlice = ...) -> np.ndarray:
        """
        Return the data column corresponding to `data_key`, cast to float,
        with `load_slice` applied.

        - data_key must match one of the field names in the structured array.
        - `load_slice` is applied to that 1D column (e.g. an ellipsis, slice,
          or array of indices).
        """
        if self._data_cache is None:
            raise RuntimeError("CSVSource data cache is empty; loading may have failed.")

        try:
            column = self._data_dict_cache[data_key]
        except KeyError:
            raise KeyError(
                f"Data key '{data_key}' not found in CSV data. Available keys: {list(self._data_dict_cache.keys())}"
            ) from None

        return np.asarray(column[load_slice]).astype(float)

    def get_data_shape(self, data_key: str) -> tuple[int, ...]:
        if data_key in self._file_datasets_shapes:
            return self._file_datasets_shapes[data_key]
        return ()

    def get_data_dtype(self, data_key: str) -> np.dtype | None:
        if data_key in self._file_datasets_dtypes:
            return self._file_datasets_dtypes[data_key]
        return None

    def get_data_attributes(self, data_key: str) -> dict[str, Any]:
        """
        CSV has no per-dataset attributes; return a dict mapping the key to None.
        """
        self.logger.warning(
            f"You asked for attributes of '{data_key}', but CSVSource does not support data attributes."
        )
        return {data_key: None}
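
For orientation, a minimal usage sketch of the new class (not part of the commit; the file name, reference name, and column names are hypothetical, and the keyword arguments follow the attrs fields shown above):

import numpy as np

from modacor.io.csv.csv_source import CSVSource

# Construct the source; iosource_method_kwargs is forwarded verbatim to
# np.genfromtxt, so names=True takes the column names from the header row.
source = CSVSource(
    source_reference="saxs_curve",  # hypothetical reference name
    resource_location="scattering_data.csv",  # hypothetical file
    iosource_method_kwargs={"delimiter": ",", "names": True},
)

q = source.get_data("q")  # full column, cast to float
i_head = source.get_data("I", slice(0, 10))  # first ten rows only
print(source.get_data_shape("q"), source.get_data_dtype("q"))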
Lines changed: 27 additions & 14 deletions
@@ -13,13 +13,13 @@
 __status__ = "Development"  # "Development", "Production"
 # end of header and standard imports
 
-__all__ = ["HDFLoader"]
+__all__ = ["HDFSource"]
 
-from logging import WARNING
 from pathlib import Path
 
 import h5py
 import numpy as np
+from attrs import define, field, validators
 
 from modacor.dataclasses.messagehandler import MessageHandler
 
@@ -29,17 +29,30 @@
 from ..io_source import IoSource
 
 
-class HDFLoader(IoSource):
-    _data_cache: dict[str, np.ndarray] = None
-    _file_path: Path | None = None
-    _static_metadata_cache: dict[str, Any] = None
-
-    def __init__(self, source_reference: str, logging_level=WARNING, resource_location: Path | str | None = None):
-        super().__init__(source_reference=source_reference)
-        self.logger = MessageHandler(level=logging_level, name="HDFLoader")
-        self._file_path = Path(resource_location) if resource_location is not None else None
-        # self._file_reference = None  # let's not leave open file references lying around if we can help it.
-        self._file_datasets = []
+@define(kw_only=True)
+class HDFSource(IoSource):
+    resource_location: Path | str | None = field(
+        init=True, default=None, validator=validators.optional(validators.instance_of((Path, str)))
+    )
+    _data_cache: dict[str, np.ndarray] = field(init=False, factory=dict, validator=validators.instance_of(dict))
+    _file_path: Path | None = field(
+        init=False, default=None, validator=validators.optional(validators.instance_of(Path))
+    )
+    _file_datasets_shapes: dict[str, tuple[int, ...]] = field(
+        init=False, factory=dict, validator=validators.instance_of(dict)
+    )
+    _file_datasets_dtypes: dict[str, np.dtype] = field(init=False, factory=dict, validator=validators.instance_of(dict))
+    _static_metadata_cache: dict[str, Any] = field(init=False, factory=dict, validator=validators.instance_of(dict))
+    logger: MessageHandler = field(init=False)
+
+    # source_reference comes from IoSource
+    # iosource_method_kwargs comes from IoSource
+
+    def __attrs_post_init__(self):
+        # super().__init__(source_reference=source_reference)
+        self.logger = MessageHandler(level=self.logging_level, name="HDFSource")
+        self._file_path = Path(self.resource_location) if self.resource_location is not None else None
+        # self._file_datasets = []
         self._file_datasets_shapes = {}
         self._file_datasets_dtypes = {}
         self._data_cache = {}
@@ -61,7 +74,7 @@ def _find_datasets(self, path_name, path_object):
         the datasets within
         """
         if isinstance(path_object, h5py._hl.dataset.Dataset):
-            self._file_datasets.append(path_name)
+            # self._file_datasets.append(path_name)
             self._file_datasets_shapes[path_name] = path_object.shape
             self._file_datasets_dtypes[path_name] = path_object.dtype
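
The hunk above replaces a hand-written __init__ with attrs-generated construction. A generic, standalone sketch of that pattern (not MoDaCor code): @define(kw_only=True) generates a keyword-only __init__ from the declared fields, and __attrs_post_init__ then derives internal state.

from pathlib import Path

from attrs import define, field, validators


@define(kw_only=True)
class Example:
    # declared field: validated and assigned by the generated __init__
    resource_location: Path | str | None = field(
        default=None, validator=validators.optional(validators.instance_of((Path, str)))
    )
    # init=False fields are excluded from __init__ and set afterwards
    _file_path: Path | None = field(init=False, default=None)

    def __attrs_post_init__(self):
        # runs after attrs has set the declared fields
        self._file_path = Path(self.resource_location) if self.resource_location is not None else None


ex = Example(resource_location="data.h5")  # keyword-only: Example("data.h5") raises TypeError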

src/modacor/io/io_source.py

Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,8 @@
 
 from __future__ import annotations
 
+from logging import WARNING
+
 import attrs
 
 __coding__ = "utf-8"
@@ -67,6 +69,9 @@ class IoSource:
     configuration: dict[str, Any] = field(factory=default_config)
     source_reference: str = field(default="", converter=str, validator=attrs.validators.instance_of(str))
     type_reference: str = "IoSource"
+    # for passing extra kwargs to the data loading method if needed (e.g. csv_source)
+    iosource_method_kwargs: dict[str, Any] = field(factory=dict, validator=attrs.validators.instance_of(dict))
+    logging_level: int = field(default=WARNING, validator=attrs.validators.instance_of(int))
 
     def get_data(self, data_key: str, load_slice: Optional[ArraySlice] = None) -> np.ndarray:
         """
Lines changed: 16 additions & 13 deletions
@@ -11,14 +11,15 @@
 __status__ = "Development"  # "Development", "Production"
 # end of header and standard imports
 
-__all__ = ["YAMLLoader"]
+__all__ = ["YAMLSource"]
 
 from logging import WARNING
 from pathlib import Path
 from typing import Any
 
 import numpy as np
 import yaml
+from attrs import define, field, validators
 
 from modacor.dataclasses.messagehandler import MessageHandler
 from modacor.io.io_source import ArraySlice
@@ -37,7 +38,8 @@ def get_from_nested_dict_by_path(data, path):
     return data
 
 
-class YAMLLoader(IoSource):
+@define(kw_only=True)
+class YAMLSource(IoSource):
     """
     This IoSource is used to load and make experiment metadata available to
     the processing pipeline modules.
@@ -48,17 +50,18 @@ class YAMLLoader(IoSource):
     The entries are returned as BaseData elements, with units and uncertainties.
     """
 
-    _yaml_data: dict[str, Any] = dict()
-    _data_cache: dict[str, np.ndarray] = None
-    _file_path: Path | None = None
-    _static_metadata_cache: dict[str, Any] = None
-
-    def __init__(self, source_reference: str, logging_level=WARNING, resource_location: Path | str | None = None):
-        super().__init__(source_reference=source_reference)
-        self.logger = MessageHandler(level=logging_level, name="YAMLLoader")
-        self._file_path = Path(resource_location) if resource_location is not None else None
-        self._file_datasets = []
-        self._file_datasets_shapes = {}
+    resource_location: Path = field(converter=Path, validator=validators.instance_of(Path))
+    _yaml_data: dict[str, Any] = field(factory=dict, validator=validators.instance_of(dict))
+    _data_cache: dict[str, np.ndarray] = field(factory=dict, validator=validators.instance_of(dict))
+    _file_path: Path | None = field(default=None, validator=validators.optional(validators.instance_of(Path)))
+    _static_metadata_cache: dict[str, Any] = field(factory=dict, validator=validators.instance_of(dict))
+    logging_level: int = field(default=WARNING, validator=validators.instance_of(int))
+    logger: MessageHandler = field(init=False)
+
+    def __attrs_post_init__(self):
+        # super().__init__(source_reference=source_reference)
+        self.logger = MessageHandler(level=self.logging_level, name="YAMLSource")
+        self._file_path = Path(self.resource_location) if self.resource_location is not None else None
         self._data_cache = {}  # for values that are float
         self._static_metadata_cache = {}  # for other elements such as strings and tags
         self._preload()  # load the yaml data immediately
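
The second hunk header above references get_from_nested_dict_by_path; its body is not shown in this diff, but the name and the `return data` context suggest the usual key-by-key walk through nested dicts. A self-contained sketch of that behaviour (the YAML content here is hypothetical):

import yaml

document = yaml.safe_load(
    """
instrument:
  detector:
    distance: 1.5
"""
)

# walk the nested dicts one key at a time, as the helper's name implies
value = document
for key in ("instrument", "detector", "distance"):
    value = value[key]

assert value == 1.5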

0 commit comments