Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions attribution/ATTRIBUTION.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
| charset-normalizer | 3.4.6 | MIT |
| click | 8.3.1 | BSD-3-Clause |
| comm | 0.2.3 | BSD License |
| cyclonedx-python-lib | 9.1.0 | Apache Software License |
| cyclonedx-python-lib | 11.7.0 | Apache Software License |
| debugpy | 1.8.20 | MIT License |
| decorator | 5.2.1 | BSD License |
| defusedxml | 0.7.1 | Python Software Foundation License |
| duckdb | 1.5.0 | MIT License |
| duckdb | 1.5.1 | MIT License |
| exceptiongroup | 1.3.1 | MIT License |
| execnet | 2.1.2 | MIT License |
| executing | 2.2.1 | MIT License |
Expand All @@ -39,15 +39,15 @@
| matplotlib-inline | 0.2.1 | UNKNOWN |
| mdurl | 0.1.2 | MIT License |
| mktestdocs | 0.2.5 | MIT |
| mloda | 0.5.3 | Apache-2.0 |
| mloda | 0.5.5 | Apache-2.0 |
| mmh3 | 5.2.1 | MIT License |
| msgpack | 1.1.2 | Apache-2.0 |
| mypy | 1.19.1 | MIT License |
| mypy_extensions | 1.1.0 | MIT |
| nbclient | 0.10.4 | BSD License |
| nbformat | 5.10.4 | BSD License |
| nest-asyncio | 1.6.0 | BSD License |
| nltk | 3.9.3 | Apache Software License |
| nltk | 3.9.4 | Apache Software License |
| numpy | 2.2.6 | BSD License |
| opentelemetry-api | 1.40.0 | Apache-2.0 |
| opentelemetry-instrumentation | 0.61b0 | Apache Software License |
Expand All @@ -65,7 +65,7 @@
| pexpect | 4.9.0 | ISC License (ISCL) |
| pip-api | 0.0.34 | Apache Software License |
| pip-requirements-parser | 32.0.1 | MIT |
| pip_audit | 2.9.0 | Apache Software License |
| pip_audit | 2.10.0 | Apache Software License |
| platformdirs | 4.9.4 | MIT License |
| pluggy | 1.6.0 | MIT License |
| polars | 1.39.3 | MIT License |
Expand All @@ -90,7 +90,7 @@
| pyzmq | 27.1.0 | BSD License |
| referencing | 0.37.0 | MIT |
| regex | 2026.2.28 | Apache-2.0 AND CNRI-Python |
| requests | 2.32.5 | Apache Software License |
| requests | 2.33.0 | Apache Software License |
| rich | 14.3.3 | MIT License |
| rpds-py | 0.30.0 | MIT |
| ruff | 0.15.7 | MIT License |
Expand All @@ -105,12 +105,13 @@
| testbook | 0.4.2 | BSD License |
| threadpoolctl | 3.6.0 | BSD License |
| toml | 0.10.2 | MIT License |
| tomli_w | 1.2.0 | MIT License |
| tornado | 6.5.5 | Apache Software License |
| tqdm | 4.67.3 | MPL-2.0 AND MIT |
| traitlets | 5.14.3 | BSD License |
| typeguard | 4.5.1 | MIT |
| types-PyYAML | 6.0.12.20250915 | Apache-2.0 |
| types-requests | 2.32.4.20260107 | Apache-2.0 |
| types-requests | 2.32.4.20260324 | Apache-2.0 |
| types-toml | 0.10.8.20240310 | Apache Software License |
| typing-inspect | 0.9.0 | MIT License |
| typing-inspection | 0.4.2 | MIT |
Expand Down
22 changes: 22 additions & 0 deletions docs/docs/in_depth/feature-chain-parser.md
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,28 @@ class SklearnPipelineFeatureGroup(FeatureChainParserMixin, FeatureGroup):
return base_match
```

#### 4. Operation Resolution with `_resolve_operation()`

Use this helper to extract the operation type from either the feature name pattern or a config key, without calling `FeatureChainParser` directly:

```python
class AggregatedFeatureGroup(FeatureChainParserMixin, FeatureGroup):
AGGREGATION_TYPE = "aggregation_type"

@classmethod
def _extract_aggregation_type(cls, feature: Feature) -> Optional[str]:
# Two-arg form: pass a Feature and the config key
return cls._resolve_operation(feature, cls.AGGREGATION_TYPE)

@classmethod
def calculate_feature(cls, data, features):
for feature in features.features:
agg_type = cls._resolve_operation(feature, cls.AGGREGATION_TYPE)
# ... use agg_type
```

The three-arg form `cls._resolve_operation(feature_name, options, config_key)` is also supported for contexts where the name and options are already separate (e.g., `match_feature_group_criteria`).

## Modern Implementation in Feature Groups

### 1. Define PROPERTY_MAPPING Configuration
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,57 @@ def _extract_source_features(cls, feature: Feature) -> List[str]:
# Configuration-based fallback using get_in_features()
in_features_set = feature.options.get_in_features()
return [f.get_name() for f in in_features_set]

@classmethod
def _resolve_operation(
    cls,
    feature_or_name: Any,
    options_or_key: Any,
    config_key: Optional[str] = None,
) -> Optional[str]:
    """Resolve an operation type from a chained feature name or from options.

    Feature groups frequently need an operation identifier (aggregation type,
    scaler type, algorithm name, ...) that may be encoded either in the feature
    name itself (matched against PREFIX_PATTERN) or under a key in the
    feature's options. This helper performs that dual lookup, with the
    name-based path always taking precedence.

    Two calling conventions are accepted:

    1. ``cls._resolve_operation(feature, config_key)`` -- a Feature plus the
       fallback options key; name and options are taken from the Feature.
    2. ``cls._resolve_operation(feature_name, options, config_key)`` -- the
       name (str or FeatureName) and an Options object passed separately.

    Args:
        feature_or_name: A Feature (convention 1) or a feature name as
            str/FeatureName (convention 2).
        options_or_key: The config_key str (convention 1) or an Options
            object (convention 2).
        config_key: The options key to fall back on (convention 2 only).

    Returns:
        The resolved operation as a string, or None when neither the name
        pattern nor the options key yields a value.
    """
    # Detect the shorthand (Feature, config_key) form purely by argument types.
    if isinstance(feature_or_name, Feature) and isinstance(options_or_key, str):
        name = feature_or_name.get_name()
        opts = feature_or_name.options
        key = options_or_key
    else:
        name = feature_or_name.name if isinstance(feature_or_name, FeatureName) else str(feature_or_name)
        opts = options_or_key
        key = "" if config_key is None else config_key

    # Name-based resolution wins over configuration when the pattern matches.
    parsed, _ = FeatureChainParser.parse_feature_name(name, cls._get_prefix_patterns(), CHAIN_SEPARATOR)
    if parsed is not None:
        return parsed

    # Fall back to the configured value, normalised to str.
    configured = opts.get(key)
    return None if configured is None else str(configured)
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,7 @@ def _extract_aggregation_type(cls, feature: Feature) -> Optional[str]:
Returns:
The aggregation type, or None if not found
"""
# Try string-based parsing first
aggregation_type, _ = FeatureChainParser.parse_feature_name(feature.name, [cls.PREFIX_PATTERN])
if aggregation_type is not None:
return aggregation_type

# Fall back to configuration
aggregation_type = feature.options[cls.AGGREGATION_TYPE]
return str(aggregation_type) if aggregation_type is not None else None
return cls._resolve_operation(feature, cls.AGGREGATION_TYPE)

@classmethod
def _extract_aggr_and_source_feature(cls, feature: Feature) -> tuple[str, str]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,22 +208,14 @@ def _extract_scaler_type(cls, feature: Feature) -> Optional[str]:
Raises:
ValueError: If scaler type is unsupported
"""
feature_name_str = feature.name.name if hasattr(feature.name, "name") else str(feature.name)

# Try string-based parsing first
if FeatureChainParser.is_chained_feature(feature_name_str):
scaler_type = cls.get_scaler_type(feature_name_str)
return scaler_type

# Fall back to configuration-based approach
scaler_type = feature.options[cls.SCALER_TYPE]
scaler_type = cls._resolve_operation(feature, cls.SCALER_TYPE)

if scaler_type is not None and scaler_type not in cls.SUPPORTED_SCALERS:
raise ValueError(
f"Unsupported scaler type: {scaler_type}. Supported types: {', '.join(cls.SUPPORTED_SCALERS.keys())}"
)

return str(scaler_type) if scaler_type is not None else None
return scaler_type

@classmethod
def _import_sklearn_components(cls) -> Dict[str, Any]:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
"""Tests for FeatureChainParserMixin._resolve_operation helper."""

from __future__ import annotations

from typing import Any, Dict

from mloda.core.abstract_plugins.components.feature_chainer.feature_chain_parser_mixin import (
FeatureChainParserMixin,
)
from mloda.core.abstract_plugins.components.feature_name import FeatureName
from mloda.core.abstract_plugins.components.options import Options
from mloda.user import Feature
from mloda.user import mloda
from mloda.user import PluginCollector
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
from mloda_plugins.feature_group.experimental.aggregated_feature_group.base import AggregatedFeatureGroup
from mloda_plugins.feature_group.experimental.aggregated_feature_group.pandas import PandasAggregatedFeatureGroup
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys

from tests.test_plugins.integration_plugins.test_data_creator import ATestDataCreator


class MockResolverFG(FeatureChainParserMixin):
    """Mock feature group for testing _resolve_operation."""

    # Regex used by the string-based path: captures the operation token from
    # names shaped like "<source>__<op>_op" (e.g. "source__sum_op" -> "sum").
    PREFIX_PATTERN = r".*__([\w]+)_op$"
    # Options key used by the configuration-based fallback path.
    AGGREGATION_TYPE = "aggregation_type"

    # Declarative option mapping: supported operation values plus the
    # context/strict-validation flags — presumably consumed by the mixin's
    # option-matching machinery (see FeatureChainParserMixin docs to confirm).
    PROPERTY_MAPPING = {
        "aggregation_type": {
            "sum": "Sum",
            "avg": "Average",
            DefaultOptionKeys.context: True,
            DefaultOptionKeys.strict_validation: True,
        },
    }


class TestResolveOperationUnit:
    """Unit-level coverage of the three-argument _resolve_operation form."""

    def test_returns_parsed_operation_from_string(self) -> None:
        """A name matching PREFIX_PATTERN yields the captured operation."""
        opts = Options(context={"aggregation_type": "sum"})
        assert MockResolverFG._resolve_operation("source__sum_op", opts, "aggregation_type") == "sum"

    def test_returns_config_when_pattern_does_not_match(self) -> None:
        """A non-matching name falls back to options[config_key]."""
        opts = Options(context={"aggregation_type": "avg"})
        assert MockResolverFG._resolve_operation("plain_name", opts, "aggregation_type") == "avg"

    def test_returns_none_when_neither_matches(self) -> None:
        """No pattern match and no config entry resolves to None."""
        assert MockResolverFG._resolve_operation("plain_name", Options(context={}), "aggregation_type") is None

    def test_string_path_takes_precedence_over_config(self) -> None:
        """The parsed name wins even when the config key holds another value."""
        opts = Options(context={"aggregation_type": "avg"})
        assert MockResolverFG._resolve_operation("source__sum_op", opts, "aggregation_type") == "sum"

    def test_works_with_feature_name_object(self) -> None:
        """FeatureName instances are accepted interchangeably with strings."""
        opts = Options(context={"aggregation_type": "sum"})
        assert MockResolverFG._resolve_operation(FeatureName("source__sum_op"), opts, "aggregation_type") == "sum"


class TestResolveOperationIntegration:
    """Cross-path checks: string parsing and config lookup agree."""

    def test_string_and_config_resolve_same_value(self) -> None:
        """Equivalent inputs resolve identically through either path."""
        via_name = MockResolverFG._resolve_operation(
            "source__sum_op", Options(context={"aggregation_type": "sum"}), "aggregation_type"
        )
        via_config = MockResolverFG._resolve_operation(
            "my_result", Options(context={"aggregation_type": "sum"}), "aggregation_type"
        )
        assert via_name == via_config == "sum"

    def test_config_returns_string_value(self) -> None:
        """Values pulled from options are normalised to str."""
        resolved = MockResolverFG._resolve_operation(
            "my_result", Options(context={"aggregation_type": "avg"}), "aggregation_type"
        )
        assert isinstance(resolved, str)
        assert resolved == "avg"


class TestResolveOperationFeatureShorthand:
    """Coverage of the two-argument (Feature, config_key) convention."""

    def test_feature_shorthand_string_match(self) -> None:
        """The feature's own name is parsed when it matches PREFIX_PATTERN."""
        feat = Feature("source__sum_op", options=Options(context={"aggregation_type": "sum"}))
        assert MockResolverFG._resolve_operation(feat, "aggregation_type") == "sum"

    def test_feature_shorthand_config_fallback(self) -> None:
        """Without a name match, the feature's options supply the value."""
        feat = Feature("my_result", options=Options(context={"aggregation_type": "avg"}))
        assert MockResolverFG._resolve_operation(feat, "aggregation_type") == "avg"

    def test_feature_shorthand_returns_none(self) -> None:
        """Neither path resolving produces None."""
        feat = Feature("my_result", options=Options(context={}))
        assert MockResolverFG._resolve_operation(feat, "aggregation_type") is None

    def test_feature_shorthand_matches_three_arg_form(self) -> None:
        """The shorthand and the explicit form agree for the same feature."""
        feat = Feature("source__sum_op", options=Options(context={"aggregation_type": "sum"}))
        assert (
            MockResolverFG._resolve_operation(feat, "aggregation_type")
            == MockResolverFG._resolve_operation("source__sum_op", feat.options, "aggregation_type")
            == "sum"
        )


class ResolveOperationTestDataCreator(ATestDataCreator):
    """Supplies a small Sales column for the end-to-end resolution tests."""

    compute_framework = PandasDataFrame

    @classmethod
    def get_raw_data(cls) -> Dict[str, Any]:
        """Return the fixed raw fixture: five Sales values summing to 1500."""
        raw: Dict[str, Any] = {"Sales": [100, 200, 300, 400, 500]}
        return raw


class TestResolveOperationRunAll:
    """End-to-end: _resolve_operation behaves correctly inside mloda.run_all()."""

    @staticmethod
    def _collector() -> Any:
        """Enable exactly the plugins these tests rely on."""
        return PluginCollector.enabled_feature_groups(
            {ResolveOperationTestDataCreator, PandasAggregatedFeatureGroup}
        )

    def test_string_based_feature_via_run_all(self) -> None:
        """A name-encoded aggregation feature resolves through the engine."""
        frames = mloda.run_all(
            ["Sales__sum_aggr"],
            compute_frameworks={PandasDataFrame},
            plugin_collector=self._collector(),
        )

        assert len(frames) == 1
        frame = frames[0]
        assert "Sales__sum_aggr" in frame.columns
        assert frame["Sales__sum_aggr"].iloc[0] == 1500

    def test_config_based_feature_via_run_all(self) -> None:
        """A config-driven aggregation feature resolves through the engine."""
        requested = Feature(
            "total_sales",
            Options(
                context={
                    AggregatedFeatureGroup.AGGREGATION_TYPE: "sum",
                    DefaultOptionKeys.in_features: "Sales",
                }
            ),
        )

        frames = mloda.run_all(
            [requested],
            compute_frameworks={PandasDataFrame},
            plugin_collector=self._collector(),
        )

        assert len(frames) == 1
        frame = frames[0]
        assert "total_sales" in frame.columns
        assert frame["total_sales"].iloc[0] == 1500
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,7 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
class TestListValuedOptionsE2E:
"""End-to-end tests for list-valued options through the mloda pipeline."""

plugin_collector = PluginCollector.enabled_feature_groups(
{ListValuedTestDataCreator, ListValuedFeatureGroup}
)
plugin_collector = PluginCollector.enabled_feature_groups({ListValuedTestDataCreator, ListValuedFeatureGroup})

def test_list_valued_option_order_preserved(self) -> None:
"""List-valued option order is preserved through the pipeline.
Expand Down
Loading