From 084713e3eaed6d7b6fa2058eba6216c85a774626 Mon Sep 17 00:00:00 2001 From: v1docq Date: Fri, 6 Mar 2026 18:31:09 +0300 Subject: [PATCH 01/32] sampling zoo integration first stage --- ...sampling_stage_change_review_2026-03-06.md | 266 ++++++ docs/source/advanced/index.rst | 1 + docs/source/advanced/sampling_stage.rst | 105 +++ .../examples/amlb_sampling_benchmark.rst | 75 ++ .../examples/classification_example.rst | 6 + docs/source/examples/index.rst | 1 + examples/README.rst | 4 + examples/benchmark/__init__.py | 1 + examples/benchmark/run_amlb.py | 818 ++++++++++++++++++ .../classification/sampling_stage_example.py | 38 + fedot/api/api_utils/api_params_repository.py | 20 +- .../api_utils/assumptions/task_assumptions.py | 16 +- fedot/api/api_utils/presets.py | 2 +- fedot/api/builder.py | 5 + fedot/api/main.py | 166 ++-- fedot/api/sampling_stage/__init__.py | 13 + fedot/api/sampling_stage/config.py | 172 ++++ fedot/api/sampling_stage/executor.py | 325 +++++++ fedot/api/sampling_stage/providers.py | 295 +++++++ .../sklearn_transformations.py | 18 +- .../objective/data_objective_eval.py | 2 + fedot/core/pipelines/node.py | 14 +- fedot/preprocessing/data_types.py | 26 +- other_requirements/sampling_zoo.txt | 2 + setup.py | 2 +- .../api/test_sampling_stage_integration.py | 230 +++++ test/unit/api/test_api_params.py | 18 + test/unit/api/test_sampling_stage.py | 191 ++++ test/unit/api/test_sampling_stage_provider.py | 20 + .../test_data_operations_implementations.py | 54 +- .../examples/test_amlb_sampling_benchmark.py | 101 +++ 31 files changed, 2918 insertions(+), 89 deletions(-) create mode 100644 docs/files/sampling_stage_change_review_2026-03-06.md create mode 100644 docs/source/advanced/sampling_stage.rst create mode 100644 docs/source/examples/amlb_sampling_benchmark.rst create mode 100644 examples/benchmark/__init__.py create mode 100644 examples/benchmark/run_amlb.py create mode 100644 examples/simple/classification/sampling_stage_example.py create mode 100644 
fedot/api/sampling_stage/__init__.py create mode 100644 fedot/api/sampling_stage/config.py create mode 100644 fedot/api/sampling_stage/executor.py create mode 100644 fedot/api/sampling_stage/providers.py create mode 100644 other_requirements/sampling_zoo.txt create mode 100644 test/integration/api/test_sampling_stage_integration.py create mode 100644 test/unit/api/test_sampling_stage.py create mode 100644 test/unit/api/test_sampling_stage_provider.py create mode 100644 test/unit/examples/test_amlb_sampling_benchmark.py diff --git a/docs/files/sampling_stage_change_review_2026-03-06.md b/docs/files/sampling_stage_change_review_2026-03-06.md new file mode 100644 index 0000000000..9dbac0af16 --- /dev/null +++ b/docs/files/sampling_stage_change_review_2026-03-06.md @@ -0,0 +1,266 @@ +# Sampling Stage Integration Review +Date: 2026-03-06 +Scope: changes for `sampling_config` + pre-fit `sampling_stage` integration into `Fedot.fit`. + +## Final Stage Status +- Attempted final test run: + - `python -m pytest ...` -> `No module named pytest` +- Final pytest stage is skipped as not runnable in current environment. +- Fallback verification completed: + - `python -m py_compile` passed for all changed/new Python files. + +## Change Scope (Implemented) +- API/defaults: `sampling_config` added and validated. +- New subsystem: `fedot/api/sampling_stage/{config.py,providers.py,executor.py}`. +- Fit integration: sampling stage executed before composition, metadata exposed. +- Optional dependency: `fedot[sampling_zoo]` extra added. +- Tests: unit + integration tests for config, provider, executor, and fit behavior. +- Docs/examples: advanced guide, README section, classification example note. + +## 1) Architecture Review + +### A1. Provider Contract Is Heuristic and Version-Sensitive +Problem: +- `SamplingZooProvider` discovers indices through multiple fallback paths (`sample_indices`, attrs, `get_partitions`) without a strict external contract. 
+ +Why it matters: +- Changes in Sampling Zoo internals can break extraction logic, causing hard failures or incorrect sampling behavior. + +Options: +1. Do nothing. +- Effort: Low +- Risk: Medium +- Payoff: Low +- Maintenance cost: Medium + +2. Define and enforce a strict provider adapter contract for FEDOT integration. +- Effort: Medium +- Risk: Low +- Payoff: High +- Maintenance cost: Low + +3. Maintain versioned adapters (`sampling_zoo_v1`, `sampling_zoo_v2`) with explicit compatibility checks. +- Effort: Medium/High +- Risk: Low +- Payoff: High +- Maintenance cost: Medium + +Recommended: +- Option 2 for near term; move to Option 3 when multiple Sampling Zoo API generations must be supported. + +### A2. Subset Construction May Share Mutable Supplementary State +Problem: +- `SamplingStageExecutor._subset_by_positions` reuses `data.supplementary_data` reference when creating reduced `InputData`. + +Why it matters: +- Mutable shared state can create non-obvious side effects across stages/pipelines. + +Options: +1. Do nothing. +- Effort: Low +- Risk: Medium +- Payoff: Low +- Maintenance cost: Medium + +2. Deep-copy `supplementary_data` for sampled dataset. +- Effort: Low +- Risk: Low +- Payoff: Medium +- Maintenance cost: Low + +3. Introduce immutable or copy-on-write semantics for supplementary metadata. +- Effort: Medium/High +- Risk: Low +- Payoff: High +- Maintenance cost: Medium + +Recommended: +- Option 2 now; Option 3 only if broader data mutability problems appear. + +## 2) Code Quality Review + +### C1. Guard Validation Is Partially Type-Specific +Problem: +- Heavy-parameter guards in `validate_sampling_config` primarily check integer forms (`n_partitions`, `sample_size`, etc.), while some non-int shapes may bypass limits. + +Why it matters: +- Invalid or heavy configs may slip through validation and produce expensive runtime behavior. + +Options: +1. Do nothing. +- Effort: Low +- Risk: Medium +- Payoff: Low +- Maintenance cost: Medium + +2. 
Normalize and validate all accepted numeric representations (int/float/list/tuple where relevant). +- Effort: Medium +- Risk: Low +- Payoff: High +- Maintenance cost: Low + +3. Add provider-specific schema validation plugins. +- Effort: Medium/High +- Risk: Low +- Payoff: High +- Maintenance cost: Medium + +Recommended: +- Option 2 in V1 hardening; Option 3 only when multiple providers are active. + +### C2. Final Sampling Randomly Re-Selects from Extracted Indices +Problem: +- After extracting candidate indices from strategy output, provider may randomly choose a subset up to `sample_size`. + +Why it matters: +- If strategy output is already ranked/structured, extra random reduction can weaken algorithm intent and reproducibility semantics. + +Options: +1. Do nothing. +- Effort: Low +- Risk: Medium +- Payoff: Low +- Maintenance cost: Low + +2. Prefer strategy-native final selection when available; fallback to random only if needed. +- Effort: Medium +- Risk: Low +- Payoff: High +- Maintenance cost: Low + +3. Require strategy to return exactly final indices count and fail otherwise. +- Effort: Medium +- Risk: Medium +- Payoff: High +- Maintenance cost: Medium + +Recommended: +- Option 2 for compatibility + quality balance. + +## 3) Test Review + +### T1. No Executed End-to-End Test with Real Sampling Zoo in This Environment +Problem: +- Tests were authored, but final execution is blocked by missing `pytest` package; no runtime E2E signal with installed optional dependency. + +Why it matters: +- Integration defects can remain hidden until real environment execution. + +Options: +1. Do nothing. +- Effort: Low +- Risk: High +- Payoff: Low +- Maintenance cost: Low + +2. Add CI lane with `fedot[sampling_zoo]` and run dedicated markers. +- Effort: Medium +- Risk: Low +- Payoff: High +- Maintenance cost: Medium + +3. Add nightly AMLB-style smoke benchmark for sampling stage. 
+- Effort: Medium/High +- Risk: Low +- Payoff: High +- Maintenance cost: Medium/High + +Recommended: +- Option 2 immediately; Option 3 as performance/quality observability extension. + +### T2. Missing Regression Cases for DataFrame Features and Metadata Isolation +Problem: +- Tests mostly use numpy-like datasets and mocked provider paths. + +Why it matters: +- Potential regressions in DataFrame handling and shared supplementary metadata may not be detected. + +Options: +1. Do nothing. +- Effort: Low +- Risk: Medium +- Payoff: Low +- Maintenance cost: Low + +2. Add unit/integration tests for DataFrame features, categorical columns, and supplementary metadata isolation. +- Effort: Low/Medium +- Risk: Low +- Payoff: High +- Maintenance cost: Low + +3. Add property-based tests for sampling indices and data consistency invariants. +- Effort: Medium +- Risk: Low +- Payoff: High +- Maintenance cost: Medium + +Recommended: +- Option 2 now; Option 3 later if index-related bugs appear in production. + +## 4) Performance Review + +### P1. Repeated Feature Encoding for Each Candidate Ratio +Problem: +- The effective-size protocol rebuilds training matrices and model fits for each candidate. + +Why it matters: +- Sampling overhead can consume a meaningful part of budget on medium/large tabular datasets. + +Options: +1. Do nothing. +- Effort: Low +- Risk: Medium +- Payoff: Low +- Maintenance cost: Low + +2. Cache transformed validation matrix and reusable feature engineering outputs. +- Effort: Medium +- Risk: Low +- Payoff: Medium/High +- Maintenance cost: Medium + +3. Add adaptive candidate schedule with early elimination and dynamic stopping. +- Effort: Medium +- Risk: Low +- Payoff: High +- Maintenance cost: Medium + +Recommended: +- Option 3 plus targeted caching from Option 2 for the largest workloads. + +### P2. Fixed RF Baseline Complexity (`n_estimators=100`) +Problem: +- Baseline model cost is fixed and may be too expensive under tight time budgets. 
+ +Why it matters: +- High stage cost can reduce AutoML search time and offset sampling benefit. + +Options: +1. Do nothing. +- Effort: Low +- Risk: Medium +- Payoff: Low +- Maintenance cost: Low + +2. Add lightweight baseline config (`n_estimators`, depth, model family) in `sampling_config`. +- Effort: Medium +- Risk: Low +- Payoff: High +- Maintenance cost: Low + +3. Auto-scale baseline complexity from dataset size and stage budget. +- Effort: Medium/High +- Risk: Medium +- Payoff: High +- Maintenance cost: Medium + +Recommended: +- Option 2 first; Option 3 later when benchmark telemetry is available. + +## Consolidated Recommendation +- The current implementation is a solid V1 integration aligned with fail-fast and dynamic cap constraints. +- Main hardening targets before production broad rollout: + - enforce stricter provider contract, + - isolate mutable dataset metadata, + - extend guard validation, + - execute CI with real optional dependency. \ No newline at end of file diff --git a/docs/source/advanced/index.rst b/docs/source/advanced/index.rst index 105064774d..d5993a8962 100644 --- a/docs/source/advanced/index.rst +++ b/docs/source/advanced/index.rst @@ -8,6 +8,7 @@ Advanced usage automated_pipelines_design hyperparameters_tuning data_preprocessing + sampling_stage project_import_export pipeline_import_export cli_call diff --git a/docs/source/advanced/sampling_stage.rst b/docs/source/advanced/sampling_stage.rst new file mode 100644 index 0000000000..75b2b82d7a --- /dev/null +++ b/docs/source/advanced/sampling_stage.rst @@ -0,0 +1,105 @@ +Sampling Stage Before AutoML Search +================================== + +FEDOT supports an optional ``sampling_stage`` in ``Fedot.fit()``. +This stage runs before evolutionary pipeline composition and can reduce +training set size for classification/regression tabular tasks. + +The stage is controlled by a single API parameter: ``sampling_config``. 
+If ``sampling_config`` is ``None`` (default), FEDOT behavior is unchanged. + +Quick Start +----------- + +For AMLB-style benchmark execution with sampling mode and saved optimization artifacts, see :doc:`/examples/amlb_sampling_benchmark`. + +.. code-block:: python + + from fedot import Fedot + + model = Fedot( + problem='classification', + timeout=10, + sampling_config={ + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.15, 0.2, 0.3, 0.5], + 'delta_metric_threshold': 0.03, + } + ) + + model.fit(features=x_train, target=y_train) + + # Metadata is available after fit + print(model.sampling_stage_metadata) + +V1 Scope and Behavior +--------------------- + +- Supported tasks: ``classification`` and ``regression`` only. +- Supported data container: ``InputData`` only. +- Supported data type: tabular only. +- Stage is applied only in ``fit`` (not in ``predict`` and not in ``tune``). +- Error mode is ``fail_fast`` only in V1. +- Budget policy is ``dynamic_cap`` only in V1. +- Artifact mode is ``minimal`` only in V1. + +If the stage cannot be executed in these constraints, FEDOT raises an exception. + +Effective Size Protocol +----------------------- + +The stage uses an internal protocol to choose the effective ratio: + +1. Split train data into internal train/validation parts. +2. Train a light baseline model (Random Forest) on full internal train split. +3. For each candidate ratio: + + - Run sampling and obtain candidate indices. + - Train the same light model on sampled internal train split. + - Measure metric delta on internal validation split. + +4. Select the smallest ratio that satisfies ``delta_metric_threshold``. + +If none of candidate ratios satisfies the threshold, FEDOT raises an exception. + +Main ``sampling_config`` Fields +------------------------------- + +- ``provider``: sampling provider name (V1 supports ``sampling_zoo``). +- ``strategy``: strategy identifier passed to provider. 
+- ``strategy_params``: provider-specific strategy kwargs. +- ``candidate_ratios``: ordered ratios in ``(0, 1]``. +- ``delta_metric_threshold``: allowed quality drop. +- ``delta_type``: ``relative`` or ``absolute``. +- ``validation_size``: internal validation size in ``(0, 1)``. +- ``cap_max_timeout_share``: maximal timeout share for stage. +- ``min_automl_time_minutes``: guaranteed minimal time left for AutoML after stage. +- ``infinite_timeout_cap_minutes``: absolute stage cap when timeout is infinite. +- ``random_state``: random seed. + +Performance Guards +------------------ + +For heavy strategies, config validation checks guard limits: + +- ``guard_max_rank`` +- ``guard_max_modes`` +- ``guard_max_partitions`` +- ``guard_max_sample_size`` + +These limits prevent unexpectedly expensive strategy parameters. + +Optional Dependency +------------------- + +Sampling Zoo integration is optional. + +.. code-block:: bash + + pip install "fedot[sampling_zoo]" + +If dependency is unavailable and sampling stage is enabled, FEDOT raises +``ModuleNotFoundError`` in ``fail_fast`` mode. + + diff --git a/docs/source/examples/amlb_sampling_benchmark.rst b/docs/source/examples/amlb_sampling_benchmark.rst new file mode 100644 index 0000000000..f5cd44d49c --- /dev/null +++ b/docs/source/examples/amlb_sampling_benchmark.rst @@ -0,0 +1,75 @@ +AMLB Sampling Benchmark +======================= + +FEDOT provides a benchmark entrypoint for AMLB-style tabular datasets with pre-fit sampling stage. + +Script path: + +- ``examples/benchmark/run_amlb.py`` + +Default behavior +---------------- + +- Uses AMLB category profile ``amlb_top20_mix``. +- Runs two modes for each dataset: + + - ``fedot_full_dataset`` (baseline, no sampling). + - ``fedot_sampling_stage`` (sampling enabled through ``sampling_config``). + +- Time budget per dataset is **15 minutes** by default. 
+- Saves optimization artifacts for each run: + + - ``opt_history.json`` + - history visualizations (fitness, KDE, animated bars/diversity where available) + - pipeline visualization + - predictions, metrics and timing reports + +Quick start +----------- + +.. code-block:: bash + + python examples/benchmark/run_amlb.py + +Custom run examples +------------------- + +Run only sampling mode: + +.. code-block:: bash + + python examples/benchmark/run_amlb.py --disable-baseline + +Run specific AMLB datasets with fixed 15-minute budget: + +.. code-block:: bash + + python examples/benchmark/run_amlb.py \ + --datasets amlb_adult amlb_credit_g \ + --timeout-minutes 15 + +Tune sampling protocol options: + +.. code-block:: bash + + python examples/benchmark/run_amlb.py \ + --candidate-ratios 0.15,0.2,0.3,0.5 \ + --delta-threshold 0.03 \ + --sampling-strategy random + +Output artifacts +---------------- + +By default results are stored under: + +- ``examples/benchmark/results/run_amlb_fedot_sampling_/`` + +Run-level files: + +- ``run_meta.json`` +- ``benchmark_runs.csv`` +- ``benchmark_runs.json`` +- ``report.md`` +- ``run_summary.json`` + +Per-dataset and per-mode files include all optimization and visualization artifacts produced during run. diff --git a/docs/source/examples/classification_example.rst b/docs/source/examples/classification_example.rst index 1d420ecf60..3e20434f90 100644 --- a/docs/source/examples/classification_example.rst +++ b/docs/source/examples/classification_example.rst @@ -44,6 +44,12 @@ You also can define metric parameter (ROC-AUC in this example), timeout in minut Class ``Fedot.__init__()`` has more, e.g. ``n_jobs`` for parallelization. For more details, see the :doc:`FEDOT API ` section in our documentation. +.. note:: + + You can also enable optional pre-fit sampling via ``sampling_config``. + See :doc:`sampling stage guide ` and + ``examples/simple/classification/sampling_stage_example.py``. + To train our model we should call method ``fit()``. 
This method returns the best pipeline was obtained during optimization. .. code-block:: python diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 9218b72b0d..b6b5f5308d 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -17,3 +17,4 @@ In this section you can find notebooks and useful pipeline structures for variou classification_pipelines regression_pipelines ts_pipelines + amlb_sampling_benchmark diff --git a/examples/README.rst b/examples/README.rst index 7acde2b205..2b783a7eef 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -4,6 +4,8 @@ Look at the structure provide below: simple - for new users +benchmark - benchmark runners and reproducible experiment entrypoints + advanced - for advanced users - simple @@ -11,6 +13,8 @@ advanced - for advanced users - regression - devoted to solving basic regression tasks via API - time_series_forecasting - devoted to solving basic time series forecasting tasks via API - interpretable - shows how pipelines can be interpreted +- benchmark + - AMLB benchmark entrypoint with FEDOT sampling stage and saved optimization visualizations - advanced - automl - shows how other autoML solutions can be compatible with FEDOT - decompose - examples devoted to decomposition diff --git a/examples/benchmark/__init__.py b/examples/benchmark/__init__.py new file mode 100644 index 0000000000..de9d8e095c --- /dev/null +++ b/examples/benchmark/__init__.py @@ -0,0 +1 @@ +"""Benchmark examples for FEDOT.""" diff --git a/examples/benchmark/run_amlb.py b/examples/benchmark/run_amlb.py new file mode 100644 index 0000000000..2edb35b25e --- /dev/null +++ b/examples/benchmark/run_amlb.py @@ -0,0 +1,818 @@ +from __future__ import annotations + +import argparse +import inspect +import json +import sys +import traceback +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path +from time import perf_counter +from typing import Any, 
Dict, List, Mapping, Optional, Sequence, Tuple + +import matplotlib + +matplotlib.use('Agg') +import numpy as np +import pandas as pd +from sklearn.datasets import fetch_openml +from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score +from sklearn.model_selection import train_test_split + +ROOT_DIR = Path(__file__).resolve().parents[2] +if str(ROOT_DIR) not in sys.path: + sys.path.insert(0, str(ROOT_DIR)) + +DEFAULT_TIMEOUT_MINUTES_PER_DATASET = 10.0 +DEFAULT_RESULTS_ROOT = Path('examples/benchmark/results') +DEFAULT_CATEGORY_PROFILE = ('amlb_top20_mix',) + + +@dataclass(frozen=True) +class AMLBDatasetSpec: + name: str + openml_name: Optional[str] = None + openml_id: Optional[int] = None + task_type: str = 'classification' + + +@dataclass(frozen=True) +class LoadedDataset: + spec: AMLBDatasetSpec + x_train: pd.DataFrame + x_test: pd.DataFrame + y_train: pd.DataFrame + y_test: pd.DataFrame + metadata: Dict[str, Any] + + +@dataclass(frozen=True) +class BenchmarkRunConfig: + timeout_minutes_per_dataset: float = DEFAULT_TIMEOUT_MINUTES_PER_DATASET + output_root: Path = DEFAULT_RESULTS_ROOT + seed: int = 42 + n_jobs: int = -1 + preset: str = 'best_quality' + with_tuning: bool = True + dataset_names: Tuple[str, ...] = () + amlb_categories: Tuple[str, ...] 
= DEFAULT_CATEGORY_PROFILE + test_size: float = 0.2 + max_rows_per_dataset: int = 25000 + include_baseline: bool = True + include_sampling: bool = True + sampling_config: Dict[str, Any] = field(default_factory=dict) + + +def _default_sampling_config(seed: int) -> Dict[str, Any]: + return { + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'strategy_params': {}, + 'candidate_ratios': [0.15, 0.2, 0.3, 0.5], + 'delta_metric_threshold': 0.03, + 'delta_type': 'relative', + 'validation_size': 0.2, + 'budget_policy': 'dynamic_cap', + 'cap_max_timeout_share': 0.35, + 'min_automl_time_minutes': 0.1, + 'infinite_timeout_cap_minutes': 5.0, + 'error_policy': 'fail_fast', + 'artifact_mode': 'minimal', + 'random_state': seed, + } + + +AMLB_DATASETS: Dict[str, AMLBDatasetSpec] = { + 'amlb_adult': AMLBDatasetSpec(name='amlb_adult', openml_name='adult'), + 'amlb_covertype': AMLBDatasetSpec(name='amlb_covertype', openml_name='covertype'), + 'amlb_optdigits': AMLBDatasetSpec(name='amlb_optdigits', openml_name='optdigits'), + 'amlb_vehicle': AMLBDatasetSpec(name='amlb_vehicle', openml_name='vehicle'), + 'amlb_mfeat_factors': AMLBDatasetSpec(name='amlb_mfeat_factors', openml_name='mfeat-factors'), + 'amlb_segment': AMLBDatasetSpec(name='amlb_segment', openml_name='segment'), + 'amlb_credit_g': AMLBDatasetSpec(name='amlb_credit_g', openml_name='credit-g'), + 'amlb_kr_vs_kp': AMLBDatasetSpec(name='amlb_kr_vs_kp', openml_name='kr-vs-kp'), + 'amlb_sick': AMLBDatasetSpec(name='amlb_sick', openml_name='sick'), + 'amlb_spambase': AMLBDatasetSpec(name='amlb_spambase', openml_name='spambase'), + 'amlb_letter': AMLBDatasetSpec(name='amlb_letter', openml_name='letter'), + 'amlb_satimage': AMLBDatasetSpec(name='amlb_satimage', openml_name='satimage'), + 'amlb_waveform': AMLBDatasetSpec(name='amlb_waveform', openml_name='waveform-5000'), + 'amlb_phoneme': AMLBDatasetSpec(name='amlb_phoneme', openml_name='phoneme'), + 'amlb_page_blocks': AMLBDatasetSpec(name='amlb_page_blocks', 
openml_name='page-blocks'), + 'amlb_ionosphere': AMLBDatasetSpec(name='amlb_ionosphere', openml_name='ionosphere'), + 'amlb_banknote_authentication': AMLBDatasetSpec(name='amlb_banknote_authentication', + openml_name='banknote-authentication'), + 'amlb_wine_quality_red': AMLBDatasetSpec(name='amlb_wine_quality_red', openml_name='wine-quality-red'), + 'amlb_wine_quality_white': AMLBDatasetSpec(name='amlb_wine_quality_white', openml_name='wine-quality-white'), + 'amlb_magic_telescope': AMLBDatasetSpec(name='amlb_magic_telescope', openml_name='magic-telescope'), +} + +AMLB_CATEGORY_PROFILES: Dict[str, Tuple[str, ...]] = { + 'small_samples_many_classes': ( + 'amlb_optdigits', + 'amlb_vehicle', + 'amlb_mfeat_factors', + 'amlb_segment', + 'amlb_satimage', + 'amlb_letter', + ), + 'large_samples_binary': ( + 'amlb_adult', + 'amlb_covertype', + 'amlb_magic_telescope', + 'amlb_spambase', + 'amlb_banknote_authentication', + 'amlb_ionosphere', + ), + 'tabular_mixed_classification': ( + 'amlb_credit_g', + 'amlb_kr_vs_kp', + 'amlb_sick', + 'amlb_waveform', + 'amlb_phoneme', + 'amlb_page_blocks', + 'amlb_wine_quality_red', + 'amlb_wine_quality_white', + ), + 'amlb_top20_mix': ( + 'amlb_adult', + 'amlb_covertype', + 'amlb_optdigits', + 'amlb_vehicle', + 'amlb_mfeat_factors', + 'amlb_segment', + 'amlb_credit_g', + 'amlb_kr_vs_kp', + 'amlb_sick', + 'amlb_spambase', + 'amlb_letter', + 'amlb_satimage', + 'amlb_waveform', + 'amlb_phoneme', + 'amlb_page_blocks', + 'amlb_ionosphere', + 'amlb_banknote_authentication', + 'amlb_wine_quality_red', + 'amlb_wine_quality_white', + 'amlb_magic_telescope', + ), +} + + +def parse_ratio_list(ratios: str) -> Tuple[float, ...]: + parsed = [] + for raw in ratios.split(','): + raw = raw.strip() + if not raw: + continue + value = float(raw) + if value <= 0 or value > 1: + raise ValueError(f'Candidate ratio must be in (0, 1], got {value}.') + parsed.append(value) + + if not parsed: + raise ValueError('At least one candidate ratio must be provided.') + + 
unique_sorted = tuple(sorted(set(parsed))) + return unique_sorted + + +def _safe_name(value: str) -> str: + return ''.join(ch if ch.isalnum() or ch in ('-', '_', '.') else '_' for ch in value) + + +def _json_ready(value: Any) -> Any: + if isinstance(value, dict): + return {str(k): _json_ready(v) for k, v in value.items()} + if isinstance(value, (list, tuple)): + return [_json_ready(v) for v in value] + if isinstance(value, np.ndarray): + return value.tolist() + if isinstance(value, (np.integer,)): + return int(value) + if isinstance(value, (np.floating,)): + return float(value) + if isinstance(value, (np.bool_,)): + return bool(value) + return value + + +def _save_json(path: Path, payload: Mapping[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(_json_ready(dict(payload)), ensure_ascii=False, indent=2), encoding='utf-8') + + +def _resolve_dataset_specs(dataset_names: Sequence[str], amlb_categories: Sequence[str]) -> List[AMLBDatasetSpec]: + requested_names: List[str] = [] + + if dataset_names: + requested_names.extend(dataset_names) + else: + categories = amlb_categories or DEFAULT_CATEGORY_PROFILE + for category in categories: + if category not in AMLB_CATEGORY_PROFILES: + raise ValueError( + f'Unknown AMLB category: {category}. 
Available: {sorted(AMLB_CATEGORY_PROFILES.keys())}' + ) + requested_names.extend(AMLB_CATEGORY_PROFILES[category]) + + unique_names = [] + seen = set() + for name in requested_names: + normalized = name.strip().lower() + if not normalized: + continue + if normalized not in AMLB_DATASETS: + raise ValueError(f'Unknown AMLB dataset profile: {name}.') + if normalized in seen: + continue + seen.add(normalized) + unique_names.append(normalized) + + return [AMLB_DATASETS[name] for name in unique_names] + + +def _extract_target_name(dataset: Any, frame: pd.DataFrame) -> str: + if getattr(dataset, 'target', None) is not None and getattr(dataset.target, 'name', None): + return str(dataset.target.name) + + target_names = list(getattr(dataset, 'target_names', []) or []) + if target_names: + return str(target_names[0]) + + fallback = frame.columns[-1] + return str(fallback) + + +def _sanitize_features_for_fedot(features: pd.DataFrame, + numeric_columns: Sequence[str], + categorical_columns: Sequence[str]) -> pd.DataFrame: + sanitized = features.copy() + + for column in numeric_columns: + numeric_column = pd.to_numeric(sanitized[column], errors='coerce') + fill_value = numeric_column.median(skipna=True) + if pd.isna(fill_value): + fill_value = 0.0 + sanitized[column] = numeric_column.fillna(fill_value) + + for column in categorical_columns: + category_column = sanitized[column].astype('object') + category_column = category_column.where(~pd.isna(category_column), '__missing__') + # Keep AMLB categories consistent and numeric for FEDOT assumptions stage. 
+ category_column = category_column.astype(str) + codes, _ = pd.factorize(category_column, sort=True) + sanitized[column] = codes.astype(np.int64) + + extra_columns = [column for column in sanitized.columns if + column not in set(numeric_columns) | set(categorical_columns)] + for column in extra_columns: + fallback_column = sanitized[column].astype('object') + fallback_column = fallback_column.where(~pd.isna(fallback_column), '__missing__') + fallback_column = fallback_column.astype(str) + codes, _ = pd.factorize(fallback_column, sort=True) + sanitized[column] = codes.astype(np.int64) + + return sanitized + + +def _load_amlb_dataset(spec: AMLBDatasetSpec, + seed: int, + max_rows: int, + test_size: float) -> LoadedDataset: + if spec.openml_id is not None: + dataset = fetch_openml(data_id=spec.openml_id, as_frame=True, parser='auto') + elif spec.openml_name is not None: + dataset = fetch_openml(name=spec.openml_name, as_frame=True, parser='auto') + else: + raise ValueError(f'Dataset spec {spec.name} has neither openml_name nor openml_id.') + + frame = dataset.frame.copy() + target_name = _extract_target_name(dataset, frame) + + y_raw = frame[target_name] + x = frame.drop(columns=[target_name]) + + valid = y_raw.notna() + x = x.loc[valid].reset_index(drop=True) + y_raw = y_raw.loc[valid].reset_index(drop=True) + + if spec.task_type == 'classification': + y = pd.Series(y_raw.astype('category').cat.codes, name='target') + valid_classes = y >= 0 + x = x.loc[valid_classes].reset_index(drop=True) + y = y.loc[valid_classes].reset_index(drop=True) + elif spec.task_type == 'regression': + y = pd.to_numeric(y_raw, errors='coerce') + valid_numeric = y.notna() + x = x.loc[valid_numeric].reset_index(drop=True) + y = y.loc[valid_numeric].reset_index(drop=True) + y = pd.Series(y, name='target') + else: + raise ValueError(f'Unsupported task type: {spec.task_type}') + + if max_rows > 0 and len(x) > max_rows: + rng = np.random.default_rng(seed) + selected = 
np.sort(rng.choice(np.arange(len(x)), size=max_rows, replace=False)) + x = x.iloc[selected].reset_index(drop=True) + y = y.iloc[selected].reset_index(drop=True) + + numeric_columns = x.select_dtypes(include=['number', 'bool']).columns.tolist() + categorical_columns = [column for column in x.columns if column not in numeric_columns] + x = _sanitize_features_for_fedot( + features=x, + numeric_columns=numeric_columns, + categorical_columns=categorical_columns, + ) + + stratify = y if spec.task_type == 'classification' and y.nunique() > 1 else None + x_train, x_test, y_train, y_test = train_test_split( + x, + y, + test_size=test_size, + random_state=seed, + stratify=stratify, + ) + + metadata = { + 'n_rows': int(len(x)), + 'n_features': int(x.shape[1]), + 'n_train': int(len(x_train)), + 'n_test': int(len(x_test)), + 'n_numeric_features': int(len(numeric_columns)), + 'n_categorical_features': int(len(categorical_columns)), + 'task_type': spec.task_type, + 'target_name': target_name, + } + + y_train_frame = pd.DataFrame({'target': np.asarray(y_train).reshape(-1)}) + y_test_frame = pd.DataFrame({'target': np.asarray(y_test).reshape(-1)}) + + return LoadedDataset( + spec=spec, + x_train=x_train.reset_index(drop=True), + x_test=x_test.reset_index(drop=True), + y_train=y_train_frame.reset_index(drop=True), + y_test=y_test_frame.reset_index(drop=True), + metadata=metadata, + ) + + +def _evaluate_metrics(task_type: str, + y_true: np.ndarray, + y_pred: np.ndarray, + y_proba: Optional[np.ndarray]) -> Dict[str, Any]: + y_true = np.asarray(y_true).reshape(-1) + y_pred = np.asarray(y_pred).reshape(-1) + + if task_type == 'classification': + metrics: Dict[str, Any] = { + 'accuracy': float(accuracy_score(y_true, y_pred)), + 'f1_macro': float(f1_score(y_true, y_pred, average='macro')), + 'f1_weighted': float(f1_score(y_true, y_pred, average='weighted')), + } + + if y_proba is not None: + try: + classes_count = len(np.unique(y_true)) + proba = np.asarray(y_proba) + if classes_count <= 
2: + if proba.ndim == 2: + positive_proba = proba[:, 1] if proba.shape[1] > 1 else proba[:, 0] + else: + positive_proba = proba.reshape(-1) + metrics['roc_auc'] = float(roc_auc_score(y_true, positive_proba)) + else: + if proba.ndim == 1: + raise ValueError('Multiclass ROC-AUC requires 2D probability array.') + metrics['roc_auc_ovr_macro'] = float( + roc_auc_score(y_true, proba, average='macro', multi_class='ovr') + ) + except Exception as ex: + metrics['roc_auc_error'] = str(ex) + + return metrics + + if task_type == 'regression': + rmse = float(np.sqrt(mean_squared_error(y_true, y_pred))) + return { + 'r2': float(r2_score(y_true, y_pred)), + 'rmse': rmse, + 'mae': float(mean_absolute_error(y_true, y_pred)), + } + + raise ValueError(f'Unsupported task type: {task_type}') + + +def _invoke_with_supported_kwargs(obj: Any, + method_name: str, + kwargs: Mapping[str, Any]) -> Tuple[bool, Optional[str]]: + method = getattr(obj, method_name, None) + if method is None: + return False, 'method_not_available' + + signature = inspect.signature(method) + call_kwargs = {} + for key, value in kwargs.items(): + if key in signature.parameters: + call_kwargs[key] = str(value) if isinstance(value, Path) else value + + try: + method(**call_kwargs) + return True, None + except Exception as ex: + return False, str(ex) + + +def _save_history_visualizations(history: Any, output_dir: Path) -> Dict[str, Any]: + from fedot.core.visualisation.pipeline_specific_visuals import PipelineHistoryVisualizer + + output_dir.mkdir(parents=True, exist_ok=True) + visualizer = PipelineHistoryVisualizer(history) + + tasks = [ + ('fitness_line', {'save_path': output_dir / 'fitness_line.png'}), + ('fitness_line_interactive', {'save_path': output_dir / 'fitness_line_interactive.html'}), + ('fitness_box', {'save_path': output_dir / 'fitness_box.png', 'best_fraction': 1.0}), + ('operations_kde', {'save_path': output_dir / 'operations_kde.png'}), + ('operations_animated_bar', {'save_path': output_dir / 
'operations_animated_bar.gif', 'show_fitness': True}), + ('diversity_population', {'save_path': output_dir / 'diversity_population.gif', 'fps': 1}), + ] + + report: Dict[str, Any] = {} + for method_name, method_kwargs in tasks: + ok, error = _invoke_with_supported_kwargs(visualizer, method_name, method_kwargs) + method_report = {'status': 'saved' if ok else 'skipped', 'error': error} + if ok and 'save_path' in method_kwargs: + method_report['artifact'] = str(method_kwargs['save_path']) + report[method_name] = method_report + + return report + + +def _save_pipeline_visualization(model: 'Fedot', output_path: Path) -> Dict[str, Any]: + try: + model.current_pipeline.show(save_path=output_path) + return {'status': 'saved', 'artifact': str(output_path)} + except Exception as ex: + return {'status': 'skipped', 'error': str(ex)} + + +def _save_dataframe(path: Path, frame: pd.DataFrame) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + frame.to_csv(path, index=False) + + +def _run_fedot_mode(dataset: LoadedDataset, + config: BenchmarkRunConfig, + mode_name: str, + sampling_config: Optional[Mapping[str, Any]], + mode_dir: Path) -> Dict[str, Any]: + from fedot import Fedot + + mode_dir.mkdir(parents=True, exist_ok=True) + history_dir = mode_dir / 'composer_history' + history_dir.mkdir(parents=True, exist_ok=True) + + fedot_params: Dict[str, Any] = { + 'problem': dataset.spec.task_type, + 'timeout': config.timeout_minutes_per_dataset, + 'seed': config.seed, + 'n_jobs': 1, #config.n_jobs + 'preset': config.preset, + 'use_input_preprocessing': False, + 'logging_level':10, + 'with_tuning': config.with_tuning, + 'history_dir': str(history_dir), + 'keep_history': True, + 'sampling_config': dict(sampling_config) if sampling_config is not None else None, + } + + run_started = perf_counter() + model = Fedot(**fedot_params) + + fit_started = perf_counter() + model.fit(features=dataset.x_train, target=dataset.y_train) + fit_seconds = perf_counter() - fit_started + + 
predict_started = perf_counter() + raw_prediction = model.predict(features=dataset.x_test) + predict_seconds = perf_counter() - predict_started + + prediction = np.asarray(raw_prediction).reshape(-1) + + probabilities: Optional[np.ndarray] = None + if dataset.spec.task_type == 'classification': + try: + raw_probabilities = model.predict_proba(features=dataset.x_test, probs_for_all_classes=True) + probabilities = np.asarray(raw_probabilities) + except Exception: + probabilities = None + + metrics = _evaluate_metrics( + task_type=dataset.spec.task_type, + y_true=np.asarray(dataset.y_test), + y_pred=prediction, + y_proba=probabilities, + ) + + predictions_frame = pd.DataFrame({ + 'y_true': np.asarray(dataset.y_test).reshape(-1), + 'y_pred': prediction, + }) + _save_dataframe(mode_dir / 'predictions.csv', predictions_frame) + + if probabilities is not None: + np.save(mode_dir / 'prediction_proba.npy', probabilities) + + artifacts: Dict[str, Any] = {} + + if model.history is not None: + history_path = mode_dir / 'opt_history.json' + model.history.save(str(history_path)) + artifacts['opt_history'] = str(history_path) + + try: + leaderboard = model.history.get_leaderboard() + if isinstance(leaderboard, pd.DataFrame): + _save_dataframe(mode_dir / 'history_leaderboard.csv', leaderboard) + artifacts['history_leaderboard'] = str(mode_dir / 'history_leaderboard.csv') + except Exception: + pass + + artifacts['history_visualizations'] = _save_history_visualizations( + history=model.history, + output_dir=mode_dir / 'history_visualizations', + ) + + artifacts['pipeline_visualization'] = _save_pipeline_visualization( + model=model, + output_path=mode_dir / 'pipeline.png', + ) + + try: + pipeline_save_dir = mode_dir / 'pipeline_saved' + pipeline_save_dir.mkdir(parents=True, exist_ok=True) + model.current_pipeline.save(path=str(pipeline_save_dir), create_subdir=False, is_datetime_in_path=False) + artifacts['pipeline_serialized'] = str(pipeline_save_dir) + except Exception as ex: + 
artifacts['pipeline_serialized'] = {'status': 'skipped', 'error': str(ex)} + + try: + report = model.return_report() + report.to_csv(mode_dir / 'fedot_time_report.csv') + artifacts['fedot_time_report'] = str(mode_dir / 'fedot_time_report.csv') + except Exception as ex: + artifacts['fedot_time_report'] = {'status': 'skipped', 'error': str(ex)} + + sampling_metadata = model.sampling_stage_metadata + if sampling_metadata is not None: + _save_json(mode_dir / 'sampling_stage_metadata.json', sampling_metadata) + artifacts['sampling_stage_metadata'] = str(mode_dir / 'sampling_stage_metadata.json') + + total_seconds = perf_counter() - run_started + + result = { + 'dataset': dataset.spec.name, + 'mode': mode_name, + 'task_type': dataset.spec.task_type, + 'status': 'success', + 'metrics': metrics, + 'timings_seconds': { + 'fit': float(fit_seconds), + 'predict': float(predict_seconds), + 'total': float(total_seconds), + }, + 'sampling_enabled': sampling_config is not None, + 'sampling_strategy': sampling_config.get('strategy') if sampling_config else None, + 'rows_train': int(len(dataset.y_train)), + 'rows_test': int(len(dataset.y_test)), + 'artifacts': artifacts, + } + + _save_json(mode_dir / 'run_result.json', result) + return result + + +def _build_markdown_report(records: Sequence[Mapping[str, Any]], report_path: Path) -> None: + lines = [ + '# FEDOT AMLB Sampling Benchmark Report', + '', + '| Dataset | Mode | Status | Main metrics | Fit (s) | Total (s) |', + '|---|---|---|---|---:|---:|', + ] + + for record in records: + metrics = record.get('metrics', {}) or {} + if 'f1_macro' in metrics: + main_metrics = f"f1_macro={metrics.get('f1_macro', float('nan')):.4f}" + elif 'rmse' in metrics: + main_metrics = f"rmse={metrics.get('rmse', float('nan')):.4f}" + else: + main_metrics = '-' + + timings = record.get('timings_seconds', {}) or {} + lines.append( + f"| {record.get('dataset', '-')} | {record.get('mode', '-')} | {record.get('status', '-')} | " + f"{main_metrics} | 
{float(timings.get('fit', float('nan'))):.3f} | " + f"{float(timings.get('total', float('nan'))):.3f} |" + ) + + report_path.write_text('\n'.join(lines), encoding='utf-8') + + +def run_benchmark(config: BenchmarkRunConfig) -> Dict[str, Any]: + if not config.include_baseline and not config.include_sampling: + raise ValueError('At least one mode must be enabled: baseline or sampling.') + + run_id = datetime.utcnow().strftime('%Y%m%d_%H%M%S') + run_dir = config.output_root / f'run_amlb_fedot_sampling_{run_id}' + run_dir.mkdir(parents=True, exist_ok=True) + + dataset_specs = _resolve_dataset_specs(config.dataset_names, config.amlb_categories) + + run_meta = { + 'run_id': run_id, + 'started_utc': datetime.utcnow().isoformat(timespec='seconds'), + 'timeout_minutes_per_dataset': config.timeout_minutes_per_dataset, + 'dataset_count': len(dataset_specs), + 'datasets': [spec.name for spec in dataset_specs], + 'config': { + **asdict(config), + 'output_root': str(config.output_root), + }, + } + _save_json(run_dir / 'run_meta.json', run_meta) + + records: List[Dict[str, Any]] = [] + + for dataset_spec in dataset_specs: + print(f'\\n=== Dataset: {dataset_spec.name} ===') + dataset_dir = run_dir / _safe_name(dataset_spec.name) + dataset_dir.mkdir(parents=True, exist_ok=True) + + try: + loaded = _load_amlb_dataset( + spec=dataset_spec, + seed=config.seed, + max_rows=config.max_rows_per_dataset, + test_size=config.test_size, + ) + _save_json(dataset_dir / 'dataset_metadata.json', loaded.metadata) + except Exception as ex: + error_record = { + 'dataset': dataset_spec.name, + 'mode': 'dataset_loading', + 'task_type': dataset_spec.task_type, + 'status': 'failed', + 'error': str(ex), + 'traceback': traceback.format_exc(), + } + _save_json(dataset_dir / 'dataset_loading_error.json', error_record) + records.append(error_record) + print(f'Failed to load dataset {dataset_spec.name}: {ex}') + continue + + mode_specs: List[Tuple[str, Optional[Mapping[str, Any]]]] = [] + if 
config.include_baseline: + mode_specs.append(('fedot_full_dataset', None)) + if config.include_sampling: + mode_specs.append(('fedot_sampling_stage', dict(config.sampling_config))) + + for mode_name, sampling_config in mode_specs: + print(f' -> Mode: {mode_name}') + mode_dir = dataset_dir / _safe_name(mode_name) + + try: + mode_result = _run_fedot_mode( + dataset=loaded, + config=config, + mode_name=mode_name, + sampling_config=sampling_config, + mode_dir=mode_dir, + ) + records.append(mode_result) + print(f" success: fit={mode_result['timings_seconds']['fit']:.2f}s") + except Exception as ex: + failed = { + 'dataset': dataset_spec.name, + 'mode': mode_name, + 'task_type': dataset_spec.task_type, + 'status': 'failed', + 'sampling_enabled': sampling_config is not None, + 'error': str(ex), + 'traceback': traceback.format_exc(), + } + _save_json(mode_dir / 'run_result_error.json', failed) + records.append(failed) + print(f' failed: {ex}') + + summary_frame = pd.DataFrame(records) + summary_csv = run_dir / 'benchmark_runs.csv' + summary_json = run_dir / 'benchmark_runs.json' + summary_frame.to_csv(summary_csv, index=False) + summary_json.write_text(summary_frame.to_json(orient='records', force_ascii=False, indent=2), encoding='utf-8') + + _build_markdown_report(records, run_dir / 'report.md') + + finished = { + 'run_id': run_id, + 'run_dir': str(run_dir), + 'records_count': len(records), + 'successful_runs': int(sum(1 for record in records if record.get('status') == 'success')), + 'failed_runs': int(sum(1 for record in records if record.get('status') != 'success')), + } + _save_json(run_dir / 'run_summary.json', finished) + + print('\\n=== Benchmark complete ===') + print(f"Results dir: {run_dir}") + print(f"Successful runs: {finished['successful_runs']}") + print(f"Failed runs: {finished['failed_runs']}") + + return { + 'summary': finished, + 'records': records, + 'run_dir': run_dir, + } + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( 
+ description='Run FEDOT AMLB benchmark with sampling stage and save optimization artifacts.' + ) + parser.add_argument('--datasets', nargs='*', default=[], + help='Explicit AMLB dataset profile names. If omitted, categories are used.') + parser.add_argument('--amlb-categories', nargs='*', default=list(DEFAULT_CATEGORY_PROFILE), + help='AMLB category profiles. Used only when --datasets is empty.') + parser.add_argument('--timeout-minutes', type=float, default=DEFAULT_TIMEOUT_MINUTES_PER_DATASET, + help='Time budget per dataset in minutes. Default: 15.') + parser.add_argument('--seed', type=int, default=42, help='Random seed.') + parser.add_argument('--n-jobs', type=int, default=-1, help='Parallel jobs for FEDOT.') + parser.add_argument('--preset', type=str, default='best_quality', help='FEDOT preset.') + parser.add_argument('--disable-tuning', action='store_true', help='Disable post-composition tuning inside FEDOT.') + parser.add_argument('--max-rows', type=int, default=25000, + help='Maximum rows per dataset to keep benchmark stable in runtime.') + parser.add_argument('--output-root', type=str, default=str(DEFAULT_RESULTS_ROOT), + help='Root directory for benchmark artifacts.') + + parser.add_argument('--disable-baseline', action='store_true', + help='Disable baseline mode (full dataset, no sampling).') + parser.add_argument('--disable-sampling', action='store_true', + help='Disable sampling mode.') + + parser.add_argument('--sampling-strategy', type=str, default='random', + help='Sampling strategy name for sampling_zoo provider.') + parser.add_argument('--sampling-strategy-params-json', type=str, default='{}', + help='JSON object with strategy params for sampling stage.') + parser.add_argument('--candidate-ratios', type=str, default='0.15,0.2,0.3,0.5', + help='Comma-separated candidate ratios for effective-size protocol.') + parser.add_argument('--delta-threshold', type=float, default=0.03, + help='Allowed metric delta threshold for effective-size selection.') + 
parser.add_argument('--cap-max-timeout-share', type=float, default=0.35, + help='Max timeout share for sampling stage in dynamic cap policy.') + + return parser.parse_args() + + +def _build_config_from_args(args: argparse.Namespace) -> BenchmarkRunConfig: + candidate_ratios = parse_ratio_list(args.candidate_ratios) + + try: + strategy_params = json.loads(args.sampling_strategy_params_json) + except json.JSONDecodeError as ex: + raise ValueError(f'Invalid --sampling-strategy-params-json: {ex}') + + if not isinstance(strategy_params, dict): + raise ValueError('--sampling-strategy-params-json must decode to a JSON object.') + + sampling_config = _default_sampling_config(seed=args.seed) + sampling_config['strategy'] = args.sampling_strategy + sampling_config['strategy_params'] = strategy_params + sampling_config['candidate_ratios'] = list(candidate_ratios) + sampling_config['delta_metric_threshold'] = args.delta_threshold + sampling_config['cap_max_timeout_share'] = args.cap_max_timeout_share + + return BenchmarkRunConfig( + timeout_minutes_per_dataset=args.timeout_minutes, + output_root=Path(args.output_root), + seed=args.seed, + n_jobs=args.n_jobs, + preset=args.preset, + with_tuning=not args.disable_tuning, + dataset_names=tuple(args.datasets), + amlb_categories=tuple(args.amlb_categories), + max_rows_per_dataset=args.max_rows, + include_baseline=not args.disable_baseline, + include_sampling=not args.disable_sampling, + sampling_config=sampling_config, + ) + + +def main() -> None: + args = _parse_args() + config = _build_config_from_args(args) + run_benchmark(config) + + +if __name__ == '__main__': + main() diff --git a/examples/simple/classification/sampling_stage_example.py b/examples/simple/classification/sampling_stage_example.py new file mode 100644 index 0000000000..e6a07a2f69 --- /dev/null +++ b/examples/simple/classification/sampling_stage_example.py @@ -0,0 +1,38 @@ +from fedot import Fedot +from fedot.core.utils import fedot_project_root, set_random_seed + 
+ +def run_sampling_stage_example(timeout: float = 1.0): + train_data_path = f'{fedot_project_root()}/examples/real_cases/data/scoring/scoring_train.csv' + test_data_path = f'{fedot_project_root()}/examples/real_cases/data/scoring/scoring_test.csv' + + model = Fedot( + problem='classification', + timeout=timeout, + preset='fast_train', + max_depth=2, + max_arity=2, + sampling_config={ + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.2, 0.3, 0.5], + 'delta_metric_threshold': 0.05, + } + ) + + model.fit(features=train_data_path, target='target') + _ = model.predict(features=test_data_path) + + print('Sampling metadata:', model.sampling_stage_metadata) + print('Metrics:', model.get_metrics()) + + +if __name__ == '__main__': + set_random_seed(42) + + try: + run_sampling_stage_example(timeout=1.0) + except ModuleNotFoundError as ex: + print('Sampling Zoo dependency is unavailable.') + print('Install with: pip install "fedot[sampling_zoo]"') + raise ex diff --git a/fedot/api/api_utils/api_params_repository.py b/fedot/api/api_utils/api_params_repository.py index e8e0baf5bb..824c16b47c 100644 --- a/fedot/api/api_utils/api_params_repository.py +++ b/fedot/api/api_utils/api_params_repository.py @@ -1,9 +1,11 @@ import datetime +from dataclasses import asdict from typing import Sequence from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum from golem.core.optimisers.genetic.operators.mutation import MutationTypesEnum +from fedot.api.sampling_stage.config import validate_sampling_config from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, add_resample_mutation from fedot.core.constants import AUTO_PRESET_NAME from fedot.core.repository.tasks import TaskTypesEnum @@ -70,7 +72,8 @@ def default_params_for_task(task_type: TaskTypesEnum) -> dict: keep_history=True, history_dir=default_fedot_data_dir(), with_tuning=True, - seed=None + seed=None, + sampling_config=None, ) return 
default_param_values_dict @@ -81,11 +84,16 @@ def check_and_set_default_params(self, params: dict) -> dict: invalid_keys = params.keys() - allowed_keys if invalid_keys: raise KeyError(f"Invalid key parameters {invalid_keys}") - else: - missing_params = self.default_params.keys() - params.keys() - for k in missing_params: - if (v := self.default_params[k]) is not None: - params[k] = v + + if 'sampling_config' in params: + validated_sampling_config = validate_sampling_config(params['sampling_config']) + params['sampling_config'] = asdict(validated_sampling_config) if validated_sampling_config else None + + missing_params = self.default_params.keys() - params.keys() + for k in missing_params: + if (v := self.default_params[k]) is not None: + params[k] = v + return params @staticmethod diff --git a/fedot/api/api_utils/assumptions/task_assumptions.py b/fedot/api/api_utils/assumptions/task_assumptions.py index 567f2394c1..58d31077e5 100644 --- a/fedot/api/api_utils/assumptions/task_assumptions.py +++ b/fedot/api/api_utils/assumptions/task_assumptions.py @@ -92,8 +92,8 @@ class RegressionAssumptions(TaskAssumptions): def builders(self): return { 'rfr': PipelineBuilder().add_node('rfr'), - 'ridge': PipelineBuilder().add_node('ridge'), - 'lgbmreg': PipelineBuilder().add_node('lgbmreg'), + # 'ridge': PipelineBuilder().add_node('ridge'), + # 'lgbmreg': PipelineBuilder().add_node('lgbmreg'), } def ensemble_operation(self) -> str: @@ -113,13 +113,13 @@ class ClassificationAssumptions(TaskAssumptions): @property def builders(self): return { - 'gbm_linear': PipelineBuilder(). - add_branch('catboost', 'xgboost', 'lgbm').join_branches('logit'), - 'catboost': PipelineBuilder().add_node('catboost'), - 'xgboost': PipelineBuilder().add_node('xgboost'), - 'lgbm': PipelineBuilder().add_node('lgbm'), + # 'gbm_linear': PipelineBuilder(). 
+ # add_branch('catboost', 'xgboost', 'lgbm').join_branches('logit'), + # 'catboost': PipelineBuilder().add_node('catboost'), + # 'xgboost': PipelineBuilder().add_node('xgboost'), + # 'lgbm': PipelineBuilder().add_node('lgbm'), 'rf': PipelineBuilder().add_node('rf'), - 'logit': PipelineBuilder().add_node('logit'), + #'logit': PipelineBuilder().add_node('logit'), } def ensemble_operation(self) -> str: diff --git a/fedot/api/api_utils/presets.py b/fedot/api/api_utils/presets.py index e42f64e236..7e64186447 100644 --- a/fedot/api/api_utils/presets.py +++ b/fedot/api/api_utils/presets.py @@ -51,7 +51,7 @@ def filter_operations_by_preset(self, data_type: Optional[DataTypesEnum] = None) # Use best_quality preset but exclude several operations preset_name = BEST_QUALITY_PRESET_NAME excluded = ['mlp', 'svc', 'svr', 'arima', 'exog_ts', 'text_clean', - 'lda', 'qda', 'lgbm', 'one_hot_encoding', + 'lda', 'qda', 'lgbm', 'one_hot_encoding','polyfit', 'resample', 'stl_arima'] excluded_tree = [] diff --git a/fedot/api/builder.py b/fedot/api/builder.py index 8f726cf95a..73d1822520 100644 --- a/fedot/api/builder.py +++ b/fedot/api/builder.py @@ -432,6 +432,7 @@ def setup_data_preprocessing( use_input_preprocessing: bool = DEFAULT_VALUE, use_preprocessing_cache: bool = DEFAULT_VALUE, use_auto_preprocessing: bool = DEFAULT_VALUE, + sampling_config: Dict[str, Any] = DEFAULT_VALUE, ) -> FedotBuilder: """ Sets parameters of input data preprocessing. @@ -446,6 +447,9 @@ def setup_data_preprocessing( use_preprocessing_cache: bool indicating whether to use optional preprocessors caching. Defaults to ``True``. + sampling_config: optional configuration of pre-fit sampling stage. + If ``None`` or unset, sampling stage is disabled. + Returns: :class:`FedotBuilder` instance. 
""" @@ -454,6 +458,7 @@ def setup_data_preprocessing( use_input_preprocessing=use_input_preprocessing, use_preprocessing_cache=use_preprocessing_cache, use_auto_preprocessing=use_auto_preprocessing, + sampling_config=sampling_config, ) return self diff --git a/fedot/api/main.py b/fedot/api/main.py index f9a998f789..84e5790c62 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -17,6 +17,7 @@ from fedot.api.api_utils.input_analyser import InputAnalyser from fedot.api.api_utils.params import ApiParams from fedot.api.api_utils.predefined_model import PredefinedModel +from fedot.api.sampling_stage.executor import SamplingStageExecutor from fedot.core.constants import DEFAULT_API_TIMEOUT_MINUTES, DEFAULT_TUNING_ITERATIONS_NUMBER from fedot.core.data.data import InputData, OutputData, PathType from fedot.core.data.multi_modal import MultiModalData @@ -118,6 +119,7 @@ def __init__(self, self.current_pipeline: Optional[Pipeline] = None self.best_models: Sequence[Pipeline] = () self.history: Optional[OptHistory] = None + self.sampling_stage_metadata: Optional[dict] = None fedot_composer_timer.reset_timer() @@ -141,74 +143,86 @@ def fit(self, MemoryAnalytics.start() - self.target = target + self.sampling_stage_metadata = None + initial_timeout = self.params.timeout - with fedot_composer_timer.launch_data_definition('fit'): - self.train_data = self.data_processor.define_data(features=features, target=target, is_predict=False) + try: + self.target = target - self.params.update_available_operations_by_preset(self.train_data) + with fedot_composer_timer.launch_data_definition('fit'): + self.train_data = self.data_processor.define_data(features=features, target=target, is_predict=False) - if self.params.get('use_input_preprocessing'): - # Launch data analyser - it gives recommendations for data preprocessing - recommendations_for_data, recommendations_for_params = \ - self.data_analyser.give_recommendations(input_data=self.train_data, - input_params=self.params) - 
self.data_processor.accept_and_apply_recommendations(input_data=self.train_data, - recommendations=recommendations_for_data) - self.params.accept_and_apply_recommendations(input_data=self.train_data, - recommendations=recommendations_for_params) - else: - recommendations_for_data = None - - self._init_remote_if_necessary() + self.params.update_available_operations_by_preset(self.train_data) - if isinstance(self.train_data, InputData) and self.params.get('use_auto_preprocessing'): - with fedot_composer_timer.launch_preprocessing(): - self.train_data = self.data_processor.fit_transform(self.train_data) - - # TODO: Workaround for AtomizedModel - init_asm = self.params.data.get('initial_assumption') - if predefined_model is None: - if isinstance(init_asm, Pipeline) and ("atomized" in init_asm.descriptive_id): - self.log.message('Composition for AtomizedModel currently unavailable') - predefined_model = init_asm - - with fedot_composer_timer.launch_fitting(): - if predefined_model is not None: - # Fit predefined model and return it without composing - self.current_pipeline = PredefinedModel( - predefined_model, self.train_data, self.log, - use_input_preprocessing=self.params.get('use_input_preprocessing'), - api_preprocessor=self.data_processor.preprocessor, - ).fit() + if self.params.get('use_input_preprocessing'): + # Launch data analyser - it gives recommendations for data preprocessing + recommendations_for_data, recommendations_for_params = \ + self.data_analyser.give_recommendations(input_data=self.train_data, + input_params=self.params) + self.data_processor.accept_and_apply_recommendations(input_data=self.train_data, + recommendations=recommendations_for_data) + self.params.accept_and_apply_recommendations(input_data=self.train_data, + recommendations=recommendations_for_params) else: - self.current_pipeline, self.best_models, self.history = self.api_composer.obtain_model(self.train_data) - - if self.current_pipeline is None: - raise ValueError('No models were 
found') - - full_train_not_preprocessed = deepcopy(self.train_data) - # Final fit for obtained pipeline on full dataset - - with fedot_composer_timer.launch_train_inference(): - if self.history and not self.history.is_empty() or not self.current_pipeline.is_fitted: - self._train_pipeline_on_full_dataset(recommendations_for_data, full_train_not_preprocessed) - self.log.message('Final pipeline was fitted') - else: - self.log.message('Already fitted initial pipeline is used') - - # Merge API & pipelines encoders if it is required - self.current_pipeline.preprocessor = BasePreprocessor.merge_preprocessors( - api_preprocessor=self.data_processor.preprocessor, - pipeline_preprocessor=self.current_pipeline.preprocessor, - use_auto_preprocessing=self.params.get('use_auto_preprocessing') - ) - - self.log.message(f'Final pipeline: {graph_structure(self.current_pipeline)}') - - MemoryAnalytics.finish() - - return self.current_pipeline + recommendations_for_data = None + + self._init_remote_if_necessary() + + if isinstance(self.train_data, InputData) and self.params.get('use_auto_preprocessing'): + with fedot_composer_timer.launch_preprocessing(): + self.train_data = self.data_processor.fit_transform(self.train_data) + + # TODO: Workaround for AtomizedModel + init_asm = self.params.data.get('initial_assumption') + if predefined_model is None: + if isinstance(init_asm, Pipeline) and ("atomized" in init_asm.descriptive_id): + self.log.message('Composition for AtomizedModel currently unavailable') + predefined_model = init_asm + if self.params.get('sampling_config') is not None: + self.sampling_stage_metadata = {'status': 'skipped', 'reason': 'atomized_initial_assumption'} + else: + self._run_sampling_stage_if_necessary() + elif self.params.get('sampling_config') is not None: + self.sampling_stage_metadata = {'status': 'skipped', 'reason': 'predefined_model'} + self.log.message('Sampling stage skipped because predefined_model is specified.') + + with 
fedot_composer_timer.launch_fitting(): + if predefined_model is not None: + # Fit predefined model and return it without composing + self.current_pipeline = PredefinedModel( + predefined_model, self.train_data, self.log, + use_input_preprocessing=self.params.get('use_input_preprocessing'), + api_preprocessor=self.data_processor.preprocessor, + ).fit() + else: + self.current_pipeline, self.best_models, self.history = self.api_composer.obtain_model(self.train_data) + + if self.current_pipeline is None: + raise ValueError('No models were found') + + full_train_not_preprocessed = deepcopy(self.train_data) + # Final fit for obtained pipeline on full dataset + + with fedot_composer_timer.launch_train_inference(): + if self.history and not self.history.is_empty() or not self.current_pipeline.is_fitted: + self._train_pipeline_on_full_dataset(recommendations_for_data, full_train_not_preprocessed) + self.log.message('Final pipeline was fitted') + else: + self.log.message('Already fitted initial pipeline is used') + + # Merge API & pipelines encoders if it is required + self.current_pipeline.preprocessor = BasePreprocessor.merge_preprocessors( + api_preprocessor=self.data_processor.preprocessor, + pipeline_preprocessor=self.current_pipeline.preprocessor, + use_auto_preprocessing=self.params.get('use_auto_preprocessing') + ) + + self.log.message(f'Final pipeline: {graph_structure(self.current_pipeline)}') + + return self.current_pipeline + finally: + self.params.timeout = initial_timeout + MemoryAnalytics.finish() def tune(self, input_data: Optional[FeaturesType] = None, @@ -563,6 +577,32 @@ def _init_remote_if_necessary(self): if isinstance(self.target, str) and remote.remote_task_params.target is None: remote.remote_task_params.target = self.target + def _run_sampling_stage_if_necessary(self): + sampling_config = self.params.get('sampling_config') + if sampling_config is None: + return + + if not isinstance(self.train_data, InputData): + raise ValueError('Sampling stage 
supports only InputData in V1.') + + self.log.message('Sampling stage started') + executor = SamplingStageExecutor(sampling_config=sampling_config, + task_type=self.params.task.task_type, + total_timeout_minutes=self.params.timeout, + log=self.log) + stage_result = executor.execute(self.train_data) + self.train_data = stage_result.train_data + self.sampling_stage_metadata = stage_result.metadata + + if self.params.timeout is not None: + self.params.timeout = stage_result.updated_timeout_minutes + + self.log.message( + f'Sampling stage finished. Rows: {stage_result.metadata["rows_before"]} -> ' + f'{stage_result.metadata["rows_after"]}. ' + f'Updated timeout: {self.params.timeout} min.' + ) + def _train_pipeline_on_full_dataset(self, recommendations: Optional[dict], full_train_not_preprocessed: Union[InputData, MultiModalData]): """Applies training procedure for obtained pipeline if dataset was clipped diff --git a/fedot/api/sampling_stage/__init__.py b/fedot/api/sampling_stage/__init__.py new file mode 100644 index 0000000000..e6b62834d0 --- /dev/null +++ b/fedot/api/sampling_stage/__init__.py @@ -0,0 +1,13 @@ +from fedot.api.sampling_stage.config import SamplingConfig, validate_sampling_config +from fedot.api.sampling_stage.executor import SamplingStageExecutor, SamplingStageOutput +from fedot.api.sampling_stage.providers import SamplingProvider, SamplingProviderResult, SamplingZooProvider + +__all__ = [ + 'SamplingConfig', + 'SamplingProvider', + 'SamplingProviderResult', + 'SamplingStageExecutor', + 'SamplingStageOutput', + 'SamplingZooProvider', + 'validate_sampling_config', +] diff --git a/fedot/api/sampling_stage/config.py b/fedot/api/sampling_stage/config.py new file mode 100644 index 0000000000..58c09e38a7 --- /dev/null +++ b/fedot/api/sampling_stage/config.py @@ -0,0 +1,172 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, Optional, Sequence, Tuple + + +@dataclass(frozen=True) +class SamplingConfig: + provider: str = 
'sampling_zoo' + strategy: str = 'random' + strategy_params: Dict[str, Any] = field(default_factory=dict) + candidate_ratios: Tuple[float, ...] = (0.15, 0.2, 0.3, 0.5, 0.7) + delta_metric_threshold: float = 0.03 + delta_type: str = 'relative' + validation_size: float = 0.2 + budget_policy: str = 'dynamic_cap' + cap_max_timeout_share: float = 0.35 + min_automl_time_minutes: float = 0.1 + infinite_timeout_cap_minutes: float = 5.0 + error_policy: str = 'fail_fast' + artifact_mode: str = 'minimal' + random_state: Optional[int] = 42 + guard_max_rank: int = 256 + guard_max_modes: int = 4 + guard_max_partitions: int = 128 + guard_max_sample_size: int = 100000 + + +def validate_sampling_config(config: Optional[Dict[str, Any]]) -> Optional[SamplingConfig]: + if config is None: + return None + if not isinstance(config, dict): + raise ValueError('"sampling_config" must be a dictionary or None.') + + allowed_keys = { + 'provider', + 'strategy', + 'strategy_params', + 'candidate_ratios', + 'delta_metric_threshold', + 'delta_type', + 'validation_size', + 'budget_policy', + 'cap_max_timeout_share', + 'min_automl_time_minutes', + 'infinite_timeout_cap_minutes', + 'error_policy', + 'artifact_mode', + 'random_state', + 'guard_max_rank', + 'guard_max_modes', + 'guard_max_partitions', + 'guard_max_sample_size', + } + unknown_keys = set(config.keys()) - allowed_keys + if unknown_keys: + raise ValueError(f'Unknown keys in "sampling_config": {sorted(unknown_keys)}') + + merged = SamplingConfig(**config) + _validate_sampling_config_values(merged) + return merged + + +def _validate_sampling_config_values(config: SamplingConfig) -> None: + if not isinstance(config.provider, str) or not config.provider.strip(): + raise ValueError('"sampling_config.provider" must be a non-empty string.') + + if not isinstance(config.strategy, str) or not config.strategy.strip(): + raise ValueError('"sampling_config.strategy" must be a non-empty string.') + + if not isinstance(config.strategy_params, dict): + 
raise ValueError('"sampling_config.strategy_params" must be a dictionary.') + + ratios = _validate_ratios(config.candidate_ratios) + if ratios != tuple(config.candidate_ratios): + raise ValueError('"sampling_config.candidate_ratios" must be sorted in ascending order without duplicates.') + + if config.delta_metric_threshold < 0: + raise ValueError('"sampling_config.delta_metric_threshold" must be >= 0.') + + if config.delta_type not in {'relative', 'absolute'}: + raise ValueError('"sampling_config.delta_type" must be one of {"relative", "absolute"}.') + + if not 0 < config.validation_size < 1: + raise ValueError('"sampling_config.validation_size" must be in range (0, 1).') + + if config.budget_policy != 'dynamic_cap': + raise ValueError('"sampling_config.budget_policy" supports only "dynamic_cap" in V1.') + + if not 0 < config.cap_max_timeout_share <= 1: + raise ValueError('"sampling_config.cap_max_timeout_share" must be in range (0, 1].') + + if config.min_automl_time_minutes <= 0: + raise ValueError('"sampling_config.min_automl_time_minutes" must be > 0.') + + if config.infinite_timeout_cap_minutes <= 0: + raise ValueError('"sampling_config.infinite_timeout_cap_minutes" must be > 0.') + + if config.error_policy != 'fail_fast': + raise ValueError('"sampling_config.error_policy" supports only "fail_fast" in V1.') + + if config.artifact_mode != 'minimal': + raise ValueError('"sampling_config.artifact_mode" supports only "minimal" in V1.') + + if config.random_state is not None and not isinstance(config.random_state, int): + raise ValueError('"sampling_config.random_state" must be int or None.') + + for key in ('guard_max_rank', 'guard_max_modes', 'guard_max_partitions', 'guard_max_sample_size'): + if getattr(config, key) <= 0: + raise ValueError(f'"sampling_config.{key}" must be > 0.') + + _validate_strategy_param_guards(config) + + +def _validate_ratios(ratios: Sequence[float]) -> Tuple[float, ...]: + if not isinstance(ratios, (list, tuple)) or len(ratios) == 0: + 
raise ValueError('"sampling_config.candidate_ratios" must be a non-empty list of floats.') + + normalized = [] + for ratio in ratios: + if not isinstance(ratio, (float, int)): + raise ValueError('"sampling_config.candidate_ratios" must contain only numbers.') + ratio = float(ratio) + if not 0 < ratio <= 1: + raise ValueError('"sampling_config.candidate_ratios" values must be in range (0, 1].') + normalized.append(ratio) + + if len(set(normalized)) != len(normalized): + raise ValueError('"sampling_config.candidate_ratios" must not contain duplicates.') + + sorted_ratios = sorted(normalized) + return tuple(sorted_ratios) + + +def _validate_strategy_param_guards(config: SamplingConfig) -> None: + params = config.strategy_params + + for rank_key in ('rank', 'approx_rank'): + if rank_key not in params: + continue + rank = params[rank_key] + if isinstance(rank, (list, tuple)): + if any(float(r) > config.guard_max_rank for r in rank if isinstance(r, (int, float)) and float(r) > 1): + raise ValueError( + f'"sampling_config.strategy_params.{rank_key}" exceeds guard_max_rank={config.guard_max_rank}.' + ) + elif isinstance(rank, (int, float)) and float(rank) > 1 and float(rank) > config.guard_max_rank: + raise ValueError( + f'"sampling_config.strategy_params.{rank_key}" exceeds guard_max_rank={config.guard_max_rank}.' + ) + + modes = params.get('modes') + if modes is not None: + if not isinstance(modes, (list, tuple)): + raise ValueError('"sampling_config.strategy_params.modes" must be a list/tuple.') + if len(modes) > config.guard_max_modes: + raise ValueError( + f'"sampling_config.strategy_params.modes" exceeds guard_max_modes={config.guard_max_modes}.' + ) + + for key in ('n_partitions', 'partitions', 'n_splits'): + value = params.get(key) + if value is not None and isinstance(value, int) and value > config.guard_max_partitions: + raise ValueError( + f'"sampling_config.strategy_params.{key}" exceeds guard_max_partitions={config.guard_max_partitions}.' 
+ ) + + sample_size = params.get('sample_size') + if sample_size is not None and isinstance(sample_size, int) and sample_size > config.guard_max_sample_size: + raise ValueError( + f'"sampling_config.strategy_params.sample_size" exceeds guard_max_sample_size=' + f'{config.guard_max_sample_size}.' + ) + diff --git a/fedot/api/sampling_stage/executor.py b/fedot/api/sampling_stage/executor.py new file mode 100644 index 0000000000..1725e2ccc7 --- /dev/null +++ b/fedot/api/sampling_stage/executor.py @@ -0,0 +1,325 @@ +import time +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Sequence + +import numpy as np +import pandas as pd +from golem.core.log import LoggerAdapter, default_log +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.metrics import f1_score, r2_score +from sklearn.model_selection import train_test_split + +from fedot.api.sampling_stage.config import SamplingConfig, validate_sampling_config +from fedot.api.sampling_stage.providers import SamplingProvider, SamplingZooProvider +from fedot.core.data.data import InputData, data_type_is_table +from fedot.core.repository.tasks import TaskTypesEnum + + +@dataclass +class SamplingStageOutput: + train_data: InputData + metadata: Dict[str, Any] + elapsed_seconds: float + updated_timeout_minutes: Optional[float] + + +class SamplingStageExecutor: + def __init__(self, + sampling_config: Dict[str, Any], + task_type: TaskTypesEnum, + total_timeout_minutes: Optional[float], + log: Optional[LoggerAdapter] = None, + provider: Optional[SamplingProvider] = None): + self.config: SamplingConfig = validate_sampling_config(sampling_config) + if self.config is None: + raise ValueError('Sampling stage config must not be None when executor is created.') + + self.task_type = task_type + self.total_timeout_minutes = total_timeout_minutes + self.log = log or default_log(self) + self.provider = provider + + def execute(self, train_data: 
InputData) -> SamplingStageOutput: + self._validate_task_compatibility(train_data) + + started_at = time.perf_counter() + budget_seconds = self._compute_budget_seconds() + + provider = self.provider or self._create_provider(self.config.provider) + effective_size_result = self._select_effective_ratio(train_data, provider, started_at, budget_seconds) + + self._raise_if_budget_exceeded(started_at, budget_seconds) + remaining_budget = self._remaining_budget(started_at, budget_seconds) + final_provider_result = provider.sample( + features=np.asarray(train_data.features), + target=self._flatten_target(train_data.target), + ratio=effective_size_result['selected_ratio'], + strategy=self.config.strategy, + strategy_params=self.config.strategy_params, + random_state=self.config.random_state, + budget_seconds=remaining_budget, + ) + + selected_indices = self._validate_indices(final_provider_result.sample_indices, + upper_bound=len(train_data.idx), + data_label='full train data') + reduced_data = self._subset_by_positions(train_data, selected_indices) + + elapsed_seconds = time.perf_counter() - started_at + timeout_after_stage = self._compute_updated_timeout(elapsed_seconds) + + metadata = { + 'status': 'applied', + 'provider': self.config.provider, + 'strategy': self.config.strategy, + 'selected_ratio': effective_size_result['selected_ratio'], + 'selected_delta': effective_size_result['selected_delta'], + 'baseline_score': effective_size_result['baseline_score'], + 'selected_score': effective_size_result['selected_score'], + 'rows_before': int(len(train_data.idx)), + 'rows_after': int(len(reduced_data.idx)), + 'elapsed_seconds': elapsed_seconds, + 'budget_seconds': budget_seconds, + 'artifact_mode': self.config.artifact_mode, + 'protocol_trials': effective_size_result['trials'], + 'provider_meta': final_provider_result.meta, + } + + return SamplingStageOutput(train_data=reduced_data, + metadata=metadata, + elapsed_seconds=elapsed_seconds, + 
updated_timeout_minutes=timeout_after_stage) + + def _validate_task_compatibility(self, train_data: InputData) -> None: + if self.task_type not in (TaskTypesEnum.classification, TaskTypesEnum.regression): + raise ValueError('Sampling stage supports only classification/regression tasks in V1.') + + if not isinstance(train_data, InputData): + raise ValueError('Sampling stage supports only InputData in V1.') + + if not data_type_is_table(train_data): + raise ValueError('Sampling stage supports only tabular InputData in V1.') + + if train_data.target is None: + raise ValueError('Sampling stage requires non-empty target in train data.') + + if len(train_data.idx) < 5: + raise ValueError('Sampling stage requires at least 5 rows in train data.') + + def _select_effective_ratio(self, + train_data: InputData, + provider: SamplingProvider, + started_at: float, + budget_seconds: float) -> Dict[str, Any]: + train_split, valid_split = self._split_for_protocol(train_data) + + baseline_score = self._score_light_model(train_split, valid_split) + sorted_ratios = sorted(self.config.candidate_ratios) + selected_ratio = None + selected_delta = None + selected_score = None + trials: List[Dict[str, Any]] = [] + + for ratio in sorted_ratios: + self._raise_if_budget_exceeded(started_at, budget_seconds) + provider_result = provider.sample( + features=np.asarray(train_split.features), + target=self._flatten_target(train_split.target), + ratio=ratio, + strategy=self.config.strategy, + strategy_params=self.config.strategy_params, + random_state=self.config.random_state, + budget_seconds=self._remaining_budget(started_at, budget_seconds), + ) + candidate_indices = self._validate_indices(provider_result.sample_indices, + upper_bound=len(train_split.idx), + data_label='train split') + candidate_split = self._subset_by_positions(train_split, candidate_indices) + candidate_score = self._score_light_model(candidate_split, valid_split) + delta = self._calculate_delta(baseline_score, candidate_score) 
+ + trials.append({ + 'ratio': float(ratio), + 'score': float(candidate_score), + 'delta': float(delta), + 'sample_size': int(len(candidate_indices)), + }) + + if delta <= self.config.delta_metric_threshold: + selected_ratio = float(ratio) + selected_delta = float(delta) + selected_score = float(candidate_score) + break + + if selected_ratio is None: + raise ValueError( + 'No candidate ratio satisfied "delta_metric_threshold". ' + f'Checked ratios: {sorted_ratios}, threshold={self.config.delta_metric_threshold}.' + ) + + return { + 'selected_ratio': selected_ratio, + 'selected_delta': selected_delta, + 'selected_score': selected_score, + 'baseline_score': float(baseline_score), + 'trials': trials, + } + + def _split_for_protocol(self, train_data: InputData) -> Sequence[InputData]: + indices = np.arange(len(train_data.idx)) + target = self._flatten_target(train_data.target) + + stratify_target = target if self.task_type == TaskTypesEnum.classification else None + try: + train_ids, valid_ids = train_test_split(indices, + test_size=self.config.validation_size, + random_state=self.config.random_state, + stratify=stratify_target) + except ValueError: + train_ids, valid_ids = train_test_split(indices, + test_size=self.config.validation_size, + random_state=self.config.random_state, + stratify=None) + + train_split = self._subset_by_positions(train_data, train_ids) + valid_split = self._subset_by_positions(train_data, valid_ids) + return train_split, valid_split + + def _score_light_model(self, train_data: InputData, valid_data: InputData) -> float: + x_train_df, x_valid_df = self._prepare_feature_matrices(train_data.features, valid_data.features) + y_train = self._flatten_target(train_data.target) + y_valid = self._flatten_target(valid_data.target) + + if self.task_type == TaskTypesEnum.classification: + model = RandomForestClassifier(n_estimators=100, + random_state=self.config.random_state, + n_jobs=-1) + model.fit(x_train_df, y_train) + prediction = 
model.predict(x_valid_df) + return float(f1_score(y_valid, prediction, average='macro')) + + model = RandomForestRegressor(n_estimators=100, + random_state=self.config.random_state, + n_jobs=1) + model.fit(x_train_df, y_train) + prediction = model.predict(x_valid_df) + return float(r2_score(y_valid, prediction)) + + @staticmethod + def _prepare_feature_matrices(train_features: Any, valid_features: Any) -> Sequence[pd.DataFrame]: + x_train_df = pd.get_dummies(pd.DataFrame(train_features), dummy_na=True) + x_valid_df = pd.get_dummies(pd.DataFrame(valid_features), dummy_na=True) + x_valid_df = x_valid_df.reindex(columns=x_train_df.columns, fill_value=0) + return x_train_df, x_valid_df + + def _compute_budget_seconds(self) -> float: + if self.config.budget_policy != 'dynamic_cap': + raise ValueError(f'Unsupported budget_policy={self.config.budget_policy}') + + if self.total_timeout_minutes is None: + return float(self.config.infinite_timeout_cap_minutes * 60) + + total_seconds = float(self.total_timeout_minutes * 60) + max_share_seconds = total_seconds * self.config.cap_max_timeout_share + guaranteed_remaining_seconds = self.config.min_automl_time_minutes * 60 + max_by_remaining = max(0.0, total_seconds - guaranteed_remaining_seconds) + budget_seconds = min(max_share_seconds, max_by_remaining) + + if budget_seconds <= 0: + raise ValueError( + 'Sampling stage has zero budget due to timeout constraints. ' + f'Increase timeout or reduce min_automl_time_minutes ({self.config.min_automl_time_minutes}).' 
+ ) + + return float(budget_seconds) + + def _compute_updated_timeout(self, elapsed_seconds: float) -> Optional[float]: + if self.total_timeout_minutes is None: + return None + + remaining = float(self.total_timeout_minutes) - elapsed_seconds / 60.0 + return float(max(self.config.min_automl_time_minutes, remaining)) + + def _create_provider(self, provider_name: str) -> SamplingProvider: + if provider_name == 'sampling_zoo': + return SamplingZooProvider() + raise ValueError(f'Unknown sampling provider: {provider_name}') + + @staticmethod + def _flatten_target(target: Any) -> np.ndarray: + values = np.asarray(target) + if values.ndim > 1 and values.shape[1] == 1: + values = values.reshape(-1) + return values + + def _calculate_delta(self, baseline_score: float, sampled_score: float) -> float: + score_drop = max(0.0, baseline_score - sampled_score) + if self.config.delta_type == 'absolute': + return float(score_drop) + + denominator = max(abs(baseline_score), 1e-12) + return float(score_drop / denominator) + + @staticmethod + def _validate_indices(indices: np.ndarray, upper_bound: int, data_label: str) -> np.ndarray: + values = np.asarray(indices) + if values.ndim != 1: + raise ValueError(f'Sampled indices for {data_label} must be a 1D array.') + if len(values) == 0: + raise ValueError(f'Sampled indices for {data_label} must not be empty.') + + try: + values = values.astype(int) + except Exception as ex: + raise ValueError(f'Sampled indices for {data_label} must be integer-like. Details: {ex}') + + if len(np.unique(values)) != len(values): + raise ValueError(f'Sampled indices for {data_label} must be unique.') + + if values.min() < 0 or values.max() >= upper_bound: + raise ValueError( + f'Sampled indices for {data_label} are out of bounds. ' + f'Allowed range: [0, {upper_bound - 1}].' 
+ ) + + return values + + @staticmethod + def _subset_by_positions(data: InputData, positions: np.ndarray) -> InputData: + positions = np.asarray(positions, dtype=int) + features = np.take(data.features, positions, axis=0) + target = np.take(data.target, positions, axis=0) + idx = np.take(data.idx, positions, axis=0) + + categorical_features = None + if data.categorical_features is not None: + categorical_features = np.take(data.categorical_features, positions, axis=0) + + return InputData( + idx=idx, + features=features, + target=target, + task=deepcopy(data.task), + data_type=data.data_type, + supplementary_data=data.supplementary_data, + categorical_features=categorical_features, + categorical_idx=data.categorical_idx, + numerical_idx=data.numerical_idx, + encoded_idx=data.encoded_idx, + features_names=data.features_names, + ) + + @staticmethod + def _raise_if_budget_exceeded(started_at: float, budget_seconds: float) -> None: + elapsed = time.perf_counter() - started_at + if elapsed > budget_seconds: + raise TimeoutError( + f'Sampling stage exceeded its dynamic cap: elapsed={elapsed:.2f}s, budget={budget_seconds:.2f}s.' 
+        )
+
+    @staticmethod
+    def _remaining_budget(started_at: float, budget_seconds: float) -> float:
+        elapsed = time.perf_counter() - started_at
+        return max(0.0, budget_seconds - elapsed)
+
diff --git a/fedot/api/sampling_stage/providers.py b/fedot/api/sampling_stage/providers.py
new file mode 100644
index 0000000000..1fa2e17b3e
--- /dev/null
+++ b/fedot/api/sampling_stage/providers.py
@@ -0,0 +1,294 @@
+import inspect
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from importlib import import_module
+from typing import Any, Dict, Optional
+import numpy as np
+import pandas as pd
+
+
+@dataclass
+class SamplingProviderResult:
+    sample_indices: np.ndarray
+    sample_scores: Optional[np.ndarray]
+    meta: Dict[str, Any]
+
+
+class SamplingProvider(ABC):
+    @abstractmethod
+    def sample(self,
+               features: np.ndarray,
+               target: np.ndarray,
+               ratio: float,
+               strategy: str,
+               strategy_params: Dict[str, Any],
+               random_state: Optional[int],
+               budget_seconds: Optional[float]) -> SamplingProviderResult:
+        pass
+
+
+class SamplingZooProvider(SamplingProvider):
+    _SAMPLING_MODULE_CANDIDATES = (
+        'sampling_zoo.core.api.api_main',
+        'sampling_zoo.api.api_main',
+        'core.api.api_main',
+    )
+
+    def __init__(self):
+        self._factory_cls = self._load_factory()
+
+    def sample(self,
+               features: np.ndarray,
+               target: np.ndarray,
+               ratio: float,
+               strategy: str,
+               strategy_params: Dict[str, Any],
+               random_state: Optional[int],
+               budget_seconds: Optional[float]) -> SamplingProviderResult:
+        del budget_seconds
+
+        n_rows = int(features.shape[0])
+        sample_size = max(1, int(round(ratio * n_rows)))
+        factory = self._factory_cls()
+
+        strategy_kwargs = dict(strategy_params)
+        if random_state is not None and 'random_state' not in strategy_kwargs:
+            strategy_kwargs['random_state'] = random_state
+
+        strategy_kwargs = self._inject_required_kwargs(
+            factory=factory,
+            strategy_name=strategy,
strategy_kwargs=strategy_kwargs, + sample_size=sample_size, + ) + strategy_obj = self._create_strategy(factory, strategy, strategy_kwargs) + + data_frame = pd.DataFrame(features) + self._fit_strategy(strategy_obj, data_frame, target) + + extracted = self._extract_indices(strategy_obj, data_frame, target) + if extracted.size < sample_size: + raise ValueError( + f'Sampling provider returned too few unique indices: {extracted.size}, required at least {sample_size}.' + ) + + rng = np.random.default_rng(random_state) + sampled = rng.choice(extracted, size=sample_size, replace=False) + sampled = np.asarray(sampled, dtype=int) + + sample_scores = self._extract_scores(strategy_obj, sampled) + meta = { + 'provider': 'sampling_zoo', + 'strategy': strategy, + 'sample_size': sample_size, + 'strategy_kwargs': strategy_kwargs, + } + + return SamplingProviderResult(sample_indices=sampled, + sample_scores=sample_scores, + meta=meta) + + @staticmethod + def _create_strategy(factory: Any, strategy_name: str, strategy_kwargs: Dict[str, Any]) -> Any: + try: + return factory.create_strategy(strategy_name, **strategy_kwargs) + except TypeError as ex: + raise ValueError( + f'Failed to initialize sampling strategy "{strategy_name}" with parameters {strategy_kwargs}: {ex}' + ) + + @staticmethod + def _fit_strategy(strategy_obj: Any, data_frame: pd.DataFrame, target: np.ndarray) -> None: + fit_method = getattr(strategy_obj, 'fit', None) + if fit_method is None: + raise ValueError('Sampling strategy object has no "fit" method.') + + calls = ( + lambda: fit_method(data_frame, target=target), + lambda: fit_method(data_frame, target), + lambda: fit_method(data_frame), + ) + last_error = None + for call in calls: + try: + call() + return + except TypeError as ex: + last_error = ex + + raise ValueError(f'Unable to call strategy.fit(...) 
due to incompatible signature: {last_error}') + + @staticmethod + def _extract_scores(strategy_obj: Any, selected_indices: np.ndarray) -> Optional[np.ndarray]: + for attr_name in ('sampling_scores_', 'difficulty_scores_', 'uncertainty_scores_'): + score_values = getattr(strategy_obj, attr_name, None) + if score_values is None: + continue + try: + score_values = np.asarray(score_values) + if score_values.ndim != 1: + continue + return score_values[selected_indices] + except Exception: + continue + return None + + @staticmethod + def _extract_indices(strategy_obj: Any, + data_frame: pd.DataFrame, + target: np.ndarray) -> np.ndarray: + indices = SamplingZooProvider._extract_indices_from_sample_method(strategy_obj) + if indices is None: + indices = SamplingZooProvider._extract_indices_from_attrs(strategy_obj) + if indices is None: + indices = SamplingZooProvider._extract_indices_from_get_partitions(strategy_obj, data_frame, target) + + if indices is None or len(indices) == 0: + raise ValueError('Sampling strategy did not return any indices.') + + indices = np.asarray(indices, dtype=int) + unique_indices = np.unique(indices) + return unique_indices + + @staticmethod + def _extract_indices_from_sample_method(strategy_obj: Any) -> Optional[np.ndarray]: + sample_indices_method = getattr(strategy_obj, 'sample_indices', None) + if sample_indices_method is None: + return None + + call_attempts = ( + lambda: sample_indices_method(), + lambda: sample_indices_method(replace=False), + ) + for attempt in call_attempts: + try: + result = attempt() + if isinstance(result, tuple): + result = result[0] + arr = np.asarray(result) + if arr.ndim == 1: + return arr.astype(int) + except TypeError: + continue + except Exception: + return None + return None + + @staticmethod + def _extract_indices_from_attrs(strategy_obj: Any) -> Optional[np.ndarray]: + for attr_name in ('sampled_indices', 'sampled_indices_'): + value = getattr(strategy_obj, attr_name, None) + if value is None: + continue + 
arr = np.asarray(value) + if arr.ndim == 1: + return arr.astype(int) + + for attr_name in ('partitions', 'partitions_'): + partitions = getattr(strategy_obj, attr_name, None) + if not isinstance(partitions, dict): + continue + values = [] + for part_value in partitions.values(): + parsed = SamplingZooProvider._parse_partition_value(part_value) + if parsed is not None: + values.append(parsed) + if values: + return np.concatenate(values) + return None + + @staticmethod + def _extract_indices_from_get_partitions(strategy_obj: Any, + data_frame: pd.DataFrame, + target: np.ndarray) -> Optional[np.ndarray]: + get_partitions = getattr(strategy_obj, 'get_partitions', None) + if get_partitions is None: + return None + + calls = ( + lambda: get_partitions(data_frame, target), + lambda: get_partitions(data_frame), + lambda: get_partitions(), + ) + partitions = None + for call in calls: + try: + partitions = call() + break + except TypeError: + continue + except Exception: + return None + + if not isinstance(partitions, dict): + return None + + values = [] + for part_value in partitions.values(): + parsed = SamplingZooProvider._parse_partition_value(part_value) + if parsed is not None: + values.append(parsed) + + if not values: + return None + + return np.concatenate(values) + + @staticmethod + def _parse_partition_value(part_value: Any) -> Optional[np.ndarray]: + if isinstance(part_value, np.ndarray) and part_value.ndim == 1 and np.issubdtype(part_value.dtype, np.number): + return part_value.astype(int) + if isinstance(part_value, (list, tuple)): + arr = np.asarray(part_value) + if arr.ndim == 1 and np.issubdtype(arr.dtype, np.number): + return arr.astype(int) + if isinstance(part_value, dict): + idx = part_value.get('indices') + if idx is not None: + arr = np.asarray(idx) + if arr.ndim == 1 and np.issubdtype(arr.dtype, np.number): + return arr.astype(int) + + for key in ('feature', 'target'): + part_data = part_value.get(key) + if isinstance(part_data, (pd.DataFrame, 
pd.Series)): + index_values = np.asarray(part_data.index) + if index_values.ndim == 1 and np.issubdtype(index_values.dtype, np.number): + return index_values.astype(int) + return None + + @staticmethod + def _inject_required_kwargs(factory: Any, + strategy_name: str, + strategy_kwargs: Dict[str, Any], + sample_size: int) -> Dict[str, Any]: + updated_kwargs = dict(strategy_kwargs) + strategy_map = getattr(factory, 'strategy_map', None) + strategy_cls = strategy_map.get(strategy_name) if isinstance(strategy_map, dict) else None + if strategy_cls is None: + return updated_kwargs + + try: + signature = inspect.signature(strategy_cls) + except (TypeError, ValueError): + return updated_kwargs + + if 'sample_size' in signature.parameters and 'sample_size' not in updated_kwargs: + updated_kwargs['sample_size'] = sample_size + + return updated_kwargs + + def _load_factory(self): + for module_name in self._SAMPLING_MODULE_CANDIDATES: + try: + module = import_module(module_name) + factory_cls = getattr(module, 'SamplingStrategyFactory', None) + if factory_cls is not None: + return factory_cls + except ModuleNotFoundError: + continue + + raise ModuleNotFoundError( + 'SamplingZoo provider is unavailable. Install optional dependencies for Sampling Zoo ' + '(for example: pip install "fedot[sampling_zoo]").' 
+        )
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
index 0c86cb7550..425eb447ee 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
@@ -122,7 +122,11 @@ def update_column_types(output_data: OutputData) -> OutputData:
     """
         Update column types after applying PCA operations
     """
-    _, n_cols = output_data.predict.shape
+    predict = np.asarray(output_data.predict)
+    n_cols = 1 if predict.ndim <= 1 else predict.shape[1]
+
+    if output_data.supplementary_data.col_type_ids is None:
+        output_data.supplementary_data.col_type_ids = {}
     output_data.supplementary_data.col_type_ids['features'] = np.array([TYPE_TO_ID[float]] * n_cols)
     return output_data
 
@@ -249,7 +253,7 @@ def transform(self, input_data: InputData) -> OutputData:
         clipped_input_data = input_data
         if self.columns_to_take is not None:
             clipped_input_data = input_data.subset_features(self.columns_to_take)
         output_data = super().transform(clipped_input_data)
 
         if self.columns_to_take is not None:
             # Get generated features from poly function
@@ -269,7 +273,12 @@ def _update_column_types(self, source_features_shape, output_data: OutputData):
         cols_number_added = output_data.predict.shape[1] - source_features_shape[1]
         if cols_number_added > 0:
             # There are new columns in the table
-            feature_type_ids = output_data.supplementary_data.col_type_ids['features']
+            if output_data.supplementary_data.col_type_ids is None:
+                output_data.supplementary_data.col_type_ids = {}
+
+            feature_type_ids = output_data.supplementary_data.col_type_ids.get('features')
+            if feature_type_ids is None:
+                feature_type_ids = np.array([TYPE_TO_ID[float]] * source_features_shape[1])
             new_types = [TYPE_TO_ID[float]] * cols_number_added
             output_data.supplementary_data.col_type_ids['features'] = np.append(feature_type_ids, new_types)
 
diff --git a/fedot/core/optimisers/objective/data_objective_eval.py b/fedot/core/optimisers/objective/data_objective_eval.py
index ddfe9cca30..f139bee5c4 100644
--- a/fedot/core/optimisers/objective/data_objective_eval.py
+++ b/fedot/core/optimisers/objective/data_objective_eval.py
@@ -66,10 +66,11 @@ def evaluate(self, graph: Pipeline) -> Fitness:
         folds_metrics = []
         for fold_id, (train_data, test_data) in enumerate(self._data_producer()):
             try:
+                train_data.supplementary_data.is_auto_preprocessed = True
                 prepared_pipeline = self.prepare_graph(graph, train_data, fold_id, self._eval_n_jobs)
             except Exception as ex:
                 self._log.warning(f'Unsuccessful pipeline fit during fitness evaluation. '
                                   f'Skipping the pipeline. Exception <{ex}> on {graph_id}')
                 if is_test_session() and not isinstance(ex, TimeoutError):
                     stack_trace = traceback.format_exc()
                     save_debug_info_for_pipeline(graph, train_data, test_data, ex, stack_trace)
diff --git a/fedot/core/pipelines/node.py b/fedot/core/pipelines/node.py
index 2f621182d7..eeb1395048 100644
--- a/fedot/core/pipelines/node.py
+++ b/fedot/core/pipelines/node.py
@@ -196,8 +196,8 @@
             OutputData: values predicted on the provided ``input_data``
         """
         self.log.debug(f'Trying to fit pipeline node with operation: {self.operation}')
 
         input_data = self._get_input_data(input_data=input_data, parent_operation='fit')
 
         if self.fitted_operation is None:
             with Timer() as t:
@@ -417,8 +419,12 @@ def _combine_parents(parent_nodes:
List[PipelineNode],
             prediction = parent.predict(input_data=input_data, predictions_cache=predictions_cache, fold_id=fold_id)
             parent_results.append(prediction)
         elif parent_operation == 'fit':
-            prediction = parent.fit(input_data=input_data, predictions_cache=predictions_cache, fold_id=fold_id)
-            parent_results.append(prediction)
+            try:
+                prediction = parent.fit(input_data=input_data, predictions_cache=predictions_cache, fold_id=fold_id)
+                parent_results.append(prediction)
+            except Exception:
+                # re-raise: retrying the identical fit call cannot succeed and duplicates side effects
+                raise
         else:
             raise ValueError("Value parent_operation should be 'fit' or 'predict'")
         if input_data is None:
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index bbb0bfe13f..5c0f4e7639 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -74,6 +74,7 @@ def convert_data_for_fit(self, data: InputData):
         """ If column contain several data types - perform correction procedure """
         # Convert features to have an ability to insert str into float table or vice versa
         data.features = data.features.astype(object)
+        data.target = self._ensure_2d_target_table(data.target)
 
         # Determine types for each column in features and target if it is necessary
         self.features_columns_info = define_column_types(data.features)
@@ -117,6 +118,7 @@ def convert_data_for_predict(self, data: InputData):
         data.features = self.remove_incorrect_features(data.features, self.features_converted_columns)
         data.features = apply_type_transformation(data.features, self.feature_type_ids, self.log)
         if data.target is not None:
+            data.target = self._ensure_2d_target_table(data.target)
             data.target = apply_type_transformation(data.target, self.target_type_ids, self.log)
         data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features,
                                                                               target=data.target,
@@ -156,6 +158,7 @@ def 
target_types_converting(self, target: np.ndarray, task: Task) -> np.ndarray: :param target: tabular target array :param task: task to solve """ + target = self._ensure_2d_target_table(target) mixed_types_columns = _find_mixed_types_columns(self.target_columns_info) cols_with_strings = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER]) cols_with_strings.apply(self._convert_target_into_one_type, target=target, task=task) @@ -172,6 +175,7 @@ def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = self.features_columns_info = define_column_types(predictors) predictors = self.feature_types_converting(features=predictors) if self.target_columns_info.empty and task.task_type is not TaskTypesEnum.ts_forecasting: + target = self._ensure_2d_target_table(target) self.target_columns_info = define_column_types(target) target = self.target_types_converting(target=target, task=task) @@ -201,12 +205,30 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData): ) def _check_columns_vs_types_number(self, table: np.ndarray, col_type_ids: Sequence): - # Check if columns number correct - _, n_cols = table.shape + # Check if columns number is correct. Some branches may pass 1D tables for single-column data. 
+ table_array = np.asarray(table) + if table_array.ndim <= 1: + n_cols = 1 + else: + n_cols = table_array.shape[1] + if n_cols != len(col_type_ids): # There is an incorrect types calculation self.log.warning('Columns number and types numbers do not match.') + @staticmethod + def _ensure_2d_target_table(target: Optional[np.ndarray]) -> Optional[np.ndarray]: + """Ensure target has shape (n_samples, n_targets) to avoid 1D indexing issues.""" + if target is None: + return None + + target_array = np.asarray(target) + if target_array.ndim == 0: + return target_array.reshape(1, 1) + if target_array.ndim == 1: + return target_array.reshape(-1, 1) + return target_array + @staticmethod def _remove_pseudo_str_values_from_str_column(data: InputData, columns: pd.Index): """ Removes from truly str column all pseudo str values """ diff --git a/other_requirements/sampling_zoo.txt b/other_requirements/sampling_zoo.txt new file mode 100644 index 0000000000..e18b51504b --- /dev/null +++ b/other_requirements/sampling_zoo.txt @@ -0,0 +1,2 @@ +# Optional dependencies for Sampling Zoo integration. +# Keep empty if Sampling Zoo is installed from a private source. 
diff --git a/setup.py b/setup.py index fa898b2d5c..1af4f62618 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ def _get_requirements(req_name: str): install_requires=_get_requirements('requirements.txt'), extras_require={ key: _get_requirements(Path('other_requirements', f'{key}.txt')) - for key in ('docs', 'examples', 'extra', 'profilers') + for key in ('docs', 'examples', 'extra', 'profilers', 'sampling_zoo') }, classifiers=[ 'License :: OSI Approved :: BSD License', diff --git a/test/integration/api/test_sampling_stage_integration.py b/test/integration/api/test_sampling_stage_integration.py new file mode 100644 index 0000000000..332dfa5c87 --- /dev/null +++ b/test/integration/api/test_sampling_stage_integration.py @@ -0,0 +1,230 @@ +import numpy as np +import pytest + +from fedot import Fedot +from fedot.api.sampling_stage.executor import SamplingStageExecutor, SamplingStageOutput +from fedot.api.sampling_stage.providers import SamplingProvider, SamplingProviderResult +from fedot.core.repository.tasks import TsForecastingParams +from test.data.datasets import get_dataset + + +class StratifiedStubProvider(SamplingProvider): + def sample(self, + features: np.ndarray, + target: np.ndarray, + ratio: float, + strategy: str, + strategy_params, + random_state, + budget_seconds): + del features, strategy, strategy_params, budget_seconds + rng = np.random.default_rng(random_state) + indices = [] + + target = np.asarray(target).reshape(-1) + for label in np.unique(target): + label_idx = np.where(target == label)[0] + k = max(1, int(round(len(label_idx) * ratio))) + picked = rng.choice(label_idx, size=min(k, len(label_idx)), replace=False) + indices.extend(picked.tolist()) + + indices = np.array(sorted(set(indices)), dtype=int) + return SamplingProviderResult(sample_indices=indices, + sample_scores=None, + meta={'provider': 'stratified_stub'}) + + +def test_fit_with_sampling_config_none_preserves_default_behavior(): + train_data, _, _ = get_dataset('classification', 
n_samples=120, n_features=6, iris_dataset=False) + + model = Fedot(problem='classification', + timeout=0.1, + preset='fast_train', + max_depth=1, + max_arity=2, + sampling_config=None) + pipeline = model.fit(features=train_data) + + assert pipeline is not None + assert model.sampling_stage_metadata is None + + +def test_fit_with_sampling_enabled_reduces_train_size_and_exposes_metadata(monkeypatch): + train_data, _, _ = get_dataset('classification', n_samples=120, n_features=6, iris_dataset=False) + original_size = len(train_data.idx) + + monkeypatch.setattr(SamplingStageExecutor, + '_create_provider', + lambda *args, **kwargs: StratifiedStubProvider()) + + model = Fedot(problem='classification', + timeout=0.2, + preset='fast_train', + max_depth=1, + max_arity=2, + sampling_config={ + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.8], + 'delta_metric_threshold': 1.0, + }) + + pipeline = model.fit(features=train_data) + + assert pipeline is not None + assert model.sampling_stage_metadata is not None + assert model.sampling_stage_metadata['status'] == 'applied' + assert len(model.train_data.idx) < original_size + + +def test_fail_fast_for_unsupported_ts_task_with_sampling_stage(): + train_data, _, _ = get_dataset('ts_forecasting', validation_blocks=1, forecast_length=5) + + model = Fedot(problem='ts_forecasting', + timeout=0.1, + task_params=TsForecastingParams(forecast_length=5), + sampling_config={ + 'strategy': 'random', + 'candidate_ratios': [0.8], + 'delta_metric_threshold': 0.1, + }) + + with pytest.raises(ValueError, match='classification/regression'): + model.fit(features=train_data) + + +def test_fail_fast_when_sampling_provider_dependency_missing(monkeypatch): + train_data, _, _ = get_dataset('classification', n_samples=80, n_features=6, iris_dataset=False) + + monkeypatch.setattr(SamplingStageExecutor, + '_create_provider', + lambda *args, **kwargs: (_ for _ in ()).throw(ModuleNotFoundError('sampling zoo missing'))) + + model = 
Fedot(problem='classification', + timeout=0.2, + preset='fast_train', + max_depth=1, + max_arity=2, + sampling_config={ + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.8], + 'delta_metric_threshold': 0.1, + }) + + with pytest.raises(ModuleNotFoundError, match='sampling zoo missing'): + model.fit(features=train_data) + + +def test_sampling_stage_does_not_persist_timeout_mutation(monkeypatch): + train_data, _, _ = get_dataset('classification', n_samples=100, n_features=6, iris_dataset=False) + + def fake_sampling_stage(self): + self.params.timeout = 0.01 + self.sampling_stage_metadata = { + 'status': 'applied', + 'rows_before': len(self.train_data.idx), + 'rows_after': len(self.train_data.idx), + } + + monkeypatch.setattr(Fedot, '_run_sampling_stage_if_necessary', fake_sampling_stage) + + model = Fedot(problem='classification', + timeout=0.2, + preset='fast_train', + max_depth=1, + max_arity=2, + sampling_config={ + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.8], + 'delta_metric_threshold': 0.1, + }) + + model.fit(features=train_data) + + assert model.params.timeout == pytest.approx(0.2) + + +def test_sampling_stage_skipped_when_predefined_model(monkeypatch): + train_data, _, _ = get_dataset('classification', n_samples=100, n_features=6, iris_dataset=False) + + def should_not_run_stage(self): + raise AssertionError('sampling stage must be skipped for predefined_model') + + monkeypatch.setattr(Fedot, '_run_sampling_stage_if_necessary', should_not_run_stage) + + model = Fedot(problem='classification', + timeout=0.2, + preset='fast_train', + max_depth=1, + max_arity=2, + sampling_config={ + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.8], + 'delta_metric_threshold': 0.1, + }) + + pipeline = model.fit(features=train_data, predefined_model='rf') + + assert pipeline is not None + assert model.sampling_stage_metadata == {'status': 'skipped', 'reason': 'predefined_model'} + + +def 
test_fail_fast_for_multimodal_input_with_sampling_stage(): + from test.data.datasets import load_categorical_multidata + + data, target = load_categorical_multidata() + + model = Fedot(problem='classification', + timeout=0.2, + preset='fast_train', + max_depth=1, + max_arity=2, + sampling_config={ + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.8], + 'delta_metric_threshold': 0.1, + }) + + with pytest.raises(ValueError, match='InputData'): + model.fit(features=data, target=target) + + +def test_timeout_restored_after_sampling_stage_real_path(monkeypatch): + train_data, _, _ = get_dataset('classification', n_samples=90, n_features=6, iris_dataset=False) + + def fake_execute(self, train_data_input): + return SamplingStageOutput( + train_data=train_data_input, + metadata={ + 'status': 'applied', + 'rows_before': len(train_data_input.idx), + 'rows_after': len(train_data_input.idx), + }, + elapsed_seconds=1.0, + updated_timeout_minutes=0.01, + ) + + monkeypatch.setattr(SamplingStageExecutor, 'execute', fake_execute) + + model = Fedot(problem='classification', + timeout=0.2, + preset='fast_train', + max_depth=1, + max_arity=2, + sampling_config={ + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.8], + 'delta_metric_threshold': 0.1, + }) + + model.fit(features=train_data) + + assert model.sampling_stage_metadata is not None + assert model.sampling_stage_metadata['status'] == 'applied' + assert model.params.timeout == pytest.approx(0.2) + diff --git a/test/unit/api/test_api_params.py b/test/unit/api/test_api_params.py index ce230351b9..080c8dddea 100644 --- a/test/unit/api/test_api_params.py +++ b/test/unit/api/test_api_params.py @@ -92,3 +92,21 @@ def test_filter_params_correctly(input_params, case, correct_keys): assert output_params.keys() <= correct_keys # check all correct parameter in input params are in output params assert (input_params.keys() & correct_keys) <= output_params.keys() + + +def 
test_sampling_config_is_accepted_and_preserved(): + params_repository = get_api_params_repository(TaskTypesEnum.classification) + params = {'sampling_config': {'strategy': 'random', 'candidate_ratios': [0.2, 0.5], 'delta_metric_threshold': 0.05}} + + output_params = params_repository.check_and_set_default_params(params) + + assert 'sampling_config' in output_params + assert output_params['sampling_config']['strategy'] == 'random' + assert tuple(output_params['sampling_config']['candidate_ratios']) == (0.2, 0.5) + + +def test_sampling_config_rejects_invalid_schema_in_api_params_repository(): + params_repository = get_api_params_repository(TaskTypesEnum.classification) + + with pytest.raises(ValueError, match='Unknown keys'): + params_repository.check_and_set_default_params({'sampling_config': {'unknown': 1}}) diff --git a/test/unit/api/test_sampling_stage.py b/test/unit/api/test_sampling_stage.py new file mode 100644 index 0000000000..b68dc6c4ab --- /dev/null +++ b/test/unit/api/test_sampling_stage.py @@ -0,0 +1,191 @@ +import numpy as np +import pytest +from sklearn.datasets import make_classification + +from fedot.api.sampling_stage.config import validate_sampling_config +from fedot.api.sampling_stage.executor import SamplingStageExecutor +from fedot.api.sampling_stage.providers import SamplingProvider, SamplingProviderResult +from fedot.core.data.data import InputData +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import Task, TaskTypesEnum + + +class FirstKProvider(SamplingProvider): + def sample(self, + features: np.ndarray, + target: np.ndarray, + ratio: float, + strategy: str, + strategy_params, + random_state, + budget_seconds): + del target, strategy, strategy_params, random_state, budget_seconds + k = max(1, int(round(features.shape[0] * ratio))) + return SamplingProviderResult( + sample_indices=np.arange(k, dtype=int), + sample_scores=np.linspace(1.0, 0.0, num=k), + meta={'provider': 'stub'} + ) + + +class 
DuplicateProvider(SamplingProvider): + def sample(self, + features: np.ndarray, + target: np.ndarray, + ratio: float, + strategy: str, + strategy_params, + random_state, + budget_seconds): + del features, target, ratio, strategy, strategy_params, random_state, budget_seconds + return SamplingProviderResult(sample_indices=np.array([0, 0], dtype=int), + sample_scores=None, + meta={}) + + +def _classification_input(n_samples: int = 120, n_features: int = 8) -> InputData: + x, y = make_classification(n_samples=n_samples, + n_features=n_features, + n_informative=5, + random_state=42) + return InputData(idx=np.arange(n_samples), + features=x, + target=y, + task=Task(TaskTypesEnum.classification), + data_type=DataTypesEnum.table) + + +def test_sampling_config_rejects_unknown_keys(): + with pytest.raises(ValueError, match='Unknown keys'): + validate_sampling_config({'unknown_key': 1}) + + +def test_sampling_config_rejects_non_fail_fast_mode(): + with pytest.raises(ValueError, match='fail_fast'): + validate_sampling_config({'error_policy': 'fallback'}) + + +def test_dynamic_cap_budget_and_timeout_update(): + config = { + 'strategy': 'random', + 'candidate_ratios': [0.5], + 'delta_metric_threshold': 1.0, + 'cap_max_timeout_share': 0.4, + 'min_automl_time_minutes': 2.0, + } + executor = SamplingStageExecutor(sampling_config=config, + task_type=TaskTypesEnum.classification, + total_timeout_minutes=10.0, + provider=FirstKProvider()) + + budget = executor._compute_budget_seconds() + # min(10m * 0.4, 10m - 2m) = min(240s, 480s) + assert budget == pytest.approx(240.0) + + updated_timeout = executor._compute_updated_timeout(elapsed_seconds=120.0) + assert updated_timeout == pytest.approx(8.0) + + +def test_sampling_provider_contract_checks_indices_uniqueness(): + data = _classification_input() + config = { + 'strategy': 'random', + 'candidate_ratios': [0.5], + 'delta_metric_threshold': 1.0, + } + executor = SamplingStageExecutor(sampling_config=config, + 
task_type=TaskTypesEnum.classification, + total_timeout_minutes=5.0, + provider=DuplicateProvider()) + + with pytest.raises(ValueError, match='must be unique'): + executor.execute(data) + + +def test_effective_size_selection_on_deterministic_scores(monkeypatch): + data = _classification_input() + config = { + 'strategy': 'random', + 'candidate_ratios': [0.2, 0.5, 0.9], + 'delta_metric_threshold': 0.05, + } + executor = SamplingStageExecutor(sampling_config=config, + task_type=TaskTypesEnum.classification, + total_timeout_minutes=5.0, + provider=FirstKProvider()) + + def fake_score(self, train_data, valid_data): + del valid_data + size = len(train_data.idx) + if size >= 70: + return 1.0 + if size >= 40: + return 0.97 + return 0.8 + + monkeypatch.setattr(SamplingStageExecutor, '_score_light_model', fake_score) + + result = executor.execute(data) + assert result.metadata['selected_ratio'] == pytest.approx(0.5) + assert result.metadata['rows_after'] < result.metadata['rows_before'] + + +def test_fail_fast_when_optional_dependency_is_missing(monkeypatch): + data = _classification_input() + config = { + 'provider': 'sampling_zoo', + 'strategy': 'random', + 'candidate_ratios': [0.5], + 'delta_metric_threshold': 1.0, + } + executor = SamplingStageExecutor(sampling_config=config, + task_type=TaskTypesEnum.classification, + total_timeout_minutes=5.0) + + def missing_provider(*args, **kwargs): + raise ModuleNotFoundError('sampling zoo not installed') + + monkeypatch.setattr(SamplingStageExecutor, '_create_provider', missing_provider) + + with pytest.raises(ModuleNotFoundError): + executor.execute(data) + + +def test_sampling_config_respects_heavy_parameter_guards(): + with pytest.raises(ValueError, match='guard_max_sample_size'): + validate_sampling_config({ + 'strategy_params': {'sample_size': 1000}, + 'guard_max_sample_size': 100, + }) + + +def test_sampling_config_rejects_unsorted_candidate_ratios(): + with pytest.raises(ValueError, match='sorted in ascending order'): + 
validate_sampling_config({'candidate_ratios': [0.5, 0.2]}) + + +def test_dynamic_cap_for_infinite_timeout_uses_absolute_stage_cap(): + executor = SamplingStageExecutor( + sampling_config={ + 'strategy': 'random', + 'candidate_ratios': [0.5], + 'delta_metric_threshold': 1.0, + 'infinite_timeout_cap_minutes': 7.0, + }, + task_type=TaskTypesEnum.classification, + total_timeout_minutes=None, + provider=FirstKProvider(), + ) + + assert executor._compute_budget_seconds() == pytest.approx(420.0) + + +def test_sampling_config_rejects_non_dict_value(): + with pytest.raises(ValueError, match='dictionary or None'): + validate_sampling_config('not_a_dict') + + +def test_sampling_config_rejects_invalid_validation_size_range(): + with pytest.raises(ValueError, match='validation_size'): + validate_sampling_config({'validation_size': 1.0}) + diff --git a/test/unit/api/test_sampling_stage_provider.py b/test/unit/api/test_sampling_stage_provider.py new file mode 100644 index 0000000000..5f41c773e5 --- /dev/null +++ b/test/unit/api/test_sampling_stage_provider.py @@ -0,0 +1,20 @@ +import numpy as np +import pandas as pd + +from fedot.api.sampling_stage.providers import SamplingZooProvider + + +def test_parse_partition_value_from_indices_dict(): + parsed = SamplingZooProvider._parse_partition_value({'indices': [1, 3, 5]}) + assert np.array_equal(parsed, np.array([1, 3, 5])) + + +def test_parse_partition_value_from_feature_dataframe_index(): + frame = pd.DataFrame({'a': [10, 20]}, index=[4, 7]) + parsed = SamplingZooProvider._parse_partition_value({'feature': frame}) + assert np.array_equal(parsed, np.array([4, 7])) + + +def test_parse_partition_value_from_list(): + parsed = SamplingZooProvider._parse_partition_value([0, 2, 6]) + assert np.array_equal(parsed, np.array([0, 2, 6])) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index 9529b33316..698208a19f 100644 --- 
a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -12,7 +12,7 @@ from fedot.core.operations.evaluation.operation_implementations.data_operations.sklearn_imbalanced_class import \ ResampleImplementation from fedot.core.operations.evaluation.operation_implementations.data_operations. \ - sklearn_transformations import ImputationImplementation + sklearn_transformations import ImputationImplementation, PCAImplementation, PolyFeaturesImplementation from fedot.core.operations.evaluation.operation_implementations.data_operations.ts_transformations import \ CutImplementation, LaggedTransformationImplementation from fedot.core.operations.operation_parameters import OperationParameters @@ -658,3 +658,55 @@ def test_correctness_resample_operation_with_dynamic_replace_param(strategy, bal resample.transform_for_fit(data) assert resample.replace == expected + + +def test_pca_transform_initializes_feature_types_when_absent(): + input_data = InputData( + idx=np.arange(0, 8), + features=np.array([ + [0.1, 1.0, 2.0], + [0.2, 0.9, 2.1], + [0.3, 1.2, 1.9], + [0.4, 1.1, 2.2], + [0.5, 0.8, 2.3], + [0.6, 1.3, 2.4], + [0.7, 1.4, 2.5], + [0.8, 1.5, 2.6], + ], dtype=float), + target=np.array([[0], [1], [0], [1], [0], [1], [0], [1]]), + task=Task(TaskTypesEnum.classification), + data_type=DataTypesEnum.table, + ) + + operation = PCAImplementation(OperationParameters(n_components=2)) + operation.fit(input_data) + transformed = operation.transform(input_data) + + assert transformed.supplementary_data.col_type_ids is not None + assert 'features' in transformed.supplementary_data.col_type_ids + assert transformed.supplementary_data.col_type_ids['features'].shape[0] == transformed.predict.shape[1] + + +def test_poly_features_transform_initializes_feature_types_when_absent(): + input_data = InputData( + idx=np.arange(0, 6), + features=np.array([ + [1.0, 2.0, 3.0], + [2.0, 3.0, 4.0], + [3.0, 4.0, 5.0], + 
[4.0, 5.0, 6.0], + [5.0, 6.0, 7.0], + [6.0, 7.0, 8.0], + ], dtype=float), + target=np.array([[0], [1], [0], [1], [0], [1]]), + task=Task(TaskTypesEnum.classification), + data_type=DataTypesEnum.table, + ) + + operation = PolyFeaturesImplementation(OperationParameters(degree=2, interaction_only=False)) + operation.fit(input_data) + transformed = operation.transform(input_data) + + assert transformed.supplementary_data.col_type_ids is not None + assert 'features' in transformed.supplementary_data.col_type_ids + assert transformed.supplementary_data.col_type_ids['features'].shape[0] == transformed.predict.shape[1] diff --git a/test/unit/examples/test_amlb_sampling_benchmark.py b/test/unit/examples/test_amlb_sampling_benchmark.py new file mode 100644 index 0000000000..42f469d633 --- /dev/null +++ b/test/unit/examples/test_amlb_sampling_benchmark.py @@ -0,0 +1,101 @@ +import argparse + +import numpy as np +import pandas as pd +import pytest + +from examples.benchmark.run_amlb import ( + _build_config_from_args, + _json_ready, + _resolve_dataset_specs, + _sanitize_features_for_fedot, + parse_ratio_list, +) + + +def test_parse_ratio_list_sorts_and_deduplicates_values(): + ratios = parse_ratio_list('0.3, 0.1,0.3,0.2') + assert ratios == (0.1, 0.2, 0.3) + + +def test_parse_ratio_list_rejects_invalid_range(): + with pytest.raises(ValueError, match='Candidate ratio'): + parse_ratio_list('0.0,0.2') + + +def test_resolve_dataset_specs_from_category_profile(): + specs = _resolve_dataset_specs(dataset_names=(), amlb_categories=('small_samples_many_classes',)) + + assert len(specs) > 0 + assert specs[0].name.startswith('amlb_') + + +def test_resolve_dataset_specs_rejects_unknown_dataset_name(): + with pytest.raises(ValueError, match='Unknown AMLB dataset profile'): + _resolve_dataset_specs(dataset_names=('amlb_unknown_dataset',), amlb_categories=()) + + +def test_json_ready_converts_numpy_scalars_and_arrays(): + payload = { + 'arr': np.array([1, 2]), + 'int': np.int64(10), + 
'float': np.float64(1.5), + 'bool': np.bool_(True), + } + + converted = _json_ready(payload) + + assert converted['arr'] == [1, 2] + assert converted['int'] == 10 + assert converted['float'] == 1.5 + assert converted['bool'] is True + + +def test_build_config_from_args_uses_15_min_default_and_sampling_values(): + args = argparse.Namespace( + datasets=[], + amlb_categories=['amlb_top20_mix'], + timeout_minutes=15.0, + seed=7, + n_jobs=-1, + preset='best_quality', + disable_tuning=False, + max_rows=12345, + output_root='examples/benchmark/results', + disable_baseline=False, + disable_sampling=False, + sampling_strategy='random', + sampling_strategy_params_json='{"rank": 16}', + candidate_ratios='0.5,0.2', + delta_threshold=0.02, + cap_max_timeout_share=0.25, + ) + + config = _build_config_from_args(args) + + assert config.timeout_minutes_per_dataset == pytest.approx(15.0) + assert config.sampling_config['random_state'] == 7 + assert config.sampling_config['strategy'] == 'random' + assert config.sampling_config['candidate_ratios'] == [0.2, 0.5] + assert config.sampling_config['strategy_params'] == {'rank': 16} + + +def test_sanitize_features_replaces_pandas_na_values_for_fedot_compatibility(): + frame = pd.DataFrame({ + 'num_feature': [1.0, pd.NA, np.nan, 4.0], + 'cat_feature': ['a', pd.NA, 'b', None], + }) + + sanitized = _sanitize_features_for_fedot( + features=frame, + numeric_columns=['num_feature'], + categorical_columns=['cat_feature'], + ) + + assert int(sanitized['num_feature'].isna().sum()) == 0 + assert int(sanitized['cat_feature'].isna().sum()) == 0 + assert np.issubdtype(sanitized['cat_feature'].dtype, np.integer) + + unique_values = np.unique(sanitized['cat_feature'].to_numpy()) + assert len(unique_values) >= 2 + assert all(dtype.kind in {'i', 'u', 'f'} for dtype in sanitized.dtypes) \ No newline at end of file From 5a6c43e999936f49b4f5654104dff96487d892f9 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 11:53:14 +0300 Subject: [PATCH 02/32] 
add extension contract and mirrored tests scaffold --- docs/dev/fp_refactoring_plan.md | 36 +++++++ fedot/extensions/__init__.py | 39 +++++++ fedot/extensions/contracts.py | 52 ++++++++++ fedot/extensions/registry.py | 165 ++++++++++++++++++++++++++++++ requirements.txt | 1 + tests/__init__.py | 1 + tests/extensions/__init__.py | 1 + tests/extensions/test_registry.py | 115 +++++++++++++++++++++ 8 files changed, 410 insertions(+) create mode 100644 docs/dev/fp_refactoring_plan.md create mode 100644 fedot/extensions/__init__.py create mode 100644 fedot/extensions/contracts.py create mode 100644 fedot/extensions/registry.py create mode 100644 tests/__init__.py create mode 100644 tests/extensions/__init__.py create mode 100644 tests/extensions/test_registry.py diff --git a/docs/dev/fp_refactoring_plan.md b/docs/dev/fp_refactoring_plan.md new file mode 100644 index 0000000000..4eece195d7 --- /dev/null +++ b/docs/dev/fp_refactoring_plan.md @@ -0,0 +1,36 @@ +# План OOP-first refactoring с подготовкой к FP-informed архитектуре + +## Summary + +Первая волна рефакторинга сохраняет ключевые OOP-абстракции в `fedot/api` и `fedot/core` как публичный и координирующий слой, но выносит вычислительную, валидационную и selection-логику в pure core. Идея не в том, чтобы “сломать” существующий `Facade/Builder/Composite/Strategy` дизайн, а в том, чтобы сделать его тоньше, типобезопаснее и лучше совместимым с дальнейшей FP-интеграцией. + +## OOP boundaries to preserve + +- В `fedot/api` сохраняются `Fedot`, `FedotBuilder`, `ApiDataProcessor`, `ApiComposer`, `PredefinedModel`, `ApiParamsRepository`, `ApiParams`, `InputAnalyser`, assumptions/preset/filter builders и handlers как OOP-координаторы и boundary-объекты. +- В `fedot/core` сохраняются `PipelineNode`, `Pipeline`, `PipelineBuilder`, `PipelineTemplate`, `PipelineAdapter`, factory-слой, operation hierarchy, `EvaluationStrategy`, `Composer`, `ComposerBuilder`, objective/splitter abstractions. 
+- Правило рефакторинга: классы владеют lifecycle и orchestration, а правила выбора, валидация, трансформации и фильтрация выносятся в typed pure modules. + +## First-wave implementation focus + +1. Стабилизировать OOP API-слой через typed requests/results/specs без ломки `Facade/Builder`. +2. Вынести assumptions/preset/filter rules в отдельный pure core при сохранении текущих strategy/builder классов. +3. Выделить preprocessing plan/state и сократить неявный mutable state внутри preprocessor-а. +4. Разделить repository IO и pure parsing/filtering/query logic. +5. Ввести единый extension contract для внешних моделей без правки нескольких внутренних конфигов. +6. Переписать remote config parsing на безопасную typed модель без `eval` и sentinel `'None'`. + +## External model contract + +- Канонические сущности: `ExtensionManifest`, `ExternalModelSpec`, `ModelCapabilities`, `ModelFactory`, `ModelHyperparamsSchema`, `ExtensionError`. +- Канонический путь интеграции: + `create manifest -> validate/register -> smoke test`. +- Новый contract должен быть OOP-friendly для пользователей и LLM-agent-friendly для автоматизации. +- Legacy JSON-репозитории остаются поддерживаемым boundary-слоем, но не рекомендуемым основным механизмом расширения. + +## Test strategy + +- Новая каноническая тестовая структура: `tests/`, зеркалящая `fedot/`. +- Тип теста выражается через pytest markers, а не через имя директории. +- Для OOP-координаторов обязательны service/facade tests. +- Для pure collaborators обязательны unit/property tests. +- Первые mirrored-кластеры: `tests/extensions`, затем `tests/api`, `tests/core`, `tests/preprocessing`, `tests/remote`. 
diff --git a/fedot/extensions/__init__.py b/fedot/extensions/__init__.py new file mode 100644 index 0000000000..0cc960c332 --- /dev/null +++ b/fedot/extensions/__init__.py @@ -0,0 +1,39 @@ +from fedot.extensions.contracts import ( + ExtensionError, + ExtensionManifest, + ExternalModelSpec, + ModelCapabilities, + ModelFactory, + ModelHyperparamsSchema, + RegisteredExtension, +) +from fedot.extensions.registry import ( + clear_extension_registry, + discover_extensions, + get_registered_extension, + get_registered_extensions, + load_extension_manifest, + register_extension, + smoke_test_extension, + validate_extension_manifest, + validate_external_model_spec, +) + +__all__ = [ + 'ExtensionError', + 'ExtensionManifest', + 'ExternalModelSpec', + 'ModelCapabilities', + 'ModelFactory', + 'ModelHyperparamsSchema', + 'RegisteredExtension', + 'clear_extension_registry', + 'discover_extensions', + 'get_registered_extension', + 'get_registered_extensions', + 'load_extension_manifest', + 'register_extension', + 'smoke_test_extension', + 'validate_extension_manifest', + 'validate_external_model_spec', +] diff --git a/fedot/extensions/contracts.py b/fedot/extensions/contracts.py new file mode 100644 index 0000000000..3b3917866d --- /dev/null +++ b/fedot/extensions/contracts.py @@ -0,0 +1,52 @@ +from dataclasses import dataclass, field +from typing import Any, Callable, Dict, Optional, Tuple + +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import TaskTypesEnum + +ModelFactory = Callable[[Optional[Dict[str, Any]]], Any] + + +@dataclass(frozen=True) +class ModelHyperparamsSchema: + required: Tuple[str, ...] = () + optional: Tuple[str, ...] = () + defaults: Dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class ModelCapabilities: + tasks: Tuple[TaskTypesEnum, ...] + data_types: Tuple[DataTypesEnum, ...] + tags: Tuple[str, ...] 
= () + supports_multimodal: bool = False + + +@dataclass(frozen=True) +class ExternalModelSpec: + name: str + factory: ModelFactory + capabilities: ModelCapabilities + hyperparams_schema: ModelHyperparamsSchema = field(default_factory=ModelHyperparamsSchema) + description: str = '' + + +@dataclass(frozen=True) +class ExtensionManifest: + name: str + version: str + models: Tuple[ExternalModelSpec, ...] + module: Optional[str] = None + description: str = '' + + +@dataclass(frozen=True) +class ExtensionError: + code: str + message: str + details: Dict[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class RegisteredExtension: + manifest: ExtensionManifest diff --git a/fedot/extensions/registry.py b/fedot/extensions/registry.py new file mode 100644 index 0000000000..64dfbd1e86 --- /dev/null +++ b/fedot/extensions/registry.py @@ -0,0 +1,165 @@ +import importlib +import inspect +from typing import Dict, Iterable, Tuple + +from pymonad.either import Left, Right +from pymonad.maybe import Just, Nothing + +from fedot.extensions.contracts import ( + ExtensionError, + ExtensionManifest, + ExternalModelSpec, + RegisteredExtension, +) + +_REGISTERED_EXTENSIONS: Dict[str, ExtensionManifest] = {} + + +def validate_extension_manifest(manifest: ExtensionManifest): + if not isinstance(manifest, ExtensionManifest): + return Left(ExtensionError(code='invalid_manifest_type', + message='Extension manifest must be an ExtensionManifest instance.')) + + if not manifest.name.strip(): + return Left(ExtensionError(code='empty_extension_name', + message='Extension manifest name must be non-empty.')) + + if not manifest.version.strip(): + return Left(ExtensionError(code='empty_extension_version', + message='Extension manifest version must be non-empty.')) + + if not manifest.models: + return Left(ExtensionError(code='empty_models', + message='Extension manifest must expose at least one model.')) + + seen_names = set() + for model in manifest.models: + model_validation = 
validate_external_model_spec(model) + if model_validation.__class__ is Left: + return model_validation + if model.name in seen_names: + return Left(ExtensionError(code='duplicate_model_name', + message=f'Duplicate model name "{model.name}" in extension manifest.', + details={'extension': manifest.name})) + seen_names.add(model.name) + + return Right(manifest) + + +def validate_external_model_spec(model: ExternalModelSpec): + if not isinstance(model, ExternalModelSpec): + return Left(ExtensionError(code='invalid_model_spec_type', + message='External model spec must be an ExternalModelSpec instance.')) + + if not model.name.strip(): + return Left(ExtensionError(code='empty_model_name', + message='External model name must be non-empty.')) + + if not callable(model.factory): + return Left(ExtensionError(code='invalid_model_factory', + message=f'Factory for model "{model.name}" must be callable.')) + + if not model.capabilities.tasks: + return Left(ExtensionError(code='empty_model_tasks', + message=f'Model "{model.name}" must declare supported tasks.')) + + if not model.capabilities.data_types: + return Left(ExtensionError(code='empty_model_data_types', + message=f'Model "{model.name}" must declare supported data types.')) + + return Right(model) + + +def register_extension(manifest: ExtensionManifest): + validation = validate_extension_manifest(manifest) + if validation.__class__ is Left: + return validation + + if manifest.name in _REGISTERED_EXTENSIONS: + return Left(ExtensionError(code='duplicate_extension', + message=f'Extension "{manifest.name}" is already registered.')) + + _REGISTERED_EXTENSIONS[manifest.name] = manifest + return Right(RegisteredExtension(manifest=manifest)) + + +def get_registered_extensions() -> Tuple[RegisteredExtension, ...]: + return tuple(RegisteredExtension(manifest=manifest) for manifest in _REGISTERED_EXTENSIONS.values()) + + +def get_registered_extension(extension_name: str): + manifest = _REGISTERED_EXTENSIONS.get(extension_name) + if 
manifest is None: + return Nothing + return Just(RegisteredExtension(manifest=manifest)) + + +def clear_extension_registry() -> None: + _REGISTERED_EXTENSIONS.clear() + + +def load_extension_manifest(module_name: str): + try: + module = importlib.import_module(module_name) + except Exception as ex: + return Left(ExtensionError(code='module_import_failed', + message=f'Unable to import extension module "{module_name}".', + details={'exception': str(ex)})) + + manifest = getattr(module, 'FEDOT_EXTENSION_MANIFEST', None) + if manifest is None: + return Left(ExtensionError(code='manifest_not_found', + message=f'Extension module "{module_name}" must expose FEDOT_EXTENSION_MANIFEST.')) + + if manifest.module is None: + manifest = ExtensionManifest(name=manifest.name, + version=manifest.version, + models=manifest.models, + module=module_name, + description=manifest.description) + return validate_extension_manifest(manifest) + + +def discover_extensions(module_names: Iterable[str]): + manifests = [] + for module_name in module_names: + loaded = load_extension_manifest(module_name) + if loaded.__class__ is Left: + return loaded + manifests.append(loaded.value) + return Right(tuple(manifests)) + + +def smoke_test_extension(manifest: ExtensionManifest): + validation = validate_extension_manifest(manifest) + if validation.__class__ is Left: + return validation + + for model in manifest.models: + signature = inspect.signature(model.factory) + positional_required = [ + parameter for parameter in signature.parameters.values() + if parameter.default is inspect._empty + and parameter.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD) + ] + if len(positional_required) > 1: + return Left(ExtensionError( + code='invalid_factory_signature', + message=f'Factory for model "{model.name}" must accept zero or one positional argument.', + details={'required_args': [parameter.name for parameter in positional_required]}, + )) + + try: + instance = 
model.factory(None) + except TypeError: + instance = model.factory() + except Exception as ex: + return Left(ExtensionError(code='factory_smoke_test_failed', + message=f'Factory smoke test failed for model "{model.name}".', + details={'exception': str(ex)})) + + if instance is None: + return Left(ExtensionError(code='factory_returned_none', + message=f'Factory for model "{model.name}" returned None.')) + + return Right(manifest) diff --git a/requirements.txt b/requirements.txt index d0fdbbc334..07524fe92e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,6 +39,7 @@ typing>=3.7.0 psutil>=5.9.2 fsspec>=2024; python_version > '3.8' fsspec>=2024,<=2025.3.0; python_version <= '3.8' +pymonad @ git+https://github.com/jasondelaat/pymonad.git # Tests pytest>=6.2.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/extensions/__init__.py b/tests/extensions/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/tests/extensions/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/extensions/test_registry.py b/tests/extensions/test_registry.py new file mode 100644 index 0000000000..572e204200 --- /dev/null +++ b/tests/extensions/test_registry.py @@ -0,0 +1,115 @@ +import sys +import types + +from pymonad.either import Left, Right +from pymonad.maybe import Just, Nothing + +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import TaskTypesEnum +from fedot.extensions import ( + ExtensionManifest, + ExternalModelSpec, + ModelCapabilities, + clear_extension_registry, + discover_extensions, + get_registered_extension, + get_registered_extensions, + register_extension, + smoke_test_extension, + validate_extension_manifest, +) + + +def _dummy_factory(_params=None): + return object() + + +def _build_manifest(name='demo_extension', model_name='demo_model'): + return ExtensionManifest( + 
name=name, + version='0.1.0', + models=( + ExternalModelSpec( + name=model_name, + factory=_dummy_factory, + capabilities=ModelCapabilities( + tasks=(TaskTypesEnum.classification,), + data_types=(DataTypesEnum.table,), + tags=('demo',), + ), + description='demo external model', + ), + ), + description='demo extension', + ) + + +def setup_function(): + clear_extension_registry() + + +def teardown_function(): + clear_extension_registry() + + +def test_validate_extension_manifest_returns_right_for_valid_manifest(): + result = validate_extension_manifest(_build_manifest()) + + assert result.__class__ is Right + + +def test_register_extension_stores_manifest_and_returns_maybe_lookup(): + manifest = _build_manifest() + + result = register_extension(manifest) + + assert result.__class__ is Right + assert len(get_registered_extensions()) == 1 + assert get_registered_extension('demo_extension').__class__ is Just + assert get_registered_extension('missing_extension').__class__ is Nothing + + +def test_register_extension_rejects_duplicate_extension_name(): + manifest = _build_manifest() + register_extension(manifest) + + duplicate_result = register_extension(manifest) + + assert duplicate_result.__class__ is Left + assert duplicate_result.value.code == 'duplicate_extension' + + +def test_smoke_test_extension_rejects_factory_returning_none(): + manifest = ExtensionManifest( + name='broken_extension', + version='0.1.0', + models=( + ExternalModelSpec( + name='broken_model', + factory=lambda _params=None: None, + capabilities=ModelCapabilities( + tasks=(TaskTypesEnum.regression,), + data_types=(DataTypesEnum.table,), + ), + ), + ), + ) + + result = smoke_test_extension(manifest) + + assert result.__class__ is Left + assert result.value.code == 'factory_returned_none' + + +def test_discover_extensions_loads_manifest_from_module(): + module_name = 'tests.extensions.fake_extension_module' + module = types.ModuleType(module_name) + module.FEDOT_EXTENSION_MANIFEST = 
_build_manifest(name='module_extension') + sys.modules[module_name] = module + + try: + result = discover_extensions((module_name,)) + assert result.__class__ is Right + assert result.value[0].module == module_name + finally: + del sys.modules[module_name] From e1d1ea1ecf09715ca7f6649a8e6f6012ab0107a6 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:00:19 +0300 Subject: [PATCH 03/32] make remote pipeline config parsing safe and typed --- fedot/remote/pipeline_run_config.py | 311 ++++++++++++++++++++--- fedot/remote/run_pipeline.py | 5 +- tests/remote/__init__.py | 1 + tests/remote/test_pipeline_run_config.py | 86 +++++++ 4 files changed, 359 insertions(+), 44 deletions(-) create mode 100644 tests/remote/__init__.py create mode 100644 tests/remote/test_pipeline_run_config.py diff --git a/fedot/remote/pipeline_run_config.py b/fedot/remote/pipeline_run_config.py index dc3ca8ea7a..6ebfc7c5f4 100644 --- a/fedot/remote/pipeline_run_config.py +++ b/fedot/remote/pipeline_run_config.py @@ -1,65 +1,296 @@ import ast import configparser import os -from typing import Union +import re +from dataclasses import dataclass +from typing import Any, Dict, Optional, Union + +from pymonad.either import Left, Right from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams from fedot.core.utils import fedot_project_root -tmp_task = Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=1)) +_TASK_PATTERN = re.compile( + r'^Task\(TaskTypesEnum\.(?P[a-z_]+)' + r'(?:,\s*TsForecastingParams\(forecast_length=(?P\d+)\))?\)$' +) + + +@dataclass(frozen=True) +class PipelineRunConfigError: + code: str + message: str + details: Dict[str, Any] + + +@dataclass(frozen=True) +class PipelineRunConfigPayload: + pipeline_template: str + input_data: str + task: Task + output_path: str + train_data_idx: Optional[list] = None + is_multi_modal: bool = False + var_names: Optional[list] = None + target: Optional[Union[str, list]] = None + test_data_path: 
Optional[str] = None class PipelineRunConfig: """ - Quasi-dataclass for the input parameters of external pipeline fitting + OOP config object for external pipeline fitting with an immutable typed payload inside. """ - def __init__(self, config=None): - if config is None: - return + def __init__(self, payload: Optional[PipelineRunConfigPayload] = None): + self._payload = payload - self.pipeline_template = config['DEFAULT']['pipeline_template'] - self.input_data = config['DEFAULT']['train_data'] - self.task = eval(config['DEFAULT']['task']) - self.output_path = config['DEFAULT']['output_path'] + @classmethod + def try_from_dict(cls, config_dict: Dict[str, Dict[str, str]]): + payload_result = parse_pipeline_run_config_dict(config_dict) + if payload_result.__class__ is Left: + return payload_result + return Right(cls(payload_result.value)) - self.train_data_idx = None - if config['DEFAULT']['train_data_idx'] != 'None': - self.train_data_idx = ast.literal_eval(config['DEFAULT']['train_data_idx']) + @classmethod + def from_dict(cls, config_dict: Dict[str, Dict[str, str]]): + result = cls.try_from_dict(config_dict) + if result.__class__ is Left: + raise ValueError(result.value.message) + return result.value - self.is_multi_modal = False - if config['DEFAULT']['is_multi_modal'] != 'None': - self.is_multi_modal = ast.literal_eval(config['DEFAULT']['is_multi_modal']) + @classmethod + def try_from_parser(cls, config: configparser.ConfigParser): + sections_result = _config_parser_to_dict(config) + if sections_result.__class__ is Left: + return sections_result + return cls.try_from_dict(sections_result.value) - self.var_names = False - if config['DEFAULT']['var_names'] != 'None': - self.var_names = ast.literal_eval(config['DEFAULT']['var_names']) + @classmethod + def from_parser(cls, config: configparser.ConfigParser): + result = cls.try_from_parser(config) + if result.__class__ is Left: + raise ValueError(result.value.message) + return result.value - self.target = None - if 
'target' in config['DEFAULT'] and config['DEFAULT']['target'] != 'None': - try: - # list of target values - self.target = ast.literal_eval(config['DEFAULT']['target']) - except ValueError: - # name of target column - self.target = config['DEFAULT']['target'] + @classmethod + def try_from_file(cls, file: Union[str, bytes]): + parser_result = _read_config_parser(file) + if parser_result.__class__ is Left: + return parser_result + return cls.try_from_parser(parser_result.value) - self.test_data_path = config['OPTIONAL'].get('test_data') + @classmethod + def from_file(cls, file: Union[str, bytes]): + result = cls.try_from_file(file) + if result.__class__ is Left: + raise ValueError(result.value.message) + return result.value def load_from_file(self, file: Union[str, bytes]): - config = configparser.ConfigParser() + return type(self).from_file(file) + + @property + def pipeline_template(self) -> str: + return self._require_payload().pipeline_template + + @property + def input_data(self) -> str: + return self._require_payload().input_data + + @property + def task(self) -> Task: + return self._require_payload().task + + @property + def output_path(self) -> str: + return self._require_payload().output_path + + @property + def train_data_idx(self) -> Optional[list]: + return self._require_payload().train_data_idx + + @property + def is_multi_modal(self) -> bool: + return self._require_payload().is_multi_modal + + @property + def var_names(self) -> Optional[list]: + return self._require_payload().var_names + + @property + def target(self) -> Optional[Union[str, list]]: + return self._require_payload().target + + @property + def test_data_path(self) -> Optional[str]: + return self._require_payload().test_data_path + + def as_payload(self) -> PipelineRunConfigPayload: + return self._require_payload() + + def _require_payload(self) -> PipelineRunConfigPayload: + if self._payload is None: + raise ValueError('PipelineRunConfig payload is not initialized.') + return self._payload 
+ + +def parse_pipeline_run_config_dict(config_dict: Dict[str, Dict[str, str]]): + default_section = config_dict.get('DEFAULT') + optional_section = config_dict.get('OPTIONAL', {}) + if default_section is None: + return Left(_error('missing_default_section', 'Config must contain DEFAULT section.')) + + required_fields = ('pipeline_template', 'train_data', 'task', 'output_path') + for field in required_fields: + if field not in default_section: + return Left(_error('missing_required_field', + f'Config DEFAULT section must contain "{field}".', + field=field)) + + task_result = _parse_task(default_section['task']) + if task_result.__class__ is Left: + return task_result + + train_data_idx_result = _parse_optional_literal(default_section.get('train_data_idx'), 'train_data_idx') + if train_data_idx_result.__class__ is Left: + return train_data_idx_result + + is_multi_modal_result = _parse_optional_bool(default_section.get('is_multi_modal'), default=False) + if is_multi_modal_result.__class__ is Left: + return is_multi_modal_result + + var_names_result = _parse_optional_literal(default_section.get('var_names'), 'var_names') + if var_names_result.__class__ is Left: + return var_names_result + + target_result = _parse_target(default_section.get('target')) + if target_result.__class__ is Left: + return target_result + + input_data = _expand_base_path(default_section['train_data']) + test_data_path = _expand_base_path(optional_section.get('test_data')) if optional_section else None + + payload = PipelineRunConfigPayload( + pipeline_template=default_section['pipeline_template'], + input_data=input_data, + task=task_result.value, + output_path=default_section['output_path'], + train_data_idx=train_data_idx_result.value, + is_multi_modal=is_multi_modal_result.value, + var_names=var_names_result.value, + target=target_result.value, + test_data_path=test_data_path, + ) + return Right(payload) + + +def _config_parser_to_dict(config: configparser.ConfigParser): + if 'DEFAULT' not 
in config: + return Left(_error('missing_default_section', 'Config must contain DEFAULT section.')) + + return Right({ + 'DEFAULT': dict(config['DEFAULT']), + 'OPTIONAL': dict(config['OPTIONAL']) if 'OPTIONAL' in config else {}, + }) + + +def _read_config_parser(file: Union[str, bytes]): + config = configparser.ConfigParser() + + if isinstance(file, bytes): + config.read_string(file.decode('utf-8')) + return Right(config) + + if not os.path.exists(file): + return Left(_error('config_not_found', 'Config not found.', path=file)) + + config.read(file, encoding='utf-8') + return Right(config) + + +def _parse_task(raw_task: str): + if not isinstance(raw_task, str): + return Left(_error('invalid_task_type', 'Task field must be a string representation.')) + + match = _TASK_PATTERN.fullmatch(raw_task.strip()) + if match is None: + return Left(_error('unsupported_task_format', + 'Task field must use supported Task(TaskTypesEnum.*) format.', + task=raw_task)) + + task_type_name = match.group('task_type') + try: + task_type = TaskTypesEnum(task_type_name) + except ValueError: + return Left(_error('unknown_task_type', 'Unknown task type in config.', task_type=task_type_name)) + + forecast_length = match.group('forecast_length') + if forecast_length is None: + return Right(Task(task_type)) + + return Right(Task(task_type, TsForecastingParams(forecast_length=int(forecast_length)))) + + +def _parse_optional_literal(raw_value: Optional[str], field_name: str): + if raw_value is None: + return Right(None) + + normalized = raw_value.strip() + if normalized in ('', 'None'): + return Right(None) + + try: + return Right(ast.literal_eval(normalized)) + except (ValueError, SyntaxError) as ex: + return Left(_error('invalid_literal', + f'Field "{field_name}" must be a valid Python literal.', + field=field_name, + exception=str(ex))) + + +def _parse_optional_bool(raw_value: Optional[str], default: bool): + if raw_value is None: + return Right(default) + + normalized = raw_value.strip() + if 
normalized in ('', 'None'): + return Right(default) + + try: + value = ast.literal_eval(normalized) + except (ValueError, SyntaxError): + value = normalized + + if isinstance(value, bool): + return Right(value) + + if isinstance(value, str) and value.lower() in ('true', 'false'): + return Right(value.lower() == 'true') + + return Left(_error('invalid_bool', 'Boolean field must be True/False.', value=raw_value)) + + +def _parse_target(raw_value: Optional[str]): + if raw_value is None: + return Right(None) + + normalized = raw_value.strip() + if normalized in ('', 'None'): + return Right(None) + + try: + return Right(ast.literal_eval(normalized)) + except (ValueError, SyntaxError): + return Right(normalized) - if isinstance(file, bytes): - config.read_string(file.decode('utf-8')) - else: - if not os.path.exists(file): - raise ValueError('Config not found') - config.read(file, encoding='utf-8') - processed_config = PipelineRunConfig(config) +def _expand_base_path(path: Optional[str]) -> Optional[str]: + if path is None: + return None + if '{fedot_base_path}' not in path: + return path + return path.format(fedot_base_path=fedot_project_root()) - if '{fedot_base_path}' in processed_config.input_data: - processed_config.input_data = \ - processed_config.input_data.format(fedot_base_path=fedot_project_root()) - return processed_config +def _error(code: str, message: str, **details: Any) -> PipelineRunConfigError: + return PipelineRunConfigError(code=code, message=message, details=details) diff --git a/fedot/remote/run_pipeline.py b/fedot/remote/run_pipeline.py index 6abb91d190..51fcfed4b6 100644 --- a/fedot/remote/run_pipeline.py +++ b/fedot/remote/run_pipeline.py @@ -48,10 +48,7 @@ def _load_data(config): def fit_pipeline(config_file: Union[str, bytes], save_pipeline: bool = True) -> bool: logger = default_log(prefix='pipeline_fitting_logger') - config = \ - PipelineRunConfig().load_from_file(config_file) - - verifier = verifier_for_task(config.task.task_type) + config = 
PipelineRunConfig.from_file(config_file) pipeline = pipeline_from_json(config.pipeline_template) diff --git a/tests/remote/__init__.py b/tests/remote/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/tests/remote/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/remote/test_pipeline_run_config.py b/tests/remote/test_pipeline_run_config.py new file mode 100644 index 0000000000..abb3b34ffd --- /dev/null +++ b/tests/remote/test_pipeline_run_config.py @@ -0,0 +1,86 @@ +import configparser + +import pytest +from pymonad.either import Left, Right + +from fedot.core.repository.tasks import TaskTypesEnum, TsForecastingParams +from fedot.remote.pipeline_run_config import PipelineRunConfig, parse_pipeline_run_config_dict + + +def _base_config(task='Task(TaskTypesEnum.classification)'): + return { + 'DEFAULT': { + 'pipeline_template': '{}', + 'train_data': '{fedot_base_path}/test/data/advanced_classification.csv', + 'task': task, + 'output_path': './out', + 'train_data_idx': '[1, 2, 3]', + 'is_multi_modal': 'False', + 'var_names': 'None', + }, + 'OPTIONAL': {}, + } + + +def test_parse_pipeline_run_config_dict_parses_classification_task(): + result = parse_pipeline_run_config_dict(_base_config()) + + assert result.__class__ is Right + assert result.value.task.task_type == TaskTypesEnum.classification + assert result.value.train_data_idx == [1, 2, 3] + assert result.value.var_names is None + + +def test_parse_pipeline_run_config_dict_parses_forecasting_task_with_params(): + result = parse_pipeline_run_config_dict( + _base_config(task='Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=3))') + ) + + assert result.__class__ is Right + assert result.value.task.task_type == TaskTypesEnum.ts_forecasting + assert isinstance(result.value.task.task_params, TsForecastingParams) + assert result.value.task.task_params.forecast_length == 3 + + +def test_parse_pipeline_run_config_dict_rejects_eval_like_task_payload(): + config = 
_base_config(task='__import__("os").system("echo hacked")') + + result = parse_pipeline_run_config_dict(config) + + assert result.__class__ is Left + assert result.value.code == 'unsupported_task_format' + + +@pytest.mark.parametrize('raw_value, expected', [('False', False), ('True', True), ('"True"', True), ('None', False)]) +def test_pipeline_run_config_parses_bool_literals_compatibly(raw_value, expected): + config = _base_config() + config['DEFAULT']['is_multi_modal'] = raw_value + + result = parse_pipeline_run_config_dict(config) + + assert result.__class__ is Right + assert result.value.is_multi_modal is expected + + +def test_pipeline_run_config_from_parser_keeps_oop_factory_style(): + parser = configparser.ConfigParser() + parser.read_dict(_base_config()) + + config = PipelineRunConfig.from_parser(parser) + + assert config.task.task_type == TaskTypesEnum.classification + assert config.train_data_idx == [1, 2, 3] + assert config.input_data.endswith('test/data/advanced_classification.csv') + + +def test_pipeline_run_config_load_from_file_compatibility_wrapper(tmp_path): + config_path = tmp_path / 'remote.ini' + parser = configparser.ConfigParser() + parser.read_dict(_base_config()) + with config_path.open('w', encoding='utf-8') as file: + parser.write(file) + + config = PipelineRunConfig().load_from_file(str(config_path)) + + assert config.output_path == './out' + assert config.task.task_type == TaskTypesEnum.classification From 4dca87dbf88e9ecd6cbd14e082e0f2548679ba34 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:07:16 +0300 Subject: [PATCH 04/32] extract typed repository query rules from operation repository --- .../repository/operation_types_repository.py | 90 +++++++------------ tests/core/repository/__init__.py | 1 + tests/core/repository/test_operation_query.py | 89 ++++++++++++++++++ 3 files changed, 123 insertions(+), 57 deletions(-) create mode 100644 tests/core/repository/__init__.py create mode 100644 
tests/core/repository/test_operation_query.py diff --git a/fedot/core/repository/operation_types_repository.py b/fedot/core/repository/operation_types_repository.py index 7dfd9159ea..9040fa3cf3 100644 --- a/fedot/core/repository/operation_types_repository.py +++ b/fedot/core/repository/operation_types_repository.py @@ -5,13 +5,19 @@ from dataclasses import dataclass from typing import Dict, List, Optional, TYPE_CHECKING, Union -import numpy as np from golem.core.log import default_log -from golem.utilities.data_structures import ensure_wrapped_in_sequence -from fedot.core.constants import AUTO_PRESET_NAME, BEST_QUALITY_PRESET_NAME from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.json_evaluation import import_enums_from_str, import_strategy_from_str, read_field +from fedot.core.repository.operation_query import ( + OperationQuery, + RepositoryKind, + contains_preset, + contains_tags, + filter_operation_infos, + normalize_preset_name, + parse_repository_kind, +) from fedot.core.repository.tasks import Task, TaskTypesEnum EXTRA_TS_INSTALLED = True @@ -24,7 +30,7 @@ if TYPE_CHECKING: from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy -AVAILABLE_REPO_NAMES = ['all', 'model', 'data_operation', 'automl'] +AVAILABLE_REPO_NAMES = [kind.value for kind in RepositoryKind] @dataclass @@ -293,35 +299,18 @@ def suitable_operation(self, task_type: TaskTypesEnum = None, preset: return operations from desired preset """ - if not forbidden_tags: - forbidden_tags = [] - - if not tags: - for excluded_default_tag in self._tags_excluded_by_default: - # Forbidden tags by default - forbidden_tags.append(excluded_default_tag) - - no_task = task_type is None - operations_info = [] - for o in self._repo: - is_desired_task = task_type in o.task_type or no_task - tags_good = not tags or _is_operation_contains_tag(tags, o.tags, is_full_match) - tags_bad = not forbidden_tags or not 
_is_operation_contains_tag(forbidden_tags, o.tags, False) - is_desired_preset = _is_operation_contains_preset(o.presets, preset) - if is_desired_task and tags_good and tags_bad and is_desired_preset: - operations_info.append(o) - - if data_type: - # ignore text and image data types: there are no operations with these `input_type` - ignore_data_type = data_type in [DataTypesEnum.text, DataTypesEnum.image] - if data_type == DataTypesEnum.ts: - valid_data_types = [DataTypesEnum.ts, DataTypesEnum.table] - else: - valid_data_types = ensure_wrapped_in_sequence(data_type) - if not ignore_data_type: - operations_info = [o for o in operations_info if - np.any([data_type in o.input_types for data_type in valid_data_types])] - + query = OperationQuery( + repository_kind=parse_repository_kind(self.operation_type), + task_type=task_type, + data_type=data_type, + tags=tuple(tags or ()), + forbidden_tags=tuple(forbidden_tags or ()), + preset=preset, + is_full_match=is_full_match, + default_excluded_tags=tuple(self._tags_excluded_by_default), + extra_ts_installed=EXTRA_TS_INSTALLED, + ) + operations_info = filter_operation_infos(self._repo, query) return [m.id for m in operations_info] @property @@ -385,22 +374,14 @@ def _is_operation_contains_tag(candidate_tags: List[str], bool: is there a match on the tags """ - matches = (tag in operation_tags for tag in candidate_tags) - if is_full_match: - return all(matches) - else: - return any(matches) + return contains_tags(candidate_tags, operation_tags, is_full_match) def _is_operation_contains_preset(operation_presets: List[str], preset: str) -> bool: """Checking whether the operation is suitable for current preset """ - if preset is None: - # None means that best_quality preset are using so return all operations - return True - - return preset in operation_presets + return contains_preset(operation_presets, preset) def atomized_model_type(): @@ -434,27 +415,22 @@ def get_operations_for_task(task: Optional[Task], data_type: 
Optional[DataTypesE list: operation aliases """ - # Preset None means that all operations will be returned - if preset is not None: - if BEST_QUALITY_PRESET_NAME in preset or AUTO_PRESET_NAME in preset: - preset = None + normalized_preset = normalize_preset_name(preset) + task_type = task.task_type if task else None if task is not None and task.task_type is TaskTypesEnum.ts_forecasting and not EXTRA_TS_INSTALLED: - if not forbidden_tags: - forbidden_tags = [] logging.log(100, "Extra dependencies for time series forecasting are not installed. It can infuence the " "performance. Please install it by 'pip install fedot[extra]'") - forbidden_tags.append('ts-extra') - task_type = task.task_type if task else None - if mode in AVAILABLE_REPO_NAMES: - repo = OperationTypesRepository(mode) - model_types = repo.suitable_operation(task_type, data_type=data_type, tags=tags, forbidden_tags=forbidden_tags, - preset=preset) - return model_types - else: + + if mode not in AVAILABLE_REPO_NAMES: raise ValueError(f'Such mode "{mode}" is not supported') + repo = OperationTypesRepository(mode) + model_types = repo.suitable_operation(task_type, data_type=data_type, tags=tags, forbidden_tags=forbidden_tags, + preset=normalized_preset) + return model_types + def get_operation_type_from_id(operation_id): operation_type = _operation_name_without_postfix(operation_id) diff --git a/tests/core/repository/__init__.py b/tests/core/repository/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/tests/core/repository/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/core/repository/test_operation_query.py b/tests/core/repository/test_operation_query.py new file mode 100644 index 0000000000..834f9accbd --- /dev/null +++ b/tests/core/repository/test_operation_query.py @@ -0,0 +1,89 @@ +from dataclasses import dataclass + +from fedot.core.constants import AUTO_PRESET_NAME +from fedot.core.repository.dataset_types import DataTypesEnum +from 
fedot.core.repository.operation_query import ( + OperationQuery, + RepositoryKind, + contains_tags, + filter_operation_infos, + normalize_operation_query, + normalize_preset_name, +) +from fedot.core.repository.tasks import TaskTypesEnum + + +@dataclass(frozen=True) +class FakeOperation: + id: str + task_type: tuple + input_types: tuple + tags: tuple = () + presets: tuple = () + + +BASE_OPERATION = FakeOperation( + id='rf', + task_type=(TaskTypesEnum.classification, TaskTypesEnum.regression), + input_types=(DataTypesEnum.table,), + tags=('tree', 'simple'), + presets=('fast_train',), +) + +TS_OPERATION = FakeOperation( + id='lagged', + task_type=(TaskTypesEnum.ts_forecasting,), + input_types=(DataTypesEnum.ts, DataTypesEnum.table), + tags=('ts-extra', 'lagged'), + presets=('ts',), +) + + +def test_contains_tags_supports_partial_and_full_match(): + assert contains_tags(('tree',), ('tree', 'simple'), False) is True + assert contains_tags(('tree', 'simple'), ('tree', 'simple'), True) is True + assert contains_tags(('tree', 'missing'), ('tree', 'simple'), True) is False + + +def test_normalize_preset_name_resets_auto_like_presets(): + assert normalize_preset_name(None) is None + assert normalize_preset_name(f'{AUTO_PRESET_NAME}*tree') is None + assert normalize_preset_name('fast_train') == 'fast_train' + + +def test_normalize_operation_query_applies_default_excluded_tags_only_when_tags_missing(): + query = OperationQuery( + repository_kind=RepositoryKind.model, + default_excluded_tags=('deprecated', 'expensive'), + ) + + normalized = normalize_operation_query(query) + + assert normalized.forbidden_tags == ('deprecated', 'expensive') + + +def test_filter_operation_infos_respects_tags_and_presets(): + query = OperationQuery( + repository_kind=RepositoryKind.model, + task_type=TaskTypesEnum.classification, + data_type=DataTypesEnum.table, + tags=('tree',), + preset='fast_train', + ) + + filtered = filter_operation_infos((BASE_OPERATION, TS_OPERATION), query) + + assert 
filtered == (BASE_OPERATION,) + + +def test_filter_operation_infos_excludes_ts_extra_when_optional_dependency_missing(): + query = OperationQuery( + repository_kind=RepositoryKind.model, + task_type=TaskTypesEnum.ts_forecasting, + data_type=DataTypesEnum.ts, + extra_ts_installed=False, + ) + + filtered = filter_operation_infos((BASE_OPERATION, TS_OPERATION), query) + + assert filtered == () From 1d6617d4dcbd6455665e1c23093c262b8cf58800 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:07:40 +0300 Subject: [PATCH 05/32] extract typed repository --- fedot/core/repository/operation_query.py | 122 +++++++++++++++++++++++ tests/core/__init__.py | 1 + 2 files changed, 123 insertions(+) create mode 100644 fedot/core/repository/operation_query.py create mode 100644 tests/core/__init__.py diff --git a/fedot/core/repository/operation_query.py b/fedot/core/repository/operation_query.py new file mode 100644 index 0000000000..3ad69f63cb --- /dev/null +++ b/fedot/core/repository/operation_query.py @@ -0,0 +1,122 @@ +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Iterable, Optional, Sequence, Tuple + +from fedot.core.constants import AUTO_PRESET_NAME, BEST_QUALITY_PRESET_NAME +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import TaskTypesEnum + + +class RepositoryKind(Enum): + all = 'all' + model = 'model' + data_operation = 'data_operation' + automl = 'automl' + + +@dataclass(frozen=True) +class CatalogLoadError: + code: str + message: str + details: dict = field(default_factory=dict) + + +@dataclass(frozen=True) +class OperationQuery: + repository_kind: RepositoryKind = RepositoryKind.all + task_type: Optional[TaskTypesEnum] = None + data_type: Optional[DataTypesEnum] = None + tags: Tuple[str, ...] = () + forbidden_tags: Tuple[str, ...] = () + preset: Optional[str] = None + is_full_match: bool = False + default_excluded_tags: Tuple[str, ...] 
= () + extra_ts_installed: bool = True + + +def parse_repository_kind(value: str) -> RepositoryKind: + return RepositoryKind(value) + + +def normalize_operation_query(query: OperationQuery) -> OperationQuery: + forbidden_tags = tuple(query.forbidden_tags) + preset = normalize_preset_name(query.preset) + + if not query.tags: + forbidden_tags = forbidden_tags + tuple( + tag for tag in query.default_excluded_tags if tag not in forbidden_tags + ) + + if query.task_type is TaskTypesEnum.ts_forecasting and not query.extra_ts_installed and 'ts-extra' not in forbidden_tags: + forbidden_tags = forbidden_tags + ('ts-extra',) + + return OperationQuery( + repository_kind=query.repository_kind, + task_type=query.task_type, + data_type=query.data_type, + tags=tuple(query.tags), + forbidden_tags=forbidden_tags, + preset=preset, + is_full_match=query.is_full_match, + default_excluded_tags=tuple(query.default_excluded_tags), + extra_ts_installed=query.extra_ts_installed, + ) + + +def normalize_preset_name(preset: Optional[str]) -> Optional[str]: + if preset is None: + return None + if BEST_QUALITY_PRESET_NAME in preset or AUTO_PRESET_NAME in preset: + return None + return preset + + +def filter_operation_infos(operations: Sequence[Any], query: OperationQuery) -> Tuple[Any, ...]: + normalized_query = normalize_operation_query(query) + return tuple(operation for operation in operations if matches_operation_query(operation, normalized_query)) + + +def matches_operation_query(operation: Any, query: OperationQuery) -> bool: + tags = tuple(getattr(operation, 'tags', ()) or ()) + presets = tuple(getattr(operation, 'presets', ()) or ()) + task_types = tuple(getattr(operation, 'task_type', ()) or ()) + input_types = tuple(getattr(operation, 'input_types', ()) or ()) + + if query.task_type is not None and query.task_type not in task_types: + return False + + if query.tags and not contains_tags(query.tags, tags, query.is_full_match): + return False + + if query.forbidden_tags and 
contains_tags(query.forbidden_tags, tags, False): + return False + + if not contains_preset(presets, query.preset): + return False + + if query.data_type is None: + return True + + if query.data_type in (DataTypesEnum.text, DataTypesEnum.image): + return True + + valid_data_types = resolve_valid_data_types(query.data_type) + return any(data_type in input_types for data_type in valid_data_types) + + +def contains_tags(candidate_tags: Iterable[str], operation_tags: Iterable[str], is_full_match: bool) -> bool: + operation_tags = tuple(operation_tags or ()) + matches = tuple(tag in operation_tags for tag in candidate_tags) + return all(matches) if is_full_match else any(matches) + + +def contains_preset(operation_presets: Iterable[str], preset: Optional[str]) -> bool: + if preset is None: + return True + return preset in tuple(operation_presets or ()) + + +def resolve_valid_data_types(data_type: DataTypesEnum) -> Tuple[DataTypesEnum, ...]: + if data_type == DataTypesEnum.ts: + return DataTypesEnum.ts, DataTypesEnum.table + return (data_type,) diff --git a/tests/core/__init__.py b/tests/core/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/tests/core/__init__.py @@ -0,0 +1 @@ + From 2115b47fa9f55e23b7d8bfab3694a2396cd5d551 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:12:43 +0300 Subject: [PATCH 06/32] extract pure assumption and preset rules from api shell --- .../api_utils/assumptions/assumption_rules.py | 112 ++++++++++++++++++ .../assumptions/assumptions_builder.py | 45 +++---- fedot/api/api_utils/presets.py | 74 ++++++------ tests/api/__init__.py | 1 + tests/api/api_utils/__init__.py | 1 + tests/api/api_utils/assumptions/__init__.py | 1 + .../assumptions/test_assumption_rules.py | 78 ++++++++++++ tests/api/api_utils/test_presets.py | 39 ++++++ 8 files changed, 287 insertions(+), 64 deletions(-) create mode 100644 fedot/api/api_utils/assumptions/assumption_rules.py create mode 100644 tests/api/__init__.py create 
from dataclasses import dataclass
from typing import Iterable, Optional, Sequence, Tuple

from fedot.core.constants import AUTO_PRESET_NAME, BEST_QUALITY_PRESET_NAME
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.operation_query import RepositoryKind


@dataclass(frozen=True)
class AssumptionsFilterDecision:
    """Outcome of matching user-supplied operations against suitable ones.

    # allow_filtering: whether a whitelist filter can be applied at all
    # whitelist: deduplicated user-supplied operations, original order kept
    # sampling_choices: whitelist ∩ suitable operations, plus required ones
    """
    allow_filtering: bool
    whitelist: Tuple[str, ...]
    sampling_choices: Tuple[str, ...]


@dataclass(frozen=True)
class PresetSpec:
    """Parsed representation of a preset name such as ``'stable*tree'``."""
    requested_preset: str
    base_preset: str
    modification: Optional[str]
    use_auto: bool
    use_stable: bool
    use_gpu: bool


# Data-source operations without which a valid multimodal pipeline
# cannot be formed for the given data type.
_REQUIRED_SOURCE_OPERATIONS = {
    DataTypesEnum.image: ('data_source_img',),
    DataTypesEnum.text: ('data_source_text',),
    DataTypesEnum.table: ('data_source_table',),
}


def default_repository_name_for_data(data) -> str:
    """Pick the operations repository: multi_ts also needs data operations."""
    if data.data_type == DataTypesEnum.multi_ts:
        return RepositoryKind.ALL.value
    return RepositoryKind.MODEL.value


def required_operations_for_data(data, data_type: DataTypesEnum) -> Tuple[str, ...]:
    """Operations that must always be available for this data.

    Multimodal data (detected via the mapping-like ``items`` attribute)
    requires its data-source operation; image data additionally requires
    ``cnn`` regardless of modality.  Order is preserved, duplicates dropped.
    """
    required_operations = []

    if hasattr(data, 'items'):
        required_operations.extend(_REQUIRED_SOURCE_OPERATIONS.get(data_type, ()))

    if data_type is DataTypesEnum.image:
        required_operations.append('cnn')

    return tuple(dict.fromkeys(required_operations))


def build_operations_filter_decision(data,
                                     data_type: DataTypesEnum,
                                     available_operations: Optional[Sequence[str]],
                                     suitable_operations: Iterable[str]) -> AssumptionsFilterDecision:
    """Intersect user operations with suitable ones, keeping required ops.

    Returns a decision whose ``sampling_choices`` is the ordered intersection
    of the (deduplicated) whitelist with ``suitable_operations``, extended by
    any operations required for the data type (see
    :func:`required_operations_for_data`).
    """
    whitelist = tuple(dict.fromkeys(available_operations or ()))
    suitable_set = set(suitable_operations)
    sampling_choices = [operation for operation in whitelist if operation in suitable_set]

    for required_operation in required_operations_for_data(data, data_type):
        if required_operation not in sampling_choices:
            sampling_choices.append(required_operation)

    return AssumptionsFilterDecision(
        allow_filtering=bool(sampling_choices),
        whitelist=whitelist,
        sampling_choices=tuple(sampling_choices),
    )


def parse_preset_spec(preset_name: Optional[str]) -> PresetSpec:
    """Split a preset name into its base name, modification and flags.

    The ``'*'`` modification is extracted *before* the ``stable`` override is
    applied, so presets such as ``'stable*tree'`` keep their modification
    (``base_preset='best_quality'``, ``modification='*tree'``).  Previously the
    override replaced the whole name first, silently discarding the
    modification for ``stable*...`` presets.
    """
    requested_preset = preset_name or ''
    use_auto = AUTO_PRESET_NAME in requested_preset
    use_stable = 'stable' in requested_preset
    use_gpu = 'gpu' in requested_preset

    base_preset = requested_preset
    modification = None
    if '*' in base_preset:
        base_name, suffix = base_preset.split('*', 1)
        base_preset = base_name
        modification = f'*{suffix}'

    if use_stable:
        # 'stable' is a workaround preset: best_quality minus heavy operations.
        base_preset = BEST_QUALITY_PRESET_NAME

    return PresetSpec(
        requested_preset=requested_preset,
        base_preset=base_preset,
        modification=modification,
        use_auto=use_auto,
        use_stable=use_stable,
        use_gpu=use_gpu,
    )


def merge_preset_operations(base_operations: Iterable[str],
                            modification_operations: Optional[Iterable[str]] = None) -> Tuple[str, ...]:
    """Deduplicate base operations; optionally restrict them to a modification set."""
    merged_operations = list(dict.fromkeys(base_operations))
    if modification_operations is None:
        return tuple(merged_operations)

    modification_set = set(modification_operations)
    return tuple(operation for operation in merged_operations if operation in modification_set)


def exclude_operations(available_operations: Iterable[str], excluded_operations: Iterable[str]) -> Tuple[str, ...]:
    """Drop excluded operations, preserving the order of the remaining ones."""
    excluded_set = set(excluded_operations)
    return tuple(operation for operation in available_operations if operation not in excluded_set)


def finalize_operations(available_operations: Iterable[str], excluded_operations: Iterable[str] = ()) -> list[str]:
    """Exclude, deduplicate and sort operations into a stable final list."""
    return sorted(set(exclude_operations(available_operations, excluded_operations)))
operations_to_choose_from) - if operations_to_choose_from: - self.ops_filter = WhitelistOperationsFilter(available_operations, operations_to_choose_from) + filter_decision = build_operations_filter_decision( + data=self.data, + data_type=self.data_type, + available_operations=available_operations, + suitable_operations=operations_for_task_and_data, + ) + if filter_decision.allow_filtering: + self.ops_filter = WhitelistOperationsFilter( + filter_decision.whitelist, + filter_decision.sampling_choices, + ) else: # Don't filter pipelines as we're not able to create - # fallback pipelines without operations_to_choose_from. + # fallback pipelines without sampling choices. # So, leave default dumb ops_filter. self.logger.info(self.UNSUITABLE_AVAILABLE_OPERATIONS_MSG) return self @@ -142,17 +149,3 @@ def to_builders(self, initial_node: Optional[PipelineNode] = None, .join_branches(ensemble_operation) ensemble_builders.append(ensemble_builder) return ensemble_builders - - -def _check_operations_to_choose_from(data, data_type: DataTypesEnum, operations_to_choose_from: Set[str]): - """Since it is sometimes impossible to form a valid pipeline without some operations, - they are added to the set of operations for current task and data.""" - if isinstance(data, MultiModalData): - if data_type is DataTypesEnum.image and 'data_source_img' not in operations_to_choose_from: - operations_to_choose_from.add('data_source_img') - if data_type is DataTypesEnum.text and 'data_source_text' not in operations_to_choose_from: - operations_to_choose_from.add('data_source_text') - if data_type is DataTypesEnum.table and 'data_source_table' not in operations_to_choose_from: - operations_to_choose_from.add('data_source_table') - if data_type is DataTypesEnum.image and 'cnn' not in operations_to_choose_from: - operations_to_choose_from.add('cnn') diff --git a/fedot/api/api_utils/presets.py b/fedot/api/api_utils/presets.py index 7e64186447..4d821ef118 100644 --- a/fedot/api/api_utils/presets.py +++ 
b/fedot/api/api_utils/presets.py @@ -1,6 +1,12 @@ -from copy import copy +from copy import copy from typing import Optional +from fedot.api.api_utils.assumptions.assumption_rules import ( + exclude_operations, + finalize_operations, + merge_preset_operations, + parse_preset_spec, +) from fedot.api.time import ApiTime from fedot.core.constants import BEST_QUALITY_PRESET_NAME, \ FAST_TRAIN_PRESET_NAME, AUTO_PRESET_NAME @@ -40,56 +46,48 @@ def filter_operations_by_preset(self, data_type: Optional[DataTypesEnum] = None) """ Filter operations by preset, remove "heavy" operations and save appropriate ones """ - preset_name = self.preset_name - if AUTO_PRESET_NAME in preset_name: + preset_spec = parse_preset_spec(self.preset_name) + + if preset_spec.use_auto: available_operations = get_operations_for_task(self.task, data_type, mode='all') return available_operations - # TODO remove workaround - # Use best_quality preset but exclude several operations - if 'stable' in self.preset_name: - # Use best_quality preset but exclude several operations - preset_name = BEST_QUALITY_PRESET_NAME excluded = ['mlp', 'svc', 'svr', 'arima', 'exog_ts', 'text_clean', - 'lda', 'qda', 'lgbm', 'one_hot_encoding','polyfit', + 'lda', 'qda', 'lgbm', 'one_hot_encoding', 'polyfit', 'resample', 'stl_arima'] excluded_tree = [] - if '*' in preset_name: - self.modification_using = True - # The modification has been added - preset_name, modification = preset_name.split('*') - modification = ''.join(('*', modification)) - - mod_operations = get_operations_for_task(self.task, data_type, mode='all', preset=modification) - - # Get operations - available_operations = get_operations_for_task(self.task, data_type, mode='all', preset=preset_name) - - if self.modification_using: - # Find subsample of operations - filtered_operations = set(available_operations).intersection(set(mod_operations)) - available_operations = list(filtered_operations) - - # Exclude "heavy" operations if necessary - if 'stable' in 
self.preset_name: - available_operations = self.new_operations_without_heavy(excluded, available_operations) - - if 'gpu' in self.preset_name: + self.modification_using = preset_spec.modification is not None + if preset_spec.use_gpu: repository = OperationTypesRepository().assign_repo('model', 'gpu_models_repository.json') available_operations = repository.suitable_operation(task_type=self.task.task_type, data_type=data_type) - - filtered_operations = set(available_operations).difference(set(excluded_tree)) - available_operations = list(filtered_operations) - - return sorted(available_operations) + else: + base_operations = get_operations_for_task( + self.task, + data_type, + mode='all', + preset=preset_spec.base_preset, + ) + if self.modification_using: + mod_operations = get_operations_for_task( + self.task, + data_type, + mode='all', + preset=preset_spec.modification, + ) + available_operations = list(merge_preset_operations(base_operations, mod_operations)) + else: + available_operations = base_operations + + if preset_spec.use_stable: + available_operations = list(exclude_operations(available_operations, excluded)) + + return finalize_operations(available_operations, excluded_tree) @staticmethod def new_operations_without_heavy(excluded_operations, available_operations) -> list: """ Create new list without heavy operations """ - available_operations = [_ for _ in available_operations if _ not in excluded_operations] - - return available_operations + return list(exclude_operations(available_operations, excluded_operations)) def change_preset_based_on_initial_fit(timer: ApiTime, n_jobs: int) -> str: diff --git a/tests/api/__init__.py b/tests/api/__init__.py new file mode 100644 index 0000000000..e02abfc9b0 --- /dev/null +++ b/tests/api/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/api/api_utils/__init__.py b/tests/api/api_utils/__init__.py new file mode 100644 index 0000000000..e02abfc9b0 --- /dev/null +++ b/tests/api/api_utils/__init__.py @@ -0,0 +1 @@ + diff 
--git a/tests/api/api_utils/assumptions/__init__.py b/tests/api/api_utils/assumptions/__init__.py new file mode 100644 index 0000000000..e02abfc9b0 --- /dev/null +++ b/tests/api/api_utils/assumptions/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/api/api_utils/assumptions/test_assumption_rules.py b/tests/api/api_utils/assumptions/test_assumption_rules.py new file mode 100644 index 0000000000..f36fade033 --- /dev/null +++ b/tests/api/api_utils/assumptions/test_assumption_rules.py @@ -0,0 +1,78 @@ +from fedot.api.api_utils.assumptions.assumption_rules import ( + build_operations_filter_decision, + default_repository_name_for_data, + exclude_operations, + finalize_operations, + merge_preset_operations, + parse_preset_spec, + required_operations_for_data, +) +from fedot.core.repository.dataset_types import DataTypesEnum + + +class _FakeData: + def __init__(self, data_type): + self.data_type = data_type + + +class _FakeMultiModalData(dict): + def __init__(self, data_type): + super().__init__() + self.data_type = data_type + + +def test_default_repository_name_for_multi_ts(): + assert default_repository_name_for_data(_FakeData(DataTypesEnum.multi_ts)) == 'all' + + +def test_default_repository_name_for_regular_data(): + assert default_repository_name_for_data(_FakeData(DataTypesEnum.table)) == 'model' + + +def test_required_operations_for_multimodal_image_include_sources_and_cnn(): + required = required_operations_for_data(_FakeMultiModalData(DataTypesEnum.image), DataTypesEnum.image) + assert required == ('data_source_img', 'cnn') + + +def test_build_operations_filter_decision_intersects_with_suitable_operations(): + decision = build_operations_filter_decision( + data=_FakeData(DataTypesEnum.table), + data_type=DataTypesEnum.table, + available_operations=['rf', 'lasso', 'xgboost'], + suitable_operations=['rf', 'xgboost', 'ridge'], + ) + + assert decision.allow_filtering is True + assert decision.whitelist == ('rf', 'lasso', 'xgboost') + assert decision.sampling_choices 
== ('rf', 'xgboost') + + +def test_build_operations_filter_decision_keeps_required_image_operations(): + decision = build_operations_filter_decision( + data=_FakeMultiModalData(DataTypesEnum.image), + data_type=DataTypesEnum.image, + available_operations=['rf'], + suitable_operations=['rf'], + ) + + assert decision.allow_filtering is True + assert decision.sampling_choices == ('rf', 'data_source_img', 'cnn') + + +def test_parse_preset_spec_extracts_stable_and_modification_flags(): + spec = parse_preset_spec('stable*tree') + + assert spec.base_preset == 'best_quality' + assert spec.modification == '*tree' + assert spec.use_stable is True + assert spec.use_auto is False + assert spec.use_gpu is False + + +def test_merge_exclude_and_finalize_operations_are_deterministic(): + merged = merge_preset_operations(['rf', 'xgboost', 'knn'], ['xgboost', 'rf']) + filtered = exclude_operations(merged, ['xgboost']) + + assert merged == ('rf', 'xgboost') + assert filtered == ('rf',) + assert finalize_operations(['rf', 'rf', 'knn']) == ['knn', 'rf'] diff --git a/tests/api/api_utils/test_presets.py b/tests/api/api_utils/test_presets.py new file mode 100644 index 0000000000..391361b7b3 --- /dev/null +++ b/tests/api/api_utils/test_presets.py @@ -0,0 +1,39 @@ +import fedot.api.api_utils.presets as presets_module +from fedot.api.api_utils.presets import OperationsPreset +from fedot.core.repository.tasks import Task, TaskTypesEnum + + +class _FakeRepository: + def suitable_operation(self, task_type, data_type): + return ['gpu_rf', 'gpu_logit'] + + +def test_new_operations_without_heavy_uses_pure_exclusion_rule(): + preset = OperationsPreset(Task(TaskTypesEnum.regression), 'best_quality') + + assert preset.new_operations_without_heavy(['heavy'], ['light', 'heavy']) == ['light'] + + +def test_filter_operations_by_preset_intersects_modification(monkeypatch): + def fake_get_operations_for_task(task, data_type=None, mode='all', preset=None): + if preset == 'best_quality': + return ['rf', 
'xgboost', 'knn'] + if preset == '*tree': + return ['rf', 'xgboost'] + raise AssertionError(f'unexpected preset: {preset}') + + monkeypatch.setattr(presets_module, 'get_operations_for_task', fake_get_operations_for_task) + + preset = OperationsPreset(Task(TaskTypesEnum.classification), 'best_quality*tree') + assert preset.filter_operations_by_preset() == ['rf', 'xgboost'] + + +def test_filter_operations_by_preset_uses_gpu_repository(monkeypatch): + monkeypatch.setattr( + presets_module.OperationTypesRepository, + 'assign_repo', + staticmethod(lambda repo_name='model', path='gpu_models_repository.json': _FakeRepository()), + ) + + preset = OperationsPreset(Task(TaskTypesEnum.classification), 'gpu') + assert preset.filter_operations_by_preset() == ['gpu_logit', 'gpu_rf'] From 8e4f55d2fb8552768bbc8f923fc9c5f1856c6e95 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:17:03 +0300 Subject: [PATCH 07/32] extract pure input data rules from api data adapter --- fedot/api/api_utils/api_data.py | 111 ++++++++++--------- fedot/api/api_utils/api_data_rules.py | 120 +++++++++++++++++++++ fedot/api/api_utils/data_definition.py | 8 +- tests/api/api_utils/test_api_data.py | 78 ++++++++++++++ tests/api/api_utils/test_api_data_rules.py | 76 +++++++++++++ 5 files changed, 338 insertions(+), 55 deletions(-) create mode 100644 fedot/api/api_utils/api_data_rules.py create mode 100644 tests/api/api_utils/test_api_data.py create mode 100644 tests/api/api_utils/test_api_data_rules.py diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index 5a421397eb..cc5ec7d91c 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -1,10 +1,17 @@ -from datetime import datetime +from datetime import datetime from typing import Dict, Union from typing import Optional import numpy as np from golem.core.log import default_log +from fedot.api.api_utils.api_data_rules import ( + iter_shared_index_assignments, + 
normalize_features_for_definition, + plan_fit_preprocessing, + plan_prediction, + plan_predict_preprocessing, +) from fedot.api.api_utils.data_definition import data_strategy_selector, FeaturesType, TargetType from fedot.core.data.data import InputData, OutputData, data_type_is_table from fedot.core.data.data_preprocessing import convert_into_column @@ -57,19 +64,15 @@ def define_data(self, Obligatory preprocessing steps are applying also. If features is dictionary there is a need to process MultiModalData """ + normalized_features = normalize_features_for_definition(features) + try: - # TODO remove workaround - idx = None - if isinstance(features, dict) and 'idx' in features: - idx = features['idx'] - del features['idx'] - data = data_strategy_selector(features=features, + data = data_strategy_selector(features=normalized_features.features, target=target, task=self.task, is_predict=is_predict) - if isinstance(data, dict) and idx is not None: - for key in data: - data[key].idx = idx + for data_source_name, shared_index in iter_shared_index_assignments(data, normalized_features.shared_index): + data[data_source_name].idx = shared_index except Exception as ex: raise ValueError('Please specify the "features" as path to csv file/' 'Numpy array/Pandas DataFrame/FEDOT InputData/dict for multimodal data, ' @@ -85,27 +88,26 @@ def define_data(self, def define_predictions(self, current_pipeline: Pipeline, test_data: Union[InputData, MultiModalData], in_sample: bool = False, validation_blocks: int = None) -> OutputData: """ Prepare predictions """ - if self.task.task_type == TaskTypesEnum.classification: - # Prediction should be converted into source labels - output_prediction = current_pipeline.predict(test_data, output_mode='labels') - elif self.task.task_type == TaskTypesEnum.ts_forecasting: - if in_sample: - forecast_length = test_data.task.task_params.forecast_length - validation_blocks = validation_blocks or 1 - horizon = forecast_length * validation_blocks - forecast 
= in_sample_ts_forecast(current_pipeline, test_data, horizon) - idx = test_data.idx[-horizon:] - prediction = convert_forecast_to_output(test_data, forecast, idx=idx) - else: - prediction = current_pipeline.predict(test_data) - # Convert forecast into one-dimensional array - forecast = np.ravel(np.array(prediction.predict)) - prediction.predict = forecast - output_prediction = prediction - else: - output_prediction = current_pipeline.predict(test_data) - - return output_prediction + forecast_length = getattr(test_data.task.task_params, 'forecast_length', None) + prediction_plan = plan_prediction( + task_type=self.task.task_type, + in_sample=in_sample, + validation_blocks=validation_blocks, + forecast_length=forecast_length, + ) + + if prediction_plan.output_mode is not None: + return current_pipeline.predict(test_data, output_mode=prediction_plan.output_mode) + + if prediction_plan.use_in_sample_forecast: + forecast = in_sample_ts_forecast(current_pipeline, test_data, prediction_plan.horizon) + idx = test_data.idx[-prediction_plan.horizon:] + return convert_forecast_to_output(test_data, forecast, idx=idx) + + prediction = current_pipeline.predict(test_data) + if prediction_plan.flatten_prediction: + prediction.predict = np.ravel(np.array(prediction.predict)) + return prediction def correct_predictions(self, real: InputData, prediction: OutputData): """ Change shape for models predictions if its necessary. 
Apply """ @@ -143,19 +145,11 @@ def fit_transform(self, train_data: InputData) -> InputData: self.log.message( f'Train Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') - self.log.debug('- Obligatory preprocessing started') - train_data = self.preprocessor.obligatory_prepare_for_fit(data=train_data) - - self.log.debug('- Optional preprocessing started') - train_data = self.preprocessor.optional_prepare_for_fit(pipeline=Pipeline(), data=train_data) - - self.log.debug('- Converting indexes for fitting started') - train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data) - - self.log.debug('- Reducing memory started') - train_data = self.preprocessor.reduce_memory_size(data=train_data) - - train_data.supplementary_data.is_auto_preprocessed = True + train_data = self._apply_preprocessing_plan( + data=train_data, + current_pipeline=Pipeline(), + plan=plan_fit_preprocessing(), + ) memory_usage = convert_memory_size(train_data.memory_usage) @@ -176,13 +170,11 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData: self.log.message( f'Test Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') - test_data = self.preprocessor.obligatory_prepare_for_predict(data=test_data) - test_data = self.preprocessor.optional_prepare_for_predict(pipeline=current_pipeline, data=test_data) - test_data = self.preprocessor.convert_indexes_for_predict(pipeline=current_pipeline, data=test_data) - test_data = self.preprocessor.update_indices_for_time_series(test_data) - test_data.supplementary_data.is_auto_preprocessed = True - - test_data = self.preprocessor.reduce_memory_size(data=test_data) + test_data = self._apply_preprocessing_plan( + data=test_data, + current_pipeline=current_pipeline, + plan=plan_predict_preprocessing(), + ) memory_usage = convert_memory_size(test_data.memory_usage) features_shape = test_data.features.shape @@ -192,3 +184,20 @@ def 
from dataclasses import dataclass
from typing import Any, Iterable, Optional, Tuple

from fedot.core.repository.tasks import TaskTypesEnum


@dataclass(frozen=True)
class NormalizedFeatures:
    """Features cleaned for data definition plus an optional shared index."""
    features: Any
    shared_index: Optional[Any]


@dataclass(frozen=True)
class PreprocessingPlan:
    """Ordered preprocessor step names and whether to mark data auto-preprocessed."""
    steps: Tuple[str, ...]
    mark_auto_preprocessed: bool


@dataclass(frozen=True)
class PredictionPlan:
    """Typed branching decision for producing predictions.

    # output_mode: pipeline output mode ('labels' for classification) or None
    # use_in_sample_forecast: whether to use in-sample ts forecasting
    # flatten_prediction: whether to ravel the forecast into 1-D
    # horizon: forecast horizon for the in-sample branch, else None
    """
    output_mode: Optional[str]
    use_in_sample_forecast: bool
    flatten_prediction: bool
    horizon: Optional[int]


@dataclass(frozen=True)
class StrategyResolution:
    """Resolved data-definition strategy factory."""
    strategy_factory: Any


class DataDefinitionResolutionError(TypeError):
    """Raised when no data-definition strategy matches the features type."""


_FIT_PREPROCESSING_STEPS = (
    'obligatory_prepare_for_fit',
    'optional_prepare_for_fit',
    'convert_indexes_for_fit',
    'reduce_memory_size',
)

_PREDICT_PREPROCESSING_STEPS = (
    'obligatory_prepare_for_predict',
    'optional_prepare_for_predict',
    'convert_indexes_for_predict',
    'update_indices_for_time_series',
    'reduce_memory_size',
)


def normalize_features_for_definition(features: Any) -> NormalizedFeatures:
    """Pull the shared 'idx' out of multimodal features without mutating the input.

    A shallow copy is made so the caller's dict keeps its 'idx' key.
    """
    if isinstance(features, dict) and 'idx' in features:
        normalized_features = dict(features)
        shared_index = normalized_features.pop('idx')
        return NormalizedFeatures(features=normalized_features, shared_index=shared_index)
    return NormalizedFeatures(features=features, shared_index=None)


def iter_shared_index_assignments(data: Any, shared_index: Optional[Any]) -> Tuple[Tuple[str, Any], ...]:
    """Pairs of (data source name, shared index) for multimodal mappings; empty otherwise."""
    if shared_index is None or not isinstance(data, dict):
        return tuple()
    return tuple((data_source_name, shared_index) for data_source_name in data)


def plan_fit_preprocessing() -> PreprocessingPlan:
    """Explicit, stable preprocessing plan for fitting."""
    return PreprocessingPlan(steps=_FIT_PREPROCESSING_STEPS, mark_auto_preprocessed=True)


def plan_predict_preprocessing() -> PreprocessingPlan:
    """Explicit, stable preprocessing plan for prediction."""
    return PreprocessingPlan(steps=_PREDICT_PREPROCESSING_STEPS, mark_auto_preprocessed=True)


def plan_prediction(task_type: TaskTypesEnum,
                    in_sample: bool,
                    validation_blocks: Optional[int],
                    forecast_length: Optional[int]) -> PredictionPlan:
    """Decide how predictions should be produced for the given task.

    Classification predicts source labels; in-sample ts forecasting computes a
    horizon of ``forecast_length * (validation_blocks or 1)``; regular ts
    forecasting flattens the forecast; everything else is a plain predict.
    """
    if task_type == TaskTypesEnum.classification:
        return PredictionPlan(
            output_mode='labels',
            use_in_sample_forecast=False,
            flatten_prediction=False,
            horizon=None,
        )

    if task_type == TaskTypesEnum.ts_forecasting and in_sample:
        blocks = validation_blocks or 1
        horizon = (forecast_length or 0) * blocks
        return PredictionPlan(
            output_mode=None,
            use_in_sample_forecast=True,
            flatten_prediction=False,
            horizon=horizon,
        )

    if task_type == TaskTypesEnum.ts_forecasting:
        return PredictionPlan(
            output_mode=None,
            use_in_sample_forecast=False,
            flatten_prediction=True,
            horizon=None,
        )

    return PredictionPlan(
        output_mode=None,
        use_in_sample_forecast=False,
        flatten_prediction=False,
        horizon=None,
    )


def resolve_strategy(features: Any, strategy_dispatch: Iterable[Tuple[type, Any]]) -> StrategyResolution:
    """Pick the first strategy factory whose source type matches ``features``.

    Raises:
        DataDefinitionResolutionError: if no dispatch entry matches.
    """
    # Materialise once: strategy_dispatch may be a one-shot iterator, and a
    # second pass is needed to build the error message.
    dispatch = tuple(strategy_dispatch)
    for source_type, strategy_factory in dispatch:
        if isinstance(features, source_type):
            return StrategyResolution(strategy_factory=strategy_factory)

    supported_sources = ', '.join(source_type.__name__ for source_type, _ in dispatch)
    raise DataDefinitionResolutionError(
        f'Unsupported features type: {type(features).__name__}. Supported types: {supported_sources}.'
    )
optional_prepare_for_predict(self, pipeline, data): + return data + + def convert_indexes_for_fit(self, pipeline, data): + return data + + def convert_indexes_for_predict(self, pipeline, data): + return data + + def update_indices_for_time_series(self, data): + return data + + def reduce_memory_size(self, data): + return data + + +def test_define_data_does_not_mutate_original_multimodal_features(monkeypatch): + original_features = {'idx': np.array([10, 11]), 'table': np.array([[1], [2]])} + + def fake_strategy_selector(features, target=None, task=None, is_predict=None): + assert 'idx' not in features + return {'table': SimpleNamespace(idx=None)} + + monkeypatch.setattr(api_data_module, 'data_strategy_selector', fake_strategy_selector) + + processor = ApiDataProcessor(Task(TaskTypesEnum.classification), use_input_preprocessing=False) + processor.preprocessor = _PassthroughPreprocessor() + + result = processor.define_data(features=original_features, target=np.array([0, 1]), is_predict=False) + + assert 'idx' in original_features + assert np.array_equal(original_features['idx'], np.array([10, 11])) + assert np.array_equal(result['table'].idx, np.array([10, 11])) + + +def test_define_predictions_uses_in_sample_forecasting_plan(monkeypatch): + captured = {} + + def fake_in_sample_ts_forecast(pipeline, test_data, horizon): + captured['horizon'] = horizon + return np.array([1.0] * horizon) + + def fake_convert_forecast_to_output(test_data, forecast, idx): + captured['idx'] = idx + return SimpleNamespace(predict=forecast, idx=idx) + + monkeypatch.setattr(api_data_module, 'in_sample_ts_forecast', fake_in_sample_ts_forecast) + monkeypatch.setattr(api_data_module, 'convert_forecast_to_output', fake_convert_forecast_to_output) + + task = Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=2)) + processor = ApiDataProcessor(task, use_input_preprocessing=False) + test_data = SimpleNamespace(task=task, idx=np.array([0, 1, 2, 3, 4])) + + prediction = 
processor.define_predictions(current_pipeline=object(), test_data=test_data, + in_sample=True, validation_blocks=2) + + assert captured['horizon'] == 4 + assert np.array_equal(captured['idx'], np.array([1, 2, 3, 4])) + assert len(prediction.predict) == 4 diff --git a/tests/api/api_utils/test_api_data_rules.py b/tests/api/api_utils/test_api_data_rules.py new file mode 100644 index 0000000000..5b189e842d --- /dev/null +++ b/tests/api/api_utils/test_api_data_rules.py @@ -0,0 +1,76 @@ +import types + +import numpy as np +import pandas as pd +import pytest + +from fedot.api.api_utils.api_data_rules import ( + DataDefinitionResolutionError, + iter_shared_index_assignments, + normalize_features_for_definition, + plan_fit_preprocessing, + plan_prediction, + plan_predict_preprocessing, + resolve_strategy, +) +from fedot.core.repository.tasks import TaskTypesEnum + + +class _StrategyA: + pass + + +class _StrategyB: + pass + + +def test_normalize_features_for_definition_extracts_shared_index_without_mutation(): + original_features = {'idx': np.array([0, 1]), 'table': np.array([[1], [2]])} + + normalized = normalize_features_for_definition(original_features) + + assert 'idx' in original_features + assert 'idx' not in normalized.features + assert np.array_equal(normalized.shared_index, np.array([0, 1])) + + +def test_iter_shared_index_assignments_returns_pairs_for_multimodal_mapping(): + assignments = iter_shared_index_assignments({'first': object(), 'second': object()}, [1, 2]) + assert assignments == (('first', [1, 2]), ('second', [1, 2])) + + +def test_plan_preprocessing_steps_are_explicit_and_stable(): + assert plan_fit_preprocessing().steps == ( + 'obligatory_prepare_for_fit', + 'optional_prepare_for_fit', + 'convert_indexes_for_fit', + 'reduce_memory_size', + ) + assert plan_predict_preprocessing().steps == ( + 'obligatory_prepare_for_predict', + 'optional_prepare_for_predict', + 'convert_indexes_for_predict', + 'update_indices_for_time_series', + 'reduce_memory_size', + 
) + + +@pytest.mark.parametrize('task_type,in_sample,validation_blocks,forecast_length,expected', [ + (TaskTypesEnum.classification, False, None, None, ('labels', False, False, None)), + (TaskTypesEnum.ts_forecasting, True, 2, 3, (None, True, False, 6)), + (TaskTypesEnum.ts_forecasting, False, None, 3, (None, False, True, None)), + (TaskTypesEnum.regression, False, None, None, (None, False, False, None)), +]) +def test_plan_prediction_returns_typed_branching_decision(task_type, in_sample, validation_blocks, forecast_length, expected): + plan = plan_prediction(task_type, in_sample, validation_blocks, forecast_length) + assert (plan.output_mode, plan.use_in_sample_forecast, plan.flatten_prediction, plan.horizon) == expected + + +def test_resolve_strategy_finds_matching_factory(): + resolution = resolve_strategy(pd.DataFrame([[1]]), [(np.ndarray, _StrategyA), (pd.DataFrame, _StrategyB)]) + assert resolution.strategy_factory is _StrategyB + + +def test_resolve_strategy_raises_typed_error_for_unsupported_source(): + with pytest.raises(DataDefinitionResolutionError): + resolve_strategy(123, [(np.ndarray, _StrategyA), (pd.DataFrame, _StrategyB)]) From 88813571d6895135440dbd5812f45c99aef94db9 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:22:15 +0300 Subject: [PATCH 08/32] extract pure recommendation rules from input analyser --- fedot/api/api_utils/input_analyser.py | 85 ++++++------ fedot/api/api_utils/recommendation_rules.py | 131 ++++++++++++++++++ tests/api/api_utils/test_input_analyser.py | 59 ++++++++ .../api_utils/test_recommendation_rules.py | 110 +++++++++++++++ 4 files changed, 346 insertions(+), 39 deletions(-) create mode 100644 fedot/api/api_utils/recommendation_rules.py create mode 100644 tests/api/api_utils/test_input_analyser.py create mode 100644 tests/api/api_utils/test_recommendation_rules.py diff --git a/fedot/api/api_utils/input_analyser.py b/fedot/api/api_utils/input_analyser.py index 61f91770e5..5d5c75db3a 100644 --- 
a/fedot/api/api_utils/input_analyser.py +++ b/fedot/api/api_utils/input_analyser.py @@ -1,10 +1,15 @@ -from functools import partial -from inspect import signature -from typing import Any, Dict, Tuple, Union +from typing import Any, Dict, Tuple, Union -import numpy as np from golem.core.log import default_log +from fedot.api.api_utils.recommendation_rules import ( + RecommendationLimits, + build_recommendation_bundle, + build_safe_data_recommendations, + collect_meta_rule_recommendations, + estimate_size_cut_border, + should_use_label_encoding, +) from fedot.core.composer.meta_rules import get_cv_folds_number, get_early_stopping_generations, get_recommended_preset from fedot.core.data.data import InputData from fedot.core.data.data_preprocessing import find_categorical_columns @@ -54,12 +59,19 @@ def give_recommendations(self, input_data: Union[InputData, MultiModalData], inp input_params=input_params) elif isinstance(input_data, InputData): if input_data.data_type in [DataTypesEnum.table, DataTypesEnum.text]: - recommendations_for_data = self._give_recommendations_for_data(input_data=input_data) - if 'use_meta_rules' in input_params and input_params['use_meta_rules']: - recommendations_for_params = self._give_recommendations_with_meta_rules(input_data=input_data, - input_params=input_params) + recommendation_bundle = build_recommendation_bundle( + input_data=input_data, + input_params=input_params, + safe_mode=self.safe_mode, + limits=self._limits(), + categorical_detector=find_categorical_columns, + meta_rules=meta_rules, + log=self._log, + ) + recommendations_for_data = recommendation_bundle.data + recommendations_for_params = recommendation_bundle.params if 'label_encoded' in recommendations_for_data: - recommendations_for_params['label_encoded'] = recommendations_for_data['label_encoded'] + self._log.info('Switch categorical encoder to label encoder') return recommendations_for_data, recommendations_for_params @@ -70,29 +82,20 @@ def 
_give_recommendations_for_data(self, input_data: InputData) -> Dict: :return : dict with str recommendations """ - recommendations_for_data = {} - if self.safe_mode: - is_cut_needed, border = self.control_size(input_data) - if is_cut_needed: - recommendations_for_data['cut'] = {'border': border} - is_label_encoding_needed = self.control_categorical(input_data) - if is_label_encoding_needed: - self._log.info('Switch categorical encoder to label encoder') - recommendations_for_data['label_encoded'] = {} - return recommendations_for_data + return build_safe_data_recommendations( + input_data=input_data, + safe_mode=self.safe_mode, + limits=self._limits(), + categorical_detector=find_categorical_columns, + ) def _give_recommendations_with_meta_rules(self, input_data: InputData, input_params: Dict): - recommendations = dict() - for rule in meta_rules: - if 'input_params' in signature(rule).parameters: - rule = partial(rule, input_params=input_params) - if 'input_data' in signature(rule).parameters: - rule = partial(rule, input_data=input_data) - cur_recommendation = rule(log=self._log) - # if there is recommendation to change parameter - if list(cur_recommendation.values())[0]: - recommendations.update(cur_recommendation) - return recommendations + return collect_meta_rule_recommendations( + input_data=input_data, + input_params=input_params, + rules=meta_rules, + log=self._log, + ) def control_size(self, input_data: InputData) -> Tuple[bool, Any]: """ @@ -102,11 +105,8 @@ def control_size(self, input_data: InputData) -> Tuple[bool, Any]: :return : (is_cut_needed, border) is cutting is needed | if yes - border of cutting, """ - if input_data.data_type == DataTypesEnum.table: - if input_data.features.shape[0] * input_data.features.shape[1] > self.max_size: - border = self.max_size // input_data.features.shape[1] - return True, border - return False, None + border = estimate_size_cut_border(input_data, self.max_size) + return border is not None, border def 
control_categorical(self, input_data: InputData) -> bool: """ @@ -115,7 +115,14 @@ def control_categorical(self, input_data: InputData) -> bool: :param input_data: data for preprocessing """ - categorical_ids, _ = find_categorical_columns(input_data.features) - # Counts unique categories for each feature, and then counts their number - uniques_cats = sum([len(np.unique(feature)) for feature in input_data.features[:, categorical_ids].astype(str)]) - return uniques_cats > self.max_cat_cardinality + return should_use_label_encoding( + input_data=input_data, + max_cat_cardinality=self.max_cat_cardinality, + categorical_detector=find_categorical_columns, + ) + + def _limits(self) -> RecommendationLimits: + return RecommendationLimits( + max_size=self.max_size, + max_cat_cardinality=self.max_cat_cardinality, + ) diff --git a/fedot/api/api_utils/recommendation_rules.py b/fedot/api/api_utils/recommendation_rules.py new file mode 100644 index 0000000000..7e27432864 --- /dev/null +++ b/fedot/api/api_utils/recommendation_rules.py @@ -0,0 +1,131 @@ +from dataclasses import dataclass +from functools import partial +from inspect import signature +from typing import Any, Callable, Dict, Iterable, Optional, Sequence + +import numpy as np + +from fedot.core.data.data import InputData +from fedot.core.repository.dataset_types import DataTypesEnum + +MetaRule = Callable[..., Dict[str, Any]] + + +@dataclass(frozen=True) +class RecommendationLimits: + max_size: int + max_cat_cardinality: int + + +@dataclass(frozen=True) +class RecommendationBundle: + data: Dict[str, Dict[str, Any]] + params: Dict[str, Any] + + +def supports_data_recommendations(input_data: InputData) -> bool: + return input_data.data_type in (DataTypesEnum.table, DataTypesEnum.text) + + +def estimate_size_cut_border(input_data: InputData, max_size: int) -> Optional[int]: + if input_data.data_type != DataTypesEnum.table: + return None + + rows, columns = input_data.features.shape[0], input_data.features.shape[1] + if 
rows * columns <= max_size: + return None + return max_size // columns + + +def estimate_categorical_cardinality(input_data: InputData, + categorical_detector: Callable[[Any], tuple[Sequence[int], Sequence[int]]]) -> int: + categorical_ids, _ = categorical_detector(input_data.features) + if not categorical_ids: + return 0 + return sum(len(np.unique(feature)) for feature in input_data.features[:, categorical_ids].astype(str)) + + +def should_use_label_encoding(input_data: InputData, + max_cat_cardinality: int, + categorical_detector: Callable[[Any], tuple[Sequence[int], Sequence[int]]]) -> bool: + return estimate_categorical_cardinality(input_data, categorical_detector) > max_cat_cardinality + + +def build_safe_data_recommendations(input_data: InputData, + safe_mode: bool, + limits: RecommendationLimits, + categorical_detector: Callable[[Any], tuple[Sequence[int], Sequence[int]]]) -> Dict[str, Dict[str, Any]]: + if not safe_mode or not supports_data_recommendations(input_data): + return {} + + recommendations: Dict[str, Dict[str, Any]] = {} + border = estimate_size_cut_border(input_data, limits.max_size) + if border is not None: + recommendations['cut'] = {'border': border} + + if should_use_label_encoding(input_data, limits.max_cat_cardinality, categorical_detector): + recommendations['label_encoded'] = {} + + return recommendations + + +def should_apply_meta_rules(input_params: Optional[Dict[str, Any]]) -> bool: + return bool(input_params and input_params.get('use_meta_rules')) + + +def evaluate_meta_rule(rule: MetaRule, + input_data: InputData, + input_params: Dict[str, Any], + log) -> Dict[str, Any]: + bound_rule = rule + if 'input_params' in signature(bound_rule).parameters: + bound_rule = partial(bound_rule, input_params=input_params) + if 'input_data' in signature(bound_rule).parameters: + bound_rule = partial(bound_rule, input_data=input_data) + return bound_rule(log=log) + + +def collect_meta_rule_recommendations(input_data: InputData, + input_params: 
Dict[str, Any], + rules: Iterable[MetaRule], + log) -> Dict[str, Any]: + recommendations: Dict[str, Any] = {} + if not should_apply_meta_rules(input_params): + return recommendations + + for rule in rules: + current_recommendation = evaluate_meta_rule(rule, input_data, input_params, log) + if any(value is not None and value is not False for value in current_recommendation.values()): + recommendations.update(current_recommendation) + return recommendations + + +def merge_parameter_recommendations(data_recommendations: Dict[str, Dict[str, Any]], + meta_recommendations: Dict[str, Any]) -> Dict[str, Any]: + merged = dict(meta_recommendations) + if 'label_encoded' in data_recommendations: + merged['label_encoded'] = data_recommendations['label_encoded'] + return merged + + +def build_recommendation_bundle(input_data: InputData, + input_params: Dict[str, Any], + safe_mode: bool, + limits: RecommendationLimits, + categorical_detector: Callable[[Any], tuple[Sequence[int], Sequence[int]]], + meta_rules: Iterable[MetaRule], + log) -> RecommendationBundle: + data_recommendations = build_safe_data_recommendations( + input_data=input_data, + safe_mode=safe_mode, + limits=limits, + categorical_detector=categorical_detector, + ) + meta_recommendations = collect_meta_rule_recommendations( + input_data=input_data, + input_params=input_params, + rules=meta_rules, + log=log, + ) + params_recommendations = merge_parameter_recommendations(data_recommendations, meta_recommendations) + return RecommendationBundle(data=data_recommendations, params=params_recommendations) diff --git a/tests/api/api_utils/test_input_analyser.py b/tests/api/api_utils/test_input_analyser.py new file mode 100644 index 0000000000..e6174f59da --- /dev/null +++ b/tests/api/api_utils/test_input_analyser.py @@ -0,0 +1,59 @@ +import numpy as np + +from fedot.api.api_utils.input_analyser import InputAnalyser +from fedot.core.data.data import InputData +from fedot.core.repository.dataset_types import DataTypesEnum 
+from fedot.core.repository.tasks import Task, TaskTypesEnum + + +def _make_input_data(): + features = np.array([ + ['a', 'x', 1], + ['b', 'y', 2], + ['c', 'z', 3], + ['d', 'q', 4], + ], dtype=object) + target = np.array([0, 1, 0, 1]) + return InputData( + idx=np.arange(features.shape[0]), + features=features, + target=target, + task=Task(TaskTypesEnum.classification), + data_type=DataTypesEnum.table, + ) + + +def test_input_analyser_give_recommendations_uses_pure_bundle_rules(monkeypatch): + captured = {} + + def fake_build_recommendation_bundle(**kwargs): + captured['safe_mode'] = kwargs['safe_mode'] + captured['input_params'] = kwargs['input_params'] + return type('Bundle', (), {'data': {'cut': {'border': 2}}, 'params': {'preset': 'fast_train'}})() + + monkeypatch.setattr('fedot.api.api_utils.input_analyser.build_recommendation_bundle', fake_build_recommendation_bundle) + + analyser = InputAnalyser(safe_mode=True) + data_recommendations, params_recommendations = analyser.give_recommendations( + _make_input_data(), + input_params={'use_meta_rules': True}, + ) + + assert captured['safe_mode'] is True + assert captured['input_params'] == {'use_meta_rules': True} + assert data_recommendations == {'cut': {'border': 2}} + assert params_recommendations == {'preset': 'fast_train'} + + +def test_input_analyser_control_helpers_delegate_to_rules(): + analyser = InputAnalyser(safe_mode=True) + analyser.max_size = 8 + analyser.max_cat_cardinality = 5 + + input_data = _make_input_data() + + is_cut_needed, border = analyser.control_size(input_data) + + assert is_cut_needed is True + assert border == 2 + assert analyser.control_categorical(input_data) is True diff --git a/tests/api/api_utils/test_recommendation_rules.py b/tests/api/api_utils/test_recommendation_rules.py new file mode 100644 index 0000000000..317c93682e --- /dev/null +++ b/tests/api/api_utils/test_recommendation_rules.py @@ -0,0 +1,110 @@ +from types import SimpleNamespace + +import numpy as np + +from 
fedot.api.api_utils.recommendation_rules import ( + RecommendationLimits, + build_recommendation_bundle, + build_safe_data_recommendations, + collect_meta_rule_recommendations, + estimate_categorical_cardinality, + estimate_size_cut_border, + merge_parameter_recommendations, + should_apply_meta_rules, + should_use_label_encoding, +) +from fedot.core.repository.dataset_types import DataTypesEnum + + +class _FakeLog: + def info(self, message): + return message + + +def _fake_categorical_detector(_table): + return [0, 1], [2] + + +def test_estimate_size_cut_border_returns_border_only_for_large_tables(): + input_data = SimpleNamespace(data_type=DataTypesEnum.table, features=np.zeros((10, 4))) + + assert estimate_size_cut_border(input_data, max_size=30) == 7 + assert estimate_size_cut_border(input_data, max_size=100) is None + + +def test_estimate_categorical_cardinality_and_label_encoding_decision(): + input_data = SimpleNamespace( + data_type=DataTypesEnum.table, + features=np.array([ + ['a', 'x', 1], + ['b', 'y', 2], + ['c', 'z', 3], + ], dtype=object), + ) + + cardinality = estimate_categorical_cardinality(input_data, _fake_categorical_detector) + assert cardinality == 6 + assert should_use_label_encoding(input_data, max_cat_cardinality=5, categorical_detector=_fake_categorical_detector) + + +def test_build_safe_data_recommendations_is_empty_when_safe_mode_disabled(): + input_data = SimpleNamespace(data_type=DataTypesEnum.table, features=np.zeros((10, 4))) + + recommendations = build_safe_data_recommendations( + input_data=input_data, + safe_mode=False, + limits=RecommendationLimits(max_size=10, max_cat_cardinality=1), + categorical_detector=_fake_categorical_detector, + ) + + assert recommendations == {} + + +def test_collect_meta_rule_recommendations_uses_only_meaningful_values(): + def empty_rule(log): + return {'preset': None} + + def useful_rule(input_data, input_params, log): + return {'cv_folds': 3} + + recommendations = collect_meta_rule_recommendations( + 
input_data=SimpleNamespace(), + input_params={'use_meta_rules': True}, + rules=[empty_rule, useful_rule], + log=_FakeLog(), + ) + + assert recommendations == {'cv_folds': 3} + assert should_apply_meta_rules({'use_meta_rules': True}) is True + assert should_apply_meta_rules({'use_meta_rules': False}) is False + + +def test_build_recommendation_bundle_merges_data_and_param_recommendations(): + def meta_rule(input_data, input_params, log): + return {'preset': 'fast_train'} + + input_data = SimpleNamespace( + data_type=DataTypesEnum.table, + features=np.array([ + ['a', 'x', 1], + ['b', 'y', 2], + ['c', 'z', 3], + ], dtype=object), + ) + + bundle = build_recommendation_bundle( + input_data=input_data, + input_params={'use_meta_rules': True}, + safe_mode=True, + limits=RecommendationLimits(max_size=4, max_cat_cardinality=5), + categorical_detector=_fake_categorical_detector, + meta_rules=[meta_rule], + log=_FakeLog(), + ) + + assert bundle.data == {'cut': {'border': 1}, 'label_encoded': {}} + assert bundle.params == {'preset': 'fast_train', 'label_encoded': {}} + assert merge_parameter_recommendations({'label_encoded': {}}, {'cv_folds': 3}) == { + 'cv_folds': 3, + 'label_encoded': {}, + } From d39e2f7a0319870e068c724c1b90569803ebceff Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:25:05 +0300 Subject: [PATCH 09/32] extract typed fit and composer planning rules --- fedot/api/api_utils/api_composer.py | 399 ++++++++++---------- fedot/api/api_utils/api_run_planner.py | 84 +++++ fedot/api/main.py | 29 +- tests/api/api_utils/test_api_run_planner.py | 93 +++++ 4 files changed, 402 insertions(+), 203 deletions(-) create mode 100644 fedot/api/api_utils/api_run_planner.py create mode 100644 tests/api/api_utils/test_api_run_planner.py diff --git a/fedot/api/api_utils/api_composer.py b/fedot/api/api_utils/api_composer.py index cfb7d6ee49..f5ddc5113d 100644 --- a/fedot/api/api_utils/api_composer.py +++ b/fedot/api/api_utils/api_composer.py @@ -1,191 +1,210 @@ -import 
datetime -import gc -from copy import deepcopy -from typing import List, Optional, Sequence, Tuple, Union - -from golem.core.log import default_log -from golem.core.optimisers.opt_history_objects.opt_history import OptHistory -from golem.core.tuning.simultaneous import SimultaneousTuner - -from fedot.api.api_utils.assumptions.assumptions_handler import AssumptionsHandler -from fedot.api.api_utils.params import ApiParams -from fedot.api.time import ApiTime -from fedot.core.caching.operations_cache import OperationsCache -from fedot.core.caching.preprocessing_cache import PreprocessingCache -from fedot.core.caching.predictions_cache import PredictionsCache -from fedot.core.composer.composer_builder import ComposerBuilder -from fedot.core.composer.gp_composer.gp_composer import GPComposer -from fedot.core.constants import DEFAULT_TUNING_ITERATIONS_NUMBER -from fedot.core.data.data import InputData -from fedot.core.pipelines.pipeline import Pipeline -from fedot.core.pipelines.tuning.tuner_builder import TunerBuilder -from fedot.core.repository.metrics_repository import MetricIDType -from fedot.utilities.composer_timer import fedot_composer_timer - - -class ApiComposer: - - def __init__(self, api_params: ApiParams, metrics: Union[MetricIDType, Sequence[MetricIDType]]): - self.log = default_log(self) - self.params = api_params - self.metrics = metrics - self.operations_cache: Optional[OperationsCache] = None - self.preprocessing_cache: Optional[PreprocessingCache] = None - self.predictions_cache: Optional[PredictionsCache] = None - self.timer = None - # status flag indicating that composer step was applied - self.was_optimised = False - # status flag indicating that tuner step was applied` - self.was_tuned = False - self.init_cache() - - def init_cache(self): - use_operations_cache = self.params.get('use_operations_cache') - use_preprocessing_cache = self.params.get('use_preprocessing_cache') - use_predictions_cache = self.params.get('use_predictions_cache') - 
use_input_preprocessing = self.params.get('use_input_preprocessing') - cache_dir = self.params.get('cache_dir') - use_stats = self.params.get('use_stats') - if use_operations_cache: - self.operations_cache = OperationsCache(cache_dir=cache_dir, use_stats=use_stats) - # in case of previously generated singleton cache - self.operations_cache.reset() - if use_input_preprocessing and use_preprocessing_cache: - self.preprocessing_cache = PreprocessingCache(cache_dir=cache_dir, use_stats=use_stats) - # in case of previously generated singleton cache - self.preprocessing_cache.reset() - if use_predictions_cache: - self.predictions_cache = PredictionsCache(cache_dir=cache_dir, use_stats=use_stats) - # in case of previously generated singleton cache - self.predictions_cache.reset() - - def obtain_model(self, train_data: InputData) -> Tuple[Pipeline, Sequence[Pipeline], OptHistory]: - """ Function for composing FEDOT pipeline model """ - - with fedot_composer_timer.launch_composing(): - timeout: float = self.params.timeout - with_tuning = self.params.get('with_tuning') - - self.timer = ApiTime(time_for_automl=timeout, with_tuning=with_tuning) - - initial_assumption, fitted_assumption = self.propose_and_fit_initial_assumption(train_data) - - multi_objective = len(self.metrics) > 1 - self.params.init_params_for_composing(self.timer.timedelta_composing, multi_objective) - - self.log.message(f"AutoML configured." - f" Parameters tuning: {with_tuning}." - f" Time limit: {timeout} min." 
- f" Set of candidate models: {self.params.get('available_operations')}.") - - best_pipeline, best_pipeline_candidates, gp_composer = self.compose_pipeline( - train_data, - initial_assumption, - fitted_assumption - ) - - if with_tuning: - with fedot_composer_timer.launch_tuning('composing'): - best_pipeline = self.tune_final_pipeline(train_data, best_pipeline) - - if gp_composer.history: - adapter = self.params.graph_generation_params.adapter - gp_composer.history.tuning_result = adapter.adapt(best_pipeline) - # enforce memory cleaning - gc.collect() - - self.log.message('Model generation finished') - return best_pipeline, best_pipeline_candidates, gp_composer.history - - def propose_and_fit_initial_assumption(self, train_data: InputData) -> Tuple[Sequence[Pipeline], Pipeline]: - """ Method for obtaining and fitting initial assumption""" - available_operations = self.params.get('available_operations') - - preset = self.params.get('preset') - - assumption_handler = AssumptionsHandler(train_data) - - initial_assumption = assumption_handler.propose_assumptions(self.params.get('initial_assumption'), - available_operations, - use_input_preprocessing=self.params.get( - 'use_input_preprocessing')) - - with self.timer.launch_assumption_fit(n_folds=self.params.data['cv_folds']): - fitted_assumption = \ - assumption_handler.fit_assumption_and_check_correctness(deepcopy(initial_assumption[0]), - operations_cache=self.operations_cache, - preprocessing_cache=self.preprocessing_cache, - eval_n_jobs=self.params.n_jobs) - - self.log.message( - f'Initial pipeline was fitted in ' - f'{round(self.timer.assumption_fit_spend_time_single_fold.total_seconds(), 1)} sec.') - - self.log.message( - f'Taking into account n_folds={self.params.data["cv_folds"]}, estimated fit time for initial assumption ' - f'is {round(self.timer.assumption_fit_spend_time.total_seconds(), 1)} sec.') - - self.params.update(preset=assumption_handler.propose_preset(preset, self.timer, n_jobs=self.params.n_jobs)) - 
- return initial_assumption, fitted_assumption - - def compose_pipeline(self, train_data: InputData, initial_assumption: Sequence[Pipeline], - fitted_assumption: Pipeline) -> Tuple[Pipeline, List[Pipeline], GPComposer]: - - gp_composer: GPComposer = (ComposerBuilder(task=self.params.task) - .with_requirements(self.params.composer_requirements) - .with_initial_pipelines(initial_assumption) - .with_optimizer(self.params.get('optimizer')) - .with_optimizer_params(parameters=self.params.optimizer_params) - .with_metrics(self.metrics) - .with_cache(self.operations_cache, self.preprocessing_cache, self.predictions_cache) - .with_graph_generation_param(self.params.graph_generation_params) - .build()) - - if self.timer.have_time_for_composing(self.params.get('pop_size'), self.params.n_jobs): - # Launch pipeline structure composition - with self.timer.launch_composing(): - self.log.message('Pipeline composition started.') - self.was_optimised = False - best_pipelines = gp_composer.compose_pipeline(data=train_data) - best_pipeline_candidates = gp_composer.best_models - self.was_optimised = True - else: - # Use initial pipeline as final solution - self.log.message(f'Timeout is too small for composing and is skipped ' - f'because fit_time is {self.timer.assumption_fit_spend_time.total_seconds()} sec.') - best_pipelines = fitted_assumption - best_pipeline_candidates = [fitted_assumption] - - for pipeline in best_pipeline_candidates: - pipeline.log = self.log - best_pipeline = best_pipelines[0] if isinstance(best_pipelines, Sequence) else best_pipelines - return best_pipeline, best_pipeline_candidates, gp_composer - - def tune_final_pipeline(self, train_data: InputData, pipeline_gp_composed: Pipeline) -> Pipeline: - """ Launch tuning procedure for obtained pipeline by composer """ - timeout_for_tuning = abs(self.timer.determine_resources_for_tuning()) / 60 - tuner = (TunerBuilder(self.params.task) - .with_tuner(SimultaneousTuner) - .with_metric(self.metrics[0]) - 
.with_iterations(DEFAULT_TUNING_ITERATIONS_NUMBER) - .with_timeout(datetime.timedelta(minutes=timeout_for_tuning)) - .with_eval_time_constraint(self.params.composer_requirements.max_graph_fit_time) - .with_requirements(self.params.composer_requirements) - .build(train_data)) - - if self.timer.have_time_for_tuning(): - # Tune all nodes in the pipeline - with self.timer.launch_tuning(): - self.was_tuned = False - self.log.message(f'Hyperparameters tuning started with {round(timeout_for_tuning)} min. timeout') - tuned_pipeline = tuner.tune(pipeline_gp_composed) - self.log.message('Hyperparameters tuning finished') - else: - self.log.message(f'Time for pipeline composing was {str(self.timer.composing_spend_time)}.\n' - f'The remaining {max(0, round(timeout_for_tuning, 1))} seconds are not enough ' - f'to tune the hyperparameters.') - self.log.message('Composed pipeline returned without tuning.') - tuned_pipeline = pipeline_gp_composed - self.was_tuned = tuner.was_tuned +import datetime +import gc +from copy import deepcopy +from typing import List, Optional, Sequence, Tuple, Union + +from golem.core.log import default_log +from golem.core.optimisers.opt_history_objects.opt_history import OptHistory +from golem.core.tuning.simultaneous import SimultaneousTuner + +from fedot.api.api_utils.api_run_planner import build_composer_execution_plan +from fedot.api.api_utils.assumptions.assumptions_handler import AssumptionsHandler +from fedot.api.api_utils.params import ApiParams +from fedot.api.time import ApiTime +from fedot.core.caching.operations_cache import OperationsCache +from fedot.core.caching.preprocessing_cache import PreprocessingCache +from fedot.core.caching.predictions_cache import PredictionsCache +from fedot.core.composer.composer_builder import ComposerBuilder +from fedot.core.composer.gp_composer.gp_composer import GPComposer +from fedot.core.constants import DEFAULT_TUNING_ITERATIONS_NUMBER +from fedot.core.data.data import InputData +from 
fedot.core.pipelines.pipeline import Pipeline +from fedot.core.pipelines.tuning.tuner_builder import TunerBuilder +from fedot.core.repository.metrics_repository import MetricIDType +from fedot.utilities.composer_timer import fedot_composer_timer + + +class ApiComposer: + + def __init__(self, api_params: ApiParams, metrics: Union[MetricIDType, Sequence[MetricIDType]]): + self.log = default_log(self) + self.params = api_params + self.metrics = metrics + self.operations_cache: Optional[OperationsCache] = None + self.preprocessing_cache: Optional[PreprocessingCache] = None + self.predictions_cache: Optional[PredictionsCache] = None + self.timer = None + # status flag indicating that composer step was applied + self.was_optimised = False + # status flag indicating that tuner step was applied` + self.was_tuned = False + self.init_cache() + + def init_cache(self): + use_operations_cache = self.params.get('use_operations_cache') + use_preprocessing_cache = self.params.get('use_preprocessing_cache') + use_predictions_cache = self.params.get('use_predictions_cache') + use_input_preprocessing = self.params.get('use_input_preprocessing') + cache_dir = self.params.get('cache_dir') + use_stats = self.params.get('use_stats') + if use_operations_cache: + self.operations_cache = OperationsCache(cache_dir=cache_dir, use_stats=use_stats) + # in case of previously generated singleton cache + self.operations_cache.reset() + if use_input_preprocessing and use_preprocessing_cache: + self.preprocessing_cache = PreprocessingCache(cache_dir=cache_dir, use_stats=use_stats) + # in case of previously generated singleton cache + self.preprocessing_cache.reset() + if use_predictions_cache: + self.predictions_cache = PredictionsCache(cache_dir=cache_dir, use_stats=use_stats) + # in case of previously generated singleton cache + self.predictions_cache.reset() + + def obtain_model(self, train_data: InputData) -> Tuple[Pipeline, Sequence[Pipeline], OptHistory]: + """ Function for composing FEDOT 
pipeline model """ + + with fedot_composer_timer.launch_composing(): + timeout: float = self.params.timeout + with_tuning = self.params.get('with_tuning') + + self.timer = ApiTime(time_for_automl=timeout, with_tuning=with_tuning) + + initial_assumption, fitted_assumption = self.propose_and_fit_initial_assumption(train_data) + + multi_objective = len(self.metrics) > 1 + self.params.init_params_for_composing(self.timer.timedelta_composing, multi_objective) + + self.log.message(f"AutoML configured." + f" Parameters tuning: {with_tuning}." + f" Time limit: {timeout} min." + f" Set of candidate models: {self.params.get('available_operations')}.") + + best_pipeline, best_pipeline_candidates, gp_composer = self.compose_pipeline( + train_data, + initial_assumption, + fitted_assumption + ) + + timeout_for_tuning = abs(self.timer.determine_resources_for_tuning()) / 60 + execution_plan = build_composer_execution_plan( + with_tuning=with_tuning, + have_time_for_composing=self.was_optimised, + have_time_for_tuning=self.timer.have_time_for_tuning(), + tuning_timeout_minutes=timeout_for_tuning, + ) + + if execution_plan.should_tune: + with fedot_composer_timer.launch_tuning('composing'): + best_pipeline = self.tune_final_pipeline(train_data, best_pipeline, execution_plan) + elif with_tuning: + self.log.message(f'Time for pipeline composing was {str(self.timer.composing_spend_time)}.\n' + f'The remaining {max(0, round(execution_plan.tuning_timeout_minutes, 1))} seconds are not enough ' + f'to tune the hyperparameters.') + self.log.message('Composed pipeline returned without tuning.') + self.was_tuned = False + + if gp_composer.history: + adapter = self.params.graph_generation_params.adapter + gp_composer.history.tuning_result = adapter.adapt(best_pipeline) + # enforce memory cleaning + gc.collect() + + self.log.message('Model generation finished') + return best_pipeline, best_pipeline_candidates, gp_composer.history + + def propose_and_fit_initial_assumption(self, train_data: 
InputData) -> Tuple[Sequence[Pipeline], Pipeline]: + """ Method for obtaining and fitting initial assumption""" + available_operations = self.params.get('available_operations') + + preset = self.params.get('preset') + + assumption_handler = AssumptionsHandler(train_data) + + initial_assumption = assumption_handler.propose_assumptions(self.params.get('initial_assumption'), + available_operations, + use_input_preprocessing=self.params.get( + 'use_input_preprocessing')) + + with self.timer.launch_assumption_fit(n_folds=self.params.data['cv_folds']): + fitted_assumption = \ + assumption_handler.fit_assumption_and_check_correctness(deepcopy(initial_assumption[0]), + operations_cache=self.operations_cache, + preprocessing_cache=self.preprocessing_cache, + eval_n_jobs=self.params.n_jobs) + + self.log.message( + f'Initial pipeline was fitted in ' + f'{round(self.timer.assumption_fit_spend_time_single_fold.total_seconds(), 1)} sec.') + + self.log.message( + f'Taking into account n_folds={self.params.data["cv_folds"]}, estimated fit time for initial assumption ' + f'is {round(self.timer.assumption_fit_spend_time.total_seconds(), 1)} sec.') + + self.params.update(preset=assumption_handler.propose_preset(preset, self.timer, n_jobs=self.params.n_jobs)) + + return initial_assumption, fitted_assumption + + def compose_pipeline(self, train_data: InputData, initial_assumption: Sequence[Pipeline], + fitted_assumption: Pipeline) -> Tuple[Pipeline, List[Pipeline], GPComposer]: + + gp_composer: GPComposer = (ComposerBuilder(task=self.params.task) + .with_requirements(self.params.composer_requirements) + .with_initial_pipelines(initial_assumption) + .with_optimizer(self.params.get('optimizer')) + .with_optimizer_params(parameters=self.params.optimizer_params) + .with_metrics(self.metrics) + .with_cache(self.operations_cache, self.preprocessing_cache, self.predictions_cache) + .with_graph_generation_param(self.params.graph_generation_params) + .build()) + + have_time_for_composing = 
self.timer.have_time_for_composing(self.params.get('pop_size'), self.params.n_jobs) + execution_plan = build_composer_execution_plan( + with_tuning=self.params.get('with_tuning'), + have_time_for_composing=have_time_for_composing, + have_time_for_tuning=False, + tuning_timeout_minutes=0, + ) + + if execution_plan.should_compose: + # Launch pipeline structure composition + with self.timer.launch_composing(): + self.log.message('Pipeline composition started.') + self.was_optimised = False + best_pipelines = gp_composer.compose_pipeline(data=train_data) + best_pipeline_candidates = gp_composer.best_models + self.was_optimised = True + else: + # Use initial pipeline as final solution + self.log.message(f'Timeout is too small for composing and is skipped ' + f'because fit_time is {self.timer.assumption_fit_spend_time.total_seconds()} sec.') + best_pipelines = fitted_assumption + best_pipeline_candidates = [fitted_assumption] + self.was_optimised = False + + for pipeline in best_pipeline_candidates: + pipeline.log = self.log + best_pipeline = best_pipelines[0] if isinstance(best_pipelines, Sequence) else best_pipelines + return best_pipeline, best_pipeline_candidates, gp_composer + + def tune_final_pipeline(self, train_data: InputData, + pipeline_gp_composed: Pipeline, + execution_plan=None) -> Pipeline: + """ Launch tuning procedure for obtained pipeline by composer """ + timeout_for_tuning = execution_plan.tuning_timeout_minutes if execution_plan else abs( + self.timer.determine_resources_for_tuning()) / 60 + tuner = (TunerBuilder(self.params.task) + .with_tuner(SimultaneousTuner) + .with_metric(self.metrics[0]) + .with_iterations(DEFAULT_TUNING_ITERATIONS_NUMBER) + .with_timeout(datetime.timedelta(minutes=timeout_for_tuning)) + .with_eval_time_constraint(self.params.composer_requirements.max_graph_fit_time) + .with_requirements(self.params.composer_requirements) + .build(train_data)) + + with self.timer.launch_tuning(): + self.was_tuned = False + 
self.log.message(f'Hyperparameters tuning started with {round(timeout_for_tuning)} min. timeout') + tuned_pipeline = tuner.tune(pipeline_gp_composed) + self.log.message('Hyperparameters tuning finished') + self.was_tuned = tuner.was_tuned return tuned_pipeline diff --git a/fedot/api/api_utils/api_run_planner.py b/fedot/api/api_utils/api_run_planner.py new file mode 100644 index 0000000000..dcc4017570 --- /dev/null +++ b/fedot/api/api_utils/api_run_planner.py @@ -0,0 +1,84 @@ +from dataclasses import dataclass +from typing import Any, Optional + + +@dataclass(frozen=True) +class SamplingStagePlan: + resolved_predefined_model: Optional[Any] + should_run_sampling_stage: bool + skip_metadata: Optional[dict] + + +@dataclass(frozen=True) +class FinalFitPlan: + should_train_on_full_dataset: bool + + +@dataclass(frozen=True) +class ComposerExecutionPlan: + should_compose: bool + should_tune: bool + tuning_timeout_minutes: float + + +SKIP_REASON_PREDEFINED_MODEL = 'predefined_model' +SKIP_REASON_ATOMIZED_INITIAL_ASSUMPTION = 'atomized_initial_assumption' + + +def is_atomized_initial_assumption(initial_assumption: Optional[Any]) -> bool: + descriptive_id = getattr(initial_assumption, 'descriptive_id', '') + return bool(descriptive_id) and 'atomized' in descriptive_id + + +def plan_sampling_stage(requested_predefined_model: Optional[Any], + initial_assumption: Optional[Any], + sampling_config_present: bool) -> SamplingStagePlan: + if requested_predefined_model is not None: + return SamplingStagePlan( + resolved_predefined_model=requested_predefined_model, + should_run_sampling_stage=False, + skip_metadata=_skip_metadata(SKIP_REASON_PREDEFINED_MODEL) if sampling_config_present else None, + ) + + if is_atomized_initial_assumption(initial_assumption): + return SamplingStagePlan( + resolved_predefined_model=initial_assumption, + should_run_sampling_stage=False, + skip_metadata=_skip_metadata(SKIP_REASON_ATOMIZED_INITIAL_ASSUMPTION) if sampling_config_present else None, + ) + + 
return SamplingStagePlan( + resolved_predefined_model=None, + should_run_sampling_stage=sampling_config_present, + skip_metadata=None, + ) + + +def plan_final_fit(history: Optional[Any], pipeline_is_fitted: bool) -> FinalFitPlan: + return FinalFitPlan( + should_train_on_full_dataset=history_has_records(history) or not pipeline_is_fitted, + ) + + +def history_has_records(history: Optional[Any]) -> bool: + if history is None: + return False + is_empty = getattr(history, 'is_empty', None) + if callable(is_empty): + return not is_empty() + return True + + +def build_composer_execution_plan(with_tuning: bool, + have_time_for_composing: bool, + have_time_for_tuning: bool, + tuning_timeout_minutes: float) -> ComposerExecutionPlan: + return ComposerExecutionPlan( + should_compose=have_time_for_composing, + should_tune=with_tuning and have_time_for_tuning, + tuning_timeout_minutes=max(0.0, tuning_timeout_minutes), + ) + + +def _skip_metadata(reason: str) -> dict: + return {'status': 'skipped', 'reason': reason} diff --git a/fedot/api/main.py b/fedot/api/main.py index 84e5790c62..b6dcf0f782 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -12,6 +12,7 @@ from golem.visualisation.opt_viz_extra import visualise_pareto from fedot.api.api_utils.api_composer import ApiComposer +from fedot.api.api_utils.api_run_planner import plan_final_fit, plan_sampling_stage from fedot.api.api_utils.api_data import ApiDataProcessor from fedot.api.api_utils.data_definition import FeaturesType, TargetType from fedot.api.api_utils.input_analyser import InputAnalyser @@ -172,19 +173,20 @@ def fit(self, with fedot_composer_timer.launch_preprocessing(): self.train_data = self.data_processor.fit_transform(self.train_data) - # TODO: Workaround for AtomizedModel - init_asm = self.params.data.get('initial_assumption') - if predefined_model is None: - if isinstance(init_asm, Pipeline) and ("atomized" in init_asm.descriptive_id): + fit_plan = plan_sampling_stage( + 
requested_predefined_model=predefined_model, + initial_assumption=self.params.data.get('initial_assumption'), + sampling_config_present=self.params.get('sampling_config') is not None, + ) + predefined_model = fit_plan.resolved_predefined_model + if fit_plan.skip_metadata is not None: + self.sampling_stage_metadata = fit_plan.skip_metadata + if fit_plan.skip_metadata['reason'] == 'predefined_model': + self.log.message('Sampling stage skipped because predefined_model is specified.') + elif fit_plan.skip_metadata['reason'] == 'atomized_initial_assumption': self.log.message('Composition for AtomizedModel currently unavailable') - predefined_model = init_asm - if self.params.get('sampling_config') is not None: - self.sampling_stage_metadata = {'status': 'skipped', 'reason': 'atomized_initial_assumption'} - else: - self._run_sampling_stage_if_necessary() - elif self.params.get('sampling_config') is not None: - self.sampling_stage_metadata = {'status': 'skipped', 'reason': 'predefined_model'} - self.log.message('Sampling stage skipped because predefined_model is specified.') + elif fit_plan.should_run_sampling_stage: + self._run_sampling_stage_if_necessary() with fedot_composer_timer.launch_fitting(): if predefined_model is not None: @@ -204,7 +206,8 @@ def fit(self, # Final fit for obtained pipeline on full dataset with fedot_composer_timer.launch_train_inference(): - if self.history and not self.history.is_empty() or not self.current_pipeline.is_fitted: + final_fit_plan = plan_final_fit(self.history, self.current_pipeline.is_fitted) + if final_fit_plan.should_train_on_full_dataset: self._train_pipeline_on_full_dataset(recommendations_for_data, full_train_not_preprocessed) self.log.message('Final pipeline was fitted') else: diff --git a/tests/api/api_utils/test_api_run_planner.py b/tests/api/api_utils/test_api_run_planner.py new file mode 100644 index 0000000000..1f9c65ab54 --- /dev/null +++ b/tests/api/api_utils/test_api_run_planner.py @@ -0,0 +1,93 @@ +from types 
import SimpleNamespace + +from fedot.api.api_utils.api_run_planner import ( + SKIP_REASON_ATOMIZED_INITIAL_ASSUMPTION, + SKIP_REASON_PREDEFINED_MODEL, + build_composer_execution_plan, + history_has_records, + is_atomized_initial_assumption, + plan_final_fit, + plan_sampling_stage, +) + + +class _FakeHistory: + def __init__(self, is_empty_value): + self._is_empty_value = is_empty_value + + def is_empty(self): + return self._is_empty_value + + +def test_is_atomized_initial_assumption_detects_atomized_pipeline(): + atomized = SimpleNamespace(descriptive_id='some_atomized_pipeline') + regular = SimpleNamespace(descriptive_id='ordinary_pipeline') + + assert is_atomized_initial_assumption(atomized) is True + assert is_atomized_initial_assumption(regular) is False + assert is_atomized_initial_assumption(None) is False + + +def test_plan_sampling_stage_skips_for_explicit_predefined_model(): + plan = plan_sampling_stage( + requested_predefined_model='rf', + initial_assumption=None, + sampling_config_present=True, + ) + + assert plan.resolved_predefined_model == 'rf' + assert plan.should_run_sampling_stage is False + assert plan.skip_metadata == {'status': 'skipped', 'reason': SKIP_REASON_PREDEFINED_MODEL} + + +def test_plan_sampling_stage_skips_for_atomized_initial_assumption(): + atomized = SimpleNamespace(descriptive_id='my_atomized_pipeline') + plan = plan_sampling_stage( + requested_predefined_model=None, + initial_assumption=atomized, + sampling_config_present=True, + ) + + assert plan.resolved_predefined_model is atomized + assert plan.should_run_sampling_stage is False + assert plan.skip_metadata == {'status': 'skipped', 'reason': SKIP_REASON_ATOMIZED_INITIAL_ASSUMPTION} + + +def test_plan_sampling_stage_runs_only_when_sampling_config_present(): + plan = plan_sampling_stage( + requested_predefined_model=None, + initial_assumption=None, + sampling_config_present=True, + ) + no_sampling_plan = plan_sampling_stage( + requested_predefined_model=None, + 
initial_assumption=None, + sampling_config_present=False, + ) + + assert plan.should_run_sampling_stage is True + assert plan.skip_metadata is None + assert no_sampling_plan.should_run_sampling_stage is False + + +def test_plan_final_fit_respects_history_and_pipeline_fit_state(): + assert history_has_records(None) is False + assert history_has_records(_FakeHistory(is_empty_value=True)) is False + assert history_has_records(_FakeHistory(is_empty_value=False)) is True + + assert plan_final_fit(None, pipeline_is_fitted=True).should_train_on_full_dataset is False + assert plan_final_fit(_FakeHistory(is_empty_value=False), pipeline_is_fitted=True).should_train_on_full_dataset is True + assert plan_final_fit(None, pipeline_is_fitted=False).should_train_on_full_dataset is True + + +def test_build_composer_execution_plan_is_typed_and_deterministic(): + plan = build_composer_execution_plan( + with_tuning=True, + have_time_for_composing=True, + have_time_for_tuning=False, + tuning_timeout_minutes=-5, + ) + + assert plan.should_compose is True + assert plan.should_tune is False + assert plan.tuning_timeout_minutes == 0.0 From c56c7ec35de87b6aa63df87609bc13f50343b48d Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:27:38 +0300 Subject: [PATCH 10/32] extract typed api params validation and normalization rules --- fedot/api/api_utils/api_params_rules.py | 72 ++++++++++++++++++++ fedot/api/api_utils/params.py | 56 +++++++-------- tests/api/api_utils/test_api_params.py | 40 +++++++++++ tests/api/api_utils/test_api_params_rules.py | 51 ++++++++++++++ 4 files changed, 186 insertions(+), 33 deletions(-) create mode 100644 fedot/api/api_utils/api_params_rules.py create mode 100644 tests/api/api_utils/test_api_params.py create mode 100644 tests/api/api_utils/test_api_params_rules.py diff --git a/fedot/api/api_utils/api_params_rules.py b/fedot/api/api_utils/api_params_rules.py new file mode 100644 index 0000000000..f1303e93e9 --- /dev/null +++ 
b/fedot/api/api_utils/api_params_rules.py @@ -0,0 +1,72 @@ +from dataclasses import dataclass +from typing import Any, Dict, Optional + +from fedot.core.constants import AUTO_PRESET_NAME, DEFAULT_FORECAST_LENGTH +from fedot.core.repository.tasks import Task, TaskParams, TaskTypesEnum, TsForecastingParams + + +@dataclass(frozen=True) +class TaskResolution: + task: Task + warning_message: Optional[str] + + +@dataclass(frozen=True) +class TimeoutResolution: + timeout: Optional[float] + num_of_generations: Optional[int] + + +_SUPPORTED_PROBLEMS = { + 'regression': TaskTypesEnum.regression, + 'classification': TaskTypesEnum.classification, + 'ts_forecasting': TaskTypesEnum.ts_forecasting, +} + + +def resolve_task(problem: str, + task_params: Optional[TaskParams], + default_forecast_length: int = DEFAULT_FORECAST_LENGTH) -> TaskResolution: + if problem not in _SUPPORTED_PROBLEMS: + raise ValueError(f'Wrong type name of the given task: {problem}') + + warning_message = None + resolved_task_params = task_params + if problem == 'ts_forecasting' and task_params is None: + warning_message = f'The value of the forecast depth was set to {default_forecast_length}.' 
+ resolved_task_params = TsForecastingParams(forecast_length=default_forecast_length) + + task_type = _SUPPORTED_PROBLEMS[problem] + return TaskResolution(task=Task(task_type, task_params=resolved_task_params), warning_message=warning_message) + + +def normalize_timeout_and_generations(timeout: Optional[float], + num_of_generations: Optional[int]) -> TimeoutResolution: + if timeout in (-1, None): + if num_of_generations is None: + raise ValueError('"num_of_generations" should be specified if infinite "timeout" is given') + return TimeoutResolution(timeout=None, num_of_generations=num_of_generations) + + if timeout <= 0: + raise ValueError(f'invalid "timeout" value: timeout={timeout}') + + if num_of_generations is None: + return TimeoutResolution(timeout=timeout, num_of_generations=10000) + return TimeoutResolution(timeout=timeout, num_of_generations=num_of_generations) + + +def build_label_encoded_preset_name(current_preset: Optional[str]) -> str: + if current_preset: + return f'{current_preset}*tree' + return '*tree' + + +def should_update_available_operations(preset: Optional[str]) -> bool: + return preset != AUTO_PRESET_NAME + + +def merge_param_recommendations(current_params: Dict[str, Any], recommendations: Dict[str, Any]) -> Dict[str, Any]: + updated_params = dict(current_params) + for key, value in recommendations.items(): + updated_params[key] = value + return updated_params diff --git a/fedot/api/api_utils/params.py b/fedot/api/api_utils/params.py index 5107963dad..e7a3c9690c 100644 --- a/fedot/api/api_utils/params.py +++ b/fedot/api/api_utils/params.py @@ -1,4 +1,4 @@ -import datetime +import datetime from collections import UserDict from copy import deepcopy, copy from typing import Any, Dict, Optional, Union @@ -9,8 +9,14 @@ from golem.utilities.utilities import determine_n_jobs from fedot.api.api_utils.api_params_repository import ApiParamsRepository +from fedot.api.api_utils.api_params_rules import ( + build_label_encoded_preset_name, + 
merge_param_recommendations, + normalize_timeout_and_generations, + resolve_task, + should_update_available_operations, +) from fedot.api.api_utils.presets import OperationsPreset -from fedot.core.constants import AUTO_PRESET_NAME, DEFAULT_FORECAST_LENGTH from fedot.core.data.data import InputData from fedot.core.data.multi_modal import MultiModalData from fedot.core.pipelines.adapters import PipelineAdapter @@ -20,7 +26,7 @@ from fedot.core.pipelines.verification import rules_by_task from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.pipeline_operation_repository import PipelineOperationRepository -from fedot.core.repository.tasks import Task, TaskTypesEnum, TaskParams, TsForecastingParams +from fedot.core.repository.tasks import Task, TaskTypesEnum, TaskParams class ApiParams(UserDict): @@ -28,7 +34,10 @@ class ApiParams(UserDict): def __init__(self, input_params: Dict[str, Any], problem: str, task_params: Optional[TaskParams] = None, n_jobs: int = -1, timeout: float = 5, seed=None): self.log: LoggerAdapter = default_log(self) - self.task: Task = self._get_task_with_params(problem, task_params) + task_resolution = resolve_task(problem, task_params) + if task_resolution.warning_message: + self.log.warning(task_resolution.warning_message) + self.task: Task = task_resolution.task self.n_jobs: int = determine_n_jobs(n_jobs) self.timeout = timeout @@ -45,7 +54,7 @@ def __init__(self, input_params: Dict[str, Any], problem: str, task_params: Opti def update_available_operations_by_preset(self, data: InputData): """ Updates available_operations by preset and data type""" preset = self.get('preset') - if preset != AUTO_PRESET_NAME: + if should_update_available_operations(preset): preset_operations = OperationsPreset(task=self.task, preset_name=preset) self.data = preset_operations.composer_params_based_on_preset(self.data, data.data_type) @@ -68,15 +77,11 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, 
MultiMod self.change_preset_for_label_encoded_data(input_data.task, input_data.data_type) # update api params with recommendations obtained using meta rules - for key in recommendations: - self.update({key: recommendations[key]}) + self.data = merge_param_recommendations(self.data, recommendations) def change_preset_for_label_encoded_data(self, task: Task, data_type: DataTypesEnum): """ Change preset on tree like preset, if data had been label encoded """ - if 'preset' in self: - preset_name = ''.join((self['preset'], '*tree')) - else: - preset_name = '*tree' + preset_name = build_label_encoded_preset_name(self.get('preset')) preset_operations = OperationsPreset(task=task, preset_name=preset_name) self.pop('available_operations', None) @@ -84,30 +89,15 @@ def change_preset_for_label_encoded_data(self, task: Task, data_type: DataTypesE def _get_task_with_params(self, problem: str, task_params: Optional[TaskParams] = None) -> Task: """ Creates Task from problem name and task_params""" - if problem == 'ts_forecasting' and task_params is None: - self.log.warning(f'The value of the forecast depth was set to {DEFAULT_FORECAST_LENGTH}.') - task_params = TsForecastingParams(forecast_length=DEFAULT_FORECAST_LENGTH) - - task_dict = {'regression': Task(TaskTypesEnum.regression, task_params=task_params), - 'classification': Task(TaskTypesEnum.classification, task_params=task_params), - 'ts_forecasting': Task(TaskTypesEnum.ts_forecasting, task_params=task_params) - } - try: - return task_dict[problem] - except ValueError: - ValueError('Wrong type name of the given task') + task_resolution = resolve_task(problem, task_params) + if task_resolution.warning_message: + self.log.warning(task_resolution.warning_message) + return task_resolution.task def _check_timeout_vs_generations(self): - num_of_generations = self.get('num_of_generations') - if self.timeout in [-1, None]: - self.timeout = None - if num_of_generations is None: - raise ValueError('"num_of_generations" should be 
specified if infinite "timeout" is given') - elif self.timeout > 0: - if num_of_generations is None: - self['num_of_generations'] = 10000 - else: - raise ValueError(f'invalid "timeout" value: timeout={self.timeout}') + timeout_resolution = normalize_timeout_and_generations(self.timeout, self.get('num_of_generations')) + self.timeout = timeout_resolution.timeout + self['num_of_generations'] = timeout_resolution.num_of_generations def init_params_for_composing(self, datetime_composing: Optional[datetime.timedelta], multi_objective: bool): """ Method to initialize ``PipelineComposerRequirements``, ``GPAlgorithmParameters``, diff --git a/tests/api/api_utils/test_api_params.py b/tests/api/api_utils/test_api_params.py new file mode 100644 index 0000000000..2363771d04 --- /dev/null +++ b/tests/api/api_utils/test_api_params.py @@ -0,0 +1,40 @@ +from fedot.api.api_utils.params import ApiParams + + +def test_api_params_raises_value_error_for_unknown_problem(): + try: + ApiParams({}, problem='clustering') + except ValueError as error: + assert 'Wrong type name of the given task' in str(error) + else: + raise AssertionError('ApiParams should reject unknown problem values') + + +def test_api_params_normalizes_timeout_and_generations(): + params = ApiParams({'num_of_generations': 5}, problem='classification', timeout=-1) + + assert params.timeout is None + assert params['num_of_generations'] == 5 + + params_with_default_generations = ApiParams({}, problem='classification', timeout=1) + assert params_with_default_generations.timeout == 1 + assert params_with_default_generations['num_of_generations'] == 10000 + + +def test_api_params_accept_and_apply_recommendations_updates_internal_mapping(monkeypatch): + params = ApiParams({}, problem='classification', timeout=1) + captured = {'called': False} + + def fake_change_preset(task, data_type): + captured['called'] = True + + monkeypatch.setattr(params, 'change_preset_for_label_encoded_data', fake_change_preset) + + 
params.accept_and_apply_recommendations( + input_data=type('Data', (), {'task': params.task, 'data_type': None})(), + recommendations={'cv_folds': 3, 'label_encoded': {}}, + ) + + assert captured['called'] is True + assert params['cv_folds'] == 3 + assert params['label_encoded'] == {} diff --git a/tests/api/api_utils/test_api_params_rules.py b/tests/api/api_utils/test_api_params_rules.py new file mode 100644 index 0000000000..83106204e4 --- /dev/null +++ b/tests/api/api_utils/test_api_params_rules.py @@ -0,0 +1,51 @@ +import pytest + +from fedot.api.api_utils.api_params_rules import ( + build_label_encoded_preset_name, + merge_param_recommendations, + normalize_timeout_and_generations, + resolve_task, + should_update_available_operations, +) +from fedot.core.constants import AUTO_PRESET_NAME, DEFAULT_FORECAST_LENGTH +from fedot.core.repository.tasks import TaskTypesEnum, TsForecastingParams + + +def test_resolve_task_adds_default_ts_params_and_warning(): + resolution = resolve_task('ts_forecasting', None) + + assert resolution.task.task_type == TaskTypesEnum.ts_forecasting + assert isinstance(resolution.task.task_params, TsForecastingParams) + assert resolution.task.task_params.forecast_length == DEFAULT_FORECAST_LENGTH + assert resolution.warning_message is not None + + +def test_resolve_task_rejects_unknown_problem(): + with pytest.raises(ValueError, match='Wrong type name of the given task'): + resolve_task('clustering', None) + + +def test_normalize_timeout_and_generations_handles_infinite_and_default_cases(): + infinite_resolution = normalize_timeout_and_generations(-1, 5) + finite_resolution = normalize_timeout_and_generations(10, None) + + assert infinite_resolution.timeout is None + assert infinite_resolution.num_of_generations == 5 + assert finite_resolution.timeout == 10 + assert finite_resolution.num_of_generations == 10000 + + +def test_normalize_timeout_and_generations_rejects_invalid_values(): + with pytest.raises(ValueError, 
match='num_of_generations'): + normalize_timeout_and_generations(None, None) + + with pytest.raises(ValueError, match='invalid "timeout" value'): + normalize_timeout_and_generations(0, 5) + + +def test_small_preset_and_recommendation_helpers_are_deterministic(): + assert build_label_encoded_preset_name('fast_train') == 'fast_train*tree' + assert build_label_encoded_preset_name(None) == '*tree' + assert should_update_available_operations(AUTO_PRESET_NAME) is False + assert should_update_available_operations('fast_train') is True + assert merge_param_recommendations({'a': 1}, {'b': 2, 'a': 3}) == {'a': 3, 'b': 2} From bf09df65c0fe30691682642f23d22e596392d564 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:33:43 +0300 Subject: [PATCH 11/32] extract typed assumption handler rules and either-based fit result --- .../assumptions/assumptions_handler.py | 66 +++++++++---- .../assumptions/assumptions_handler_rules.py | 59 +++++++++++ .../assumptions/test_assumptions_handler.py | 68 +++++++++++++ .../test_assumptions_handler_rules.py | 61 ++++++++++++ tests/api/test_builder.py | 72 ++++++++++++++ tests/api/test_safety.py | 98 +++++++++++++++++++ tests/core/data/__init__.py | 1 + tests/core/data/test_data_definition.py | 84 ++++++++++++++++ 8 files changed, 488 insertions(+), 21 deletions(-) create mode 100644 fedot/api/api_utils/assumptions/assumptions_handler_rules.py create mode 100644 tests/api/api_utils/assumptions/test_assumptions_handler.py create mode 100644 tests/api/api_utils/assumptions/test_assumptions_handler_rules.py create mode 100644 tests/api/test_builder.py create mode 100644 tests/api/test_safety.py create mode 100644 tests/core/data/__init__.py create mode 100644 tests/core/data/test_data_definition.py diff --git a/fedot/api/api_utils/assumptions/assumptions_handler.py b/fedot/api/api_utils/assumptions/assumptions_handler.py index b1a99325ef..1b2d3665ba 100644 --- a/fedot/api/api_utils/assumptions/assumptions_handler.py +++ 
b/fedot/api/api_utils/assumptions/assumptions_handler.py @@ -1,9 +1,15 @@ -import traceback +import traceback from typing import List, Optional, Union from golem.core.log import default_log +from pymonad.either import Left, Right from fedot.api.api_utils.assumptions.assumptions_builder import AssumptionsBuilder +from fedot.api.api_utils.assumptions.assumptions_handler_rules import ( + build_assumption_fit_error, + decide_preset, + resolve_initial_assumption, +) from fedot.api.api_utils.presets import change_preset_based_on_initial_fit from fedot.api.time import ApiTime from fedot.core.caching.operations_cache import OperationsCache @@ -41,14 +47,13 @@ def propose_assumptions(self, list of initial assumption pipelines """ - if initial_assumption is None: - assumptions_builder = AssumptionsBuilder \ - .get(self.data) \ - .from_operations(available_operations) - initial_assumption = assumptions_builder.build(use_input_preprocessing=use_input_preprocessing) - elif isinstance(initial_assumption, Pipeline): - initial_assumption = [initial_assumption] - return initial_assumption + return resolve_initial_assumption( + initial_assumption, + builder=lambda: AssumptionsBuilder + .get(self.data) + .from_operations(available_operations) + .build(use_input_preprocessing=use_input_preprocessing), + ) def fit_assumption_and_check_correctness(self, pipeline: Pipeline, @@ -63,6 +68,21 @@ def fit_assumption_and_check_correctness(self, :param preprocessing_cache: Cache manager for optional preprocessing encoders and imputers, optional. 
:param eval_n_jobs: number of jobs to fit the initial pipeline """ + fit_result = self.try_fit_assumption( + pipeline=pipeline, + operations_cache=operations_cache, + preprocessing_cache=preprocessing_cache, + eval_n_jobs=eval_n_jobs, + ) + if fit_result.is_left(): + self._raise_evaluating_exception(fit_result.value) + return fit_result.value + + def try_fit_assumption(self, + pipeline: Pipeline, + operations_cache: Optional[OperationsCache] = None, + preprocessing_cache: Optional[PreprocessingCache] = None, + eval_n_jobs: int = -1): try: data_train, data_test = train_test_data_setup(self.data) self.log.info('Initial pipeline fitting started') @@ -79,17 +99,16 @@ def fit_assumption_and_check_correctness(self, self.log.info('Initial pipeline was fitted successfully') MemoryAnalytics.log(self.log, additional_info='fitting of the initial pipeline') + return Right(pipeline) except Exception as ex: - self._raise_evaluating_exception(ex) - return pipeline + fit_error = build_assumption_fit_error(ex) + self.log.info(f'Initial pipeline fit was failed due to: {fit_error.cause}.') + print(traceback.format_exc()) + return Left(fit_error) - def _raise_evaluating_exception(self, ex: Exception): - fit_failed_info = f'Initial pipeline fit was failed due to: {ex}.' 
- advice_info = f'{fit_failed_info} Check pipeline structure and the correctness of the data' - self.log.info(fit_failed_info) - print(traceback.format_exc()) - raise ValueError(advice_info) + def _raise_evaluating_exception(self, fit_error): + raise ValueError(fit_error.message) def propose_preset(self, preset: Union[str, None], timer: ApiTime, n_jobs: int) -> str: """ @@ -100,7 +119,12 @@ def propose_preset(self, preset: Union[str, None], timer: ApiTime, n_jobs: int) :param n_jobs: n_jobs parameter """ - if not preset or preset == 'auto': - preset = change_preset_based_on_initial_fit(timer, n_jobs) - self.log.message(f"Preset was changed to {preset} due to fit time estimation for initial model.") - return preset + decision = decide_preset( + preset=preset, + timer=timer, + n_jobs=n_jobs, + chooser=change_preset_based_on_initial_fit, + ) + if decision.was_changed: + self.log.message(f"Preset was changed to {decision.preset} due to fit time estimation for initial model.") + return decision.preset diff --git a/fedot/api/api_utils/assumptions/assumptions_handler_rules.py b/fedot/api/api_utils/assumptions/assumptions_handler_rules.py new file mode 100644 index 0000000000..186e0c2b69 --- /dev/null +++ b/fedot/api/api_utils/assumptions/assumptions_handler_rules.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass +from typing import Callable, List, Optional, Sequence, Union + +from fedot.api.time import ApiTime +from fedot.core.pipelines.pipeline import Pipeline + + +@dataclass(frozen=True) +class AssumptionFitError: + code: str + message: str + cause: str + + +@dataclass(frozen=True) +class PresetDecision: + preset: str + was_changed: bool + + +NormalizedInitialAssumption = Optional[List[Pipeline]] + + +def normalize_initial_assumption(initial_assumption: Union[List[Pipeline], Pipeline, None]) -> NormalizedInitialAssumption: + if initial_assumption is None: + return None + if isinstance(initial_assumption, Pipeline): + return [initial_assumption] + return 
initial_assumption + + +def resolve_initial_assumption(initial_assumption: Union[List[Pipeline], Pipeline, None], + builder: Callable[[], List[Pipeline]]) -> List[Pipeline]: + normalized = normalize_initial_assumption(initial_assumption) + if normalized is None: + return builder() + return normalized + + +def build_assumption_fit_error(ex: Exception) -> AssumptionFitError: + fit_failed_info = f'Initial pipeline fit was failed due to: {ex}.' + advice_info = f'{fit_failed_info} Check pipeline structure and the correctness of the data' + return AssumptionFitError( + code='initial_assumption_fit_failed', + message=advice_info, + cause=str(ex), + ) + + +def decide_preset(preset: Optional[str], + timer: ApiTime, + n_jobs: int, + chooser: Callable[[ApiTime, int], str]) -> PresetDecision: + if not preset or preset == 'auto': + return PresetDecision( + preset=chooser(timer, n_jobs), + was_changed=True, + ) + return PresetDecision(preset=preset, was_changed=False) diff --git a/tests/api/api_utils/assumptions/test_assumptions_handler.py b/tests/api/api_utils/assumptions/test_assumptions_handler.py new file mode 100644 index 0000000000..8250750065 --- /dev/null +++ b/tests/api/api_utils/assumptions/test_assumptions_handler.py @@ -0,0 +1,68 @@ +from types import SimpleNamespace + +from pymonad.either import Left, Right + +import fedot.api.api_utils.assumptions.assumptions_handler as handler_module +from fedot.api.api_utils.assumptions.assumptions_handler import AssumptionsHandler + + +class _FakePipeline: + def __init__(self, should_fail=False): + self.should_fail = should_fail + self.loaded = False + self.fitted = False + self.predicted = False + + def try_load_from_cache(self, operations_cache, preprocessing_cache): + self.loaded = True + + def fit(self, data_train, n_jobs=-1): + if self.should_fail: + raise RuntimeError('fit failed') + self.fitted = True + + def predict(self, data_test): + self.predicted = True + return 'ok' + + +def 
test_try_fit_assumption_returns_right_for_success(monkeypatch): + monkeypatch.setattr(handler_module, 'train_test_data_setup', lambda data: ('train', 'test')) + monkeypatch.setattr(handler_module.MemoryAnalytics, 'log', staticmethod(lambda *args, **kwargs: None)) + + pipeline = _FakePipeline() + result = AssumptionsHandler(SimpleNamespace()).try_fit_assumption(pipeline) + + assert result.is_right() + assert result.value is pipeline + assert pipeline.loaded is True + assert pipeline.fitted is True + assert pipeline.predicted is True + + +def test_try_fit_assumption_returns_left_for_expected_fit_failure(monkeypatch): + monkeypatch.setattr(handler_module, 'train_test_data_setup', lambda data: ('train', 'test')) + + pipeline = _FakePipeline(should_fail=True) + result = AssumptionsHandler(SimpleNamespace()).try_fit_assumption(pipeline) + + assert result.is_left() + assert result.monoid[0].code == 'initial_assumption_fit_failed' + assert 'fit failed' in result.monoid[0].message + + +def test_fit_assumption_and_check_correctness_keeps_compatibility_wrapper(monkeypatch): + handler = AssumptionsHandler(SimpleNamespace()) + pipeline = _FakePipeline() + monkeypatch.setattr(handler, 'try_fit_assumption', lambda **kwargs: Right(pipeline)) + + assert handler.fit_assumption_and_check_correctness(pipeline) is pipeline + + monkeypatch.setattr(handler, 'try_fit_assumption', lambda **kwargs: Left(SimpleNamespace(message='boom'))) + + try: + handler.fit_assumption_and_check_correctness(pipeline) + except ValueError as error: + assert str(error) == 'boom' + else: + raise AssertionError('Compatibility wrapper should raise ValueError for failed assumption fitting') diff --git a/tests/api/api_utils/assumptions/test_assumptions_handler_rules.py b/tests/api/api_utils/assumptions/test_assumptions_handler_rules.py new file mode 100644 index 0000000000..c18633c60f --- /dev/null +++ b/tests/api/api_utils/assumptions/test_assumptions_handler_rules.py @@ -0,0 +1,61 @@ +from types import 
SimpleNamespace + +from fedot.api.api_utils.assumptions.assumptions_handler_rules import ( + build_assumption_fit_error, + decide_preset, + normalize_initial_assumption, + resolve_initial_assumption, +) + + +class _FakePipeline: + pass + + +def test_normalize_initial_assumption_handles_none_single_and_list(): + pipeline = _FakePipeline() + + assert normalize_initial_assumption(None) is None + assert normalize_initial_assumption(pipeline) == [pipeline] + assert normalize_initial_assumption([pipeline]) == [pipeline] + + +def test_resolve_initial_assumption_uses_builder_only_when_needed(): + pipeline = _FakePipeline() + calls = {'count': 0} + + def builder(): + calls['count'] += 1 + return [pipeline] + + assert resolve_initial_assumption(None, builder) == [pipeline] + assert calls['count'] == 1 + assert resolve_initial_assumption(pipeline, builder) == [pipeline] + assert calls['count'] == 1 + + +def test_build_assumption_fit_error_returns_typed_error_message(): + error = build_assumption_fit_error(RuntimeError('broken fit')) + + assert error.code == 'initial_assumption_fit_failed' + assert 'broken fit' in error.message + assert error.cause == 'broken fit' + + +def test_decide_preset_changes_only_for_auto_like_values(): + timer = SimpleNamespace() + calls = {'count': 0} + + def chooser(_timer, n_jobs): + calls['count'] += 1 + assert n_jobs == 2 + return 'fast_train' + + changed = decide_preset(None, timer, 2, chooser) + unchanged = decide_preset('best_quality', timer, 2, chooser) + + assert changed.preset == 'fast_train' + assert changed.was_changed is True + assert unchanged.preset == 'best_quality' + assert unchanged.was_changed is False + assert calls['count'] == 1 diff --git a/tests/api/test_builder.py b/tests/api/test_builder.py new file mode 100644 index 0000000000..1515703d8b --- /dev/null +++ b/tests/api/test_builder.py @@ -0,0 +1,72 @@ +from inspect import signature +from itertools import chain + +import pytest + +from fedot import Fedot, FedotBuilder +from 
fedot.api.api_utils.api_params_repository import ApiParamsRepository +from fedot.api.api_utils.params import ApiParams +from fedot.core.repository.tasks import TaskTypesEnum + + +@pytest.fixture(name='fedot_builder_methods', scope='session') +def get_fedot_builder_methods(): + return {func_name: func for func_name in dir(FedotBuilder) if + callable(func := getattr(FedotBuilder, func_name)) and + not func_name.startswith('_') + } + + +def test_setters_chain_returns_the_builder(fedot_builder_methods): + builder = FedotBuilder('classification') + for method_name in fedot_builder_methods.keys(): + if method_name in ['build']: + continue + method = getattr(builder, method_name) + builder = method() + + assert isinstance(builder, FedotBuilder) + + +@pytest.mark.parametrize('task_type', TaskTypesEnum) +def test_fedot_api_creation_preserves_default_params(task_type): + if task_type is TaskTypesEnum.clustering: + return + task_type = task_type.name + builder = FedotBuilder(task_type) + fedot = builder.build() + fedot_params = fedot.params + default_params = ApiParams(input_params={}, problem=task_type) + + assert isinstance(fedot, Fedot) + assert fedot_params == default_params + + +def test_names_and_return_annotations_of_param_setters(fedot_builder_methods): + methods = fedot_builder_methods + setters_by_annotation = {func_name for func_name, func in methods.items() + if signature(func).return_annotation == FedotBuilder.__name__} + setters_by_name = {func_name for func_name in methods.keys() if func_name.startswith('setup_')} + assert setters_by_annotation == setters_by_name + + +def test_no_unexpected_method_names(fedot_builder_methods): + methods = fedot_builder_methods + unexpected_method_names = {func_name for func_name in methods.keys() if not ( + func_name.startswith('setup_') or + func_name in ['build'])} + assert not unexpected_method_names + + +def test_param_setters_has_all_api_parameters(fedot_builder_methods): + methods = fedot_builder_methods + setter_signs = 
[sign for func in methods.values() + if (sign := signature(func)).return_annotation == FedotBuilder.__name__] + builder_params = set(chain(*[sign.parameters.keys() for sign in setter_signs])) + builder_params.update(signature(FedotBuilder.__init__).parameters.keys()) + + fedot_api_all_params = set(ApiParamsRepository.default_params_for_task(TaskTypesEnum.classification).keys()) + fedot_api_all_params.update(signature(Fedot.__init__).parameters.keys()) + fedot_api_all_params.discard('composer_tuner_params') + + assert builder_params == fedot_api_all_params diff --git a/tests/api/test_safety.py b/tests/api/test_safety.py new file mode 100644 index 0000000000..314de6a8b2 --- /dev/null +++ b/tests/api/test_safety.py @@ -0,0 +1,98 @@ +import numpy as np + +from fedot import Fedot +from fedot.api.api_utils.api_data import ApiDataProcessor +from fedot.api.api_utils.input_analyser import InputAnalyser +from fedot.core.data.data import InputData +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.preprocessing.preprocessing import DataPreprocessor +from test.integration.api.test_main_api import TESTS_MAIN_API_DEFAULT_PARAMS + + +def get_data_analyser_with_specific_params(max_size=18, max_cat_cardinality=5): + safety_module = InputAnalyser(safe_mode=True) + preprocessor = ApiDataProcessor(Task(TaskTypesEnum.classification)) + safety_module.max_size = max_size + safety_module.max_cat_cardinality = max_cat_cardinality + return safety_module, preprocessor + + +def get_small_cat_data(): + features = np.array([['a', 'qq', 0.5], + ['b', 'pp', 1], + ['c', np.nan, 3], + ['d', 'oo', 3], + ['d', 'oo', 3], + ['d', 'oo', 3], + ['d', 'oo', 3], + ['d', 'oo', 3]], dtype=object) + target = np.array([0, 0, 0, 0, 1, 1, 1, 1]) + input_data = InputData(idx=np.arange(features.shape[0]), + features=features, target=target, + data_type=DataTypesEnum.table, + task=Task(TaskTypesEnum.classification)) + input_data = 
DataPreprocessor().obligatory_prepare_for_fit(input_data) + return input_data + + +def test_safety_label_correct(): + api_safety, api_preprocessor = get_data_analyser_with_specific_params() + data = get_small_cat_data() + recs_for_data, _ = api_safety.give_recommendations(data) + api_preprocessor.accept_and_apply_recommendations(data, recs_for_data) + assert data.features.shape[0] * data.features.shape[1] <= api_safety.max_size + assert data.features.shape[1] == 3 + assert data.features[0, 0] != 'a' + + +def test_recommendations_works_correct_in_final_fit(): + api_safety, api_preprocessor = get_data_analyser_with_specific_params() + data = get_small_cat_data() + recs_for_data, _ = api_safety.give_recommendations(data) + api_preprocessor.accept_and_apply_recommendations(data, recs_for_data) + + data_new = get_small_cat_data() + if recs_for_data: + api_preprocessor.accept_and_apply_recommendations( + data_new, + {k: v for k, v in recs_for_data.items() if k != 'cut'}, + ) + + assert data_new.features.shape[1] == 3 + assert data_new.features[0, 0] != 'a' + + +def test_no_safety_needed_correct(): + api_safety, api_preprocessor = get_data_analyser_with_specific_params(max_size=100, max_cat_cardinality=100) + data = get_small_cat_data() + recs_for_data, _ = api_safety.give_recommendations(data) + api_preprocessor.accept_and_apply_recommendations(data, recs_for_data) + assert data.features.shape[0] * data.features.shape[1] == 24 + assert data.features.shape[1] == 3 + assert data.features[0, 0] == 'a' + + +def test_api_fit_predict_with_pseudo_large_dataset_with_label_correct(): + model = Fedot(problem='classification', + preset='fast_train', + safe_mode=True) + model.data_analyser.max_cat_cardinality = 5 + model.data_analyser.max_size = 18 + data = get_small_cat_data() + pipeline = model.fit(features=data, predefined_model='auto') + pipeline.predict(data) + model.predict(features=data) + + assert len(model.params.get('available_operations')) == 4 + assert 'logit' not in 
model.params.get('available_operations') + + +def test_api_fit_predict_with_pseudo_large_dataset_with_onehot_correct(): + model = Fedot(problem='classification', **TESTS_MAIN_API_DEFAULT_PARAMS) + model.data_analyser.max_size = 1000 + data = get_small_cat_data() + model.fit(features=data, predefined_model='auto') + + model.predict(features=data) + assert 'logit' in model.params.get('available_operations') diff --git a/tests/core/data/__init__.py b/tests/core/data/__init__.py new file mode 100644 index 0000000000..e02abfc9b0 --- /dev/null +++ b/tests/core/data/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/core/data/test_data_definition.py b/tests/core/data/test_data_definition.py new file mode 100644 index 0000000000..626c0c7e29 --- /dev/null +++ b/tests/core/data/test_data_definition.py @@ -0,0 +1,84 @@ +from datetime import datetime +from typing import Union, Tuple, Optional + +import numpy as np +import pandas as pd +import pytest + +import fedot.api.api_utils.data_definition as fedot_api_api_utils_data_definition +from fedot.api.api_utils.data_definition import PandasStrategy, TupleStrategy, NumpyStrategy, StrategyDefineData +from fedot.core.data.data import InputData +from fedot.core.data.data import np_datetime_to_numeric +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import Task, TaskTypesEnum + +_DATE = '2000-01-01T10:00:00.100' +_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S.%f' + + +@pytest.mark.parametrize('features', [ + np.array([ + [_DATE, datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 54, 54.] + ]), + np.array([ + [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 42] + ], dtype=object), + np.array([ + [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE), 54., 54] + ], dtype=object), + np.array([ + [*pd.date_range(_DATE, periods=3, freq='D').to_numpy(), 54, 54.] 
+ ], dtype=object), + np.array([ + [*pd.date_range(_DATE, periods=3, freq='D')] + ], dtype=np.datetime64), + pd.date_range(_DATE, periods=3, freq='D').to_numpy(), + np.array([ + [datetime.strptime(_DATE, _DATE_FORMAT), np.datetime64(_DATE), pd.Timestamp(_DATE)] + ]), + np.array([ + ['without_datetime', 54, 54.] + ], dtype=object) +]) +def test_datetime_erasure(features: np.ndarray): + result = np_datetime_to_numeric(features) + assert 'datetime' not in str(pd.DataFrame(result).infer_objects().dtypes) + + +def _array_to_input_data(features_array: np.ndarray, + target_array: np.ndarray, + idx: Optional[np.ndarray] = None, + task: Task = Task(TaskTypesEnum.classification), + data_type: Optional[DataTypesEnum] = None) -> InputData: + return np.asarray(features_array), np.asarray(target_array) + + +@pytest.mark.parametrize('strategy, features, task, target, expected', [ + (NumpyStrategy, np.array([[1]]), Task(TaskTypesEnum.regression), None, (np.array([[1]]), np.array([]))), + (PandasStrategy, pd.DataFrame([[1]]), Task(TaskTypesEnum.regression), None, + (np.array([[1]]), np.array([]))), + (NumpyStrategy, np.array([[1]]), Task(TaskTypesEnum.ts_forecasting), None, (np.array([[1]]), np.array([1]))), + (NumpyStrategy, np.array([[1]]), Task(TaskTypesEnum.ts_forecasting), np.array([2]), + (np.array([[1]]), np.array([[1]]))), + (NumpyStrategy, np.array([[1, 2]]), Task(TaskTypesEnum.regression), 1, (np.array([[1]]), np.array([2]))), + (NumpyStrategy, np.array([[1]]), Task(TaskTypesEnum.regression), 1, (np.array([[1]]), np.array([]))), + (PandasStrategy, pd.DataFrame([[1, 2]], columns=['0', '1']), Task(TaskTypesEnum.regression), '1', + (np.array([[1]]), np.array([2]))), + (PandasStrategy, pd.DataFrame([[1]], columns=['0']), Task(TaskTypesEnum.regression), '1', + (np.array([[1]]), np.array([]))), + (NumpyStrategy, np.array([[1, 2]]), Task(TaskTypesEnum.regression), np.array([0, 1]), + (np.array([[1, 2]]), np.array([0, 1]))), + (PandasStrategy, pd.DataFrame([[1, 2]]), 
Task(TaskTypesEnum.regression), pd.Series([0, 1]), + (np.array([[1, 2]]), np.array([0, 1]))), + (TupleStrategy, ([1], [2]), Task(TaskTypesEnum.regression), None, ([1], [2])) +]) +def test_data_strategies(strategy: StrategyDefineData, features: Union[np.ndarray, pd.DataFrame, Tuple], + task: Task, target: Union[None, np.ndarray], expected: Tuple[np.ndarray, np.ndarray], + monkeypatch): + monkeypatch.setattr(fedot_api_api_utils_data_definition, 'array_to_input_data', _array_to_input_data) + + obtained_features, obtained_target = strategy().define_data(features=features, task=task, target=target) + expected_features, expected_target = expected + + assert np.allclose(obtained_features, expected_features) + assert np.allclose(obtained_target, expected_target) From 5b5eabdb6c0170be765603a1a6136c56db30c9cb Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:36:59 +0300 Subject: [PATCH 12/32] extract pure api params repository defaulting rules --- fedot/api/api_utils/api_params_repository.py | 69 +++----------------- 1 file changed, 8 insertions(+), 61 deletions(-) diff --git a/fedot/api/api_utils/api_params_repository.py b/fedot/api/api_utils/api_params_repository.py index 824c16b47c..c4d4430184 100644 --- a/fedot/api/api_utils/api_params_repository.py +++ b/fedot/api/api_utils/api_params_repository.py @@ -1,13 +1,12 @@ -import datetime -from dataclasses import asdict +import datetime from typing import Sequence from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum from golem.core.optimisers.genetic.operators.mutation import MutationTypesEnum +from fedot.api.api_utils.api_params_repository_rules import apply_default_params, build_default_api_params from fedot.api.sampling_stage.config import validate_sampling_config from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation, add_resample_mutation -from fedot.core.constants import AUTO_PRESET_NAME from fedot.core.repository.tasks import 
TaskTypesEnum from fedot.core.utils import default_fedot_data_dir @@ -33,68 +32,16 @@ def __init__(self, task_type: TaskTypesEnum): @staticmethod def default_params_for_task(task_type: TaskTypesEnum) -> dict: """ Returns a dict with default parameters""" - if task_type in [TaskTypesEnum.classification, TaskTypesEnum.regression]: - cv_folds = 5 - - elif task_type == TaskTypesEnum.ts_forecasting: - cv_folds = 3 - - # Dict with allowed keyword attributes for Api and their default values. If None - default value set - # in dataclasses ``PipelineComposerRequirements``, ``GPAlgorithmParameters``, ``GraphGenerationParams`` - # will be used. - default_param_values_dict = dict( - parallelization_mode='populational', - show_progress=True, - max_depth=6, - max_arity=3, - pop_size=20, - num_of_generations=None, - keep_n_best=1, - available_operations=None, - metric=None, - cv_folds=cv_folds, - genetic_scheme=None, - early_stopping_iterations=None, - early_stopping_timeout=10, - optimizer=None, - collect_intermediate_metric=False, - max_pipeline_fit_time=None, - initial_assumption=None, - preset=AUTO_PRESET_NAME, - use_operations_cache=True, - use_preprocessing_cache=True, - use_predictions_cache=True, - use_stats=False, - use_input_preprocessing=True, - use_auto_preprocessing=False, - use_meta_rules=False, - cache_dir=default_fedot_data_dir(), - keep_history=True, - history_dir=default_fedot_data_dir(), - with_tuning=True, - seed=None, - sampling_config=None, - ) - return default_param_values_dict + return build_default_api_params(task_type, default_fedot_data_dir()) def check_and_set_default_params(self, params: dict) -> dict: """ Sets default values for parameters which were not set by the user and raises KeyError for invalid parameter keys""" - allowed_keys = self.default_params.keys() - invalid_keys = params.keys() - allowed_keys - if invalid_keys: - raise KeyError(f"Invalid key parameters {invalid_keys}") - - if 'sampling_config' in params: - validated_sampling_config = 
validate_sampling_config(params['sampling_config']) - params['sampling_config'] = asdict(validated_sampling_config) if validated_sampling_config else None - - missing_params = self.default_params.keys() - params.keys() - for k in missing_params: - if (v := self.default_params[k]) is not None: - params[k] = v - - return params + return apply_default_params( + params=params, + default_params=self.default_params, + sampling_validator=validate_sampling_config, + ) @staticmethod def get_params_for_composer_requirements(params: dict) -> dict: From 4e2a1fed8bc294fe11ae7ae8e8644b1555f89c89 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 12:37:15 +0300 Subject: [PATCH 13/32] extract pure api params --- .../api_utils/api_params_repository_rules.py | 79 +++++++++++++++++++ .../api_utils/test_api_params_repository.py | 30 +++++++ .../test_api_params_repository_rules.py | 56 +++++++++++++ 3 files changed, 165 insertions(+) create mode 100644 fedot/api/api_utils/api_params_repository_rules.py create mode 100644 tests/api/api_utils/test_api_params_repository.py create mode 100644 tests/api/api_utils/test_api_params_repository_rules.py diff --git a/fedot/api/api_utils/api_params_repository_rules.py b/fedot/api/api_utils/api_params_repository_rules.py new file mode 100644 index 0000000000..7c51253aa1 --- /dev/null +++ b/fedot/api/api_utils/api_params_repository_rules.py @@ -0,0 +1,79 @@ +from dataclasses import asdict +from typing import Any, Callable, Dict + +from fedot.core.constants import AUTO_PRESET_NAME +from fedot.core.repository.tasks import TaskTypesEnum + + +def default_cv_folds_for_task(task_type: TaskTypesEnum) -> int: + if task_type in (TaskTypesEnum.classification, TaskTypesEnum.regression): + return 5 + if task_type == TaskTypesEnum.ts_forecasting: + return 3 + raise ValueError(f'Unsupported task type for default params: {task_type}') + + +def build_default_api_params(task_type: TaskTypesEnum, default_data_dir: str) -> dict: + return dict( + 
parallelization_mode='populational', + show_progress=True, + max_depth=6, + max_arity=3, + pop_size=20, + num_of_generations=None, + keep_n_best=1, + available_operations=None, + metric=None, + cv_folds=default_cv_folds_for_task(task_type), + genetic_scheme=None, + early_stopping_iterations=None, + early_stopping_timeout=10, + optimizer=None, + collect_intermediate_metric=False, + max_pipeline_fit_time=None, + initial_assumption=None, + preset=AUTO_PRESET_NAME, + use_operations_cache=True, + use_preprocessing_cache=True, + use_predictions_cache=True, + use_stats=False, + use_input_preprocessing=True, + use_auto_preprocessing=False, + use_meta_rules=False, + cache_dir=default_data_dir, + keep_history=True, + history_dir=default_data_dir, + with_tuning=True, + seed=None, + sampling_config=None, + ) + + +def validate_api_param_keys(params: dict, allowed_keys) -> None: + invalid_keys = params.keys() - set(allowed_keys) + if invalid_keys: + raise KeyError(f'Invalid key parameters {invalid_keys}') + + +def normalize_sampling_config(config: Any, validator: Callable[[Any], Any]): + validated_sampling_config = validator(config) + return asdict(validated_sampling_config) if validated_sampling_config else None + + +def apply_default_params(params: Dict[str, Any], + default_params: Dict[str, Any], + sampling_validator: Callable[[Any], Any]) -> Dict[str, Any]: + validate_api_param_keys(params, default_params.keys()) + + normalized_params = dict(params) + if 'sampling_config' in normalized_params: + normalized_params['sampling_config'] = normalize_sampling_config( + normalized_params['sampling_config'], + sampling_validator, + ) + + for key, value in default_params.items(): + if key not in normalized_params and value is not None: + normalized_params[key] = value + + return normalized_params diff --git a/tests/api/api_utils/test_api_params_repository.py b/tests/api/api_utils/test_api_params_repository.py new file mode 100644 index 0000000000..f8dd85691b --- /dev/null +++ 
b/tests/api/api_utils/test_api_params_repository.py @@ -0,0 +1,30 @@ +import pytest + +from fedot.api.api_utils.api_params_repository import ApiParamsRepository +from fedot.core.repository.tasks import TaskTypesEnum + + +def test_api_params_repository_builds_task_specific_defaults(): + classification_repository = ApiParamsRepository(TaskTypesEnum.classification) + ts_repository = ApiParamsRepository(TaskTypesEnum.ts_forecasting) + + assert classification_repository.default_params['cv_folds'] == 5 + assert ts_repository.default_params['cv_folds'] == 3 + + +def test_api_params_repository_preserves_valid_sampling_config(): + repository = ApiParamsRepository(TaskTypesEnum.classification) + + result = repository.check_and_set_default_params({ + 'sampling_config': {'strategy': 'random', 'candidate_ratios': [0.2, 0.5]}, + }) + + assert result['sampling_config']['strategy'] == 'random' + assert tuple(result['sampling_config']['candidate_ratios']) == (0.2, 0.5) + + +def test_api_params_repository_rejects_unknown_param_key(): + repository = ApiParamsRepository(TaskTypesEnum.classification) + + with pytest.raises(KeyError, match='Invalid key parameters'): + repository.check_and_set_default_params({'unknown': 1}) diff --git a/tests/api/api_utils/test_api_params_repository_rules.py b/tests/api/api_utils/test_api_params_repository_rules.py new file mode 100644 index 0000000000..25597316aa --- /dev/null +++ b/tests/api/api_utils/test_api_params_repository_rules.py @@ -0,0 +1,56 @@ +import pytest +from dataclasses import dataclass + +from fedot.api.api_utils.api_params_repository_rules import ( + apply_default_params, + build_default_api_params, + default_cv_folds_for_task, + normalize_sampling_config, + validate_api_param_keys, +) +from fedot.core.constants import AUTO_PRESET_NAME +from fedot.core.repository.tasks import TaskTypesEnum + + +@dataclass(frozen=True) +class ValidatedConfig: + strategy: str = 'random' + + +def test_default_cv_folds_for_task_matches_supported_tasks(): 
+ assert default_cv_folds_for_task(TaskTypesEnum.classification) == 5 + assert default_cv_folds_for_task(TaskTypesEnum.regression) == 5 + assert default_cv_folds_for_task(TaskTypesEnum.ts_forecasting) == 3 + + +def test_build_default_api_params_contains_expected_defaults(): + defaults = build_default_api_params(TaskTypesEnum.classification, 'cache_dir') + + assert defaults['preset'] == AUTO_PRESET_NAME + assert defaults['cv_folds'] == 5 + assert defaults['cache_dir'] == 'cache_dir' + assert defaults['history_dir'] == 'cache_dir' + + +def test_validate_api_param_keys_rejects_unknown_keys(): + with pytest.raises(KeyError, match='Invalid key parameters'): + validate_api_param_keys({'unknown': 1}, {'known'}) + + +def test_normalize_sampling_config_uses_validator_result(): + assert normalize_sampling_config({'strategy': 'random'}, lambda config: ValidatedConfig()) == {'strategy': 'random'} + assert normalize_sampling_config(None, lambda config: None) is None + + +def test_apply_default_params_adds_missing_values_and_normalizes_sampling(): + defaults = {'preset': AUTO_PRESET_NAME, 'sampling_config': None, 'show_progress': True} + + result = apply_default_params( + params={'sampling_config': {'strategy': 'random'}}, + default_params=defaults, + sampling_validator=lambda config: ValidatedConfig(), + ) + + assert result['preset'] == AUTO_PRESET_NAME + assert result['show_progress'] is True + assert result['sampling_config'] == {'strategy': 'random'} From d0f18ba9477bfa1264cd858c33270dcf7c46126b Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 13:18:45 +0300 Subject: [PATCH 14/32] extract cache and tuner setup rules from api composer --- fedot/api/api_utils/api_composer.py | 47 ++++++++++--------- fedot/api/api_utils/api_composer_rules.py | 41 ++++++++++++++++ tests/api/api_utils/test_api_composer.py | 40 ++++++++++++++++ .../api/api_utils/test_api_composer_rules.py | 26 ++++++++++ 4 files changed, 132 insertions(+), 22 deletions(-) create mode 100644 
fedot/api/api_utils/api_composer_rules.py create mode 100644 tests/api/api_utils/test_api_composer.py create mode 100644 tests/api/api_utils/test_api_composer_rules.py diff --git a/fedot/api/api_utils/api_composer.py b/fedot/api/api_utils/api_composer.py index f5ddc5113d..e4ff7b3cba 100644 --- a/fedot/api/api_utils/api_composer.py +++ b/fedot/api/api_utils/api_composer.py @@ -7,6 +7,7 @@ from golem.core.optimisers.opt_history_objects.opt_history import OptHistory from golem.core.tuning.simultaneous import SimultaneousTuner +from fedot.api.api_utils.api_composer_rules import build_cache_init_plan, build_tuner_plan from fedot.api.api_utils.api_run_planner import build_composer_execution_plan from fedot.api.api_utils.assumptions.assumptions_handler import AssumptionsHandler from fedot.api.api_utils.params import ApiParams @@ -41,23 +42,23 @@ def __init__(self, api_params: ApiParams, metrics: Union[MetricIDType, Sequence[ self.init_cache() def init_cache(self): - use_operations_cache = self.params.get('use_operations_cache') - use_preprocessing_cache = self.params.get('use_preprocessing_cache') - use_predictions_cache = self.params.get('use_predictions_cache') - use_input_preprocessing = self.params.get('use_input_preprocessing') - cache_dir = self.params.get('cache_dir') - use_stats = self.params.get('use_stats') - if use_operations_cache: - self.operations_cache = OperationsCache(cache_dir=cache_dir, use_stats=use_stats) - # in case of previously generated singleton cache + cache_plan = build_cache_init_plan( + use_operations_cache=self.params.get('use_operations_cache'), + use_preprocessing_cache=self.params.get('use_preprocessing_cache'), + use_predictions_cache=self.params.get('use_predictions_cache'), + use_input_preprocessing=self.params.get('use_input_preprocessing'), + cache_dir=self.params.get('cache_dir'), + use_stats=self.params.get('use_stats'), + ) + + if cache_plan.use_operations_cache: + self.operations_cache = 
OperationsCache(cache_dir=cache_plan.cache_dir, use_stats=cache_plan.use_stats) self.operations_cache.reset() - if use_input_preprocessing and use_preprocessing_cache: - self.preprocessing_cache = PreprocessingCache(cache_dir=cache_dir, use_stats=use_stats) - # in case of previously generated singleton cache + if cache_plan.use_preprocessing_cache: + self.preprocessing_cache = PreprocessingCache(cache_dir=cache_plan.cache_dir, use_stats=cache_plan.use_stats) self.preprocessing_cache.reset() - if use_predictions_cache: - self.predictions_cache = PredictionsCache(cache_dir=cache_dir, use_stats=use_stats) - # in case of previously generated singleton cache + if cache_plan.use_predictions_cache: + self.predictions_cache = PredictionsCache(cache_dir=cache_plan.cache_dir, use_stats=cache_plan.use_stats) self.predictions_cache.reset() def obtain_model(self, train_data: InputData) -> Tuple[Pipeline, Sequence[Pipeline], OptHistory]: @@ -106,7 +107,6 @@ def obtain_model(self, train_data: InputData) -> Tuple[Pipeline, Sequence[Pipeli if gp_composer.history: adapter = self.params.graph_generation_params.adapter gp_composer.history.tuning_result = adapter.adapt(best_pipeline) - # enforce memory cleaning gc.collect() self.log.message('Model generation finished') @@ -166,7 +166,6 @@ def compose_pipeline(self, train_data: InputData, initial_assumption: Sequence[P ) if execution_plan.should_compose: - # Launch pipeline structure composition with self.timer.launch_composing(): self.log.message('Pipeline composition started.') self.was_optimised = False @@ -174,7 +173,6 @@ def compose_pipeline(self, train_data: InputData, initial_assumption: Sequence[P best_pipeline_candidates = gp_composer.best_models self.was_optimised = True else: - # Use initial pipeline as final solution self.log.message(f'Timeout is too small for composing and is skipped ' f'because fit_time is {self.timer.assumption_fit_spend_time.total_seconds()} sec.') best_pipelines = fitted_assumption @@ -192,18 +190,23 @@ 
def tune_final_pipeline(self, train_data: InputData, """ Launch tuning procedure for obtained pipeline by composer """ timeout_for_tuning = execution_plan.tuning_timeout_minutes if execution_plan else abs( self.timer.determine_resources_for_tuning()) / 60 + tuner_plan = build_tuner_plan( + metrics=self.metrics, + timeout_minutes=timeout_for_tuning, + iterations=DEFAULT_TUNING_ITERATIONS_NUMBER, + ) tuner = (TunerBuilder(self.params.task) .with_tuner(SimultaneousTuner) - .with_metric(self.metrics[0]) - .with_iterations(DEFAULT_TUNING_ITERATIONS_NUMBER) - .with_timeout(datetime.timedelta(minutes=timeout_for_tuning)) + .with_metric(tuner_plan.metric) + .with_iterations(tuner_plan.iterations) + .with_timeout(datetime.timedelta(minutes=tuner_plan.timeout_minutes)) .with_eval_time_constraint(self.params.composer_requirements.max_graph_fit_time) .with_requirements(self.params.composer_requirements) .build(train_data)) with self.timer.launch_tuning(): self.was_tuned = False - self.log.message(f'Hyperparameters tuning started with {round(timeout_for_tuning)} min. timeout') + self.log.message(f'Hyperparameters tuning started with {round(tuner_plan.timeout_minutes)} min. 
timeout') tuned_pipeline = tuner.tune(pipeline_gp_composed) self.log.message('Hyperparameters tuning finished') self.was_tuned = tuner.was_tuned diff --git a/fedot/api/api_utils/api_composer_rules.py b/fedot/api/api_utils/api_composer_rules.py new file mode 100644 index 0000000000..e1ca6ac5a0 --- /dev/null +++ b/fedot/api/api_utils/api_composer_rules.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass +from typing import Any, Sequence + + +@dataclass(frozen=True) +class CacheInitPlan: + use_operations_cache: bool + use_preprocessing_cache: bool + use_predictions_cache: bool + cache_dir: str | None + use_stats: bool + + +@dataclass(frozen=True) +class TunerPlan: + metric: Any + iterations: int + timeout_minutes: float + + +def build_cache_init_plan(use_operations_cache: bool, + use_preprocessing_cache: bool, + use_predictions_cache: bool, + use_input_preprocessing: bool, + cache_dir, + use_stats: bool) -> CacheInitPlan: + return CacheInitPlan( + use_operations_cache=bool(use_operations_cache), + use_preprocessing_cache=bool(use_input_preprocessing and use_preprocessing_cache), + use_predictions_cache=bool(use_predictions_cache), + cache_dir=cache_dir, + use_stats=bool(use_stats), + ) + + +def build_tuner_plan(metrics: Sequence[Any], timeout_minutes: float, iterations: int) -> TunerPlan: + return TunerPlan( + metric=metrics[0], + iterations=iterations, + timeout_minutes=max(0.0, timeout_minutes), + ) diff --git a/tests/api/api_utils/test_api_composer.py b/tests/api/api_utils/test_api_composer.py new file mode 100644 index 0000000000..cdb8a213c3 --- /dev/null +++ b/tests/api/api_utils/test_api_composer.py @@ -0,0 +1,40 @@ +import fedot.api.api_utils.api_composer as composer_module +from fedot.api.api_utils.api_composer import ApiComposer + + +class _FakeCache: + def __init__(self, cache_dir=None, use_stats=False): + self.cache_dir = cache_dir + self.use_stats = use_stats + self.was_reset = False + + def reset(self): + self.was_reset = True + + +class 
_FakeParams(dict): + timeout = 1 + n_jobs = -1 + + +def test_api_composer_init_cache_uses_typed_cache_plan(monkeypatch): + monkeypatch.setattr(composer_module, 'OperationsCache', _FakeCache) + monkeypatch.setattr(composer_module, 'PreprocessingCache', _FakeCache) + monkeypatch.setattr(composer_module, 'PredictionsCache', _FakeCache) + + params = _FakeParams( + use_operations_cache=True, + use_preprocessing_cache=True, + use_predictions_cache=True, + use_input_preprocessing=False, + cache_dir='cache_dir', + use_stats=True, + ) + + composer = ApiComposer(params, metrics=['f1']) + + assert isinstance(composer.operations_cache, _FakeCache) + assert composer.operations_cache.was_reset is True + assert composer.preprocessing_cache is None + assert isinstance(composer.predictions_cache, _FakeCache) + assert composer.predictions_cache.was_reset is True diff --git a/tests/api/api_utils/test_api_composer_rules.py b/tests/api/api_utils/test_api_composer_rules.py new file mode 100644 index 0000000000..ef473bc5fe --- /dev/null +++ b/tests/api/api_utils/test_api_composer_rules.py @@ -0,0 +1,26 @@ +from fedot.api.api_utils.api_composer_rules import build_cache_init_plan, build_tuner_plan + + +def test_build_cache_init_plan_respects_input_preprocessing_boundary(): + plan = build_cache_init_plan( + use_operations_cache=True, + use_preprocessing_cache=True, + use_predictions_cache=True, + use_input_preprocessing=False, + cache_dir='cache', + use_stats=True, + ) + + assert plan.use_operations_cache is True + assert plan.use_preprocessing_cache is False + assert plan.use_predictions_cache is True + assert plan.cache_dir == 'cache' + assert plan.use_stats is True + + +def test_build_tuner_plan_is_deterministic_and_clamps_timeout(): + plan = build_tuner_plan(metrics=['f1', 'roc_auc'], timeout_minutes=-3, iterations=42) + + assert plan.metric == 'f1' + assert plan.iterations == 42 + assert plan.timeout_minutes == 0.0 From 5645139f3168c83942194b9d1086381cfeb67cd5 Mon Sep 17 00:00:00 2001 
From: v1docq Date: Thu, 12 Mar 2026 13:20:19 +0300 Subject: [PATCH 15/32] extract pure builder parameter merge rules --- fedot/api/builder.py | 6 +++--- fedot/api/builder_rules.py | 25 +++++++++++++++++++++++++ tests/api/test_builder.py | 12 ++++++++++++ tests/api/test_builder_rules.py | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 fedot/api/builder_rules.py create mode 100644 tests/api/test_builder_rules.py diff --git a/fedot/api/builder.py b/fedot/api/builder.py index 73d1822520..7b3c7f77a7 100644 --- a/fedot/api/builder.py +++ b/fedot/api/builder.py @@ -4,6 +4,7 @@ from golem.core.optimisers.optimizer import GraphOptimizer +from fedot.api.builder_rules import build_fedot_kwargs, merge_builder_params from fedot.api.main import Fedot from fedot.core.pipelines.pipeline import Pipeline from fedot.core.repository.metrics_repository import MetricIDType @@ -66,8 +67,7 @@ def __init__(self, problem: str): def __update_params(self, **new_params): """ Saves all parameters set by user to the dictionary ``self.api_params``. """ - new_params = {k: v for k, v in new_params.items() if v != DEFAULT_VALUE} - self.api_params.update(new_params) + self.api_params = merge_builder_params(self.api_params, new_params, DEFAULT_VALUE) def setup_composition( self, @@ -468,4 +468,4 @@ def build(self) -> Fedot: Returns: :class:`~fedot.api.main.Fedot` instance. 
""" - return Fedot(**self.api_params) + return Fedot(**build_fedot_kwargs(self.api_params)) diff --git a/fedot/api/builder_rules.py b/fedot/api/builder_rules.py new file mode 100644 index 0000000000..8389391a6b --- /dev/null +++ b/fedot/api/builder_rules.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from typing import Any, Dict + + +@dataclass(frozen=True) +class BuilderParamsUpdate: + applied_params: Dict[str, Any] + + +def normalize_builder_params(new_params: Dict[str, Any], default_value: Any) -> BuilderParamsUpdate: + applied_params = {key: value for key, value in new_params.items() if value != default_value} + return BuilderParamsUpdate(applied_params=applied_params) + + +def merge_builder_params(current_params: Dict[str, Any], + new_params: Dict[str, Any], + default_value: Any) -> Dict[str, Any]: + normalized_update = normalize_builder_params(new_params, default_value) + merged = dict(current_params) + merged.update(normalized_update.applied_params) + return merged + + +def build_fedot_kwargs(api_params: Dict[str, Any]) -> Dict[str, Any]: + return dict(api_params) diff --git a/tests/api/test_builder.py b/tests/api/test_builder.py index 1515703d8b..dfff5e63fc 100644 --- a/tests/api/test_builder.py +++ b/tests/api/test_builder.py @@ -4,6 +4,7 @@ import pytest from fedot import Fedot, FedotBuilder +from fedot.api.builder import DEFAULT_VALUE from fedot.api.api_utils.api_params_repository import ApiParamsRepository from fedot.api.api_utils.params import ApiParams from fedot.core.repository.tasks import TaskTypesEnum @@ -70,3 +71,14 @@ def test_param_setters_has_all_api_parameters(fedot_builder_methods): fedot_api_all_params.discard('composer_tuner_params') assert builder_params == fedot_api_all_params + + + +def test_builder_preserves_previous_values_when_new_setup_uses_default_sentinel(): + builder = FedotBuilder('classification') + + builder.setup_composition(timeout=3, preset='fast_train') + builder.setup_composition(timeout=DEFAULT_VALUE) + + assert 
builder.api_params['timeout'] == 3 + assert builder.api_params['preset'] == 'fast_train' diff --git a/tests/api/test_builder_rules.py b/tests/api/test_builder_rules.py new file mode 100644 index 0000000000..8671a8e1cc --- /dev/null +++ b/tests/api/test_builder_rules.py @@ -0,0 +1,33 @@ +from fedot.api.builder import DEFAULT_VALUE +from fedot.api.builder_rules import ( + build_fedot_kwargs, + merge_builder_params, + normalize_builder_params, +) + + +def test_normalize_builder_params_skips_default_sentinel(): + update = normalize_builder_params( + {'timeout': 1, 'preset': DEFAULT_VALUE, 'seed': 42}, + DEFAULT_VALUE, + ) + + assert update.applied_params == {'timeout': 1, 'seed': 42} + + +def test_merge_builder_params_preserves_existing_values_for_default_updates(): + merged = merge_builder_params( + current_params={'problem': 'classification', 'timeout': 5}, + new_params={'timeout': DEFAULT_VALUE, 'preset': 'fast_train'}, + default_value=DEFAULT_VALUE, + ) + + assert merged == {'problem': 'classification', 'timeout': 5, 'preset': 'fast_train'} + + +def test_build_fedot_kwargs_returns_copy(): + api_params = {'problem': 'classification'} + kwargs = build_fedot_kwargs(api_params) + + assert kwargs == api_params + assert kwargs is not api_params From 88f7d3be543528ae0b6de856cb1f98c8ac89a478 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 13:25:26 +0300 Subject: [PATCH 16/32] extract preprocessing source and merge rule --- fedot/preprocessing/base_preprocessing.py | 39 +++++----- fedot/preprocessing/preprocessing.py | 33 ++++---- fedot/preprocessing/preprocessing_rules.py | 61 +++++++++++++++ tests/preprocessing/__init__.py | 1 + .../preprocessing/test_preprocessing_rules.py | 75 +++++++++++++++++++ 5 files changed, 169 insertions(+), 40 deletions(-) create mode 100644 fedot/preprocessing/preprocessing_rules.py create mode 100644 tests/preprocessing/__init__.py create mode 100644 tests/preprocessing/test_preprocessing_rules.py diff --git 
a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py index 56c238ffb9..54250f4422 100644 --- a/fedot/preprocessing/base_preprocessing.py +++ b/fedot/preprocessing/base_preprocessing.py @@ -1,4 +1,4 @@ -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod from typing import Dict, Union, TYPE_CHECKING import numpy as np @@ -15,6 +15,10 @@ ) from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor from fedot.preprocessing.data_types import TableTypesCorrector +from fedot.preprocessing.preprocessing_rules import ( + build_preprocessor_merge_plan, + iter_preprocessed_inputs, +) from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME from fedot.utilities.custom_errors import AbstractMethodNotImplementError @@ -216,8 +220,7 @@ def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligator data: data to be marked is_obligatory: was the data obligatorily or optionally preprocessed """ - values = [data] if isinstance(data, InputData) else data.values() - for input_data in values: + for input_data in iter_preprocessed_inputs(data): if is_obligatory: input_data.supplementary_data.obligatorily_preprocessed = True else: @@ -238,23 +241,17 @@ def merge_preprocessors(api_preprocessor: 'BasePreprocessor', Returns: merged preprocessor """ - # If was used auto preprocessor - if use_auto_preprocessing: - # Take all obligatory data preprocessing from obtained pipelines - new_data_preprocessor = api_preprocessor - - # If was used pipelines preprocessors - else: - # Take all obligatory data preprocessing from API - new_data_preprocessor = api_preprocessor - - # Update optional preprocessing (take it from obtained pipeline) - if not new_data_preprocessor.features_encoders: - # Store features encoder from obtained pipeline because in API there are no encoding - new_data_preprocessor.features_encoders = pipeline_preprocessor.features_encoders - - if not new_data_preprocessor.features_imputers: - 
# Same with Nan's imputers - new_data_preprocessor.features_imputers = pipeline_preprocessor.features_imputers + new_data_preprocessor = api_preprocessor + merge_plan = build_preprocessor_merge_plan( + use_auto_preprocessing=use_auto_preprocessing, + api_features_encoders=api_preprocessor.features_encoders, + api_features_imputers=api_preprocessor.features_imputers, + ) + + if merge_plan.take_pipeline_encoders: + new_data_preprocessor.features_encoders = pipeline_preprocessor.features_encoders + + if merge_plan.take_pipeline_imputers: + new_data_preprocessor.features_imputers = pipeline_preprocessor.features_imputers return new_data_preprocessor diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 09a7030f50..3d8331ad5c 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -27,6 +27,11 @@ from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import TaskTypesEnum from fedot.preprocessing.base_preprocessing import BasePreprocessor +from fedot.preprocessing.preprocessing_rules import ( + resolve_main_target_source_name, + resolve_source_names, + should_initialize_source_helpers, +) from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor from fedot.preprocessing.data_type_check import exclude_image, exclude_multi_ts, exclude_ts from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector @@ -72,19 +77,16 @@ def _init_supplementary_preprocessors(self, data: Union[InputData, MultiModalDat Args: data: with input data for preprocessing """ - if self.binary_categorical_processors and self.types_correctors: - # Preprocessors have been already initialized + if not should_initialize_source_helpers( + has_binary_processors=bool(self.binary_categorical_processors), + has_type_correctors=bool(self.types_correctors), + ): return None - if isinstance(data, InputData): - self.binary_categorical_processors[DEFAULT_SOURCE_NAME] = 
BinaryCategoricalPreprocessor() - self.types_correctors[DEFAULT_SOURCE_NAME] = TableTypesCorrector() - elif isinstance(data, MultiModalData): - for data_source in data: - self.binary_categorical_processors[data_source] = BinaryCategoricalPreprocessor() - self.types_correctors[data_source] = TableTypesCorrector() - else: - raise ValueError('Unknown type of data.') + source_plan = resolve_source_names(data, DEFAULT_SOURCE_NAME) + for data_source in source_plan.source_names: + self.binary_categorical_processors[data_source] = BinaryCategoricalPreprocessor() + self.types_correctors[data_source] = TableTypesCorrector() def _init_main_target_source_name(self, multi_data: MultiModalData): """ @@ -93,14 +95,7 @@ def _init_main_target_source_name(self, multi_data: MultiModalData): Args: multi_data: `MultiModalData` """ - if self.main_target_source_name is not None: - # Target name has been already defined - return None - - for data_source_name, input_data in multi_data.items(): - if input_data.supplementary_data.is_main_target: - self.main_target_source_name = data_source_name - break + self.main_target_source_name = resolve_main_target_source_name(self.main_target_source_name, multi_data) @copy_doc(BasePreprocessor.obligatory_prepare_for_fit) def obligatory_prepare_for_fit(self, data: Union[InputData, MultiModalData]) -> Union[InputData, MultiModalData]: diff --git a/fedot/preprocessing/preprocessing_rules.py b/fedot/preprocessing/preprocessing_rules.py new file mode 100644 index 0000000000..02438cd4cc --- /dev/null +++ b/fedot/preprocessing/preprocessing_rules.py @@ -0,0 +1,61 @@ +from dataclasses import dataclass +from typing import Any, Iterable, Tuple + +from fedot.core.data.data import InputData +from fedot.core.data.multi_modal import MultiModalData + + +@dataclass(frozen=True) +class PreprocessingSourcePlan: + source_names: Tuple[str, ...] 
+ + +@dataclass(frozen=True) +class PreprocessorMergePlan: + take_pipeline_encoders: bool + take_pipeline_imputers: bool + + +def resolve_source_names(data: Any, default_source_name: str) -> PreprocessingSourcePlan: + if isinstance(data, InputData): + return PreprocessingSourcePlan(source_names=(default_source_name,)) + if isinstance(data, MultiModalData): + return PreprocessingSourcePlan(source_names=tuple(data.keys())) + raise ValueError('Unknown type of data.') + + +def should_initialize_source_helpers(has_binary_processors: bool, has_type_correctors: bool) -> bool: + return not (has_binary_processors and has_type_correctors) + + +def resolve_main_target_source_name(current_source_name, multi_data: MultiModalData): + if current_source_name is not None: + return current_source_name + + for data_source_name, input_data in multi_data.items(): + if input_data.supplementary_data.is_main_target: + return data_source_name + return None + + +def iter_preprocessed_inputs(data: Any) -> Tuple[Any, ...]: + if isinstance(data, InputData): + return (data,) + if isinstance(data, MultiModalData): + return tuple(data.values()) + raise ValueError('Unknown type of data.') + + +def build_preprocessor_merge_plan(use_auto_preprocessing: bool, + api_features_encoders, + api_features_imputers) -> PreprocessorMergePlan: + if use_auto_preprocessing: + return PreprocessorMergePlan( + take_pipeline_encoders=False, + take_pipeline_imputers=False, + ) + + return PreprocessorMergePlan( + take_pipeline_encoders=not bool(api_features_encoders), + take_pipeline_imputers=not bool(api_features_imputers), + ) diff --git a/tests/preprocessing/__init__.py b/tests/preprocessing/__init__.py new file mode 100644 index 0000000000..e02abfc9b0 --- /dev/null +++ b/tests/preprocessing/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/preprocessing/test_preprocessing_rules.py b/tests/preprocessing/test_preprocessing_rules.py new file mode 100644 index 0000000000..3592df5702 --- /dev/null +++ 
b/tests/preprocessing/test_preprocessing_rules.py @@ -0,0 +1,75 @@ +import numpy as np +import pytest + +from fedot.core.data.data import InputData +from fedot.core.data.multi_modal import MultiModalData +from fedot.core.data.supplementary_data import SupplementaryData +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.preprocessing.preprocessing_rules import ( + build_preprocessor_merge_plan, + iter_preprocessed_inputs, + resolve_main_target_source_name, + resolve_source_names, + should_initialize_source_helpers, +) +from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME + + +def _make_input_data(*, is_main_target=True): + return InputData( + idx=np.array([0, 1]), + features=np.array([[1.0], [2.0]]), + target=np.array([[0.0], [1.0]]), + task=Task(TaskTypesEnum.regression), + data_type=DataTypesEnum.table, + supplementary_data=SupplementaryData(is_main_target=is_main_target), + ) + + +def test_resolve_source_names_handles_unimodal_and_multimodal(): + unimodal_plan = resolve_source_names(_make_input_data(), DEFAULT_SOURCE_NAME) + multimodal_plan = resolve_source_names( + MultiModalData({'left': _make_input_data(), 'right': _make_input_data(is_main_target=False)}), + DEFAULT_SOURCE_NAME, + ) + + assert unimodal_plan.source_names == (DEFAULT_SOURCE_NAME,) + assert multimodal_plan.source_names == ('left', 'right') + + +def test_resolve_source_names_rejects_unknown_data_type(): + with pytest.raises(ValueError, match='Unknown type of data'): + resolve_source_names(object(), DEFAULT_SOURCE_NAME) + + +def test_should_initialize_source_helpers_reflects_existing_state(): + assert should_initialize_source_helpers(False, False) is True + assert should_initialize_source_helpers(True, False) is True + assert should_initialize_source_helpers(True, True) is False + + +def test_resolve_main_target_source_name_prefers_existing_then_detects_main_branch(): + multi_data = MultiModalData({ + 
'main': _make_input_data(is_main_target=True), + 'side': _make_input_data(is_main_target=False), + }) + + assert resolve_main_target_source_name('preset', multi_data) == 'preset' + assert resolve_main_target_source_name(None, multi_data) == 'main' + + +def test_iter_preprocessed_inputs_and_merge_plan_are_deterministic(): + input_data = _make_input_data() + multi_data = MultiModalData({'main': input_data, 'side': _make_input_data(is_main_target=False)}) + + assert iter_preprocessed_inputs(input_data) == (input_data,) + assert len(iter_preprocessed_inputs(multi_data)) == 2 + + auto_plan = build_preprocessor_merge_plan(True, {'enc': 1}, {'imp': 1}) + manual_plan = build_preprocessor_merge_plan(False, {}, {}) + + assert auto_plan.take_pipeline_encoders is False + assert auto_plan.take_pipeline_imputers is False + assert manual_plan.take_pipeline_encoders is True + assert manual_plan.take_pipeline_imputers is True From 703e4d2259fe410d98c23db088600651a77af436 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 14:37:28 +0300 Subject: [PATCH 17/32] integrate extension manifest discovery into operation queries --- .../repository/operation_types_repository.py | 15 ++- fedot/extensions/operation_rules.py | 59 ++++++++++ ...t_operation_types_repository_extensions.py | 45 ++++++++ tests/extensions/test_operation_rules.py | 54 ++++++++++ .../preprocessing/test_base_preprocessing.py | 101 ++++++++++++++++++ 5 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 fedot/extensions/operation_rules.py create mode 100644 tests/core/repository/test_operation_types_repository_extensions.py create mode 100644 tests/extensions/test_operation_rules.py create mode 100644 tests/preprocessing/test_base_preprocessing.py diff --git a/fedot/core/repository/operation_types_repository.py b/fedot/core/repository/operation_types_repository.py index 9040fa3cf3..dba71fcae9 100644 --- a/fedot/core/repository/operation_types_repository.py +++ 
b/fedot/core/repository/operation_types_repository.py @@ -19,6 +19,7 @@ parse_repository_kind, ) from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.extensions.operation_rules import get_extension_operation_names, should_include_extensions EXTRA_TS_INSTALLED = True try: @@ -311,7 +312,19 @@ def suitable_operation(self, task_type: TaskTypesEnum = None, extra_ts_installed=EXTRA_TS_INSTALLED, ) operations_info = filter_operation_infos(self._repo, query) - return [m.id for m in operations_info] + operation_names = [m.id for m in operations_info] + + if should_include_extensions(query.repository_kind): + operation_names.extend( + get_extension_operation_names( + task_type=task_type, + data_type=data_type, + tags=tags, + forbidden_tags=forbidden_tags, + ) + ) + + return sorted(set(operation_names)) @property def operations(self): diff --git a/fedot/extensions/operation_rules.py b/fedot/extensions/operation_rules.py new file mode 100644 index 0000000000..5a3093e8a4 --- /dev/null +++ b/fedot/extensions/operation_rules.py @@ -0,0 +1,59 @@ +from dataclasses import dataclass +from typing import Iterable, Optional, Sequence, Tuple + +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.operation_query import RepositoryKind +from fedot.core.repository.tasks import TaskTypesEnum +from fedot.extensions.registry import get_registered_extensions + + +@dataclass(frozen=True) +class ExtensionOperationView: + name: str + tasks: Tuple[TaskTypesEnum, ...] + data_types: Tuple[DataTypesEnum, ...] + tags: Tuple[str, ...] 
+ + +def should_include_extensions(repository_kind: RepositoryKind) -> bool: + return repository_kind in (RepositoryKind.MODEL, RepositoryKind.ALL) + + +def get_extension_operation_views() -> Tuple[ExtensionOperationView, ...]: + views = [] + for registered_extension in get_registered_extensions(): + for model in registered_extension.manifest.models: + views.append(ExtensionOperationView( + name=model.name, + tasks=tuple(model.capabilities.tasks), + data_types=tuple(model.capabilities.data_types), + tags=tuple(model.capabilities.tags), + )) + return tuple(views) + + +def filter_extension_operation_views(task_type: Optional[TaskTypesEnum], + data_type: Optional[DataTypesEnum], + tags: Optional[Sequence[str]] = None, + forbidden_tags: Optional[Sequence[str]] = None) -> Tuple[ExtensionOperationView, ...]: + requested_tags = tuple(tags or ()) + forbidden = set(forbidden_tags or ()) + views = [] + for view in get_extension_operation_views(): + if task_type is not None and task_type not in view.tasks: + continue + if data_type is not None and data_type not in view.data_types: + continue + if requested_tags and not any(tag in view.tags for tag in requested_tags): + continue + if forbidden and any(tag in forbidden for tag in view.tags): + continue + views.append(view) + return tuple(views) + + +def get_extension_operation_names(task_type: Optional[TaskTypesEnum], + data_type: Optional[DataTypesEnum], + tags: Optional[Sequence[str]] = None, + forbidden_tags: Optional[Sequence[str]] = None) -> list[str]: + return sorted(view.name for view in filter_extension_operation_views(task_type, data_type, tags, forbidden_tags)) diff --git a/tests/core/repository/test_operation_types_repository_extensions.py b/tests/core/repository/test_operation_types_repository_extensions.py new file mode 100644 index 0000000000..3b67198d82 --- /dev/null +++ b/tests/core/repository/test_operation_types_repository_extensions.py @@ -0,0 +1,45 @@ +from fedot.core.repository.dataset_types import 
DataTypesEnum +from fedot.core.repository.operation_types_repository import OperationTypesRepository, get_operations_for_task +from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.extensions.contracts import ExtensionManifest, ExternalModelSpec, ModelCapabilities +from fedot.extensions.registry import clear_extension_registry, register_extension + + +def _make_manifest(): + return ExtensionManifest( + name='repository_extension', + version='1.0.0', + models=( + ExternalModelSpec( + name='external_linear', + factory=lambda params=None: object(), + capabilities=ModelCapabilities( + tasks=(TaskTypesEnum.regression,), + data_types=(DataTypesEnum.table,), + tags=('linear',), + ), + ), + ), + ) + + +def test_operation_repository_includes_registered_extension_models(): + clear_extension_registry() + register_extension(_make_manifest()) + + try: + task = Task(TaskTypesEnum.regression) + operations = OperationTypesRepository('model').suitable_operation( + task_type=task.task_type, + data_type=DataTypesEnum.table, + ) + queried_operations = get_operations_for_task( + task=task, + data_type=DataTypesEnum.table, + mode='model', + ) + + assert 'external_linear' in operations + assert 'external_linear' in queried_operations + finally: + clear_extension_registry() diff --git a/tests/extensions/test_operation_rules.py b/tests/extensions/test_operation_rules.py new file mode 100644 index 0000000000..3d1ec47d29 --- /dev/null +++ b/tests/extensions/test_operation_rules.py @@ -0,0 +1,54 @@ +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.operation_query import RepositoryKind +from fedot.core.repository.tasks import TaskTypesEnum +from fedot.extensions.contracts import ExtensionManifest, ExternalModelSpec, ModelCapabilities +from fedot.extensions.operation_rules import ( + filter_extension_operation_views, + get_extension_operation_names, + should_include_extensions, +) +from fedot.extensions.registry import 
clear_extension_registry, register_extension + + +def _make_manifest(): + return ExtensionManifest( + name='demo_extension', + version='1.0.0', + models=( + ExternalModelSpec( + name='external_rf', + factory=lambda params=None: object(), + capabilities=ModelCapabilities( + tasks=(TaskTypesEnum.classification,), + data_types=(DataTypesEnum.table,), + tags=('tree', 'external'), + ), + ), + ), + ) + + +def test_extension_operation_rules_filter_registered_models(): + clear_extension_registry() + register_extension(_make_manifest()) + + try: + assert should_include_extensions(RepositoryKind.MODEL) is True + assert should_include_extensions(RepositoryKind.DATA_OPERATION) is False + + views = filter_extension_operation_views( + task_type=TaskTypesEnum.classification, + data_type=DataTypesEnum.table, + tags=('tree',), + ) + names = get_extension_operation_names( + task_type=TaskTypesEnum.classification, + data_type=DataTypesEnum.table, + tags=('tree',), + ) + + assert len(views) == 1 + assert views[0].name == 'external_rf' + assert names == ['external_rf'] + finally: + clear_extension_registry() diff --git a/tests/preprocessing/test_base_preprocessing.py b/tests/preprocessing/test_base_preprocessing.py new file mode 100644 index 0000000000..f7563c66d6 --- /dev/null +++ b/tests/preprocessing/test_base_preprocessing.py @@ -0,0 +1,101 @@ +import numpy as np + +from fedot.core.data.data import InputData +from fedot.core.data.multi_modal import MultiModalData +from fedot.core.data.supplementary_data import SupplementaryData +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.preprocessing.base_preprocessing import BasePreprocessor +from fedot.preprocessing.preprocessing import DataPreprocessor + + +class _FakePreprocessor(BasePreprocessor): + def obligatory_prepare_for_fit(self, data): + return data + + def obligatory_prepare_for_predict(self, data): + return data + + def 
optional_prepare_for_fit(self, pipeline, data): + return data + + def optional_prepare_for_predict(self, pipeline, data): + return data + + def label_encoding_for_fit(self, data, source_name='default'): + return None + + def cut_dataset(self, data, border: int): + return None + + def apply_inverse_target_encoding(self, column_to_transform): + return column_to_transform + + def convert_indexes_for_fit(self, pipeline, data): + return data + + def convert_indexes_for_predict(self, pipeline, data): + return data + + def restore_index(self, input_data, result): + return result + + def update_indices_for_time_series(self, test_data): + return test_data + + def reduce_memory_size(self, data): + return data + + +def _make_input_data(*, is_main_target=True): + return InputData( + idx=np.array([0, 1]), + features=np.array([[1.0], [2.0]]), + target=np.array([[0.0], [1.0]]), + task=Task(TaskTypesEnum.regression), + data_type=DataTypesEnum.table, + supplementary_data=SupplementaryData(is_main_target=is_main_target), + ) + + +def test_mark_as_preprocessed_marks_unimodal_and_multimodal_inputs(): + input_data = _make_input_data() + multi_data = MultiModalData({'main': _make_input_data(), 'side': _make_input_data(is_main_target=False)}) + + BasePreprocessor.mark_as_preprocessed(input_data) + BasePreprocessor.mark_as_preprocessed(multi_data, is_obligatory=False) + + assert input_data.supplementary_data.obligatorily_preprocessed is True + assert multi_data['main'].supplementary_data.optionally_preprocessed is True + assert multi_data['side'].supplementary_data.optionally_preprocessed is True + + +def test_merge_preprocessors_uses_typed_merge_plan(): + api_preprocessor = _FakePreprocessor() + pipeline_preprocessor = _FakePreprocessor() + pipeline_preprocessor.features_encoders = {'encoder': object()} + pipeline_preprocessor.features_imputers = {'imputer': object()} + + merged = BasePreprocessor.merge_preprocessors( + api_preprocessor=api_preprocessor, + 
pipeline_preprocessor=pipeline_preprocessor, + use_auto_preprocessing=False, + ) + + assert merged.features_encoders == pipeline_preprocessor.features_encoders + assert merged.features_imputers == pipeline_preprocessor.features_imputers + + +def test_data_preprocessor_initialization_uses_source_and_target_rules(): + preprocessor = DataPreprocessor() + multi_data = MultiModalData({ + 'main': _make_input_data(is_main_target=True), + 'side': _make_input_data(is_main_target=False), + }) + + preprocessor._init_supplementary_preprocessors(multi_data) + preprocessor._init_main_target_source_name(multi_data) + + assert set(preprocessor.binary_categorical_processors.keys()) == {'main', 'side'} + assert set(preprocessor.types_correctors.keys()) == {'main', 'side'} + assert preprocessor.main_target_source_name == 'main' From 522eca410505f6f4c36e232485079fb1e24d61cb Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 15:04:50 +0300 Subject: [PATCH 18/32] extract pipeline operation split rules and fix fluent repository setup --- fedot/core/operations/extension_model.py | 60 ++++++++ fedot/core/operations/factory.py | 11 +- .../pipeline_operation_repository.py | 35 +++-- .../repository/pipeline_operation_rules.py | 41 ++++++ fedot/extensions/runtime_rules.py | 131 ++++++++++++++++++ tests/core/operations/__init__.py | 1 + .../test_operation_factory_extensions.py | 77 ++++++++++ .../test_pipeline_operation_repository.py | 43 ++++++ .../test_pipeline_operation_rules.py | 36 +++++ tests/extensions/test_runtime_rules.py | 61 ++++++++ 10 files changed, 478 insertions(+), 18 deletions(-) create mode 100644 fedot/core/operations/extension_model.py create mode 100644 fedot/core/repository/pipeline_operation_rules.py create mode 100644 fedot/extensions/runtime_rules.py create mode 100644 tests/core/operations/__init__.py create mode 100644 tests/core/operations/test_operation_factory_extensions.py create mode 100644 tests/core/repository/test_pipeline_operation_repository.py 
create mode 100644 tests/core/repository/test_pipeline_operation_rules.py create mode 100644 tests/extensions/test_runtime_rules.py diff --git a/fedot/core/operations/extension_model.py b/fedot/core/operations/extension_model.py new file mode 100644 index 0000000000..2d0fe3bae3 --- /dev/null +++ b/fedot/core/operations/extension_model.py @@ -0,0 +1,60 @@ +from typing import Optional, Union + +from fedot.core.operations.evaluation.custom import CustomModelStrategy +from fedot.core.operations.hyperparameters_preprocessing import HyperparametersPreprocessor +from fedot.core.operations.model import Model +from fedot.core.operations.operation_parameters import OperationParameters +from fedot.core.repository.operation_types_repository import OperationMetaInfo +from fedot.extensions.runtime_rules import ( + build_extension_strategy_params, + get_extension_acceptable_task_types, + get_extension_data_types, +) + + +class ExtensionModel(Model): + """Runtime adapter for manifest-registered external models. + + It reuses FEDOT's existing custom-model evaluation strategy while keeping + the external model name as the public operation type. 
+ """ + + def _init(self, task, **kwargs): + params = kwargs.get('params') + if not params: + params = OperationParameters() + if isinstance(params, dict): + params = OperationParameters(**params) + + user_params = HyperparametersPreprocessor( + operation_type='custom', + n_samples_data=kwargs.get('n_samples_data'), + ).correct(params.to_dict()) + strategy_params = build_extension_strategy_params( + operation_name=self.operation_type, + user_params=user_params, + output_mode=kwargs.get('output_mode', 'default'), + ) + params_for_fit = OperationParameters.from_operation_type('custom', **strategy_params) + self._eval_strategy = CustomModelStrategy('custom', params_for_fit) + if 'output_mode' in kwargs: + self._eval_strategy.output_mode = kwargs['output_mode'] + + @property + def acceptable_task_types(self): + return get_extension_acceptable_task_types(self.operation_type) + + @property + def metadata(self) -> OperationMetaInfo: + data_types = list(get_extension_data_types(self.operation_type)) + task_types = list(get_extension_acceptable_task_types(self.operation_type)) + return OperationMetaInfo( + id=self.operation_type, + input_types=data_types, + output_types=data_types, + task_type=task_types, + supported_strategies={task_type: CustomModelStrategy for task_type in task_types}, + allowed_positions=['any'], + tags=['external', 'custom_model'], + presets=['best_quality', 'fast_train', 'stable', 'gpu', 'automl', 'ts'], + ) diff --git a/fedot/core/operations/factory.py b/fedot/core/operations/factory.py index d98ef820bf..96d5593ab0 100644 --- a/fedot/core/operations/factory.py +++ b/fedot/core/operations/factory.py @@ -1,8 +1,10 @@ -from fedot.core.operations.automl import AutoML +from fedot.core.operations.automl import AutoML from fedot.core.operations.data_operation import DataOperation +from fedot.core.operations.extension_model import ExtensionModel from fedot.core.operations.model import Model from fedot.core.operations.operation import Operation from 
fedot.core.repository.operation_types_repository import OperationTypesRepository, get_operation_type_from_id +from fedot.extensions.runtime_rules import is_extension_operation_name class OperationFactory: @@ -26,6 +28,8 @@ def get_operation(self) -> Operation: if self.operation_type == 'model': operation = Model(operation_type=self.operation_name) + elif self.operation_type == 'extension_model': + operation = ExtensionModel(operation_type=self.operation_name) elif self.operation_type == 'data_operation': operation = DataOperation(operation_type=self.operation_name) elif self.operation_type == 'automl': @@ -46,7 +50,6 @@ def _define_operation_type(self) -> str: :return : operations type 'model', 'automl' or 'data_operation' """ - # Get available models from model_repository.json file operations_repo = OperationTypesRepository('data_operation') operations = operations_repo.operations if 'automl' in OperationTypesRepository.get_available_repositories(): @@ -57,12 +60,12 @@ def _define_operation_type(self) -> str: operation_name = get_operation_type_from_id(self.operation_name) - # If there is a such model in the list if any(operation_name == model.id for model in operations): operation_type = 'data_operation' elif any(operation_name == model.id for model in models_automl): operation_type = 'automl' - # Otherwise - it is model + elif is_extension_operation_name(operation_name): + operation_type = 'extension_model' else: operation_type = 'model' return operation_type diff --git a/fedot/core/repository/pipeline_operation_repository.py b/fedot/core/repository/pipeline_operation_repository.py index b379dcb1e0..b26eaee6fd 100644 --- a/fedot/core/repository/pipeline_operation_repository.py +++ b/fedot/core/repository/pipeline_operation_repository.py @@ -1,9 +1,13 @@ import itertools -from typing import List, Optional, Dict +from typing import Dict, List, Optional from fedot.api.api_utils.presets import OperationsPreset from fedot.core.repository.graph_operation_repository 
import GraphOperationRepository from fedot.core.repository.operation_types_repository import get_operations_for_task +from fedot.core.repository.pipeline_operation_rules import ( + build_pipeline_operations_by_role, + filter_available_pipeline_operations, +) from fedot.core.repository.tasks import Task, TaskTypesEnum @@ -28,10 +32,10 @@ def from_available_operations(self, task: Task, preset: str, available_operations: List[str]): """ Initialize repository from available operations, task and preset """ operations_by_task_preset = OperationsPreset(task, preset).filter_operations_by_preset() - all_operations = sorted(list(set.intersection(set(operations_by_task_preset), set(available_operations)))) - primary_operations, secondary_operations = \ - self.divide_operations(all_operations, task) + all_operations = filter_available_pipeline_operations(operations_by_task_preset, available_operations) + primary_operations, secondary_operations = self.divide_operations(list(all_operations), task) self.operations_by_keys = {'primary': primary_operations, 'secondary': secondary_operations} + return self def get_operations(self, is_primary: bool) -> List[str]: """ Get pipeline operations by specified model key """ @@ -53,18 +57,21 @@ def divide_operations(available_operations, task): mode='data_operation', tags=["non_lagged"]) # Remove exog data operation from the list - ts_data_operations.remove('exog_ts') + if 'exog_ts' in ts_data_operations: + ts_data_operations.remove('exog_ts') ts_primary_models = get_operations_for_task(task=task, mode='model', tags=["non_lagged"]) - # Union of the models and data operations - ts_primary_operations = ts_data_operations + ts_primary_models - - # Filter - remain only operations, which were in available ones - primary_operations = sorted(list(set(ts_primary_operations).intersection(available_operations))) - secondary_operations = available_operations + operations_by_role = build_pipeline_operations_by_role( + 
available_operations=available_operations, + task_type=task.task_type, + ts_data_operations=ts_data_operations, + ts_primary_models=ts_primary_models, + ) else: - primary_operations = available_operations - secondary_operations = available_operations - return primary_operations, secondary_operations + operations_by_role = build_pipeline_operations_by_role( + available_operations=available_operations, + task_type=task.task_type, + ) + return list(operations_by_role.primary), list(operations_by_role.secondary) diff --git a/fedot/core/repository/pipeline_operation_rules.py b/fedot/core/repository/pipeline_operation_rules.py new file mode 100644 index 0000000000..e4570d87b6 --- /dev/null +++ b/fedot/core/repository/pipeline_operation_rules.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass +from typing import Iterable, Tuple + +from fedot.core.repository.tasks import TaskTypesEnum + + +@dataclass(frozen=True) +class PipelineOperationsByRole: + primary: Tuple[str, ...] + secondary: Tuple[str, ...] 
+ + def as_dict(self) -> dict[str, list[str]]: + return { + 'primary': list(self.primary), + 'secondary': list(self.secondary), + } + + +def filter_available_pipeline_operations(preset_operations: Iterable[str], + available_operations: Iterable[str]) -> Tuple[str, ...]: + return tuple(sorted(set(preset_operations).intersection(available_operations))) + + +def build_pipeline_operations_by_role(available_operations: Iterable[str], + task_type: TaskTypesEnum, + ts_data_operations: Iterable[str] = (), + ts_primary_models: Iterable[str] = ()) -> PipelineOperationsByRole: + normalized_available_operations = tuple(sorted(set(available_operations))) + + if task_type is not TaskTypesEnum.ts_forecasting: + return PipelineOperationsByRole( + primary=normalized_available_operations, + secondary=normalized_available_operations, + ) + + ts_primary_operations = set(ts_data_operations).union(ts_primary_models) + primary_operations = tuple(sorted(ts_primary_operations.intersection(normalized_available_operations))) + return PipelineOperationsByRole( + primary=primary_operations, + secondary=normalized_available_operations, + ) diff --git a/fedot/extensions/runtime_rules.py b/fedot/extensions/runtime_rules.py new file mode 100644 index 0000000000..3675910c37 --- /dev/null +++ b/fedot/extensions/runtime_rules.py @@ -0,0 +1,131 @@ +import inspect +from typing import Any, Dict, Optional + +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.extensions.contracts import ExternalModelSpec +from fedot.extensions.registry import get_registered_extensions + + +def get_extension_model_spec(operation_name: str) -> Optional[ExternalModelSpec]: + for registered_extension in get_registered_extensions(): + for model in registered_extension.manifest.models: + if model.name == operation_name: + return model + return None + + +def is_extension_operation_name(operation_name: str) -> bool: + return get_extension_model_spec(operation_name) is not None + + +def 
build_extension_strategy_params(operation_name: str, + user_params: Optional[Dict[str, Any]] = None, + output_mode: str = 'default') -> Dict[str, Any]: + model_spec = get_extension_model_spec(operation_name) + if model_spec is None: + raise ValueError(f'Extension model "{operation_name}" is not registered.') + + normalized_user_params = dict(user_params or {}) + return { + **normalized_user_params, + 'model_fit': _build_model_fit(model_spec), + 'model_predict': _build_model_predict(model_spec), + '_extension_output_mode': output_mode, + } + + +def get_extension_acceptable_task_types(operation_name: str): + model_spec = get_extension_model_spec(operation_name) + if model_spec is None: + raise ValueError(f'Extension model "{operation_name}" is not registered.') + return model_spec.capabilities.tasks + + +def get_extension_data_types(operation_name: str): + model_spec = get_extension_model_spec(operation_name) + if model_spec is None: + raise ValueError(f'Extension model "{operation_name}" is not registered.') + return model_spec.capabilities.data_types + + +def _build_model_fit(model_spec: ExternalModelSpec): + def _fit(idx, features, target, params): + model = _instantiate_model(model_spec, params) + fit_method = getattr(model, 'fit', None) + if callable(fit_method): + _call_with_supported_signature( + fit_method, + (features, target), + (features,), + (idx, features, target, params), + (idx, features, target), + ) + return model + + return _fit + + +def _build_model_predict(model_spec: ExternalModelSpec): + def _predict(fitted_model, idx, features, params): + model = fitted_model if fitted_model is not None else _instantiate_model(model_spec, params) + output_mode = params.get('_extension_output_mode', 'default') + + if output_mode in ('probs', 'full_probs', 'default') and hasattr(model, 'predict_proba'): + prediction = _call_with_supported_signature( + getattr(model, 'predict_proba'), + (features,), + (idx, features, params), + (idx, features), + ) + if 
output_mode != 'full_probs' and getattr(prediction, 'shape', None) is not None and len(prediction.shape) > 1 and prediction.shape[1] == 2: + prediction = prediction[:, 1] + elif hasattr(model, 'predict'): + prediction = _call_with_supported_signature( + getattr(model, 'predict'), + (features,), + (idx, features, params), + (idx, features), + ) + elif hasattr(model, 'transform'): + prediction = _call_with_supported_signature( + getattr(model, 'transform'), + (features,), + (idx, features, params), + (idx, features), + ) + else: + raise TypeError(f'Extension model "{model_spec.name}" must define predict, predict_proba, or transform.') + + output_type = _infer_output_type_name(model_spec) + return prediction, output_type + + return _predict + + +def _instantiate_model(model_spec: ExternalModelSpec, params: Dict[str, Any]): + factory = model_spec.factory + user_params = {key: value for key, value in params.items() if not key.startswith('_') and key not in ('model_fit', 'model_predict')} + try: + signature = inspect.signature(factory) + signature.bind_partial(user_params) + return factory(user_params) + except TypeError: + return factory() + + +def _call_with_supported_signature(method, *candidate_args): + signature = inspect.signature(method) + last_error = None + for args in candidate_args: + try: + signature.bind_partial(*args) + return method(*args) + except TypeError as error: + last_error = error + continue + raise last_error or TypeError('No supported signature found for extension model method.') + + +def _infer_output_type_name(model_spec: ExternalModelSpec) -> str: + preferred_data_type = model_spec.capabilities.data_types[0] if model_spec.capabilities.data_types else DataTypesEnum.table + return preferred_data_type.name diff --git a/tests/core/operations/__init__.py b/tests/core/operations/__init__.py new file mode 100644 index 0000000000..e02abfc9b0 --- /dev/null +++ b/tests/core/operations/__init__.py @@ -0,0 +1 @@ + diff --git 
a/tests/core/operations/test_operation_factory_extensions.py b/tests/core/operations/test_operation_factory_extensions.py new file mode 100644 index 0000000000..6b442b8b3b --- /dev/null +++ b/tests/core/operations/test_operation_factory_extensions.py @@ -0,0 +1,77 @@ +import numpy as np + +from fedot.core.operations.extension_model import ExtensionModel +from fedot.core.operations.factory import OperationFactory +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.extensions.contracts import ExtensionManifest, ExternalModelSpec, ModelCapabilities +from fedot.extensions.registry import clear_extension_registry, register_extension + + +class _ExternalEstimator: + def __init__(self, params=None): + self.params = params or {} + self.was_fitted = False + + def fit(self, features, target): + self.was_fitted = True + return self + + def predict(self, features): + return np.ones(features.shape[0]) + + +def _make_manifest(): + return ExtensionManifest( + name='factory_extension', + version='1.0.0', + models=( + ExternalModelSpec( + name='external_factory_model', + factory=lambda params=None: _ExternalEstimator(params), + capabilities=ModelCapabilities( + tasks=(TaskTypesEnum.regression,), + data_types=(DataTypesEnum.table,), + tags=('external',), + ), + ), + ), + ) + + +def test_operation_factory_returns_extension_model_for_registered_operation(): + clear_extension_registry() + register_extension(_make_manifest()) + + try: + operation = OperationFactory('external_factory_model').get_operation() + + assert isinstance(operation, ExtensionModel) + assert OperationFactory('external_factory_model').operation_type_name == 'extension_model' + finally: + clear_extension_registry() + + +def test_extension_model_uses_custom_strategy_adapter_for_runtime_init(): + clear_extension_registry() + register_extension(_make_manifest()) + + try: + model = ExtensionModel('external_factory_model') + task = 
Task(TaskTypesEnum.regression) + model._init(task, params={'alpha': 2.0}, output_mode='default', n_samples_data=4) + + strategy = model._eval_strategy + implementation = strategy.fit(type('Data', (), { + 'idx': np.arange(4), + 'features': np.array([[1.0], [2.0], [3.0], [4.0]]), + 'target': np.array([[1.0], [2.0], [3.0], [4.0]]), + 'task': task, + 'data_type': DataTypesEnum.table, + })()) + + assert strategy.operation_id == 'custom' + assert implementation.was_fitted is True + assert implementation.params.get('alpha') == 2.0 + finally: + clear_extension_registry() diff --git a/tests/core/repository/test_pipeline_operation_repository.py b/tests/core/repository/test_pipeline_operation_repository.py new file mode 100644 index 0000000000..32c8935758 --- /dev/null +++ b/tests/core/repository/test_pipeline_operation_repository.py @@ -0,0 +1,43 @@ +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.pipeline_operation_repository import PipelineOperationRepository +from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.extensions.contracts import ExtensionManifest, ExternalModelSpec, ModelCapabilities +from fedot.extensions.registry import clear_extension_registry, register_extension + + + +def _make_manifest(): + return ExtensionManifest( + name='pipeline_repository_extension', + version='1.0.0', + models=( + ExternalModelSpec( + name='external_linear', + factory=lambda params=None: object(), + capabilities=ModelCapabilities( + tasks=(TaskTypesEnum.regression,), + data_types=(DataTypesEnum.table,), + tags=('linear',), + ), + ), + ), + ) + + + +def test_from_available_operations_returns_self_and_keeps_registered_extension(): + clear_extension_registry() + register_extension(_make_manifest()) + + try: + repository = PipelineOperationRepository() + returned_repository = repository.from_available_operations( + task=Task(TaskTypesEnum.regression), + preset='best_quality', + available_operations=['external_linear', 'ridge'], 
+ ) + + assert returned_repository is repository + assert 'external_linear' in repository.get_all_operations() + finally: + clear_extension_registry() diff --git a/tests/core/repository/test_pipeline_operation_rules.py b/tests/core/repository/test_pipeline_operation_rules.py new file mode 100644 index 0000000000..957c0ef9ed --- /dev/null +++ b/tests/core/repository/test_pipeline_operation_rules.py @@ -0,0 +1,36 @@ +from fedot.core.repository.pipeline_operation_rules import ( + build_pipeline_operations_by_role, + filter_available_pipeline_operations, +) +from fedot.core.repository.tasks import TaskTypesEnum + + +def test_filter_available_pipeline_operations_returns_sorted_intersection(): + filtered = filter_available_pipeline_operations( + preset_operations=['ridge', 'rf', 'external_linear'], + available_operations=['external_linear', 'ridge', 'external_linear'], + ) + + assert filtered == ('external_linear', 'ridge') + + +def test_build_pipeline_operations_by_role_returns_all_operations_for_non_ts_task(): + operations_by_role = build_pipeline_operations_by_role( + available_operations=['ridge', 'external_linear'], + task_type=TaskTypesEnum.regression, + ) + + assert operations_by_role.primary == ('external_linear', 'ridge') + assert operations_by_role.secondary == ('external_linear', 'ridge') + + +def test_build_pipeline_operations_by_role_uses_non_lagged_ts_subset_for_primary_nodes(): + operations_by_role = build_pipeline_operations_by_role( + available_operations=['external_non_lagged', 'lagged', 'ridge'], + task_type=TaskTypesEnum.ts_forecasting, + ts_data_operations=['exog_ts', 'lagged'], + ts_primary_models=['external_non_lagged'], + ) + + assert operations_by_role.primary == ('external_non_lagged', 'lagged') + assert operations_by_role.secondary == ('external_non_lagged', 'lagged', 'ridge') diff --git a/tests/extensions/test_runtime_rules.py b/tests/extensions/test_runtime_rules.py new file mode 100644 index 0000000000..6aa6630869 --- /dev/null +++ 
b/tests/extensions/test_runtime_rules.py @@ -0,0 +1,61 @@ +import numpy as np + +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import TaskTypesEnum +from fedot.extensions.contracts import ExtensionManifest, ExternalModelSpec, ModelCapabilities +from fedot.extensions.registry import clear_extension_registry, register_extension +from fedot.extensions.runtime_rules import ( + build_extension_strategy_params, + get_extension_model_spec, + is_extension_operation_name, +) + + +class _ExternalEstimator: + def __init__(self, params=None): + self.params = params or {} + self.was_fitted = False + + def fit(self, features, target): + self.was_fitted = True + return self + + def predict(self, features): + return np.zeros(features.shape[0]) + + +def _make_manifest(): + return ExtensionManifest( + name='runtime_extension', + version='1.0.0', + models=( + ExternalModelSpec( + name='external_runtime_model', + factory=lambda params=None: _ExternalEstimator(params), + capabilities=ModelCapabilities( + tasks=(TaskTypesEnum.regression,), + data_types=(DataTypesEnum.table,), + tags=('external', 'linear'), + ), + ), + ), + ) + + +def test_runtime_rules_resolve_registered_extension_model_and_build_strategy_params(): + clear_extension_registry() + register_extension(_make_manifest()) + + try: + spec = get_extension_model_spec('external_runtime_model') + params = build_extension_strategy_params('external_runtime_model', {'alpha': 1.0}, output_mode='labels') + + assert spec is not None + assert spec.name == 'external_runtime_model' + assert is_extension_operation_name('external_runtime_model') is True + assert callable(params['model_fit']) + assert callable(params['model_predict']) + assert params['_extension_output_mode'] == 'labels' + assert params['alpha'] == 1.0 + finally: + clear_extension_registry() From caf961ba8a77cbccedaca11a17637d2f20d4ef79 Mon Sep 17 00:00:00 2001 From: v1docq Date: Thu, 12 Mar 2026 15:12:57 +0300 Subject: [PATCH 
19/32] add typed extension parameter resolution and schema defaults --- fedot/extensions/parameter_rules.py | 44 +++++++++++++ fedot/extensions/runtime_rules.py | 45 ++++++++++--- fedot/preprocessing/preprocessing.py | 41 +++++++----- fedot/preprocessing/preprocessing_rules.py | 22 ++++++- .../test_operation_factory_extensions.py | 13 +++- tests/extensions/test_parameter_rules.py | 63 +++++++++++++++++++ tests/extensions/test_runtime_rules.py | 35 ++++++++++- .../preprocessing/test_base_preprocessing.py | 49 ++++++++++++++- .../preprocessing/test_preprocessing_rules.py | 25 +++++++- 9 files changed, 305 insertions(+), 32 deletions(-) create mode 100644 fedot/extensions/parameter_rules.py create mode 100644 tests/extensions/test_parameter_rules.py diff --git a/fedot/extensions/parameter_rules.py b/fedot/extensions/parameter_rules.py new file mode 100644 index 0000000000..7eaa4e5516 --- /dev/null +++ b/fedot/extensions/parameter_rules.py @@ -0,0 +1,44 @@ +from typing import Any, Dict, Optional, Tuple + +from pymonad.either import Left, Right + +from fedot.extensions.contracts import ExtensionError, ExternalModelSpec + + +RuntimeReservedKeys = ('model_fit', 'model_predict') + + +def normalize_extension_user_params(user_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + return dict(user_params or {}) + + +def apply_extension_defaults(defaults: Dict[str, Any], user_params: Dict[str, Any]) -> Dict[str, Any]: + return {**defaults, **user_params} + + +def find_missing_required_params(required: Tuple[str, ...], params: Dict[str, Any]) -> Tuple[str, ...]: + return tuple(param_name for param_name in required if param_name not in params) + + +def resolve_extension_params(model_spec: ExternalModelSpec, + user_params: Optional[Dict[str, Any]] = None): + normalized_user_params = normalize_extension_user_params(user_params) + resolved_params = apply_extension_defaults(model_spec.hyperparams_schema.defaults, normalized_user_params) + missing_required_params = 
find_missing_required_params(model_spec.hyperparams_schema.required, resolved_params) + + if missing_required_params: + return Left(ExtensionError( + code='missing_required_hyperparams', + message=f'Extension model "{model_spec.name}" is missing required hyperparameters.', + details={'required': list(missing_required_params)}, + )) + + return Right(resolved_params) + + +def extract_factory_params(strategy_params: Dict[str, Any]) -> Dict[str, Any]: + return { + key: value + for key, value in strategy_params.items() + if not key.startswith('_') and key not in RuntimeReservedKeys + } diff --git a/fedot/extensions/runtime_rules.py b/fedot/extensions/runtime_rules.py index 3675910c37..9cf90e59c3 100644 --- a/fedot/extensions/runtime_rules.py +++ b/fedot/extensions/runtime_rules.py @@ -1,11 +1,15 @@ -import inspect +import inspect from typing import Any, Dict, Optional +from pymonad.either import Left + from fedot.core.repository.dataset_types import DataTypesEnum from fedot.extensions.contracts import ExternalModelSpec +from fedot.extensions.parameter_rules import extract_factory_params, resolve_extension_params from fedot.extensions.registry import get_registered_extensions + def get_extension_model_spec(operation_name: str) -> Optional[ExternalModelSpec]: for registered_extension in get_registered_extensions(): for model in registered_extension.manifest.models: @@ -14,24 +18,41 @@ def get_extension_model_spec(operation_name: str) -> Optional[ExternalModelSpec] return None + def is_extension_operation_name(operation_name: str) -> bool: return get_extension_model_spec(operation_name) is not None -def build_extension_strategy_params(operation_name: str, - user_params: Optional[Dict[str, Any]] = None, - output_mode: str = 'default') -> Dict[str, Any]: + +def try_build_extension_strategy_params(operation_name: str, + user_params: Optional[Dict[str, Any]] = None, + output_mode: str = 'default'): model_spec = get_extension_model_spec(operation_name) if model_spec is None: 
raise ValueError(f'Extension model "{operation_name}" is not registered.') - normalized_user_params = dict(user_params or {}) - return { - **normalized_user_params, + params_resolution = resolve_extension_params(model_spec, user_params) + if params_resolution.__class__ is Left: + return params_resolution + + resolved_user_params = params_resolution.value + return params_resolution.__class__({ + **resolved_user_params, 'model_fit': _build_model_fit(model_spec), 'model_predict': _build_model_predict(model_spec), '_extension_output_mode': output_mode, - } + }) + + + +def build_extension_strategy_params(operation_name: str, + user_params: Optional[Dict[str, Any]] = None, + output_mode: str = 'default') -> Dict[str, Any]: + strategy_params = try_build_extension_strategy_params(operation_name, user_params, output_mode) + if strategy_params.__class__ is Left: + raise ValueError(strategy_params.value.message) + return strategy_params.value + def get_extension_acceptable_task_types(operation_name: str): @@ -41,6 +62,7 @@ def get_extension_acceptable_task_types(operation_name: str): return model_spec.capabilities.tasks + def get_extension_data_types(operation_name: str): model_spec = get_extension_model_spec(operation_name) if model_spec is None: @@ -48,6 +70,7 @@ def get_extension_data_types(operation_name: str): return model_spec.capabilities.data_types + def _build_model_fit(model_spec: ExternalModelSpec): def _fit(idx, features, target, params): model = _instantiate_model(model_spec, params) @@ -65,6 +88,7 @@ def _fit(idx, features, target, params): return _fit + def _build_model_predict(model_spec: ExternalModelSpec): def _predict(fitted_model, idx, features, params): model = fitted_model if fitted_model is not None else _instantiate_model(model_spec, params) @@ -102,9 +126,10 @@ def _predict(fitted_model, idx, features, params): return _predict + def _instantiate_model(model_spec: ExternalModelSpec, params: Dict[str, Any]): factory = model_spec.factory - user_params = 
{key: value for key, value in params.items() if not key.startswith('_') and key not in ('model_fit', 'model_predict')} + user_params = extract_factory_params(params) try: signature = inspect.signature(factory) signature.bind_partial(user_params) @@ -113,6 +138,7 @@ def _instantiate_model(model_spec: ExternalModelSpec, params: Dict[str, Any]): return factory() + def _call_with_supported_signature(method, *candidate_args): signature = inspect.signature(method) last_error = None @@ -126,6 +152,7 @@ def _call_with_supported_signature(method, *candidate_args): raise last_error or TypeError('No supported signature found for extension model method.') + def _infer_output_type_name(model_spec: ExternalModelSpec) -> str: preferred_data_type = model_spec.capabilities.data_types[0] if model_spec.capabilities.data_types else DataTypesEnum.table return preferred_data_type.name diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 3d8331ad5c..f220e7b71d 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -28,8 +28,10 @@ from fedot.core.repository.tasks import TaskTypesEnum from fedot.preprocessing.base_preprocessing import BasePreprocessor from fedot.preprocessing.preprocessing_rules import ( + build_optional_preprocessing_plan, resolve_main_target_source_name, resolve_source_names, + resolve_target_encoder_source_name, should_initialize_source_helpers, ) from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor @@ -264,19 +266,28 @@ def _prepare_optional(self, pipeline, data: InputData, source_name: str): if not data_type_is_table(data) or data.supplementary_data.optionally_preprocessed: return data - for has_problems, tag_to_check, action_if_no_tag in [ - (data_has_missing_values, 'imputation', self._apply_imputation_unidata), - (data_has_categorical_features, 'encoding', self._apply_categorical_encoding) - ]: - self.log.debug(f'Deciding to apply {tag_to_check} for data') - if 
has_problems(data): - self.log.debug(f'Finding {tag_to_check} is required and trying to apply') - # Data contains missing values - has_tag = PipelineStructureExplorer.check_structure_by_tag( - pipeline, tag_to_check=tag_to_check, source_name=source_name) + has_missing_values = data_has_missing_values(data) + has_categorical_features = data_has_categorical_features(data) + has_imputation_operation = has_missing_values and PipelineStructureExplorer.check_structure_by_tag( + pipeline, tag_to_check='imputation', source_name=source_name) + has_encoding_operation = has_categorical_features and PipelineStructureExplorer.check_structure_by_tag( + pipeline, tag_to_check='encoding', source_name=source_name) + optional_plan = build_optional_preprocessing_plan( + has_missing_values=has_missing_values, + has_categorical_features=has_categorical_features, + has_imputation_operation=has_imputation_operation, + has_encoding_operation=has_encoding_operation, + ) + + if optional_plan.apply_imputation: + self.log.debug('Applying optional imputation for data') + data = self._apply_imputation_unidata(data, source_name) - if not has_tag: - data = action_if_no_tag(data, source_name) + if optional_plan.apply_encoding: + self.log.debug('Applying optional categorical encoding for data') + data = self._apply_categorical_encoding(data, source_name) + + return data def _find_features_lacking_nans(self, data: InputData, source_name: str): """ @@ -475,11 +486,7 @@ def _determine_target_converter(self): Returns: selected data source name """ - # Choose data source node name with main target - if self.main_target_source_name is None: - return DEFAULT_SOURCE_NAME - else: - return self.main_target_source_name + return resolve_target_encoder_source_name(self.main_target_source_name, DEFAULT_SOURCE_NAME) @staticmethod def _correct_shapes(data: InputData) -> InputData: diff --git a/fedot/preprocessing/preprocessing_rules.py b/fedot/preprocessing/preprocessing_rules.py index 02438cd4cc..195da08045 
100644 --- a/fedot/preprocessing/preprocessing_rules.py +++ b/fedot/preprocessing/preprocessing_rules.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass from typing import Any, Iterable, Tuple from fedot.core.data.data import InputData @@ -16,6 +16,12 @@ class PreprocessorMergePlan: take_pipeline_imputers: bool +@dataclass(frozen=True) +class OptionalPreprocessingPlan: + apply_imputation: bool + apply_encoding: bool + + def resolve_source_names(data: Any, default_source_name: str) -> PreprocessingSourcePlan: if isinstance(data, InputData): return PreprocessingSourcePlan(source_names=(default_source_name,)) @@ -38,6 +44,10 @@ def resolve_main_target_source_name(current_source_name, multi_data: MultiModalD return None +def resolve_target_encoder_source_name(current_source_name, default_source_name: str) -> str: + return current_source_name if current_source_name is not None else default_source_name + + def iter_preprocessed_inputs(data: Any) -> Tuple[Any, ...]: if isinstance(data, InputData): return (data,) @@ -59,3 +69,13 @@ def build_preprocessor_merge_plan(use_auto_preprocessing: bool, take_pipeline_encoders=not bool(api_features_encoders), take_pipeline_imputers=not bool(api_features_imputers), ) + + +def build_optional_preprocessing_plan(has_missing_values: bool, + has_categorical_features: bool, + has_imputation_operation: bool, + has_encoding_operation: bool) -> OptionalPreprocessingPlan: + return OptionalPreprocessingPlan( + apply_imputation=has_missing_values and not has_imputation_operation, + apply_encoding=has_categorical_features and not has_encoding_operation, + ) diff --git a/tests/core/operations/test_operation_factory_extensions.py b/tests/core/operations/test_operation_factory_extensions.py index 6b442b8b3b..e6d2e1d5fc 100644 --- a/tests/core/operations/test_operation_factory_extensions.py +++ b/tests/core/operations/test_operation_factory_extensions.py @@ -4,7 +4,12 @@ from fedot.core.operations.factory import 
OperationFactory from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import Task, TaskTypesEnum -from fedot.extensions.contracts import ExtensionManifest, ExternalModelSpec, ModelCapabilities +from fedot.extensions.contracts import ( + ExtensionManifest, + ExternalModelSpec, + ModelCapabilities, + ModelHyperparamsSchema, +) from fedot.extensions.registry import clear_extension_registry, register_extension @@ -34,6 +39,11 @@ def _make_manifest(): data_types=(DataTypesEnum.table,), tags=('external',), ), + hyperparams_schema=ModelHyperparamsSchema( + required=(), + optional=('beta',), + defaults={'beta': 0.5}, + ), ), ), ) @@ -73,5 +83,6 @@ def test_extension_model_uses_custom_strategy_adapter_for_runtime_init(): assert strategy.operation_id == 'custom' assert implementation.was_fitted is True assert implementation.params.get('alpha') == 2.0 + assert implementation.params.get('beta') == 0.5 finally: clear_extension_registry() diff --git a/tests/extensions/test_parameter_rules.py b/tests/extensions/test_parameter_rules.py new file mode 100644 index 0000000000..1f1b2ac275 --- /dev/null +++ b/tests/extensions/test_parameter_rules.py @@ -0,0 +1,63 @@ +from pymonad.either import Left, Right + +from fedot.extensions.contracts import ExternalModelSpec, ModelCapabilities, ModelHyperparamsSchema +from fedot.extensions.parameter_rules import ( + apply_extension_defaults, + extract_factory_params, + find_missing_required_params, + normalize_extension_user_params, + resolve_extension_params, +) +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import TaskTypesEnum + + + +def _make_model_spec(): + return ExternalModelSpec( + name='external_with_schema', + factory=lambda params=None: object(), + capabilities=ModelCapabilities( + tasks=(TaskTypesEnum.regression,), + data_types=(DataTypesEnum.table,), + tags=('external',), + ), + hyperparams_schema=ModelHyperparamsSchema( + required=('alpha',), 
+ optional=('beta',), + defaults={'beta': 0.5}, + ), + ) + + + +def test_extension_parameter_rules_apply_defaults_and_filter_runtime_keys(): + normalized = normalize_extension_user_params({'alpha': 1.0}) + with_defaults = apply_extension_defaults({'beta': 0.5}, normalized) + factory_params = extract_factory_params({ + **with_defaults, + '_extension_output_mode': 'labels', + 'model_fit': object(), + 'model_predict': object(), + }) + + assert with_defaults == {'beta': 0.5, 'alpha': 1.0} + assert factory_params == {'beta': 0.5, 'alpha': 1.0} + + + +def test_extension_parameter_rules_detect_missing_required_params(): + missing = find_missing_required_params(('alpha', 'gamma'), {'alpha': 1.0}) + resolution = resolve_extension_params(_make_model_spec(), {'beta': 1.5}) + + assert missing == ('gamma',) + assert resolution.__class__ is Left + assert resolution.value.details['required'] == ['alpha'] + + + +def test_extension_parameter_rules_return_resolved_params_when_schema_is_satisfied(): + resolution = resolve_extension_params(_make_model_spec(), {'alpha': 1.0}) + + assert resolution.__class__ is Right + assert resolution.value == {'beta': 0.5, 'alpha': 1.0} diff --git a/tests/extensions/test_runtime_rules.py b/tests/extensions/test_runtime_rules.py index 6aa6630869..8458961f84 100644 --- a/tests/extensions/test_runtime_rules.py +++ b/tests/extensions/test_runtime_rules.py @@ -1,13 +1,21 @@ -import numpy as np +import numpy as np + +from pymonad.either import Left from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import TaskTypesEnum -from fedot.extensions.contracts import ExtensionManifest, ExternalModelSpec, ModelCapabilities +from fedot.extensions.contracts import ( + ExtensionManifest, + ExternalModelSpec, + ModelCapabilities, + ModelHyperparamsSchema, +) from fedot.extensions.registry import clear_extension_registry, register_extension from fedot.extensions.runtime_rules import ( build_extension_strategy_params, 
get_extension_model_spec, is_extension_operation_name, + try_build_extension_strategy_params, ) @@ -24,6 +32,7 @@ def predict(self, features): return np.zeros(features.shape[0]) + def _make_manifest(): return ExtensionManifest( name='runtime_extension', @@ -37,11 +46,17 @@ def _make_manifest(): data_types=(DataTypesEnum.table,), tags=('external', 'linear'), ), + hyperparams_schema=ModelHyperparamsSchema( + required=('alpha',), + optional=('beta',), + defaults={'beta': 0.5}, + ), ), ), ) + def test_runtime_rules_resolve_registered_extension_model_and_build_strategy_params(): clear_extension_registry() register_extension(_make_manifest()) @@ -57,5 +72,21 @@ def test_runtime_rules_resolve_registered_extension_model_and_build_strategy_par assert callable(params['model_predict']) assert params['_extension_output_mode'] == 'labels' assert params['alpha'] == 1.0 + assert params['beta'] == 0.5 + finally: + clear_extension_registry() + + + +def test_runtime_rules_return_left_when_required_extension_params_are_missing(): + clear_extension_registry() + register_extension(_make_manifest()) + + try: + params = try_build_extension_strategy_params('external_runtime_model', {'beta': 1.5}) + + assert params.__class__ is Left + assert params.value.code == 'missing_required_hyperparams' + assert params.value.details['required'] == ['alpha'] finally: clear_extension_registry() diff --git a/tests/preprocessing/test_base_preprocessing.py b/tests/preprocessing/test_base_preprocessing.py index f7563c66d6..448a175c3a 100644 --- a/tests/preprocessing/test_base_preprocessing.py +++ b/tests/preprocessing/test_base_preprocessing.py @@ -1,4 +1,4 @@ -import numpy as np +import numpy as np from fedot.core.data.data import InputData from fedot.core.data.multi_modal import MultiModalData @@ -7,6 +7,7 @@ from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.preprocessing.base_preprocessing import BasePreprocessor from fedot.preprocessing.preprocessing import DataPreprocessor +from 
fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer class _FakePreprocessor(BasePreprocessor): @@ -47,6 +48,7 @@ def reduce_memory_size(self, data): return data + def _make_input_data(*, is_main_target=True): return InputData( idx=np.array([0, 1]), @@ -58,6 +60,22 @@ def _make_input_data(*, is_main_target=True): ) + +def _make_optional_input_data(): + data = InputData( + idx=np.array([0, 1]), + features=np.array([[1.0, 'a'], [np.nan, 'b']], dtype=object), + target=np.array([[0.0], [1.0]]), + task=Task(TaskTypesEnum.regression), + data_type=DataTypesEnum.table, + supplementary_data=SupplementaryData(), + ) + data.categorical_idx = np.array([1]) + data.numerical_idx = np.array([0]) + return data + + + def test_mark_as_preprocessed_marks_unimodal_and_multimodal_inputs(): input_data = _make_input_data() multi_data = MultiModalData({'main': _make_input_data(), 'side': _make_input_data(is_main_target=False)}) @@ -70,6 +88,7 @@ def test_mark_as_preprocessed_marks_unimodal_and_multimodal_inputs(): assert multi_data['side'].supplementary_data.optionally_preprocessed is True + def test_merge_preprocessors_uses_typed_merge_plan(): api_preprocessor = _FakePreprocessor() pipeline_preprocessor = _FakePreprocessor() @@ -86,6 +105,7 @@ def test_merge_preprocessors_uses_typed_merge_plan(): assert merged.features_imputers == pipeline_preprocessor.features_imputers + def test_data_preprocessor_initialization_uses_source_and_target_rules(): preprocessor = DataPreprocessor() multi_data = MultiModalData({ @@ -99,3 +119,30 @@ def test_data_preprocessor_initialization_uses_source_and_target_rules(): assert set(preprocessor.binary_categorical_processors.keys()) == {'main', 'side'} assert set(preprocessor.types_correctors.keys()) == {'main', 'side'} assert preprocessor.main_target_source_name == 'main' + + + +def test_prepare_optional_uses_typed_optional_plan_and_target_source_resolution(): + preprocessor = DataPreprocessor() + data = 
_make_optional_input_data() + applied_steps = [] + + original_check_structure = PipelineStructureExplorer.check_structure_by_tag + preprocessor._apply_imputation_unidata = lambda current_data, source_name: applied_steps.append( + ('imputation', source_name) + ) or current_data + preprocessor._apply_categorical_encoding = lambda current_data, source_name: applied_steps.append( + ('encoding', source_name) + ) or current_data + PipelineStructureExplorer.check_structure_by_tag = staticmethod( + lambda pipeline, tag_to_check, source_name: tag_to_check == 'imputation' + ) + + try: + preprocessor._prepare_optional(object(), data, DEFAULT_SOURCE_NAME) + preprocessor.main_target_source_name = None + assert preprocessor._determine_target_converter() == DEFAULT_SOURCE_NAME + finally: + PipelineStructureExplorer.check_structure_by_tag = original_check_structure + + assert applied_steps == [('encoding', DEFAULT_SOURCE_NAME)] diff --git a/tests/preprocessing/test_preprocessing_rules.py b/tests/preprocessing/test_preprocessing_rules.py index 3592df5702..fc0bcd0068 100644 --- a/tests/preprocessing/test_preprocessing_rules.py +++ b/tests/preprocessing/test_preprocessing_rules.py @@ -1,4 +1,4 @@ -import numpy as np +import numpy as np import pytest from fedot.core.data.data import InputData @@ -7,15 +7,18 @@ from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.preprocessing.preprocessing_rules import ( + build_optional_preprocessing_plan, build_preprocessor_merge_plan, iter_preprocessed_inputs, resolve_main_target_source_name, resolve_source_names, + resolve_target_encoder_source_name, should_initialize_source_helpers, ) from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME + def _make_input_data(*, is_main_target=True): return InputData( idx=np.array([0, 1]), @@ -27,6 +30,7 @@ def _make_input_data(*, is_main_target=True): ) + def test_resolve_source_names_handles_unimodal_and_multimodal(): 
unimodal_plan = resolve_source_names(_make_input_data(), DEFAULT_SOURCE_NAME) multimodal_plan = resolve_source_names( @@ -38,17 +42,20 @@ def test_resolve_source_names_handles_unimodal_and_multimodal(): assert multimodal_plan.source_names == ('left', 'right') + def test_resolve_source_names_rejects_unknown_data_type(): with pytest.raises(ValueError, match='Unknown type of data'): resolve_source_names(object(), DEFAULT_SOURCE_NAME) + def test_should_initialize_source_helpers_reflects_existing_state(): assert should_initialize_source_helpers(False, False) is True assert should_initialize_source_helpers(True, False) is True assert should_initialize_source_helpers(True, True) is False + def test_resolve_main_target_source_name_prefers_existing_then_detects_main_branch(): multi_data = MultiModalData({ 'main': _make_input_data(is_main_target=True), @@ -59,6 +66,7 @@ def test_resolve_main_target_source_name_prefers_existing_then_detects_main_bran assert resolve_main_target_source_name(None, multi_data) == 'main' + def test_iter_preprocessed_inputs_and_merge_plan_are_deterministic(): input_data = _make_input_data() multi_data = MultiModalData({'main': input_data, 'side': _make_input_data(is_main_target=False)}) @@ -73,3 +81,18 @@ def test_iter_preprocessed_inputs_and_merge_plan_are_deterministic(): assert auto_plan.take_pipeline_imputers is False assert manual_plan.take_pipeline_encoders is True assert manual_plan.take_pipeline_imputers is True + + + +def test_build_optional_preprocessing_plan_and_target_source_resolution_are_explicit(): + optional_plan = build_optional_preprocessing_plan( + has_missing_values=True, + has_categorical_features=True, + has_imputation_operation=False, + has_encoding_operation=True, + ) + + assert optional_plan.apply_imputation is True + assert optional_plan.apply_encoding is False + assert resolve_target_encoder_source_name(None, DEFAULT_SOURCE_NAME) == DEFAULT_SOURCE_NAME + assert resolve_target_encoder_source_name('main', 
DEFAULT_SOURCE_NAME) == 'main' From 0ef22320dae4c04326e2ff3897029ef1966eca05 Mon Sep 17 00:00:00 2001 From: v1docq Date: Fri, 13 Mar 2026 13:24:17 +0300 Subject: [PATCH 20/32] extract pipeline preprocess and postprocess rules --- fedot/api/api_utils/api_service_rules.py | 41 +++++++++++ fedot/api/main.py | 37 ++++++---- fedot/core/pipelines/pipeline.py | 34 +++++---- fedot/core/pipelines/pipeline_rules.py | 31 +++++++++ tests/api/api_utils/test_api_service_rules.py | 51 ++++++++++++++ tests/api/test_main.py | 69 +++++++++++++++++++ tests/core/pipelines/__init__.py | 1 + tests/core/pipelines/test_pipeline.py | 68 ++++++++++++++++++ tests/core/pipelines/test_pipeline_rules.py | 27 ++++++++ 9 files changed, 334 insertions(+), 25 deletions(-) create mode 100644 fedot/api/api_utils/api_service_rules.py create mode 100644 fedot/core/pipelines/pipeline_rules.py create mode 100644 tests/api/api_utils/test_api_service_rules.py create mode 100644 tests/api/test_main.py create mode 100644 tests/core/pipelines/__init__.py create mode 100644 tests/core/pipelines/test_pipeline.py create mode 100644 tests/core/pipelines/test_pipeline_rules.py diff --git a/fedot/api/api_utils/api_service_rules.py b/fedot/api/api_utils/api_service_rules.py new file mode 100644 index 0000000000..e1fee6a0cc --- /dev/null +++ b/fedot/api/api_utils/api_service_rules.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass +from typing import Any, Optional + + +@dataclass(frozen=True) +class TuneExecutionPlan: + input_data: Any + cv_folds: Optional[int] + n_jobs: int + metric: Any + + + +def build_tune_execution_plan(input_data: Any, + train_data: Any, + requested_cv_folds: Optional[int], + default_cv_folds: Optional[int], + requested_n_jobs: Optional[int], + default_n_jobs: int, + requested_metric: Any, + default_metric: Any) -> TuneExecutionPlan: + resolved_input_data = train_data if input_data is None else input_data + resolved_cv_folds = default_cv_folds if requested_cv_folds is None else 
requested_cv_folds + resolved_n_jobs = default_n_jobs if requested_n_jobs is None else requested_n_jobs + resolved_metric = default_metric if requested_metric is None else requested_metric + return TuneExecutionPlan( + input_data=resolved_input_data, + cv_folds=resolved_cv_folds, + n_jobs=resolved_n_jobs, + metric=resolved_metric, + ) + + + +def resolve_predict_proba_mode(probs_for_all_classes: bool) -> str: + return 'full_probs' if probs_for_all_classes else 'probs' + + + +def resolve_forecast_horizon(requested_horizon: Optional[int], forecast_length: int) -> int: + return forecast_length if requested_horizon is None else requested_horizon diff --git a/fedot/api/main.py b/fedot/api/main.py index b6dcf0f782..042f13a00a 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -13,6 +13,11 @@ from fedot.api.api_utils.api_composer import ApiComposer from fedot.api.api_utils.api_run_planner import plan_final_fit, plan_sampling_stage +from fedot.api.api_utils.api_service_rules import ( + build_tune_execution_plan, + resolve_forecast_horizon, + resolve_predict_proba_mode, +) from fedot.api.api_utils.api_data import ApiDataProcessor from fedot.api.api_utils.data_definition import FeaturesType, TargetType from fedot.api.api_utils.input_analyser import InputAnalyser @@ -255,23 +260,29 @@ def tune(self, raise ValueError(NOT_FITTED_ERR_MSG) with fedot_composer_timer.launch_tuning('post'): - if input_data is None: - input_data = self.train_data + tune_plan = build_tune_execution_plan( + input_data=input_data, + train_data=self.train_data, + requested_cv_folds=cv_folds, + default_cv_folds=self.params.get('cv_folds'), + requested_n_jobs=n_jobs, + default_n_jobs=self.params.n_jobs, + requested_metric=metric_name, + default_metric=self.metrics[0], + ) + if input_data is not None: + tune_input_data = self.data_processor.define_data(features=tune_plan.input_data, target=target, is_predict=False) else: - input_data = self.data_processor.define_data(features=input_data, 
target=target, is_predict=False) - cv_folds = cv_folds or self.params.get('cv_folds') - n_jobs = n_jobs or self.params.n_jobs - - metric = metric_name if metric_name else self.metrics[0] + tune_input_data = tune_plan.input_data pipeline_tuner = (TunerBuilder(self.params.task) .with_tuner(SimultaneousTuner) - .with_cv_folds(cv_folds) - .with_n_jobs(n_jobs) - .with_metric(metric) + .with_cv_folds(tune_plan.cv_folds) + .with_n_jobs(tune_plan.n_jobs) + .with_metric(tune_plan.metric) .with_iterations(iterations) .with_timeout(timeout) - .build(input_data)) + .build(tune_input_data)) self.current_pipeline = pipeline_tuner.tune(self.current_pipeline, show_progress=show_progress) self.api_composer.was_tuned = pipeline_tuner.was_tuned @@ -346,7 +357,7 @@ def predict_proba(self, self.test_data = self.data_processor.define_data(target=self.target, features=features, is_predict=True) - mode = 'full_probs' if probs_for_all_classes else 'probs' + mode = resolve_predict_proba_mode(probs_for_all_classes) self.prediction = self.current_pipeline.predict(self.test_data, output_mode=mode) @@ -375,7 +386,7 @@ def forecast(self, self._check_forecast_applicable() forecast_length = self.train_data.task.task_params.forecast_length - horizon = horizon or forecast_length + horizon = resolve_forecast_horizon(horizon, forecast_length) if pre_history is None: pre_history = self.train_data pre_history.target = None diff --git a/fedot/core/pipelines/pipeline.py b/fedot/core/pipelines/pipeline.py index 9612252ed5..a3ca1162c5 100644 --- a/fedot/core/pipelines/pipeline.py +++ b/fedot/core/pipelines/pipeline.py @@ -23,6 +23,10 @@ from fedot.core.operations.data_operation import DataOperation from fedot.core.operations.model import Model from fedot.core.pipelines.node import PipelineNode +from fedot.core.pipelines.pipeline_rules import ( + build_pipeline_postprocess_plan, + build_pipeline_preprocess_plan, +) from fedot.core.pipelines.template import PipelineTemplate from fedot.core.repository.tasks 
import TaskTypesEnum from fedot.core.visualisation.pipeline_specific_visuals import PipelineVisualizer @@ -170,10 +174,12 @@ def _postprocess(self, copied_input_data: Optional[InputData], result: OutputDat Returns: OutputData: postprocessed ``result`` parameter """ + postprocess_plan = build_pipeline_postprocess_plan(output_mode, result.task.task_type) result = self.preprocessor.restore_index(copied_input_data, result) - # Prediction should be converted into source labels (if it is needed) - if output_mode == 'labels': + if postprocess_plan.should_restore_inverse_target_encoding: result.predict = self.preprocessor.apply_inverse_target_encoding(result.predict) + if postprocess_plan.should_flatten_prediction: + result.predict = result.predict.ravel() return result def fit(self, @@ -195,11 +201,15 @@ def fit(self, """ self.replace_n_jobs_in_nodes(n_jobs) - if isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed: - copied_input_data = deepcopy(input_data) - else: + preprocess_plan = build_pipeline_preprocess_plan( + is_fit_stage=True, + is_input_auto_preprocessed=isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed, + ) + if preprocess_plan.should_preprocess: with fedot_composer_timer.launch_preprocessing(): copied_input_data = self._preprocess(input_data) + else: + copied_input_data = deepcopy(input_data) copied_input_data = self._assign_data_to_nodes(copied_input_data) @@ -306,19 +316,19 @@ def predict(self, self.log.error(ex) raise ValueError(ex) - if isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed: - copied_input_data = deepcopy(input_data) - else: - # Make copy of the input data to avoid performing inplace operations + preprocess_plan = build_pipeline_preprocess_plan( + is_fit_stage=False, + is_input_auto_preprocessed=isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed, + ) + if preprocess_plan.should_preprocess: 
copied_input_data = self._preprocess(input_data, is_fit_stage=False) + else: + copied_input_data = deepcopy(input_data) copied_input_data = self._assign_data_to_nodes(copied_input_data) result = self.root_node.predict(input_data=copied_input_data, output_mode=output_mode, predictions_cache=predictions_cache, fold_id=fold_id) - if input_data.task.task_type == TaskTypesEnum.ts_forecasting: - result.predict = result.predict.ravel() - result = self._postprocess(copied_input_data, result, output_mode) return result diff --git a/fedot/core/pipelines/pipeline_rules.py b/fedot/core/pipelines/pipeline_rules.py new file mode 100644 index 0000000000..23ec0f6c7b --- /dev/null +++ b/fedot/core/pipelines/pipeline_rules.py @@ -0,0 +1,31 @@ +from dataclasses import dataclass + +from fedot.core.repository.tasks import TaskTypesEnum + + +@dataclass(frozen=True) +class PipelinePreprocessPlan: + should_preprocess: bool + should_update_time_series_indices: bool + + +@dataclass(frozen=True) +class PipelinePostprocessPlan: + should_restore_inverse_target_encoding: bool + should_flatten_prediction: bool + + + +def build_pipeline_preprocess_plan(is_fit_stage: bool, is_input_auto_preprocessed: bool) -> PipelinePreprocessPlan: + return PipelinePreprocessPlan( + should_preprocess=not is_input_auto_preprocessed, + should_update_time_series_indices=not is_fit_stage, + ) + + + +def build_pipeline_postprocess_plan(output_mode: str, task_type: TaskTypesEnum) -> PipelinePostprocessPlan: + return PipelinePostprocessPlan( + should_restore_inverse_target_encoding=output_mode == 'labels', + should_flatten_prediction=task_type is TaskTypesEnum.ts_forecasting, + ) diff --git a/tests/api/api_utils/test_api_service_rules.py b/tests/api/api_utils/test_api_service_rules.py new file mode 100644 index 0000000000..61508f9c59 --- /dev/null +++ b/tests/api/api_utils/test_api_service_rules.py @@ -0,0 +1,51 @@ +from fedot.api.api_utils.api_service_rules import ( + build_tune_execution_plan, + 
resolve_forecast_horizon, + resolve_predict_proba_mode, +) + + + +def test_build_tune_execution_plan_uses_explicit_values_when_provided(): + plan = build_tune_execution_plan( + input_data='new-data', + train_data='train-data', + requested_cv_folds=5, + default_cv_folds=3, + requested_n_jobs=2, + default_n_jobs=1, + requested_metric='roc_auc', + default_metric='f1', + ) + + assert plan.input_data == 'new-data' + assert plan.cv_folds == 5 + assert plan.n_jobs == 2 + assert plan.metric == 'roc_auc' + + + +def test_build_tune_execution_plan_uses_defaults_when_values_are_missing(): + plan = build_tune_execution_plan( + input_data=None, + train_data='train-data', + requested_cv_folds=None, + default_cv_folds=3, + requested_n_jobs=None, + default_n_jobs=4, + requested_metric=None, + default_metric='f1', + ) + + assert plan.input_data == 'train-data' + assert plan.cv_folds == 3 + assert plan.n_jobs == 4 + assert plan.metric == 'f1' + + + +def test_service_rules_resolve_predict_mode_and_forecast_horizon(): + assert resolve_predict_proba_mode(False) == 'probs' + assert resolve_predict_proba_mode(True) == 'full_probs' + assert resolve_forecast_horizon(None, 12) == 12 + assert resolve_forecast_horizon(5, 12) == 5 diff --git a/tests/api/test_main.py b/tests/api/test_main.py new file mode 100644 index 0000000000..4fe9f637ac --- /dev/null +++ b/tests/api/test_main.py @@ -0,0 +1,69 @@ +import numpy as np +import pytest + +from fedot import Fedot +from fedot.core.data.data import OutputData +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import Task, TaskTypesEnum + + +class _StubPipeline: + def __init__(self): + self.calls = [] + + def predict(self, test_data, output_mode='default'): + self.calls.append(output_mode) + return OutputData( + idx=np.arange(2), + predict=np.array([[0.2, 0.8], [0.7, 0.3]]), + target=None, + task=Task(TaskTypesEnum.classification), + data_type=DataTypesEnum.table, + ) + + + +def 
test_main_facade_raises_not_fitted_errors_for_predictive_methods(): + model = Fedot(problem='classification') + + with pytest.raises(ValueError, match='Model not fitted yet'): + model.predict(features=np.array([[1.0]])) + + with pytest.raises(ValueError, match='Model not fitted yet'): + model.tune() + + with pytest.raises(ValueError, match='Model not fitted yet'): + model.get_metrics() + + with pytest.raises(ValueError, match='Model not fitted yet'): + model.return_report() + + + +def test_main_facade_predict_proba_rejects_non_classification_tasks(): + model = Fedot(problem='regression') + model.current_pipeline = object() + + with pytest.raises(ValueError, match='Probabilities of predictions are available only for classification'): + model.predict_proba(features=np.array([[1.0]])) + + + +def test_main_facade_uses_service_rule_for_predict_proba_mode_selection(): + model = Fedot(problem='classification') + model.current_pipeline = _StubPipeline() + model.target = 'target' + model.data_processor.define_data = lambda **kwargs: type('Input', (), {'task': Task(TaskTypesEnum.classification)})() + + model.predict_proba(features=np.array([[1.0], [2.0]]), probs_for_all_classes=True) + + assert model.current_pipeline.calls == ['full_probs'] + + + +def test_main_facade_forecast_requires_time_series_task(): + model = Fedot(problem='classification') + model.current_pipeline = object() + + with pytest.raises(ValueError, match='Forecasting can be used only for the time series'): + model.forecast() diff --git a/tests/core/pipelines/__init__.py b/tests/core/pipelines/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/tests/core/pipelines/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/core/pipelines/test_pipeline.py b/tests/core/pipelines/test_pipeline.py new file mode 100644 index 0000000000..a91436bac7 --- /dev/null +++ b/tests/core/pipelines/test_pipeline.py @@ -0,0 +1,68 @@ +import numpy as np + +from fedot.core.data.data import InputData, 
OutputData +from fedot.core.data.supplementary_data import SupplementaryData +from fedot.core.pipelines.pipeline import Pipeline +from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams + + +class _StubPreprocessor: + def __init__(self): + self.calls = [] + + def restore_index(self, copied_input_data, result): + self.calls.append('restore_index') + return result + + def apply_inverse_target_encoding(self, prediction): + self.calls.append('inverse_target_encoding') + return prediction + 1 + + + +def _make_ts_output(): + return OutputData( + idx=np.arange(2), + predict=np.array([[1.0], [2.0]]), + target=None, + task=Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=2)), + data_type=DataTypesEnum.ts, + ) + + + +def _make_classification_input(is_auto_preprocessed: bool): + supplementary_data = SupplementaryData(is_auto_preprocessed=is_auto_preprocessed) + return InputData( + idx=np.arange(2), + features=np.array([[1.0], [2.0]]), + target=np.array([[0.0], [1.0]]), + task=Task(TaskTypesEnum.classification), + data_type=DataTypesEnum.table, + supplementary_data=supplementary_data, + ) + + + +def test_pipeline_postprocess_uses_typed_postprocess_plan(): + pipeline = Pipeline(use_input_preprocessing=False) + pipeline.preprocessor = _StubPreprocessor() + + result = pipeline._postprocess(None, _make_ts_output(), output_mode='labels') + + assert pipeline.preprocessor.calls == ['restore_index', 'inverse_target_encoding'] + assert result.predict.tolist() == [2.0, 3.0] + + + +def test_pipeline_fit_skips_preprocessing_when_input_is_marked_auto_preprocessed(): + pipeline = Pipeline(use_input_preprocessing=False) + pipeline._preprocess = lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError('_preprocess should not be called')) + pipeline._assign_data_to_nodes = lambda data: data + pipeline._fit = lambda input_data=None, predictions_cache=None, fold_id=None: 'ok' + 
input_data = _make_classification_input(is_auto_preprocessed=True) + + result = pipeline.fit(input_data) + + assert result == 'ok' diff --git a/tests/core/pipelines/test_pipeline_rules.py b/tests/core/pipelines/test_pipeline_rules.py new file mode 100644 index 0000000000..675a71e08b --- /dev/null +++ b/tests/core/pipelines/test_pipeline_rules.py @@ -0,0 +1,27 @@ +from fedot.core.pipelines.pipeline_rules import ( + build_pipeline_postprocess_plan, + build_pipeline_preprocess_plan, +) +from fedot.core.repository.tasks import TaskTypesEnum + + + +def test_build_pipeline_preprocess_plan_handles_fit_and_predict_stages(): + fit_plan = build_pipeline_preprocess_plan(is_fit_stage=True, is_input_auto_preprocessed=False) + predict_plan = build_pipeline_preprocess_plan(is_fit_stage=False, is_input_auto_preprocessed=True) + + assert fit_plan.should_preprocess is True + assert fit_plan.should_update_time_series_indices is False + assert predict_plan.should_preprocess is False + assert predict_plan.should_update_time_series_indices is True + + + +def test_build_pipeline_postprocess_plan_handles_labels_and_ts_outputs(): + labels_plan = build_pipeline_postprocess_plan('labels', TaskTypesEnum.classification) + ts_plan = build_pipeline_postprocess_plan('default', TaskTypesEnum.ts_forecasting) + + assert labels_plan.should_restore_inverse_target_encoding is True + assert labels_plan.should_flatten_prediction is False + assert ts_plan.should_restore_inverse_target_encoding is False + assert ts_plan.should_flatten_prediction is True From 1e2a3c6597dc3054dfd1fd847b4f28e05bef6063 Mon Sep 17 00:00:00 2001 From: v1docq Date: Fri, 13 Mar 2026 13:34:11 +0300 Subject: [PATCH 21/32] extract pipeline node parameter normalization rules --- fedot/core/pipelines/node.py | 23 +++++------- fedot/core/pipelines/pipeline_node_rules.py | 27 ++++++++++++++ tests/core/pipelines/test_node.py | 37 +++++++++++++++++++ .../pipelines/test_pipeline_node_rules.py | 27 ++++++++++++++ 4 files changed, 101 
insertions(+), 13 deletions(-) create mode 100644 fedot/core/pipelines/pipeline_node_rules.py create mode 100644 tests/core/pipelines/test_node.py create mode 100644 tests/core/pipelines/test_pipeline_node_rules.py diff --git a/fedot/core/pipelines/node.py b/fedot/core/pipelines/node.py index eeb1395048..c5cd71ac80 100644 --- a/fedot/core/pipelines/node.py +++ b/fedot/core/pipelines/node.py @@ -15,6 +15,11 @@ from fedot.core.operations.factory import OperationFactory from fedot.core.operations.operation import Operation from fedot.core.operations.operation_parameters import OperationParameters +from fedot.core.pipelines.pipeline_node_rules import ( + merge_node_parameters, + normalize_node_parameters, + should_update_node_parameters, +) from fedot.core.repository.operation_types_repository import OperationTypesRepository from fedot.core.utils import DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL @@ -124,7 +129,7 @@ def update_params(self): """Updates :attr:`custom_params` with changed parameters""" new_params = self.fitted_operation.get_params() changed_parameters = new_params.changed_parameters - updated_parameters = {**self.parameters, **changed_parameters} + updated_parameters = merge_node_parameters(self.parameters, changed_parameters) self.parameters = updated_parameters @property @@ -218,9 +223,7 @@ def fit(self, descriptive_id=self.descriptive_id) # Update parameters after operation fitting (they can be corrected) - not_atomized_operation = 'atomized' not in self.operation.operation_type - - if not_atomized_operation and 'correct_params' in self.operation.metadata.tags: + if should_update_node_parameters(self.operation.operation_type, self.operation.metadata.tags): self.update_params() return operation_predict @@ -357,15 +360,9 @@ def parameters(self, params: dict): Args: params: new parameters to be placed instead of existing """ - if params is not None: - # The check for "default_params" is needed for backward compatibility. 
- if params == DEFAULT_PARAMS_STUB: - params = {} - # take nested params if they appeared (mostly used for tuning) - if NESTED_PARAMS_LABEL in params: - params = params[NESTED_PARAMS_LABEL] - self._parameters = OperationParameters.from_operation_type(self.operation.operation_type, **params) - self.content['params'] = self._parameters.to_dict() + normalized_params = normalize_node_parameters(params, DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL) + self._parameters = OperationParameters.from_operation_type(self.operation.operation_type, **normalized_params) + self.content['params'] = self._parameters.to_dict() def __str__(self) -> str: """Returns ``str`` representation of the node diff --git a/fedot/core/pipelines/pipeline_node_rules.py b/fedot/core/pipelines/pipeline_node_rules.py new file mode 100644 index 0000000000..df17d19771 --- /dev/null +++ b/fedot/core/pipelines/pipeline_node_rules.py @@ -0,0 +1,27 @@ +from typing import Dict, Iterable, Optional + + + +def normalize_node_parameters(params: Optional[dict], default_params_stub, nested_params_label: str) -> Dict: + if params is None: + return {} + if params == default_params_stub: + return {} + if nested_params_label in params: + return dict(params[nested_params_label]) + return dict(params) + + + +def merge_node_parameters(current_parameters: Optional[dict], changed_parameters: Optional[dict]) -> Dict: + return { + **dict(current_parameters or {}), + **dict(changed_parameters or {}), + } + + + +def should_update_node_parameters(operation_type: str, operation_tags: Optional[Iterable[str]]) -> bool: + if 'atomized' in operation_type: + return False + return 'correct_params' in set(operation_tags or ()) diff --git a/tests/core/pipelines/test_node.py b/tests/core/pipelines/test_node.py new file mode 100644 index 0000000000..6c0b1f30de --- /dev/null +++ b/tests/core/pipelines/test_node.py @@ -0,0 +1,37 @@ +from fedot.core.operations.operation_parameters import OperationParameters +from fedot.core.pipelines.node import 
PipelineNode +from fedot.core.utils import DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL + + +class _FittedOperationWithParams: + def __init__(self, params): + self._params = params + + def get_params(self): + return self._params + + + +def test_pipeline_node_parameters_setter_normalizes_default_and_nested_params(): + default_node = PipelineNode(operation_type='ridge') + nested_node = PipelineNode(operation_type='ridge') + + default_node.parameters = DEFAULT_PARAMS_STUB + nested_node.parameters = {NESTED_PARAMS_LABEL: {'alpha': 1.0}} + + assert default_node.parameters == {} + assert nested_node.parameters['alpha'] == 1.0 + + + +def test_pipeline_node_update_params_uses_typed_merge_rule(): + node = PipelineNode(operation_type='ridge') + node.parameters = {'alpha': 1.0} + fitted_params = OperationParameters(alpha=1.0) + fitted_params.update(beta=2.0) + node.fitted_operation = _FittedOperationWithParams(fitted_params) + + node.update_params() + + assert node.parameters['alpha'] == 1.0 + assert node.parameters['beta'] == 2.0 diff --git a/tests/core/pipelines/test_pipeline_node_rules.py b/tests/core/pipelines/test_pipeline_node_rules.py new file mode 100644 index 0000000000..91418b733b --- /dev/null +++ b/tests/core/pipelines/test_pipeline_node_rules.py @@ -0,0 +1,27 @@ +from fedot.core.pipelines.pipeline_node_rules import ( + merge_node_parameters, + normalize_node_parameters, + should_update_node_parameters, +) +from fedot.core.utils import DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL + + + +def test_normalize_node_parameters_handles_default_stub_and_nested_params(): + assert normalize_node_parameters(DEFAULT_PARAMS_STUB, DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL) == {} + assert normalize_node_parameters( + {NESTED_PARAMS_LABEL: {'alpha': 1.0}}, + DEFAULT_PARAMS_STUB, + NESTED_PARAMS_LABEL, + ) == {'alpha': 1.0} + assert normalize_node_parameters({'beta': 2.0}, DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL) == {'beta': 2.0} + + + +def 
test_merge_node_parameters_and_update_rule_are_explicit(): + merged = merge_node_parameters({'alpha': 1.0}, {'beta': 2.0}) + + assert merged == {'alpha': 1.0, 'beta': 2.0} + assert should_update_node_parameters('ridge', ['correct_params']) is True + assert should_update_node_parameters('atomized_operation', ['correct_params']) is False + assert should_update_node_parameters('ridge', ['linear']) is False From 45edef5d3f41924456429773309e84b22eb9f15f Mon Sep 17 00:00:00 2001 From: v1docq Date: Fri, 13 Mar 2026 13:40:06 +0300 Subject: [PATCH 22/32] extract operation parameter normalization and change tracking rules --- .../operations/operation_parameter_rules.py | 25 +++++++++++ fedot/core/operations/operation_parameters.py | 21 +++++---- .../test_operation_parameter_rules.py | 25 +++++++++++ .../operations/test_operation_parameters.py | 43 +++++++++++++++++++ 4 files changed, 105 insertions(+), 9 deletions(-) create mode 100644 fedot/core/operations/operation_parameter_rules.py create mode 100644 tests/core/operations/test_operation_parameter_rules.py create mode 100644 tests/core/operations/test_operation_parameters.py diff --git a/fedot/core/operations/operation_parameter_rules.py b/fedot/core/operations/operation_parameter_rules.py new file mode 100644 index 0000000000..b427819abc --- /dev/null +++ b/fedot/core/operations/operation_parameter_rules.py @@ -0,0 +1,25 @@ +from typing import Dict, Iterable, Tuple + + + +def merge_operation_default_params(default_parameters: Dict, passed_parameters: Dict) -> Dict: + return { + **dict(default_parameters or {}), + **dict(passed_parameters or {}), + } + + + +def collect_changed_keys(current_parameters: Dict, updated_parameters: Dict, existing_changed_keys: Iterable[str]) -> Tuple[str, ...]: + changed_keys = list(existing_changed_keys) + for key, value in updated_parameters.items(): + if key not in changed_keys and current_parameters.get(key) != value: + changed_keys.append(key) + return tuple(changed_keys) + + + +def 
resolve_setdefault_value(current_parameters: Dict, key, value): + if key in current_parameters: + return current_parameters[key], False + return value, True diff --git a/fedot/core/operations/operation_parameters.py b/fedot/core/operations/operation_parameters.py index 1936c965ce..4f7efd6ee2 100644 --- a/fedot/core/operations/operation_parameters.py +++ b/fedot/core/operations/operation_parameters.py @@ -1,6 +1,11 @@ from copy import deepcopy from typing import Iterable +from fedot.core.operations.operation_parameter_rules import ( + collect_changed_keys, + merge_operation_default_params, + resolve_setdefault_value, +) from fedot.core.repository.default_params_repository import DefaultOperationParamsRepository @@ -28,23 +33,21 @@ def __bool__(self): @staticmethod def from_operation_type(operation_type: str, **parameters): default_parameters = get_default_params(operation_type) - parameters = {**default_parameters, **parameters} + parameters = merge_operation_default_params(default_parameters, parameters) return OperationParameters(**parameters) def update(self, **params): - for key, value in params.items(): - if key not in self._changed_keys: - if self._parameters.get(key) != value: - self._changed_keys.append(key) - self._parameters.update({key: value}) + self._changed_keys = list(collect_changed_keys(self._parameters, params, self._changed_keys)) + self._parameters.update(params) def get(self, key, default_value=None): return self._parameters.get(key, default_value) def setdefault(self, key, value): - if key not in self._parameters.keys(): - self.update(**{key: value}) - return self.get(key) + resolved_value, should_update = resolve_setdefault_value(self._parameters, key, value) + if should_update: + self.update(**{key: resolved_value}) + return resolved_value def to_dict(self) -> dict: return deepcopy(self._parameters) diff --git a/tests/core/operations/test_operation_parameter_rules.py b/tests/core/operations/test_operation_parameter_rules.py new file mode 
100644 index 0000000000..c5b2ab133e --- /dev/null +++ b/tests/core/operations/test_operation_parameter_rules.py @@ -0,0 +1,25 @@ +from fedot.core.operations.operation_parameter_rules import ( + collect_changed_keys, + merge_operation_default_params, + resolve_setdefault_value, +) + + + +def test_operation_parameter_rules_merge_defaults_and_track_changes(): + merged = merge_operation_default_params({'a': 1, 'b': 2}, {'b': 3, 'c': 4}) + changed_keys = collect_changed_keys({'a': 1, 'b': 2}, {'a': 1, 'b': 3, 'd': 4}, ()) + + assert merged == {'a': 1, 'b': 3, 'c': 4} + assert changed_keys == ('b', 'd') + + + +def test_operation_parameter_rules_resolve_setdefault_value_explicitly(): + existing_value, should_update_existing = resolve_setdefault_value({'a': 1}, 'a', 2) + missing_value, should_update_missing = resolve_setdefault_value({'a': 1}, 'b', 3) + + assert existing_value == 1 + assert should_update_existing is False + assert missing_value == 3 + assert should_update_missing is True diff --git a/tests/core/operations/test_operation_parameters.py b/tests/core/operations/test_operation_parameters.py new file mode 100644 index 0000000000..87c06eaf12 --- /dev/null +++ b/tests/core/operations/test_operation_parameters.py @@ -0,0 +1,43 @@ +from fedot.core.operations.operation_parameters import OperationParameters, get_default_params + + + +def test_params_keeper_update(): + params = {'a': 1, 'b': 2, 'c': 3} + keeper = OperationParameters(**params) + new_params = {'a': 1, 'b': 3, 'd': 4} + keeper.update(**new_params) + expected_params = {'a': 1, 'b': 3, 'c': 3, 'd': 4} + actual_params = keeper.to_dict() + changed_params = keeper.changed_parameters.keys() + assert actual_params == expected_params + assert 'a' not in changed_params + assert 'b' in changed_params + assert 'd' in changed_params + + + +def test_params_keeper_get(): + params = {'a': 1, 'b': 2, 'c': 3} + keeper = OperationParameters(**params) + a = keeper.get('a') + b = keeper.get('b', -1) + d = keeper.get('d', 5) 
+ assert a == 1 + assert b == 2 + assert d == 5 + + + +def test_params_keeper_setdefault_and_defaults_from_repository(): + keeper = OperationParameters(alpha=1.0) + existing_value = keeper.setdefault('alpha', 2.0) + missing_value = keeper.setdefault('beta', 3.0) + default_params = get_default_params('ridge') + merged_keeper = OperationParameters.from_operation_type('ridge', alpha=0.75) + + assert existing_value == 1.0 + assert missing_value == 3.0 + assert keeper.get('beta') == 3.0 + assert merged_keeper.get('alpha') == 0.75 + assert set(default_params).issubset(set(merged_keeper.to_dict())) From fdab6c0430068bb37273fb4e4a0c90544e04d2e0 Mon Sep 17 00:00:00 2001 From: v1docq Date: Fri, 13 Mar 2026 13:48:21 +0300 Subject: [PATCH 23/32] `Refactor OOP shells to typed pure-core rules and add first mirrored tests slice` --- docs/dev/fp_refactoring_pr1_slice.md | 53 ++++++++++++++++++++++++++++ tests/README.md | 25 +++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 docs/dev/fp_refactoring_pr1_slice.md create mode 100644 tests/README.md diff --git a/docs/dev/fp_refactoring_pr1_slice.md b/docs/dev/fp_refactoring_pr1_slice.md new file mode 100644 index 0000000000..b00053f7e2 --- /dev/null +++ b/docs/dev/fp_refactoring_pr1_slice.md @@ -0,0 +1,53 @@ +# Первый PR: OOP Shell over Typed Pure Core + +## В чем идея PR + +В этом ПР сделана первая последовательная вертикальная часть плана рефакторинга: +оставлен общедоступный API ООП и объекты ядра, +логику принятия решений вынесена в чистые функции, +валидацию и нормализацию параметров так же. 
+
+## Что поменялось
+
+- `fedot/extensions`
+  - extension contract
+  - registry
+  - operation discovery bridge
+  - runtime adapter
+  - typed extension parameter resolution
+- `fedot/remote`
+  - safe typed pipeline config parsing without `eval`
+- `fedot/api`
+  - typed run/service planning rules
+  - extracted params/defaulting/recommendation/preset/assumption rules
+  - `Fedot` facade still preserved as OOP shell
+- `fedot/preprocessing`
+  - source, merge and optional-preprocessing planning rules
+- `fedot/core/repository`
+  - typed operation query and pipeline operation split rules
+- `fedot/core/pipelines`
+  - pipeline preprocess/postprocess rules
+  - pipeline node parameter normalization rules
+- `fedot/core/operations`
+  - operation parameter normalization/change-tracking rules
+- `tests/`
+  - mirrored tree for `api`, `core`, `extensions`, `preprocessing`, `remote`
+
+## Архитектурный эффект
+
+- Зоны влияния ООП остаются на месте.
+- Скрытая логика ветвления и нормализации перенесена в небольшие чистые функции.
+- Ожидаемые сбои на новых границах представлены более явно.
+- Интеграция с внешней моделью больше не зависит от редактирования нескольких внутренних конфигураций.
+
+## Что намеренно не было сделано в этом PR
+
+- рефактор индастриала
+- рефактор CI
+- работа над моделями и методами для фичей
+
+## В каком порядке смотреть
+1. extension contract and runtime bridge
+2. remote config safety changes
+3. api/core/preprocessing pure-rule extractions
+4. mirrored tests structure and new `pytest` markers
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000000..44b2b75af0
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,25 @@
+# Tests Layout
+
+The target test layout mirrors the production package layout under `fedot/`.
+ +Examples: +- `fedot/api/main.py` -> `tests/api/test_main.py` +- `fedot/core/data/...` -> `tests/core/data/...` +- `fedot/core/pipelines/...` -> `tests/core/pipelines/...` +- `fedot/extensions/...` -> `tests/extensions/...` + +## Rules + +- Prefer `tests/` for all new and migrated tests. +- Keep `test/` as a temporary legacy location during the migration window only. +- Express test kind via pytest markers, not by directory name. +- Use `@pytest.mark.unit` for pure rules and narrow OOP-shell contracts. +- Use `@pytest.mark.integration` for subsystem and end-to-end behaviour. +- Use `@pytest.mark.property` for invariant and determinism checks. +- Use `@pytest.mark.slow` only when the scenario is materially expensive. + +## Migration strategy + +- Add new tests to `tests/` first. +- Mirror legacy coverage cluster-by-cluster instead of one large move. +- Remove legacy `test/` copies only after the mirrored path is stable in CI. From bdb1bad2c20a023d923ce0910da89fce074c7abd Mon Sep 17 00:00:00 2001 From: Lopa10ko Date: Thu, 19 Mar 2026 14:47:29 +0300 Subject: [PATCH 24/32] chore: add setuptools pkg_resources libs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 07524fe92e..129fca5721 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,6 +32,7 @@ pyvis==0.2.1 seaborn>=0.9.0 # Misc +setuptools<81 # hyperopt imports pkg_resources removed in setuptools>=81 func_timeout==4.3.5 joblib>=0.17.0 requests>=2.0 From fd4c5b8a6cbdeea5c5d9c4056af30b739b828e6c Mon Sep 17 00:00:00 2001 From: Lopa10ko Date: Thu, 19 Mar 2026 15:03:09 +0300 Subject: [PATCH 25/32] fix: change repo kinds enum values to lowercase --- fedot/api/api_utils/assumptions/assumption_rules.py | 6 +++--- fedot/extensions/operation_rules.py | 4 ++-- tests/extensions/test_operation_rules.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fedot/api/api_utils/assumptions/assumption_rules.py 
b/fedot/api/api_utils/assumptions/assumption_rules.py index 1a3ae4b6f3..05e1ec4a74 100644 --- a/fedot/api/api_utils/assumptions/assumption_rules.py +++ b/fedot/api/api_utils/assumptions/assumption_rules.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass from typing import Iterable, Optional, Sequence, Tuple from fedot.core.constants import AUTO_PRESET_NAME, BEST_QUALITY_PRESET_NAME @@ -32,8 +32,8 @@ class PresetSpec: def default_repository_name_for_data(data) -> str: if data.data_type == DataTypesEnum.multi_ts: - return RepositoryKind.ALL.value - return RepositoryKind.MODEL.value + return RepositoryKind.all.value + return RepositoryKind.model.value def required_operations_for_data(data, data_type: DataTypesEnum) -> Tuple[str, ...]: diff --git a/fedot/extensions/operation_rules.py b/fedot/extensions/operation_rules.py index 5a3093e8a4..e819a94d83 100644 --- a/fedot/extensions/operation_rules.py +++ b/fedot/extensions/operation_rules.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass from typing import Iterable, Optional, Sequence, Tuple from fedot.core.repository.dataset_types import DataTypesEnum @@ -16,7 +16,7 @@ class ExtensionOperationView: def should_include_extensions(repository_kind: RepositoryKind) -> bool: - return repository_kind in (RepositoryKind.MODEL, RepositoryKind.ALL) + return repository_kind in (RepositoryKind.model, RepositoryKind.all) def get_extension_operation_views() -> Tuple[ExtensionOperationView, ...]: diff --git a/tests/extensions/test_operation_rules.py b/tests/extensions/test_operation_rules.py index 3d1ec47d29..5bcb83183d 100644 --- a/tests/extensions/test_operation_rules.py +++ b/tests/extensions/test_operation_rules.py @@ -1,4 +1,4 @@ -from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.operation_query import RepositoryKind from fedot.core.repository.tasks 
import TaskTypesEnum from fedot.extensions.contracts import ExtensionManifest, ExternalModelSpec, ModelCapabilities @@ -33,8 +33,8 @@ def test_extension_operation_rules_filter_registered_models(): register_extension(_make_manifest()) try: - assert should_include_extensions(RepositoryKind.MODEL) is True - assert should_include_extensions(RepositoryKind.DATA_OPERATION) is False + assert should_include_extensions(RepositoryKind.model) is True + assert should_include_extensions(RepositoryKind.data_operation) is False views = filter_extension_operation_views( task_type=TaskTypesEnum.classification, From dbc1bd74a33734d1bc7ee17a143068899350171c Mon Sep 17 00:00:00 2001 From: Lopa10ko Date: Thu, 19 Mar 2026 15:06:29 +0300 Subject: [PATCH 26/32] fix: change the order of using best preset name in presets parsing --- fedot/api/api_utils/assumptions/assumption_rules.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fedot/api/api_utils/assumptions/assumption_rules.py b/fedot/api/api_utils/assumptions/assumption_rules.py index 05e1ec4a74..293e802746 100644 --- a/fedot/api/api_utils/assumptions/assumption_rules.py +++ b/fedot/api/api_utils/assumptions/assumption_rules.py @@ -75,14 +75,14 @@ def parse_preset_spec(preset_name: Optional[str]) -> PresetSpec: use_stable = 'stable' in requested_preset use_gpu = 'gpu' in requested_preset - if use_stable: - base_preset = BEST_QUALITY_PRESET_NAME - if '*' in base_preset: base_name, suffix = base_preset.split('*', 1) base_preset = base_name modification = f'*{suffix}' + if use_stable: + base_preset = BEST_QUALITY_PRESET_NAME + return PresetSpec( requested_preset=requested_preset, base_preset=base_preset, From e9156d10aae2aea32daafd65db034aba8d40cef2 Mon Sep 17 00:00:00 2001 From: Lopa10ko Date: Thu, 19 Mar 2026 15:16:01 +0300 Subject: [PATCH 27/32] fix: add proper chained exception thread in assumptions fit stage --- .../assumptions/assumptions_handler.py | 11 ++++++----- 
.../assumptions/assumptions_handler_rules.py | 4 +++- .../assumptions/test_assumptions_handler.py | 19 ++++++++++++++++++- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/fedot/api/api_utils/assumptions/assumptions_handler.py b/fedot/api/api_utils/assumptions/assumptions_handler.py index 1b2d3665ba..cb55836412 100644 --- a/fedot/api/api_utils/assumptions/assumptions_handler.py +++ b/fedot/api/api_utils/assumptions/assumptions_handler.py @@ -1,4 +1,3 @@ -import traceback from typing import List, Optional, Union from golem.core.log import default_log @@ -75,7 +74,8 @@ def fit_assumption_and_check_correctness(self, eval_n_jobs=eval_n_jobs, ) if fit_result.is_left(): - self._raise_evaluating_exception(fit_result.value) + fit_error = fit_result.monoid[0] if getattr(fit_result, 'monoid', None) else fit_result.value + self._raise_evaluating_exception(fit_error) return fit_result.value def try_fit_assumption(self, @@ -103,12 +103,13 @@ def try_fit_assumption(self, except Exception as ex: fit_error = build_assumption_fit_error(ex) - self.log.info(f'Initial pipeline fit was failed due to: {fit_error.cause}.') - print(traceback.format_exc()) + self.log.exception(f'Initial pipeline fit was failed due to: {fit_error.cause}.') return Left(fit_error) def _raise_evaluating_exception(self, fit_error): - raise ValueError(fit_error.message) + message = getattr(fit_error, 'message', str(fit_error)) + original_error = getattr(fit_error, 'exception', None) + raise ValueError(message) from original_error def propose_preset(self, preset: Union[str, None], timer: ApiTime, n_jobs: int) -> str: """ diff --git a/fedot/api/api_utils/assumptions/assumptions_handler_rules.py b/fedot/api/api_utils/assumptions/assumptions_handler_rules.py index 186e0c2b69..e1834ddeb0 100644 --- a/fedot/api/api_utils/assumptions/assumptions_handler_rules.py +++ b/fedot/api/api_utils/assumptions/assumptions_handler_rules.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import 
dataclass from typing import Callable, List, Optional, Sequence, Union from fedot.api.time import ApiTime @@ -10,6 +10,7 @@ class AssumptionFitError: code: str message: str cause: str + exception: Optional[Exception] = None @dataclass(frozen=True) @@ -44,6 +45,7 @@ def build_assumption_fit_error(ex: Exception) -> AssumptionFitError: code='initial_assumption_fit_failed', message=advice_info, cause=str(ex), + exception=ex, ) diff --git a/tests/api/api_utils/assumptions/test_assumptions_handler.py b/tests/api/api_utils/assumptions/test_assumptions_handler.py index 8250750065..b5aca2f6be 100644 --- a/tests/api/api_utils/assumptions/test_assumptions_handler.py +++ b/tests/api/api_utils/assumptions/test_assumptions_handler.py @@ -1,4 +1,4 @@ -from types import SimpleNamespace +from types import SimpleNamespace from pymonad.either import Left, Right @@ -49,6 +49,7 @@ def test_try_fit_assumption_returns_left_for_expected_fit_failure(monkeypatch): assert result.is_left() assert result.monoid[0].code == 'initial_assumption_fit_failed' assert 'fit failed' in result.monoid[0].message + assert isinstance(result.monoid[0].exception, RuntimeError) def test_fit_assumption_and_check_correctness_keeps_compatibility_wrapper(monkeypatch): @@ -66,3 +67,19 @@ def test_fit_assumption_and_check_correctness_keeps_compatibility_wrapper(monkey assert str(error) == 'boom' else: raise AssertionError('Compatibility wrapper should raise ValueError for failed assumption fitting') + + +def test_fit_assumption_and_check_correctness_raises_from_original_exception(monkeypatch): + handler = AssumptionsHandler(SimpleNamespace()) + pipeline = _FakePipeline() + original_error = RuntimeError('fit failed') + fit_error = SimpleNamespace(message='boom', exception=original_error) + monkeypatch.setattr(handler, 'try_fit_assumption', lambda **kwargs: Left(fit_error)) + + try: + handler.fit_assumption_and_check_correctness(pipeline) + except ValueError as error: + assert str(error) == 'boom' + assert 
error.__cause__ is original_error + else: + raise AssertionError('Compatibility wrapper should chain original fit error') From 4a0352e4f7b78bc27c3a490fa2a31435fcefd157 Mon Sep 17 00:00:00 2001 From: Lopa10ko Date: Thu, 19 Mar 2026 15:21:56 +0300 Subject: [PATCH 28/32] fix: add inheritance for fake test pipeline from actual pipeline --- .../api_utils/assumptions/test_assumptions_handler_rules.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/api/api_utils/assumptions/test_assumptions_handler_rules.py b/tests/api/api_utils/assumptions/test_assumptions_handler_rules.py index c18633c60f..a770fb51b7 100644 --- a/tests/api/api_utils/assumptions/test_assumptions_handler_rules.py +++ b/tests/api/api_utils/assumptions/test_assumptions_handler_rules.py @@ -6,9 +6,10 @@ normalize_initial_assumption, resolve_initial_assumption, ) +from fedot.core.pipelines.pipeline import Pipeline -class _FakePipeline: +class _FakePipeline(Pipeline): pass From d3413c2fc1830a5eae018d7a3fd29c7b650366cf Mon Sep 17 00:00:00 2001 From: Lopa10ko Date: Thu, 19 Mar 2026 16:44:32 +0300 Subject: [PATCH 29/32] fix: add Right monad in extension strategy params build method --- fedot/extensions/runtime_rules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/extensions/runtime_rules.py b/fedot/extensions/runtime_rules.py index 9cf90e59c3..116584b3d1 100644 --- a/fedot/extensions/runtime_rules.py +++ b/fedot/extensions/runtime_rules.py @@ -1,7 +1,7 @@ import inspect from typing import Any, Dict, Optional -from pymonad.either import Left +from pymonad.either import Left, Right from fedot.core.repository.dataset_types import DataTypesEnum from fedot.extensions.contracts import ExternalModelSpec @@ -36,7 +36,7 @@ def try_build_extension_strategy_params(operation_name: str, return params_resolution resolved_user_params = params_resolution.value - return params_resolution.__class__({ + return Right({ **resolved_user_params, 'model_fit': 
_build_model_fit(model_spec), 'model_predict': _build_model_predict(model_spec), From 03efc0beabfb635689717477272a8f61e36a833e Mon Sep 17 00:00:00 2001 From: Lopa10ko Date: Thu, 19 Mar 2026 16:49:30 +0300 Subject: [PATCH 30/32] fix: add OperationParameters support for FP extraction --- fedot/extensions/parameter_rules.py | 7 +++++-- .../test_operation_factory_extensions.py | 4 ++-- tests/extensions/test_parameter_rules.py | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fedot/extensions/parameter_rules.py b/fedot/extensions/parameter_rules.py index 7eaa4e5516..0e17cf117f 100644 --- a/fedot/extensions/parameter_rules.py +++ b/fedot/extensions/parameter_rules.py @@ -1,7 +1,8 @@ -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Optional, Tuple, Union from pymonad.either import Left, Right +from fedot.core.operations.operation_parameters import OperationParameters from fedot.extensions.contracts import ExtensionError, ExternalModelSpec @@ -36,7 +37,9 @@ def resolve_extension_params(model_spec: ExternalModelSpec, return Right(resolved_params) -def extract_factory_params(strategy_params: Dict[str, Any]) -> Dict[str, Any]: +def extract_factory_params(strategy_params: Union[Dict[str, Any], OperationParameters]) -> Dict[str, Any]: + if isinstance(strategy_params, OperationParameters): + strategy_params = strategy_params.to_dict() return { key: value for key, value in strategy_params.items() diff --git a/tests/core/operations/test_operation_factory_extensions.py b/tests/core/operations/test_operation_factory_extensions.py index e6d2e1d5fc..23848154d1 100644 --- a/tests/core/operations/test_operation_factory_extensions.py +++ b/tests/core/operations/test_operation_factory_extensions.py @@ -1,4 +1,4 @@ -import numpy as np +import numpy as np from fedot.core.operations.extension_model import ExtensionModel from fedot.core.operations.factory import OperationFactory @@ -81,7 +81,7 @@ def 
test_extension_model_uses_custom_strategy_adapter_for_runtime_init(): })()) assert strategy.operation_id == 'custom' - assert implementation.was_fitted is True + assert implementation.fitted_model.was_fitted is True assert implementation.params.get('alpha') == 2.0 assert implementation.params.get('beta') == 0.5 finally: diff --git a/tests/extensions/test_parameter_rules.py b/tests/extensions/test_parameter_rules.py index 1f1b2ac275..5b0c67725a 100644 --- a/tests/extensions/test_parameter_rules.py +++ b/tests/extensions/test_parameter_rules.py @@ -1,5 +1,6 @@ from pymonad.either import Left, Right +from fedot.core.operations.operation_parameters import OperationParameters from fedot.extensions.contracts import ExternalModelSpec, ModelCapabilities, ModelHyperparamsSchema from fedot.extensions.parameter_rules import ( apply_extension_defaults, @@ -61,3 +62,17 @@ def test_extension_parameter_rules_return_resolved_params_when_schema_is_satisfi assert resolution.__class__ is Right assert resolution.value == {'beta': 0.5, 'alpha': 1.0} + + +def test_extract_factory_params_accepts_operation_parameters(): + strategy_params = OperationParameters( + alpha=1.0, + beta=0.5, + _extension_output_mode='labels', + model_fit=object(), + model_predict=object(), + ) + + factory_params = extract_factory_params(strategy_params) + + assert factory_params == {'alpha': 1.0, 'beta': 0.5} From a7d39940b78795fa07c3562cdd34e5f124b212c7 Mon Sep 17 00:00:00 2001 From: Lopa10ko Date: Thu, 19 Mar 2026 17:30:28 +0300 Subject: [PATCH 31/32] fix: update validation checks to use is_left, is_right methods for monads --- fedot/extensions/registry.py | 14 +++++++------- fedot/extensions/runtime_rules.py | 8 ++++---- fedot/remote/pipeline_run_config.py | 22 +++++++++++----------- tests/extensions/test_parameter_rules.py | 8 +++----- tests/extensions/test_registry.py | 21 +++++++++------------ tests/extensions/test_runtime_rules.py | 8 +++----- tests/remote/test_pipeline_run_config.py | 11 +++++------ 7 
files changed, 42 insertions(+), 50 deletions(-) diff --git a/fedot/extensions/registry.py b/fedot/extensions/registry.py index 64dfbd1e86..43920c5045 100644 --- a/fedot/extensions/registry.py +++ b/fedot/extensions/registry.py @@ -1,6 +1,6 @@ import importlib import inspect -from typing import Dict, Iterable, Tuple +from typing import Any, Dict, Iterable, Tuple from pymonad.either import Left, Right from pymonad.maybe import Just, Nothing @@ -15,7 +15,7 @@ _REGISTERED_EXTENSIONS: Dict[str, ExtensionManifest] = {} -def validate_extension_manifest(manifest: ExtensionManifest): +def validate_extension_manifest(manifest: Any): if not isinstance(manifest, ExtensionManifest): return Left(ExtensionError(code='invalid_manifest_type', message='Extension manifest must be an ExtensionManifest instance.')) @@ -35,7 +35,7 @@ def validate_extension_manifest(manifest: ExtensionManifest): seen_names = set() for model in manifest.models: model_validation = validate_external_model_spec(model) - if model_validation.__class__ is Left: + if model_validation.is_left(): return model_validation if model.name in seen_names: return Left(ExtensionError(code='duplicate_model_name', @@ -46,7 +46,7 @@ def validate_extension_manifest(manifest: ExtensionManifest): return Right(manifest) -def validate_external_model_spec(model: ExternalModelSpec): +def validate_external_model_spec(model: Any): if not isinstance(model, ExternalModelSpec): return Left(ExtensionError(code='invalid_model_spec_type', message='External model spec must be an ExternalModelSpec instance.')) @@ -72,7 +72,7 @@ def validate_external_model_spec(model: ExternalModelSpec): def register_extension(manifest: ExtensionManifest): validation = validate_extension_manifest(manifest) - if validation.__class__ is Left: + if validation.is_left(): return validation if manifest.name in _REGISTERED_EXTENSIONS: @@ -124,7 +124,7 @@ def discover_extensions(module_names: Iterable[str]): manifests = [] for module_name in module_names: loaded = 
load_extension_manifest(module_name) - if loaded.__class__ is Left: + if loaded.is_left(): return loaded manifests.append(loaded.value) return Right(tuple(manifests)) @@ -132,7 +132,7 @@ def discover_extensions(module_names: Iterable[str]): def smoke_test_extension(manifest: ExtensionManifest): validation = validate_extension_manifest(manifest) - if validation.__class__ is Left: + if validation.is_left(): return validation for model in manifest.models: diff --git a/fedot/extensions/runtime_rules.py b/fedot/extensions/runtime_rules.py index 116584b3d1..f3e56ae802 100644 --- a/fedot/extensions/runtime_rules.py +++ b/fedot/extensions/runtime_rules.py @@ -1,7 +1,7 @@ import inspect from typing import Any, Dict, Optional -from pymonad.either import Left, Right +from pymonad.either import Right from fedot.core.repository.dataset_types import DataTypesEnum from fedot.extensions.contracts import ExternalModelSpec @@ -32,7 +32,7 @@ def try_build_extension_strategy_params(operation_name: str, raise ValueError(f'Extension model "{operation_name}" is not registered.') params_resolution = resolve_extension_params(model_spec, user_params) - if params_resolution.__class__ is Left: + if params_resolution.is_left(): return params_resolution resolved_user_params = params_resolution.value @@ -49,8 +49,8 @@ def build_extension_strategy_params(operation_name: str, user_params: Optional[Dict[str, Any]] = None, output_mode: str = 'default') -> Dict[str, Any]: strategy_params = try_build_extension_strategy_params(operation_name, user_params, output_mode) - if strategy_params.__class__ is Left: - raise ValueError(strategy_params.value.message) + if strategy_params.is_left(): + raise ValueError(strategy_params.monoid[0].message) return strategy_params.value diff --git a/fedot/remote/pipeline_run_config.py b/fedot/remote/pipeline_run_config.py index 6ebfc7c5f4..18117895cb 100644 --- a/fedot/remote/pipeline_run_config.py +++ b/fedot/remote/pipeline_run_config.py @@ -47,42 +47,42 @@ def 
__init__(self, payload: Optional[PipelineRunConfigPayload] = None): @classmethod def try_from_dict(cls, config_dict: Dict[str, Dict[str, str]]): payload_result = parse_pipeline_run_config_dict(config_dict) - if payload_result.__class__ is Left: + if payload_result.is_left(): return payload_result return Right(cls(payload_result.value)) @classmethod def from_dict(cls, config_dict: Dict[str, Dict[str, str]]): result = cls.try_from_dict(config_dict) - if result.__class__ is Left: + if result.is_left(): raise ValueError(result.value.message) return result.value @classmethod def try_from_parser(cls, config: configparser.ConfigParser): sections_result = _config_parser_to_dict(config) - if sections_result.__class__ is Left: + if sections_result.is_left(): return sections_result return cls.try_from_dict(sections_result.value) @classmethod def from_parser(cls, config: configparser.ConfigParser): result = cls.try_from_parser(config) - if result.__class__ is Left: + if result.is_left(): raise ValueError(result.value.message) return result.value @classmethod def try_from_file(cls, file: Union[str, bytes]): parser_result = _read_config_parser(file) - if parser_result.__class__ is Left: + if parser_result.is_left(): return parser_result return cls.try_from_parser(parser_result.value) @classmethod def from_file(cls, file: Union[str, bytes]): result = cls.try_from_file(file) - if result.__class__ is Left: + if result.is_left(): raise ValueError(result.value.message) return result.value @@ -148,23 +148,23 @@ def parse_pipeline_run_config_dict(config_dict: Dict[str, Dict[str, str]]): field=field)) task_result = _parse_task(default_section['task']) - if task_result.__class__ is Left: + if task_result.is_left(): return task_result train_data_idx_result = _parse_optional_literal(default_section.get('train_data_idx'), 'train_data_idx') - if train_data_idx_result.__class__ is Left: + if train_data_idx_result.is_left(): return train_data_idx_result is_multi_modal_result = 
_parse_optional_bool(default_section.get('is_multi_modal'), default=False) - if is_multi_modal_result.__class__ is Left: + if is_multi_modal_result.is_left(): return is_multi_modal_result var_names_result = _parse_optional_literal(default_section.get('var_names'), 'var_names') - if var_names_result.__class__ is Left: + if var_names_result.is_left(): return var_names_result target_result = _parse_target(default_section.get('target')) - if target_result.__class__ is Left: + if target_result.is_left(): return target_result input_data = _expand_base_path(default_section['train_data']) diff --git a/tests/extensions/test_parameter_rules.py b/tests/extensions/test_parameter_rules.py index 5b0c67725a..a387a9165c 100644 --- a/tests/extensions/test_parameter_rules.py +++ b/tests/extensions/test_parameter_rules.py @@ -1,5 +1,3 @@ -from pymonad.either import Left, Right - from fedot.core.operations.operation_parameters import OperationParameters from fedot.extensions.contracts import ExternalModelSpec, ModelCapabilities, ModelHyperparamsSchema from fedot.extensions.parameter_rules import ( @@ -52,15 +50,15 @@ def test_extension_parameter_rules_detect_missing_required_params(): resolution = resolve_extension_params(_make_model_spec(), {'beta': 1.5}) assert missing == ('gamma',) - assert resolution.__class__ is Left - assert resolution.value.details['required'] == ['alpha'] + assert resolution.is_left() + assert resolution.monoid[0].details['required'] == ['alpha'] def test_extension_parameter_rules_return_resolved_params_when_schema_is_satisfied(): resolution = resolve_extension_params(_make_model_spec(), {'alpha': 1.0}) - assert resolution.__class__ is Right + assert resolution.is_right() assert resolution.value == {'beta': 0.5, 'alpha': 1.0} diff --git a/tests/extensions/test_registry.py b/tests/extensions/test_registry.py index 572e204200..04c1df6220 100644 --- a/tests/extensions/test_registry.py +++ b/tests/extensions/test_registry.py @@ -1,9 +1,6 @@ import sys import types 
-from pymonad.either import Left, Right -from pymonad.maybe import Just, Nothing - from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import TaskTypesEnum from fedot.extensions import ( @@ -55,7 +52,7 @@ def teardown_function(): def test_validate_extension_manifest_returns_right_for_valid_manifest(): result = validate_extension_manifest(_build_manifest()) - assert result.__class__ is Right + assert result.is_right() def test_register_extension_stores_manifest_and_returns_maybe_lookup(): @@ -63,10 +60,10 @@ def test_register_extension_stores_manifest_and_returns_maybe_lookup(): result = register_extension(manifest) - assert result.__class__ is Right + assert result.is_right() assert len(get_registered_extensions()) == 1 - assert get_registered_extension('demo_extension').__class__ is Just - assert get_registered_extension('missing_extension').__class__ is Nothing + assert get_registered_extension('demo_extension').is_just() + assert get_registered_extension('missing_extension').is_nothing() def test_register_extension_rejects_duplicate_extension_name(): @@ -75,8 +72,8 @@ def test_register_extension_rejects_duplicate_extension_name(): duplicate_result = register_extension(manifest) - assert duplicate_result.__class__ is Left - assert duplicate_result.value.code == 'duplicate_extension' + assert duplicate_result.is_left() + assert duplicate_result.monoid[0].code == 'duplicate_extension' def test_smoke_test_extension_rejects_factory_returning_none(): @@ -97,8 +94,8 @@ def test_smoke_test_extension_rejects_factory_returning_none(): result = smoke_test_extension(manifest) - assert result.__class__ is Left - assert result.value.code == 'factory_returned_none' + assert result.is_left() + assert result.monoid[0].code == 'factory_returned_none' def test_discover_extensions_loads_manifest_from_module(): @@ -109,7 +106,7 @@ def test_discover_extensions_loads_manifest_from_module(): try: result = discover_extensions((module_name,)) - 
assert result.__class__ is Right + assert result.is_right() assert result.value[0].module == module_name finally: del sys.modules[module_name] diff --git a/tests/extensions/test_runtime_rules.py b/tests/extensions/test_runtime_rules.py index 8458961f84..583d919c93 100644 --- a/tests/extensions/test_runtime_rules.py +++ b/tests/extensions/test_runtime_rules.py @@ -1,7 +1,5 @@ import numpy as np -from pymonad.either import Left - from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import TaskTypesEnum from fedot.extensions.contracts import ( @@ -85,8 +83,8 @@ def test_runtime_rules_return_left_when_required_extension_params_are_missing(): try: params = try_build_extension_strategy_params('external_runtime_model', {'beta': 1.5}) - assert params.__class__ is Left - assert params.value.code == 'missing_required_hyperparams' - assert params.value.details['required'] == ['alpha'] + assert params.is_left() + assert params.monoid[0].code == 'missing_required_hyperparams' + assert params.monoid[0].details['required'] == ['alpha'] finally: clear_extension_registry() diff --git a/tests/remote/test_pipeline_run_config.py b/tests/remote/test_pipeline_run_config.py index abb3b34ffd..359f120951 100644 --- a/tests/remote/test_pipeline_run_config.py +++ b/tests/remote/test_pipeline_run_config.py @@ -1,7 +1,6 @@ import configparser import pytest -from pymonad.either import Left, Right from fedot.core.repository.tasks import TaskTypesEnum, TsForecastingParams from fedot.remote.pipeline_run_config import PipelineRunConfig, parse_pipeline_run_config_dict @@ -25,7 +24,7 @@ def _base_config(task='Task(TaskTypesEnum.classification)'): def test_parse_pipeline_run_config_dict_parses_classification_task(): result = parse_pipeline_run_config_dict(_base_config()) - assert result.__class__ is Right + assert result.is_right() assert result.value.task.task_type == TaskTypesEnum.classification assert result.value.train_data_idx == [1, 2, 3] assert 
result.value.var_names is None @@ -36,7 +35,7 @@ def test_parse_pipeline_run_config_dict_parses_forecasting_task_with_params(): _base_config(task='Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=3))') ) - assert result.__class__ is Right + assert result.is_right() assert result.value.task.task_type == TaskTypesEnum.ts_forecasting assert isinstance(result.value.task.task_params, TsForecastingParams) assert result.value.task.task_params.forecast_length == 3 @@ -47,8 +46,8 @@ def test_parse_pipeline_run_config_dict_rejects_eval_like_task_payload(): result = parse_pipeline_run_config_dict(config) - assert result.__class__ is Left - assert result.value.code == 'unsupported_task_format' + assert result.is_left() + assert result.monoid[0].code == 'unsupported_task_format' @pytest.mark.parametrize('raw_value, expected', [('False', False), ('True', True), ('"True"', True), ('None', False)]) @@ -58,7 +57,7 @@ def test_pipeline_run_config_parses_bool_literals_compatibly(raw_value, expected result = parse_pipeline_run_config_dict(config) - assert result.__class__ is Right + assert result.is_right() assert result.value.is_multi_modal is expected From 04885bb0aeb41788af7a11e4d1f241e05ece7f26 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 19 Mar 2026 14:38:21 +0000 Subject: [PATCH 32/32] Automated autopep8 fixes --- examples/benchmark/run_amlb.py | 14 +++++++------- fedot/api/api_utils/api_composer.py | 12 +++++++----- fedot/api/api_utils/api_service_rules.py | 3 --- .../assumptions/assumptions_handler_rules.py | 3 ++- .../api/api_utils/assumptions/task_assumptions.py | 2 +- fedot/api/api_utils/recommendation_rules.py | 6 +++++- fedot/api/main.py | 6 ++++-- fedot/api/sampling_stage/config.py | 1 - fedot/api/sampling_stage/executor.py | 1 - .../core/operations/operation_parameter_rules.py | 6 ++---- fedot/core/pipelines/pipeline.py | 10 ++++------ fedot/core/pipelines/pipeline_node_rules.py | 3 --- fedot/core/pipelines/pipeline_rules.py | 2 
-- fedot/extensions/runtime_rules.py | 15 +++------------ .../api/test_sampling_stage_integration.py | 1 - test/unit/api/test_sampling_stage.py | 1 - .../unit/examples/test_amlb_sampling_benchmark.py | 2 +- tests/__init__.py | 1 - tests/api/__init__.py | 2 +- tests/api/api_utils/__init__.py | 2 +- tests/api/api_utils/assumptions/__init__.py | 2 +- tests/api/api_utils/test_api_data_rules.py | 3 ++- tests/api/api_utils/test_api_run_planner.py | 3 ++- tests/api/api_utils/test_api_service_rules.py | 3 --- tests/api/api_utils/test_input_analyser.py | 4 +++- tests/api/test_builder.py | 1 - tests/api/test_main.py | 7 ++----- tests/core/__init__.py | 1 - tests/core/data/__init__.py | 2 +- tests/core/operations/__init__.py | 2 +- .../operations/test_operation_parameter_rules.py | 2 -- .../core/operations/test_operation_parameters.py | 3 --- tests/core/pipelines/__init__.py | 1 - tests/core/pipelines/test_node.py | 2 -- tests/core/pipelines/test_pipeline.py | 7 ++----- tests/core/pipelines/test_pipeline_node_rules.py | 2 -- tests/core/pipelines/test_pipeline_rules.py | 2 -- tests/core/repository/__init__.py | 1 - .../test_pipeline_operation_repository.py | 2 -- tests/extensions/__init__.py | 1 - tests/extensions/test_parameter_rules.py | 4 ---- tests/extensions/test_runtime_rules.py | 3 --- tests/preprocessing/__init__.py | 2 +- tests/preprocessing/test_base_preprocessing.py | 6 ------ tests/preprocessing/test_preprocessing_rules.py | 7 ------- tests/remote/__init__.py | 1 - 46 files changed, 53 insertions(+), 114 deletions(-) diff --git a/examples/benchmark/run_amlb.py b/examples/benchmark/run_amlb.py index 2edb35b25e..9946b40b39 100644 --- a/examples/benchmark/run_amlb.py +++ b/examples/benchmark/run_amlb.py @@ -1,4 +1,9 @@ from __future__ import annotations +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score +from sklearn.datasets import fetch_openml 
+import pandas as pd +import numpy as np import argparse import inspect @@ -14,11 +19,6 @@ import matplotlib matplotlib.use('Agg') -import numpy as np -import pandas as pd -from sklearn.datasets import fetch_openml -from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score -from sklearn.model_selection import train_test_split ROOT_DIR = Path(__file__).resolve().parents[2] if str(ROOT_DIR) not in sys.path: @@ -478,10 +478,10 @@ def _run_fedot_mode(dataset: LoadedDataset, 'problem': dataset.spec.task_type, 'timeout': config.timeout_minutes_per_dataset, 'seed': config.seed, - 'n_jobs': 1, #config.n_jobs + 'n_jobs': 1, # config.n_jobs 'preset': config.preset, 'use_input_preprocessing': False, - 'logging_level':10, + 'logging_level': 10, 'with_tuning': config.with_tuning, 'history_dir': str(history_dir), 'keep_history': True, diff --git a/fedot/api/api_utils/api_composer.py b/fedot/api/api_utils/api_composer.py index e4ff7b3cba..934fe5cb46 100644 --- a/fedot/api/api_utils/api_composer.py +++ b/fedot/api/api_utils/api_composer.py @@ -55,7 +55,8 @@ def init_cache(self): self.operations_cache = OperationsCache(cache_dir=cache_plan.cache_dir, use_stats=cache_plan.use_stats) self.operations_cache.reset() if cache_plan.use_preprocessing_cache: - self.preprocessing_cache = PreprocessingCache(cache_dir=cache_plan.cache_dir, use_stats=cache_plan.use_stats) + self.preprocessing_cache = PreprocessingCache( + cache_dir=cache_plan.cache_dir, use_stats=cache_plan.use_stats) self.preprocessing_cache.reset() if cache_plan.use_predictions_cache: self.predictions_cache = PredictionsCache(cache_dir=cache_plan.cache_dir, use_stats=cache_plan.use_stats) @@ -98,9 +99,10 @@ def obtain_model(self, train_data: InputData) -> Tuple[Pipeline, Sequence[Pipeli with fedot_composer_timer.launch_tuning('composing'): best_pipeline = self.tune_final_pipeline(train_data, best_pipeline, execution_plan) elif with_tuning: - self.log.message(f'Time 
for pipeline composing was {str(self.timer.composing_spend_time)}.\n' - f'The remaining {max(0, round(execution_plan.tuning_timeout_minutes, 1))} seconds are not enough ' - f'to tune the hyperparameters.') + self.log.message( + f'Time for pipeline composing was {str(self.timer.composing_spend_time)}.\n' + f'The remaining {max(0, round(execution_plan.tuning_timeout_minutes, 1))} seconds are not enough ' + f'to tune the hyperparameters.') self.log.message('Composed pipeline returned without tuning.') self.was_tuned = False @@ -210,4 +212,4 @@ def tune_final_pipeline(self, train_data: InputData, tuned_pipeline = tuner.tune(pipeline_gp_composed) self.log.message('Hyperparameters tuning finished') self.was_tuned = tuner.was_tuned - return tuned_pipeline + return tuned_pipeline diff --git a/fedot/api/api_utils/api_service_rules.py b/fedot/api/api_utils/api_service_rules.py index e1fee6a0cc..8f2dcf1b0a 100644 --- a/fedot/api/api_utils/api_service_rules.py +++ b/fedot/api/api_utils/api_service_rules.py @@ -10,7 +10,6 @@ class TuneExecutionPlan: metric: Any - def build_tune_execution_plan(input_data: Any, train_data: Any, requested_cv_folds: Optional[int], @@ -31,11 +30,9 @@ def build_tune_execution_plan(input_data: Any, ) - def resolve_predict_proba_mode(probs_for_all_classes: bool) -> str: return 'full_probs' if probs_for_all_classes else 'probs' - def resolve_forecast_horizon(requested_horizon: Optional[int], forecast_length: int) -> int: return forecast_length if requested_horizon is None else requested_horizon diff --git a/fedot/api/api_utils/assumptions/assumptions_handler_rules.py b/fedot/api/api_utils/assumptions/assumptions_handler_rules.py index e1834ddeb0..8022bacd32 100644 --- a/fedot/api/api_utils/assumptions/assumptions_handler_rules.py +++ b/fedot/api/api_utils/assumptions/assumptions_handler_rules.py @@ -22,7 +22,8 @@ class PresetDecision: NormalizedInitialAssumption = Optional[List[Pipeline]] -def normalize_initial_assumption(initial_assumption: 
Union[List[Pipeline], Pipeline, None]) -> NormalizedInitialAssumption: +def normalize_initial_assumption( + initial_assumption: Union[List[Pipeline], Pipeline, None]) -> NormalizedInitialAssumption: if initial_assumption is None: return None if isinstance(initial_assumption, Pipeline): diff --git a/fedot/api/api_utils/assumptions/task_assumptions.py b/fedot/api/api_utils/assumptions/task_assumptions.py index 58d31077e5..df2921aecf 100644 --- a/fedot/api/api_utils/assumptions/task_assumptions.py +++ b/fedot/api/api_utils/assumptions/task_assumptions.py @@ -119,7 +119,7 @@ def builders(self): # 'xgboost': PipelineBuilder().add_node('xgboost'), # 'lgbm': PipelineBuilder().add_node('lgbm'), 'rf': PipelineBuilder().add_node('rf'), - #'logit': PipelineBuilder().add_node('logit'), + # 'logit': PipelineBuilder().add_node('logit'), } def ensemble_operation(self) -> str: diff --git a/fedot/api/api_utils/recommendation_rules.py b/fedot/api/api_utils/recommendation_rules.py index 7e27432864..d1d9b91888 100644 --- a/fedot/api/api_utils/recommendation_rules.py +++ b/fedot/api/api_utils/recommendation_rules.py @@ -54,7 +54,11 @@ def should_use_label_encoding(input_data: InputData, def build_safe_data_recommendations(input_data: InputData, safe_mode: bool, limits: RecommendationLimits, - categorical_detector: Callable[[Any], tuple[Sequence[int], Sequence[int]]]) -> Dict[str, Dict[str, Any]]: + categorical_detector: Callable[[Any], + tuple[Sequence[int], + Sequence[int]]]) -> Dict[str, + Dict[str, + Any]]: if not safe_mode or not supports_data_recommendations(input_data): return {} diff --git a/fedot/api/main.py b/fedot/api/main.py index 042f13a00a..2b12d9105e 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -202,7 +202,8 @@ def fit(self, api_preprocessor=self.data_processor.preprocessor, ).fit() else: - self.current_pipeline, self.best_models, self.history = self.api_composer.obtain_model(self.train_data) + self.current_pipeline, self.best_models, self.history = 
self.api_composer.obtain_model( + self.train_data) if self.current_pipeline is None: raise ValueError('No models were found') @@ -271,7 +272,8 @@ def tune(self, default_metric=self.metrics[0], ) if input_data is not None: - tune_input_data = self.data_processor.define_data(features=tune_plan.input_data, target=target, is_predict=False) + tune_input_data = self.data_processor.define_data( + features=tune_plan.input_data, target=target, is_predict=False) else: tune_input_data = tune_plan.input_data diff --git a/fedot/api/sampling_stage/config.py b/fedot/api/sampling_stage/config.py index 58c09e38a7..91e398daa1 100644 --- a/fedot/api/sampling_stage/config.py +++ b/fedot/api/sampling_stage/config.py @@ -169,4 +169,3 @@ def _validate_strategy_param_guards(config: SamplingConfig) -> None: f'"sampling_config.strategy_params.sample_size" exceeds guard_max_sample_size=' f'{config.guard_max_sample_size}.' ) - diff --git a/fedot/api/sampling_stage/executor.py b/fedot/api/sampling_stage/executor.py index 1725e2ccc7..efd52d4259 100644 --- a/fedot/api/sampling_stage/executor.py +++ b/fedot/api/sampling_stage/executor.py @@ -322,4 +322,3 @@ def _raise_if_budget_exceeded(started_at: float, budget_seconds: float) -> None: def _remaining_budget(started_at: float, budget_seconds: float) -> float: elapsed = time.perf_counter() - started_at return max(0.0, budget_seconds - elapsed) - diff --git a/fedot/core/operations/operation_parameter_rules.py b/fedot/core/operations/operation_parameter_rules.py index b427819abc..4c0b700ae4 100644 --- a/fedot/core/operations/operation_parameter_rules.py +++ b/fedot/core/operations/operation_parameter_rules.py @@ -1,7 +1,6 @@ from typing import Dict, Iterable, Tuple - def merge_operation_default_params(default_parameters: Dict, passed_parameters: Dict) -> Dict: return { **dict(default_parameters or {}), @@ -9,8 +8,8 @@ def merge_operation_default_params(default_parameters: Dict, passed_parameters: } - -def collect_changed_keys(current_parameters: 
Dict, updated_parameters: Dict, existing_changed_keys: Iterable[str]) -> Tuple[str, ...]: +def collect_changed_keys(current_parameters: Dict, updated_parameters: Dict, + existing_changed_keys: Iterable[str]) -> Tuple[str, ...]: changed_keys = list(existing_changed_keys) for key, value in updated_parameters.items(): if key not in changed_keys and current_parameters.get(key) != value: @@ -18,7 +17,6 @@ def collect_changed_keys(current_parameters: Dict, updated_parameters: Dict, exi return tuple(changed_keys) - def resolve_setdefault_value(current_parameters: Dict, key, value): if key in current_parameters: return current_parameters[key], False diff --git a/fedot/core/pipelines/pipeline.py b/fedot/core/pipelines/pipeline.py index a3ca1162c5..2fd2054a15 100644 --- a/fedot/core/pipelines/pipeline.py +++ b/fedot/core/pipelines/pipeline.py @@ -202,9 +202,8 @@ def fit(self, self.replace_n_jobs_in_nodes(n_jobs) preprocess_plan = build_pipeline_preprocess_plan( - is_fit_stage=True, - is_input_auto_preprocessed=isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed, - ) + is_fit_stage=True, is_input_auto_preprocessed=isinstance( + input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed, ) if preprocess_plan.should_preprocess: with fedot_composer_timer.launch_preprocessing(): copied_input_data = self._preprocess(input_data) @@ -317,9 +316,8 @@ def predict(self, raise ValueError(ex) preprocess_plan = build_pipeline_preprocess_plan( - is_fit_stage=False, - is_input_auto_preprocessed=isinstance(input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed, - ) + is_fit_stage=False, is_input_auto_preprocessed=isinstance( + input_data, InputData) and input_data.supplementary_data.is_auto_preprocessed, ) if preprocess_plan.should_preprocess: copied_input_data = self._preprocess(input_data, is_fit_stage=False) else: diff --git a/fedot/core/pipelines/pipeline_node_rules.py 
b/fedot/core/pipelines/pipeline_node_rules.py index df17d19771..6efa1f4afd 100644 --- a/fedot/core/pipelines/pipeline_node_rules.py +++ b/fedot/core/pipelines/pipeline_node_rules.py @@ -1,7 +1,6 @@ from typing import Dict, Iterable, Optional - def normalize_node_parameters(params: Optional[dict], default_params_stub, nested_params_label: str) -> Dict: if params is None: return {} @@ -12,7 +11,6 @@ def normalize_node_parameters(params: Optional[dict], default_params_stub, neste return dict(params) - def merge_node_parameters(current_parameters: Optional[dict], changed_parameters: Optional[dict]) -> Dict: return { **dict(current_parameters or {}), @@ -20,7 +18,6 @@ def merge_node_parameters(current_parameters: Optional[dict], changed_parameters } - def should_update_node_parameters(operation_type: str, operation_tags: Optional[Iterable[str]]) -> bool: if 'atomized' in operation_type: return False diff --git a/fedot/core/pipelines/pipeline_rules.py b/fedot/core/pipelines/pipeline_rules.py index 23ec0f6c7b..48c6dd73ca 100644 --- a/fedot/core/pipelines/pipeline_rules.py +++ b/fedot/core/pipelines/pipeline_rules.py @@ -15,7 +15,6 @@ class PipelinePostprocessPlan: should_flatten_prediction: bool - def build_pipeline_preprocess_plan(is_fit_stage: bool, is_input_auto_preprocessed: bool) -> PipelinePreprocessPlan: return PipelinePreprocessPlan( should_preprocess=not is_input_auto_preprocessed, @@ -23,7 +22,6 @@ def build_pipeline_preprocess_plan(is_fit_stage: bool, is_input_auto_preprocesse ) - def build_pipeline_postprocess_plan(output_mode: str, task_type: TaskTypesEnum) -> PipelinePostprocessPlan: return PipelinePostprocessPlan( should_restore_inverse_target_encoding=output_mode == 'labels', diff --git a/fedot/extensions/runtime_rules.py b/fedot/extensions/runtime_rules.py index f3e56ae802..337fbcc456 100644 --- a/fedot/extensions/runtime_rules.py +++ b/fedot/extensions/runtime_rules.py @@ -9,7 +9,6 @@ from fedot.extensions.registry import get_registered_extensions - def 
get_extension_model_spec(operation_name: str) -> Optional[ExternalModelSpec]: for registered_extension in get_registered_extensions(): for model in registered_extension.manifest.models: @@ -18,12 +17,10 @@ def get_extension_model_spec(operation_name: str) -> Optional[ExternalModelSpec] return None - def is_extension_operation_name(operation_name: str) -> bool: return get_extension_model_spec(operation_name) is not None - def try_build_extension_strategy_params(operation_name: str, user_params: Optional[Dict[str, Any]] = None, output_mode: str = 'default'): @@ -44,7 +41,6 @@ def try_build_extension_strategy_params(operation_name: str, }) - def build_extension_strategy_params(operation_name: str, user_params: Optional[Dict[str, Any]] = None, output_mode: str = 'default') -> Dict[str, Any]: @@ -54,7 +50,6 @@ def build_extension_strategy_params(operation_name: str, return strategy_params.value - def get_extension_acceptable_task_types(operation_name: str): model_spec = get_extension_model_spec(operation_name) if model_spec is None: @@ -62,7 +57,6 @@ def get_extension_acceptable_task_types(operation_name: str): return model_spec.capabilities.tasks - def get_extension_data_types(operation_name: str): model_spec = get_extension_model_spec(operation_name) if model_spec is None: @@ -70,7 +64,6 @@ def get_extension_data_types(operation_name: str): return model_spec.capabilities.data_types - def _build_model_fit(model_spec: ExternalModelSpec): def _fit(idx, features, target, params): model = _instantiate_model(model_spec, params) @@ -88,7 +81,6 @@ def _fit(idx, features, target, params): return _fit - def _build_model_predict(model_spec: ExternalModelSpec): def _predict(fitted_model, idx, features, params): model = fitted_model if fitted_model is not None else _instantiate_model(model_spec, params) @@ -101,7 +93,9 @@ def _predict(fitted_model, idx, features, params): (idx, features, params), (idx, features), ) - if output_mode != 'full_probs' and getattr(prediction, 'shape', 
None) is not None and len(prediction.shape) > 1 and prediction.shape[1] == 2: + if output_mode != 'full_probs' and getattr( + prediction, 'shape', None) is not None and len( + prediction.shape) > 1 and prediction.shape[1] == 2: prediction = prediction[:, 1] elif hasattr(model, 'predict'): prediction = _call_with_supported_signature( @@ -126,7 +120,6 @@ def _predict(fitted_model, idx, features, params): return _predict - def _instantiate_model(model_spec: ExternalModelSpec, params: Dict[str, Any]): factory = model_spec.factory user_params = extract_factory_params(params) @@ -138,7 +131,6 @@ def _instantiate_model(model_spec: ExternalModelSpec, params: Dict[str, Any]): return factory() - def _call_with_supported_signature(method, *candidate_args): signature = inspect.signature(method) last_error = None @@ -152,7 +144,6 @@ def _call_with_supported_signature(method, *candidate_args): raise last_error or TypeError('No supported signature found for extension model method.') - def _infer_output_type_name(model_spec: ExternalModelSpec) -> str: preferred_data_type = model_spec.capabilities.data_types[0] if model_spec.capabilities.data_types else DataTypesEnum.table return preferred_data_type.name diff --git a/test/integration/api/test_sampling_stage_integration.py b/test/integration/api/test_sampling_stage_integration.py index 332dfa5c87..d1b4c60eb1 100644 --- a/test/integration/api/test_sampling_stage_integration.py +++ b/test/integration/api/test_sampling_stage_integration.py @@ -227,4 +227,3 @@ def fake_execute(self, train_data_input): assert model.sampling_stage_metadata is not None assert model.sampling_stage_metadata['status'] == 'applied' assert model.params.timeout == pytest.approx(0.2) - diff --git a/test/unit/api/test_sampling_stage.py b/test/unit/api/test_sampling_stage.py index b68dc6c4ab..7c7cfcf51e 100644 --- a/test/unit/api/test_sampling_stage.py +++ b/test/unit/api/test_sampling_stage.py @@ -188,4 +188,3 @@ def test_sampling_config_rejects_non_dict_value(): 
def test_sampling_config_rejects_invalid_validation_size_range(): with pytest.raises(ValueError, match='validation_size'): validate_sampling_config({'validation_size': 1.0}) - diff --git a/test/unit/examples/test_amlb_sampling_benchmark.py b/test/unit/examples/test_amlb_sampling_benchmark.py index 42f469d633..35059e02e0 100644 --- a/test/unit/examples/test_amlb_sampling_benchmark.py +++ b/test/unit/examples/test_amlb_sampling_benchmark.py @@ -98,4 +98,4 @@ def test_sanitize_features_replaces_pandas_na_values_for_fedot_compatibility(): unique_values = np.unique(sanitized['cat_feature'].to_numpy()) assert len(unique_values) >= 2 - assert all(dtype.kind in {'i', 'u', 'f'} for dtype in sanitized.dtypes) \ No newline at end of file + assert all(dtype.kind in {'i', 'u', 'f'} for dtype in sanitized.dtypes) diff --git a/tests/__init__.py b/tests/__init__.py index 8b13789179..e69de29bb2 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/api/__init__.py b/tests/api/__init__.py index e02abfc9b0..5f282702bb 100644 --- a/tests/api/__init__.py +++ b/tests/api/__init__.py @@ -1 +1 @@ - + \ No newline at end of file diff --git a/tests/api/api_utils/__init__.py b/tests/api/api_utils/__init__.py index e02abfc9b0..5f282702bb 100644 --- a/tests/api/api_utils/__init__.py +++ b/tests/api/api_utils/__init__.py @@ -1 +1 @@ - + \ No newline at end of file diff --git a/tests/api/api_utils/assumptions/__init__.py b/tests/api/api_utils/assumptions/__init__.py index e02abfc9b0..5f282702bb 100644 --- a/tests/api/api_utils/assumptions/__init__.py +++ b/tests/api/api_utils/assumptions/__init__.py @@ -1 +1 @@ - + \ No newline at end of file diff --git a/tests/api/api_utils/test_api_data_rules.py b/tests/api/api_utils/test_api_data_rules.py index 5b189e842d..9afa5959f9 100644 --- a/tests/api/api_utils/test_api_data_rules.py +++ b/tests/api/api_utils/test_api_data_rules.py @@ -61,7 +61,8 @@ def test_plan_preprocessing_steps_are_explicit_and_stable(): 
(TaskTypesEnum.ts_forecasting, False, None, 3, (None, False, True, None)), (TaskTypesEnum.regression, False, None, None, (None, False, False, None)), ]) -def test_plan_prediction_returns_typed_branching_decision(task_type, in_sample, validation_blocks, forecast_length, expected): +def test_plan_prediction_returns_typed_branching_decision( + task_type, in_sample, validation_blocks, forecast_length, expected): plan = plan_prediction(task_type, in_sample, validation_blocks, forecast_length) assert (plan.output_mode, plan.use_in_sample_forecast, plan.flatten_prediction, plan.horizon) == expected diff --git a/tests/api/api_utils/test_api_run_planner.py b/tests/api/api_utils/test_api_run_planner.py index 1f9c65ab54..77b119fba2 100644 --- a/tests/api/api_utils/test_api_run_planner.py +++ b/tests/api/api_utils/test_api_run_planner.py @@ -76,7 +76,8 @@ def test_plan_final_fit_respects_history_and_pipeline_fit_state(): assert history_has_records(_FakeHistory(is_empty_value=False)) is True assert plan_final_fit(None, pipeline_is_fitted=True).should_train_on_full_dataset is False - assert plan_final_fit(_FakeHistory(is_empty_value=False), pipeline_is_fitted=True).should_train_on_full_dataset is True + assert plan_final_fit(_FakeHistory(is_empty_value=False), + pipeline_is_fitted=True).should_train_on_full_dataset is True assert plan_final_fit(None, pipeline_is_fitted=False).should_train_on_full_dataset is True diff --git a/tests/api/api_utils/test_api_service_rules.py b/tests/api/api_utils/test_api_service_rules.py index 61508f9c59..be3801ff93 100644 --- a/tests/api/api_utils/test_api_service_rules.py +++ b/tests/api/api_utils/test_api_service_rules.py @@ -5,7 +5,6 @@ ) - def test_build_tune_execution_plan_uses_explicit_values_when_provided(): plan = build_tune_execution_plan( input_data='new-data', @@ -24,7 +23,6 @@ def test_build_tune_execution_plan_uses_explicit_values_when_provided(): assert plan.metric == 'roc_auc' - def 
test_build_tune_execution_plan_uses_defaults_when_values_are_missing(): plan = build_tune_execution_plan( input_data=None, @@ -43,7 +41,6 @@ def test_build_tune_execution_plan_uses_defaults_when_values_are_missing(): assert plan.metric == 'f1' - def test_service_rules_resolve_predict_mode_and_forecast_horizon(): assert resolve_predict_proba_mode(False) == 'probs' assert resolve_predict_proba_mode(True) == 'full_probs' diff --git a/tests/api/api_utils/test_input_analyser.py b/tests/api/api_utils/test_input_analyser.py index e6174f59da..2e000e7593 100644 --- a/tests/api/api_utils/test_input_analyser.py +++ b/tests/api/api_utils/test_input_analyser.py @@ -31,7 +31,9 @@ def fake_build_recommendation_bundle(**kwargs): captured['input_params'] = kwargs['input_params'] return type('Bundle', (), {'data': {'cut': {'border': 2}}, 'params': {'preset': 'fast_train'}})() - monkeypatch.setattr('fedot.api.api_utils.input_analyser.build_recommendation_bundle', fake_build_recommendation_bundle) + monkeypatch.setattr( + 'fedot.api.api_utils.input_analyser.build_recommendation_bundle', + fake_build_recommendation_bundle) analyser = InputAnalyser(safe_mode=True) data_recommendations, params_recommendations = analyser.give_recommendations( diff --git a/tests/api/test_builder.py b/tests/api/test_builder.py index dfff5e63fc..4593fc13b5 100644 --- a/tests/api/test_builder.py +++ b/tests/api/test_builder.py @@ -73,7 +73,6 @@ def test_param_setters_has_all_api_parameters(fedot_builder_methods): assert builder_params == fedot_api_all_params - def test_builder_preserves_previous_values_when_new_setup_uses_default_sentinel(): builder = FedotBuilder('classification') diff --git a/tests/api/test_main.py b/tests/api/test_main.py index 4fe9f637ac..279ac224f3 100644 --- a/tests/api/test_main.py +++ b/tests/api/test_main.py @@ -22,7 +22,6 @@ def predict(self, test_data, output_mode='default'): ) - def test_main_facade_raises_not_fitted_errors_for_predictive_methods(): model = 
Fedot(problem='classification') @@ -39,7 +38,6 @@ def test_main_facade_raises_not_fitted_errors_for_predictive_methods(): model.return_report() - def test_main_facade_predict_proba_rejects_non_classification_tasks(): model = Fedot(problem='regression') model.current_pipeline = object() @@ -48,19 +46,18 @@ def test_main_facade_predict_proba_rejects_non_classification_tasks(): model.predict_proba(features=np.array([[1.0]])) - def test_main_facade_uses_service_rule_for_predict_proba_mode_selection(): model = Fedot(problem='classification') model.current_pipeline = _StubPipeline() model.target = 'target' - model.data_processor.define_data = lambda **kwargs: type('Input', (), {'task': Task(TaskTypesEnum.classification)})() + model.data_processor.define_data = lambda **kwargs: type('Input', + (), {'task': Task(TaskTypesEnum.classification)})() model.predict_proba(features=np.array([[1.0], [2.0]]), probs_for_all_classes=True) assert model.current_pipeline.calls == ['full_probs'] - def test_main_facade_forecast_requires_time_series_task(): model = Fedot(problem='classification') model.current_pipeline = object() diff --git a/tests/core/__init__.py b/tests/core/__init__.py index 8b13789179..e69de29bb2 100644 --- a/tests/core/__init__.py +++ b/tests/core/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/core/data/__init__.py b/tests/core/data/__init__.py index e02abfc9b0..5f282702bb 100644 --- a/tests/core/data/__init__.py +++ b/tests/core/data/__init__.py @@ -1 +1 @@ - + \ No newline at end of file diff --git a/tests/core/operations/__init__.py b/tests/core/operations/__init__.py index e02abfc9b0..5f282702bb 100644 --- a/tests/core/operations/__init__.py +++ b/tests/core/operations/__init__.py @@ -1 +1 @@ - + \ No newline at end of file diff --git a/tests/core/operations/test_operation_parameter_rules.py b/tests/core/operations/test_operation_parameter_rules.py index c5b2ab133e..2960fa7a17 100644 --- a/tests/core/operations/test_operation_parameter_rules.py +++ 
b/tests/core/operations/test_operation_parameter_rules.py @@ -5,7 +5,6 @@ ) - def test_operation_parameter_rules_merge_defaults_and_track_changes(): merged = merge_operation_default_params({'a': 1, 'b': 2}, {'b': 3, 'c': 4}) changed_keys = collect_changed_keys({'a': 1, 'b': 2}, {'a': 1, 'b': 3, 'd': 4}, ()) @@ -14,7 +13,6 @@ def test_operation_parameter_rules_merge_defaults_and_track_changes(): assert changed_keys == ('b', 'd') - def test_operation_parameter_rules_resolve_setdefault_value_explicitly(): existing_value, should_update_existing = resolve_setdefault_value({'a': 1}, 'a', 2) missing_value, should_update_missing = resolve_setdefault_value({'a': 1}, 'b', 3) diff --git a/tests/core/operations/test_operation_parameters.py b/tests/core/operations/test_operation_parameters.py index 87c06eaf12..8cca13149c 100644 --- a/tests/core/operations/test_operation_parameters.py +++ b/tests/core/operations/test_operation_parameters.py @@ -1,7 +1,6 @@ from fedot.core.operations.operation_parameters import OperationParameters, get_default_params - def test_params_keeper_update(): params = {'a': 1, 'b': 2, 'c': 3} keeper = OperationParameters(**params) @@ -16,7 +15,6 @@ def test_params_keeper_update(): assert 'd' in changed_params - def test_params_keeper_get(): params = {'a': 1, 'b': 2, 'c': 3} keeper = OperationParameters(**params) @@ -28,7 +26,6 @@ def test_params_keeper_get(): assert d == 5 - def test_params_keeper_setdefault_and_defaults_from_repository(): keeper = OperationParameters(alpha=1.0) existing_value = keeper.setdefault('alpha', 2.0) diff --git a/tests/core/pipelines/__init__.py b/tests/core/pipelines/__init__.py index 8b13789179..e69de29bb2 100644 --- a/tests/core/pipelines/__init__.py +++ b/tests/core/pipelines/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/core/pipelines/test_node.py b/tests/core/pipelines/test_node.py index 6c0b1f30de..0e54ce6fa9 100644 --- a/tests/core/pipelines/test_node.py +++ b/tests/core/pipelines/test_node.py @@ -11,7 +11,6 @@ def 
get_params(self): return self._params - def test_pipeline_node_parameters_setter_normalizes_default_and_nested_params(): default_node = PipelineNode(operation_type='ridge') nested_node = PipelineNode(operation_type='ridge') @@ -23,7 +22,6 @@ def test_pipeline_node_parameters_setter_normalizes_default_and_nested_params(): assert nested_node.parameters['alpha'] == 1.0 - def test_pipeline_node_update_params_uses_typed_merge_rule(): node = PipelineNode(operation_type='ridge') node.parameters = {'alpha': 1.0} diff --git a/tests/core/pipelines/test_pipeline.py b/tests/core/pipelines/test_pipeline.py index a91436bac7..6022673c24 100644 --- a/tests/core/pipelines/test_pipeline.py +++ b/tests/core/pipelines/test_pipeline.py @@ -20,7 +20,6 @@ def apply_inverse_target_encoding(self, prediction): return prediction + 1 - def _make_ts_output(): return OutputData( idx=np.arange(2), @@ -31,7 +30,6 @@ def _make_ts_output(): ) - def _make_classification_input(is_auto_preprocessed: bool): supplementary_data = SupplementaryData(is_auto_preprocessed=is_auto_preprocessed) return InputData( @@ -44,7 +42,6 @@ def _make_classification_input(is_auto_preprocessed: bool): ) - def test_pipeline_postprocess_uses_typed_postprocess_plan(): pipeline = Pipeline(use_input_preprocessing=False) pipeline.preprocessor = _StubPreprocessor() @@ -55,10 +52,10 @@ def test_pipeline_postprocess_uses_typed_postprocess_plan(): assert result.predict.tolist() == [2.0, 3.0] - def test_pipeline_fit_skips_preprocessing_when_input_is_marked_auto_preprocessed(): pipeline = Pipeline(use_input_preprocessing=False) - pipeline._preprocess = lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError('_preprocess should not be called')) + pipeline._preprocess = lambda *args, **kwargs: (_ for _ in () + ).throw(AssertionError('_preprocess should not be called')) pipeline._assign_data_to_nodes = lambda data: data pipeline._fit = lambda input_data=None, predictions_cache=None, fold_id=None: 'ok' input_data = 
_make_classification_input(is_auto_preprocessed=True) diff --git a/tests/core/pipelines/test_pipeline_node_rules.py b/tests/core/pipelines/test_pipeline_node_rules.py index 91418b733b..54de16d8c6 100644 --- a/tests/core/pipelines/test_pipeline_node_rules.py +++ b/tests/core/pipelines/test_pipeline_node_rules.py @@ -6,7 +6,6 @@ from fedot.core.utils import DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL - def test_normalize_node_parameters_handles_default_stub_and_nested_params(): assert normalize_node_parameters(DEFAULT_PARAMS_STUB, DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL) == {} assert normalize_node_parameters( @@ -17,7 +16,6 @@ def test_normalize_node_parameters_handles_default_stub_and_nested_params(): assert normalize_node_parameters({'beta': 2.0}, DEFAULT_PARAMS_STUB, NESTED_PARAMS_LABEL) == {'beta': 2.0} - def test_merge_node_parameters_and_update_rule_are_explicit(): merged = merge_node_parameters({'alpha': 1.0}, {'beta': 2.0}) diff --git a/tests/core/pipelines/test_pipeline_rules.py b/tests/core/pipelines/test_pipeline_rules.py index 675a71e08b..6827c9114b 100644 --- a/tests/core/pipelines/test_pipeline_rules.py +++ b/tests/core/pipelines/test_pipeline_rules.py @@ -5,7 +5,6 @@ from fedot.core.repository.tasks import TaskTypesEnum - def test_build_pipeline_preprocess_plan_handles_fit_and_predict_stages(): fit_plan = build_pipeline_preprocess_plan(is_fit_stage=True, is_input_auto_preprocessed=False) predict_plan = build_pipeline_preprocess_plan(is_fit_stage=False, is_input_auto_preprocessed=True) @@ -16,7 +15,6 @@ def test_build_pipeline_preprocess_plan_handles_fit_and_predict_stages(): assert predict_plan.should_update_time_series_indices is True - def test_build_pipeline_postprocess_plan_handles_labels_and_ts_outputs(): labels_plan = build_pipeline_postprocess_plan('labels', TaskTypesEnum.classification) ts_plan = build_pipeline_postprocess_plan('default', TaskTypesEnum.ts_forecasting) diff --git a/tests/core/repository/__init__.py 
b/tests/core/repository/__init__.py index 8b13789179..e69de29bb2 100644 --- a/tests/core/repository/__init__.py +++ b/tests/core/repository/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/core/repository/test_pipeline_operation_repository.py b/tests/core/repository/test_pipeline_operation_repository.py index 32c8935758..24eb548e14 100644 --- a/tests/core/repository/test_pipeline_operation_repository.py +++ b/tests/core/repository/test_pipeline_operation_repository.py @@ -5,7 +5,6 @@ from fedot.extensions.registry import clear_extension_registry, register_extension - def _make_manifest(): return ExtensionManifest( name='pipeline_repository_extension', @@ -24,7 +23,6 @@ def _make_manifest(): ) - def test_from_available_operations_returns_self_and_keeps_registered_extension(): clear_extension_registry() register_extension(_make_manifest()) diff --git a/tests/extensions/__init__.py b/tests/extensions/__init__.py index 8b13789179..e69de29bb2 100644 --- a/tests/extensions/__init__.py +++ b/tests/extensions/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/extensions/test_parameter_rules.py b/tests/extensions/test_parameter_rules.py index a387a9165c..7d105d41f9 100644 --- a/tests/extensions/test_parameter_rules.py +++ b/tests/extensions/test_parameter_rules.py @@ -11,7 +11,6 @@ from fedot.core.repository.tasks import TaskTypesEnum - def _make_model_spec(): return ExternalModelSpec( name='external_with_schema', @@ -29,7 +28,6 @@ def _make_model_spec(): ) - def test_extension_parameter_rules_apply_defaults_and_filter_runtime_keys(): normalized = normalize_extension_user_params({'alpha': 1.0}) with_defaults = apply_extension_defaults({'beta': 0.5}, normalized) @@ -44,7 +42,6 @@ def test_extension_parameter_rules_apply_defaults_and_filter_runtime_keys(): assert factory_params == {'beta': 0.5, 'alpha': 1.0} - def test_extension_parameter_rules_detect_missing_required_params(): missing = find_missing_required_params(('alpha', 'gamma'), {'alpha': 1.0}) resolution = 
resolve_extension_params(_make_model_spec(), {'beta': 1.5}) @@ -54,7 +51,6 @@ def test_extension_parameter_rules_detect_missing_required_params(): assert resolution.monoid[0].details['required'] == ['alpha'] - def test_extension_parameter_rules_return_resolved_params_when_schema_is_satisfied(): resolution = resolve_extension_params(_make_model_spec(), {'alpha': 1.0}) diff --git a/tests/extensions/test_runtime_rules.py b/tests/extensions/test_runtime_rules.py index 583d919c93..0c8ddd1d30 100644 --- a/tests/extensions/test_runtime_rules.py +++ b/tests/extensions/test_runtime_rules.py @@ -30,7 +30,6 @@ def predict(self, features): return np.zeros(features.shape[0]) - def _make_manifest(): return ExtensionManifest( name='runtime_extension', @@ -54,7 +53,6 @@ def _make_manifest(): ) - def test_runtime_rules_resolve_registered_extension_model_and_build_strategy_params(): clear_extension_registry() register_extension(_make_manifest()) @@ -75,7 +73,6 @@ def test_runtime_rules_resolve_registered_extension_model_and_build_strategy_par clear_extension_registry() - def test_runtime_rules_return_left_when_required_extension_params_are_missing(): clear_extension_registry() register_extension(_make_manifest()) diff --git a/tests/preprocessing/__init__.py b/tests/preprocessing/__init__.py index e02abfc9b0..5f282702bb 100644 --- a/tests/preprocessing/__init__.py +++ b/tests/preprocessing/__init__.py @@ -1 +1 @@ - + \ No newline at end of file diff --git a/tests/preprocessing/test_base_preprocessing.py b/tests/preprocessing/test_base_preprocessing.py index 448a175c3a..6f717056ea 100644 --- a/tests/preprocessing/test_base_preprocessing.py +++ b/tests/preprocessing/test_base_preprocessing.py @@ -48,7 +48,6 @@ def reduce_memory_size(self, data): return data - def _make_input_data(*, is_main_target=True): return InputData( idx=np.array([0, 1]), @@ -60,7 +59,6 @@ def _make_input_data(*, is_main_target=True): ) - def _make_optional_input_data(): data = InputData( idx=np.array([0, 1]), @@ 
-75,7 +73,6 @@ def _make_optional_input_data(): return data - def test_mark_as_preprocessed_marks_unimodal_and_multimodal_inputs(): input_data = _make_input_data() multi_data = MultiModalData({'main': _make_input_data(), 'side': _make_input_data(is_main_target=False)}) @@ -88,7 +85,6 @@ def test_mark_as_preprocessed_marks_unimodal_and_multimodal_inputs(): assert multi_data['side'].supplementary_data.optionally_preprocessed is True - def test_merge_preprocessors_uses_typed_merge_plan(): api_preprocessor = _FakePreprocessor() pipeline_preprocessor = _FakePreprocessor() @@ -105,7 +101,6 @@ def test_merge_preprocessors_uses_typed_merge_plan(): assert merged.features_imputers == pipeline_preprocessor.features_imputers - def test_data_preprocessor_initialization_uses_source_and_target_rules(): preprocessor = DataPreprocessor() multi_data = MultiModalData({ @@ -121,7 +116,6 @@ def test_data_preprocessor_initialization_uses_source_and_target_rules(): assert preprocessor.main_target_source_name == 'main' - def test_prepare_optional_uses_typed_optional_plan_and_target_source_resolution(): preprocessor = DataPreprocessor() data = _make_optional_input_data() diff --git a/tests/preprocessing/test_preprocessing_rules.py b/tests/preprocessing/test_preprocessing_rules.py index fc0bcd0068..e0169ed279 100644 --- a/tests/preprocessing/test_preprocessing_rules.py +++ b/tests/preprocessing/test_preprocessing_rules.py @@ -18,7 +18,6 @@ from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME - def _make_input_data(*, is_main_target=True): return InputData( idx=np.array([0, 1]), @@ -30,7 +29,6 @@ def _make_input_data(*, is_main_target=True): ) - def test_resolve_source_names_handles_unimodal_and_multimodal(): unimodal_plan = resolve_source_names(_make_input_data(), DEFAULT_SOURCE_NAME) multimodal_plan = resolve_source_names( @@ -42,20 +40,17 @@ def test_resolve_source_names_handles_unimodal_and_multimodal(): assert multimodal_plan.source_names == ('left', 'right') - def 
test_resolve_source_names_rejects_unknown_data_type(): with pytest.raises(ValueError, match='Unknown type of data'): resolve_source_names(object(), DEFAULT_SOURCE_NAME) - def test_should_initialize_source_helpers_reflects_existing_state(): assert should_initialize_source_helpers(False, False) is True assert should_initialize_source_helpers(True, False) is True assert should_initialize_source_helpers(True, True) is False - def test_resolve_main_target_source_name_prefers_existing_then_detects_main_branch(): multi_data = MultiModalData({ 'main': _make_input_data(is_main_target=True), @@ -66,7 +61,6 @@ def test_resolve_main_target_source_name_prefers_existing_then_detects_main_bran assert resolve_main_target_source_name(None, multi_data) == 'main' - def test_iter_preprocessed_inputs_and_merge_plan_are_deterministic(): input_data = _make_input_data() multi_data = MultiModalData({'main': input_data, 'side': _make_input_data(is_main_target=False)}) @@ -83,7 +77,6 @@ def test_iter_preprocessed_inputs_and_merge_plan_are_deterministic(): assert manual_plan.take_pipeline_imputers is True - def test_build_optional_preprocessing_plan_and_target_source_resolution_are_explicit(): optional_plan = build_optional_preprocessing_plan( has_missing_values=True, diff --git a/tests/remote/__init__.py b/tests/remote/__init__.py index 8b13789179..e69de29bb2 100644 --- a/tests/remote/__init__.py +++ b/tests/remote/__init__.py @@ -1 +0,0 @@ -