From a08129feacb11572fb1cb49f0cbc4eab43a728cc Mon Sep 17 00:00:00 2001 From: Bumjin Kim Date: Tue, 16 Sep 2025 13:00:57 +0900 Subject: [PATCH 1/7] feat: Add DatetimeOrdinal transformer and unit tests (#818) Implement the new DatetimeOrdinal transformer for converting datetime features to their ordinal representation. This commit includes the transformer class itself and a full suite of pytest unit tests to ensure its correctness and robustness. --- docs/api_doc/datetime/DatetimeOrdinal.rst | 6 + docs/api_doc/datetime/index.rst | 1 + feature_engine/datetime/__init__.py | 3 +- feature_engine/datetime/datetime_ordinal.py | 249 +++++++++++++++++++ tests/test_datetime/test_datetime_ordinal.py | 153 ++++++++++++ 5 files changed, 411 insertions(+), 1 deletion(-) create mode 100644 docs/api_doc/datetime/DatetimeOrdinal.rst create mode 100644 feature_engine/datetime/datetime_ordinal.py create mode 100644 tests/test_datetime/test_datetime_ordinal.py diff --git a/docs/api_doc/datetime/DatetimeOrdinal.rst b/docs/api_doc/datetime/DatetimeOrdinal.rst new file mode 100644 index 000000000..f5960c8e0 --- /dev/null +++ b/docs/api_doc/datetime/DatetimeOrdinal.rst @@ -0,0 +1,6 @@ +DatetimeOrdinal +=============== + +.. automodule:: feature_engine.datetime.datetime_ordinal + :members: + diff --git a/docs/api_doc/datetime/index.rst b/docs/api_doc/datetime/index.rst index c81b6bef8..0b92d966d 100644 --- a/docs/api_doc/datetime/index.rst +++ b/docs/api_doc/datetime/index.rst @@ -11,4 +11,5 @@ features from existing datetime or object-like data. 
DatetimeFeatures DatetimeSubtraction + DatetimeOrdinal diff --git a/feature_engine/datetime/__init__.py b/feature_engine/datetime/__init__.py index aea69f9fe..a39f02ba7 100644 --- a/feature_engine/datetime/__init__.py +++ b/feature_engine/datetime/__init__.py @@ -2,5 +2,6 @@ from .datetime import DatetimeFeatures from .datetime_subtraction import DatetimeSubtraction +from .datetime_ordinal import DatetimeOrdinal -__all__ = ["DatetimeFeatures", "DatetimeSubtraction"] +__all__ = ["DatetimeFeatures", "DatetimeSubtraction", "DatetimeOrdinal"] diff --git a/feature_engine/datetime/datetime_ordinal.py b/feature_engine/datetime/datetime_ordinal.py new file mode 100644 index 000000000..8cdaef9f6 --- /dev/null +++ b/feature_engine/datetime/datetime_ordinal.py @@ -0,0 +1,249 @@ +from typing import List, Optional, Union +import datetime + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) +from feature_engine._docstrings.fit_attributes import ( + _feature_names_in_docstring, + _n_features_in_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_not_learn_docstring, + _fit_transform_docstring, +) +from feature_engine._docstrings.substitute import Substitution +from feature_engine.dataframe_checks import ( + _check_contains_na, + _check_X_matches_training_df, + check_X, +) +from feature_engine.variable_handling.check_variables import check_datetime_variables +from feature_engine.variable_handling.find_variables import find_datetime_variables + + +@Substitution( + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit=_fit_not_learn_docstring, + fit_transform=_fit_transform_docstring, +) +class DatetimeOrdinal(TransformerMixin, BaseEstimator, 
GetFeatureNamesOutMixin): + """ + DatetimeOrdinal transforms datetime variables into their ordinal representation. + The ordinal representation is an integer value representing the number of days + since January 1, 0001 in the Gregorian calendar. + + Optionally, a `start_date` can be provided to set a custom reference point, + making the ordinal values relative to this date (starting from 1). This can be + useful for reducing the magnitude of the ordinal values and for aligning them + to a specific project timeline. + + Parameters + ---------- + variables: str, list, default=None + List with the variables from which date and time information will be extracted. + If None, the transformer will find and select all datetime variables, + including variables of type object that can be converted to datetime. + + missing_values: string, default='raise' + Indicates if missing values should be ignored or raised. If 'raise' the + transformer will return an error if the datasets to `fit` or `transform` + contain missing values. If 'ignore', missing data will be ignored when + performing the feature extraction. + + start_date: str, datetime.datetime, default=None + A reference date from which the ordinal values will be calculated. + If provided, the ordinal value of `start_date` will be subtracted from + each datetime variable's ordinal value, and 1 will be added, so the + `start_date` itself corresponds to an ordinal value of 1. + If None, the standard `datetime.toordinal()` value will be used. + The `start_date` can be a string (e.g., "YYYY-MM-DD") or a datetime object. + + drop_original: bool, default=True + If True, the original datetime variables will be dropped from the dataframe. + + Attributes + ---------- + variables_: + List of variables from which date and time features will be extracted. + + start_date_ordinal_: + The ordinal value of the provided `start_date`, if applicable. 
+ + {feature_names_in_} + + {n_features_in_} + + Methods + ------- + {fit} + + {fit_transform} + + transform: + Add the ordinal datetime features. + + See also + -------- + pandas.to_datetime + datetime.toordinal + + Examples + -------- + >>> import pandas as pd + >>> from feature_engine.datetime import DatetimeOrdinal + >>> X = pd.DataFrame(dict(date = ["2023-01-01", "2023-01-02", "2023-01-03"])) + >>> dtf = DatetimeOrdinal(start_date="2023-01-01") + >>> dtf.fit(X) + >>> dtf.transform(X) + date_ordinal + 0 1 + 1 2 + 2 3 + """ + + def __init__( + self, + variables: Union[None, int, str, List[Union[str, int]]] = None, + missing_values: str = "raise", + start_date: Union[None, str, datetime.datetime] = None, + drop_original: bool = True, + ) -> None: + + if missing_values not in ["raise", "ignore"]: + raise ValueError( + "missing_values takes only values 'raise' or 'ignore'. " + f"Got {missing_values} instead." + ) + + if start_date is not None: + try: + self.start_date_ = pd.to_datetime(start_date) + except Exception as e: + raise ValueError( + f"start_date could not be converted to datetime. " + f"Got {start_date} instead. Error: {e}" + ) + else: + self.start_date_ = None + + if not isinstance(drop_original, bool): + raise ValueError( + "drop_original takes only booleans True or False. " + f"Got {drop_original} instead." + ) + + self.variables = _check_variables_input_value(variables) + self.missing_values = missing_values + self.drop_original = drop_original + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn any parameter. + + Finds datetime variables or checks that the variables selected by the user + can be converted to datetime. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. Can be the entire dataframe, not just the + variables to transform. + + y: pandas Series=None + It is not needed in this transformer. You can pass y or None. 
+ """ + # check input dataframe + X = check_X(X) + + if self.variables is None: + self.variables_ = find_datetime_variables(X) + else: + self.variables_ = check_datetime_variables(X, self.variables) + + # check if datetime variables contains na + if self.missing_values == "raise": + _check_contains_na(X, self.variables_) + + if self.start_date_ is not None: + self.start_date_ordinal_ = self.start_date_.toordinal() + else: + self.start_date_ordinal_ = None + + # save input features + self.feature_names_in_ = X.columns.tolist() + + # save train set shape + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Extract the ordinal datetime features and add them to the dataframe. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. +, default + Returns + ------- + X_new: Pandas dataframe, shape = [n_samples, n_features x n_df_features] + The dataframe with the original variables plus the new variables. + """ + + # Check method fit has been called + check_is_fitted(self) + + # check that input is a dataframe + X = check_X(X) + + # Check if input data contains same number of columns as dataframe used to fit. 
+ _check_X_matches_training_df(X, self.n_features_in_) + + # reorder variables to match train set + X = X[self.feature_names_in_] + + # create a copy(to protect original data) + X_new = X.copy() + + # check if dataset contains na + if self.missing_values == "raise": + _check_contains_na(X_new, self.variables_) + + for var in self.variables_: + # Convert to datetime, then to ordinal + datetime_series = pd.to_datetime(X_new[var]) + # Handle NaT values: toordinal() raises ValueError for NaT + ordinal_series = datetime_series.apply(lambda x: x.toordinal() if pd.notna(x) else pd.NA) + + if self.start_date_ordinal_ is not None: + # Only apply offset if not NaT + ordinal_series = ordinal_series.apply(lambda x: x - self.start_date_ordinal_ + 1 if pd.notna(x) else pd.NA) + + X_new[str(var) + "_ordinal"] = ordinal_series + + if self.drop_original: + X_new.drop(self.variables_, axis=1, inplace=True) + + return X_new + + def _get_new_features_name(self) -> List: + """create the names for the new features.""" + feature_names = [str(var) + "_ordinal" for var in self.variables_] + return feature_names + + def _more_tags(self): + tags_dict = {"variables": "datetime"} + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + return tags diff --git a/tests/test_datetime/test_datetime_ordinal.py b/tests/test_datetime/test_datetime_ordinal.py new file mode 100644 index 000000000..c39ae27d5 --- /dev/null +++ b/tests/test_datetime/test_datetime_ordinal.py @@ -0,0 +1,153 @@ +import datetime +import pandas as pd +import pytest + +from feature_engine.datetime import DatetimeOrdinal + + +@pytest.fixture(scope="module") +def df_datetime_ordinal(): + df = pd.DataFrame({ + "date_col_1": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"]), + "date_col_2": pd.to_datetime(["2024-02-10", "2024-02-11", "2024-02-12", "2024-02-13", "2024-02-14"]), + "non_date_col": [1, 2, 3, 4, 5], + }) + return df + + +@pytest.fixture(scope="module") 
+def df_datetime_ordinal_na(): + df = pd.DataFrame({ + "date_col_1": pd.to_datetime(["2023-01-01", "2023-01-02", None, "2023-01-04", "2023-01-05"]), + "date_col_2": pd.to_datetime(["2024-02-10", "2024-02-11", "2024-02-12", None, "2024-02-14"]), + }) + return df + + +def test_datetime_ordinal_no_start_date(df_datetime_ordinal): + transformer = DatetimeOrdinal(variables=["date_col_1", "date_col_2"]) + X_transformed = transformer.fit_transform(df_datetime_ordinal) + + expected_ordinal_1 = pd.Series([d.toordinal() for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal") + expected_ordinal_2 = pd.Series([d.toordinal() for d in df_datetime_ordinal["date_col_2"]], name="date_col_2_ordinal") + + pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) + pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], expected_ordinal_2) + assert "non_date_col" in X_transformed.columns + assert "date_col_1" not in X_transformed.columns + assert "date_col_2" not in X_transformed.columns + + +def test_datetime_ordinal_with_start_date(df_datetime_ordinal): + start_date_str = "2023-01-01" + transformer = DatetimeOrdinal(variables=["date_col_1"], start_date=start_date_str) + X_transformed = transformer.fit_transform(df_datetime_ordinal) + + start_ordinal = pd.to_datetime(start_date_str).toordinal() + expected_ordinal = pd.Series([d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal") + + pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal) + assert "date_col_2" in X_transformed.columns + assert "date_col_1" not in X_transformed.columns + + +def test_datetime_ordinal_with_start_date_datetime_object(df_datetime_ordinal): + start_date_obj = datetime.date(2023, 1, 1) + transformer = DatetimeOrdinal(variables=["date_col_1"], start_date=start_date_obj) + X_transformed = transformer.fit_transform(df_datetime_ordinal) + + start_ordinal = 
pd.to_datetime(start_date_obj).toordinal() + expected_ordinal = pd.Series([d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal") + + pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal) + + +def test_datetime_ordinal_no_variables_specified(df_datetime_ordinal): + transformer = DatetimeOrdinal() + X_transformed = transformer.fit_transform(df_datetime_ordinal) + + expected_ordinal_1 = pd.Series([d.toordinal() for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal") + expected_ordinal_2 = pd.Series([d.toordinal() for d in df_datetime_ordinal["date_col_2"]], name="date_col_2_ordinal") + + pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) + pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], expected_ordinal_2) + assert "non_date_col" in X_transformed.columns + assert "date_col_1" not in X_transformed.columns + assert "date_col_2" not in X_transformed.columns + + +def test_datetime_ordinal_missing_values_raise(df_datetime_ordinal_na): + transformer = DatetimeOrdinal(missing_values="raise") + with pytest.raises(ValueError): + transformer.fit(df_datetime_ordinal_na) + + +def test_datetime_ordinal_missing_values_ignore(df_datetime_ordinal_na): + transformer = DatetimeOrdinal(missing_values="ignore") + X_transformed = transformer.fit_transform(df_datetime_ordinal_na) + + # Expected values for date_col_1_ordinal, handling None + expected_ordinal_1 = pd.Series([d.toordinal() if pd.notna(d) else pd.NA for d in df_datetime_ordinal_na["date_col_1"]], name="date_col_1_ordinal", dtype=object) + expected_ordinal_2 = pd.Series([d.toordinal() if pd.notna(d) else pd.NA for d in df_datetime_ordinal_na["date_col_2"]], name="date_col_2_ordinal", dtype=object) + + pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) + pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], 
expected_ordinal_2) + + +def test_datetime_ordinal_invalid_start_date(): + with pytest.raises(ValueError): + DatetimeOrdinal(start_date="not-a-date") + + +def test_datetime_ordinal_non_datetime_variable_error(df_datetime_ordinal): + transformer = DatetimeOrdinal(variables=["non_date_col"]) + with pytest.raises(TypeError): + transformer.fit(df_datetime_ordinal) + + +def test_datetime_ordinal_drop_original_false(df_datetime_ordinal): + transformer = DatetimeOrdinal(variables=["date_col_1"], drop_original=False) + X_transformed = transformer.fit_transform(df_datetime_ordinal) + + assert "date_col_1" in X_transformed.columns + assert "date_col_1_ordinal" in X_transformed.columns + assert "date_col_2" in X_transformed.columns + + +def test_datetime_ordinal_get_feature_names_out(df_datetime_ordinal): + transformer = DatetimeOrdinal(variables=["date_col_1", "date_col_2"]) + transformer.fit(df_datetime_ordinal) + feature_names_out = transformer.get_feature_names_out() + + expected_feature_names = ["date_col_1_ordinal", "date_col_2_ordinal", "non_date_col"] + assert sorted(feature_names_out) == sorted(expected_feature_names) + + +def test_datetime_ordinal_get_feature_names_out_with_input_features(df_datetime_ordinal): + transformer = DatetimeOrdinal(variables=["date_col_1"], drop_original=False) + transformer.fit(df_datetime_ordinal) + feature_names_out = transformer.get_feature_names_out(input_features=df_datetime_ordinal.columns.tolist()) + + expected_feature_names = ["date_col_1_ordinal", "date_col_2", "non_date_col", "date_col_1"] + assert sorted(feature_names_out) == sorted(expected_feature_names) + + +def test_datetime_ordinal_get_feature_names_out_with_input_features_drop_original(df_datetime_ordinal): + transformer = DatetimeOrdinal(variables=["date_col_1"], drop_original=True) + transformer.fit(df_datetime_ordinal) + feature_names_out = transformer.get_feature_names_out(input_features=df_datetime_ordinal.columns.tolist()) + + expected_feature_names = 
["date_col_1_ordinal", "date_col_2", "non_date_col"] + assert sorted(feature_names_out) == sorted(expected_feature_names) + + +def test_datetime_ordinal_non_datetime_variable_in_transform(df_datetime_ordinal): + transformer = DatetimeOrdinal(variables=["date_col_1"]) + transformer.fit(df_datetime_ordinal) + + # Create a new dataframe where 'date_col_1' is no longer datetime + X_test = df_datetime_ordinal.copy() + X_test["date_col_1"] = ["a", "b", "c", "d", "e"] + + with pytest.raises(ValueError): + transformer.transform(X_test) From 60ebd9569e2b668fd099a816104cd7fac35d9702 Mon Sep 17 00:00:00 2001 From: Bumjin Kim Date: Thu, 18 Sep 2025 17:11:32 +0900 Subject: [PATCH 2/7] fix: Correct CI build failures This commit fixes two issues that were causing the CI checks to fail for the DatetimeOrdinal transformer. - Corrected the docstring format in the transform() method to resolve the sphinx-build error. - Removed trailing whitespace from a test file to pass the flake8 style check. --- feature_engine/datetime/datetime_ordinal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/datetime/datetime_ordinal.py b/feature_engine/datetime/datetime_ordinal.py index 8cdaef9f6..d9f665425 100644 --- a/feature_engine/datetime/datetime_ordinal.py +++ b/feature_engine/datetime/datetime_ordinal.py @@ -192,7 +192,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: ---------- X: pandas dataframe of shape = [n_samples, n_features] The data to transform. -, default + Returns ------- X_new: Pandas dataframe, shape = [n_samples, n_features x n_df_features] From 72a260971e1fcc3fea9d0ef579b1123219ac2c3a Mon Sep 17 00:00:00 2001 From: Bumjin Kim Date: Thu, 18 Sep 2025 17:19:53 +0900 Subject: [PATCH 3/7] fix: Correct CI build failures This commit fixes an issue that was causing the CI checks to fail for the DatetimeOrdinal transformer. - Removed trailing whitespace from a test file to pass the flake8 style checks. 
--- tests/test_datetime/test_datetime_ordinal.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_datetime/test_datetime_ordinal.py b/tests/test_datetime/test_datetime_ordinal.py index c39ae27d5..5399e1989 100644 --- a/tests/test_datetime/test_datetime_ordinal.py +++ b/tests/test_datetime/test_datetime_ordinal.py @@ -144,7 +144,6 @@ def test_datetime_ordinal_get_feature_names_out_with_input_features_drop_origina def test_datetime_ordinal_non_datetime_variable_in_transform(df_datetime_ordinal): transformer = DatetimeOrdinal(variables=["date_col_1"]) transformer.fit(df_datetime_ordinal) - # Create a new dataframe where 'date_col_1' is no longer datetime X_test = df_datetime_ordinal.copy() X_test["date_col_1"] = ["a", "b", "c", "d", "e"] From 6dc4b9a5b9c1a1ffdf6c0f29e2eb2fc62843f645 Mon Sep 17 00:00:00 2001 From: Bumjin Kim Date: Thu, 18 Sep 2025 17:34:34 +0900 Subject: [PATCH 4/7] fix: Correct CI build failures This commit fixes an issue that was causing the CI checks to fail for the DatetimeOrdinal transformer. - Refactored long lines of code to resolve E501 errors reported by flake8. 
--- tests/test_datetime/test_datetime_ordinal.py | 60 +++++++++++++++----- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/tests/test_datetime/test_datetime_ordinal.py b/tests/test_datetime/test_datetime_ordinal.py index 5399e1989..682c21744 100644 --- a/tests/test_datetime/test_datetime_ordinal.py +++ b/tests/test_datetime/test_datetime_ordinal.py @@ -8,8 +8,12 @@ @pytest.fixture(scope="module") def df_datetime_ordinal(): df = pd.DataFrame({ - "date_col_1": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"]), - "date_col_2": pd.to_datetime(["2024-02-10", "2024-02-11", "2024-02-12", "2024-02-13", "2024-02-14"]), + "date_col_1": pd.to_datetime( + ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"] + ), + "date_col_2": pd.to_datetime( + ["2024-02-10", "2024-02-11", "2024-02-12", "2024-02-13", "2024-02-14"] + ), "non_date_col": [1, 2, 3, 4, 5], }) return df @@ -18,8 +22,12 @@ def df_datetime_ordinal(): @pytest.fixture(scope="module") def df_datetime_ordinal_na(): df = pd.DataFrame({ - "date_col_1": pd.to_datetime(["2023-01-01", "2023-01-02", None, "2023-01-04", "2023-01-05"]), - "date_col_2": pd.to_datetime(["2024-02-10", "2024-02-11", "2024-02-12", None, "2024-02-14"]), + "date_col_1": pd.to_datetime( + ["2023-01-01", "2023-01-02", None, "2023-01-04", "2023-01-05"] + ), + "date_col_2": pd.to_datetime( + ["2024-02-10", "2024-02-11", "2024-02-12", None, "2024-02-14"] + ), }) return df @@ -28,8 +36,12 @@ def test_datetime_ordinal_no_start_date(df_datetime_ordinal): transformer = DatetimeOrdinal(variables=["date_col_1", "date_col_2"]) X_transformed = transformer.fit_transform(df_datetime_ordinal) - expected_ordinal_1 = pd.Series([d.toordinal() for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal") - expected_ordinal_2 = pd.Series([d.toordinal() for d in df_datetime_ordinal["date_col_2"]], name="date_col_2_ordinal") + expected_ordinal_1 = pd.Series( + [d.toordinal() for d in 
df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal" + ) + expected_ordinal_2 = pd.Series( + [d.toordinal() for d in df_datetime_ordinal["date_col_2"]], name="date_col_2_ordinal" + ) pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], expected_ordinal_2) @@ -44,7 +56,10 @@ def test_datetime_ordinal_with_start_date(df_datetime_ordinal): X_transformed = transformer.fit_transform(df_datetime_ordinal) start_ordinal = pd.to_datetime(start_date_str).toordinal() - expected_ordinal = pd.Series([d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal") + expected_ordinal = pd.Series( + [d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], + name="date_col_1_ordinal" + ) pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal) assert "date_col_2" in X_transformed.columns @@ -57,7 +72,10 @@ def test_datetime_ordinal_with_start_date_datetime_object(df_datetime_ordinal): X_transformed = transformer.fit_transform(df_datetime_ordinal) start_ordinal = pd.to_datetime(start_date_obj).toordinal() - expected_ordinal = pd.Series([d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal") + expected_ordinal = pd.Series( + [d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], + name="date_col_1_ordinal" + ) pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal) @@ -66,8 +84,12 @@ def test_datetime_ordinal_no_variables_specified(df_datetime_ordinal): transformer = DatetimeOrdinal() X_transformed = transformer.fit_transform(df_datetime_ordinal) - expected_ordinal_1 = pd.Series([d.toordinal() for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal") - expected_ordinal_2 = pd.Series([d.toordinal() for d in df_datetime_ordinal["date_col_2"]], 
name="date_col_2_ordinal") + expected_ordinal_1 = pd.Series( + [d.toordinal() for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal" + ) + expected_ordinal_2 = pd.Series( + [d.toordinal() for d in df_datetime_ordinal["date_col_2"]], name="date_col_2_ordinal" + ) pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], expected_ordinal_2) @@ -87,8 +109,14 @@ def test_datetime_ordinal_missing_values_ignore(df_datetime_ordinal_na): X_transformed = transformer.fit_transform(df_datetime_ordinal_na) # Expected values for date_col_1_ordinal, handling None - expected_ordinal_1 = pd.Series([d.toordinal() if pd.notna(d) else pd.NA for d in df_datetime_ordinal_na["date_col_1"]], name="date_col_1_ordinal", dtype=object) - expected_ordinal_2 = pd.Series([d.toordinal() if pd.notna(d) else pd.NA for d in df_datetime_ordinal_na["date_col_2"]], name="date_col_2_ordinal", dtype=object) + expected_ordinal_1 = pd.Series( + [d.toordinal() if pd.notna(d) else pd.NA for d in df_datetime_ordinal_na["date_col_1"]], + name="date_col_1_ordinal", dtype=object + ) + expected_ordinal_2 = pd.Series( + [d.toordinal() if pd.notna(d) else pd.NA for d in df_datetime_ordinal_na["date_col_2"]], + name="date_col_2_ordinal", dtype=object + ) pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], expected_ordinal_2) @@ -126,7 +154,9 @@ def test_datetime_ordinal_get_feature_names_out(df_datetime_ordinal): def test_datetime_ordinal_get_feature_names_out_with_input_features(df_datetime_ordinal): transformer = DatetimeOrdinal(variables=["date_col_1"], drop_original=False) transformer.fit(df_datetime_ordinal) - feature_names_out = transformer.get_feature_names_out(input_features=df_datetime_ordinal.columns.tolist()) + feature_names_out = transformer.get_feature_names_out( + 
input_features=df_datetime_ordinal.columns.tolist() + ) expected_feature_names = ["date_col_1_ordinal", "date_col_2", "non_date_col", "date_col_1"] assert sorted(feature_names_out) == sorted(expected_feature_names) @@ -135,7 +165,9 @@ def test_datetime_ordinal_get_feature_names_out_with_input_features(df_datetime_ def test_datetime_ordinal_get_feature_names_out_with_input_features_drop_original(df_datetime_ordinal): transformer = DatetimeOrdinal(variables=["date_col_1"], drop_original=True) transformer.fit(df_datetime_ordinal) - feature_names_out = transformer.get_feature_names_out(input_features=df_datetime_ordinal.columns.tolist()) + feature_names_out = transformer.get_feature_names_out( + input_features=df_datetime_ordinal.columns.tolist() + ) expected_feature_names = ["date_col_1_ordinal", "date_col_2", "non_date_col"] assert sorted(feature_names_out) == sorted(expected_feature_names) From 51d39bf618ea6c26ca9faea7d55d6aa59fdeface Mon Sep 17 00:00:00 2001 From: Bumjin Kim Date: Thu, 18 Sep 2025 17:44:37 +0900 Subject: [PATCH 5/7] fix: Correct CI build failures --- feature_engine/datetime/datetime_ordinal.py | 8 +- tests/test_datetime/test_datetime_ordinal.py | 91 ++++++++++++++------ 2 files changed, 72 insertions(+), 27 deletions(-) diff --git a/feature_engine/datetime/datetime_ordinal.py b/feature_engine/datetime/datetime_ordinal.py index d9f665425..28fed0436 100644 --- a/feature_engine/datetime/datetime_ordinal.py +++ b/feature_engine/datetime/datetime_ordinal.py @@ -222,11 +222,15 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # Convert to datetime, then to ordinal datetime_series = pd.to_datetime(X_new[var]) # Handle NaT values: toordinal() raises ValueError for NaT - ordinal_series = datetime_series.apply(lambda x: x.toordinal() if pd.notna(x) else pd.NA) + ordinal_series = datetime_series.apply( + lambda x: x.toordinal() if pd.notna(x) else pd.NA + ) if self.start_date_ordinal_ is not None: # Only apply offset if not NaT - ordinal_series = 
ordinal_series.apply(lambda x: x - self.start_date_ordinal_ + 1 if pd.notna(x) else pd.NA) + ordinal_series = ordinal_series.apply( + lambda x: x - self.start_date_ordinal_ + 1 if pd.notna(x) else pd.NA + ) X_new[str(var) + "_ordinal"] = ordinal_series diff --git a/tests/test_datetime/test_datetime_ordinal.py b/tests/test_datetime/test_datetime_ordinal.py index 682c21744..e127f7765 100644 --- a/tests/test_datetime/test_datetime_ordinal.py +++ b/tests/test_datetime/test_datetime_ordinal.py @@ -37,14 +37,20 @@ def test_datetime_ordinal_no_start_date(df_datetime_ordinal): X_transformed = transformer.fit_transform(df_datetime_ordinal) expected_ordinal_1 = pd.Series( - [d.toordinal() for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal" + [d.toordinal() for d in df_datetime_ordinal["date_col_1"]], + name="date_col_1_ordinal", ) expected_ordinal_2 = pd.Series( - [d.toordinal() for d in df_datetime_ordinal["date_col_2"]], name="date_col_2_ordinal" + [d.toordinal() for d in df_datetime_ordinal["date_col_2"]], + name="date_col_2_ordinal", ) - pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) - pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], expected_ordinal_2) + pd.testing.assert_series_equal( + X_transformed["date_col_1_ordinal"], expected_ordinal_1 + ) + pd.testing.assert_series_equal( + X_transformed["date_col_2_ordinal"], expected_ordinal_2 + ) assert "non_date_col" in X_transformed.columns assert "date_col_1" not in X_transformed.columns assert "date_col_2" not in X_transformed.columns @@ -57,11 +63,13 @@ def test_datetime_ordinal_with_start_date(df_datetime_ordinal): start_ordinal = pd.to_datetime(start_date_str).toordinal() expected_ordinal = pd.Series( - [d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], - name="date_col_1_ordinal" + [d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], + name="date_col_1_ordinal", ) - 
pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal) + pd.testing.assert_series_equal( + X_transformed["date_col_1_ordinal"], expected_ordinal + ) assert "date_col_2" in X_transformed.columns assert "date_col_1" not in X_transformed.columns @@ -73,11 +81,13 @@ def test_datetime_ordinal_with_start_date_datetime_object(df_datetime_ordinal): start_ordinal = pd.to_datetime(start_date_obj).toordinal() expected_ordinal = pd.Series( - [d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], - name="date_col_1_ordinal" + [d.toordinal() - start_ordinal + 1 for d in df_datetime_ordinal["date_col_1"]], + name="date_col_1_ordinal", ) - pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal) + pd.testing.assert_series_equal( + X_transformed["date_col_1_ordinal"], expected_ordinal + ) def test_datetime_ordinal_no_variables_specified(df_datetime_ordinal): @@ -85,14 +95,20 @@ def test_datetime_ordinal_no_variables_specified(df_datetime_ordinal): X_transformed = transformer.fit_transform(df_datetime_ordinal) expected_ordinal_1 = pd.Series( - [d.toordinal() for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal" + [d.toordinal() for d in df_datetime_ordinal["date_col_1"]], + name="date_col_1_ordinal", ) expected_ordinal_2 = pd.Series( - [d.toordinal() for d in df_datetime_ordinal["date_col_2"]], name="date_col_2_ordinal" + [d.toordinal() for d in df_datetime_ordinal["date_col_2"]], + name="date_col_2_ordinal", ) - pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) - pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], expected_ordinal_2) + pd.testing.assert_series_equal( + X_transformed["date_col_1_ordinal"], expected_ordinal_1 + ) + pd.testing.assert_series_equal( + X_transformed["date_col_2_ordinal"], expected_ordinal_2 + ) assert "non_date_col" in X_transformed.columns assert "date_col_1" not in X_transformed.columns assert 
"date_col_2" not in X_transformed.columns @@ -110,16 +126,28 @@ def test_datetime_ordinal_missing_values_ignore(df_datetime_ordinal_na): # Expected values for date_col_1_ordinal, handling None expected_ordinal_1 = pd.Series( - [d.toordinal() if pd.notna(d) else pd.NA for d in df_datetime_ordinal_na["date_col_1"]], - name="date_col_1_ordinal", dtype=object + [ + d.toordinal() if pd.notna(d) else pd.NA + for d in df_datetime_ordinal_na["date_col_1"] + ], + name="date_col_1_ordinal", + dtype=object, ) expected_ordinal_2 = pd.Series( - [d.toordinal() if pd.notna(d) else pd.NA for d in df_datetime_ordinal_na["date_col_2"]], - name="date_col_2_ordinal", dtype=object + [ + d.toordinal() if pd.notna(d) else pd.NA + for d in df_datetime_ordinal_na["date_col_2"] + ], + name="date_col_2_ordinal", + dtype=object, ) - pd.testing.assert_series_equal(X_transformed["date_col_1_ordinal"], expected_ordinal_1) - pd.testing.assert_series_equal(X_transformed["date_col_2_ordinal"], expected_ordinal_2) + pd.testing.assert_series_equal( + X_transformed["date_col_1_ordinal"], expected_ordinal_1 + ) + pd.testing.assert_series_equal( + X_transformed["date_col_2_ordinal"], expected_ordinal_2 + ) def test_datetime_ordinal_invalid_start_date(): @@ -147,22 +175,35 @@ def test_datetime_ordinal_get_feature_names_out(df_datetime_ordinal): transformer.fit(df_datetime_ordinal) feature_names_out = transformer.get_feature_names_out() - expected_feature_names = ["date_col_1_ordinal", "date_col_2_ordinal", "non_date_col"] + expected_feature_names = [ + "date_col_1_ordinal", + "date_col_2_ordinal", + "non_date_col", + ] assert sorted(feature_names_out) == sorted(expected_feature_names) -def test_datetime_ordinal_get_feature_names_out_with_input_features(df_datetime_ordinal): +def test_datetime_ordinal_get_feature_names_out_with_input_features( + df_datetime_ordinal, +): transformer = DatetimeOrdinal(variables=["date_col_1"], drop_original=False) transformer.fit(df_datetime_ordinal) feature_names_out = 
transformer.get_feature_names_out( input_features=df_datetime_ordinal.columns.tolist() ) - expected_feature_names = ["date_col_1_ordinal", "date_col_2", "non_date_col", "date_col_1"] + expected_feature_names = [ + "date_col_1_ordinal", + "date_col_2", + "non_date_col", + "date_col_1", + ] assert sorted(feature_names_out) == sorted(expected_feature_names) -def test_datetime_ordinal_get_feature_names_out_with_input_features_drop_original(df_datetime_ordinal): +def test_datetime_ordinal_get_feature_names_out_with_input_features_drop_original( + df_datetime_ordinal, +): transformer = DatetimeOrdinal(variables=["date_col_1"], drop_original=True) transformer.fit(df_datetime_ordinal) feature_names_out = transformer.get_feature_names_out( @@ -181,4 +222,4 @@ def test_datetime_ordinal_non_datetime_variable_in_transform(df_datetime_ordinal X_test["date_col_1"] = ["a", "b", "c", "d", "e"] with pytest.raises(ValueError): - transformer.transform(X_test) + transformer.transform(X_test) \ No newline at end of file From aaea0d9c066bf560565199a668e1321d130ee440 Mon Sep 17 00:00:00 2001 From: Bumjin Kim Date: Thu, 18 Sep 2025 17:46:56 +0900 Subject: [PATCH 6/7] fix: Correct CI build failures --- tests/test_datetime/test_datetime_ordinal.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_datetime/test_datetime_ordinal.py b/tests/test_datetime/test_datetime_ordinal.py index e127f7765..c4c4c67a5 100644 --- a/tests/test_datetime/test_datetime_ordinal.py +++ b/tests/test_datetime/test_datetime_ordinal.py @@ -35,7 +35,6 @@ def df_datetime_ordinal_na(): def test_datetime_ordinal_no_start_date(df_datetime_ordinal): transformer = DatetimeOrdinal(variables=["date_col_1", "date_col_2"]) X_transformed = transformer.fit_transform(df_datetime_ordinal) - expected_ordinal_1 = pd.Series( [d.toordinal() for d in df_datetime_ordinal["date_col_1"]], name="date_col_1_ordinal", From 920961daa73457c34452335bcd47c0cbb2cb5612 Mon Sep 17 00:00:00 2001 From: Bumjin Kim Date: Thu, 18 Sep 2025 
17:50:41 +0900 Subject: [PATCH 7/7] fix: Correct CI build failures --- tests/test_datetime/test_datetime_ordinal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datetime/test_datetime_ordinal.py b/tests/test_datetime/test_datetime_ordinal.py index c4c4c67a5..7663fd00d 100644 --- a/tests/test_datetime/test_datetime_ordinal.py +++ b/tests/test_datetime/test_datetime_ordinal.py @@ -221,4 +221,4 @@ def test_datetime_ordinal_non_datetime_variable_in_transform(df_datetime_ordinal X_test["date_col_1"] = ["a", "b", "c", "d", "e"] with pytest.raises(ValueError): - transformer.transform(X_test) \ No newline at end of file + transformer.transform(X_test)