diff --git a/docs/task_on_kart.rst b/docs/task_on_kart.rst index 0941996f..e2300443 100644 --- a/docs/task_on_kart.rst +++ b/docs/task_on_kart.rst @@ -157,7 +157,7 @@ As an alternative, the `load` method loads individual task input by passing an i We can also omit the :func:`~gokart.task.TaskOnKart.requires` and write the task used by :func:`~gokart.parameter.TaskInstanceParameter`. -Extensions include :func:`~gokart.task.TaskOnKart.load_data_frame` and :func:`~gokart.task.TaskOnKart.load_generator`. Please refer to :func:`~gokart.task.TaskOnKart.load`, :doc:`task_parameters`, and described later Advanced Features section. +Please also refer to :func:`~gokart.task.TaskOnKart.load`, :doc:`task_parameters`, and the Advanced Features section described later. TaskOnKart.dump @@ -242,15 +242,6 @@ It is dumped and zipped with ``gensim.model.Word2Vec.save``. Please refer to :func:`~gokart.task.TaskOnKart.make_model_target`. -TaskOnKart.load_data_frame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Please refer to :doc:`for_pandas`. - -.. warning:: - This function is deprecated. Please use :func:`~gokart.task.TaskOnKart.load` instead. - - TaskOnKart.fail_on_empty_dump ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -308,4 +299,4 @@ Please note that :class:`~InMemoryTarget` is an experimental feature. 
redis_timeout=self.redis_timeout, raise_task_lock_exception_on_collision=False, ) - return make_in_memory_target('dummy_path', task_lock_params, unique_id) \ No newline at end of file + return make_in_memory_target('dummy_path', task_lock_params, unique_id) diff --git a/gokart/task.py b/gokart/task.py index 04238eae..2271f583 100644 --- a/gokart/task.py +++ b/gokart/task.py @@ -5,17 +5,11 @@ import inspect import os import random -import sys import types from importlib import import_module from logging import getLogger from typing import Any, Callable, Dict, Generator, Generic, Iterable, List, Optional, Set, TypeVar, Union, overload -if sys.version_info < (3, 13): - from typing_extensions import deprecated -else: - from warnings import deprecated - import luigi import pandas as pd from luigi.parameter import ParameterVisibility @@ -330,32 +324,6 @@ def _load(targets): return _load(self._get_input_targets(target)) - @deprecated("""This function is deprecated. use `load` instead. -If you want to specify `required_columns` and `drop_columns`, please extract the columns after loading. ex: `load()[['colA', 'colB']]` -""") - def load_data_frame( - self, target: Union[None, str, TargetOnKart] = None, required_columns: Optional[Set[str]] = None, drop_columns: bool = False - ) -> pd.DataFrame: - def _flatten_recursively(dfs): - if isinstance(dfs, list): - return pd.concat([_flatten_recursively(df) for df in dfs]) - else: - return dfs - - dfs = self.load(target=target) - if isinstance(dfs, dict) and len(dfs) == 1: - dfs = list(dfs.values())[0] - - data = _flatten_recursively(dfs) - - required_columns = required_columns or set() - if data.empty and len(data.index) == 0 and len(required_columns - set(data.columns)) > 0: - return pd.DataFrame(columns=list(required_columns)) - assert required_columns.issubset(set(data.columns)), f'data must have columns {required_columns}, but actually have only {data.columns}.' 
- if drop_columns: - data = data[list(required_columns)] - return data - @overload def dump(self, obj: T, target: None = None, custom_labels: dict[Any, Any] | None = None) -> None: ... diff --git a/gokart/testing/check_if_run_with_empty_data_frame.py b/gokart/testing/check_if_run_with_empty_data_frame.py index 9e928e86..fac31344 100644 --- a/gokart/testing/check_if_run_with_empty_data_frame.py +++ b/gokart/testing/check_if_run_with_empty_data_frame.py @@ -3,7 +3,6 @@ from typing import List, Optional import luigi -import pandas as pd from luigi.cmdline_parser import CmdlineParser import gokart @@ -70,9 +69,8 @@ def _test_run_with_empty_data_frame(cmdline_args: List[str], test_run_params: te if test_run_params.namespace is not None: all_tasks = [t for t in all_tasks if t.task_namespace == test_run_params.namespace] - with patch('gokart.TaskOnKart.load_data_frame', new=lambda *args, required_columns=None, **kwargs: pd.DataFrame(columns=list(required_columns))): - with patch('gokart.TaskOnKart.dump', new=lambda *args, **kwargs: None): - test_status_list = [_run_with_test_status(t) for t in all_tasks] + with patch('gokart.TaskOnKart.dump', new=lambda *args, **kwargs: None): + test_status_list = [_run_with_test_status(t) for t in all_tasks] test_logger.info('gokart test results:\n' + '\n'.join(s.format() for s in test_status_list)) if any(s.fail() for s in test_status_list): diff --git a/test/test_task_on_kart.py b/test/test_task_on_kart.py index 8d590ead..6e2ccff6 100644 --- a/test/test_task_on_kart.py +++ b/test/test_task_on_kart.py @@ -443,52 +443,6 @@ class DummyTaskAddConfiguration(gokart.TaskOnKart): mock_cmdline.return_value = luigi.cmdline_parser.CmdlineParser(['DummyTaskAddConfiguration', '--DummyTaskAddConfiguration-aa', '2']) self.assertEqual(DummyTaskAddConfiguration().aa, 2) - def test_load_list_of_list_pandas(self): - task = _DummyTask() - task.load = Mock(return_value=[pd.DataFrame(dict(a=[1])), [pd.DataFrame(dict(a=[2])), pd.DataFrame(dict(a=[3]))]]) # 
type: ignore - - df = task.load_data_frame() - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(3, df.shape[0]) - - def test_load_single_value_dict_of_dataframe(self): - task = _DummyTask() - task.load = Mock(return_value={'a': pd.DataFrame(dict(a=[1]))}) # type: ignore - - df = task.load_data_frame() - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(1, df.shape[0]) - - def test_load_data_frame_drop_columns(self): - task = _DummyTask() - task.load = Mock(return_value=pd.DataFrame(dict(a=[1], b=[2], c=[3]))) # type: ignore - - df = task.load_data_frame(required_columns={'a', 'c'}, drop_columns=True) - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(1, df.shape[0]) - self.assertSetEqual({'a', 'c'}, set(df.columns)) - - def test_load_data_frame_empty_input(self): - task = _DummyTask() - task.load = Mock(return_value=pd.DataFrame(dict(a=[], b=[], c=[]))) # type: ignore - - df = task.load_data_frame(required_columns={'a', 'c'}) - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(0, df.shape[0]) - self.assertSetEqual({'a', 'b', 'c'}, set(df.columns)) - - def test_load_index_only_dataframe(self): - task = _DummyTask() - task.load = Mock(return_value=pd.DataFrame(index=range(3))) # type: ignore - - # connnot load index only frame with required_columns - self.assertRaises(AssertionError, lambda: task.load_data_frame(required_columns={'a', 'c'})) - - df: pd.DataFrame = task.load_data_frame() - self.assertIsInstance(df, pd.DataFrame) - self.assertTrue(df.empty) - self.assertListEqual(list(range(3)), list(df.index)) - def test_use_rerun_with_inherits(self): # All tasks are completed. 
task_c = _DummyTaskC() diff --git a/test/testing/test_run_with_empty_data_frame.py b/test/testing/test_run_with_empty_data_frame.py deleted file mode 100644 index 5c07315b..00000000 --- a/test/testing/test_run_with_empty_data_frame.py +++ /dev/null @@ -1,104 +0,0 @@ -import logging -import unittest -from unittest.mock import patch - -import luigi -import pandas as pd - -import gokart - - -class DummyModel: - def apply(self, x): - return x + 1 - - def get(self): - return 2 - - -class DummyModelTask(gokart.TaskOnKart): - task_namespace = f'{__name__}.dummy' - rerun = True - - def run(self): - self.dump(DummyModel()) - - -class DummyPandasDataFrameTask(gokart.TaskOnKart): - task_namespace = __name__ - param = luigi.Parameter() - rerun = True - - def run(self): - df = pd.DataFrame(dict(x=[1, 3, 4])) - self.dump(df) - - -class DummyWorkFlowWithError(gokart.TaskOnKart): - task_namespace = __name__ - rerun = True - - complete_check_at_run = False - - def requires(self): - return dict(model=DummyModelTask(), data_a=DummyPandasDataFrameTask(param='a')) - - def run(self): - model: DummyModel = self.load('model') - data = self.load_data_frame('data_a') - data['applied'] = data['x'].apply(model.apply) - data['y'] = data['applied'].apply(model.apply) - self.dump(data) - - -class DummyWorkFlowWithoutError(gokart.TaskOnKart): - task_namespace = __name__ - rerun = True - - def requires(self): - return dict(model=DummyModelTask(), data_a=DummyPandasDataFrameTask(param='a')) - - def run(self): - model: DummyModel = self.load('model') - data = self.load_data_frame('data_a', required_columns={'x'}) - data['y'] = data['x'].apply(model.apply) - self.dump(data) - - -class TestTestFrameworkForPandasDataFrame(unittest.TestCase): - def test_run_without_error(self): - argv = [f'{__name__}.DummyWorkFlowWithoutError', '--local-scheduler', '--test-run-pandas', '--log-level=CRITICAL', '--no-lock'] - logger = logging.getLogger('gokart.testing.check_if_run_with_empty_data_frame') - with 
patch.object(logger, 'info') as mock_debug: - with self.assertRaises(SystemExit) as exit_code: - gokart.run(argv) - log_str = mock_debug.call_args[0][0] - self.assertEqual(exit_code.exception.code, 0) - self.assertTrue('DummyModelTask' in log_str) - - def test_run_with_error(self): - argv = [f'{__name__}.DummyWorkFlowWithError', '--local-scheduler', '--test-run-pandas', '--log-level=CRITICAL', '--no-lock'] - logger = logging.getLogger('gokart.testing.check_if_run_with_empty_data_frame') - with patch.object(logger, 'info') as mock_debug: - with self.assertRaises(SystemExit) as exit_code: - gokart.run(argv) - log_str = mock_debug.call_args[0][0] - self.assertEqual(exit_code.exception.code, 1) - self.assertTrue('DummyModelTask' in log_str) - - def test_run_with_namespace(self): - argv = [ - f'{__name__}.DummyWorkFlowWithoutError', - '--local-scheduler', - '--test-run-pandas', - f'--test-run-namespace={__name__}', - '--log-level=CRITICAL', - '--no-lock', - ] - logger = logging.getLogger('gokart.testing.check_if_run_with_empty_data_frame') - with patch.object(logger, 'info') as mock_debug: - with self.assertRaises(SystemExit) as exit_code: - gokart.run(argv) - log_str = mock_debug.call_args[0][0] - self.assertEqual(exit_code.exception.code, 0) - self.assertTrue('DummyModelTask' not in log_str)