Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 2 additions & 11 deletions docs/task_on_kart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ As an alternative, the `load` method loads individual task input by passing an i


We can also omit the :func:`~gokart.task.TaskOnKart.requires` and write the task used by :func:`~gokart.parameter.TaskInstanceParameter`.
Extensions include :func:`~gokart.task.TaskOnKart.load_data_frame` and :func:`~gokart.task.TaskOnKart.load_generator`. Please refer to :func:`~gokart.task.TaskOnKart.load`, :doc:`task_parameters`, and the Advanced Features section described later.
Also, please refer to :func:`~gokart.task.TaskOnKart.load`, :doc:`task_parameters`, and the Advanced Features section described later.


TaskOnKart.dump
Expand Down Expand Up @@ -242,15 +242,6 @@ It is dumped and zipped with ``gensim.model.Word2Vec.save``.
Please refer to :func:`~gokart.task.TaskOnKart.make_model_target`.


TaskOnKart.load_data_frame
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Please refer to :doc:`for_pandas`.

.. warning::
This function is deprecated. Please use :func:`~gokart.task.TaskOnKart.load` instead.


TaskOnKart.fail_on_empty_dump
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -308,4 +299,4 @@ Please note that :class:`~InMemoryTarget` is an experimental feature.
redis_timeout=self.redis_timeout,
raise_task_lock_exception_on_collision=False,
)
return make_in_memory_target('dummy_path', task_lock_params, unique_id)
return make_in_memory_target('dummy_path', task_lock_params, unique_id)
32 changes: 0 additions & 32 deletions gokart/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,11 @@
import inspect
import os
import random
import sys
import types
from importlib import import_module
from logging import getLogger
from typing import Any, Callable, Dict, Generator, Generic, Iterable, List, Optional, Set, TypeVar, Union, overload

if sys.version_info < (3, 13):
from typing_extensions import deprecated
else:
from warnings import deprecated

import luigi
import pandas as pd
from luigi.parameter import ParameterVisibility
Expand Down Expand Up @@ -330,32 +324,6 @@ def _load(targets):

return _load(self._get_input_targets(target))

@deprecated("""This function is deprecated. use `load` instead.
If you want to specify `required_columns` and `drop_columns`, please extract the columns after loading. ex: `load()[['colA', 'colB']]`
""")
def load_data_frame(
    self, target: Union[None, str, TargetOnKart] = None, required_columns: Optional[Set[str]] = None, drop_columns: bool = False
) -> pd.DataFrame:
    """Load the task input via ``self.load`` and return it as one ``pandas.DataFrame``.

    Args:
        target: Forwarded unchanged to ``self.load(target=...)``.
        required_columns: Column names the loaded frame must contain.
            ``None`` (or an empty collection) disables the check.
        drop_columns: When true, only the required columns are kept, in the
            order they appear in the loaded frame.

    Returns:
        The loaded frame. If the frame has no rows and lacks some required
        columns, a new empty frame with exactly the required columns (sorted
        by name) is returned instead.

    Raises:
        AssertionError: If the frame has rows but misses a required column.
    """

    def _flatten_recursively(dfs):
        # Collapse arbitrarily nested lists of frames into a single frame.
        if not isinstance(dfs, list):
            return dfs
        if not dfs:
            # pd.concat raises ValueError on an empty sequence; treat "nothing
            # loaded" as an empty frame so the checks below still apply.
            return pd.DataFrame()
        return pd.concat([_flatten_recursively(df) for df in dfs])

    dfs = self.load(target=target)
    # A dict with a single entry is unwrapped to that entry's value.
    if isinstance(dfs, dict) and len(dfs) == 1:
        dfs = next(iter(dfs.values()))

    data = _flatten_recursively(dfs)

    # Normalise to a set so list/tuple arguments work with the set arithmetic.
    required_columns = set(required_columns) if required_columns else set()
    missing_columns = required_columns - set(data.columns)

    # A frame without rows is always ``empty``; checking the index is enough.
    if len(data.index) == 0 and missing_columns:
        # Sort so the column order does not depend on set iteration order.
        return pd.DataFrame(columns=sorted(required_columns))

    if missing_columns:
        # Explicit raise instead of ``assert`` so the check survives ``python -O``;
        # the exception type and message are kept for existing callers.
        raise AssertionError(f'data must have columns {required_columns}, but actually have only {data.columns}.')

    if drop_columns:
        # A boolean mask keeps the frame's own column order (deterministic),
        # unlike indexing with ``list(set)``.
        data = data.loc[:, data.columns.isin(required_columns)]
    return data

@overload
def dump(self, obj: T, target: None = None, custom_labels: dict[Any, Any] | None = None) -> None: ...

Expand Down
6 changes: 2 additions & 4 deletions gokart/testing/check_if_run_with_empty_data_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from typing import List, Optional

import luigi
import pandas as pd
from luigi.cmdline_parser import CmdlineParser

import gokart
Expand Down Expand Up @@ -70,9 +69,8 @@ def _test_run_with_empty_data_frame(cmdline_args: List[str], test_run_params: te
if test_run_params.namespace is not None:
all_tasks = [t for t in all_tasks if t.task_namespace == test_run_params.namespace]

with patch('gokart.TaskOnKart.load_data_frame', new=lambda *args, required_columns=None, **kwargs: pd.DataFrame(columns=list(required_columns))):
with patch('gokart.TaskOnKart.dump', new=lambda *args, **kwargs: None):
test_status_list = [_run_with_test_status(t) for t in all_tasks]
with patch('gokart.TaskOnKart.dump', new=lambda *args, **kwargs: None):
test_status_list = [_run_with_test_status(t) for t in all_tasks]

test_logger.info('gokart test results:\n' + '\n'.join(s.format() for s in test_status_list))
if any(s.fail() for s in test_status_list):
Expand Down
46 changes: 0 additions & 46 deletions test/test_task_on_kart.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,52 +443,6 @@ class DummyTaskAddConfiguration(gokart.TaskOnKart):
mock_cmdline.return_value = luigi.cmdline_parser.CmdlineParser(['DummyTaskAddConfiguration', '--DummyTaskAddConfiguration-aa', '2'])
self.assertEqual(DummyTaskAddConfiguration().aa, 2)

def test_load_list_of_list_pandas(self):
    # A nested list holding three one-row frames must load as one 3-row frame.
    nested_frames = [
        pd.DataFrame({'a': [1]}),
        [pd.DataFrame({'a': [2]}), pd.DataFrame({'a': [3]})],
    ]
    task = _DummyTask()
    task.load = Mock(return_value=nested_frames)  # type: ignore

    loaded = task.load_data_frame()

    self.assertIsInstance(loaded, pd.DataFrame)
    self.assertEqual(3, len(loaded.index))

def test_load_single_value_dict_of_dataframe(self):
    # A dict with exactly one DataFrame value must load as a one-row frame.
    single_entry = {'a': pd.DataFrame({'a': [1]})}
    task = _DummyTask()
    task.load = Mock(return_value=single_entry)  # type: ignore

    loaded = task.load_data_frame()

    self.assertIsInstance(loaded, pd.DataFrame)
    self.assertEqual(1, len(loaded.index))

def test_load_data_frame_drop_columns(self):
    # With drop_columns=True only the required columns 'a' and 'c' remain.
    wanted = {'a', 'c'}
    source_frame = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
    task = _DummyTask()
    task.load = Mock(return_value=source_frame)  # type: ignore

    loaded = task.load_data_frame(required_columns=wanted, drop_columns=True)

    self.assertIsInstance(loaded, pd.DataFrame)
    self.assertEqual(1, len(loaded.index))
    self.assertSetEqual({'a', 'c'}, set(loaded.columns))

def test_load_data_frame_empty_input(self):
    # A zero-row frame that already has the required columns keeps all of
    # its columns, including the non-required 'b'.
    zero_row_frame = pd.DataFrame({'a': [], 'b': [], 'c': []})
    task = _DummyTask()
    task.load = Mock(return_value=zero_row_frame)  # type: ignore

    loaded = task.load_data_frame(required_columns={'a', 'c'})

    self.assertIsInstance(loaded, pd.DataFrame)
    self.assertEqual(0, len(loaded.index))
    self.assertSetEqual({'a', 'b', 'c'}, set(loaded.columns))

def test_load_index_only_dataframe(self):
    index_only_frame = pd.DataFrame(index=range(3))
    task = _DummyTask()
    task.load = Mock(return_value=index_only_frame)  # type: ignore

    # Cannot load an index-only frame when required_columns is given.
    with self.assertRaises(AssertionError):
        task.load_data_frame(required_columns={'a', 'c'})

    # Without required_columns the frame is returned with its index intact.
    loaded: pd.DataFrame = task.load_data_frame()
    self.assertIsInstance(loaded, pd.DataFrame)
    self.assertTrue(loaded.empty)
    self.assertListEqual(list(range(3)), list(loaded.index))

def test_use_rerun_with_inherits(self):
# All tasks are completed.
task_c = _DummyTaskC()
Expand Down
104 changes: 0 additions & 104 deletions test/testing/test_run_with_empty_data_frame.py

This file was deleted.