Reduce the time cost of fetching tileable data (#2594)

hekaisheng · web-flow · commit 866a5faa0ce1 · 2021-12-09T15:58:31.000+08:00
diff --git a/mars/core/entity/executable.py b/mars/core/entity/executable.py
@@ -80,8 +80,7 @@ def register(self, tileable: TileableType, session: SessionType):
 def _get_session(executable: "_ExecutableMixin", session: SessionType = None):
     from ...deploy.oscar.session import get_default_session
 
-    if session is None and len(executable._executed_sessions) > 0:
-        session = executable._executed_sessions[-1]
+    # if session is not specified, use default session
     if session is None:
         session = get_default_session()
 
@@ -151,6 +150,9 @@ def _execute_and_fetch(self, session: SessionType = None, **kw):
 
         session = _get_session(self, session)
         fetch_kwargs = kw.pop("fetch_kwargs", dict())
+        if session in self._executed_sessions:
+            # already executed in this session, so fetch the result directly
+            return self.fetch(session=session, **fetch_kwargs)
         ret = self.execute(session=session, **kw)
         if isinstance(ret, ExecutionInfo):
             # wait=False
diff --git a/mars/dataframe/core.py b/mars/dataframe/core.py
@@ -61,6 +61,7 @@
     on_serialize_numpy_type,
     ceildiv,
     tokenize,
+    estimate_pandas_size,
 )
 from .utils import fetch_corner_data, ReprSeries, parse_index, merge_index_value
 from ..tensor import statistics
@@ -565,28 +566,52 @@ def to_pandas(self, session=None, **kw):
 class _BatchedFetcher:
     __slots__ = ()
 
-    def _iter(self, batch_size=1000, session=None, **kw):
+    def _iter(self, batch_size=None, session=None, **kw):
         from .indexing.iloc import iloc
 
-        size = self.shape[0]
-        n_batch = ceildiv(size, batch_size)
+        if batch_size is not None:
+            size = self.shape[0]
+            n_batch = ceildiv(size, batch_size)
 
-        if n_batch > 1:
-            for i in range(n_batch):
-                batch_data = iloc(self)[batch_size * i : batch_size * (i + 1)]
-                yield batch_data._fetch(session=session, **kw)
+            if n_batch > 1:
+                for i in range(n_batch):
+                    batch_data = iloc(self)[batch_size * i : batch_size * (i + 1)]
+                    yield batch_data._fetch(session=session, **kw)
+            else:
+                yield self._fetch(session=session, **kw)
         else:
-            yield self._fetch(session=session, **kw)
+            # batch_size not given: fetch a fixed-size first batch, measure
+            # it, and size the remaining batches to ~default_batch_bytes each
+            default_batch_bytes = 50 * 1024 ** 2
+            first_batch = 1000
+            size = self.shape[0]
+
+            if size >= first_batch:
+                batch_data = iloc(self)[:first_batch]
+                first_batch_data = batch_data._fetch(session=session, **kw)
+                yield first_batch_data
+                data_size = estimate_pandas_size(first_batch_data)
+                # never let the estimate round down to zero rows per batch
+                batch_size = max(int(default_batch_bytes / data_size * first_batch), 1)
+                # only the rows after the first batch are left to fetch
+                n_batch = ceildiv(size - first_batch, batch_size)
+                for i in range(n_batch):
+                    start = first_batch + batch_size * i
+                    batch_data = iloc(self)[start : start + batch_size]
+                    yield batch_data._fetch(session=session, **kw)
+            else:
+                # fewer rows than one probe batch: fetch everything at once
+                yield self._fetch(session=session, **kw)
 
-    def iterbatch(self, batch_size=1000, session=None, **kw):
+    def iterbatch(self, batch_size=None, session=None, **kw):
         # trigger execution
         self.execute(session=session, **kw)
         return self._iter(batch_size=batch_size, session=session)
 
     def fetch(self, session=None, **kw):
         from .indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem
 
-        batch_size = kw.pop("batch_size", 1000)
+        batch_size = kw.pop("batch_size", None)
         if isinstance(self.op, (DataFrameIlocGetItem, SeriesIlocGetItem)):
             # see GH#1871
             # already iloc, do not trigger batch fetch
diff --git a/mars/learn/tests/test_wrappers.py b/mars/learn/tests/test_wrappers.py
@@ -23,11 +23,9 @@
 from ..wrappers import ParallelPostFit
 
 
-raw_x, raw_y = make_classification(n_samples=1000)
-X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100)
-
-
 def test_parallel_post_fit_basic(setup):
+    raw_x, raw_y = make_classification(n_samples=1000)
+    X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100)
     clf = ParallelPostFit(GradientBoostingClassifier())
     clf.fit(X, y)
 
@@ -47,6 +45,8 @@ def test_parallel_post_fit_basic(setup):
 
 
 def test_parallel_post_fit_predict(setup):
+    raw_x, raw_y = make_classification(n_samples=1000)
+    X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100)
     base = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
     wrap = ParallelPostFit(LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs"))
 
@@ -67,6 +67,8 @@ def test_parallel_post_fit_predict(setup):
 
 
 def test_parallel_post_fit_transform(setup):
+    raw_x, raw_y = make_classification(n_samples=1000)
+    X, y = mt.tensor(raw_x, chunk_size=100), mt.tensor(raw_y, chunk_size=100)
     base = PCA(random_state=0)
     wrap = ParallelPostFit(PCA(random_state=0))
 
@@ -79,6 +81,8 @@ def test_parallel_post_fit_transform(setup):
 
 
 def test_parallel_post_fit_multiclass(setup):
+    # no shared 1000-sample fixture here: this case builds its own
+    # 3-class dataset right below
     raw_x, raw_y = make_classification(n_classes=3, n_informative=4)
     X, y = mt.tensor(raw_x, chunk_size=50), mt.tensor(raw_y, chunk_size=50)
 
diff --git a/mars/services/meta/api/web.py b/mars/services/meta/api/web.py
@@ -48,6 +48,16 @@ async def get_chunk_meta(self, session_id: str, data_key: str):
         result = await oscar_api.get_chunk_meta(data_key, fields=fields, error=error)
         self.write(serialize_serializable(result))
 
+    @web_api("", method="post")
+    async def get_chunks_meta(self, session_id: str):
+        # the posted body holds one (data_key, fields, error) triple per chunk;
+        # every lookup is handed to the meta API in a single batch call
+        requests = deserialize_serializable(self.request.body)
+        api = await self._get_oscar_meta_api(session_id)
+        delayed = [api.get_chunk_meta.delay(k, f, e) for k, f, e in requests]
+        metas = await api.get_chunk_meta.batch(*delayed)
+        self.write(serialize_serializable(metas))
+
 
 web_handlers = {MetaWebAPIHandler.get_root_pattern(): MetaWebAPIHandler}
 
@@ -67,3 +77,16 @@ async def get_chunk_meta(
             params["fields"] = ",".join(fields)
         res = await self._request_url("GET", req_addr, params=params)
         return deserialize_serializable(res.body)
+
+    @get_chunk_meta.batch
+    async def get_chunks_meta(self, args_list, kwargs_list):
+        # bind each delayed call to an [object_id, fields, error] triple and
+        # send all triples to the meta endpoint in one POST
+        triples = []
+        for pos, named in zip(args_list, kwargs_list):
+            object_id, fields, error = self.get_chunk_meta.bind(*pos, **named)
+            triples.append([object_id, fields, error])
+        url = f"{self._address}/api/session/{self._session_id}/meta"
+        payload = serialize_serializable(triples)
+        resp = await self._request_url("POST", url, data=payload)
+        return deserialize_serializable(resp.body)
diff --git a/mars/services/storage/api/web.py b/mars/services/storage/api/web.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from collections import defaultdict
 from typing import Any, List
 
 from .... import oscar as mo
@@ -55,6 +56,25 @@ async def get_data(self, session_id: str, data_key: str):
         result = await oscar_api.get(data_key)
         self.write(serialize_serializable(result))
 
+    @web_api("batch/get", method="post")
+    async def get_batch_data(self, session_id: str):
+        # group the posted (data_key, conditions, error) triples by the storage
+        # API returned for each key, fetch every group with one batch call,
+        # then write the data back in request order
+        requests = deserialize_serializable(self.request.body)
+        ordered = [None] * len(requests)
+        pending = defaultdict(list)  # storage API -> [(position, delayed get)]
+        for pos, (data_key, conditions, error) in enumerate(requests):
+            owner = await self._get_storage_api_by_object_id(session_id, data_key)
+            delayed = owner.get.delay(data_key, conditions=conditions, error=error)
+            pending[owner].append((pos, delayed))
+        for owner, entries in pending.items():
+            positions, calls = zip(*entries)
+            fetched = await owner.get.batch(*calls)
+            for pos, data in zip(positions, fetched):
+                ordered[pos] = data
+        self.write(serialize_serializable(ordered))
+
     @web_api("(?P<data_key>[^/]+)", method="post")
     async def get_data_by_post(self, session_id: str, data_key: str):
         body_args = (
@@ -110,6 +130,21 @@ async def get(
         )
         return deserialize_serializable(res.body)
 
+    @get.batch
+    async def get_batch(self, args_list, kwargs_list):
+        # bind each delayed get() to a [data_key, conditions, error] triple and
+        # fetch all of them with a single POST to the batch endpoint
+        triples = []
+        for pos, named in zip(args_list, kwargs_list):
+            data_key, conditions, error = self.get.bind(*pos, **named)
+            triples.append([data_key, conditions, error])
+
+        url = f"{self._address}/api/session/{self._session_id}/storage/batch/get"
+        resp = await self._request_url(
+            path=url, method="POST", data=serialize_serializable(triples)
+        )
+        return deserialize_serializable(resp.body)
+
     @mo.extensible
     async def put(
         self, data_key: str, obj: object, level: StorageLevel = StorageLevel.MEMORY
diff --git a/mars/utils.py b/mars/utils.py
@@ -1085,27 +1085,40 @@ def arrow_array_to_objects(
     return obj
 
 
+_enter_counter = 0
+_initial_session = None
+
+
 def enter_current_session(func: Callable):
     @functools.wraps(func)
     def wrapped(cls, ctx, op):
         from .deploy.oscar.session import AbstractSession, get_default_session
 
+        global _enter_counter, _initial_session
         # skip in some test cases
         if not hasattr(ctx, "get_current_session"):
             return func(cls, ctx, op)
 
-        session = ctx.get_current_session()
-        prev_default_session = get_default_session()
-        session.as_default()
+        with AbstractSession._lock:
+            if _enter_counter == 0:
+                # nested calls share one saved default: only the outermost
+                # call (counter == 0) records it and installs ctx's session
+                session = ctx.get_current_session()
+                _initial_session = get_default_session()
+                session.as_default()
+            _enter_counter += 1
 
         try:
             result = func(cls, ctx, op)
         finally:
-            if prev_default_session:
-                prev_default_session.as_default()
-            else:
-                AbstractSession.reset_default()
-
+            with AbstractSession._lock:
+                _enter_counter -= 1
+                if _enter_counter == 0:
+                    # outermost call is done: restore the saved default
+                    if _initial_session:
+                        _initial_session.as_default()
+                    else:
+                        AbstractSession.reset_default()
         return result
 
     return wrapped