     on_serialize_numpy_type,
     ceildiv,
     tokenize,
+    estimate_pandas_size,
 )
 from .utils import fetch_corner_data, ReprSeries, parse_index, merge_index_value
 from ..tensor import statistics
@@ -565,28 +566,52 @@ def to_pandas(self, session=None, **kw): |
 class _BatchedFetcher:
     __slots__ = ()
 
-    def _iter(self, batch_size=1000, session=None, **kw):
+    def _iter(self, batch_size=None, session=None, **kw):
         from .indexing.iloc import iloc
 
-        size = self.shape[0]
-        n_batch = ceildiv(size, batch_size)
+        if batch_size is not None:
+            size = self.shape[0]
+            n_batch = ceildiv(size, batch_size)
 
-        if n_batch > 1:
-            for i in range(n_batch):
-                batch_data = iloc(self)[batch_size * i : batch_size * (i + 1)]
-                yield batch_data._fetch(session=session, **kw)
+            if n_batch > 1:
+                for i in range(n_batch):
+                    batch_data = iloc(self)[batch_size * i : batch_size * (i + 1)]
+                    yield batch_data._fetch(session=session, **kw)
+            else:
+                yield self._fetch(session=session, **kw)
         else:
-            yield self._fetch(session=session, **kw)
+            # if batch_size is not specified, use the first batch to estimate
+            # batch_size.
+            default_batch_bytes = 50 * 1024 ** 2
+            first_batch = 1000
+            size = self.shape[0]
+
+            if size >= first_batch:
+                batch_data = iloc(self)[:first_batch]
+                first_batch_data = batch_data._fetch(session=session, **kw)
+                yield first_batch_data
+                data_size = estimate_pandas_size(first_batch_data)
+                batch_size = int(default_batch_bytes / data_size * first_batch)
+                n_batch = ceildiv(size - first_batch, batch_size)
+                for i in range(n_batch):
+                    batch_data = iloc(self)[
+                        first_batch
+                        + batch_size * i : first_batch
+                        + batch_size * (i + 1)
+                    ]
+                    yield batch_data._fetch(session=session, **kw)
+            else:
+                yield self._fetch(session=session, **kw)
 
-    def iterbatch(self, batch_size=1000, session=None, **kw):
+    def iterbatch(self, batch_size=None, session=None, **kw):
         # trigger execution
         self.execute(session=session, **kw)
         return self._iter(batch_size=batch_size, session=session)
 
     def fetch(self, session=None, **kw):
         from .indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem
 
-        batch_size = kw.pop("batch_size", 1000)
+        batch_size = kw.pop("batch_size", None)
         if isinstance(self.op, (DataFrameIlocGetItem, SeriesIlocGetItem)):
             # see GH#1871
             # already iloc, do not trigger batch fetch
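
For readers skimming the hunk above, the new else branch works as follows: when batch_size is not given, the first 1000 rows are fetched as a probe, their in-memory footprint is measured with estimate_pandas_size, and every later batch is sized so that each fetch moves roughly 50 MiB. Below is a minimal sketch of that sizing arithmetic, kept outside Mars for clarity; the helper name estimate_batch_size and the floor of one row are illustrative additions, not part of this diff.

    from math import ceil

    def estimate_batch_size(first_batch_bytes, first_batch_rows=1000,
                            target_bytes=50 * 1024 ** 2):
        # rows per later batch so that each fetch is roughly target_bytes,
        # assuming rows beyond the probe have a similar per-row footprint;
        # the floor of 1 is a defensive extra for pathologically wide rows
        return max(int(target_bytes / first_batch_bytes * first_batch_rows), 1)

    # e.g. a 1000-row probe occupying ~5 MiB yields 10000-row batches, so a
    # 1,000,000-row frame is fetched as 1 probe plus 100 follow-up batches
    rows_per_batch = estimate_batch_size(5 * 1024 ** 2)
    assert rows_per_batch == 10000
    assert ceil((1_000_000 - 1000) / rows_per_batch) == 100

With the change applied, calling iterbatch() with the default batch_size=None yields pandas chunks of roughly that target size, while passing an explicit batch_size keeps the previous fixed-size behaviour.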