
Commit dd31bce

Bump pyarrow to 8.0.0 (#5620)

* bump pyarrow for pandas 2.0
* bump to 8.0.0
* remove all the pyarrow 7 related checks
* update ci
* minor
* albert's comment

1 parent a0a35c5

File tree

9 files changed: +16 additions, −68 deletions

.github/conda/meta.yaml (2 additions, 2 deletions)

@@ -15,7 +15,7 @@ requirements:
     - python
     - pip
     - numpy >=1.17
-    - pyarrow >=6.0.0
+    - pyarrow >=8.0.0
     - python-xxhash
     - dill
     - pandas
@@ -32,7 +32,7 @@ requirements:
     - python
     - pip
     - numpy >=1.17
-    - pyarrow >=6.0.0
+    - pyarrow >=8.0.0
     - python-xxhash
     - dill
     - pandas

.github/workflows/benchmarks.yaml (3 additions, 3 deletions)

@@ -19,8 +19,8 @@ jobs:
           pip install setuptools wheel
           pip install -e .[benchmarks]
 
-          # pyarrow==6.0.0
-          pip install pyarrow==6.0.0
+          # pyarrow==8.0.0
+          pip install pyarrow==8.0.0
 
           dvc repro --force
 
@@ -29,7 +29,7 @@ jobs:
 
           python ./benchmarks/format.py report.json report.md
 
-          echo "<details>\n<summary>Show benchmarks</summary>\n\nPyArrow==6.0.0\n" > final_report.md
+          echo "<details>\n<summary>Show benchmarks</summary>\n\nPyArrow==8.0.0\n" > final_report.md
           cat report.md >> final_report.md
 
           # pyarrow

.github/workflows/ci.yml (1 addition, 1 deletion)

@@ -63,7 +63,7 @@ jobs:
         run: pip install --upgrade pyarrow huggingface-hub dill
       - name: Install depencencies (minimum versions)
         if: ${{ matrix.deps_versions != 'deps-latest' }}
-        run: pip install pyarrow==6.0.1 huggingface-hub==0.2.0 transformers dill==0.3.1.1
+        run: pip install pyarrow==8.0.0 huggingface-hub==0.2.0 transformers dill==0.3.1.1
       - name: Test with pytest
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/

setup.py (2 additions, 2 deletions)

@@ -110,8 +110,8 @@
     # We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)
     "numpy>=1.17",
     # Backend and serialization.
-    # Minimum 6.0.0 to support wrap_array which is needed for ArrayND features
-    "pyarrow>=6.0.0",
+    # Minimum 8.0.0 to be able to use .to_reader()
+    "pyarrow>=8.0.0",
     # For smart caching dataset processing
     "dill>=0.3.0,<0.3.7",  # tmp pin until next 0.3.7 release: see https://github.com/huggingface/datasets/pull/5166
     # For performance gains with apache arrow
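
For context on the new floor, here is a minimal sketch of the `pa.Table.to_reader()` call that motivates it. The table is a toy example; the method itself was added to `pa.Table` in pyarrow 8.0.0:

```python
import pyarrow as pa

# Toy table for illustration only.
table = pa.table({"id": [1, 2, 3, 4], "text": ["a", "b", "c", "d"]})

# to_reader() streams the table as a pyarrow.RecordBatchReader
# without copying the underlying data.
reader = table.to_reader(max_chunksize=2)
for batch in reader:
    print(batch.num_rows)  # 2, then 2
```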

src/datasets/__init__.py (2 additions, 2 deletions)

@@ -30,9 +30,9 @@
     "To use `datasets`, Python>=3.7 is required, and the current version of Python doesn't match this condition."
 )
 
-if version.parse(pyarrow.__version__).major < 6:
+if version.parse(pyarrow.__version__).major < 8:
     raise ImportWarning(
-        "To use `datasets`, the module `pyarrow>=6.0.0` is required, and the current version of `pyarrow` doesn't match this condition.\n"
+        "To use `datasets`, the module `pyarrow>=8.0.0` is required, and the current version of `pyarrow` doesn't match this condition.\n"
         "If you are running this in a Google Colab, you should probably just restart the runtime to use the right version of `pyarrow`."
     )
 
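
As a side note, the guard compares only the major version component, so pre-release builds behave as expected. A small sketch with hypothetical version strings:

```python
from packaging import version

# Only the major component matters for the import-time guard.
print(version.parse("8.0.0").major)         # 8 -> passes the check above
print(version.parse("7.0.0.dev123").major)  # 7 -> would raise the ImportWarning
```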

src/datasets/arrow_dataset.py (1 addition, 1 deletion)

@@ -2292,7 +2292,7 @@ def iter(self, batch_size: int, drop_last_batch: bool = False):
             drop_last_batch (:obj:`bool`, default `False`): Whether a last batch smaller than the batch_size should be
                 dropped
         """
-        if self._indices is None and config.PYARROW_VERSION.major >= 8:
+        if self._indices is None:
             # Fast iteration
             # Benchmark: https://gist.github.com/mariosasko/0248288a2e3a7556873969717c1fe52b (fast_iter_batch)
             format_kwargs = self._format_kwargs if self._format_kwargs is not None else {}
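
With pyarrow 8 as the minimum, the fast path no longer needs a version gate; only the presence of an indices mapping matters. A rough sketch of what batch-wise streaming looks like at the pyarrow level (toy table, not the actual `Dataset.iter()` body):

```python
import pyarrow as pa

table = pa.table({"x": list(range(10))})

# Stream record batches straight off the table instead of slicing
# row ranges -- this is what makes the no-indices path fast.
for batch in table.to_reader(max_chunksize=4):
    print(batch.num_rows)  # 4, 4, 2 (the trailing 2 is the smaller last batch)
```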

src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py (2 additions, 15 deletions)

@@ -16,19 +16,6 @@
 logger = datasets.utils.logging.get_logger(__name__)
 
 
-if datasets.config.PYARROW_VERSION.major >= 7:
-
-    def pa_table_to_pylist(table):
-        return table.to_pylist()
-
-else:
-
-    def pa_table_to_pylist(table):
-        keys = table.column_names
-        values = table.to_pydict().values()
-        return [{k: v for k, v in zip(keys, row_values)} for row_values in zip(*values)]
-
-
 def count_path_segments(path):
     return path.replace("\\", "/").count("/")
 
@@ -310,7 +297,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, add_labels):
                 metadata_dict = {
                     os.path.normpath(file_name).replace("\\", "/"): sample_metadata
                     for file_name, sample_metadata in zip(
-                        pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table)
+                        pa_file_name_array.to_pylist(), pa_metadata_table.to_pylist()
                     )
                 }
             else:
@@ -376,7 +363,7 @@ def _generate_examples(self, files, metadata_files, split_name, add_metadata, add_labels):
                 metadata_dict = {
                     os.path.normpath(file_name).replace("\\", "/"): sample_metadata
                     for file_name, sample_metadata in zip(
-                        pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table)
+                        pa_file_name_array.to_pylist(), pa_metadata_table.to_pylist()
                     )
                 }
             else:
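
The removed shim existed only because `pa.Table.to_pylist()` arrived in pyarrow 7. A quick sketch of the call that now replaces it (toy metadata table):

```python
import pyarrow as pa

metadata = pa.table({"file_name": ["a.png", "b.png"], "label": [0, 1]})

# to_pylist() returns one dict per row -- exactly what the removed
# pa_table_to_pylist backport emulated via to_pydict() and zip().
print(metadata.to_pylist())
# [{'file_name': 'a.png', 'label': 0}, {'file_name': 'b.png', 'label': 1}]
```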

src/datasets/packaged_modules/json/json.py (1 addition, 20 deletions)

@@ -15,25 +15,6 @@
 logger = datasets.utils.logging.get_logger(__name__)
 
 
-if datasets.config.PYARROW_VERSION.major >= 7:
-
-    def pa_table_from_pylist(mapping):
-        return pa.Table.from_pylist(mapping)
-
-else:
-
-    def pa_table_from_pylist(mapping):
-        # Copied from: https://github.com/apache/arrow/blob/master/python/pyarrow/table.pxi#L5193
-        arrays = []
-        names = []
-        if mapping:
-            names = list(mapping[0].keys())
-        for n in names:
-            v = [row[n] if n in row else None for row in mapping]
-            arrays.append(v)
-        return pa.Table.from_arrays(arrays, names)
-
-
 @dataclass
 class JsonConfig(datasets.BuilderConfig):
     """BuilderConfig for JSON."""
@@ -156,7 +137,7 @@ def _generate_tables(self, files):
                 # If possible, parse the file as a list of json objects and exit the loop
                 if isinstance(dataset, list):  # list is the only sequence type supported in JSON
                     try:
-                        pa_table = pa_table_from_pylist(dataset)
+                        pa_table = pa.Table.from_pylist(dataset)
                     except (pa.ArrowInvalid, AttributeError) as e:
                         logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
                         raise ValueError(f"Not able to read records in the JSON file at {file}.") from None
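
Similarly, `pa.Table.from_pylist()` (pyarrow>=7) replaces the copied-in fallback. A minimal sketch, including the null-filling behavior the fallback reproduced via `from_arrays`:

```python
import pyarrow as pa

rows = [{"a": 1, "b": "x"}, {"a": 2}]  # second record lacks "b"

# Missing keys become nulls, matching the removed backport.
table = pa.Table.from_pylist(rows)
print(table.column("b").to_pylist())  # ['x', None]
```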

src/datasets/table.py (2 additions, 22 deletions)

@@ -148,20 +148,6 @@ def fast_slice(self, offset=0, length=None) -> pa.Table:
         return pa.Table.from_batches(batches, schema=self._schema)
 
 
-class _RecordBatchReader:
-    def __init__(self, table: "Table", max_chunksize: Optional[int] = None):
-        self.table = table
-        self.max_chunksize = max_chunksize
-
-    def __iter__(self):
-        for batch in self.table._batches:
-            if self.max_chunksize is None or len(batch) <= self.max_chunksize:
-                yield batch
-            else:
-                for offset in range(0, len(batch), self.max_chunksize):
-                    yield batch.slice(offset, self.max_chunksize)
-
-
 class Table(IndexedTableMixin):
     """
     Wraps a pyarrow Table by using composition.
@@ -359,10 +345,8 @@ def to_reader(self, max_chunksize: Optional[int] = None):
             on the chunk layout of individual columns.
 
         Returns:
-            `pyarrow.RecordBatchReader` if pyarrow>=8.0.0, otherwise a `pyarrow.RecordBatch` iterable
+            `pyarrow.RecordBatchReader`
         """
-        if config.PYARROW_VERSION.major < 8:
-            return _RecordBatchReader(self, max_chunksize=max_chunksize)
         return self.table.to_reader(max_chunksize=max_chunksize)
 
     def field(self, *args, **kwargs):
@@ -816,11 +800,7 @@ def from_pylist(cls, mapping, *args, **kwargs):
         Returns:
             `datasets.table.Table`
         """
-        try:
-            return cls(pa.Table.from_pylist(mapping, *args, **kwargs))
-        except AttributeError:  # pyarrow <7 does not have from_pylist, so we convert and use from_pydict
-            mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]} if mapping else {}
-            return cls(pa.Table.from_pydict(mapping, *args, **kwargs))
+        return cls(pa.Table.from_pylist(mapping, *args, **kwargs))
 
     @classmethod
     def from_batches(cls, *args, **kwargs):
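
With pyarrow 8 as the floor, the hand-rolled `_RecordBatchReader` is redundant: `pa.Table.to_reader()` handles `max_chunksize` natively and returns a real `pyarrow.RecordBatchReader`. A short sketch (toy table):

```python
import pyarrow as pa

table = pa.table({"v": list(range(5))})

# Native chunking: batches of at most 2 rows, last one smaller.
reader = table.to_reader(max_chunksize=2)
print([b.num_rows for b in reader])  # [2, 2, 1]
print(type(reader))                  # <class 'pyarrow.lib.RecordBatchReader'>
```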
