Commit a971a6a

abhinavarora, pre-commit-ci[bot], and awaelchli authored and committed
Remove references to torchtext.legacy from PyTorch Lightning (#10724)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Adrian Wälchli <[email protected]>
1 parent 114ac41 commit a971a6a

7 files changed: +123 −45 lines changed

pytorch_lightning/utilities/apply_func.py

Lines changed: 11 additions & 5 deletions
@@ -23,9 +23,10 @@
 import numpy as np
 import torch

-from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_AVAILABLE
+from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY
+from pytorch_lightning.utilities.warnings import rank_zero_deprecation

-if _TORCHTEXT_AVAILABLE:
+if _TORCHTEXT_LEGACY:
     if _compare_version("torchtext", operator.ge, "0.9.0"):
         from torchtext.legacy.data import Batch
     else:

@@ -260,8 +261,13 @@ def move_data_to_device(batch: Any, device: Union[str, torch.device]) -> Any:

     def batch_to(data: Any) -> Any:
         # try to move torchtext data first
-        if _TORCHTEXT_AVAILABLE and isinstance(data, Batch):
-
+        if _TORCHTEXT_LEGACY and isinstance(data, Batch):
+            # TODO: also remove the torchtext dependency with Lightning 1.8
+            rank_zero_deprecation(
+                "The `torchtext.legacy.Batch` object is deprecated and Lightning will remove support for it in v1.8."
+                " We recommend you to migrate away from Batch by following the TorchText README:"
+                " https://github.com/pytorch/text#bc-breaking-legacy"
+            )
             # Shallow copy because each Batch has a reference to Dataset which contains all examples
             device_data = copy(data)
             for field, field_value in data.dataset.fields.items():

@@ -281,7 +287,7 @@ def batch_to(data: Any) -> Any:
         # user wrongly implemented the `TransferableDataType` and forgot to return `self`.
         return data

-    dtype = (TransferableDataType, Batch) if _TORCHTEXT_AVAILABLE else TransferableDataType
+    dtype = (TransferableDataType, Batch) if _TORCHTEXT_LEGACY else TransferableDataType
     return apply_to_collection(batch, dtype=dtype, function=batch_to)
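For reference, a minimal sketch of how the new deprecation path is hit from user code (assuming a torchtext release older than 0.11 is installed so that _TORCHTEXT_LEGACY is True; the helper used here is added later in this commit in tests/helpers/torchtext_utils.py):

import torch

from pytorch_lightning.utilities.apply_func import move_data_to_device
from tests.helpers.torchtext_utils import get_dummy_torchtext_data_iterator

data_iterator, _ = get_dummy_torchtext_data_iterator(num_samples=3, batch_size=3)
batch = next(iter(data_iterator))  # a legacy torchtext Batch
# batch_to() recognizes the legacy Batch, emits the rank-zero deprecation warning,
# and still shallow-copies the Batch and moves each field to the requested device.
moved = move_data_to_device(batch, torch.device("cpu"))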

pytorch_lightning/utilities/imports.py

Lines changed: 1 addition & 0 deletions
@@ -118,6 +118,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version:
 _RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2")
 _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"])
 _TORCHTEXT_AVAILABLE = _package_available("torchtext")
+_TORCHTEXT_LEGACY: bool = _TORCHTEXT_AVAILABLE and _compare_version("torchtext", operator.lt, "0.11.0")
 _TORCHVISION_AVAILABLE = _package_available("torchvision")
 _XLA_AVAILABLE: bool = _package_available("torch_xla")
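An illustrative check of what the new flag means (not part of the diff; the commented values assume a legacy torchtext release such as 0.10.x is installed):

from pytorch_lightning.utilities.imports import _TORCHTEXT_AVAILABLE, _TORCHTEXT_LEGACY

print(_TORCHTEXT_AVAILABLE)  # True whenever torchtext can be imported
print(_TORCHTEXT_LEGACY)     # True only for torchtext < 0.11.0, the releases Lightning still treats as legacy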

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test deprecated functionality which will be removed in v1.8.0."""
+import pytest
+import torch
+
+from pytorch_lightning.utilities.apply_func import move_data_to_device
+from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY
+from tests.helpers.torchtext_utils import get_dummy_torchtext_data_iterator
+
+
+@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.")
+def test_v1_8_0_deprecated_torchtext_batch():
+
+    with pytest.deprecated_call(match="is deprecated and Lightning will remove support for it in v1.8"):
+        data_iterator, _ = get_dummy_torchtext_data_iterator(num_samples=3, batch_size=3)
+        batch = next(iter(data_iterator))
+        _ = move_data_to_device(batch=batch, device=torch.device("cpu"))

tests/helpers/imports.py

Lines changed: 14 additions & 4 deletions
@@ -1,6 +1,16 @@
-from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8
+import operator

-if _TORCH_GREATER_EQUAL_1_8:
-    from torchtext.legacy.data import Batch, Dataset, Example, Field, Iterator, LabelField
+from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY
+
+if _TORCHTEXT_LEGACY:
+    if _compare_version("torchtext", operator.ge, "0.9.0"):
+        from torchtext.legacy.data import Batch, Dataset, Example, Field, Iterator, LabelField
+    else:
+        from torchtext.data import Batch, Dataset, Example, Field, Iterator, LabelField
 else:
-    from torchtext.data import Batch, Dataset, Example, Field, Iterator, LabelField  # noqa: F401
+    Batch = type(None)
+    Dataset = type(None)
+    Example = type(None)
+    Field = type(None)
+    Iterator = type(None)
+    LabelField = type(None)
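The type(None) fallbacks keep these names importable when no legacy torchtext is installed, so isinstance checks in the tests stay valid; a small illustration of the idea (illustrative only, not part of the diff):

Batch = type(None)  # placeholder used when torchtext.legacy is unavailable

# isinstance against NoneType is harmless: only None would ever match,
# so code guarded by `isinstance(data, Batch)` simply never triggers.
assert not isinstance({"text": "hello"}, Batch)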

tests/helpers/torchtext_utils.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+import string
+
+from tests.helpers.imports import Dataset, Example, Field, Iterator
+
+
+def _generate_random_string(length: int = 10):
+    return "".join(random.choices(string.ascii_letters, k=length))
+
+
+def get_dummy_torchtext_data_iterator(num_samples: int, batch_size: int, include_lengths: bool = False):
+    text_field = Field(
+        sequential=True,
+        pad_first=False,  # nosec
+        init_token="<s>",
+        eos_token="</s>",  # nosec
+        include_lengths=include_lengths,
+    )  # nosec
+
+    dataset = Dataset(
+        [
+            Example.fromdict({"text": _generate_random_string()}, {"text": ("text", text_field)})
+            for _ in range(num_samples)
+        ],
+        {"text": text_field},
+    )
+    text_field.build_vocab(dataset)
+
+    iterator = Iterator(
+        dataset,
+        batch_size=batch_size,
+        sort_key=None,
+        device=None,
+        batch_size_fn=None,
+        train=True,
+        repeat=False,
+        shuffle=None,
+        sort=None,
+        sort_within_batch=None,
+    )
+    return iterator, text_field
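Example usage of the new helper, mirroring how the updated tests call it (illustrative snippet, not part of the diff):

from tests.helpers.torchtext_utils import get_dummy_torchtext_data_iterator

data_iterator, text_field = get_dummy_torchtext_data_iterator(num_samples=3, batch_size=3, include_lengths=True)
batch = next(iter(data_iterator))  # one legacy Batch holding all three random-string examples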

tests/models/test_gpu.py

Lines changed: 6 additions & 2 deletions
@@ -26,7 +26,7 @@
 from pytorch_lightning.plugins.environments import TorchElasticEnvironment
 from pytorch_lightning.utilities import device_parser
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _compare_version
+from pytorch_lightning.utilities.imports import _compare_version, _TORCHTEXT_LEGACY
 from tests.helpers import BoringModel
 from tests.helpers.datamodules import ClassifDataModule
 from tests.helpers.imports import Batch, Dataset, Example, Field, LabelField

@@ -309,6 +309,9 @@ def to(self, *args, **kwargs):
     assert batch.a.type() == "torch.cuda.FloatTensor"

     # torchtext.data.Batch
+    if not _TORCHTEXT_LEGACY:
+        return
+
     samples = [
         {"text": "PyTorch Lightning is awesome!", "label": 0},
         {"text": "Please make it work with torchtext", "label": 1},

@@ -326,7 +329,8 @@ def to(self, *args, **kwargs):
     label_field.build_vocab(dataset)

     batch = Batch(data=examples, dataset=dataset)
-    batch = trainer.accelerator.batch_to_device(batch, torch.device("cuda:0"))
+    with pytest.deprecated_call(match="The `torchtext.legacy.Batch` object is deprecated"):
+        batch = trainer.accelerator.batch_to_device(batch, torch.device("cuda:0"))

     assert batch.text.type() == "torch.cuda.LongTensor"
     assert batch.label.type() == "torch.cuda.LongTensor"

tests/utilities/test_apply_func_torchtext.py

Lines changed: 8 additions & 34 deletions
@@ -15,49 +15,22 @@
 import torch

 from pytorch_lightning.utilities.apply_func import move_data_to_device
-from tests.helpers.imports import Dataset, Example, Field, Iterator
+from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY
 from tests.helpers.runif import RunIf
-
-
-def _get_torchtext_data_iterator(include_lengths=False):
-    text_field = Field(
-        sequential=True,
-        pad_first=False,  # nosec
-        init_token="<s>",
-        eos_token="</s>",  # nosec
-        include_lengths=include_lengths,
-    )  # nosec
-
-    example1 = Example.fromdict({"text": "a b c a c"}, {"text": ("text", text_field)})
-    example2 = Example.fromdict({"text": "b c a a"}, {"text": ("text", text_field)})
-    example3 = Example.fromdict({"text": "c b a"}, {"text": ("text", text_field)})
-
-    dataset = Dataset([example1, example2, example3], {"text": text_field})
-    text_field.build_vocab(dataset)
-
-    iterator = Iterator(
-        dataset,
-        batch_size=3,
-        sort_key=None,
-        device=None,
-        batch_size_fn=None,
-        train=True,
-        repeat=False,
-        shuffle=None,
-        sort=None,
-        sort_within_batch=None,
-    )
-    return iterator, text_field
+from tests.helpers.torchtext_utils import get_dummy_torchtext_data_iterator


 @pytest.mark.parametrize("include_lengths", [False, True])
 @pytest.mark.parametrize("device", [torch.device("cuda", 0)])
+@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.")
 @RunIf(min_gpus=1)
 def test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, device):
-    data_iterator, _ = _get_torchtext_data_iterator(include_lengths=include_lengths)
+    data_iterator, _ = get_dummy_torchtext_data_iterator(num_samples=3, batch_size=3, include_lengths=include_lengths)
     data_iter = iter(data_iterator)
     batch = next(data_iter)
-    batch_on_device = move_data_to_device(batch, device)
+
+    with pytest.deprecated_call(match="The `torchtext.legacy.Batch` object is deprecated"):
+        batch_on_device = move_data_to_device(batch, device)

     if include_lengths:
         # tensor with data

@@ -69,5 +42,6 @@ def test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, de


 @pytest.mark.parametrize("include_lengths", [False, True])
+@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.")
 def test_batch_move_data_to_device_torchtext_include_lengths_cpu(include_lengths):
     test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, torch.device("cpu"))
