From 8947d135d6dd87f170648f9ec90f56b4826b5b11 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 3 Apr 2024 23:53:46 +0200 Subject: [PATCH 001/179] Skip test with compile error on torch=2.2.2 on Windows (#19734) --- tests/tests_pytorch/utilities/test_compile.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index c363bbc94cf8a..42daba6e05a36 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -16,6 +16,7 @@ import pytest import torch +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.utilities.compile import from_compiled, to_uncompiled @@ -114,6 +115,9 @@ def has_dynamo(fn): # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") +@pytest.mark.xfail( + sys.platform == "win32" and _TORCH_GREATER_EQUAL_2_2, strict=False, reason="RuntimeError: Failed to import" +) @RunIf(dynamo=True) def test_trainer_compiled_model_that_logs(tmp_path): class MyModel(BoringModel): @@ -140,6 +144,9 @@ def training_step(self, batch, batch_idx): # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") +@pytest.mark.xfail( + sys.platform == "win32" and _TORCH_GREATER_EQUAL_2_2, strict=False, reason="RuntimeError: Failed to import" +) @RunIf(dynamo=True) def test_trainer_compiled_model_test(tmp_path): model = BoringModel() From ce88483c6f8789d47e00f3d0b8f21f1e0332ddf2 Mon Sep 17 00:00:00 2001 From: Alexander Jipa Date: Wed, 3 Apr 2024 18:16:14 -0400 Subject: [PATCH 002/179] Add synchronous parameter to MLflowLogger (#19639) Co-authored-by: Alexander Jipa --- src/lightning/pytorch/loggers/mlflow.py | 12 +++-- tests/tests_pytorch/loggers/test_mlflow.py | 59 +++++++++++++++++++++- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/src/lightning/pytorch/loggers/mlflow.py b/src/lightning/pytorch/loggers/mlflow.py index cf874b6d690bf..4437d5f8a7c76 100644 --- a/src/lightning/pytorch/loggers/mlflow.py +++ b/src/lightning/pytorch/loggers/mlflow.py @@ -42,6 +42,7 @@ log = logging.getLogger(__name__) LOCAL_FILE_URI_PREFIX = "file:" _MLFLOW_AVAILABLE = RequirementCache("mlflow>=1.0.0", "mlflow") +_MLFLOW_SYNCHRONOUS_AVAILABLE = RequirementCache("mlflow>=2.8.0", "mlflow") class MLFlowLogger(Logger): @@ -100,6 +101,8 @@ def any_lightning_module_function_or_hook(self): artifact_location: The location to store run artifacts. If not provided, the server picks an appropriate default. run_id: The run identifier of the experiment. If not provided, a new run is started. + synchronous: Hints mlflow whether to block the execution for every logging call until complete where + applicable. 
Requires mlflow >= 2.8.0 Raises: ModuleNotFoundError: @@ -120,9 +123,12 @@ def __init__( prefix: str = "", artifact_location: Optional[str] = None, run_id: Optional[str] = None, + synchronous: Optional[bool] = None, ): if not _MLFLOW_AVAILABLE: raise ModuleNotFoundError(str(_MLFLOW_AVAILABLE)) + if synchronous is not None and not _MLFLOW_SYNCHRONOUS_AVAILABLE: + raise ModuleNotFoundError("`synchronous` requires mlflow>=2.8.0") super().__init__() if not tracking_uri: tracking_uri = f"{LOCAL_FILE_URI_PREFIX}{save_dir}" @@ -138,7 +144,7 @@ def __init__( self._checkpoint_callback: Optional[ModelCheckpoint] = None self._prefix = prefix self._artifact_location = artifact_location - + self._log_batch_kwargs = {} if synchronous is None else {"synchronous": synchronous} self._initialized = False from mlflow.tracking import MlflowClient @@ -233,7 +239,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: # Log in chunks of 100 parameters (the maximum allowed by MLflow). for idx in range(0, len(params_list), 100): - self.experiment.log_batch(run_id=self.run_id, params=params_list[idx : idx + 100]) + self.experiment.log_batch(run_id=self.run_id, params=params_list[idx : idx + 100], **self._log_batch_kwargs) @override @rank_zero_only @@ -261,7 +267,7 @@ def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) k = new_k metrics_list.append(Metric(key=k, value=v, timestamp=timestamp_ms, step=step or 0)) - self.experiment.log_batch(run_id=self.run_id, metrics=metrics_list) + self.experiment.log_batch(run_id=self.run_id, metrics=metrics_list, **self._log_batch_kwargs) @override @rank_zero_only diff --git a/tests/tests_pytorch/loggers/test_mlflow.py b/tests/tests_pytorch/loggers/test_mlflow.py index 372491a34c249..f02cfdb90901a 100644 --- a/tests/tests_pytorch/loggers/test_mlflow.py +++ b/tests/tests_pytorch/loggers/test_mlflow.py @@ -18,7 +18,12 @@ import pytest from lightning.pytorch import Trainer from lightning.pytorch.demos.boring_classes import BoringModel -from lightning.pytorch.loggers.mlflow import _MLFLOW_AVAILABLE, MLFlowLogger, _get_resolve_tags +from lightning.pytorch.loggers.mlflow import ( + _MLFLOW_AVAILABLE, + _MLFLOW_SYNCHRONOUS_AVAILABLE, + MLFlowLogger, + _get_resolve_tags, +) def mock_mlflow_run_creation(logger, experiment_name=None, experiment_id=None, run_id=None): @@ -260,6 +265,58 @@ def test_mlflow_logger_experiment_calls(mlflow_mock, tmp_path): ) +@pytest.mark.parametrize("synchronous", [False, True]) +@mock.patch("lightning.pytorch.loggers.mlflow._get_resolve_tags", Mock()) +def test_mlflow_logger_experiment_calls_with_synchronous(mlflow_mock, tmp_path, synchronous): + """Test that the logger calls methods on the mlflow experiment with the specified synchronous flag.""" + if not _MLFLOW_SYNCHRONOUS_AVAILABLE: + pytest.skip("this test requires mlflow>=2.8.0") + + time = mlflow_mock.entities.time + metric = mlflow_mock.entities.Metric + param = mlflow_mock.entities.Param + time.return_value = 1 + + mlflow_client = mlflow_mock.tracking.MlflowClient.return_value + mlflow_client.get_experiment_by_name.return_value = None + logger = MLFlowLogger( + "test", save_dir=str(tmp_path), artifact_location="my_artifact_location", synchronous=synchronous + ) + + params = {"test": "test_param"} + logger.log_hyperparams(params) + + mlflow_client.log_batch.assert_called_once_with( + run_id=logger.run_id, params=[param(key="test", value="test_param")], synchronous=synchronous + ) + param.assert_called_with(key="test", value="test_param") + + metrics 
= {"some_metric": 10} + logger.log_metrics(metrics) + + mlflow_client.log_batch.assert_called_with( + run_id=logger.run_id, + metrics=[metric(key="some_metric", value=10, timestamp=1000, step=0)], + synchronous=synchronous, + ) + metric.assert_called_with(key="some_metric", value=10, timestamp=1000, step=0) + + mlflow_client.create_experiment.assert_called_once_with(name="test", artifact_location="my_artifact_location") + + +@mock.patch("lightning.pytorch.loggers.mlflow._get_resolve_tags", Mock()) +@mock.patch.dict("lightning.pytorch.loggers.mlflow.__dict__", {"_MLFLOW_SYNCHRONOUS_AVAILABLE": False}) +def test_mlflow_logger_no_synchronous_support(mlflow_mock, tmp_path): + """Test that the logger does not support synchronous flag.""" + time = mlflow_mock.entities.time + time.return_value = 1 + + mlflow_client = mlflow_mock.tracking.MlflowClient.return_value + mlflow_client.get_experiment_by_name.return_value = None + with pytest.raises(ModuleNotFoundError): + MLFlowLogger("test", save_dir=str(tmp_path), artifact_location="my_artifact_location", synchronous=True) + + @mock.patch("lightning.pytorch.loggers.mlflow._get_resolve_tags", Mock()) def test_mlflow_logger_with_long_param_value(mlflow_mock, tmp_path): """Test that long parameter values are truncated to 250 characters.""" From 76b691d80c6c5203c66365272ce246ac86e418f0 Mon Sep 17 00:00:00 2001 From: Dominic Kerr Date: Thu, 4 Apr 2024 01:42:25 +0100 Subject: [PATCH 003/179] Support pathlib.Path file paths when saving ONNX models (#19727) Co-authored-by: dominicgkerr --- src/lightning/pytorch/core/module.py | 2 +- tests/tests_pytorch/models/test_onnx.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py index 3075e8952b148..faeda00ce5aa9 100644 --- a/src/lightning/pytorch/core/module.py +++ b/src/lightning/pytorch/core/module.py @@ -1395,7 +1395,7 @@ def forward(self, x): input_sample = self._on_before_batch_transfer(input_sample) input_sample = self._apply_batch_transfer_handler(input_sample) - torch.onnx.export(self, input_sample, file_path, **kwargs) + torch.onnx.export(self, input_sample, str(file_path), **kwargs) self.train(mode) @torch.no_grad() diff --git a/tests/tests_pytorch/models/test_onnx.py b/tests/tests_pytorch/models/test_onnx.py index 9bb20579b7162..15d06355946fc 100644 --- a/tests/tests_pytorch/models/test_onnx.py +++ b/tests/tests_pytorch/models/test_onnx.py @@ -13,6 +13,7 @@ # limitations under the License. 
import operator import os +from pathlib import Path from unittest.mock import patch import numpy as np @@ -32,11 +33,14 @@ def test_model_saves_with_input_sample(tmp_path): """Test that ONNX model saves with input sample and size is greater than 3 MB.""" model = BoringModel() - trainer = Trainer(fast_dev_run=True) - trainer.fit(model) - - file_path = os.path.join(tmp_path, "model.onnx") input_sample = torch.randn((1, 32)) + + file_path = os.path.join(tmp_path, "os.path.onnx") + model.to_onnx(file_path, input_sample) + assert os.path.isfile(file_path) + assert os.path.getsize(file_path) > 4e2 + + file_path = Path(tmp_path) / "pathlib.onnx" model.to_onnx(file_path, input_sample) assert os.path.isfile(file_path) assert os.path.getsize(file_path) > 4e2 From 316cc71c2b5030967b29098ae608129193d5cb26 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 11 Apr 2024 11:01:27 +0200 Subject: [PATCH 004/179] Skip tests that cause CLI argparse errors on Python 3.11.9 (#19756) --- tests/tests_pytorch/loggers/conftest.py | 3 ++- tests/tests_pytorch/loggers/test_mlflow.py | 3 --- tests/tests_pytorch/loggers/test_wandb.py | 3 +++ tests/tests_pytorch/test_cli.py | 27 +++++++++++++++++++++- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/tests/tests_pytorch/loggers/conftest.py b/tests/tests_pytorch/loggers/conftest.py index 0a0923b6902a6..7cc5cc94fe8cc 100644 --- a/tests/tests_pytorch/loggers/conftest.py +++ b/tests/tests_pytorch/loggers/conftest.py @@ -38,7 +38,8 @@ def mlflow_mock(monkeypatch): mlflow.tracking = mlflow_tracking mlflow.entities = mlflow_entities - (monkeypatch.setattr("lightning.pytorch.loggers.mlflow._MLFLOW_AVAILABLE", True),) + monkeypatch.setattr("lightning.pytorch.loggers.mlflow._MLFLOW_AVAILABLE", True) + monkeypatch.setattr("lightning.pytorch.loggers.mlflow._MLFLOW_SYNCHRONOUS_AVAILABLE", True) return mlflow diff --git a/tests/tests_pytorch/loggers/test_mlflow.py b/tests/tests_pytorch/loggers/test_mlflow.py index f02cfdb90901a..14af36680904c 100644 --- a/tests/tests_pytorch/loggers/test_mlflow.py +++ b/tests/tests_pytorch/loggers/test_mlflow.py @@ -20,7 +20,6 @@ from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.loggers.mlflow import ( _MLFLOW_AVAILABLE, - _MLFLOW_SYNCHRONOUS_AVAILABLE, MLFlowLogger, _get_resolve_tags, ) @@ -269,8 +268,6 @@ def test_mlflow_logger_experiment_calls(mlflow_mock, tmp_path): @mock.patch("lightning.pytorch.loggers.mlflow._get_resolve_tags", Mock()) def test_mlflow_logger_experiment_calls_with_synchronous(mlflow_mock, tmp_path, synchronous): """Test that the logger calls methods on the mlflow experiment with the specified synchronous flag.""" - if not _MLFLOW_SYNCHRONOUS_AVAILABLE: - pytest.skip("this test requires mlflow>=2.8.0") time = mlflow_mock.entities.time metric = mlflow_mock.entities.Metric diff --git a/tests/tests_pytorch/loggers/test_wandb.py b/tests/tests_pytorch/loggers/test_wandb.py index 16c69d2c6d773..f667b0a7b5ee1 100644 --- a/tests/tests_pytorch/loggers/test_wandb.py +++ b/tests/tests_pytorch/loggers/test_wandb.py @@ -25,6 +25,8 @@ from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning_utilities.test.warning import no_warning_call +from tests_pytorch.test_cli import _xfail_python_ge_3_11_9 + def test_wandb_project_name(wandb_mock): with mock.patch.dict(os.environ, {}): @@ -548,6 +550,7 @@ def test_wandb_logger_download_artifact(wandb_mock, tmp_path): wandb_mock.Api().artifact.assert_called_once_with("test_artifact", type="model") +@_xfail_python_ge_3_11_9 
@pytest.mark.parametrize(("log_model", "expected"), [("True", True), ("False", False), ("all", "all")]) def test_wandb_logger_cli_integration(log_model, expected, wandb_mock, monkeypatch, tmp_path): """Test that the WandbLogger can be used with the LightningCLI.""" diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index 605f4ec3845e7..4c28d6588cea2 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -48,6 +48,7 @@ from lightning.pytorch.utilities.imports import _TORCHVISION_AVAILABLE from lightning_utilities import compare_version from lightning_utilities.test.warning import no_warning_call +from packaging.version import Version from tensorboard.backend.event_processing import event_accumulator from tensorboard.plugins.hparams.plugin_data_pb2 import HParamsPluginData from torch.optim import SGD @@ -64,6 +65,14 @@ def lazy_instance(*args, **kwargs): return None +_xfail_python_ge_3_11_9 = pytest.mark.xfail( + # https://github.com/omni-us/jsonargparse/issues/484 + Version(f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}") >= Version("3.11.9"), + strict=False, + reason="jsonargparse + Python 3.11.9 compatibility issue", +) + + @contextmanager def mock_subclasses(baseclass, *subclasses): """Mocks baseclass so that it only has the given child subclasses.""" @@ -347,6 +356,7 @@ def test_save_to_log_dir_false_error(): ) +@_xfail_python_ge_3_11_9 def test_lightning_cli_logger_save_config(cleandir): class LoggerSaveConfigCallback(SaveConfigCallback): def __init__(self, *args, **kwargs) -> None: @@ -736,6 +746,7 @@ def add_arguments_to_parser(self, parser): assert cli.trainer.lr_scheduler_configs[0].scheduler.step_size == 50 +@_xfail_python_ge_3_11_9 @pytest.mark.parametrize("use_generic_base_class", [False, True]) def test_lightning_cli_optimizers_and_lr_scheduler_with_link_to(use_generic_base_class): class MyLightningCLI(LightningCLI): @@ -782,7 +793,7 @@ def __init__(self, optim1: dict, optim2: dict, scheduler: dict): assert isinstance(cli.model.scheduler, torch.optim.lr_scheduler.ExponentialLR) -@pytest.mark.skipif(compare_version("jsonargparse", operator.lt, "4.21.3"), reason="vulnerability with failing imports") +@_xfail_python_ge_3_11_9 def test_lightning_cli_optimizers_and_lr_scheduler_with_callable_type(): class TestModel(BoringModel): def __init__( @@ -1031,6 +1042,7 @@ def __init__(self, foo, bar=5): self.bar = bar +@_xfail_python_ge_3_11_9 def test_lightning_cli_model_short_arguments(): with mock.patch("sys.argv", ["any.py", "fit", "--model=BoringModel"]), mock.patch( "lightning.pytorch.Trainer._fit_impl" @@ -1055,6 +1067,7 @@ def __init__(self, foo, bar=5): self.bar = bar +@_xfail_python_ge_3_11_9 def test_lightning_cli_datamodule_short_arguments(): # with set model with mock.patch("sys.argv", ["any.py", "fit", "--data=BoringDataModule"]), mock.patch( @@ -1100,6 +1113,7 @@ def test_lightning_cli_datamodule_short_arguments(): assert cli.parser.groups["data"].group_class is BoringDataModule +@_xfail_python_ge_3_11_9 @pytest.mark.parametrize("use_class_path_callbacks", [False, True]) def test_callbacks_append(use_class_path_callbacks): """This test validates registries are used when simplified command line are being used.""" @@ -1143,6 +1157,7 @@ def test_callbacks_append(use_class_path_callbacks): assert all(t in callback_types for t in expected) +@_xfail_python_ge_3_11_9 def test_optimizers_and_lr_schedulers_reload(cleandir): base = ["any.py", "--trainer.max_epochs=1"] input = base + [ @@ -1174,6 
+1189,7 @@ def test_optimizers_and_lr_schedulers_reload(cleandir): LightningCLI(BoringModel, run=False) +@_xfail_python_ge_3_11_9 def test_optimizers_and_lr_schedulers_add_arguments_to_parser_implemented_reload(cleandir): class TestLightningCLI(LightningCLI): def __init__(self, *args): @@ -1427,6 +1443,7 @@ def test_cli_help_message(): assert "Implements Adam" in shorthand_help.getvalue() +@_xfail_python_ge_3_11_9 def test_cli_reducelronplateau(): with mock.patch( "sys.argv", ["any.py", "--optimizer=Adam", "--lr_scheduler=ReduceLROnPlateau", "--lr_scheduler.monitor=foo"] @@ -1437,6 +1454,7 @@ def test_cli_reducelronplateau(): assert config["lr_scheduler"]["scheduler"].monitor == "foo" +@_xfail_python_ge_3_11_9 def test_cli_configureoptimizers_can_be_overridden(): class MyCLI(LightningCLI): def __init__(self): @@ -1481,6 +1499,7 @@ def __init__(self, activation: torch.nn.Module = lazy_instance(torch.nn.LeakyReL assert cli.model.activation is not model.activation +@_xfail_python_ge_3_11_9 def test_ddpstrategy_instantiation_and_find_unused_parameters(mps_count_0): strategy_default = lazy_instance(DDPStrategy, find_unused_parameters=True) with mock.patch("sys.argv", ["any.py", "--trainer.strategy.process_group_backend=group"]): @@ -1496,6 +1515,7 @@ def test_ddpstrategy_instantiation_and_find_unused_parameters(mps_count_0): assert strategy_default is not cli.config_init.trainer.strategy +@_xfail_python_ge_3_11_9 def test_cli_logger_shorthand(): with mock.patch("sys.argv", ["any.py"]): cli = LightningCLI(TestModel, run=False, trainer_defaults={"logger": False}) @@ -1526,6 +1546,7 @@ def _test_logger_init_args(logger_name, init, unresolved=None): assert data["dict_kwargs"] == unresolved +@_xfail_python_ge_3_11_9 def test_comet_logger_init_args(): _test_logger_init_args( "CometLogger", @@ -1541,6 +1562,7 @@ def test_comet_logger_init_args(): strict=False, reason="TypeError on Windows when parsing", ) +@_xfail_python_ge_3_11_9 def test_neptune_logger_init_args(): _test_logger_init_args( "NeptuneLogger", @@ -1549,6 +1571,7 @@ def test_neptune_logger_init_args(): ) +@_xfail_python_ge_3_11_9 def test_tensorboard_logger_init_args(): _test_logger_init_args( "TensorBoardLogger", @@ -1560,6 +1583,7 @@ def test_tensorboard_logger_init_args(): ) +@_xfail_python_ge_3_11_9 def test_wandb_logger_init_args(): _test_logger_init_args( "WandbLogger", @@ -1644,6 +1668,7 @@ def __init__(self, a_func: Callable = torch.nn.Softmax): assert "a_func: torch.nn.Softmax" in out.getvalue() +@_xfail_python_ge_3_11_9 def test_pytorch_profiler_init_args(): from lightning.pytorch.profilers import Profiler, PyTorchProfiler From dcb91d53d2133b4db1bf3201b4f965646dea76fd Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 11 Apr 2024 11:52:28 +0200 Subject: [PATCH 005/179] Fix initialized weights resetting in `Fabric.setup()` when using FSDP (#19755) --- src/lightning/fabric/CHANGELOG.md | 3 ++- .../fabric/utilities/device_dtype_mixin.py | 6 ++---- .../strategies/test_fsdp_integration.py | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index d88f2ec12827a..d77a6a5131c78 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -51,7 +51,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed issue where some model methods couldn't be monkeypatched after being Fabric wrapped ([#19705](https://github.com/Lightning-AI/pytorch-lightning/pull/19705)) -- +- Fixed an issue causing weights to be reset in `Fabric.setup()` when using FSDP ([#19755](https://github.com/Lightning-AI/pytorch-lightning/pull/19755)) + ## [2.2.1] - 2024-03-04 diff --git a/src/lightning/fabric/utilities/device_dtype_mixin.py b/src/lightning/fabric/utilities/device_dtype_mixin.py index a9e614cbbd1d2..9f06dc50cfbef 100644 --- a/src/lightning/fabric/utilities/device_dtype_mixin.py +++ b/src/lightning/fabric/utilities/device_dtype_mixin.py @@ -109,14 +109,12 @@ def half(self) -> Self: def _update_properties( root: torch.nn.Module, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None ) -> None: - def apply_fn(module: Union[_DeviceDtypeModuleMixin, Module]) -> None: + for module in root.modules(): if not isinstance(module, _DeviceDtypeModuleMixin): - return + continue # cannot use `module.to()` because we don't actually want to move the model in case there are multiple # devices types (such as partial meta parameters) if device is not None: module._device = device if dtype is not None: module._dtype = dtype - - root.apply(apply_fn) diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index 4a971294a326d..03d1d0979ea66 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -668,3 +668,22 @@ def test_save_sharded_and_consolidate_and_load(tmp_path): model, optimizer = fabric.setup(model, optimizer) state = {"model": model, "optimizer": optimizer, "steps": 1} fabric.load(checkpoint_path_full, state) + + +@RunIf(min_cuda_gpus=2, standalone=True) +def test_no_call_to_apply(monkeypatch): + """Regression test to ensure we're not calling `FSDP.apply()` indirectly (see #19755).""" + monkeypatch.setattr(torch.distributed.fsdp.FullyShardedDataParallel, "apply", Mock()) + + fabric = Fabric( + accelerator="cuda", + strategy=FSDPStrategy(auto_wrap_policy=always_wrap_policy), + devices=2, + ) + fabric.launch() + + for setup_method in ("setup", "setup_module"): + model = BoringModel() + setup = getattr(fabric, setup_method) + model = setup(model) + model._forward_module.apply.assert_not_called() From 3f97e16cd447729d95d45c17565b579f03e0b022 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Apr 2024 06:40:25 -0400 Subject: [PATCH 006/179] [pre-commit.ci] pre-commit suggestions (#19723) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 088d622821839..61281a8597ffc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -70,7 +70,7 @@ repos: - id: sphinx-lint - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.2 + rev: v0.3.5 hooks: # try to fix what is possible - id: ruff From f642d685089a70bbc44bd4e4195d37f065a0c229 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Fri, 12 Apr 2024 13:11:21 +0200 Subject: [PATCH 007/179] ci/lint: simlify prettier (#19742) --- .pre-commit-config.yaml | 1 + .prettierignore | 2 -- 2 files changed, 1 insertion(+), 2 
deletions(-) delete mode 100644 .prettierignore diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 61281a8597ffc..834a903bcac19 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -101,4 +101,5 @@ repos: hooks: - id: prettier # https://prettier.io/docs/en/options.html#print-width + files: \.(json|yml|yaml|toml) args: ["--print-width=120"] diff --git a/.prettierignore b/.prettierignore deleted file mode 100644 index c5646f7112746..0000000000000 --- a/.prettierignore +++ /dev/null @@ -1,2 +0,0 @@ -# Ignore all MD files: -**/*.md From 67b270bd4d4f1fd3878cb875cd7908088767bbd8 Mon Sep 17 00:00:00 2001 From: PL Ghost <75324987+pl-ghost@users.noreply.github.com> Date: Fri, 12 Apr 2024 09:19:39 -0400 Subject: [PATCH 008/179] Adding test for legacy checkpoint created with 2.2.2 (#19760) --- tests/legacy/back-compatible-versions.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/legacy/back-compatible-versions.txt b/tests/legacy/back-compatible-versions.txt index 1d8e1abccfdd1..a6b218707420c 100644 --- a/tests/legacy/back-compatible-versions.txt +++ b/tests/legacy/back-compatible-versions.txt @@ -98,3 +98,4 @@ 2.1.3 2.2.0.post0 2.2.1 +2.2.2 From ce90b3898aff1dac89215f49f40b19777c91125a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 14 Apr 2024 06:01:58 -0700 Subject: [PATCH 009/179] Sanitize hparams that can't be json-serialized in `WandbLogger.log_hyperparameters()` (#19769) --- src/lightning/fabric/utilities/logger.py | 19 +++++++++++++- src/lightning/pytorch/CHANGELOG.md | 3 ++- src/lightning/pytorch/loggers/wandb.py | 8 +++++- tests/tests_fabric/utilities/test_logger.py | 29 ++++++++++++++++++++- tests/tests_pytorch/loggers/test_wandb.py | 6 +++-- 5 files changed, 59 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/utilities/logger.py b/src/lightning/fabric/utilities/logger.py index 2604a0d926d21..abe5816deda1a 100644 --- a/src/lightning/fabric/utilities/logger.py +++ b/src/lightning/fabric/utilities/logger.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import json from argparse import Namespace from dataclasses import asdict, is_dataclass from typing import Any, Dict, Mapping, MutableMapping, Optional, Union @@ -132,6 +132,23 @@ def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: return params +def _convert_json_serializable(params: Dict[str, Any]) -> Dict[str, Any]: + """Convert non-serializable objects in params to string.""" + return {k: str(v) if not _is_json_serializable(v) else v for k, v in params.items()} + + +def _is_json_serializable(value: Any) -> bool: + """Test whether a variable can be encoded as json.""" + if value is None or isinstance(value, (bool, int, float, str, list, dict)): # fast path + return True + try: + json.dumps(value) + return True + except (TypeError, OverflowError): + # OverflowError is raised if number is too large to encode + return False + + def _add_prefix( metrics: Mapping[str, Union[Tensor, float]], prefix: str, separator: str ) -> Mapping[str, Union[Tensor, float]]: diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index be7b66a27ca66..9f2a1b40acb1a 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -49,7 +49,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an issue causing a TypeError when using `torch.compile` as a decorator ([#19627](https://github.com/Lightning-AI/pytorch-lightning/pull/19627)) -- + +- Fixed `WandbLogger.log_hyperparameters()` raising an error if hyperparameters are not JSON serializable ([#19769](https://github.com/Lightning-AI/pytorch-lightning/pull/19769)) - diff --git a/src/lightning/pytorch/loggers/wandb.py b/src/lightning/pytorch/loggers/wandb.py index 4025f2cd18004..c5d995bff35a5 100644 --- a/src/lightning/pytorch/loggers/wandb.py +++ b/src/lightning/pytorch/loggers/wandb.py @@ -26,7 +26,12 @@ from torch import Tensor from typing_extensions import override -from lightning.fabric.utilities.logger import _add_prefix, _convert_params, _sanitize_callable_params +from lightning.fabric.utilities.logger import ( + _add_prefix, + _convert_json_serializable, + _convert_params, + _sanitize_callable_params, +) from lightning.fabric.utilities.types import _PATH from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from lightning.pytorch.loggers.logger import Logger, rank_zero_experiment @@ -419,6 +424,7 @@ def watch( def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: params = _convert_params(params) params = _sanitize_callable_params(params) + params = _convert_json_serializable(params) self.experiment.config.update(params, allow_val_change=True) @override diff --git a/tests/tests_fabric/utilities/test_logger.py b/tests/tests_fabric/utilities/test_logger.py index 5b6211331474a..33681c65f7fa2 100644 --- a/tests/tests_fabric/utilities/test_logger.py +++ b/tests/tests_fabric/utilities/test_logger.py @@ -11,14 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from argparse import Namespace from dataclasses import dataclass +from pathlib import Path import numpy as np import torch from lightning.fabric.utilities.logger import ( _add_prefix, + _convert_json_serializable, _convert_params, _flatten_dict, _sanitize_callable_params, @@ -167,3 +168,29 @@ def test_add_prefix(): assert "prefix-metric2" not in metrics assert metrics["prefix2_prefix-metric1"] == 1 assert metrics["prefix2_prefix-metric2"] == 2 + + +def test_convert_json_serializable(): + data = { + # JSON-serializable + "none": None, + "int": 1, + "float": 1.1, + "bool": True, + "dict": {"a": 1}, + "list": [2, 3, 4], + # not JSON-serializable + "path": Path("path"), + "tensor": torch.tensor(1), + } + expected = { + "none": None, + "int": 1, + "float": 1.1, + "bool": True, + "dict": {"a": 1}, + "list": [2, 3, 4], + "path": "path", + "tensor": "tensor(1)", + } + assert _convert_json_serializable(data) == expected diff --git a/tests/tests_pytorch/loggers/test_wandb.py b/tests/tests_pytorch/loggers/test_wandb.py index f667b0a7b5ee1..a8e70bfb6589d 100644 --- a/tests/tests_pytorch/loggers/test_wandb.py +++ b/tests/tests_pytorch/loggers/test_wandb.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os import pickle +from pathlib import Path from unittest import mock import pytest @@ -113,9 +114,10 @@ def test_wandb_logger_init(wandb_mock): wandb_mock.init().log.assert_called_with({"acc": 1.0, "trainer/global_step": 6}) # log hyper parameters - hparams = {"test": None, "nested": {"a": 1}, "b": [2, 3, 4]} + hparams = {"none": None, "dict": {"a": 1}, "b": [2, 3, 4], "path": Path("path")} + expected = {"none": None, "dict": {"a": 1}, "b": [2, 3, 4], "path": "path"} logger.log_hyperparams(hparams) - wandb_mock.init().config.update.assert_called_once_with(hparams, allow_val_change=True) + wandb_mock.init().config.update.assert_called_once_with(expected, allow_val_change=True) # watch a model logger.watch("model", "log", 10, False) From 58ad56afece3ea7faec2f1b7f68d90195f316d78 Mon Sep 17 00:00:00 2001 From: David de la Iglesia Castro Date: Mon, 15 Apr 2024 16:16:17 +0200 Subject: [PATCH 010/179] Use `step` interval in `estimated_stepping_batches` docs example (#19774) --- docs/source-pytorch/common/trainer.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index 0ad4592754134..c40ad1fcf92e2 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -1229,7 +1229,10 @@ dataloader if hadn't been set up already. optimizer = ... stepping_batches = self.trainer.estimated_stepping_batches scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3, total_steps=stepping_batches) - return [optimizer], [scheduler] + return { + "optimizer": optimizer, + "lr_scheduler": {"scheduler": scheduler, "interval": "step"}, + } state ***** From c235f20e7131af2c7be4cc9080d3c946d93d58ea Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 17 Apr 2024 01:28:44 +0200 Subject: [PATCH 011/179] Remove the requirement for FSDPStrategy subclasses to only support GPU (#19781) --- .../trainer/connectors/accelerator_connector.py | 7 ++++--- tests/tests_pytorch/strategies/test_fsdp.py | 6 +----- .../connectors/test_accelerator_connector.py | 15 +++++++++++---- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py index 269a1c4c7b754..a191859c06c43 100644 --- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py +++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py @@ -455,10 +455,11 @@ def _check_strategy_and_fallback(self) -> None: strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag if ( - strategy_flag in FSDPStrategy.get_registered_strategies() or isinstance(self._strategy_flag, FSDPStrategy) + strategy_flag in FSDPStrategy.get_registered_strategies() or type(self._strategy_flag) is FSDPStrategy ) and self._accelerator_flag not in ("cuda", "gpu"): - raise MisconfigurationException( - f"You selected strategy to be `{FSDPStrategy.strategy_name}`, but GPU accelerator is not used." 
+ raise ValueError( + f"The strategy `{FSDPStrategy.strategy_name}` requires a GPU accelerator, but got:" + f" {self._accelerator_flag}" ) if strategy_flag in _DDP_FORK_ALIASES and "fork" not in torch.multiprocessing.get_all_start_methods(): raise ValueError( diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 751a42b96b42f..413a5e6c9dddd 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -28,7 +28,6 @@ from lightning.pytorch.strategies import FSDPStrategy from lightning.pytorch.trainer.states import TrainerFn from lightning.pytorch.utilities.consolidate_checkpoint import _format_checkpoint -from lightning.pytorch.utilities.exceptions import MisconfigurationException from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, FullyShardedDataParallel, MixedPrecision from torch.distributed.fsdp.wrap import always_wrap_policy, size_based_auto_wrap_policy, wrap from torchmetrics import Accuracy @@ -216,10 +215,7 @@ def _assert_save_equality(trainer, ckpt_path, cls=TestFSDPModel): def test_invalid_on_cpu(tmp_path, cuda_count_0): """Test to ensure that we raise Misconfiguration for FSDP on CPU.""" - with pytest.raises( - MisconfigurationException, - match=f"You selected strategy to be `{FSDPStrategy.strategy_name}`, but GPU accelerator is not used.", - ): + with pytest.raises(ValueError, match="The strategy `fsdp` requires a GPU accelerator"): trainer = Trainer(accelerator="cpu", default_root_dir=tmp_path, fast_dev_run=True, strategy="fsdp") assert isinstance(trainer.strategy, FSDPStrategy) trainer.strategy.setup_environment() diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index ee19c3951cb25..977e297b42665 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -564,12 +564,19 @@ def test_strategy_choice_ddp_cpu_slurm(cuda_count_0, strategy): def test_check_fsdp_strategy_and_fallback(): - with pytest.raises( - MisconfigurationException, - match=f"You selected strategy to be `{FSDPStrategy.strategy_name}`, but GPU accelerator is not used.", - ): + with pytest.raises(ValueError, match="The strategy `fsdp` requires a GPU accelerator"): Trainer(accelerator="cpu", strategy="fsdp") + class FSDPStrategySubclass(FSDPStrategy): + pass + + class AcceleratorSubclass(CPUAccelerator): + pass + + # we allow subclasses of FSDPStrategy to be used with other accelerators + Trainer(accelerator="cpu", strategy=FSDPStrategySubclass()) + Trainer(accelerator=AcceleratorSubclass(), strategy=FSDPStrategySubclass()) + @mock.patch.dict(os.environ, {}, clear=True) def test_unsupported_tpu_choice(xla_available, tpu_available): From a2b3dddf1dbef9e398cb14e7f07ca6df7648399f Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 22 Apr 2024 17:47:04 +0100 Subject: [PATCH 012/179] Update Lightning Cloud to 0.5.67 (#19795) --- requirements/app/app.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/app.txt b/requirements/app/app.txt index 85e5b270c09e8..002e0a1c1c23f 100644 --- a/requirements/app/app.txt +++ b/requirements/app/app.txt @@ -1,4 +1,4 @@ -lightning-cloud == 0.5.65 # Must be pinned to ensure compatibility +lightning-cloud == 0.5.67 # Must be pinned to ensure compatibility packaging typing-extensions >=4.4.0, <4.10.0 deepdiff >=5.7.0, <6.6.0 
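Note on patch 011 above ("Remove the requirement for FSDPStrategy subclasses to only support GPU"): the load-bearing detail is the switch from `isinstance(self._strategy_flag, FSDPStrategy)` to the exact-type check `type(self._strategy_flag) is FSDPStrategy`, which is what lets subclasses opt out of the GPU requirement while the base strategy keeps it. A minimal standalone sketch of the resulting behavior follows; the class and function names are hypothetical stand-ins mirroring the new test, not Lightning code.

    class FSDPStrategy:
        strategy_name = "fsdp"

    class FSDPStrategySubclass(FSDPStrategy):  # hypothetical user subclass
        pass

    def check_strategy(strategy: FSDPStrategy, accelerator_flag: str) -> None:
        # Exact-type check: only the base FSDPStrategy is pinned to GPU;
        # subclasses fall through and may target other accelerators.
        if type(strategy) is FSDPStrategy and accelerator_flag not in ("cuda", "gpu"):
            raise ValueError(
                f"The strategy `{strategy.strategy_name}` requires a GPU accelerator,"
                f" but got: {accelerator_flag}"
            )

    check_strategy(FSDPStrategySubclass(), "cpu")  # allowed: subclass is exempt

    try:
        check_strategy(FSDPStrategy(), "cpu")  # exact base type on CPU -> error
    except ValueError as err:
        print(err)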
From b9680a364da4e875b237ec3c03e67a9c32ef475b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 22 Apr 2024 19:52:43 +0200 Subject: [PATCH 013/179] Update changelog after 2.2.2 release (#19770) --- src/lightning/fabric/CHANGELOG.md | 14 +++++++++++--- src/lightning/pytorch/CHANGELOG.md | 13 +++++++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index d77a6a5131c78..d53529d391418 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -47,12 +47,20 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed an issue causing a TypeError when using `torch.compile` as a decorator ([#19627](https://github.com/Lightning-AI/pytorch-lightning/pull/19627)) +- -- Fixed issue where some model methods couldn't be monkeypatched after being Fabric wrapped ([#19705](https://github.com/Lightning-AI/pytorch-lightning/pull/19705)) +- + +- -- Fixed an issue causing weights to be reset in `Fabric.setup()` when using FSDP ([#19755](https://github.com/Lightning-AI/pytorch-lightning/pull/19755)) +## [2.2.2] - 2024-04-11 + +### Fixed + +- Fixed an issue causing a TypeError when using `torch.compile` as a decorator ([#19627](https://github.com/Lightning-AI/pytorch-lightning/pull/19627)) +- Fixed issue where some model methods couldn't be monkeypatched after being Fabric wrapped ([#19705](https://github.com/Lightning-AI/pytorch-lightning/pull/19705)) +- Fixed an issue causing weights to be reset in `Fabric.setup()` when using FSDP ([#19755](https://github.com/Lightning-AI/pytorch-lightning/pull/19755)) ## [2.2.1] - 2024-03-04 diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 9f2a1b40acb1a..3838a6258b052 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -44,10 +44,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed a KeyError when saving a FSDP sharded checkpoint and setting `save_weights_only=True` ([#19524](https://github.com/Lightning-AI/pytorch-lightning/pull/19524)) - - -- Fixed an issue causing a TypeError when using `torch.compile` as a decorator ([#19627](https://github.com/Lightning-AI/pytorch-lightning/pull/19627)) +- - Fixed `WandbLogger.log_hyperparameters()` raising an error if hyperparameters are not JSON serializable ([#19769](https://github.com/Lightning-AI/pytorch-lightning/pull/19769)) @@ -55,6 +52,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- +## [2.2.2] - 2024-04-11 + +### Fixed + +- Fixed a KeyError when saving a FSDP sharded checkpoint and setting `save_weights_only=True` ([#19524](https://github.com/Lightning-AI/pytorch-lightning/pull/19524)) +- Fixed an issue causing a TypeError when using `torch.compile` as a decorator ([#19627](https://github.com/Lightning-AI/pytorch-lightning/pull/19627)) + + ## [2.2.1] - 2024-03-04 From 5e0e02b79e07b1c7f76d75d2fcb669dbc13e958e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 27 Apr 2024 07:24:07 +0200 Subject: [PATCH 014/179] Remove support for PyTorch 1.13 (#19706) --- .github/checkgroup.yml | 46 ++-- .github/workflows/ci-tests-fabric.yml | 24 +- .github/workflows/ci-tests-pytorch.yml | 24 +- .github/workflows/docker-build.yml | 7 +- dockers/release/Dockerfile | 6 +- requirements/fabric/base.txt | 2 +- requirements/fabric/examples.txt | 2 +- requirements/fabric/strategies.txt | 2 +- requirements/pytorch/base.txt | 2 +- requirements/pytorch/examples.txt | 2 +- requirements/pytorch/strategies.txt | 2 +- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/fabric/accelerators/cuda.py | 212 +----------------- src/lightning/fabric/fabric.py | 31 +-- src/lightning/fabric/plugins/precision/amp.py | 5 +- .../fabric/plugins/precision/fsdp.py | 10 +- src/lightning/fabric/strategies/fsdp.py | 65 ++---- src/lightning/fabric/strategies/strategy.py | 7 +- src/lightning/fabric/strategies/xla_fsdp.py | 11 - src/lightning/fabric/utilities/imports.py | 3 +- src/lightning/fabric/utilities/load.py | 7 +- .../fabric/utilities/testing/_runif.py | 3 +- src/lightning/fabric/utilities/types.py | 50 +---- src/lightning/fabric/wrappers.py | 27 +-- src/lightning/pytorch/CHANGELOG.md | 2 +- .../callbacks/stochastic_weight_avg.py | 2 +- src/lightning/pytorch/cli.py | 10 +- src/lightning/pytorch/core/hooks.py | 4 +- src/lightning/pytorch/core/module.py | 7 +- src/lightning/pytorch/core/optimizer.py | 3 +- src/lightning/pytorch/demos/boring_classes.py | 4 +- .../pytorch/plugins/precision/amp.py | 5 +- .../pytorch/plugins/precision/fsdp.py | 10 +- src/lightning/pytorch/strategies/deepspeed.py | 3 +- src/lightning/pytorch/strategies/fsdp.py | 43 +--- src/lightning/pytorch/strategies/strategy.py | 6 +- .../connectors/logger_connector/result.py | 6 +- src/lightning/pytorch/trainer/trainer.py | 15 +- src/lightning/pytorch/tuner/lr_finder.py | 21 +- src/lightning/pytorch/utilities/compile.py | 22 +- .../utilities/model_summary/model_summary.py | 6 +- .../pytorch/utilities/testing/_runif.py | 3 +- src/lightning/pytorch/utilities/types.py | 11 +- tests/tests_fabric/accelerators/test_cuda.py | 14 -- .../plugins/precision/test_fsdp.py | 22 +- tests/tests_fabric/strategies/test_ddp.py | 3 +- .../strategies/test_ddp_integration.py | 9 +- tests/tests_fabric/strategies/test_fsdp.py | 29 --- .../strategies/test_fsdp_integration.py | 29 +-- .../tests_fabric/strategies/test_strategy.py | 7 +- .../tests_fabric/strategies/test_xla_fsdp.py | 6 +- .../strategies/test_xla_fsdp_integration.py | 6 +- tests/tests_fabric/test_fabric.py | 18 +- tests/tests_fabric/test_wrappers.py | 8 +- tests/tests_fabric/utilities/test_load.py | 7 - .../core/test_lightning_module.py | 4 - tests/tests_pytorch/models/test_hooks.py | 13 +- .../plugins/precision/test_fsdp.py | 22 +- tests/tests_pytorch/strategies/test_common.py | 4 +- tests/tests_pytorch/strategies/test_ddp.py | 3 +- .../strategies/test_ddp_integration.py | 5 +- tests/tests_pytorch/strategies/test_fsdp.py | 123 +++------- 
.../optimization/test_manual_optimization.py | 7 +- tests/tests_pytorch/trainer/test_trainer.py | 9 - tests/tests_pytorch/utilities/test_compile.py | 4 + .../utilities/test_model_summary.py | 5 - 66 files changed, 221 insertions(+), 871 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 7a89f64b14181..37f1e3cd844d2 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -19,21 +19,21 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "pl-cpu (macOS-11, lightning, 3.8, 1.13, oldest)" - - "pl-cpu (macOS-11, lightning, 3.10, 1.13)" + - "pl-cpu (macOS-11, lightning, 3.8, 2.0, oldest)" + - "pl-cpu (macOS-11, lightning, 3.10, 2.0)" - "pl-cpu (macOS-11, lightning, 3.10, 2.1)" - "pl-cpu (macOS-11, lightning, 3.10, 2.2)" - - "pl-cpu (ubuntu-20.04, lightning, 3.8, 1.13, oldest)" - - "pl-cpu (ubuntu-20.04, lightning, 3.10, 1.13)" + - "pl-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" + - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.2)" - - "pl-cpu (windows-2022, lightning, 3.8, 1.13, oldest)" - - "pl-cpu (windows-2022, lightning, 3.10, 1.13)" + - "pl-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" + - "pl-cpu (windows-2022, lightning, 3.10, 2.0)" - "pl-cpu (windows-2022, lightning, 3.10, 2.1)" - "pl-cpu (windows-2022, lightning, 3.10, 2.2)" - - "pl-cpu (macOS-11, pytorch, 3.8, 1.13)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 1.13)" - - "pl-cpu (windows-2022, pytorch, 3.8, 1.13)" + - "pl-cpu (macOS-11, pytorch, 3.8, 2.0)" + - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 2.0)" + - "pl-cpu (windows-2022, pytorch, 3.8, 2.0)" - "pl-cpu (macOS-12, pytorch, 3.11, 2.0)" - "pl-cpu (macOS-12, pytorch, 3.11, 2.1)" - "pl-cpu (ubuntu-22.04, pytorch, 3.11, 2.0)" @@ -140,15 +140,17 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "build-cuda (3.9, 1.13, 11.8.0)" - - "build-cuda (3.9, 1.13, 12.0.1)" - "build-cuda (3.10, 2.0, 11.8.0)" - "build-cuda (3.10, 2.1, 12.1.0)" + - "build-cuda (3.10, 2.2, 12.1.0)" + - "build-cuda (3.11, 2.1, 12.1.0)" + - "build-cuda (3.11, 2.2, 12.1.0)" #- "build-NGC" - - "build-pl (3.9, 1.13, 11.8.0)" - - "build-pl (3.9, 1.13, 12.0.1)" - "build-pl (3.10, 2.0, 11.8.0)" - "build-pl (3.10, 2.1, 12.1.0)" + - "build-pl (3.10, 2.2, 12.1.0)" + - "build-pl (3.11, 2.1, 12.1.0)" + - "build-pl (3.11, 2.2, 12.1.0)" # SECTION: lightning_fabric @@ -165,21 +167,21 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "fabric-cpu (macOS-11, lightning, 3.8, 1.13, oldest)" - - "fabric-cpu (macOS-11, lightning, 3.10, 1.13)" + - "fabric-cpu (macOS-11, lightning, 3.8, 2.0, oldest)" + - "fabric-cpu (macOS-11, lightning, 3.10, 2.0)" - "fabric-cpu (macOS-11, lightning, 3.11, 2.1)" - "fabric-cpu (macOS-11, lightning, 3.11, 2.2)" - - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 1.13, oldest)" - - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 1.13)" + - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" + - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.1)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2)" - - "fabric-cpu (windows-2022, lightning, 3.8, 1.13, oldest)" - - "fabric-cpu (windows-2022, lightning, 3.10, 1.13)" + - "fabric-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" + - "fabric-cpu (windows-2022, lightning, 3.10, 2.0)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.2)" - - "fabric-cpu (macOS-11, fabric, 3.8, 1.13)" - - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 1.13)" - - 
"fabric-cpu (windows-2022, fabric, 3.8, 1.13)" + - "fabric-cpu (macOS-11, fabric, 3.8, 2.0)" + - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 2.0)" + - "fabric-cpu (windows-2022, fabric, 3.8, 2.0)" - "fabric-cpu (macOS-12, fabric, 3.11, 2.0)" - "fabric-cpu (macOS-12, fabric, 3.11, 2.1)" - "fabric-cpu (ubuntu-22.04, fabric, 3.11, 2.0)" diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index ccb6fc928a014..61c60889a5aa0 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -39,9 +39,9 @@ jobs: fail-fast: false matrix: include: - - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } + - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } # only run PyTorch latest - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" } @@ -57,31 +57,25 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } # "oldest" versions tests, only on minimum Python - - { - os: "macOS-11", - pkg-name: "lightning", - python-version: "3.8", - pytorch-version: "1.13", - requires: "oldest", - } + - { os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "1.13", + pytorch-version: "2.0", requires: "oldest", } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "1.13", + pytorch-version: "2.0", requires: "oldest", } # "fabric" installs the standalone package - - { os: "macOS-11", pkg-name: "fabric", python-version: "3.8", pytorch-version: "1.13" } - - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.8", pytorch-version: "1.13" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.8", pytorch-version: "1.13" } + - { os: "macOS-11", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } + - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } + - { os: "windows-2022", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } timeout-minutes: 25 # because of building grpcio on Mac env: PACKAGE_NAME: ${{ matrix.pkg-name }} diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 1e5835054fdfe..b7f5b14baf255 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -43,9 +43,9 @@ jobs: fail-fast: false matrix: include: - - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "1.13" } + - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", 
pytorch-version: "2.0" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } # only run PyTorch latest - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } @@ -61,31 +61,25 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" } # "oldest" versions tests, only on minimum Python - - { - os: "macOS-11", - pkg-name: "lightning", - python-version: "3.8", - pytorch-version: "1.13", - requires: "oldest", - } + - { os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "1.13", + pytorch-version: "2.0", requires: "oldest", } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "1.13", + pytorch-version: "2.0", requires: "oldest", } # "pytorch" installs the standalone package - - { os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.13" } - - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.13" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "1.13" } + - { os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } + - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } timeout-minutes: 50 env: PACKAGE_NAME: ${{ matrix.pkg-name }} diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 8005d3386ba5e..7ea9f824bb6b1 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -43,10 +43,11 @@ jobs: include: # We only release one docker image per PyTorch version. # Make sure the matrix here matches the one below. - - { python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.8.0" } - - { python_version: "3.9", pytorch_version: "1.13", cuda_version: "12.0.1" } - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" } - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } + - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" } steps: - uses: actions/checkout@v4 with: @@ -103,8 +104,6 @@ jobs: include: # These are the base images for PL release docker images. # Make sure the matrix here matches the one above. 
- - { python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.8.0" } - - { python_version: "3.9", pytorch_version: "1.13", cuda_version: "12.0.1" } - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" } - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 5001929b14798..6f8b884857b65 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.13 -ARG CUDA_VERSION=11.3.1 +ARG PYTHON_VERSION=3.10 +ARG PYTORCH_VERSION=2.0 +ARG CUDA_VERSION=11.8.0 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index c57d24a49e583..3a6cdbacd302f 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.17.2, <1.27.0 -torch >=1.13.0, <2.3.0 +torch >=2.0.0, <2.3.0 fsspec[http] >=2022.5.0, <2023.11.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt index e077065766b76..d0be7e3af8496 100644 --- a/requirements/fabric/examples.txt +++ b/requirements/fabric/examples.txt @@ -1,6 +1,6 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torchvision >=0.14.0, <0.18.0 +torchvision >=0.15.0, <0.18.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/fabric/strategies.txt b/requirements/fabric/strategies.txt index 6c302f21269e3..4aee89d9f68e7 100644 --- a/requirements/fabric/strategies.txt +++ b/requirements/fabric/strategies.txt @@ -5,5 +5,5 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 -deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" # strict +deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict bitsandbytes >=0.42.0,<0.43.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index ed4250bb3832b..3578917e2cdf0 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.17.2, <1.27.0 -torch >=1.13.0, <2.3.0 +torch >=2.0.0, <2.3.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 fsspec[http] >=2022.5.0, <2023.11.0 diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 56b7971eb61b0..716b033def533 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment requests <2.32.0 -torchvision >=0.14.0, <0.18.0 +torchvision >=0.15.0, <0.18.0 gym[classic_control] >=0.17.0, <0.27.0 
ipython[all] <8.15.0 torchmetrics >=0.10.0, <1.3.0 diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 751ca213d3b53..8d3af408a98fe 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -3,4 +3,4 @@ # note: is a bug around 0.10 with `MPS_Accelerator must implement all abstract methods` # shall be resolved by https://github.com/microsoft/DeepSpeed/issues/4372 -deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" # strict +deepspeed >=0.8.2, <=0.9.3; platform_system != "Windows" and platform_system != "Darwin" # strict diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index d53529d391418..154433a1c101d 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -39,7 +39,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed -- +- Removed support for PyTorch 1.13 ([#19706](https://github.com/Lightning-AI/lightning/pull/19706)) - diff --git a/src/lightning/fabric/accelerators/cuda.py b/src/lightning/fabric/accelerators/cuda.py index 8613c6549e4c9..4afc9be723fc2 100644 --- a/src/lightning/fabric/accelerators/cuda.py +++ b/src/lightning/fabric/accelerators/cuda.py @@ -11,18 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os -import warnings -from contextlib import contextmanager from functools import lru_cache -from typing import Generator, List, Optional, Union, cast +from typing import List, Optional, Union import torch from typing_extensions import override from lightning.fabric.accelerators.accelerator import Accelerator from lightning.fabric.accelerators.registry import _AcceleratorRegistry -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.rank_zero import rank_zero_info @@ -144,211 +140,15 @@ def _get_all_visible_cuda_devices() -> List[int]: return list(range(num_cuda_devices())) -# TODO: Remove once minimum supported PyTorch version is 2.0 -@contextmanager -def _patch_cuda_is_available() -> Generator: - """Context manager that safely patches :func:`torch.cuda.is_available` with its NVML-based version if possible.""" - if hasattr(torch._C, "_cuda_getDeviceCount") and _device_count_nvml() >= 0 and not _TORCH_GREATER_EQUAL_2_0: - # we can safely patch is_available if both torch has CUDA compiled and the NVML count is succeeding - # otherwise, patching is_available could lead to attribute errors or infinite recursion - orig_check = torch.cuda.is_available - torch.cuda.is_available = is_cuda_available - try: - yield - finally: - torch.cuda.is_available = orig_check - else: - yield - - -@lru_cache(1) def num_cuda_devices() -> int: - """Returns the number of available CUDA devices. - - Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, - if the platform allows it. 
- - """ - if _TORCH_GREATER_EQUAL_2_0: - return torch.cuda.device_count() - - # Implementation copied from upstream: https://github.com/pytorch/pytorch/pull/84879 - # TODO: Remove once minimum supported PyTorch version is 2.0 - nvml_count = _device_count_nvml() - return torch.cuda.device_count() if nvml_count < 0 else nvml_count + """Returns the number of available CUDA devices.""" + return torch.cuda.device_count() def is_cuda_available() -> bool: - """Returns a bool indicating if CUDA is currently available. - - Unlike :func:`torch.cuda.is_available`, this function does its best not to create a CUDA context for fork support, - if the platform allows it. - - """ + """Returns a bool indicating if CUDA is currently available.""" # We set `PYTORCH_NVML_BASED_CUDA_CHECK=1` in lightning.fabric.__init__.py - return torch.cuda.is_available() if _TORCH_GREATER_EQUAL_2_0 else num_cuda_devices() > 0 - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _parse_visible_devices() -> Union[List[int], List[str]]: - """Parse CUDA_VISIBLE_DEVICES environment variable.""" - var = os.getenv("CUDA_VISIBLE_DEVICES") - if var is None: - return list(range(64)) - - def _strtoul(s: str) -> int: - """Return -1 or positive integer sequence string starts with,""" - if not s: - return -1 - for idx, c in enumerate(s): - if not (c.isdigit() or (idx == 0 and c in "+-")): - break - if idx + 1 == len(s): - idx += 1 - return int(s[:idx]) if idx > 0 else -1 - - def parse_list_with_prefix(lst: str, prefix: str) -> List[str]: - rcs: List[str] = [] - for elem in lst.split(","): - # Repeated id results in empty set - if elem in rcs: - return cast(List[str], []) - # Anything other but prefix is ignored - if not elem.startswith(prefix): - break - rcs.append(elem) - return rcs - - if var.startswith("GPU-"): - return parse_list_with_prefix(var, "GPU-") - if var.startswith("MIG-"): - return parse_list_with_prefix(var, "MIG-") - # CUDA_VISIBLE_DEVICES uses something like strtoul - # which makes `1gpu2,2ampere` is equivalent to `1,2` - rc: List[int] = [] - for elem in var.split(","): - x = _strtoul(elem.strip()) - # Repeated ordinal results in empty set - if x in rc: - return cast(List[int], []) - # Negative value aborts the sequence - if x < 0: - break - rc.append(x) - return rc - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _raw_device_count_nvml() -> int: - """Return number of devices as reported by NVML or negative value if NVML discovery/initialization failed.""" - from ctypes import CDLL, byref, c_int - - nvml_h = CDLL("libnvidia-ml.so.1") - rc = nvml_h.nvmlInit() - if rc != 0: - warnings.warn("Can't initialize NVML") - return -1 - dev_count = c_int(-1) - rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) - if rc != 0: - warnings.warn("Can't get nvml device count") - return -1 - del nvml_h - return dev_count.value - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _raw_device_uuid_nvml() -> Optional[List[str]]: - """Return list of device UUID as reported by NVML or None if NVM discovery/initialization failed.""" - from ctypes import CDLL, byref, c_int, c_void_p, create_string_buffer - - nvml_h = CDLL("libnvidia-ml.so.1") - rc = nvml_h.nvmlInit() - if rc != 0: - warnings.warn("Can't initialize NVML") - return None - dev_count = c_int(-1) - rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count)) - if rc != 0: - warnings.warn("Can't get nvml device count") - return None - uuids: List[str] = [] - for idx in range(dev_count.value): - dev_id = c_void_p() - rc = 
nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id)) - if rc != 0: - warnings.warn("Can't get device handle") - return None - buf_len = 96 - buf = create_string_buffer(buf_len) - rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len) - if rc != 0: - warnings.warn("Can't get device UUID") - return None - uuids.append(buf.raw.decode("ascii").strip("\0")) - del nvml_h - return uuids - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _transform_uuid_to_ordinals(candidates: List[str], uuids: List[str]) -> List[int]: - """Given the set of partial uuids and list of known uuids builds a set of ordinals excluding ambiguous partials - IDs.""" - - def uuid_to_orinal(candidate: str, uuids: List[str]) -> int: - best_match = -1 - for idx, uuid in enumerate(uuids): - if not uuid.startswith(candidate): - continue - # Ambigous candidate - if best_match != -1: - return -1 - best_match = idx - return best_match - - rc: List[int] = [] - for candidate in candidates: - idx = uuid_to_orinal(candidate, uuids) - # First invalid ordinal stops parsing - if idx < 0: - break - # Duplicates result in empty set - if idx in rc: - return cast(List[int], []) - rc.append(idx) - return rc - - -# TODO: Remove once minimum supported PyTorch version is 2.0 -def _device_count_nvml() -> int: - """Return number of devices as reported by NVML taking CUDA_VISIBLE_DEVICES into account. - - Negative value is returned if NVML discovery or initialization has failed. - - """ - visible_devices = _parse_visible_devices() - if not visible_devices: - return 0 - try: - if isinstance(visible_devices[0], str): - # Skip MIG parsing - if visible_devices[0].startswith("MIG-"): - return -1 - uuids = _raw_device_uuid_nvml() - if uuids is None: - return -1 - visible_devices = _transform_uuid_to_ordinals(cast(List[str], visible_devices), uuids) - else: - raw_cnt = _raw_device_count_nvml() - if raw_cnt <= 0: - return raw_cnt - # Trim the list up to a maximum available device - for idx, val in enumerate(visible_devices): - if cast(int, val) >= raw_cnt: - return idx - except (OSError, AttributeError): - return -1 - return len(visible_devices) + return torch.cuda.is_available() def _is_ampere_or_later(device: Optional[torch.device] = None) -> bool: @@ -375,7 +175,7 @@ def _check_cuda_matmul_precision(device: torch.device) -> None: def _clear_cuda_memory() -> None: # strangely, the attribute function be undefined when torch.compile is used - if _TORCH_GREATER_EQUAL_2_0 and hasattr(torch._C, "_cuda_clearCublasWorkspaces"): + if hasattr(torch._C, "_cuda_clearCublasWorkspaces"): # https://github.com/pytorch/pytorch/issues/95668 torch._C._cuda_clearCublasWorkspaces() torch.cuda.empty_cache() diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py index 4b9c14eb06e62..aa67d2e7ce9ac 100644 --- a/src/lightning/fabric/fabric.py +++ b/src/lightning/fabric/fabric.py @@ -51,7 +51,6 @@ FSDPStrategy, SingleDeviceStrategy, Strategy, - XLAFSDPStrategy, XLAStrategy, ) from lightning.fabric.strategies.fsdp import _has_meta_device_parameters @@ -67,7 +66,6 @@ ) from lightning.fabric.utilities.device_dtype_mixin import _update_properties from lightning.fabric.utilities.distributed import DistributedSamplerWrapper, _InfiniteBarrier -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn from lightning.fabric.utilities.registry import _load_external_callbacks from lightning.fabric.utilities.seed import seed_everything @@ -699,26 
+697,14 @@ def sharded_model(self) -> ContextManager: def init_tensor(self) -> ContextManager: """Tensors that you instantiate under this context manager will be created on the device right away and have - the right data type depending on the precision setting in Fabric. - - The automatic device placement under this context manager is only supported with PyTorch 2.0 and newer. - - """ - if not _TORCH_GREATER_EQUAL_2_0 and self.device.type != "cpu": - rank_zero_warn( - "`Fabric.init_tensor()` can't place tensors on the device directly" - " with PyTorch < 2.0. Parameters will remain on CPU until `Fabric.setup()` is called." - " Upgrade to PyTorch >= 2.0 to fully utilize this feature.", - category=PossibleUserWarning, - ) + the right data type depending on the precision setting in Fabric.""" return self._strategy.tensor_init_context() def init_module(self, empty_init: Optional[bool] = None) -> ContextManager: """Instantiate the model and its parameters under this context manager to reduce peak memory usage. The parameters get created on the device and with the right data type right away without wasting memory being - allocated unnecessarily. The automatic device placement under this context manager is only supported with - PyTorch 2.0 and newer. + allocated unnecessarily. Args: empty_init: Whether to initialize the model with empty weights (uninitialized memory). @@ -727,13 +713,6 @@ def init_module(self, empty_init: Optional[bool] = None) -> ContextManager: """ self._validate_launched() - if not _TORCH_GREATER_EQUAL_2_0 and self.device.type != "cpu": - rank_zero_warn( - "`Fabric.init_module()` can't place the model parameters on the device directly" - " with PyTorch < 2.0. Parameters will remain on CPU until `Fabric.setup()` is called." - " Upgrade to PyTorch >= 2.0 to fully utilize this feature.", - category=PossibleUserWarning, - ) return self._strategy.module_init_context(empty_init=empty_init) def save( @@ -1036,12 +1015,6 @@ def _validate_setup(self, module: nn.Module, optimizers: Sequence[Optimizer]) -> if any(isinstance(opt, _FabricOptimizer) for opt in optimizers): raise ValueError("An optimizer should be passed only once to the `setup` method.") - if isinstance(self._strategy, (FSDPStrategy, XLAFSDPStrategy)) and not _TORCH_GREATER_EQUAL_2_0: - raise RuntimeError( - f"The `{type(self).__name__}` requires the model and optimizer(s) to be set up separately." - " Create and set up the model first through `model = self.setup_module(model)`. Then create the" - " optimizer and set it up: `optimizer = self.setup_optimizer(optimizer)`." 
- ) if isinstance(self._strategy, FSDPStrategy) and any( _has_meta_device_parameters(optimizer) for optimizer in optimizers ): diff --git a/src/lightning/fabric/plugins/precision/amp.py b/src/lightning/fabric/plugins/precision/amp.py index 0ec21247c9881..75d7932ddb916 100644 --- a/src/lightning/fabric/plugins/precision/amp.py +++ b/src/lightning/fabric/plugins/precision/amp.py @@ -20,7 +20,6 @@ from torch.optim import LBFGS, Optimizer from typing_extensions import override -from lightning.fabric.accelerators.cuda import _patch_cuda_is_available from lightning.fabric.plugins.precision.precision import Precision from lightning.fabric.plugins.precision.utils import _convert_fp_tensor from lightning.fabric.utilities.types import Optimizable @@ -50,9 +49,7 @@ def __init__( self.precision = precision if scaler is None and self.precision == "16-mixed": - with _patch_cuda_is_available(): - # if possible, we defer CUDA initialization to support strategies that will attempt forks - scaler = torch.cuda.amp.GradScaler() + scaler = torch.cuda.amp.GradScaler() if scaler is not None and self.precision == "bf16-mixed": raise ValueError(f"`precision='bf16-mixed'` does not use a scaler, found {scaler}.") self.device = device diff --git a/src/lightning/fabric/plugins/precision/fsdp.py b/src/lightning/fabric/plugins/precision/fsdp.py index 161ad98f43475..179fc21cdd90d 100644 --- a/src/lightning/fabric/plugins/precision/fsdp.py +++ b/src/lightning/fabric/plugins/precision/fsdp.py @@ -23,7 +23,6 @@ from lightning.fabric.plugins.precision.amp import _optimizer_handles_unscaling from lightning.fabric.plugins.precision.precision import Precision from lightning.fabric.plugins.precision.utils import _convert_fp_tensor, _DtypeContextManager -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types import Optimizable if TYPE_CHECKING: @@ -78,21 +77,18 @@ def __init__(self, precision: _PRECISION_INPUT, scaler: Optional["ShardedGradSca def mixed_precision_config(self) -> "TorchMixedPrecision": from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision - # With PyTorch < 2.0, FSDP uses the noneness of `param_dtype` as a proxy for the `_uses_param_mixed_precision` - # property. In order to avoid FSDP assertion failures, we therefore avoid setting `param_dtype` to - # `torch.float32` here with PyTorch < 2.0. 
if self.precision == "16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float16 elif self.precision == "bf16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.bfloat16 elif self.precision == "16-true": param_dtype = reduce_dtype = buffer_dtype = torch.float16 elif self.precision == "bf16-true": param_dtype = reduce_dtype = buffer_dtype = torch.bfloat16 elif self.precision == "32-true": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float32 else: raise ValueError(f"Was unable to infer precision type, received {self.precision!r}.") diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index ed89629f720e8..30251a9315cd4 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -63,7 +63,6 @@ ) from lightning.fabric.utilities.distributed import group as _group from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1, _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3, @@ -76,14 +75,9 @@ if TYPE_CHECKING: from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision, ShardingStrategy + from torch.distributed.fsdp.wrap import ModuleWrapPolicy - if _TORCH_GREATER_EQUAL_2_0: - from torch.distributed.fsdp.wrap import ModuleWrapPolicy - - _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy] - else: - _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool]] # type: ignore[misc] - + _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy] _SHARDING_STRATEGY = Union[ShardingStrategy, Literal["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD"]] _FSDP_ALIASES = ("fsdp", "fsdp_cpu_offload") @@ -168,9 +162,8 @@ def __init__( self._backward_sync_control = _FSDPBackwardSyncControl() self._fsdp_kwargs = _auto_wrap_policy_kwargs(auto_wrap_policy, kwargs) - if _TORCH_GREATER_EQUAL_2_0: - # Enables joint setup of model and optimizer, multiple optimizer param groups, and `torch.compile()` - self._fsdp_kwargs.setdefault("use_orig_params", True) + # Enables joint setup of model and optimizer, multiple optimizer param groups, and `torch.compile()` + self._fsdp_kwargs.setdefault("use_orig_params", True) self._activation_checkpointing_kwargs = _activation_checkpointing_kwargs( activation_checkpointing, activation_checkpointing_policy @@ -259,12 +252,6 @@ def setup_module_and_optimizers( ) -> Tuple[Module, List[Optimizer]]: """Wraps the model into a :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel` module and sets `use_orig_params=True` to keep the reference to the original parameters in the optimizer.""" - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - f"The `{type(self).__name__}` does not support the joint setup of module and optimizer(s)." - " Please do it in this order: Create the model, call `setup_module`, create the optimizer," - " call `setup_optimizer`." - ) use_orig_params = self._fsdp_kwargs.get("use_orig_params") if use_orig_params is False: raise ValueError( @@ -428,11 +415,6 @@ def save_checkpoint( creates a metadata file `meta.pt` with the rest of the user's state (only saved from rank 0). 
""" - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving and loading checkpoints with the `FSDPStrategy` is not supported in PyTorch < 2.0." - " Please upgrade `torch` or file an issue: `https://github.com/Lightning-AI/lightning/issues`." - ) if storage_options is not None: raise TypeError( "`FSDPStrategy.save_checkpoint(..., storage_options=...)` is not supported because" @@ -530,11 +512,6 @@ def load_checkpoint( directory of multiple files rather than a single file. """ - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving and loading checkpoints with the `FSDPStrategy` is not supported in PyTorch < 2.0." - " Please upgrade `torch` or file an issue: `https://github.com/Lightning-AI/lightning/issues`." - ) if not state: raise ValueError( f"Got FSDPStrategy.load_checkpoint(..., state={state!r}) but a state with at least " @@ -614,16 +591,15 @@ def load_checkpoint( return metadata if _is_full_checkpoint(path): - checkpoint = _lazy_load(path) if _TORCH_GREATER_EQUAL_2_0 else torch.load(path, map_location="cpu") + checkpoint = _lazy_load(path) _load_raw_module_state(checkpoint.pop(module_key), module=module, world_size=self.world_size, strict=strict) if isinstance(state, Module): return {} - if _TORCH_GREATER_EQUAL_2_0: - # Materialize lazy tensors if there are any left in the checkpoint - # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues - checkpoint = _materialize_tensors(checkpoint) + # Materialize lazy tensors if there are any left in the checkpoint + # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues + checkpoint = _materialize_tensors(checkpoint) # Load optimizer states for optim_key, optim in optimizers.items(): @@ -840,27 +816,20 @@ def _get_full_state_dict_context( ) -> Generator[None, None, None]: from torch.distributed.fsdp import FullStateDictConfig, StateDictType from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp.api import FullOptimStateDictConfig - # In PyTorch <= 2.0, offload to CPU in combination with `world_size=1` is not possible + # In PyTorch < 2.1, offload to CPU in combination with `world_size=1` is not possible offload_to_cpu = world_size > 1 or _TORCH_GREATER_EQUAL_2_1 state_dict_config = FullStateDictConfig(offload_to_cpu=offload_to_cpu, rank0_only=rank0_only) - if _TORCH_GREATER_EQUAL_2_0: - from torch.distributed.fsdp.api import FullOptimStateDictConfig + optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=offload_to_cpu, rank0_only=rank0_only) + state_dict_type_context = FSDP.state_dict_type( + module=module, + state_dict_type=StateDictType.FULL_STATE_DICT, + state_dict_config=state_dict_config, + optim_state_dict_config=optim_state_dict_config, + ) - optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=offload_to_cpu, rank0_only=rank0_only) - state_dict_type_context = FSDP.state_dict_type( - module=module, - state_dict_type=StateDictType.FULL_STATE_DICT, - state_dict_config=state_dict_config, - optim_state_dict_config=optim_state_dict_config, - ) - else: - state_dict_type_context = FSDP.state_dict_type( - module=module, - state_dict_type=StateDictType.FULL_STATE_DICT, - state_dict_config=state_dict_config, - ) return state_dict_type_context # type: ignore[return-value] diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py index 1c64f97394fa2..6bfed6a270b68 100644 --- 
a/src/lightning/fabric/strategies/strategy.py +++ b/src/lightning/fabric/strategies/strategy.py @@ -29,7 +29,6 @@ from lightning.fabric.strategies.launchers.launcher import _Launcher from lightning.fabric.strategies.registry import _StrategyRegistry from lightning.fabric.utilities.apply_func import move_data_to_device -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.init import _EmptyInit from lightning.fabric.utilities.types import _PATH, Optimizable, ReduceOp, _Stateful @@ -122,8 +121,7 @@ def tensor_init_context(self) -> ContextManager: """Controls how tensors get created (device, dtype).""" precision_init_ctx = self.precision.tensor_init_context() stack = ExitStack() - if _TORCH_GREATER_EQUAL_2_0: - stack.enter_context(self.root_device) + stack.enter_context(self.root_device) stack.enter_context(precision_init_ctx) return stack @@ -140,8 +138,7 @@ def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManag """ precision_module_ctx = self.precision.module_init_context() stack = ExitStack() - if _TORCH_GREATER_EQUAL_2_0: - stack.enter_context(self.root_device) + stack.enter_context(self.root_device) stack.enter_context(_EmptyInit(enabled=bool(empty_init))) stack.enter_context(precision_module_ctx) return stack diff --git a/src/lightning/fabric/strategies/xla_fsdp.py b/src/lightning/fabric/strategies/xla_fsdp.py index 1b53292ff1581..6da693bafb1c8 100644 --- a/src/lightning/fabric/strategies/xla_fsdp.py +++ b/src/lightning/fabric/strategies/xla_fsdp.py @@ -39,7 +39,6 @@ _validate_keys_for_strict_loading, ) from lightning.fabric.utilities.cloud_io import get_filesystem -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.init import _EmptyInit from lightning.fabric.utilities.rank_zero import rank_zero_only, rank_zero_warn from lightning.fabric.utilities.types import _PATH, Optimizable, ReduceOp @@ -420,11 +419,6 @@ def save_checkpoint( consolidated checkpoint combining all of the sharded checkpoints. """ - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving and loading checkpoints with the `XLAFSDPStrategy` is not supported in PyTorch < 2.0." - " Please upgrade `torch`." - ) # broadcast the path from rank 0 to ensure all the states are saved in a common path path = Path(self.broadcast(path)) if path.is_dir() and any(path.iterdir()): @@ -527,11 +521,6 @@ def load_checkpoint( directory of multiple files rather than a single file. """ - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving and loading checkpoints with the `FSDPStrategy` is not supported in PyTorch < 2.0." - " Please upgrade `torch` or file an issue: `https://github.com/Lightning-AI/lightning/issues`." - ) if not state: raise ValueError( f"Got `XLAFSDPStrategy.load_checkpoint(..., state={state!r})` but a state with at least " diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py index cc069a2a73338..bcfeadf3298ca 100644 --- a/src/lightning/fabric/utilities/imports.py +++ b/src/lightning/fabric/utilities/imports.py @@ -26,11 +26,10 @@ # 2. 
The inspection mode via `python -i`: https://stackoverflow.com/a/6879085/1162383 _IS_INTERACTIVE = hasattr(sys, "ps1") or bool(sys.flags.interactive) -_TORCH_GREATER_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0") _TORCH_GREATER_EQUAL_2_1 = compare_version("torch", operator.ge, "2.1.0") _TORCH_GREATER_EQUAL_2_2 = compare_version("torch", operator.ge, "2.2.0") _TORCH_GREATER_EQUAL_2_3 = compare_version("torch", operator.ge, "2.3.0", use_base_version=True) -_TORCH_EQUAL_2_0 = _TORCH_GREATER_EQUAL_2_0 and not _TORCH_GREATER_EQUAL_2_1 +_TORCH_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0") and not _TORCH_GREATER_EQUAL_2_1 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) diff --git a/src/lightning/fabric/utilities/load.py b/src/lightning/fabric/utilities/load.py index 29ccca9e4375f..9862cc2bd981e 100644 --- a/src/lightning/fabric/utilities/load.py +++ b/src/lightning/fabric/utilities/load.py @@ -25,10 +25,7 @@ from torch.nn import Parameter from typing_extensions import override -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, - _TORCH_GREATER_EQUAL_2_3, -) +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.types import _PATH, _Stateful _METADATA_FILENAME = "meta.pt" @@ -202,8 +199,6 @@ def persistent_load(self, pid: tuple) -> "TypedStorage": def _lazy_load(filename: _PATH) -> Any: - if not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError("Lazy-loading is only supported with PyTorch >= 2.0.") if not os.path.isfile(filename): raise FileNotFoundError(f"Path {str(filename)!r} does not exist or is not a file.") file_reader = torch.PyTorchFileReader(str(filename)) diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py index b9bfd1e269d71..6ab2ff730eec9 100644 --- a/src/lightning/fabric/utilities/testing/_runif.py +++ b/src/lightning/fabric/utilities/testing/_runif.py @@ -24,7 +24,7 @@ from lightning.fabric.accelerators.cuda import num_cuda_devices from lightning.fabric.accelerators.mps import MPSAccelerator from lightning.fabric.strategies.deepspeed import _DEEPSPEED_AVAILABLE -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 def _runif_reasons( @@ -122,7 +122,6 @@ def _runif_reasons( cond = not is_dynamo_supported() else: cond = sys.platform == "win32" or sys.version_info >= (3, 11) - cond |= not _TORCH_GREATER_EQUAL_2_0 if cond: reasons.append("torch.dynamo") diff --git a/src/lightning/fabric/utilities/types.py b/src/lightning/fabric/utilities/types.py index c4bc32f3cf319..2e18dc89b05b2 100644 --- a/src/lightning/fabric/utilities/types.py +++ b/src/lightning/fabric/utilities/types.py @@ -28,10 +28,10 @@ import torch from torch import Tensor -from torch.optim import Optimizer -from typing_extensions import TypeAlias, overload -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 +# TODO: Unused import, but lightning_habana imports these from here +from torch.optim.lr_scheduler import LRScheduler, ReduceLROnPlateau # noqa: F401 +from typing_extensions import TypeAlias, overload UntypedStorage: TypeAlias = torch.UntypedStorage @@ -42,7 +42,6 @@ ] _PARAMETERS = Iterator[torch.nn.Parameter] - if torch.distributed.is_available(): from torch.distributed import 
ProcessGroup, ReduceOp @@ -70,49 +69,6 @@ def size(self) -> int: ... def rank(self) -> int: ... -# Inferred from `torch.optim.lr_scheduler.pyi` -# Missing attributes were added to improve typing -@runtime_checkable -class LRScheduler(_Stateful[str], Protocol): - optimizer: Optimizer - base_lrs: List[float] - - def __init__(self, optimizer: Optimizer, *args: Any, **kwargs: Any) -> None: ... - - def step(self, epoch: Optional[int] = None) -> None: ... - - -_TORCH_LRSCHEDULER: TypeAlias = ( - torch.optim.lr_scheduler.LRScheduler # type: ignore[valid-type] - if _TORCH_GREATER_EQUAL_2_0 - else torch.optim.lr_scheduler._LRScheduler -) - - -# Inferred from `torch.optim.lr_scheduler.pyi` -# Missing attributes were added to improve typing -@runtime_checkable -class ReduceLROnPlateau(_Stateful[str], Protocol): - in_cooldown: bool - optimizer: Optimizer - - def __init__( - self, - optimizer: Optimizer, - mode: str = ..., - factor: float = ..., - patience: int = ..., - verbose: bool = ..., - threshold: float = ..., - threshold_mode: str = ..., - cooldown: int = ..., - min_lr: float = ..., - eps: float = ..., - ) -> None: ... - - def step(self, metrics: Union[float, int, Tensor], epoch: Optional[int] = None) -> None: ... - - @runtime_checkable class Steppable(Protocol): """To structurally type ``optimizer.step()``""" diff --git a/src/lightning/fabric/wrappers.py b/src/lightning/fabric/wrappers.py index 093b355e2c376..f932750e14239 100644 --- a/src/lightning/fabric/wrappers.py +++ b/src/lightning/fabric/wrappers.py @@ -15,7 +15,6 @@ from copy import deepcopy from functools import partial, wraps from typing import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -35,6 +34,7 @@ from lightning_utilities.core.apply_func import apply_to_collection from torch import Tensor from torch import nn as nn +from torch._dynamo import OptimizedModule from torch.nn.modules.module import _IncompatibleKeys from torch.optim import Optimizer from torch.utils.data import DataLoader @@ -45,12 +45,8 @@ from lightning.fabric.utilities import move_data_to_device from lightning.fabric.utilities.data import _set_sampler_epoch from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types import Optimizable -if TYPE_CHECKING: - from torch._dynamo import OptimizedModule - T_destination = TypeVar("T_destination", bound=Dict[str, Any]) _LIGHTNING_MODULE_STEP_METHODS = ("training_step", "validation_step", "test_step", "predict_step") @@ -329,26 +325,17 @@ def _unwrap( return obj types = [_FabricModule, _FabricOptimizer, _FabricDataLoader] - if _TORCH_GREATER_EQUAL_2_0: - from torch._dynamo import OptimizedModule - - types.append(OptimizedModule) + types.append(OptimizedModule) return apply_to_collection(collection, dtype=tuple(types), function=_unwrap) -def _unwrap_compiled(obj: Union[Any, "OptimizedModule"]) -> Tuple[Union[Any, nn.Module], Optional[Dict[str, Any]]]: +def _unwrap_compiled(obj: Union[Any, OptimizedModule]) -> Tuple[Union[Any, nn.Module], Optional[Dict[str, Any]]]: """Removes the :class:`torch._dynamo.OptimizedModule` around the object if it is wrapped. Use this function before instance checks against e.g. :class:`_FabricModule`. 
""" - if not _TORCH_GREATER_EQUAL_2_0: - # obj can't be an `OptimizedModule` anyway - return obj, None - - from torch._dynamo import OptimizedModule - if isinstance(obj, OptimizedModule): if (compile_kwargs := getattr(obj, "_compile_kwargs", None)) is None: raise RuntimeError( @@ -359,10 +346,7 @@ def _unwrap_compiled(obj: Union[Any, "OptimizedModule"]) -> Tuple[Union[Any, nn. return obj, None -def _to_compiled(module: nn.Module, compile_kwargs: Dict[str, Any]) -> "OptimizedModule": - if not _TORCH_GREATER_EQUAL_2_0: - raise RuntimeError("Converting to a compiled module is only supported in PyTorch >= 2.0.0") - +def _to_compiled(module: nn.Module, compile_kwargs: Dict[str, Any]) -> OptimizedModule: return torch.compile(module, **compile_kwargs) # type: ignore[return-value] @@ -414,5 +398,4 @@ def _capture(*args: Any, **kwargs: Any) -> Any: return _capture -if _TORCH_GREATER_EQUAL_2_0: - torch.compile = _capture_compile_kwargs(torch.compile) +torch.compile = _capture_compile_kwargs(torch.compile) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 3838a6258b052..e4ae5a29c336c 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -38,7 +38,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the Bagua integration (`Trainer(strategy="bagua")`) ([#19445](https://github.com/Lightning-AI/lightning/pull/19445)) -- +- Removed support for PyTorch 1.13 ([#19706](https://github.com/Lightning-AI/lightning/pull/19706)) - diff --git a/src/lightning/pytorch/callbacks/stochastic_weight_avg.py b/src/lightning/pytorch/callbacks/stochastic_weight_avg.py index 731f161683102..c3d5cf4496fe5 100644 --- a/src/lightning/pytorch/callbacks/stochastic_weight_avg.py +++ b/src/lightning/pytorch/callbacks/stochastic_weight_avg.py @@ -21,11 +21,11 @@ import torch from torch import Tensor, nn +from torch.optim.lr_scheduler import LRScheduler from torch.optim.swa_utils import SWALR from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.utilities.types import LRScheduler from lightning.pytorch.callbacks.callback import Callback from lightning.pytorch.strategies import DeepSpeedStrategy from lightning.pytorch.strategies.fsdp import FSDPStrategy diff --git a/src/lightning/pytorch/cli.py b/src/lightning/pytorch/cli.py index a6854b9bf6d89..09f025b988089 100644 --- a/src/lightning/pytorch/cli.py +++ b/src/lightning/pytorch/cli.py @@ -23,11 +23,11 @@ from lightning_utilities.core.imports import RequirementCache from lightning_utilities.core.rank_zero import _warn from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler from typing_extensions import override import lightning.pytorch as pl from lightning.fabric.utilities.cloud_io import get_filesystem -from lightning.fabric.utilities.types import _TORCH_LRSCHEDULER from lightning.pytorch import Callback, LightningDataModule, LightningModule, Trainer, seed_everything from lightning.pytorch.core.mixins.hparams_mixin import _given_hyperparameters_context from lightning.pytorch.utilities.exceptions import MisconfigurationException @@ -63,15 +63,15 @@ def __init__(self, optimizer: Optimizer, monitor: str, *args: Any, **kwargs: Any # LightningCLI requires the ReduceLROnPlateau defined here, thus it shouldn't accept the one from pytorch: -LRSchedulerTypeTuple = (_TORCH_LRSCHEDULER, ReduceLROnPlateau) -LRSchedulerTypeUnion = Union[_TORCH_LRSCHEDULER, ReduceLROnPlateau] -LRSchedulerType = 
Union[Type[_TORCH_LRSCHEDULER], Type[ReduceLROnPlateau]] +LRSchedulerTypeTuple = (LRScheduler, ReduceLROnPlateau) +LRSchedulerTypeUnion = Union[LRScheduler, ReduceLROnPlateau] +LRSchedulerType = Union[Type[LRScheduler], Type[ReduceLROnPlateau]] # Type aliases intended for convenience of CLI developers ArgsType = Optional[Union[List[str], Dict[str, Any], Namespace]] OptimizerCallable = Callable[[Iterable], Optimizer] -LRSchedulerCallable = Callable[[Optimizer], Union[_TORCH_LRSCHEDULER, ReduceLROnPlateau]] +LRSchedulerCallable = Callable[[Optimizer], Union[LRScheduler, ReduceLROnPlateau]] class LightningArgumentParser(ArgumentParser): diff --git a/src/lightning/pytorch/core/hooks.py b/src/lightning/pytorch/core/hooks.py index 4a4cad3d5f080..5495a0262036d 100644 --- a/src/lightning/pytorch/core/hooks.py +++ b/src/lightning/pytorch/core/hooks.py @@ -19,7 +19,6 @@ from torch import Tensor from torch.optim.optimizer import Optimizer -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch.utilities import move_data_to_device from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.types import EVAL_DATALOADERS, STEP_OUTPUT, TRAIN_DATALOADERS @@ -158,8 +157,7 @@ def on_predict_batch_end(self, outputs: Optional[Any], batch: Any, batch_idx: in def on_validation_model_zero_grad(self) -> None: """Called by the training loop to release gradients before entering the validation loop.""" - zero_grad_kwargs = {} if _TORCH_GREATER_EQUAL_2_0 else {"set_to_none": True} - self.zero_grad(**zero_grad_kwargs) + self.zero_grad() def on_validation_model_eval(self) -> None: """Called when the validation loop starts. diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py index faeda00ce5aa9..3cb55566fb8b7 100644 --- a/src/lightning/pytorch/core/module.py +++ b/src/lightning/pytorch/core/module.py @@ -50,7 +50,7 @@ from lightning.fabric.utilities.apply_func import convert_to_tensors from lightning.fabric.utilities.cloud_io import get_filesystem from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin -from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1 +from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_1 from lightning.fabric.utilities.types import _MAP_LOCATION_TYPE, _PATH from lightning.fabric.wrappers import _FabricOptimizer from lightning.pytorch.callbacks.callback import Callback @@ -217,9 +217,6 @@ def trainer(self, trainer: Optional["pl.Trainer"]) -> None: for v in self.children(): if isinstance(v, LightningModule): v.trainer = trainer # type: ignore[assignment] - # https://github.com/pytorch/pytorch/issues/95857 - if not _TORCH_GREATER_EQUAL_2_0 and trainer is not None and not isinstance(trainer, weakref.ProxyTypes): - trainer = weakref.proxy(trainer) self._trainer = trainer @property @@ -1377,7 +1374,7 @@ def forward(self, x): model.to_onnx("export.onnx", input_sample, export_params=True) """ - if _TORCH_GREATER_EQUAL_2_0 and not _ONNX_AVAILABLE: + if not _ONNX_AVAILABLE: raise ModuleNotFoundError( f"`torch>=2.0` requires `onnx` to be installed to use `{type(self).__name__}.to_onnx()`" ) diff --git a/src/lightning/pytorch/core/optimizer.py b/src/lightning/pytorch/core/optimizer.py index b7a63a8e17cab..777dca0b51dfe 100644 --- a/src/lightning/pytorch/core/optimizer.py +++ b/src/lightning/pytorch/core/optimizer.py @@ -19,10 +19,11 @@ import torch from torch import optim 
from torch.optim import Optimizer +from torch.optim.lr_scheduler import ReduceLROnPlateau from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.utilities.types import Optimizable, ReduceLROnPlateau, _Stateful +from lightning.fabric.utilities.types import Optimizable, _Stateful from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.model_helpers import is_overridden from lightning.pytorch.utilities.rank_zero import rank_zero_warn diff --git a/src/lightning/pytorch/demos/boring_classes.py b/src/lightning/pytorch/demos/boring_classes.py index 3dd7bd8b1afc8..fd2660228146e 100644 --- a/src/lightning/pytorch/demos/boring_classes.py +++ b/src/lightning/pytorch/demos/boring_classes.py @@ -18,9 +18,9 @@ import torch.nn.functional as F from torch import Tensor from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import DataLoader, Dataset, IterableDataset, Subset -from lightning.fabric.utilities.types import _TORCH_LRSCHEDULER from lightning.pytorch import LightningDataModule, LightningModule from lightning.pytorch.core.optimizer import LightningOptimizer from lightning.pytorch.utilities.types import STEP_OUTPUT @@ -134,7 +134,7 @@ def validation_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT: def test_step(self, batch: Any, batch_idx: int) -> STEP_OUTPUT: return {"y": self.step(batch)} - def configure_optimizers(self) -> Tuple[List[torch.optim.Optimizer], List[_TORCH_LRSCHEDULER]]: + def configure_optimizers(self) -> Tuple[List[torch.optim.Optimizer], List[LRScheduler]]: optimizer = torch.optim.SGD(self.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) return [optimizer], [lr_scheduler] diff --git a/src/lightning/pytorch/plugins/precision/amp.py b/src/lightning/pytorch/plugins/precision/amp.py index 70ce9a87fb37a..c0a309f070ef6 100644 --- a/src/lightning/pytorch/plugins/precision/amp.py +++ b/src/lightning/pytorch/plugins/precision/amp.py @@ -18,7 +18,6 @@ from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.accelerators.cuda import _patch_cuda_is_available from lightning.fabric.plugins.precision.amp import _optimizer_handles_unscaling from lightning.fabric.utilities.types import Optimizable from lightning.pytorch.plugins.precision.precision import Precision @@ -50,9 +49,7 @@ def __init__( self.precision = precision if scaler is None and self.precision == "16-mixed": - with _patch_cuda_is_available(): - # if possible, we defer CUDA initialization to support strategies that will attempt forks - scaler = torch.cuda.amp.GradScaler() + scaler = torch.cuda.amp.GradScaler() if scaler is not None and self.precision == "bf16-mixed": raise MisconfigurationException(f"`precision='bf16-mixed'` does not use a scaler, found {scaler}.") self.device = device diff --git a/src/lightning/pytorch/plugins/precision/fsdp.py b/src/lightning/pytorch/plugins/precision/fsdp.py index c41199adb480e..e6c684967ed40 100644 --- a/src/lightning/pytorch/plugins/precision/fsdp.py +++ b/src/lightning/pytorch/plugins/precision/fsdp.py @@ -22,7 +22,6 @@ from lightning.fabric.plugins.precision.amp import _optimizer_handles_unscaling from lightning.fabric.plugins.precision.fsdp import _PRECISION_INPUT from lightning.fabric.plugins.precision.utils import _convert_fp_tensor, _DtypeContextManager -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types 
import Optimizable from lightning.pytorch.plugins.precision.precision import Precision from lightning.pytorch.utilities.exceptions import MisconfigurationException @@ -87,21 +86,18 @@ def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: def mixed_precision_config(self) -> "TorchMixedPrecision": from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision as TorchMixedPrecision - # With PyTorch < 2.0, FSDP uses the noneness of `param_dtype` as a proxy for the `_uses_param_mixed_precision` - # property. In order to avoid FSDP assertion failures, we therefore avoid setting `param_dtype` to - # `torch.float32` here with PyTorch < 2.0. if self.precision == "16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float16 elif self.precision == "bf16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.bfloat16 elif self.precision == "16-true": param_dtype = reduce_dtype = buffer_dtype = torch.float16 elif self.precision == "bf16-true": param_dtype = reduce_dtype = buffer_dtype = torch.bfloat16 elif self.precision == "32-true": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float32 else: raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py index 2cc099be39d26..6be3d3f8ba590 100644 --- a/src/lightning/pytorch/strategies/deepspeed.py +++ b/src/lightning/pytorch/strategies/deepspeed.py @@ -24,6 +24,7 @@ import torch from torch.nn import Module from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler, ReduceLROnPlateau from typing_extensions import override import lightning.pytorch as pl @@ -37,7 +38,7 @@ ) from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed -from lightning.fabric.utilities.types import _PATH, LRScheduler, ReduceLROnPlateau +from lightning.fabric.utilities.types import _PATH from lightning.pytorch.accelerators.cuda import CUDAAccelerator from lightning.pytorch.core.optimizer import _init_optimizers_and_lr_schedulers from lightning.pytorch.plugins.precision import Precision diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 4c6f1fec5fe15..657fb438c1e8a 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -54,10 +54,7 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, - _TORCH_GREATER_EQUAL_2_1, -) +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 from lightning.fabric.utilities.init import _EmptyInit from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -75,14 +72,9 @@ if TYPE_CHECKING: from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision, ShardingStrategy + from torch.distributed.fsdp.wrap import ModuleWrapPolicy - if _TORCH_GREATER_EQUAL_2_0: - from torch.distributed.fsdp.wrap import ModuleWrapPolicy - - _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, 
int], bool], ModuleWrapPolicy] - else: - _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool]] # type: ignore[misc] - + _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy] _SHARDING_STRATEGY = Union[ShardingStrategy, Literal["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD"]] @@ -175,20 +167,13 @@ def __init__( self.kwargs = _auto_wrap_policy_kwargs(auto_wrap_policy, kwargs) self.sharding_strategy = _init_sharding_strategy(sharding_strategy, self.kwargs) - if _TORCH_GREATER_EQUAL_2_0: - # Avoids the need for user to reference params in `configure_optimizers` via - # `self.trainer.model.parameters()` and enables support for multiple parameter groups. - self.kwargs.setdefault("use_orig_params", True) + # Avoids the need for user to reference params in `configure_optimizers` via + # `self.trainer.model.parameters()` and enables support for multiple parameter groups. + self.kwargs.setdefault("use_orig_params", True) self._activation_checkpointing_kwargs = _activation_checkpointing_kwargs( activation_checkpointing, activation_checkpointing_policy ) - - if state_dict_type == "sharded" and not _TORCH_GREATER_EQUAL_2_0: - raise NotImplementedError( - "Saving checkpoints with `FSDPStrategy(state_dict_type='sharded')` is not supported in PyTorch < 2.0." - " Please upgrade `torch`." - ) self._state_dict_type = state_dict_type @property @@ -517,10 +502,6 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr @override def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]: - if not _TORCH_GREATER_EQUAL_2_0: - rank_zero_warn("FSDP in Lightning with PyTorch < 2.0 does not support saving the optimizer state.") - return {} - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp import OptimStateKeyType @@ -629,7 +610,7 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: return metadata if _is_full_checkpoint(path): - checkpoint = _lazy_load(path) if _TORCH_GREATER_EQUAL_2_0 else torch.load(path, map_location="cpu") + checkpoint = _lazy_load(path) _load_raw_module_state( checkpoint.pop("state_dict"), module=self.model, @@ -637,10 +618,9 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: strict=self.lightning_module.strict_loading, ) - if _TORCH_GREATER_EQUAL_2_0: - # Materialize lazy tensors if there are any left in the checkpoint - # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues - checkpoint = _materialize_tensors(checkpoint) + # Materialize lazy tensors if there are any left in the checkpoint + # The `torch.Optimizer.load_state_dict` method can't load lazy tensors because of deepcopy pickle issues + checkpoint = _materialize_tensors(checkpoint) from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp import OptimStateKeyType @@ -649,9 +629,6 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: if optimizer_states is None or self.lightning_module.trainer.state.fn != TrainerFn.FITTING: # If the optimizer states are not present, we don't need to do anything (backward compatibility) return checkpoint - if not _TORCH_GREATER_EQUAL_2_0: - rank_zero_warn("FSDP in Lightning with PyTorch < 2.0 does not support loading the optimizer state.") - return checkpoint if len(self.optimizers) != len(optimizer_states): raise RuntimeError( f"You have configured {len(self.optimizers)} optimizers but the checkpoint contains" 
diff --git a/src/lightning/pytorch/strategies/strategy.py b/src/lightning/pytorch/strategies/strategy.py index f2acd8ac98eba..9534822939f66 100644 --- a/src/lightning/pytorch/strategies/strategy.py +++ b/src/lightning/pytorch/strategies/strategy.py @@ -13,7 +13,7 @@ # limitations under the License. import logging from abc import ABC, abstractmethod -from contextlib import contextmanager, nullcontext +from contextlib import contextmanager from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Tuple, TypeVar, Union import torch @@ -26,7 +26,6 @@ from lightning.fabric.strategies import _StrategyRegistry from lightning.fabric.utilities import move_data_to_device from lightning.fabric.utilities.distributed import ReduceOp -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.init import _EmptyInit from lightning.fabric.utilities.optimizer import _optimizer_to_device, _optimizers_to_device from lightning.fabric.utilities.types import _PATH @@ -509,9 +508,8 @@ def tensor_init_context(self, empty_init: Optional[bool] = None) -> Generator[No If ``None``, the strategy will decide. Some strategies may not support all options. """ - device_context = self.root_device if _TORCH_GREATER_EQUAL_2_0 else nullcontext() empty_init_context = _EmptyInit(enabled=bool(empty_init)) - with empty_init_context, device_context, self.precision_plugin.tensor_init_context(): + with empty_init_context, self.root_device, self.precision_plugin.tensor_init_context(): yield @contextmanager diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py index 7e0ef433031bd..d7320c2c2e251 100644 --- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py +++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py @@ -24,7 +24,7 @@ from lightning.fabric.utilities import move_data_to_device from lightning.fabric.utilities.apply_func import convert_tensors_to_scalars from lightning.fabric.utilities.distributed import _distributed_is_initialized -from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_0 +from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0 from lightning.pytorch.utilities.data import extract_batch_size from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_1_0_0 @@ -305,9 +305,7 @@ def __repr__(self) -> str: @override def to(self, *args: Any, **kwargs: Any) -> "_ResultMetric": - d = self.__dict__ - if _TORCH_GREATER_EQUAL_2_0: # https://github.com/pytorch/pytorch/issues/96198 - d = dict(d) + d = dict(self.__dict__) self.__dict__.update(apply_to_collection(d, (Tensor, Metric), move_data_to_device, *args, **kwargs)) return self diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py index 6436fc54b7bed..bf7d47a880da3 100644 --- a/src/lightning/pytorch/trainer/trainer.py +++ b/src/lightning/pytorch/trainer/trainer.py @@ -35,7 +35,6 @@ import lightning.pytorch as pl from lightning.fabric.utilities.apply_func import convert_tensors_to_scalars from lightning.fabric.utilities.cloud_io import _is_local_file_protocol -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types import _PATH from lightning.pytorch.accelerators import Accelerator from lightning.pytorch.callbacks import Callback, Checkpoint, 
EarlyStopping, ProgressBar @@ -1018,9 +1017,7 @@ def _teardown(self) -> None: def _run_stage(self) -> Optional[Union[_PREDICT_OUTPUT, _EVALUATE_OUTPUT]]: # wait for all to join if on distributed self.strategy.barrier("run-stage") - - zero_grad_kwargs = {} if _TORCH_GREATER_EQUAL_2_0 else {"set_to_none": True} - self.lightning_module.zero_grad(**zero_grad_kwargs) + self.lightning_module.zero_grad() if self.evaluating: return self._evaluation_loop.run() @@ -1084,8 +1081,7 @@ def init_module(self, empty_init: Optional[bool] = None) -> Generator: the right data type depending on the precision setting in the Trainer. The parameters and tensors get created on the device and with the right data type right away without wasting - memory being allocated unnecessarily. The automatic device placement under this context manager is only - supported with PyTorch 2.0 and newer. + memory being allocated unnecessarily. Args: empty_init: Whether to initialize the model with empty weights (uninitialized memory). @@ -1093,13 +1089,6 @@ def init_module(self, empty_init: Optional[bool] = None) -> Generator: Set this to ``True`` if you are loading a checkpoint into a large model. """ - if not _TORCH_GREATER_EQUAL_2_0 and self.strategy.root_device.type != "cpu": - rank_zero_warn( - "`Trainer.init_module()` can't place tensors on the device directly" - " with PyTorch < 2.0. Parameters will remain on CPU until the trainer starts." - " Upgrade to PyTorch >= 2.0 to fully utilize this feature.", - category=PossibleUserWarning, - ) if is_overridden("model_sharded_context", self.strategy, parent=Strategy): # warning instead of error so that code changes are not required when changing strategies # this is a limitation because processes are not expected to have been launched when this is called diff --git a/src/lightning/pytorch/tuner/lr_finder.py b/src/lightning/pytorch/tuner/lr_finder.py index f39788b8ea290..8eebd3cd7f974 100644 --- a/src/lightning/pytorch/tuner/lr_finder.py +++ b/src/lightning/pytorch/tuner/lr_finder.py @@ -16,19 +16,19 @@ import os import uuid from copy import deepcopy -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import torch from lightning_utilities.core.imports import RequirementCache +from torch.optim.lr_scheduler import LRScheduler from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.utilities.types import _TORCH_LRSCHEDULER from lightning.pytorch.callbacks import Callback from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.parsing import lightning_hasattr, lightning_setattr from lightning.pytorch.utilities.rank_zero import rank_zero_warn -from lightning.pytorch.utilities.types import STEP_OUTPUT, LRScheduler, LRSchedulerConfig +from lightning.pytorch.utilities.types import STEP_OUTPUT, LRSchedulerConfig # check if ipywidgets is installed before importing tqdm.auto # to ensure it won't fail and a progress bar is displayed @@ -127,7 +127,6 @@ def _exchange_scheduler(self, trainer: "pl.Trainer") -> None: args = (optimizer, self.lr_max, self.num_training) scheduler = _LinearLR(*args) if self.mode == "linear" else _ExponentialLR(*args) - scheduler = cast(LRScheduler, scheduler) trainer.strategy.optimizers = [optimizer] trainer.strategy.lr_scheduler_configs = [LRSchedulerConfig(scheduler, interval="step")] @@ -439,7 +438,7 @@ def on_train_batch_end( self.losses.append(smoothed_loss) -class 
_LinearLR(_TORCH_LRSCHEDULER): +class _LinearLR(LRScheduler): """Linearly increases the learning rate between two boundaries over a number of iterations. Args: @@ -459,9 +458,8 @@ def __init__(self, optimizer: torch.optim.Optimizer, end_lr: float, num_iter: in self.num_iter = num_iter super().__init__(optimizer, last_epoch) - # mypy can't follow the _TORCH_LRSCHEDULER TypeAlias, so ignore "no base method" error - @override # type: ignore[misc] - def get_lr(self) -> List[float]: + @override + def get_lr(self) -> List[float]: # type: ignore[override] curr_iter = self.last_epoch + 1 r = curr_iter / self.num_iter @@ -477,7 +475,7 @@ def lr(self) -> Union[float, List[float]]: return self._lr -class _ExponentialLR(_TORCH_LRSCHEDULER): +class _ExponentialLR(LRScheduler): """Exponentially increases the learning rate between two boundaries over a number of iterations. Arguments: @@ -497,9 +495,8 @@ def __init__(self, optimizer: torch.optim.Optimizer, end_lr: float, num_iter: in self.num_iter = num_iter super().__init__(optimizer, last_epoch) - # mypy can't follow the _TORCH_LRSCHEDULER TypeAlias, so ignore "no base method" error - @override # type: ignore[misc] - def get_lr(self) -> List[float]: + @override + def get_lr(self) -> List[float]: # type: ignore[override] curr_iter = self.last_epoch + 1 r = curr_iter / self.num_iter diff --git a/src/lightning/pytorch/utilities/compile.py b/src/lightning/pytorch/utilities/compile.py index a77ed553d418e..7c5a8067740a4 100644 --- a/src/lightning/pytorch/utilities/compile.py +++ b/src/lightning/pytorch/utilities/compile.py @@ -14,14 +14,15 @@ from typing import Union import torch +from torch._dynamo import OptimizedModule import lightning.pytorch as pl -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_1 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 from lightning.pytorch.strategies import DDPStrategy, DeepSpeedStrategy, FSDPStrategy, SingleDeviceStrategy, Strategy from lightning.pytorch.utilities.model_helpers import _check_mixed_imports -def from_compiled(model: "torch._dynamo.OptimizedModule") -> "pl.LightningModule": +def from_compiled(model: OptimizedModule) -> "pl.LightningModule": """Returns an instance LightningModule from the output of ``torch.compile``. .. warning:: This is an :ref:`experimental ` feature. @@ -33,11 +34,6 @@ def from_compiled(model: "torch._dynamo.OptimizedModule") -> "pl.LightningModule Use this method to obtain a LightningModule that still runs with all the optimizations from ``torch.compile``. """ - if not _TORCH_GREATER_EQUAL_2_0: - raise ModuleNotFoundError("`from_compiled` requires torch>=2.0") - - from torch._dynamo import OptimizedModule - if not isinstance(model, OptimizedModule): raise ValueError(f"`model` is required to be a `OptimizedModule`. Found a `{type(model).__name__}` instead.") @@ -82,11 +78,6 @@ def to_uncompiled(model: Union["pl.LightningModule", "torch._dynamo.OptimizedMod Note: this method will in-place modify the ``LightningModule`` that is passed in. 
""" - if not _TORCH_GREATER_EQUAL_2_0: - raise ModuleNotFoundError("`to_uncompiled` requires torch>=2.0") - - from torch._dynamo import OptimizedModule - if isinstance(model, OptimizedModule): original = model._orig_mod if not isinstance(original, pl.LightningModule): @@ -117,13 +108,6 @@ def to_uncompiled(model: Union["pl.LightningModule", "torch._dynamo.OptimizedMod def _maybe_unwrap_optimized(model: object) -> "pl.LightningModule": - if not _TORCH_GREATER_EQUAL_2_0: - if not isinstance(model, pl.LightningModule): - _check_mixed_imports(model) - raise TypeError(f"`model` must be a `LightningModule`, got `{type(model).__qualname__}`") - return model - from torch._dynamo import OptimizedModule - if isinstance(model, OptimizedModule): return from_compiled(model) if isinstance(model, pl.LightningModule): diff --git a/src/lightning/pytorch/utilities/model_summary/model_summary.py b/src/lightning/pytorch/utilities/model_summary/model_summary.py index ef2827d3b7eed..806724e1c434a 100644 --- a/src/lightning/pytorch/utilities/model_summary/model_summary.py +++ b/src/lightning/pytorch/utilities/model_summary/model_summary.py @@ -25,7 +25,6 @@ from torch.utils.hooks import RemovableHandle import lightning.pytorch as pl -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch.utilities.model_helpers import _ModuleMode from lightning.pytorch.utilities.rank_zero import WarningCache @@ -107,10 +106,7 @@ def hook_with_kwargs(_: nn.Module, args: Any, kwargs: Any, out: Any) -> None: handle = None if not isinstance(self._module, torch.jit.ScriptModule): - if _TORCH_GREATER_EQUAL_2_0: - handle = self._module.register_forward_hook(hook_with_kwargs, with_kwargs=True) - else: - handle = self._module.register_forward_hook(hook) + handle = self._module.register_forward_hook(hook_with_kwargs, with_kwargs=True) return handle diff --git a/src/lightning/pytorch/utilities/testing/_runif.py b/src/lightning/pytorch/utilities/testing/_runif.py index c3e0262d9906f..03b3afd61b875 100644 --- a/src/lightning/pytorch/utilities/testing/_runif.py +++ b/src/lightning/pytorch/utilities/testing/_runif.py @@ -15,7 +15,6 @@ from lightning_utilities.core.imports import RequirementCache -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.testing import _runif_reasons as fabric_run_if from lightning.pytorch.accelerators.cpu import _PSUTIL_AVAILABLE from lightning.pytorch.callbacks.progress.rich_progress import _RICH_AVAILABLE @@ -94,7 +93,7 @@ def _runif_reasons( if sklearn and not _SKLEARN_AVAILABLE: reasons.append("scikit-learn") - if onnx and _TORCH_GREATER_EQUAL_2_0 and not _ONNX_AVAILABLE: + if onnx and not _ONNX_AVAILABLE: reasons.append("onnx") return reasons, kwargs diff --git a/src/lightning/pytorch/utilities/types.py b/src/lightning/pytorch/utilities/types.py index 203df53f22ba7..bc75e0f50aeb0 100644 --- a/src/lightning/pytorch/utilities/types.py +++ b/src/lightning/pytorch/utilities/types.py @@ -38,10 +38,11 @@ import torch from torch import Tensor from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler, ReduceLROnPlateau from torchmetrics import Metric from typing_extensions import NotRequired, Required -from lightning.fabric.utilities.types import _TORCH_LRSCHEDULER, LRScheduler, ProcessGroup, ReduceLROnPlateau +from lightning.fabric.utilities.types import ProcessGroup _NUMBER = Union[int, float] _METRIC = Union[Metric, Tensor, _NUMBER] @@ -76,15 +77,15 @@ def no_sync(self) -> Generator: ... 
# todo: improve LRSchedulerType naming/typing -LRSchedulerTypeTuple = (_TORCH_LRSCHEDULER, torch.optim.lr_scheduler.ReduceLROnPlateau) -LRSchedulerTypeUnion = Union[_TORCH_LRSCHEDULER, torch.optim.lr_scheduler.ReduceLROnPlateau] -LRSchedulerType = Union[Type[_TORCH_LRSCHEDULER], Type[torch.optim.lr_scheduler.ReduceLROnPlateau]] +LRSchedulerTypeTuple = (LRScheduler, ReduceLROnPlateau) +LRSchedulerTypeUnion = Union[LRScheduler, ReduceLROnPlateau] +LRSchedulerType = Union[Type[LRScheduler], Type[ReduceLROnPlateau]] LRSchedulerPLType = Union[LRScheduler, ReduceLROnPlateau] @dataclass class LRSchedulerConfig: - scheduler: Union[_TORCH_LRSCHEDULER, ReduceLROnPlateau] + scheduler: Union[LRScheduler, ReduceLROnPlateau] # no custom name name: Optional[str] = None # after epoch is over diff --git a/tests/tests_fabric/accelerators/test_cuda.py b/tests/tests_fabric/accelerators/test_cuda.py index 4b2265670b8bf..e323ada908cd1 100644 --- a/tests/tests_fabric/accelerators/test_cuda.py +++ b/tests/tests_fabric/accelerators/test_cuda.py @@ -25,8 +25,6 @@ CUDAAccelerator, _check_cuda_matmul_precision, find_usable_cuda_devices, - is_cuda_available, - num_cuda_devices, ) from tests_fabric.helpers.runif import RunIf @@ -67,18 +65,6 @@ def test_set_cuda_device(_, set_device_mock): set_device_mock.assert_called_once_with(device) -@mock.patch("lightning.fabric.accelerators.cuda._device_count_nvml", return_value=-1) -@mock.patch("torch.cuda.is_available", return_value=True) -@mock.patch("torch.cuda.device_count", return_value=100) -def test_num_cuda_devices_without_nvml(*_): - """Test that if NVML can't be loaded, our helper functions fall back to the default implementation for determining - CUDA availability.""" - num_cuda_devices.cache_clear() - assert is_cuda_available() - assert num_cuda_devices() == 100 - num_cuda_devices.cache_clear() - - @mock.patch.dict(os.environ, {}, clear=True) def test_force_nvml_based_cuda_check(): """Test that we force PyTorch to use the NVML-based CUDA checks.""" diff --git a/tests/tests_fabric/plugins/precision/test_fsdp.py b/tests/tests_fabric/plugins/precision/test_fsdp.py index 74c1034518c39..148292dcd48df 100644 --- a/tests/tests_fabric/plugins/precision/test_fsdp.py +++ b/tests/tests_fabric/plugins/precision/test_fsdp.py @@ -26,25 +26,9 @@ [ ("16-true", (torch.float16, torch.float16, torch.float16)), ("bf16-true", (torch.bfloat16, torch.bfloat16, torch.bfloat16)), - pytest.param( - "16-mixed", (torch.float32, torch.float16, torch.float16), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0" - ), - pytest.param( - "16-mixed", (None, torch.float16, torch.float16), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0" - ), - pytest.param( - "bf16-mixed", - (torch.float32, torch.bfloat16, torch.bfloat16), - marks=RunIf(min_torch="2.0"), - id="bf16-mixed-ge2_0", - ), - pytest.param( - "bf16-mixed", (None, torch.bfloat16, torch.bfloat16), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0" - ), - pytest.param( - "32-true", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="32-true-ge2_0" - ), - pytest.param("32-true", (None, torch.float32, torch.float32), marks=RunIf(max_torch="2.0"), id="32-true-lt2_0"), + ("16-mixed", (torch.float32, torch.float16, torch.float16)), + ("bf16-mixed", (torch.float32, torch.bfloat16, torch.bfloat16)), + ("32-true", (torch.float32, torch.float32, torch.float32)), ], ) def test_fsdp_precision_config(precision, expected): diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py index 
beea7eccb69c2..56d9875dfefed 100644 --- a/tests/tests_fabric/strategies/test_ddp.py +++ b/tests/tests_fabric/strategies/test_ddp.py @@ -23,7 +23,6 @@ from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies import DDPStrategy from lightning.fabric.strategies.ddp import _DDPBackwardSyncControl -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from torch.nn.parallel import DistributedDataParallel from tests_fabric.helpers.runif import RunIf @@ -128,7 +127,7 @@ def __instancecheck__(self, instance): def test_module_init_context(precision, expected_dtype): """Test that the module under the init-context gets moved to the right device and dtype.""" parallel_devices = [torch.device("cuda", 0), torch.device("cuda", 1)] - expected_device = parallel_devices[1] if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") + expected_device = parallel_devices[1] strategy = DDPStrategy( parallel_devices=parallel_devices, precision=precision, cluster_environment=LightningEnvironment() diff --git a/tests/tests_fabric/strategies/test_ddp_integration.py b/tests/tests_fabric/strategies/test_ddp_integration.py index 65eaacde2ff2c..6f003748b9cce 100644 --- a/tests/tests_fabric/strategies/test_ddp_integration.py +++ b/tests/tests_fabric/strategies/test_ddp_integration.py @@ -19,7 +19,7 @@ import pytest import torch from lightning.fabric import Fabric -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 +from torch._dynamo import OptimizedModule from torch.nn.parallel.distributed import DistributedDataParallel from tests_fabric.helpers.runif import RunIf @@ -71,15 +71,10 @@ def assert_params_equal(params0, params1): @RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.1.0", dynamo=True) -@mock.patch( - "lightning.fabric.wrappers.torch.compile", - Mock(wraps=(torch.compile if _TORCH_GREATER_EQUAL_2_0 else None)), -) +@mock.patch("lightning.fabric.wrappers.torch.compile", Mock(wraps=torch.compile)) @mock.patch.dict(os.environ, {}) def test_reapply_compile(): """Test that Fabric can rewrap a compiled module such that compilation happens over the DDP-wrapper.""" - from torch._dynamo import OptimizedModule - fabric = Fabric(accelerator="cuda", devices=2, strategy="ddp") fabric.launch() diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index 3f2d02e06be2a..5eeed7065fb5f 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -33,8 +33,6 @@ from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, FullyShardedDataParallel, MixedPrecision from torch.optim import Adam -from tests_fabric.helpers.runif import RunIf - def test_fsdp_custom_mixed_precision(): """Test that passing a custom mixed precision config works.""" @@ -74,7 +72,6 @@ def test_fsdp_sharding_strategy(): assert strategy.sharding_strategy == ShardingStrategy.NO_SHARD -@RunIf(min_torch="2.0") @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"]) def test_fsdp_hybrid_shard_configuration(sharding_strategy): """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg.""" @@ -108,22 +105,6 @@ def test_fsdp_checkpoint_io_unsupported(): strategy.checkpoint_io = Mock() -@pytest.mark.parametrize("torch_ge_2_0", [False, True]) -def test_fsdp_setup_optimizer_validation(torch_ge_2_0): - """Test that `setup_optimizer()` validates the param groups and reference to FSDP parameters.""" - 
module = nn.Linear(2, 2) - with mock.patch("lightning.fabric.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", torch_ge_2_0): - strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")]) - bad_optimizer = Adam(module.parameters()) - - if torch_ge_2_0: - strategy.setup_optimizer(bad_optimizer) - else: - with pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameter"): - strategy.setup_optimizer(bad_optimizer) - - -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.setup_module") def test_fsdp_setup_use_orig_params(_): module = nn.Linear(2, 2) @@ -234,7 +215,6 @@ def test_fsdp_grad_clipping_norm_error(): strategy.clip_gradients_norm(Mock(), Mock(), Mock()) -@RunIf(min_torch="2.0.0") def test_fsdp_save_checkpoint_storage_options(tmp_path): """Test that the FSDP strategy does not accept storage options for saving checkpoints.""" strategy = FSDPStrategy() @@ -242,7 +222,6 @@ def test_fsdp_save_checkpoint_storage_options(tmp_path): strategy.save_checkpoint(path=tmp_path, state=Mock(), storage_options=Mock()) -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.fabric.strategies.fsdp._get_full_state_dict_context") @mock.patch("lightning.fabric.strategies.fsdp._get_sharded_state_dict_context") @@ -305,7 +284,6 @@ def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, assert path.is_dir() -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) def test_fsdp_save_checkpoint_one_fsdp_module_required(tmp_path): """Test that the FSDP strategy can only save one FSDP model per checkpoint.""" @@ -326,7 +304,6 @@ def test_fsdp_save_checkpoint_one_fsdp_module_required(tmp_path): strategy.save_checkpoint(path=tmp_path, state={"model1": model1, "model2": model2}) -@RunIf(min_torch="2.0.0") def test_fsdp_load_checkpoint_no_state(tmp_path): """Test that the FSDP strategy can't load the full state without access to a model instance from the user.""" strategy = FSDPStrategy() @@ -336,7 +313,6 @@ def test_fsdp_load_checkpoint_no_state(tmp_path): strategy.load_checkpoint(path=tmp_path, state={}) -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.fabric.strategies.fsdp._lazy_load", Mock()) def test_fsdp_load_checkpoint_one_fsdp_module_required(tmp_path): @@ -364,7 +340,6 @@ def test_fsdp_load_checkpoint_one_fsdp_module_required(tmp_path): strategy.load_checkpoint(path=path, state=model) -@RunIf(min_torch="2.0.0") @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) def test_fsdp_save_checkpoint_unknown_state_dict_type(tmp_path): strategy = FSDPStrategy(state_dict_type="invalid") @@ -374,7 +349,6 @@ def test_fsdp_save_checkpoint_unknown_state_dict_type(tmp_path): strategy.save_checkpoint(path=tmp_path, state={"model": model}) -@RunIf(min_torch="2.0.0") def test_fsdp_load_unknown_checkpoint_type(tmp_path): """Test that the strategy validates the contents at the checkpoint path.""" strategy = FSDPStrategy() @@ -386,7 +360,6 @@ def test_fsdp_load_unknown_checkpoint_type(tmp_path): strategy.load_checkpoint(path=path, state={"model": model}) -@RunIf(min_torch="2.0.0") def test_fsdp_load_raw_checkpoint_validate_single_file(tmp_path): """Test that we validate the given checkpoint is a single file when loading a raw PyTorch state-dict checkpoint.""" strategy = FSDPStrategy() 
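# For context, the user-facing flow these checkpoint tests guard, as a rough sketch (assumes
# two CUDA devices are available; the path, module, and shapes are purely illustrative):
import torch
from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy

fabric = Fabric(accelerator="cuda", devices=2, strategy=FSDPStrategy(state_dict_type="full"))
fabric.launch()
model = fabric.setup(torch.nn.Linear(32, 32))
optimizer = fabric.setup_optimizers(torch.optim.Adam(model.parameters()))
state = {"model": model, "optimizer": optimizer}
fabric.save("full.ckpt", state)  # a checkpoint may contain exactly one FSDP model
fabric.load("full.ckpt", state)  # loading the full state requires a model instance in `state`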
@@ -397,7 +370,6 @@ def test_fsdp_load_raw_checkpoint_validate_single_file(tmp_path): strategy.load_checkpoint(path=path, state=model) -@RunIf(min_torch="2.0.0") def test_fsdp_load_raw_checkpoint_optimizer_unsupported(tmp_path): """Validate that the FSDP strategy does not yet support loading the raw PyTorch state-dict for an optimizer.""" strategy = FSDPStrategy() @@ -443,7 +415,6 @@ def test_has_meta_device_parameters(): _has_meta_device_parameters(None) -@RunIf(min_torch="2.0") @pytest.mark.parametrize("torch_ge_2_1", [True, False]) @mock.patch("torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.set_state_dict_type") def test_get_full_state_dict_context_offload(set_type_mock, monkeypatch, torch_ge_2_1): diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index 03d1d0979ea66..88d015c5cd138 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -23,12 +23,10 @@ from lightning.fabric import Fabric from lightning.fabric.plugins import FSDPPrecision from lightning.fabric.strategies import FSDPStrategy -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, - _TORCH_GREATER_EQUAL_2_1, -) +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.fabric.wrappers import _FabricOptimizer +from torch._dynamo import OptimizedModule from torch.distributed.fsdp import FlatParameter, FullyShardedDataParallel, OptimStateKeyType from torch.distributed.fsdp.wrap import always_wrap_policy, wrap from torch.nn import Parameter @@ -121,7 +119,7 @@ def get_model(self): return model -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))]) @pytest.mark.parametrize("manual_wrapping", [True, False]) def test_fsdp_train_save_load(tmp_path, manual_wrapping, precision): @@ -176,7 +174,7 @@ def test_fsdp_train_save_load(tmp_path, manual_wrapping, precision): assert state["coconut"] == 11 -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) def test_fsdp_save_full_state_dict(tmp_path): """Test that FSDP saves the full state into a single file with `state_dict_type="full"`.""" fabric = Fabric( @@ -290,7 +288,7 @@ def test_fsdp_save_full_state_dict(tmp_path): trainer.run() -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) def test_fsdp_load_full_state_dict_into_sharded_model(tmp_path): """Test that the strategy can load a full-state checkpoint into a FSDP sharded model.""" from torch.distributed.fsdp import FullyShardedDataParallel as FSDP @@ -362,11 +360,7 @@ def test_setup_module_move_to_device(fabric_module_mock, move_to_device): # the linear layer got sharded and each part is on the expected device assert next(fabric_model.parameters()).device == torch.device("cuda", fabric.local_rank) assert next(fabric_model.parameters()).numel() == 50 - if _TORCH_GREATER_EQUAL_2_0: - # In PyTorch >= 2.0 we set `use_orig_params=True` and don't see flattened parameters - assert isinstance(next(fabric_model.parameters()), Parameter) - else: - assert isinstance(next(fabric_model.parameters()), FlatParameter) + assert isinstance(next(fabric_model.parameters()), Parameter) # The 
_DeviceDtypeModuleMixin currently can't represent the device in a meaningful way for models with pieces on # different devices @@ -374,7 +368,7 @@ def test_setup_module_move_to_device(fabric_module_mock, move_to_device): assert fabric.device == torch.device("cuda", fabric.local_rank) -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) def test_setup_with_orig_params_and_multiple_param_groups(): """Test that Fabric sets `use_orig_params` for the user when jointly setting up model and optimizer.""" strategy = FSDPStrategy(auto_wrap_policy=always_wrap_policy) @@ -407,15 +401,10 @@ def test_setup_with_orig_params_and_multiple_param_groups(): @RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.1.0", dynamo=True, skip_windows=True) -@mock.patch( - "lightning.fabric.wrappers.torch.compile", - Mock(wraps=(torch.compile if _TORCH_GREATER_EQUAL_2_0 else None)), -) +@mock.patch("lightning.fabric.wrappers.torch.compile", Mock(wraps=torch.compile)) @mock.patch.dict(os.environ, {}) def test_reapply_compile(): """Test that Fabric can rewrap a compiled module such that compilation happens over the FSDP-wrapper.""" - from torch._dynamo import OptimizedModule - strategy = FSDPStrategy(auto_wrap_policy=always_wrap_policy) fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy) fabric.launch() @@ -485,7 +474,7 @@ def _run_setup_assertions(empty_init, expected_device): _run_setup_assertions(empty_init=True, expected_device=torch.device("cpu")) -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) def test_fsdp_save_filter(tmp_path): fabric = Fabric(accelerator="cuda", strategy=FSDPStrategy(state_dict_type="full"), devices=2) fabric.launch() diff --git a/tests/tests_fabric/strategies/test_strategy.py b/tests/tests_fabric/strategies/test_strategy.py index cbbbf963b3607..a7a1dba87cb97 100644 --- a/tests/tests_fabric/strategies/test_strategy.py +++ b/tests/tests_fabric/strategies/test_strategy.py @@ -18,7 +18,6 @@ import torch from lightning.fabric.plugins import DoublePrecision, HalfPrecision, Precision from lightning.fabric.strategies import SingleDeviceStrategy -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.fabric.utilities.types import _Stateful from tests_fabric.helpers.runif import RunIf @@ -239,8 +238,7 @@ def test_module_init_context(device, precision, dtype, empty_init, monkeypatch): with strategy.module_init_context(empty_init=empty_init): module = torch.nn.Linear(2, 2) - expected_device = device if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") - assert module.weight.device == module.bias.device == expected_device + assert module.weight.device == module.bias.device == device assert module.weight.dtype == module.bias.dtype == dtype if not empty_init: init_mock.assert_called() @@ -274,8 +272,7 @@ def test_tensor_init_context(device, precision, dtype): tensor1 = torch.tensor(42) tensor2 = torch.tensor(42.0, dtype=torch.half) - expected_device = device if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") - assert tensor0.device == tensor1.device == tensor2.device == expected_device + assert tensor0.device == tensor1.device == tensor2.device == device assert tensor0.dtype == dtype assert tensor1.dtype == torch.long # `.init_tensor()` only affects floating point dtypes assert tensor2.dtype == torch.half # this tensor was created with an explicit dtype assignment diff --git 
a/tests/tests_fabric/strategies/test_xla_fsdp.py b/tests/tests_fabric/strategies/test_xla_fsdp.py index bcd2a6e637417..e2864b684c4a7 100644 --- a/tests/tests_fabric/strategies/test_xla_fsdp.py +++ b/tests/tests_fabric/strategies/test_xla_fsdp.py @@ -27,7 +27,7 @@ from tests_fabric.helpers.runif import RunIf -@RunIf(min_torch="2.0", tpu=True) +@RunIf(tpu=True) def test_xla_fsdp_setup_optimizer_validation(): """Test that `setup_optimizer()` validates the param groups and reference to FSDP parameters.""" module = nn.Linear(2, 2) @@ -39,7 +39,7 @@ def test_xla_fsdp_setup_optimizer_validation(): strategy.setup_optimizer(bad_optimizer) -@RunIf(min_torch="2.0", tpu=True) +@RunIf(tpu=True) def test_xla_fsdp_no_backward_sync(): """Test that the backward sync control calls `.no_sync()`, and only on a module wrapped in XlaFullyShardedDataParallel.""" @@ -64,7 +64,7 @@ def test_xla_fsdp_no_backward_sync(): module.no_sync.assert_called_once() -@RunIf(min_torch="2.0", tpu=True) +@RunIf(tpu=True) def test_xla_fsdp_grad_clipping_value_error(): strategy = XLAFSDPStrategy() with pytest.raises(NotImplementedError, match="does not support to clip gradients by value"): diff --git a/tests/tests_fabric/strategies/test_xla_fsdp_integration.py b/tests/tests_fabric/strategies/test_xla_fsdp_integration.py index 999b8473b28aa..20c2ef042272e 100644 --- a/tests/tests_fabric/strategies/test_xla_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_xla_fsdp_integration.py @@ -45,7 +45,7 @@ def _xla_fsdp_rewrap_warning(fabric: Fabric): assert isinstance(model._forward_module[2], XlaFullyShardedDataParallel) -@RunIf(min_torch="2.0", tpu=True, standalone=True) +@RunIf(tpu=True, standalone=True) def test_xla_fsdp_rewrap_warning(): """Test that XLAFSDP warns about rewrapping the modules.""" from torch_xla.distributed.fsdp.wrap import always_wrap_policy @@ -159,7 +159,7 @@ def step(model, batch): torch.testing.assert_close(p0, p1, atol=0, rtol=0, equal_nan=True) -@RunIf(min_torch="2.0", tpu=True, standalone=True) +@RunIf(tpu=True, standalone=True) @pytest.mark.parametrize( ("use_auto_wrap_policy", "state_dict_type", "sequential_save"), [ @@ -196,7 +196,7 @@ def _test_setup_module_move_to_device(fabric, move_to_device): assert fabric.device.type == "xla" -@RunIf(min_torch="2.0", tpu=True, standalone=True) +@RunIf(tpu=True, standalone=True) @pytest.mark.parametrize("move_to_device", [True, False]) def test_setup_module_move_to_device(move_to_device): """Test that `move_to_device` does nothing, FSDP decides which device parameters get moved to which device diff --git a/tests/tests_fabric/test_fabric.py b/tests/tests_fabric/test_fabric.py index fde9479c73eaf..f76a846e80a75 100644 --- a/tests/tests_fabric/test_fabric.py +++ b/tests/tests_fabric/test_fabric.py @@ -623,7 +623,7 @@ def test_backward(): ("auto", "32-true", False), ("auto", "bf16-true", False), ("auto", "bf16-mixed", True), - pytest.param("fsdp", "32-true", True, marks=RunIf(min_cuda_gpus=1, min_torch="2.0.0")), + pytest.param("fsdp", "32-true", True, marks=RunIf(min_cuda_gpus=1)), ], ) @pytest.mark.parametrize("setup_method", ["setup", "setup_module"]) @@ -855,7 +855,6 @@ def test_module_sharding_context(): def test_init_module_context(monkeypatch): """Test that the strategy returns the context manager for initializing the module.""" - import lightning.fabric fabric = Fabric(accelerator="cpu") strategy = SingleDeviceStrategy(device=torch.device("cuda")) @@ -866,17 +865,9 @@ def test_init_module_context(monkeypatch): 
strategy.module_init_context.assert_called_once_with(empty_init=None) strategy.module_init_context.reset_mock() - # Pretend we are using PyTorch < 2.0 - monkeypatch.setattr(lightning.fabric.fabric, "_TORCH_GREATER_EQUAL_2_0", False) - with pytest.warns(PossibleUserWarning, match="can't place the model parameters on the device"): # noqa: SIM117 - with fabric.init_module(): - pass - strategy.module_init_context.assert_called_once() - def test_init_tensor_context(monkeypatch): """Test that `.init_tensor()` warns if using PyTorch < 2.0.""" - import lightning.fabric fabric = Fabric(accelerator="cpu") strategy = SingleDeviceStrategy(device=torch.device("cuda")) @@ -887,13 +878,6 @@ def test_init_tensor_context(monkeypatch): strategy.tensor_init_context.assert_called_once() strategy.tensor_init_context.reset_mock() - # Pretend we are using PyTorch < 2.0 - monkeypatch.setattr(lightning.fabric.fabric, "_TORCH_GREATER_EQUAL_2_0", False) - with pytest.warns(PossibleUserWarning, match="can't place tensors on the device directly"): # noqa: SIM117 - with fabric.init_tensor(): - pass - strategy.tensor_init_context.assert_called_once() - def test_callbacks_input(): """Test the various ways in which callbacks can be registered with Fabric.""" diff --git a/tests/tests_fabric/test_wrappers.py b/tests/tests_fabric/test_wrappers.py index 0923c601d51c3..599d8f085d16c 100644 --- a/tests/tests_fabric/test_wrappers.py +++ b/tests/tests_fabric/test_wrappers.py @@ -28,6 +28,7 @@ _unwrap_objects, is_wrapped, ) +from torch._dynamo import OptimizedModule from torch.utils.data import BatchSampler, DistributedSampler from torch.utils.data.dataloader import DataLoader @@ -492,8 +493,6 @@ def test_is_wrapped(compile): # _FabricModule inside an OptimizedModule if compile: - from torch._dynamo import OptimizedModule - module = torch.nn.Linear(2, 2) wrapped = torch.compile(_FabricModule(module, Mock())) assert isinstance(wrapped, OptimizedModule) @@ -624,11 +623,6 @@ def test_unwrap_compiled(): # We wrap `torch.compile` on import of lightning in `wrappers.py` assert torch.compile.__wrapped__ - with mock.patch("lightning.fabric.wrappers", "_TORCH_GREATER_EQUAL_2_0", False): - unwrapped, compile_kwargs = _unwrap_compiled(model) - assert unwrapped is model - assert compile_kwargs is None - compiled = torch.compile(model, fullgraph=True, dynamic=True, disable=False) assert compiled._compile_kwargs == {"fullgraph": True, "dynamic": True, "disable": False} unwrapped, compile_kwargs = _unwrap_compiled(compiled) diff --git a/tests/tests_fabric/utilities/test_load.py b/tests/tests_fabric/utilities/test_load.py index 574f8bf36247b..c53686ceb9a26 100644 --- a/tests/tests_fabric/utilities/test_load.py +++ b/tests/tests_fabric/utilities/test_load.py @@ -21,10 +21,7 @@ _NotYetLoadedTensor, ) -from tests_fabric.helpers.runif import RunIf - -@RunIf(min_torch="2.0.0") def test_lazy_load_module(tmp_path): model0 = nn.Linear(2, 2) torch.save(model0.state_dict(), tmp_path / "model.pt") @@ -43,7 +40,6 @@ class ATensor(torch.Tensor): pass -@RunIf(min_torch="2.0.0") def test_lazy_load_tensor(tmp_path): """Test that lazy load can handle different classes of tensors.""" expected = { @@ -61,7 +57,6 @@ def test_lazy_load_tensor(tmp_path): assert torch.equal(t0, t1_materialized) -@RunIf(min_torch="2.0.0") def test_lazy_load_mixed_state(tmp_path): model0 = nn.Linear(2, 2) optim0 = torch.optim.Adam(model0.parameters()) @@ -82,13 +77,11 @@ def test_lazy_load_mixed_state(tmp_path): optim1.load_state_dict(loaded_checkpoint["optimizer"]) 
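# The lazy-loading pattern exercised by these tests, sketched (the checkpoint path is
# illustrative):
import torch
from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors

torch.save({"weight": torch.rand(2, 2)}, "ckpt.pt")
checkpoint = _lazy_load("ckpt.pt")             # tensor payloads are not read from disk yet
checkpoint = _materialize_tensors(checkpoint)  # resolve them into real, fully-loaded tensors
assert torch.equal(checkpoint["weight"], torch.load("ckpt.pt")["weight"])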
-@RunIf(min_torch="2.0.0") def test_lazy_load_raises(): with pytest.raises(FileNotFoundError, match="foo' does not exist"): _lazy_load("foo") -@RunIf(min_torch="2.0.0") def test_materialize_tensors(tmp_path): # Single tensor tensor = torch.tensor([1, 2]) diff --git a/tests/tests_pytorch/core/test_lightning_module.py b/tests/tests_pytorch/core/test_lightning_module.py index d5aec835ad581..5ee91e82689f4 100644 --- a/tests/tests_pytorch/core/test_lightning_module.py +++ b/tests/tests_pytorch/core/test_lightning_module.py @@ -18,7 +18,6 @@ import pytest import torch from lightning.fabric import Fabric -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.core.module import _TrainerFabricShim from lightning.pytorch.demos.boring_classes import BoringModel @@ -444,9 +443,6 @@ def test_trainer_reference_recursively(): ensemble.trainer = trainer # references match assert ensemble.trainer is inner.trainer - if not _TORCH_GREATER_EQUAL_2_0: - # and the trainer was weakly referenced - assert inner.trainer is weakref.proxy(trainer) def test_fabric_reference_recursively(): diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py index aa56e8ca02ba4..763a140982a8d 100644 --- a/tests/tests_pytorch/models/test_hooks.py +++ b/tests/tests_pytorch/models/test_hooks.py @@ -18,7 +18,6 @@ import pytest import torch -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import Callback, LightningDataModule, LightningModule, Trainer, __version__ from lightning.pytorch.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset from lightning.pytorch.utilities.model_helpers import is_overridden @@ -479,7 +478,7 @@ def training_step(self, batch, batch_idx): {"name": "configure_optimizers"}, {"name": "Callback.on_fit_start", "args": (trainer, model)}, {"name": "on_fit_start"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": "Callback.on_sanity_check_start", "args": (trainer, model)}, {"name": "val_dataloader"}, {"name": "train", "args": (False,)}, @@ -497,7 +496,7 @@ def training_step(self, batch, batch_idx): {"name": "Callback.on_train_epoch_start", "args": (trainer, model)}, {"name": "on_train_epoch_start"}, *model._train_batch(trainer, model, train_batches, device=device, **kwargs), - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": "on_validation_model_zero_grad"}, {"name": "train", "args": (False,)}, {"name": "on_validation_model_eval"}, @@ -577,7 +576,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume_max_epochs(tmp_path): {"name": "configure_optimizers"}, {"name": "Callback.on_fit_start", "args": (trainer, model)}, {"name": "on_fit_start"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": "train_dataloader"}, {"name": "Callback.on_train_start", "args": (trainer, model)}, {"name": "on_train_start"}, @@ -655,7 +654,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume_max_steps(tmp_path): {"name": "configure_optimizers"}, {"name": "Callback.on_fit_start", "args": (trainer, model)}, {"name": "on_fit_start"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": 
"train_dataloader"}, {"name": "Callback.on_train_start", "args": (trainer, model)}, {"name": "on_train_start"}, @@ -718,7 +717,7 @@ def test_trainer_model_hook_system_eval(tmp_path, override_on_x_model_train, bat {"name": "Callback.setup", "args": (trainer, model), "kwargs": {"stage": verb}}, {"name": "setup", "kwargs": {"stage": verb}}, {"name": "configure_model"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, *(hooks if batches else []), {"name": "Callback.teardown", "args": (trainer, model), "kwargs": {"stage": verb}}, {"name": "teardown", "kwargs": {"stage": verb}}, @@ -741,7 +740,7 @@ def test_trainer_model_hook_system_predict(tmp_path): {"name": "Callback.setup", "args": (trainer, model), "kwargs": {"stage": "predict"}}, {"name": "setup", "kwargs": {"stage": "predict"}}, {"name": "configure_model"}, - {"name": "zero_grad", **({} if _TORCH_GREATER_EQUAL_2_0 else {"kwargs": {"set_to_none": True}})}, + {"name": "zero_grad"}, {"name": "predict_dataloader"}, {"name": "train", "args": (False,)}, {"name": "on_predict_model_eval"}, diff --git a/tests/tests_pytorch/plugins/precision/test_fsdp.py b/tests/tests_pytorch/plugins/precision/test_fsdp.py index e4d652cb15864..6b19fdabdf6d6 100644 --- a/tests/tests_pytorch/plugins/precision/test_fsdp.py +++ b/tests/tests_pytorch/plugins/precision/test_fsdp.py @@ -26,25 +26,9 @@ [ ("16-true", (torch.float16, torch.float16, torch.float16)), ("bf16-true", (torch.bfloat16, torch.bfloat16, torch.bfloat16)), - pytest.param( - "16-mixed", (torch.float32, torch.float16, torch.float16), marks=RunIf(min_torch="2.0"), id="16-mixed-ge2_0" - ), - pytest.param( - "16-mixed", (None, torch.float16, torch.float16), marks=RunIf(max_torch="2.0"), id="16-mixed-lt2_0" - ), - pytest.param( - "bf16-mixed", - (torch.float32, torch.bfloat16, torch.bfloat16), - marks=RunIf(min_torch="2.0"), - id="bf16-mixed-ge2_0", - ), - pytest.param( - "bf16-mixed", (None, torch.bfloat16, torch.bfloat16), marks=RunIf(max_torch="2.0"), id="bf16-mixed-lt2_0" - ), - pytest.param( - "32-true", (torch.float32, torch.float32, torch.float32), marks=RunIf(min_torch="2.0"), id="32-true-ge2_0" - ), - pytest.param("32-true", (None, torch.float32, torch.float32), marks=RunIf(max_torch="2.0"), id="32-true-lt2_0"), + ("16-mixed", (torch.float32, torch.float16, torch.float16)), + ("bf16-mixed", (torch.float32, torch.bfloat16, torch.bfloat16)), + ("32-true", (torch.float32, torch.float32, torch.float32)), ], ) def test_fsdp_precision_config(precision, expected): diff --git a/tests/tests_pytorch/strategies/test_common.py b/tests/tests_pytorch/strategies/test_common.py index f352ead871102..699424b3c53b9 100644 --- a/tests/tests_pytorch/strategies/test_common.py +++ b/tests/tests_pytorch/strategies/test_common.py @@ -15,7 +15,6 @@ import pytest import torch -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import Trainer from lightning.pytorch.plugins import DoublePrecision, HalfPrecision, Precision from lightning.pytorch.strategies import SingleDeviceStrategy @@ -82,8 +81,7 @@ def test_module_init_context(device, precision, dtype, empty_init, monkeypatch): with strategy.tensor_init_context(empty_init=empty_init): module = torch.nn.Linear(2, 2) - expected_device = device if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") - assert module.weight.device == module.bias.device == expected_device + assert module.weight.device == module.bias.device == device assert module.weight.dtype == 
module.bias.dtype == dtype if not empty_init: init_mock.assert_called() diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index dadd49c359e06..b23d306b9d907 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -18,7 +18,6 @@ import pytest import torch from lightning.fabric.plugins.environments import LightningEnvironment -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.plugins import DoublePrecision, HalfPrecision, Precision @@ -102,7 +101,7 @@ def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs, mps_count_ def test_tensor_init_context(precision_plugin, expected_dtype): """Test that the module under the init-context gets moved to the right device and dtype.""" parallel_devices = [torch.device("cuda", 0), torch.device("cuda", 1)] - expected_device = parallel_devices[1] if _TORCH_GREATER_EQUAL_2_0 else torch.device("cpu") + expected_device = parallel_devices[1] strategy = DDPStrategy( parallel_devices=parallel_devices, precision_plugin=precision_plugin, cluster_environment=LightningEnvironment() diff --git a/tests/tests_pytorch/strategies/test_ddp_integration.py b/tests/tests_pytorch/strategies/test_ddp_integration.py index 0b841cde8de67..17135a98fc089 100644 --- a/tests/tests_pytorch/strategies/test_ddp_integration.py +++ b/tests/tests_pytorch/strategies/test_ddp_integration.py @@ -20,7 +20,6 @@ import torch from lightning.fabric.plugins.environments import ClusterEnvironment, LightningEnvironment from lightning.fabric.utilities.distributed import _distributed_is_initialized -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import Trainer from lightning.pytorch.callbacks import Callback, EarlyStopping from lightning.pytorch.demos.boring_classes import BoringDataModule, BoringModel @@ -112,9 +111,7 @@ class CustomCallback(Callback): def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: assert isinstance(trainer.strategy.model, DistributedDataParallel) expected = ["something"] - assert ( - trainer.strategy.model.parameters_to_ignore == set(expected) if _TORCH_GREATER_EQUAL_2_0 else expected - ) + assert trainer.strategy.model.parameters_to_ignore == set(expected) assert trainer.strategy.model.module._ddp_params_and_buffers_to_ignore == expected model = CustomModel() diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 413a5e6c9dddd..e27aec2e7989a 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -14,11 +14,7 @@ import torch.nn as nn from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_0, - _TORCH_GREATER_EQUAL_2_1, - _TORCH_GREATER_EQUAL_2_2, -) +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1, _TORCH_GREATER_EQUAL_2_2 from lightning.fabric.utilities.load import _load_distributed_checkpoint from lightning.pytorch import Trainer from lightning.pytorch.callbacks import ModelCheckpoint @@ -29,16 +25,11 @@ from lightning.pytorch.trainer.states import TrainerFn from lightning.pytorch.utilities.consolidate_checkpoint import 
_format_checkpoint from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, FullyShardedDataParallel, MixedPrecision -from torch.distributed.fsdp.wrap import always_wrap_policy, size_based_auto_wrap_policy, wrap +from torch.distributed.fsdp.wrap import ModuleWrapPolicy, always_wrap_policy, size_based_auto_wrap_policy, wrap from torchmetrics import Accuracy from tests_pytorch.helpers.runif import RunIf -if _TORCH_GREATER_EQUAL_2_0: - from torch.distributed.fsdp.wrap import ModuleWrapPolicy -else: - ModuleWrapPolicy = object - class TestFSDPModel(BoringModel): def __init__(self): @@ -87,10 +78,10 @@ def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.trainer.strategy.precision_plugin, FSDPPrecision) if self.trainer.precision == "16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float16 elif self.trainer.precision == "bf16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.bfloat16 elif self.trainer.precision == "16-true": param_dtype = reduce_dtype = buffer_dtype = torch.float16 @@ -119,10 +110,8 @@ def __init__(self, wrap_min_params: int = 2): self.should_be_wrapped = [wrap_min_params < (32 * 32 + 32), None, wrap_min_params < (32 * 2 + 2)] def configure_optimizers(self): - parameters = self.parameters() if _TORCH_GREATER_EQUAL_2_0 else self.trainer.model.parameters() - # SGD's FSDP optimier state is fixed in https://github.com/pytorch/pytorch/pull/99214 - return torch.optim.AdamW(parameters, lr=0.1) + return torch.optim.AdamW(self.parameters(), lr=0.1) class TestFSDPModelAutoWrapped(TestBoringModel): @@ -150,10 +139,10 @@ def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.trainer.strategy.precision_plugin, FSDPPrecision) if self.trainer.precision == "16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.float16 elif self.trainer.precision == "bf16-mixed": - param_dtype = None if not _TORCH_GREATER_EQUAL_2_0 else torch.float32 + param_dtype = torch.float32 reduce_dtype = buffer_dtype = torch.bfloat16 elif self.trainer.precision == "16-true": param_dtype = reduce_dtype = buffer_dtype = torch.float16 @@ -287,23 +276,12 @@ def test_fsdp_strategy_checkpoint(state_dict_type, precision, tmp_path): _run_multiple_stages(trainer, model, os.path.join(tmp_path, "last.ckpt")) -if _TORCH_GREATER_EQUAL_2_0: - - def custom_auto_wrap_policy( - module, - recurse, - nonwrapped_numel: int, - ) -> bool: - return nonwrapped_numel >= 2 - -else: - - def custom_auto_wrap_policy( - module, - recurse, - unwrapped_params: int, - ) -> bool: - return unwrapped_params >= 2 +def custom_auto_wrap_policy( + module, + recurse, + nonwrapped_numel: int, +) -> bool: + return nonwrapped_numel >= 2 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @@ -350,14 +328,6 @@ def test_fsdp_strategy_full_state_dict(tmp_path, wrap_min_params): TestFSDPModelAutoWrapped(), FSDPStrategy, {"auto_wrap_policy": custom_auto_wrap_policy}, - marks=RunIf(max_torch="2.0.0"), - id="autowrap_1x", - ), - pytest.param( - TestFSDPModelAutoWrapped(), - FSDPStrategy, - {"auto_wrap_policy": custom_auto_wrap_policy}, - marks=RunIf(min_torch="2.0.0"), id="autowrap_2x", ), pytest.param( @@ -400,7 +370,7 @@ def test_fsdp_checkpoint_multi_gpus(tmp_path, model, strategy, strategy_cfg): 
@pytest.mark.parametrize("use_orig_params", [None, False, True]) def test_invalid_parameters_in_optimizer(use_orig_params): fsdp_kwargs = {} - if _TORCH_GREATER_EQUAL_2_0 and use_orig_params is not None: + if use_orig_params is not None: fsdp_kwargs = {"use_orig_params": use_orig_params} trainer = Trainer( @@ -412,7 +382,7 @@ def test_invalid_parameters_in_optimizer(use_orig_params): error_context = ( nullcontext() - if _TORCH_GREATER_EQUAL_2_0 and (_TORCH_GREATER_EQUAL_2_1 or use_orig_params is not False) + if _TORCH_GREATER_EQUAL_2_1 or use_orig_params is not False else pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters") ) @@ -431,7 +401,7 @@ def configure_optimizers(self): error_context = ( nullcontext() - if _TORCH_GREATER_EQUAL_2_0 and use_orig_params is not False + if use_orig_params is not False else pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters") ) @@ -530,7 +500,6 @@ def test_fsdp_sharding_strategy(): assert strategy.sharding_strategy == ShardingStrategy.NO_SHARD -@RunIf(min_torch="2.0") @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"]) def test_fsdp_hybrid_sharding_strategy(sharding_strategy): """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg.""" @@ -555,16 +524,11 @@ def test_fsdp_hybrid_sharding_strategy(sharding_strategy): def test_fsdp_use_orig_params(): - """Test that Lightning enables `use_orig_params` in PyTorch >= 2.0.""" - with mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", False): - strategy = FSDPStrategy() - assert "use_orig_params" not in strategy.kwargs - - with mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", True): - strategy = FSDPStrategy() - assert strategy.kwargs["use_orig_params"] - strategy = FSDPStrategy(use_orig_params=False) - assert not strategy.kwargs["use_orig_params"] + """Test that Lightning enables `use_orig_params` automatically.""" + strategy = FSDPStrategy() + assert strategy.kwargs["use_orig_params"] + strategy = FSDPStrategy(use_orig_params=False) + assert not strategy.kwargs["use_orig_params"] @mock.patch("torch.distributed.init_process_group") @@ -583,7 +547,6 @@ def test_set_timeout(init_process_group_mock): ) -@RunIf(min_torch="2.0") @mock.patch("lightning.pytorch.strategies.fsdp._load_raw_module_state") def test_fsdp_strategy_load_optimizer_states_multiple(_, tmp_path): strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")], state_dict_type="full") @@ -640,12 +603,9 @@ def test_fsdp_strategy_save_optimizer_states(tmp_path, wrap_min_params): if trainer.global_rank != 0: assert len(model_state_dict) == 0 - if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1 or not _TORCH_GREATER_EQUAL_2_0: + if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1: assert len(optimizer_state_dict) == 0 - if not _TORCH_GREATER_EQUAL_2_0: - return - # restore model to ddp model = TestBoringModel() trainer = Trainer(default_root_dir=tmp_path, accelerator="gpu", devices=2, strategy="ddp", max_epochs=1) @@ -714,10 +674,10 @@ def test_fsdp_strategy_load_optimizer_states(tmp_path, wrap_min_params): if trainer.global_rank != 0: assert len(restored_model_state_dict) == 0 - if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1 or not _TORCH_GREATER_EQUAL_2_0: + if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1: assert len(restored_optimizer_state_dict) == 0 - if trainer.global_rank == 0 and 
_TORCH_GREATER_EQUAL_2_0: + if trainer.global_rank == 0: # assert everything is the same assert len(model_state_dict) == len(restored_model_state_dict) assert len(optimizer_state_dict) == len(restored_optimizer_state_dict) @@ -766,33 +726,6 @@ def on_fit_start(self): trainer.fit(model) -@mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", False) -@mock.patch("lightning.pytorch.strategies.fsdp.torch.load") -@mock.patch("lightning.pytorch.strategies.fsdp._load_raw_module_state") -def test_load_save_optimizer_torch_lt_2_0(_, __, tmp_path): - strategy = FSDPStrategy(state_dict_type="full") - with pytest.warns(UserWarning, match="does not support saving the optimizer state"): - strategy.optimizer_state(Mock()) - - file = tmp_path / "test.ckpt" - file.touch() - trainer = Trainer() - trainer.state.fn = TrainerFn.FITTING - strategy._lightning_module = Mock(trainer=trainer) - with pytest.warns(UserWarning, match="does not support loading the optimizer state"): - strategy.load_checkpoint(file) - - -@mock.patch("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_0", False) -def test_sharded_state_dict_type_support(): - """Test that the sharded state dict type is supported.""" - with pytest.raises( - NotImplementedError, - match=escape("`FSDPStrategy(state_dict_type='sharded')` is not supported in PyTorch < 2.0"), - ): - FSDPStrategy(state_dict_type="sharded") - - def test_save_checkpoint_storage_options(tmp_path): """Test that the FSDP strategy does not accept storage options for saving checkpoints.""" strategy = FSDPStrategy() @@ -800,7 +733,6 @@ def test_save_checkpoint_storage_options(tmp_path): strategy.save_checkpoint(filepath=tmp_path, checkpoint=Mock(), storage_options=Mock()) -@RunIf(min_torch="2.0.0") @mock.patch("lightning.pytorch.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.pytorch.strategies.fsdp._get_full_state_dict_context") @mock.patch("lightning.pytorch.strategies.fsdp._get_sharded_state_dict_context") @@ -899,7 +831,7 @@ def on_train_start(self): torch.testing.assert_close(p0, p1, atol=0, rtol=0, equal_nan=True) -@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.0.0") +@RunIf(min_cuda_gpus=2, standalone=True) def test_save_load_sharded_state_dict(tmp_path): """Test FSDP saving and loading with the sharded state dict format.""" strategy = FSDPStrategy(auto_wrap_policy={nn.Linear}, state_dict_type="sharded") @@ -955,10 +887,7 @@ def test_fsdp_lazy_load_full_state_dict(_, lazy_load_mock, torch_load_mock, tmp_ file.touch() strategy.load_checkpoint(checkpoint_path=file) - if _TORCH_GREATER_EQUAL_2_0: - lazy_load_mock.assert_called_once() - else: - torch_load_mock.assert_called_once() + lazy_load_mock.assert_called_once() @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) diff --git a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py index 6a9123f2980a6..449a1d72ed3a0 100644 --- a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py +++ b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py @@ -22,7 +22,6 @@ import torch.distributed as torch_distrib import torch.nn.functional as F from lightning.fabric.utilities.exceptions import MisconfigurationException -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import Trainer, seed_everything from lightning.pytorch.demos.boring_classes import BoringModel, ManualOptimBoringModel from 
lightning.pytorch.strategies import Strategy @@ -31,11 +30,7 @@ def assert_emtpy_grad(grad): - if _TORCH_GREATER_EQUAL_2_0: - assert grad is None - else: - if grad is not None: # backward has been called - assert torch.all(grad == 0) + assert grad is None class ManualOptModel(BoringModel): diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 565971e1554b1..c4af0d37453ee 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -24,8 +24,6 @@ from unittest.mock import ANY, Mock, call, patch import cloudpickle -import lightning.fabric -import lightning.pytorch import pytest import torch import torch.nn as nn @@ -51,7 +49,6 @@ from lightning.pytorch.trainer.states import RunningStage, TrainerFn from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _OMEGACONF_AVAILABLE -from lightning.pytorch.utilities.warnings import PossibleUserWarning from torch.multiprocessing import ProcessRaisedException from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import SGD @@ -2080,12 +2077,6 @@ def test_init_module_context(monkeypatch): strategy.tensor_init_context.assert_called_once_with(empty_init=None) strategy.tensor_init_context.reset_mock() - # Pretend we are using PyTorch < 2.0 - monkeypatch.setattr(lightning.pytorch.trainer.trainer, "_TORCH_GREATER_EQUAL_2_0", False) - with pytest.warns(PossibleUserWarning, match="can't place .* on the device"), trainer.init_module(): - pass - strategy.tensor_init_context.assert_called_once() - def test_expand_home_trainer(): """Test that the dirpath gets expanded if it contains `~`.""" diff --git a/tests/tests_pytorch/utilities/test_compile.py b/tests/tests_pytorch/utilities/test_compile.py index 42daba6e05a36..9da6c390e5da1 100644 --- a/tests/tests_pytorch/utilities/test_compile.py +++ b/tests/tests_pytorch/utilities/test_compile.py @@ -25,6 +25,8 @@ from tests_pytorch.conftest import mock_cuda_count from tests_pytorch.helpers.runif import RunIf +_PYTHON_GREATER_EQUAL_3_9_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 9) + # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") @@ -115,6 +117,7 @@ def has_dynamo(fn): # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") +@pytest.mark.skipif(not _PYTHON_GREATER_EQUAL_3_9_0, reason="AssertionError: failed to reach fixed point") @pytest.mark.xfail( sys.platform == "win32" and _TORCH_GREATER_EQUAL_2_2, strict=False, reason="RuntimeError: Failed to import" ) @@ -144,6 +147,7 @@ def training_step(self, batch, batch_idx): # https://github.com/pytorch/pytorch/issues/95708 @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found") +@pytest.mark.skipif(not _PYTHON_GREATER_EQUAL_3_9_0, reason="AssertionError: failed to reach fixed point") @pytest.mark.xfail( sys.platform == "win32" and _TORCH_GREATER_EQUAL_2_2, strict=False, reason="RuntimeError: Failed to import" ) diff --git a/tests/tests_pytorch/utilities/test_model_summary.py b/tests/tests_pytorch/utilities/test_model_summary.py index 290dfb67faf7d..a50ec425fc894 100644 --- a/tests/tests_pytorch/utilities/test_model_summary.py +++ b/tests/tests_pytorch/utilities/test_model_summary.py @@ -17,7 +17,6 @@ import pytest import torch import torch.nn as nn 
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from lightning.pytorch import LightningModule, Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.utilities.model_summary.model_summary import ( @@ -294,10 +293,6 @@ def __init__(self): def forward(self, *args, **kwargs): return self.layer(*args, **kwargs) - if isinstance(example_input, dict) and not _TORCH_GREATER_EQUAL_2_0: - # kwargs are not supported when torch < 2.0 - expected_size = UNKNOWN_SIZE - model = DummyLightningModule() model.example_input_array = example_input summary = summarize(model, max_depth=max_depth) From 29136332d69c8c893fc6bf402666c57f5fe8c7eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 28 Apr 2024 17:56:40 +0200 Subject: [PATCH 015/179] Avoid interactions through test artifacts (#19821) --- tests/tests_fabric/conftest.py | 12 ++++++++++++ tests/tests_pytorch/__init__.py | 17 ++++++----------- .../progress/test_rich_progress_bar.py | 3 ++- .../progress/test_tqdm_progress_bar.py | 3 ++- .../callbacks/test_device_stats_monitor.py | 2 +- .../callbacks/test_finetuning_callback.py | 4 ++-- .../callbacks/test_prediction_writer.py | 18 +++++++++--------- tests/tests_pytorch/callbacks/test_pruning.py | 12 +++++++++--- tests/tests_pytorch/callbacks/test_spike.py | 2 ++ tests/tests_pytorch/callbacks/test_timer.py | 8 ++++---- .../test_checkpoint_callback_frequency.py | 2 +- tests/tests_pytorch/conftest.py | 11 +++++++++++ tests/tests_pytorch/core/test_datamodules.py | 3 ++- .../core/test_lightning_optimizer.py | 3 +++ .../core/test_metric_result_integration.py | 2 +- tests/tests_pytorch/core/test_saving.py | 2 ++ tests/tests_pytorch/helpers/datasets.py | 17 ----------------- tests/tests_pytorch/helpers/test_datasets.py | 14 ++++++++++++++ tests/tests_pytorch/helpers/utils.py | 6 +----- tests/tests_pytorch/loggers/test_all.py | 5 +++-- tests/tests_pytorch/loggers/test_csv.py | 2 +- tests/tests_pytorch/loggers/test_neptune.py | 9 ++++++--- .../loops/test_training_epoch_loop.py | 1 + tests/tests_pytorch/models/test_restore.py | 2 +- .../plugins/precision/test_double.py | 1 + .../serve/test_servable_module_validator.py | 2 +- .../launchers/test_multiprocessing.py | 8 +++++--- .../strategies/test_ddp_integration.py | 4 ++-- tests/tests_pytorch/strategies/test_fsdp.py | 12 +++++++++--- .../tests_pytorch/strategies/test_registry.py | 2 +- .../trainer/flags/test_env_vars.py | 4 ++-- .../trainer/flags/test_min_max_epochs.py | 4 ++-- .../trainer/flags/test_val_check_interval.py | 10 +++++++++- .../optimization/test_backward_calls.py | 14 ++++++++------ .../optimization/test_manual_optimization.py | 2 +- .../optimization/test_multiple_optimizers.py | 4 ++-- .../test_estimated_stepping_batches.py | 4 ++-- .../tests_pytorch/trainer/test_dataloaders.py | 11 ++++++++--- tests/tests_pytorch/trainer/test_trainer.py | 8 ++++---- .../tuner/test_scale_batch_size.py | 2 +- 40 files changed, 154 insertions(+), 98 deletions(-) diff --git a/tests/tests_fabric/conftest.py b/tests/tests_fabric/conftest.py index 7dbd8da055995..4a4371eb9d3a1 100644 --- a/tests/tests_fabric/conftest.py +++ b/tests/tests_fabric/conftest.py @@ -14,6 +14,7 @@ import os import sys import threading +from pathlib import Path from typing import List from unittest.mock import Mock @@ -185,6 +186,17 @@ def caplog(caplog): lightning_logger.propagate = propagate +@pytest.fixture(autouse=True) +def leave_no_artifacts_behind(): + tests_root = Path(__file__).parent.parent + files_before = 
{p for p in tests_root.rglob("*") if "__pycache__" not in p.parts} + yield + files_after = {p for p in tests_root.rglob("*") if "__pycache__" not in p.parts} + difference = files_after - files_before + difference = {str(f.relative_to(tests_root)) for f in difference} + assert not difference, f"Test left artifacts behind: {difference}" + + def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None: """An adaptation of `tests/tests_pytorch/conftest.py::pytest_collection_modifyitems`""" initial_size = len(items) diff --git a/tests/tests_pytorch/__init__.py b/tests/tests_pytorch/__init__.py index df603cce7b830..a43ffae6a83b4 100644 --- a/tests/tests_pytorch/__init__.py +++ b/tests/tests_pytorch/__init__.py @@ -13,24 +13,19 @@ # limitations under the License. import os import warnings +from pathlib import Path import pytest -_TEST_ROOT = os.path.dirname(__file__) -_PROJECT_ROOT = os.path.dirname(_TEST_ROOT) -_TEMP_PATH = os.path.join(_PROJECT_ROOT, "test_temp") -_PATH_DATASETS = os.path.join(_PROJECT_ROOT, "Datasets") -_PATH_LEGACY = os.path.join(_PROJECT_ROOT, "legacy") +_TEST_ROOT = Path(__file__).parent.parent +_PROJECT_ROOT = _TEST_ROOT.parent +_PATH_DATASETS = _PROJECT_ROOT / "Datasets" +_PATH_LEGACY = _TEST_ROOT / "legacy" # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages -if _PROJECT_ROOT not in os.getenv("PYTHONPATH", ""): +if str(_PROJECT_ROOT) not in os.getenv("PYTHONPATH", ""): splitter = ":" if os.environ.get("PYTHONPATH", "") else "" os.environ["PYTHONPATH"] = f'{_PROJECT_ROOT}{splitter}{os.environ.get("PYTHONPATH", "")}' - -if not os.path.isdir(_TEMP_PATH): - os.mkdir(_TEMP_PATH) - - # Ignore cleanup warnings from pytest (rarely happens due to a race condition when executing pytest in parallel) warnings.filterwarnings("ignore", category=pytest.PytestWarning, message=r".*\(rm_rf\) error removing.*") diff --git a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py index a74bc300f4194..22e83443ef9cf 100644 --- a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py @@ -447,9 +447,10 @@ def test_rich_progress_bar_padding(): @RunIf(rich=True) -def test_rich_progress_bar_can_be_pickled(): +def test_rich_progress_bar_can_be_pickled(tmp_path): bar = RichProgressBar() trainer = Trainer( + default_root_dir=tmp_path, callbacks=[bar], max_epochs=1, limit_train_batches=1, diff --git a/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py index cbe07164fb427..e98f9077f8fcb 100644 --- a/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_tqdm_progress_bar.py @@ -550,9 +550,10 @@ def test_tqdm_progress_bar_print_disabled(tqdm_write, mock_print, tmp_path): tqdm_write.assert_not_called() -def test_tqdm_progress_bar_can_be_pickled(): +def test_tqdm_progress_bar_can_be_pickled(tmp_path): bar = TQDMProgressBar() trainer = Trainer( + default_root_dir=tmp_path, callbacks=[bar], max_epochs=1, limit_train_batches=1, diff --git a/tests/tests_pytorch/callbacks/test_device_stats_monitor.py b/tests/tests_pytorch/callbacks/test_device_stats_monitor.py index eecb8b975533a..aacee958faa45 100644 --- a/tests/tests_pytorch/callbacks/test_device_stats_monitor.py +++ b/tests/tests_pytorch/callbacks/test_device_stats_monitor.py @@ -162,7 
+162,7 @@ def test_device_stats_monitor_warning_when_psutil_not_available(monkeypatch, tmp monkeypatch.setattr(imports, "_PSUTIL_AVAILABLE", False) monitor = DeviceStatsMonitor() - trainer = Trainer(logger=CSVLogger(tmp_path)) + trainer = Trainer(accelerator="cpu", logger=CSVLogger(tmp_path)) assert trainer.strategy.root_device == torch.device("cpu") with pytest.raises(ModuleNotFoundError, match="psutil` is not installed"): monitor.setup(trainer, Mock(), "fit") diff --git a/tests/tests_pytorch/callbacks/test_finetuning_callback.py b/tests/tests_pytorch/callbacks/test_finetuning_callback.py index 078e3d4462b44..56d46a62048cd 100644 --- a/tests/tests_pytorch/callbacks/test_finetuning_callback.py +++ b/tests/tests_pytorch/callbacks/test_finetuning_callback.py @@ -113,7 +113,7 @@ def configure_optimizers(self): trainer.fit(model) assert model.backbone.has_been_used - trainer = Trainer(max_epochs=3) + trainer = Trainer(default_root_dir=tmp_path, max_epochs=3) trainer.fit(model, ckpt_path=chk.last_model_path) @@ -245,7 +245,7 @@ def configure_optimizers(self): model = FreezeModel() cb = OnEpochLayerFinetuning() - trainer = Trainer(max_epochs=10, callbacks=[cb]) + trainer = Trainer(default_root_dir=tmp_path, max_epochs=10, callbacks=[cb]) with pytest.raises(IndexError, match="index 6 is out of range"): trainer.fit(model, ckpt_path=chk.last_model_path) diff --git a/tests/tests_pytorch/callbacks/test_prediction_writer.py b/tests/tests_pytorch/callbacks/test_prediction_writer.py index 343716d332279..02604f5a195fe 100644 --- a/tests/tests_pytorch/callbacks/test_prediction_writer.py +++ b/tests/tests_pytorch/callbacks/test_prediction_writer.py @@ -35,7 +35,7 @@ def test_prediction_writer_invalid_write_interval(): DummyPredictionWriter("something") -def test_prediction_writer_hook_call_intervals(): +def test_prediction_writer_hook_call_intervals(tmp_path): """Test that the `write_on_batch_end` and `write_on_epoch_end` hooks get invoked based on the defined interval.""" DummyPredictionWriter.write_on_batch_end = Mock() DummyPredictionWriter.write_on_epoch_end = Mock() @@ -44,7 +44,7 @@ def test_prediction_writer_hook_call_intervals(): model = BoringModel() cb = DummyPredictionWriter("batch_and_epoch") - trainer = Trainer(limit_predict_batches=4, callbacks=cb) + trainer = Trainer(default_root_dir=tmp_path, logger=False, limit_predict_batches=4, callbacks=cb) results = trainer.predict(model, dataloaders=dataloader) assert len(results) == 4 assert cb.write_on_batch_end.call_count == 4 @@ -54,7 +54,7 @@ def test_prediction_writer_hook_call_intervals(): DummyPredictionWriter.write_on_epoch_end.reset_mock() cb = DummyPredictionWriter("batch_and_epoch") - trainer = Trainer(limit_predict_batches=4, callbacks=cb) + trainer = Trainer(default_root_dir=tmp_path, logger=False, limit_predict_batches=4, callbacks=cb) trainer.predict(model, dataloaders=dataloader, return_predictions=False) assert cb.write_on_batch_end.call_count == 4 assert cb.write_on_epoch_end.call_count == 1 @@ -63,7 +63,7 @@ def test_prediction_writer_hook_call_intervals(): DummyPredictionWriter.write_on_epoch_end.reset_mock() cb = DummyPredictionWriter("batch") - trainer = Trainer(limit_predict_batches=4, callbacks=cb) + trainer = Trainer(default_root_dir=tmp_path, logger=False, limit_predict_batches=4, callbacks=cb) trainer.predict(model, dataloaders=dataloader, return_predictions=False) assert cb.write_on_batch_end.call_count == 4 assert cb.write_on_epoch_end.call_count == 0 @@ -72,21 +72,21 @@ def test_prediction_writer_hook_call_intervals(): 
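The hunks in this patch repeat a single pattern: every `Trainer` constructed in a test receives `default_root_dir=tmp_path` and, where no logs are needed, `logger=False`, so nothing is written into the repository checkout. A minimal sketch of that pattern follows; it is not part of the patch itself, and the test name is illustrative:

from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel

def test_writes_only_into_tmp_path(tmp_path):
    # Checkpoints and any other outputs land under pytest's tmp_path,
    # which is cleaned up automatically after the test run.
    trainer = Trainer(default_root_dir=tmp_path, logger=False, max_steps=1)
    trainer.fit(BoringModel())
    # The autouse `leave_no_artifacts_behind` fixture added above would fail
    # this test if files had appeared under the tests/ tree instead.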
DummyPredictionWriter.write_on_epoch_end.reset_mock() cb = DummyPredictionWriter("epoch") - trainer = Trainer(limit_predict_batches=4, callbacks=cb) + trainer = Trainer(default_root_dir=tmp_path, logger=False, limit_predict_batches=4, callbacks=cb) trainer.predict(model, dataloaders=dataloader, return_predictions=False) assert cb.write_on_batch_end.call_count == 0 assert cb.write_on_epoch_end.call_count == 1 @pytest.mark.parametrize("num_workers", [0, 2]) -def test_prediction_writer_batch_indices(num_workers): +def test_prediction_writer_batch_indices(num_workers, tmp_path): DummyPredictionWriter.write_on_batch_end = Mock() DummyPredictionWriter.write_on_epoch_end = Mock() dataloader = DataLoader(RandomDataset(32, 64), batch_size=4, num_workers=num_workers) model = BoringModel() writer = DummyPredictionWriter("batch_and_epoch") - trainer = Trainer(limit_predict_batches=4, callbacks=writer) + trainer = Trainer(default_root_dir=tmp_path, logger=False, limit_predict_batches=4, callbacks=writer) trainer.predict(model, dataloaders=dataloader) writer.write_on_batch_end.assert_has_calls([ @@ -101,7 +101,7 @@ def test_prediction_writer_batch_indices(num_workers): ]) -def test_batch_level_batch_indices(): +def test_batch_level_batch_indices(tmp_path): """Test that batch_indices are returned when `return_predictions=False`.""" DummyPredictionWriter.write_on_batch_end = Mock() @@ -112,7 +112,7 @@ def on_predict_epoch_end(self, *args, **kwargs): writer = DummyPredictionWriter("batch") model = CustomBoringModel() dataloader = DataLoader(RandomDataset(32, 64), batch_size=4) - trainer = Trainer(limit_predict_batches=4, callbacks=writer) + trainer = Trainer(default_root_dir=tmp_path, logger=False, limit_predict_batches=4, callbacks=writer) trainer.predict(model, dataloaders=dataloader, return_predictions=False) writer.write_on_batch_end.assert_has_calls([ diff --git a/tests/tests_pytorch/callbacks/test_pruning.py b/tests/tests_pytorch/callbacks/test_pruning.py index e0c6a1d9236dc..0a5d19ead213c 100644 --- a/tests/tests_pytorch/callbacks/test_pruning.py +++ b/tests/tests_pytorch/callbacks/test_pruning.py @@ -190,7 +190,7 @@ def test_pruning_callback_ddp_cpu(tmp_path): @pytest.mark.parametrize("resample_parameters", [False, True]) -def test_pruning_lth_callable(tmp_path, resample_parameters: bool): +def test_pruning_lth_callable(tmp_path, resample_parameters): model = TestModel() class ModelPruningTestCallback(ModelPruning): @@ -206,7 +206,7 @@ def apply_lottery_ticket_hypothesis(self): curr, curr_name = self._parameters_to_prune[i] assert name == curr_name actual, expected = getattr(curr, name).data, getattr(copy, name).data - allclose = torch.allclose(actual, expected) + allclose = torch.allclose(actual.cpu(), expected) assert not allclose if self._resample_parameters else allclose pruning = ModelPruningTestCallback( @@ -310,7 +310,13 @@ def on_save_checkpoint(self, trainer, pl_module, checkpoint): ckpt_callback = ModelCheckpoint( monitor="test", save_top_k=2, save_last=True, save_on_train_epoch_end=save_on_train_epoch_end ) - trainer = Trainer(callbacks=[pruning_callback, ckpt_callback], max_epochs=3, enable_progress_bar=False) + trainer = Trainer( + default_root_dir=tmp_path, + logger=False, + callbacks=[pruning_callback, ckpt_callback], + max_epochs=3, + enable_progress_bar=False, + ) with caplog.at_level(INFO): trainer.fit(model) diff --git a/tests/tests_pytorch/callbacks/test_spike.py b/tests/tests_pytorch/callbacks/test_spike.py index f4d0c946cefa0..5634feaf221cd 100644 --- 
a/tests/tests_pytorch/callbacks/test_spike.py +++ b/tests/tests_pytorch/callbacks/test_spike.py @@ -213,6 +213,8 @@ def test_trainer_spike_detection_integration(tmp_path, global_rank_spike, num_de cb.should_raise = spike_value is None or finite_only or spike_value == float("inf") trainer = Trainer( + default_root_dir=tmp_path, + logger=False, callbacks=[cb], accelerator="cpu", devices=num_devices, diff --git a/tests/tests_pytorch/callbacks/test_timer.py b/tests/tests_pytorch/callbacks/test_timer.py index 3a62acca2a026..e6359a2e9a5e1 100644 --- a/tests/tests_pytorch/callbacks/test_timer.py +++ b/tests/tests_pytorch/callbacks/test_timer.py @@ -26,24 +26,24 @@ from tests_pytorch.helpers.runif import RunIf -def test_trainer_flag(caplog): +def test_trainer_flag(caplog, tmp_path): class TestModel(BoringModel): def on_fit_start(self): raise SystemExit() - trainer = Trainer(max_time={"seconds": 1337}) + trainer = Trainer(default_root_dir=tmp_path, logger=False, max_time={"seconds": 1337}) with pytest.raises(SystemExit): trainer.fit(TestModel()) timer = [c for c in trainer.callbacks if isinstance(c, Timer)][0] assert timer._duration == 1337 - trainer = Trainer(max_time={"seconds": 1337}, callbacks=[Timer()]) + trainer = Trainer(default_root_dir=tmp_path, logger=False, max_time={"seconds": 1337}, callbacks=[Timer()]) with pytest.raises(SystemExit), caplog.at_level(level=logging.INFO): trainer.fit(TestModel()) assert "callbacks list already contains a Timer" in caplog.text # Make sure max_time still honored even if max_epochs == -1 - trainer = Trainer(max_time={"seconds": 1}, max_epochs=-1) + trainer = Trainer(default_root_dir=tmp_path, logger=False, max_time={"seconds": 1}, max_epochs=-1) with pytest.raises(SystemExit): trainer.fit(TestModel()) timer = [c for c in trainer.callbacks if isinstance(c, Timer)][0] diff --git a/tests/tests_pytorch/checkpointing/test_checkpoint_callback_frequency.py b/tests/tests_pytorch/checkpointing/test_checkpoint_callback_frequency.py index 44a284d92bbb2..ff8f3c95e43c5 100644 --- a/tests/tests_pytorch/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/tests_pytorch/checkpointing/test_checkpoint_callback_frequency.py @@ -24,7 +24,7 @@ def test_disabled_checkpointing(): # no callback - trainer = Trainer(max_epochs=3, enable_checkpointing=False) + trainer = Trainer(logger=False, max_epochs=3, enable_checkpointing=False) assert not trainer.checkpoint_callbacks trainer.fit(BoringModel()) assert not trainer.checkpoint_callbacks diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 9365dfb8d9bd3..be05768cab430 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -308,6 +308,17 @@ def single_process_pg(): os.environ.update(orig_environ) +@pytest.fixture(autouse=True) +def leave_no_artifacts_behind(): + tests_root = Path(__file__).parent.parent + files_before = {p for p in tests_root.rglob("*") if "__pycache__" not in p.parts} + yield + files_after = {p for p in tests_root.rglob("*") if "__pycache__" not in p.parts} + difference = files_after - files_before + difference = {str(f.relative_to(tests_root)) for f in difference} + assert not difference, f"Test left artifacts behind: {difference}" + + def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.Config) -> None: initial_size = len(items) conditions = [] diff --git a/tests/tests_pytorch/core/test_datamodules.py b/tests/tests_pytorch/core/test_datamodules.py index 926cd3d9e3bf5..42a4db8270b68 100644 --- 
a/tests/tests_pytorch/core/test_datamodules.py +++ b/tests/tests_pytorch/core/test_datamodules.py @@ -452,11 +452,12 @@ class BoringDataModule2(LightningDataModule): @RunIf(skip_windows=True) # TODO: all durations are 0 on Windows -def test_datamodule_hooks_are_profiled(): +def test_datamodule_hooks_are_profiled(tmp_path): """Test that `LightningDataModule` hooks are profiled.""" def get_trainer(): return Trainer( + default_root_dir=tmp_path, max_steps=1, limit_val_batches=0, profiler="simple", diff --git a/tests/tests_pytorch/core/test_lightning_optimizer.py b/tests/tests_pytorch/core/test_lightning_optimizer.py index 33a66308271ca..89e7c5ee5c5c7 100644 --- a/tests/tests_pytorch/core/test_lightning_optimizer.py +++ b/tests/tests_pytorch/core/test_lightning_optimizer.py @@ -23,6 +23,8 @@ from lightning.pytorch.tuner.tuning import Tuner from torch.optim import SGD, Adam, Optimizer +from tests_pytorch.helpers.runif import RunIf + @pytest.mark.parametrize("auto", [True, False]) def test_lightning_optimizer(tmp_path, auto): @@ -232,6 +234,7 @@ def configure_optimizers(self): assert sgd["zero_grad"].call_count == limit_train_batches +@RunIf(mps=False) # mps does not support LBFGS def test_lightning_optimizer_automatic_optimization_lbfgs_zero_grad(tmp_path): """Test zero_grad is called the same number of times as LBFGS requires for reevaluation of the loss in automatic_optimization.""" diff --git a/tests/tests_pytorch/core/test_metric_result_integration.py b/tests/tests_pytorch/core/test_metric_result_integration.py index 3d49190c9b4bb..e966a62641fc3 100644 --- a/tests/tests_pytorch/core/test_metric_result_integration.py +++ b/tests/tests_pytorch/core/test_metric_result_integration.py @@ -395,7 +395,7 @@ def on_train_epoch_end(self) -> None: @pytest.mark.parametrize( "kwargs", [ - {}, + pytest.param({}, marks=RunIf(mps=False)), pytest.param({"strategy": "ddp", "accelerator": "gpu", "devices": 1}, marks=RunIf(min_cuda_gpus=1)), pytest.param( {"strategy": "ddp", "accelerator": "gpu", "devices": 2}, marks=RunIf(min_cuda_gpus=2, standalone=True) diff --git a/tests/tests_pytorch/core/test_saving.py b/tests/tests_pytorch/core/test_saving.py index e20d5cb803b85..2bb08c18381a5 100644 --- a/tests/tests_pytorch/core/test_saving.py +++ b/tests/tests_pytorch/core/test_saving.py @@ -13,6 +13,8 @@ def create_boring_checkpoint(tmp_path, model, accelerator="cuda"): checkpoint_callback = ModelCheckpoint(dirpath=tmp_path, filename="checkpoint") trainer = pl.Trainer( + default_root_dir=tmp_path, + logger=False, devices=1, accelerator=accelerator, max_epochs=1, diff --git a/tests/tests_pytorch/helpers/datasets.py b/tests/tests_pytorch/helpers/datasets.py index 8860160d6f30e..3769e76d9edde 100644 --- a/tests/tests_pytorch/helpers/datasets.py +++ b/tests/tests_pytorch/helpers/datasets.py @@ -39,14 +39,6 @@ class MNIST(Dataset): download: If true, downloads the dataset from the internet and puts it in root directory. If dataset is already downloaded, it is not downloaded again. 
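The `Examples` blocks removed from these docstrings were executable doctests; collecting them instantiated the dataset with `root="."`, downloading MNIST into the working copy, which is exactly the leftover the new `leave_no_artifacts_behind` fixture rejects. A sketch of what the removed doctest effectively executed (using the MNIST helper defined in this file):

# Old doctest behaviour: the download lands next to the source tree.
dataset = MNIST(".", download=True)
assert len(dataset) == 60000

The same assertions reappear as regular pytest tests in `test_datasets.py` further below, with the download contained in `tmp_path`.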
- - Examples: - >>> dataset = MNIST(".", download=True) - >>> len(dataset) - 60000 - >>> torch.bincount(dataset.targets) - tensor([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949]) - """ RESOURCES = ( @@ -141,15 +133,6 @@ class TrialMNIST(MNIST): digits: list selected MNIST digits/classes kwargs: Same as MNIST - Examples: - >>> dataset = TrialMNIST(".", download=True) - >>> len(dataset) - 300 - >>> sorted(set([d.item() for d in dataset.targets])) - [0, 1, 2] - >>> torch.bincount(dataset.targets) - tensor([100, 100, 100]) - """ def __init__(self, root: str, num_samples: int = 100, digits: Optional[Sequence] = (0, 1, 2), **kwargs): diff --git a/tests/tests_pytorch/helpers/test_datasets.py b/tests/tests_pytorch/helpers/test_datasets.py index 5731b857303e2..0af0a93b709a5 100644 --- a/tests/tests_pytorch/helpers/test_datasets.py +++ b/tests/tests_pytorch/helpers/test_datasets.py @@ -15,11 +15,25 @@ import cloudpickle import pytest +import torch from tests_pytorch import _PATH_DATASETS from tests_pytorch.helpers.datasets import MNIST, AverageDataset, TrialMNIST +def test_mnist(tmp_path): + dataset = MNIST(tmp_path, download=True) + assert len(dataset) == 60000 + assert torch.bincount(dataset.targets).tolist() == [5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949] + + +def test_trial_mnist(tmp_path): + dataset = TrialMNIST(tmp_path, download=True) + assert len(dataset) == 300 + assert set(dataset.targets.tolist()) == {0, 1, 2} + assert torch.bincount(dataset.targets).tolist() == [100, 100, 100] + + @pytest.mark.parametrize( ("dataset_cls", "args"), [(MNIST, {"root": _PATH_DATASETS}), (TrialMNIST, {"root": _PATH_DATASETS}), (AverageDataset, {})], diff --git a/tests/tests_pytorch/helpers/utils.py b/tests/tests_pytorch/helpers/utils.py index 418e4328d2c64..ccd50cca301a0 100644 --- a/tests/tests_pytorch/helpers/utils.py +++ b/tests/tests_pytorch/helpers/utils.py @@ -18,23 +18,19 @@ from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.loggers import TensorBoardLogger -from tests_pytorch import _TEMP_PATH - def get_default_logger(save_dir, version=None): # set up logger object without actually saving logs return TensorBoardLogger(save_dir, name="lightning_logs", version=version) -def get_data_path(expt_logger, path_dir=None): +def get_data_path(expt_logger, path_dir): # some calls contain only experiment not complete logger # each logger has to have these attributes name, version = expt_logger.name, expt_logger.version # the other experiments... 
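With the `_TEMP_PATH` fallback removed (see the deletion just below), `path_dir` becomes a required argument of `get_data_path`, so call sites must pass the base directory explicitly, typically pytest's `tmp_path`. A hypothetical call site, assuming the `get_default_logger` helper defined above:

# Hypothetical call site; no implicit test_temp/ default anymore.
logger = get_default_logger(save_dir=str(tmp_path))
path = get_data_path(logger, path_dir=str(tmp_path))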
- if not path_dir: - path_dir = expt_logger.save_dir if hasattr(expt_logger, "save_dir") and expt_logger.save_dir else _TEMP_PATH path_expt = os.path.join(path_dir, name, "version_%s" % version) # try if the new sub-folder exists, typical case for test-tube diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py index 62ef2e6b6db52..ef37f23a94d7e 100644 --- a/tests/tests_pytorch/loggers/test_all.py +++ b/tests/tests_pytorch/loggers/test_all.py @@ -70,8 +70,9 @@ def _instantiate_logger(logger_class, save_dir, **override_kwargs): @mock.patch.dict(os.environ, {}) @mock.patch("lightning.pytorch.loggers.mlflow._get_resolve_tags", Mock()) @pytest.mark.parametrize("logger_class", ALL_LOGGER_CLASSES) -def test_loggers_fit_test_all(logger_class, mlflow_mock, wandb_mock, comet_mock, neptune_mock, tmp_path): +def test_loggers_fit_test_all(logger_class, mlflow_mock, wandb_mock, comet_mock, neptune_mock, tmp_path, monkeypatch): """Verify that basic functionality of all loggers.""" + monkeypatch.chdir(tmp_path) class CustomModel(BoringModel): def training_step(self, batch, batch_idx): @@ -116,12 +117,12 @@ def log_metrics(self, metrics, step): model = CustomModel() trainer = Trainer( + default_root_dir=tmp_path, max_epochs=1, logger=logger, limit_train_batches=1, limit_val_batches=1, log_every_n_steps=1, - default_root_dir=tmp_path, ) trainer.fit(model) trainer.test() diff --git a/tests/tests_pytorch/loggers/test_csv.py b/tests/tests_pytorch/loggers/test_csv.py index a03b8a7d62f78..27b85bb4ad745 100644 --- a/tests/tests_pytorch/loggers/test_csv.py +++ b/tests/tests_pytorch/loggers/test_csv.py @@ -168,7 +168,7 @@ def test_metrics_reset_after_save(tmp_path): # Mock the existance check, so we can simulate appending to the metrics file "lightning.fabric.loggers.csv_logs._ExperimentWriter._check_log_dir_exists" ) -def test_append_metrics_file(tmp_path): +def test_append_metrics_file(_, tmp_path): """Test that the logger appends to the file instead of rewriting it on every save.""" logger = CSVLogger(tmp_path, name="test", version=0, flush_logs_every_n_steps=1) diff --git a/tests/tests_pytorch/loggers/test_neptune.py b/tests/tests_pytorch/loggers/test_neptune.py index 13941c18db8e8..0a39337ac5c16 100644 --- a/tests/tests_pytorch/loggers/test_neptune.py +++ b/tests/tests_pytorch/loggers/test_neptune.py @@ -149,15 +149,17 @@ def test_neptune_additional_methods(neptune_mock): run_instance_mock.__getitem__().log.assert_called_once_with(torch.ones(1)) -def test_neptune_leave_open_experiment_after_fit(neptune_mock, tmp_path): +def test_neptune_leave_open_experiment_after_fit(neptune_mock, tmp_path, monkeypatch): """Verify that neptune experiment was NOT closed after training.""" + monkeypatch.chdir(tmp_path) logger, run_instance_mock, _ = _get_logger_with_mocks(api_key="test", project="project") _fit_and_test(logger=logger, model=BoringModel(), tmp_path=tmp_path) assert run_instance_mock.stop.call_count == 0 -def test_neptune_log_metrics_on_trained_model(neptune_mock, tmp_path): +def test_neptune_log_metrics_on_trained_model(neptune_mock, tmp_path, monkeypatch): """Verify that trained models do log data.""" + monkeypatch.chdir(tmp_path) class LoggingModel(BoringModel): def on_validation_epoch_end(self): @@ -305,9 +307,10 @@ def test_get_full_model_names_from_exp_structure(): assert NeptuneLogger._get_full_model_names_from_exp_structure(input_dict, "foo/bar") == expected_keys -def test_inactive_run(neptune_mock, tmp_path): +def test_inactive_run(neptune_mock, tmp_path, 
monkeypatch): from neptune.exceptions import InactiveRunException + monkeypatch.chdir(tmp_path) logger, run_instance_mock, _ = _get_logger_with_mocks(api_key="test", project="project") run_instance_mock.__setitem__.side_effect = InactiveRunException diff --git a/tests/tests_pytorch/loops/test_training_epoch_loop.py b/tests/tests_pytorch/loops/test_training_epoch_loop.py index 16ed3842e3a96..a110a20bfaf84 100644 --- a/tests/tests_pytorch/loops/test_training_epoch_loop.py +++ b/tests/tests_pytorch/loops/test_training_epoch_loop.py @@ -30,6 +30,7 @@ def test_no_val_on_train_epoch_loop_restart(tmp_path): "limit_train_batches": 1, "limit_val_batches": 1, "num_sanity_val_steps": 0, + "logger": False, "enable_checkpointing": False, } trainer = Trainer(**trainer_kwargs) diff --git a/tests/tests_pytorch/models/test_restore.py b/tests/tests_pytorch/models/test_restore.py index 211dc49d42eda..a76fb1e216a8b 100644 --- a/tests/tests_pytorch/models/test_restore.py +++ b/tests/tests_pytorch/models/test_restore.py @@ -258,7 +258,7 @@ def on_train_epoch_end(self, *_): def test_try_resume_from_non_existing_checkpoint(tmp_path): """Test that trying to resume from non-existing `ckpt_path` fails with an error.""" model = BoringModel() - trainer = Trainer() + trainer = Trainer(logger=False) with pytest.raises(FileNotFoundError, match="Checkpoint file not found"): trainer.fit(model, ckpt_path=str(tmp_path / "non_existing.ckpt")) diff --git a/tests/tests_pytorch/plugins/precision/test_double.py b/tests/tests_pytorch/plugins/precision/test_double.py index f985c09888847..1ee89752fcbae 100644 --- a/tests/tests_pytorch/plugins/precision/test_double.py +++ b/tests/tests_pytorch/plugins/precision/test_double.py @@ -134,6 +134,7 @@ def training_step(self, batch, batch_idx): return super().training_step(batch, batch_idx) +@RunIf(mps=False) # mps does not support float64 @pytest.mark.parametrize( "boring_model", [ diff --git a/tests/tests_pytorch/serve/test_servable_module_validator.py b/tests/tests_pytorch/serve/test_servable_module_validator.py index fce882852823b..7c883c419ea82 100644 --- a/tests/tests_pytorch/serve/test_servable_module_validator.py +++ b/tests/tests_pytorch/serve/test_servable_module_validator.py @@ -37,7 +37,7 @@ def test_servable_module_validator(): @pytest.mark.flaky(reruns=3) -def test_servable_module_validator_with_trainer(tmp_path): +def test_servable_module_validator_with_trainer(tmp_path, mps_count_0): callback = ServableModuleValidator() trainer = Trainer( default_root_dir=tmp_path, diff --git a/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py b/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py index a3af440292306..b0462c0105a9f 100644 --- a/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py +++ b/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py @@ -194,14 +194,16 @@ def on_fit_start(self) -> None: assert torch.equal(self.layer.weight.data, self.tied_layer.weight.data) -def test_memory_sharing_disabled(): +def test_memory_sharing_disabled(tmp_path): """Test that the multiprocessing launcher disables memory sharing on model parameters and buffers to avoid race conditions on model updates.""" model = SimpleModel() assert not model.layer.weight.is_shared() assert model.layer.weight.data_ptr() == model.tied_layer.weight.data_ptr() - trainer = Trainer(accelerator="cpu", devices=2, strategy="ddp_spawn", max_steps=0) + trainer = Trainer( + default_root_dir=tmp_path, logger=False, accelerator="cpu", devices=2, strategy="ddp_spawn", 
max_steps=0 + ) trainer.fit(model) @@ -214,7 +216,7 @@ def test_check_for_missing_main_guard(): launcher.launch(function=Mock()) -def test_fit_twice_raises(): +def test_fit_twice_raises(mps_count_0): model = BoringModel() trainer = Trainer( limit_train_batches=1, diff --git a/tests/tests_pytorch/strategies/test_ddp_integration.py b/tests/tests_pytorch/strategies/test_ddp_integration.py index 17135a98fc089..bd86bfa6ce42d 100644 --- a/tests/tests_pytorch/strategies/test_ddp_integration.py +++ b/tests/tests_pytorch/strategies/test_ddp_integration.py @@ -284,10 +284,10 @@ def configure_optimizers(self): @RunIf(min_cuda_gpus=2, skip_windows=True) @pytest.mark.parametrize("strategy", [pytest.param("ddp", marks=RunIf(standalone=True)), "ddp_spawn"]) -def test_ddp_strategy_checkpoint_zero_redundancy_optimizer(tmp_path, strategy): +def test_ddp_strategy_checkpoint_zero_redundancy_optimizer(strategy, tmp_path): """Test to ensure that checkpoint is saved correctly when using zero redundancy optimizer.""" model = BoringZeroRedundancyOptimizerModel() - trainer = Trainer(accelerator="gpu", devices=2, strategy=strategy, max_steps=1) + trainer = Trainer(default_root_dir=tmp_path, accelerator="gpu", devices=2, strategy=strategy, max_steps=1) trainer.fit(model) diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index e27aec2e7989a..76c336fecf107 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -630,7 +630,7 @@ def test_fsdp_strategy_save_optimizer_states(tmp_path, wrap_min_params): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.parametrize("wrap_min_params", [2, 1024, 100000000]) -def test_fsdp_strategy_load_optimizer_states(tmp_path, wrap_min_params): +def test_fsdp_strategy_load_optimizer_states(wrap_min_params, tmp_path): """Test to ensure that the full state dict and optimizer states can be load when using FSDP strategy. Based on `wrap_min_params`, the model will be fully wrapped, half wrapped, and not wrapped at all. 
If the DDP model @@ -694,14 +694,17 @@ def test_fsdp_strategy_load_optimizer_states(tmp_path, wrap_min_params): ("32-true", torch.float32), ], ) -def test_configure_model(precision, expected_dtype): +def test_configure_model(precision, expected_dtype, tmp_path): """Test that the module under configure_model gets moved to the right device and dtype.""" trainer = Trainer( + default_root_dir=tmp_path, accelerator="cuda", devices=2, strategy=FSDPStrategy(auto_wrap_policy=always_wrap_policy), precision=precision, max_epochs=1, + enable_checkpointing=False, + logger=False, ) class MyModel(BoringModel): @@ -899,7 +902,7 @@ def test_fsdp_lazy_load_full_state_dict(_, lazy_load_mock, torch_load_mock, tmp_ pytest.param("bf16-true", torch.bfloat16, marks=RunIf(bf16_cuda=True)), ], ) -def test_module_init_context(precision, expected_dtype): +def test_module_init_context(precision, expected_dtype, tmp_path): """Test that the module under the init-context gets moved to the right device and dtype.""" class Model(BoringModel): @@ -915,12 +918,15 @@ def on_train_start(self): def _run_setup_assertions(empty_init, expected_device): trainer = Trainer( + default_root_dir=tmp_path, accelerator="cuda", devices=2, strategy=FSDPStrategy(auto_wrap_policy={torch.nn.Linear}), precision=precision, max_steps=1, barebones=True, + enable_checkpointing=False, + logger=False, ) with trainer.init_module(empty_init=empty_init): model = Model() diff --git a/tests/tests_pytorch/strategies/test_registry.py b/tests/tests_pytorch/strategies/test_registry.py index abc1c83ec5143..90e15638bfd06 100644 --- a/tests/tests_pytorch/strategies/test_registry.py +++ b/tests/tests_pytorch/strategies/test_registry.py @@ -40,7 +40,7 @@ def test_strategy_registry_with_deepspeed_strategies(strategy_name, init_params) @RunIf(deepspeed=True) @pytest.mark.parametrize("strategy", ["deepspeed", "deepspeed_stage_2_offload", "deepspeed_stage_3"]) -def test_deepspeed_strategy_registry_with_trainer(tmp_path, strategy): +def test_deepspeed_strategy_registry_with_trainer(tmp_path, strategy, mps_count_0): trainer = Trainer(default_root_dir=tmp_path, strategy=strategy, precision="16-mixed") assert isinstance(trainer.strategy, DeepSpeedStrategy) diff --git a/tests/tests_pytorch/trainer/flags/test_env_vars.py b/tests/tests_pytorch/trainer/flags/test_env_vars.py index 62c94d4cc277e..b47bf2d5b03fb 100644 --- a/tests/tests_pytorch/trainer/flags/test_env_vars.py +++ b/tests/tests_pytorch/trainer/flags/test_env_vars.py @@ -25,7 +25,7 @@ def test_passing_no_env_variables(): assert trainer.logger is not None assert trainer.max_steps == -1 assert trainer.max_epochs is None - trainer = Trainer(logger=False, max_steps=1) + trainer = Trainer(max_steps=1, logger=False, enable_checkpointing=False) trainer.fit(model) assert trainer.logger is None assert trainer.max_steps == 1 @@ -49,7 +49,7 @@ def test_passing_env_variables_defaults(): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "PL_TRAINER_DEVICES": "2"}) -def test_passing_env_variables_devices(cuda_count_2): +def test_passing_env_variables_devices(cuda_count_2, mps_count_0): """Testing overwriting trainer arguments.""" trainer = Trainer() assert trainer.num_devices == 2 diff --git a/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py b/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py index 73e8017dcfcfc..25aaeb8cff77e 100644 --- a/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py +++ b/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py @@ -36,7 +36,7 @@ def 
test_min_max_steps_epochs(tmp_path, min_epochs, max_epochs, min_steps, max_s assert trainer.global_step == trainer.max_steps -def test_max_epochs_not_set_warning(): +def test_max_epochs_not_set_warning(tmp_path): """Test that a warning is only emitted when `max_epochs` was not set by the user.""" class CustomModel(BoringModel): @@ -46,7 +46,7 @@ def training_step(self, *args, **kwargs): match = "`max_epochs` was not set. Setting it to 1000 epochs." model = CustomModel() - trainer = Trainer(max_epochs=None, limit_train_batches=1) + trainer = Trainer(logger=False, enable_checkpointing=False, max_epochs=None, limit_train_batches=1) with pytest.warns(PossibleUserWarning, match=match): trainer.fit(model) diff --git a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py index ac94cc0482c47..b776263e9953d 100644 --- a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py +++ b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py @@ -37,7 +37,13 @@ def on_validation_epoch_start(self) -> None: self.val_epoch_calls += 1 model = TestModel() - trainer = Trainer(max_epochs=max_epochs, val_check_interval=1 / denominator, logger=False) + trainer = Trainer( + default_root_dir=tmp_path, + enable_checkpointing=False, + logger=False, + max_epochs=max_epochs, + val_check_interval=1 / denominator, + ) trainer.fit(model) assert model.train_epoch_calls == max_epochs @@ -107,6 +113,8 @@ def test_validation_check_interval_exceed_data_length_wrong(): trainer = Trainer( limit_train_batches=10, val_check_interval=100, + logger=False, + enable_checkpointing=False, ) model = BoringModel() diff --git a/tests/tests_pytorch/trainer/optimization/test_backward_calls.py b/tests/tests_pytorch/trainer/optimization/test_backward_calls.py index e464f6dbac58d..b91dbff8c6d09 100644 --- a/tests/tests_pytorch/trainer/optimization/test_backward_calls.py +++ b/tests/tests_pytorch/trainer/optimization/test_backward_calls.py @@ -11,7 +11,7 @@ def test_backward_count_simple(torch_backward, num_steps): """Test that backward is called exactly once per step.""" model = BoringModel() - trainer = Trainer(max_steps=num_steps) + trainer = Trainer(max_steps=num_steps, logger=False, enable_checkpointing=False) trainer.fit(model) assert torch_backward.call_count == num_steps @@ -25,19 +25,21 @@ def test_backward_count_simple(torch_backward, num_steps): def test_backward_count_with_grad_accumulation(torch_backward): """Test that backward is called the correct number of times when accumulating gradients.""" model = BoringModel() - trainer = Trainer(max_epochs=1, limit_train_batches=6, accumulate_grad_batches=2) + trainer = Trainer( + max_epochs=1, limit_train_batches=6, accumulate_grad_batches=2, logger=False, enable_checkpointing=False + ) trainer.fit(model) assert torch_backward.call_count == 6 torch_backward.reset_mock() - trainer = Trainer(max_steps=6, accumulate_grad_batches=2) + trainer = Trainer(max_steps=6, accumulate_grad_batches=2, logger=False, enable_checkpointing=False) trainer.fit(model) assert torch_backward.call_count == 12 @patch("torch.Tensor.backward") -def test_backward_count_with_closure(torch_backward): +def test_backward_count_with_closure(torch_backward, tmp_path): """Using a closure (e.g. 
with LBFGS) should lead to no extra backward calls.""" class TestModel(BoringModel): @@ -45,12 +47,12 @@ def configure_optimizers(self): return torch.optim.LBFGS(self.parameters(), lr=0.1) model = TestModel() - trainer = Trainer(max_steps=5) + trainer = Trainer(max_steps=5, logger=False, enable_checkpointing=False) trainer.fit(model) assert torch_backward.call_count == 5 torch_backward.reset_mock() - trainer = Trainer(max_steps=5, accumulate_grad_batches=2) + trainer = Trainer(max_steps=5, accumulate_grad_batches=2, logger=False, enable_checkpointing=False) trainer.fit(model) assert torch_backward.call_count == 10 diff --git a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py index 449a1d72ed3a0..f0ab8fe401633 100644 --- a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py +++ b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py @@ -910,7 +910,7 @@ def configure_optimizers(self): return [optimizer], [scheduler] model = Model() - trainer = Trainer(accelerator="cpu", max_epochs=0) + trainer = Trainer(accelerator="cpu", max_epochs=0, logger=False, enable_checkpointing=False) if automatic_optimization: with pytest.raises(MisconfigurationException, match="doesn't follow PyTorch's LRScheduler"): trainer.fit(model) diff --git a/tests/tests_pytorch/trainer/optimization/test_multiple_optimizers.py b/tests/tests_pytorch/trainer/optimization/test_multiple_optimizers.py index fbba239303352..319eafeb0d0bb 100644 --- a/tests/tests_pytorch/trainer/optimization/test_multiple_optimizers.py +++ b/tests/tests_pytorch/trainer/optimization/test_multiple_optimizers.py @@ -36,7 +36,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): model = TestModel() model.automatic_optimization = True - trainer = pl.Trainer() + trainer = pl.Trainer(logger=False, enable_checkpointing=False) with pytest.raises(RuntimeError, match="Remove the `optimizer_idx` argument from `training_step`"): trainer.fit(model) @@ -47,7 +47,7 @@ def configure_optimizers(self): model = TestModel() model.automatic_optimization = True - trainer = pl.Trainer() + trainer = pl.Trainer(logger=False, enable_checkpointing=False) with pytest.raises(RuntimeError, match="multiple optimizers is only supported with manual optimization"): trainer.fit(model) diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index f178f8395cd45..0f9d80424528a 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -86,9 +86,9 @@ def test_num_stepping_batches_infinite_training(): @pytest.mark.parametrize("max_steps", [2, 100]) -def test_num_stepping_batches_with_max_steps(max_steps): +def test_num_stepping_batches_with_max_steps(max_steps, tmp_path): """Test stepping batches with `max_steps`.""" - trainer = Trainer(max_steps=max_steps) + trainer = Trainer(max_steps=max_steps, default_root_dir=tmp_path, logger=False, enable_checkpointing=False) model = BoringModel() trainer.fit(model) assert trainer.estimated_stepping_batches == max_steps diff --git a/tests/tests_pytorch/trainer/test_dataloaders.py b/tests/tests_pytorch/trainer/test_dataloaders.py index cea398eb6f501..75cb3cd45fcf0 100644 --- a/tests/tests_pytorch/trainer/test_dataloaders.py +++ b/tests/tests_pytorch/trainer/test_dataloaders.py @@ -679,7 +679,11 @@ def 
test_warning_with_small_dataloader_and_logging_interval(tmp_path): with pytest.warns(UserWarning, match=r"The number of training batches \(1\) is smaller than the logging interval"): trainer = Trainer( - default_root_dir=tmp_path, max_epochs=1, log_every_n_steps=2, limit_train_batches=1, logger=CSVLogger(".") + default_root_dir=tmp_path, + max_epochs=1, + log_every_n_steps=2, + limit_train_batches=1, + logger=CSVLogger(tmp_path), ) trainer.fit(model) @@ -727,7 +731,7 @@ def __len__(self): @pytest.mark.parametrize("yield_at_all", [False, True]) -def test_iterable_dataset_stop_iteration_at_epoch_beginning(yield_at_all): +def test_iterable_dataset_stop_iteration_at_epoch_beginning(yield_at_all, tmp_path): """Test that the training loop skips execution if the iterator is empty from the start.""" class TestDataset(IterableDataset): @@ -748,7 +752,8 @@ def gen(self): model = TestModel() train_dataloader = DataLoader(TestDataset(model.gen), batch_size=2) trainer = Trainer( - default_root_dir=os.getcwd(), + default_root_dir=tmp_path, + logger=False, max_epochs=2, enable_model_summary=False, ) diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index c4af0d37453ee..1791f498d5512 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -2032,7 +2032,7 @@ def on_fit_start(self): @pytest.mark.parametrize("exception_type", [KeyboardInterrupt, RuntimeError]) -def test_trainer_calls_strategy_on_exception(exception_type): +def test_trainer_calls_strategy_on_exception(exception_type, tmp_path): """Test that when an exception occurs, the Trainer lets the strategy process it.""" exception = exception_type("Test exception") @@ -2040,7 +2040,7 @@ class ExceptionModel(BoringModel): def on_fit_start(self): raise exception - trainer = Trainer() + trainer = Trainer(default_root_dir=tmp_path) with mock.patch("lightning.pytorch.strategies.strategy.Strategy.on_exception") as on_exception_mock, suppress( Exception ): @@ -2049,7 +2049,7 @@ def on_fit_start(self): @pytest.mark.parametrize("exception_type", [KeyboardInterrupt, RuntimeError]) -def test_trainer_calls_datamodule_on_exception(exception_type): +def test_trainer_calls_datamodule_on_exception(exception_type, tmp_path): """Test that when an exception occurs, the Trainer lets the data module process it.""" exception = exception_type("Test exception") @@ -2059,7 +2059,7 @@ def on_fit_start(self): datamodule = BoringDataModule() datamodule.on_exception = Mock() - trainer = Trainer() + trainer = Trainer(default_root_dir=tmp_path) with suppress(Exception): trainer.fit(ExceptionModel(), datamodule=datamodule) diff --git a/tests/tests_pytorch/tuner/test_scale_batch_size.py b/tests/tests_pytorch/tuner/test_scale_batch_size.py index 6ab81c581afa3..8dd66fe9bfcff 100644 --- a/tests/tests_pytorch/tuner/test_scale_batch_size.py +++ b/tests/tests_pytorch/tuner/test_scale_batch_size.py @@ -438,7 +438,7 @@ class CustomModel(BoringModel): def val_dataloader(self): return [super().val_dataloader(), super().val_dataloader()] - trainer = Trainer() + trainer = Trainer(logger=False, enable_checkpointing=False) tuner = Tuner(trainer) model = CustomModel() From 49ed2b102befd4c9876e2d14badeb1251af5b2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 29 Apr 2024 13:16:13 +0200 Subject: [PATCH 016/179] Add PyTorch 2.3 to CI matrix (#19708) --- .azure/gpu-benchmarks.yml | 3 +-- .azure/gpu-tests-fabric.yml | 5 ++--- .azure/gpu-tests-pytorch.yml | 5 ++--- 
.github/checkgroup.yml | 6 ++++++ .github/workflows/ci-tests-fabric.yml | 7 ++++++- .github/workflows/ci-tests-pytorch.yml | 7 ++++++- .github/workflows/docker-build.yml | 1 + dockers/base-cuda/Dockerfile | 6 ------ requirements/fabric/base.txt | 2 +- requirements/fabric/examples.txt | 2 +- requirements/pytorch/base.txt | 2 +- requirements/pytorch/examples.txt | 3 +-- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/fabric/accelerators/mps.py | 4 +++- src/lightning/pytorch/CHANGELOG.md | 3 ++- tests/tests_fabric/plugins/precision/test_fsdp.py | 4 +++- tests/tests_pytorch/callbacks/test_finetuning_callback.py | 5 +++++ tests/tests_pytorch/conftest.py | 1 + tests/tests_pytorch/plugins/precision/test_fsdp.py | 4 +++- 19 files changed, 46 insertions(+), 26 deletions(-) diff --git a/.azure/gpu-benchmarks.yml b/.azure/gpu-benchmarks.yml index bae7babf69266..d869084fb351b 100644 --- a/.azure/gpu-benchmarks.yml +++ b/.azure/gpu-benchmarks.yml @@ -46,8 +46,7 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: - # TODO: Upgrade to Python 3.11 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" options: "--gpus=all --shm-size=32g" strategy: matrix: diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ba86449e92355..1a854604606aa 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -56,12 +56,11 @@ jobs: options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp" strategy: matrix: - # TODO: Upgrade to Python 3.11 "Fabric | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "fabric" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "lightning" workspace: clean: all diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b9ab6ead7f0d1..156513d604210 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -48,12 +48,11 @@ jobs: cancelTimeoutInMinutes: "2" strategy: matrix: - # TODO: Upgrade to Python 3.11 "PyTorch | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "pytorch" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "lightning" pool: lit-rtx-3090 variables: diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 37f1e3cd844d2..0c5e7e4579ccb 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -23,14 +23,17 @@ subprojects: - "pl-cpu (macOS-11, lightning, 3.10, 2.0)" - "pl-cpu (macOS-11, lightning, 3.10, 2.1)" - "pl-cpu (macOS-11, lightning, 3.10, 2.2)" + - "pl-cpu (macOS-14, lightning, 3.10, 2.3)" - "pl-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.2)" + - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.3)" - "pl-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" - "pl-cpu (windows-2022, lightning, 3.10, 2.0)" - 
"pl-cpu (windows-2022, lightning, 3.10, 2.1)" - "pl-cpu (windows-2022, lightning, 3.10, 2.2)" + - "pl-cpu (windows-2022, lightning, 3.10, 2.3)" - "pl-cpu (macOS-11, pytorch, 3.8, 2.0)" - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 2.0)" - "pl-cpu (windows-2022, pytorch, 3.8, 2.0)" @@ -171,14 +174,17 @@ subprojects: - "fabric-cpu (macOS-11, lightning, 3.10, 2.0)" - "fabric-cpu (macOS-11, lightning, 3.11, 2.1)" - "fabric-cpu (macOS-11, lightning, 3.11, 2.2)" + - "fabric-cpu (macOS-14, lightning, 3.10, 2.3)" - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.1)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2)" + - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.3)" - "fabric-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" - "fabric-cpu (windows-2022, lightning, 3.10, 2.0)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.2)" + - "fabric-cpu (windows-2022, lightning, 3.11, 2.3)" - "fabric-cpu (macOS-11, fabric, 3.8, 2.0)" - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 2.0)" - "fabric-cpu (windows-2022, fabric, 3.8, 2.0)" diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 61c60889a5aa0..082636a617227 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -49,6 +49,9 @@ jobs: - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues - { os: "macOS-12", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" } - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" } @@ -83,6 +86,8 @@ jobs: PYPI_CACHE_DIR: "_pip-wheels" TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html" TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch_test.html" + # TODO: Remove this - Enable running MPS tests on this platform + DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: - uses: actions/checkout@v4 @@ -119,7 +124,7 @@ jobs: - name: Env. 
variables run: | # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.2' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.3' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index b7f5b14baf255..b0b10ef4acea8 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -53,6 +53,9 @@ jobs: - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues - { os: "macOS-12", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" } - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" } @@ -88,6 +91,8 @@ jobs: TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch_test.html" FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} PYPI_CACHE_DIR: "_pip-wheels" + # TODO: Remove this - Enable running MPS tests on this platform + DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: - uses: actions/checkout@v4 @@ -125,7 +130,7 @@ jobs: - name: Env. 
variables run: | # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.2' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.3' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 7ea9f824bb6b1..73c6e7496f9fa 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -109,6 +109,7 @@ jobs: - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" } # - { python_version: "3.12", pytorch_version: "2.2", cuda_version: "12.1.0" } # todo: pending on `onnxruntime` steps: - uses: actions/checkout@v4 diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index d5b72768148ed..b8c29d01b0f77 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -34,13 +34,7 @@ ENV \ MAKEFLAGS="-j2" RUN \ - # TODO: Remove the manual key installation once the base image is updated. - # https://github.com/NVIDIA/nvidia-docker/issues/1631 - # https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214 apt-get update && apt-get install -y wget && \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - mkdir -p /etc/apt/keyrings/ && mv 3bf863cc.pub /etc/apt/keyrings/ && \ - echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" /etc/apt/sources.list.d/cuda.list && \ apt-get update -qq --fix-missing && \ NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ CUDA_VERSION_MM=${CUDA_VERSION%.*} && \ diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 3a6cdbacd302f..4cac78e19bf23 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.17.2, <1.27.0 -torch >=2.0.0, <2.3.0 +torch >=2.0.0, <2.4.0 fsspec[http] >=2022.5.0, <2023.11.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt index d0be7e3af8496..0e2feb97eccc4 100644 --- a/requirements/fabric/examples.txt +++ b/requirements/fabric/examples.txt @@ -1,6 +1,6 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torchvision >=0.15.0, <0.18.0 +torchvision >=0.15.0, <0.19.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 3578917e2cdf0..9af0b13c15ccd 100644 --- a/requirements/pytorch/base.txt 
+++ b/requirements/pytorch/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.17.2, <1.27.0 -torch >=2.0.0, <2.3.0 +torch >=2.0.0, <2.4.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 fsspec[http] >=2022.5.0, <2023.11.0 diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 716b033def533..55b85025bddb2 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -2,8 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment requests <2.32.0 -torchvision >=0.15.0, <0.18.0 -gym[classic_control] >=0.17.0, <0.27.0 +torchvision >=0.15.0, <0.19.0 ipython[all] <8.15.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 154433a1c101d..15e8ba16b6c72 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -11,7 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Enabled consolidating distributed checkpoints through `fabric consolidate` in the new CLI [#19560](https://github.com/Lightning-AI/pytorch-lightning/pull/19560)) -- +- Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) - diff --git a/src/lightning/fabric/accelerators/mps.py b/src/lightning/fabric/accelerators/mps.py index d0f36698616d4..75497169cda0f 100644 --- a/src/lightning/fabric/accelerators/mps.py +++ b/src/lightning/fabric/accelerators/mps.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import platform from functools import lru_cache from typing import List, Optional, Union @@ -70,7 +71,8 @@ def auto_device_count() -> int: @lru_cache(1) def is_available() -> bool: """MPS is only available on a machine with the ARM-based Apple Silicon processors.""" - return torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64") + mps_disabled = os.getenv("DISABLE_MPS", "0") == "1" + return not mps_disabled and torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64") @classmethod @override diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index e4ae5a29c336c..11c9238a6d409 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -16,7 +16,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
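One caveat on the `MPSAccelerator.is_available()` change a few hunks above: the method is wrapped in `@lru_cache(1)`, so the `DISABLE_MPS` environment variable is read only on the first call, and CI must export it before anything queries the accelerator (the workflow changes above set it at the job level for exactly this reason). A sketch of the intended effect, assuming the variable is set early enough:

import os
os.environ["DISABLE_MPS"] = "1"  # must happen before the first is_available() call

from lightning.fabric.accelerators.mps import MPSAccelerator
# Even on Apple Silicon with a functional MPS backend, availability now
# reports False, letting the macOS-14 jobs skip MPS-only code paths.
assert MPSAccelerator.is_available() is False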
- Added `on_exception` hook to `LightningDataModule` ([#19601](https://github.com/Lightning-AI/pytorch-lightning/pull/19601)) -- +- Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) + ### Changed diff --git a/tests/tests_fabric/plugins/precision/test_fsdp.py b/tests/tests_fabric/plugins/precision/test_fsdp.py index 148292dcd48df..e42df493dd725 100644 --- a/tests/tests_fabric/plugins/precision/test_fsdp.py +++ b/tests/tests_fabric/plugins/precision/test_fsdp.py @@ -58,8 +58,10 @@ def test_fsdp_precision_scaler_with_bf16(): @RunIf(min_cuda_gpus=1) def test_fsdp_precision_forward_context(): """Test to ensure that the context manager correctly is set to bfloat16.""" + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler + precision = FSDPPrecision(precision="16-mixed") - assert isinstance(precision.scaler, torch.cuda.amp.GradScaler) + assert isinstance(precision.scaler, ShardedGradScaler) assert torch.get_default_dtype() == torch.float32 with precision.forward_context(): assert torch.get_autocast_gpu_dtype() == torch.float16 diff --git a/tests/tests_pytorch/callbacks/test_finetuning_callback.py b/tests/tests_pytorch/callbacks/test_finetuning_callback.py index 56d46a62048cd..0c09ae5d5042a 100644 --- a/tests/tests_pytorch/callbacks/test_finetuning_callback.py +++ b/tests/tests_pytorch/callbacks/test_finetuning_callback.py @@ -15,6 +15,7 @@ import pytest import torch +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.pytorch import LightningModule, Trainer, seed_everything from lightning.pytorch.callbacks import BackboneFinetuning, BaseFinetuning, ModelCheckpoint from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset @@ -359,6 +360,8 @@ def test_callbacks_restore(tmp_path): "foreach": None, "differentiable": False, } + if _TORCH_GREATER_EQUAL_2_3: + expected["fused"] = None assert callback._internal_optimizer_metadata[0][0] == expected @@ -374,6 +377,8 @@ def test_callbacks_restore(tmp_path): "foreach": None, "differentiable": False, } + if _TORCH_GREATER_EQUAL_2_3: + expected["fused"] = None assert callback._internal_optimizer_metadata[0][1] == expected diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index be05768cab430..8b9ca14684db0 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -88,6 +88,7 @@ def restore_env_variables(): "KMP_DUPLICATE_LIB_OK", # leaked by PyTorch "CRC32C_SW_MODE", # leaked by tensorboardX "TRITON_CACHE_DIR", # leaked by torch.compile + "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR", # leaked by torch.compile "OMP_NUM_THREADS", # set by our launchers # leaked by XLA "ALLOW_MULTIPLE_LIBTPU_LOAD", diff --git a/tests/tests_pytorch/plugins/precision/test_fsdp.py b/tests/tests_pytorch/plugins/precision/test_fsdp.py index 6b19fdabdf6d6..8b595c2c74a32 100644 --- a/tests/tests_pytorch/plugins/precision/test_fsdp.py +++ b/tests/tests_pytorch/plugins/precision/test_fsdp.py @@ -58,8 +58,10 @@ def test_fsdp_precision_scaler_with_bf16(): @RunIf(min_cuda_gpus=1) def test_fsdp_precision_forward_context(): """Test to ensure that the context manager correctly is set to bfloat16.""" + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler + precision = FSDPPrecision(precision="16-mixed") - assert isinstance(precision.scaler, torch.cuda.amp.GradScaler) + assert isinstance(precision.scaler, ShardedGradScaler) assert torch.get_default_dtype() == torch.float32 with 
precision.forward_context(): assert torch.get_autocast_gpu_dtype() == torch.float16 From d1949766f8cddd424e2fac3a68b275bebe13d3e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 29 Apr 2024 14:51:56 +0200 Subject: [PATCH 017/179] Fix TensorBoardLogger test on Windows (#19824) --- requirements/fabric/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- .../tests_pytorch/loggers/test_tensorboard.py | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 4cac78e19bf23..7487dd9b754b3 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -3,7 +3,7 @@ numpy >=1.17.2, <1.27.0 torch >=2.0.0, <2.4.0 -fsspec[http] >=2022.5.0, <2023.11.0 +fsspec[http] >=2022.5.0, <2024.4.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 9af0b13c15ccd..4993a918af099 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -5,7 +5,7 @@ numpy >=1.17.2, <1.27.0 torch >=2.0.0, <2.4.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 -fsspec[http] >=2022.5.0, <2023.11.0 +fsspec[http] >=2022.5.0, <2024.4.0 torchmetrics >=0.7.0, <1.3.0 # needed for using fixed compare_version packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 diff --git a/tests/tests_pytorch/loggers/test_tensorboard.py b/tests/tests_pytorch/loggers/test_tensorboard.py index 61e689831ac1a..2ee033f893340 100644 --- a/tests/tests_pytorch/loggers/test_tensorboard.py +++ b/tests/tests_pytorch/loggers/test_tensorboard.py @@ -109,7 +109,6 @@ def test_tensorboard_no_name(tmp_path, name): assert os.listdir(tmp_path / "version_0") -@mock.patch.dict(os.environ, {}, clear=True) def test_tensorboard_log_sub_dir(tmp_path): class TestLogger(TensorBoardLogger): # for reproducibility @@ -141,14 +140,15 @@ def name(self): trainer = Trainer(**trainer_args, logger=logger) assert trainer.logger.log_dir == os.path.join(explicit_save_dir, "name", "version", "sub_dir") - # test env var (`$`) handling - test_env_dir = "some_directory" - os.environ["TEST_ENV_DIR"] = test_env_dir - save_dir = "$TEST_ENV_DIR/tmp" - explicit_save_dir = f"{test_env_dir}/tmp" - logger = TestLogger(save_dir, sub_dir="sub_dir") - trainer = Trainer(**trainer_args, logger=logger) - assert trainer.logger.log_dir == os.path.join(explicit_save_dir, "name", "version", "sub_dir") + with mock.patch.dict(os.environ, {}): + # test env var (`$`) handling + test_env_dir = "some_directory" + os.environ["TEST_ENV_DIR"] = test_env_dir + save_dir = "$TEST_ENV_DIR/tmp" + explicit_save_dir = f"{test_env_dir}/tmp" + logger = TestLogger(save_dir, sub_dir="sub_dir") + trainer = Trainer(**trainer_args, logger=logger) + assert trainer.logger.log_dir == os.path.join(explicit_save_dir, "name", "version", "sub_dir") @pytest.mark.parametrize("step_idx", [10, None]) From 8103bd7e0124da47de9b441ad3bb0d6f993ace33 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Wed, 1 May 2024 15:29:41 -0400 Subject: [PATCH 018/179] Make sure the HTTP client for queues retries for POST and 5xx --- src/lightning/app/utilities/network.py | 42 +++++++++++------------ tests/tests_app/utilities/test_network.py | 41 +++++++++++++++++++++- 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/src/lightning/app/utilities/network.py b/src/lightning/app/utilities/network.py index d7ba2a4f88102..5c446cdcd4330 100644 --- 
a/src/lightning/app/utilities/network.py
+++ b/src/lightning/app/utilities/network.py
@@ -89,18 +89,32 @@ def _find_free_network_port_cloudspace():
 _DEFAULT_REQUEST_TIMEOUT = 30  # seconds


+def create_retry_strategy():
+    return Retry(
+        # wait time between retries increases exponentially according to: backoff_factor * (2 ** (retry - 1))
+        # but the maximum wait time is 120 secs. By setting a large value (2880), we'll make sure clients
+        # are going to be alive for a very long time (~ 4 days) but retry every 120 seconds
+        total=_CONNECTION_RETRY_TOTAL,
+        backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR,
+        status_forcelist={
+            408,  # Request Timeout
+            429,  # Too Many Requests
+            *range(500, 600)  # Any 5xx Server Error status
+        },
+        allowed_methods = {
+            "POST", # Default methods are idempotent, add POST here
+            *Retry.DEFAULT_ALLOWED_METHODS
+        }
+    )
+
+
 def _configure_session() -> Session:
     """Configures the session for GET and POST requests.

     It enables a generous retrial strategy that waits for the application server to connect.

     """
-    retry_strategy = Retry(
-        # wait time between retries increases exponentially according to: backoff_factor * (2 ** (retry - 1))
-        total=_CONNECTION_RETRY_TOTAL,
-        backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR,
-        status_forcelist=[429, 500, 502, 503, 504],
-    )
+    retry_strategy = create_retry_strategy()
     adapter = HTTPAdapter(max_retries=retry_strategy)
     http = requests.Session()
     http.mount("https://", adapter)
@@ -157,21 +171,7 @@ def __init__(
         self, base_url: str, auth_token: Optional[str] = None, log_callback: Optional[Callable] = None
     ) -> None:
         self.base_url = base_url
-        retry_strategy = Retry(
-            # wait time between retries increases exponentially according to: backoff_factor * (2 ** (retry - 1))
-            # but the the maximum wait time is 120 secs.
By setting a large value (2880), we'll make sure clients - # are going to be alive for a very long time (~ 4 days) but retries every 120 seconds - total=_CONNECTION_RETRY_TOTAL, - backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR, - status_forcelist=[ - 408, # Request Timeout - 429, # Too Many Requests - 500, # Internal Server Error - 502, # Bad Gateway - 503, # Service Unavailable - 504, # Gateway Timeout - ], - ) + retry_strategy = create_retry_strategy() adapter = CustomRetryAdapter(max_retries=retry_strategy, timeout=_DEFAULT_REQUEST_TIMEOUT) self.session = requests.Session() diff --git a/tests/tests_app/utilities/test_network.py b/tests/tests_app/utilities/test_network.py index e3ccaf662d57d..c9e982774f1b6 100644 --- a/tests/tests_app/utilities/test_network.py +++ b/tests/tests_app/utilities/test_network.py @@ -1,8 +1,9 @@ from unittest import mock +from http.client import HTTPMessage import pytest from lightning.app.core import constants -from lightning.app.utilities.network import find_free_network_port +from lightning.app.utilities.network import find_free_network_port, HTTPClient def test_find_free_network_port(): @@ -42,3 +43,41 @@ def test_find_free_network_port_cloudspace(_, patch_constants): # Shouldn't use the APP_SERVER_PORT assert constants.APP_SERVER_PORT not in ports + + +@mock.patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") +def test_http_client_retry_post(getconn_mock): + getconn_mock.return_value.getresponse.side_effect = [ + mock.Mock(status=500, msg=HTTPMessage()), + mock.Mock(status=429, msg=HTTPMessage()), + mock.Mock(status=200, msg=HTTPMessage()), + ] + + client = HTTPClient(base_url="http://test.url") + r = client.post("/test") + r.raise_for_status() + + assert getconn_mock.return_value.request.mock_calls == [ + mock.call("POST", "/test", body=None, headers=mock.ANY), + mock.call("POST", "/test", body=None, headers=mock.ANY), + mock.call("POST", "/test", body=None, headers=mock.ANY), + ] + + +@mock.patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") +def test_http_client_retry_get(getconn_mock): + getconn_mock.return_value.getresponse.side_effect = [ + mock.Mock(status=500, msg=HTTPMessage()), + mock.Mock(status=429, msg=HTTPMessage()), + mock.Mock(status=200, msg=HTTPMessage()), + ] + + client = HTTPClient(base_url="http://test.url") + r = client.get("/test") + r.raise_for_status() + + assert getconn_mock.return_value.request.mock_calls == [ + mock.call("GET", "/test", body=None, headers=mock.ANY), + mock.call("GET", "/test", body=None, headers=mock.ANY), + mock.call("GET", "/test", body=None, headers=mock.ANY), + ] From 4219f30c961b4afced46f4ef76240c679eb4d326 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Wed, 1 May 2024 15:31:13 -0400 Subject: [PATCH 019/179] Fix formatting --- src/lightning/app/utilities/network.py | 14 +++++++------- tests/tests_app/utilities/test_network.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/lightning/app/utilities/network.py b/src/lightning/app/utilities/network.py index 5c446cdcd4330..04afdb0b4f92c 100644 --- a/src/lightning/app/utilities/network.py +++ b/src/lightning/app/utilities/network.py @@ -97,14 +97,14 @@ def create_retry_strategy(): total=_CONNECTION_RETRY_TOTAL, backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR, status_forcelist={ - 408, # Request Timeout - 429, # Too Many Requests - *range(500, 600) # Any 5xx Server Error status + 408, # Request Timeout + 429, # Too Many Requests + *range(500, 600), # Any 5xx Server Error status + }, + allowed_methods={ + "POST", # 
Default methods are idempotent, add POST here + *Retry.DEFAULT_ALLOWED_METHODS, }, - allowed_methods = { - "POST", # Default methods are idempotent, add POST here - *Retry.DEFAULT_ALLOWED_METHODS - } ) diff --git a/tests/tests_app/utilities/test_network.py b/tests/tests_app/utilities/test_network.py index c9e982774f1b6..38c8961919db6 100644 --- a/tests/tests_app/utilities/test_network.py +++ b/tests/tests_app/utilities/test_network.py @@ -1,9 +1,9 @@ -from unittest import mock from http.client import HTTPMessage +from unittest import mock import pytest from lightning.app.core import constants -from lightning.app.utilities.network import find_free_network_port, HTTPClient +from lightning.app.utilities.network import HTTPClient, find_free_network_port def test_find_free_network_port(): From d623708192d5c03dc32f0eb3cafc86621a19761f Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Wed, 1 May 2024 16:17:50 -0400 Subject: [PATCH 020/179] xfail tests for deprecated functionality --- tests/tests_app/cli/test_cmd_install.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/tests_app/cli/test_cmd_install.py b/tests/tests_app/cli/test_cmd_install.py index 5fddaea097fb3..93df54cbc8389 100644 --- a/tests/tests_app/cli/test_cmd_install.py +++ b/tests/tests_app/cli/test_cmd_install.py @@ -9,6 +9,7 @@ from lightning.app.testing.helpers import _RunIf +@pytest.mark.xfail(strict=False, reason="lightning app cli was deprecated") @mock.patch("lightning.app.cli.cmd_install.subprocess", mock.MagicMock()) def test_valid_org_app_name(): """Valid organization name.""" @@ -69,6 +70,7 @@ def test_app_install(tmpdir, monkeypatch): assert test_app_pip_name in str(new_env_output), f"{test_app_pip_name} should be in the env" +@pytest.mark.xfail(strict=False, reason="lightning app cli was deprecated") @mock.patch("lightning.app.cli.cmd_install.subprocess", mock.MagicMock()) def test_valid_org_component_name(): runner = CliRunner() @@ -135,6 +137,7 @@ def test_component_install(real_component, test_component_pip_name): ), f"{test_component_pip_name} should not be in the env after cleanup" +@pytest.mark.xfail(strict=False, reason="lightning app cli was deprecated") def test_prompt_actions(): # TODO: each of these installs must check that a package is installed in the environment correctly app_to_use = "lightning/invideo" @@ -164,6 +167,7 @@ def test_prompt_actions(): # result = runner.invoke(lightning_cli.cmd_install.install_app, [app_to_use], input='') +@pytest.mark.xfail(strict=False, reason="lightning app cli was deprecated") @mock.patch("lightning.app.cli.cmd_install.subprocess", mock.MagicMock()) def test_version_arg_component(tmpdir, monkeypatch): monkeypatch.chdir(tmpdir) @@ -186,6 +190,7 @@ def test_version_arg_component(tmpdir, monkeypatch): assert result.exit_code == 0 +@pytest.mark.xfail(strict=False, reason="lightning app cli was deprecated") @mock.patch("lightning.app.cli.cmd_install.subprocess", mock.MagicMock()) @mock.patch("lightning.app.cli.cmd_install.os.chdir", mock.MagicMock()) def test_version_arg_app(tmpdir): @@ -237,6 +242,7 @@ def test_install_resolve_latest_version(mock_show_install_app_prompt, tmpdir): assert mock_show_install_app_prompt.call_args[0][0]["version"] == "0.0.4" +@pytest.mark.xfail(strict=False, reason="lightning app cli was deprecated") def test_proper_url_parsing(): name = "lightning/invideo" @@ -311,12 +317,14 @@ def test_install_app_shows_error(tmpdir): # os.chdir(cwd) +@pytest.mark.xfail(strict=False, reason="lightning app cli was deprecated") def 
test_app_and_component_gallery_app(monkeypatch): monkeypatch.setattr(cmd_install, "_install_app_from_source", mock.MagicMock()) path = cmd_install.gallery_apps_and_components("lightning/flashy", True, "latest") assert path == os.path.join(os.getcwd(), "app.py") +@pytest.mark.xfail(strict=False, reason="lightning app cli was deprecated") def test_app_and_component_gallery_component(monkeypatch): monkeypatch.setattr(cmd_install, "_install_app_from_source", mock.MagicMock()) path = cmd_install.gallery_apps_and_components("lightning/lit-jupyter", True, "latest") From 0f12271d7feeacb6fbe5d70d2ce057da4a04d8b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 1 May 2024 23:08:22 +0200 Subject: [PATCH 021/179] bump lightning cloud --- requirements/app/app.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/app.txt b/requirements/app/app.txt index 002e0a1c1c23f..587c0538e1a81 100644 --- a/requirements/app/app.txt +++ b/requirements/app/app.txt @@ -1,4 +1,4 @@ -lightning-cloud == 0.5.67 # Must be pinned to ensure compatibility +lightning-cloud == 0.5.68 # Must be pinned to ensure compatibility packaging typing-extensions >=4.4.0, <4.10.0 deepdiff >=5.7.0, <6.6.0 From 0c8a193d3c46f9ddba46a3ab1818e24ec41698af Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 7 May 2024 23:02:58 +0200 Subject: [PATCH 022/179] (1/n) Support 2D Parallelism (#19846) --- docs/source-pytorch/conf.py | 4 +- examples/fabric/tensor_parallel/README.md | 45 ++ examples/fabric/tensor_parallel/data.py | 21 + examples/fabric/tensor_parallel/model.py | 436 ++++++++++++++++ .../fabric/tensor_parallel/parallelism.py | 101 ++++ examples/fabric/tensor_parallel/train.py | 78 +++ src/lightning/fabric/fabric.py | 6 +- src/lightning/fabric/strategies/__init__.py | 1 + src/lightning/fabric/strategies/fsdp.py | 16 +- .../fabric/strategies/model_parallel.py | 312 +++++++++++ src/lightning/fabric/utilities/init.py | 48 +- src/lightning/pytorch/strategies/fsdp.py | 5 +- tests/tests_fabric/strategies/test_fsdp.py | 60 +-- .../strategies/test_fsdp_integration.py | 10 +- .../strategies/test_model_parallel.py | 228 ++++++++ .../test_model_parallel_integration.py | 488 ++++++++++++++++++ tests/tests_fabric/utilities/test_init.py | 33 +- 17 files changed, 1821 insertions(+), 71 deletions(-) create mode 100644 examples/fabric/tensor_parallel/README.md create mode 100644 examples/fabric/tensor_parallel/data.py create mode 100644 examples/fabric/tensor_parallel/model.py create mode 100644 examples/fabric/tensor_parallel/parallelism.py create mode 100644 examples/fabric/tensor_parallel/train.py create mode 100644 src/lightning/fabric/strategies/model_parallel.py create mode 100644 tests/tests_fabric/strategies/test_model_parallel.py create mode 100644 tests/tests_fabric/strategies/test_model_parallel_integration.py diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index 9be0c5764a327..0baba58ee0a38 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -356,8 +356,6 @@ def _load_py_module(name: str, location: str) -> ModuleType: "torchmetrics": ("https://lightning.ai/docs/torchmetrics/stable/", None), "lightning_habana": ("https://lightning-ai.github.io/lightning-Habana/", None), "tensorboardX": ("https://tensorboardx.readthedocs.io/en/stable/", None), - # needed for referencing App from lightning scope - "lightning.app": ("https://lightning.ai/docs/app/stable/", None), # needed for referencing Fabric from lightning scope "lightning.fabric": 
("https://lightning.ai/docs/fabric/stable/", None), # TODO: these are missing objects.inv @@ -637,4 +635,6 @@ def package_list_from_file(file): "https://www.intel.com/content/www/us/en/products/docs/processors/what-is-a-gpu.html", "https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/", # noqa: E501 "https://stackoverflow.com/questions/66640705/how-can-i-install-grpcio-on-an-apple-m1-silicon-laptop", + "https://openai.com/blog/.*", + "https://tinyurl.com/.*", # has a human verification check on redirect ] diff --git a/examples/fabric/tensor_parallel/README.md b/examples/fabric/tensor_parallel/README.md new file mode 100644 index 0000000000000..4abd1fc058515 --- /dev/null +++ b/examples/fabric/tensor_parallel/README.md @@ -0,0 +1,45 @@ +## Tensor Parallel and 2D Parallel + +This example shows how to apply tensor-parallelism to your model (here Llama 2 7B) with the `ModelParallelStrategy`, and how it can be combined with FSDP (2D parallelism). +PyTorch 2.3+ and a machine with at least 4 GPUs and 24 GB memory each are required to run this example. + +```bash +pip install 'torch>=2.3' +``` + +Navigate to this example folder and run the training script: + +```bash +cd examples/fabric/tensor_parallel +python train.py +``` + +You should see an output like this: + +``` +Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 +Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4 +Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4 +Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4 +---------------------------------------------------------------------------------------------------- +distributed_backend=nccl +All distributed processes registered. Starting with 4 processes +---------------------------------------------------------------------------------------------------- + +Number of model parameters: 6.7 B +Starting training ... +Iteration 0 complete +Iteration 1 complete +Iteration 2 complete +Iteration 3 complete +Iteration 4 complete +Iteration 5 complete +Iteration 6 complete +Iteration 7 complete +Saving a (distributed) checkpoint ... +Training successfully completed! +Peak memory usage: 17.95 GB +``` + +> \[!NOTE\] +> The `ModelParallelStrategy` is experimental and subject to change. Report issues on [GitHub](https://github.com/Lightning-AI/pytorch-lightning/issues). diff --git a/examples/fabric/tensor_parallel/data.py b/examples/fabric/tensor_parallel/data.py new file mode 100644 index 0000000000000..ba36987283ffd --- /dev/null +++ b/examples/fabric/tensor_parallel/data.py @@ -0,0 +1,21 @@ +import torch +from torch.utils.data import Dataset + + +class RandomTokenDataset(Dataset): + def __init__(self, vocab_size: int, seq_length: int): + self.vocab_size = vocab_size + self.seq_length = seq_length + self.tokens = torch.randint( + self.vocab_size, + size=(len(self), self.seq_length + 1), + # Set a seed to make this toy dataset the same on each rank + # Fabric will add a `DistributedSampler` to shard the data correctly + generator=torch.Generator().manual_seed(42), + ) + + def __len__(self) -> int: + return 128 + + def __getitem__(self, item: int): + return self.tokens[item] diff --git a/examples/fabric/tensor_parallel/model.py b/examples/fabric/tensor_parallel/model.py new file mode 100644 index 0000000000000..ad8dbd99e1c08 --- /dev/null +++ b/examples/fabric/tensor_parallel/model.py @@ -0,0 +1,436 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn + + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + + max_batch_size: int = 32 + max_seq_len: int = 32768 + # If `True`, then each transformer block init uses its layer ID, and if + # `False`, each uses the total number of transformer blocks + depth_init: bool = True + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + """Precompute the frequency tensor for complex exponentials (cis) with given dimensions. + + This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' + and the end index 'end'. The 'theta' parameter scales the frequencies. + The returned tensor contains complex values in complex64 data type. + + Args: + dim (int): Dimension of the frequency tensor. + end (int): End index for precomputing frequencies. + theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0. + + Returns: + torch.Tensor: Precomputed frequency tensor with complex exponentials. + + """ + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) # type: ignore + freqs = torch.outer(t, freqs).float() # type: ignore + return torch.polar(torch.ones_like(freqs), freqs) # complex64 + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + """Reshape frequency tensor for broadcasting it with another tensor. + + This function reshapes the frequency tensor to have the same shape as the target tensor 'x' + for the purpose of broadcasting the frequency tensor during element-wise operations. + + Args: + freqs_cis (torch.Tensor): Frequency tensor to be reshaped. + x (torch.Tensor): Target tensor for broadcasting compatibility. + + Returns: + torch.Tensor: Reshaped frequency tensor. + + """ + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Apply rotary embeddings to input tensors using the given frequency tensor. + + This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided + frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor + is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are + returned as real tensors. + + Args: + xq (torch.Tensor): Query tensor to apply rotary embeddings. + xk (torch.Tensor): Key tensor to apply rotary embeddings. + freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. 
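+
+    Note:
+        A shape sketch, inferred from the assertion in ``reshape_for_broadcast`` above:
+        ``freqs_cis`` is expected to have shape ``(seqlen, head_dim // 2)``, matching the
+        sequence and last dimensions of the complex views of ``xq`` and ``xk``.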
+ + """ + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + x[:, :, :, None, :] + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + + +class RMSNorm(nn.Module): + """Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x: torch.Tensor): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x: torch.Tensor): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) # type: ignore + + +class Attention(nn.Module): + """Multi-head attention module. + + Args: + model_args (ModelArgs): Model configuration arguments. + + Attributes: + n_kv_heads (int): Number of key and value heads. + n_heads (int): Number of query heads. + n_local_kv_heads (int): Number of local key and value heads. + n_rep (int): Number of repetitions for local heads. + head_dim (int): Dimension size of each attention head. + wq (Linear): Linear transformation for queries. + wk (Linear): Linear transformation for keys. + wv (Linear): Linear transformation for values. + wo (Linear): Linear transformation for output. + + """ + + def __init__(self, model_args: ModelArgs): + super().__init__() + self.n_heads = model_args.n_heads + self.n_kv_heads = model_args.n_heads if model_args.n_kv_heads is None else model_args.n_kv_heads + self.n_rep = self.n_heads // self.n_kv_heads + self.head_dim = model_args.dim // model_args.n_heads + + self.wq = nn.Linear(model_args.dim, model_args.n_heads * self.head_dim, bias=False) + self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear(model_args.n_heads * self.head_dim, model_args.dim, bias=False) + + def init_weights(self, init_std: float): + for linear in (self.wq, self.wk, self.wv): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std) + + def forward( + self, + x: torch.Tensor, + freqs_cis: torch.Tensor, + ): + """Forward pass of the attention module. + + Args: + x (torch.Tensor): Input tensor. + freqs_cis (torch.Tensor): Precomputed frequency tensor. + + Returns: + torch.Tensor: Output tensor after attention. 
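+
+        Note:
+            The causal structure comes from ``F.scaled_dot_product_attention(..., is_causal=True)``
+            below, so the caller does not need to materialize an attention mask.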
+
+        """
+        bsz, seqlen, _ = x.shape
+        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+
+        xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
+        xk = xk.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
+        xv = xv.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
+
+        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
+
+        keys = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+        values = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+
+        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+        xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+        xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+
+        # we use a causal mask for training
+        output = F.scaled_dot_product_attention(xq, xk, xv, is_causal=True)
+        output = output.transpose(1, 2).contiguous()  # (bs, seqlen, n_local_heads, head_dim)
+        output = output.view(bsz, seqlen, -1)
+        return self.wo(output)
+
+
+class FeedForward(nn.Module):
+    """FeedForward module.
+
+    Args:
+        dim (int): Input dimension.
+        hidden_dim (int): Hidden dimension of the feedforward layer.
+        multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+        ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None.
+
+    Attributes:
+        w1 (Linear): Linear transformation for the first layer.
+        w2 (Linear): Linear transformation for the second layer.
+        w3 (Linear): Linear transformation for the third layer.
+
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int,
+        ffn_dim_multiplier: Optional[float],
+    ):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        # custom dim factor multiplier
+        if ffn_dim_multiplier is not None:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+    def init_weights(self, init_std: float):
+        nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
+        for linear in (self.w2, self.w3):
+            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
+
+
+class TransformerBlock(nn.Module):
+    """TransformerBlock Module.
+
+    Args:
+        layer_id (int): Identifier for the layer.
+        model_args (ModelArgs): Model configuration arguments.
+
+    Attributes:
+        n_heads (int): Number of attention heads.
+        dim (int): Dimension size of the model.
+        head_dim (int): Dimension size of each attention head.
+        attention (Attention): Attention module.
+        feed_forward (FeedForward): FeedForward module.
+        layer_id (int): Identifier for the layer.
+        attention_norm (RMSNorm): Layer normalization for attention output.
+        ffn_norm (RMSNorm): Layer normalization for feedforward output.
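+
+    Note:
+        The block follows the pre-norm residual pattern:
+        ``h = x + attention(attention_norm(x))`` followed by
+        ``out = h + feed_forward(ffn_norm(h))``.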
+
+    """
+
+    def __init__(self, layer_id: int, model_args: ModelArgs):
+        super().__init__()
+        self.n_heads = model_args.n_heads
+        self.dim = model_args.dim
+        self.attention = Attention(model_args)
+        self.feed_forward = FeedForward(
+            dim=model_args.dim,
+            hidden_dim=4 * model_args.dim,
+            multiple_of=model_args.multiple_of,
+            ffn_dim_multiplier=model_args.ffn_dim_multiplier,
+        )
+        self.layer_id = layer_id
+        self.num_layers = model_args.n_layers
+
+        self.attention_norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)
+        self.ffn_norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)
+
+        if model_args.depth_init:
+            self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5
+        else:
+            self.weight_init_std = 0.02 / (2 * self.num_layers) ** 0.5
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        freqs_cis: torch.Tensor,
+    ):
+        """Perform a forward pass through the TransformerBlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
+
+        Returns:
+            torch.Tensor: Output tensor after applying attention and feedforward layers.
+
+        """
+        h = x + self.attention(self.attention_norm(x), freqs_cis)
+        return h + self.feed_forward(self.ffn_norm(h))
+
+    def init_weights(self):
+        for norm in (self.attention_norm, self.ffn_norm):
+            norm.reset_parameters()
+        self.attention.init_weights(self.weight_init_std)
+        self.feed_forward.init_weights(self.weight_init_std)
+
+
+class Transformer(nn.Module):
+    """Transformer Module.
+
+    Args:
+        model_args (ModelArgs): Model configuration arguments.
+
+    Attributes:
+        model_args (ModelArgs): Model configuration arguments.
+        vocab_size (int): Vocabulary size.
+        n_layers (int): Number of layers in the model.
+        tok_embeddings (nn.Embedding): Token embeddings.
+        layers (torch.nn.ModuleList): List of Transformer blocks.
+        norm (RMSNorm): Layer normalization for the model output.
+        output (nn.Linear): Linear layer for final output.
+        freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
+
+    """
+
+    def __init__(self, model_args: ModelArgs):
+        super().__init__()
+        self.model_args = model_args
+        self.vocab_size = model_args.vocab_size
+        self.n_layers = model_args.n_layers
+        self.model_dim = model_args.dim
+
+        self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
+        self.register_buffer(
+            "freqs_cis",
+            precompute_freqs_cis(
+                model_args.dim // model_args.n_heads,
+                # Need to compute until at least the max token limit for generation
+                # (use 2x max sequence length to be safe)
+                model_args.max_seq_len * 2,
+            ),
+        )
+        self.layers = torch.nn.ModuleList()
+        for layer_id in range(model_args.n_layers):
+            self.layers.append(TransformerBlock(layer_id, model_args))
+
+        self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)
+        self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
+
+    def reset_parameters(self):
+        with torch.device(self.freqs_cis.device):
+            self.freqs_cis = precompute_freqs_cis(
+                self.model_args.dim // self.model_args.n_heads,
+                # Need to compute until at least the max token limit for generation
+                # (use 2x max sequence length to be safe)
+                self.model_args.max_seq_len * 2,
+            )
+
+    def init_weights(self):
+        """[Note: On ``init_weights`` vs.
+
+        ``reset_parameters``]
+        Modules may define ``reset_parameters`` to initialize parameter values.
+        ``reset_parameters`` is meant to only initialize directly owned
+        parameters/buffers, not those of their child modules, and it can be
+        used to give the initial values for these tensors.
+ Separately, users may want custom initialization for their modules, + different from that in ``reset_parameters``. For this, we define + ``init_weights``. We only call it in the constructor of this + ``Transformer`` root module to avoid reinitializing tensors. + + """ + nn.init.normal_(self.tok_embeddings.weight) + for layer in self.layers: + layer.init_weights() + self.norm.reset_parameters() + final_out_std = self.model_args.dim**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + self.output.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + + def forward(self, tokens: torch.Tensor): + """Perform a forward pass through the Transformer model. + + Args: + tokens (torch.Tensor): Input token indices. + + Returns: + torch.Tensor: Output logits after applying the Transformer model. + + """ + _bsz, seqlen = tokens.shape + h = self.tok_embeddings(tokens) + self.freqs_cis = self.freqs_cis.to(h.device) + freqs_cis = self.freqs_cis[0:seqlen] + + for layer in self.layers: + h = layer(h, freqs_cis) + h = self.norm(h) + return self.output(h).float() + + @classmethod + def from_model_args(cls, model_args: ModelArgs) -> "Transformer": + """Initialize a Transformer model from a ModelArgs object. + + Args: + model_args (ModelArgs): Model configuration arguments. + + Returns: + Transformer: Transformer model. + + """ + return cls(model_args) diff --git a/examples/fabric/tensor_parallel/parallelism.py b/examples/fabric/tensor_parallel/parallelism.py new file mode 100644 index 0000000000000..38a091d1b8859 --- /dev/null +++ b/examples/fabric/tensor_parallel/parallelism.py @@ -0,0 +1,101 @@ +import torch +from model import Transformer +from torch.distributed._composable.fsdp import MixedPrecisionPolicy +from torch.distributed._composable.fsdp.fully_shard import fully_shard +from torch.distributed._tensor import Replicate, Shard +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, + SequenceParallel, + parallelize_module, +) + + +# Taken and modified from torchtitan +# https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py +def parallelize(model: Transformer, device_mesh: DeviceMesh) -> Transformer: + """Apply parallelisms and activation checkpointing to the model. + + NOTE: The passed-in model preferably should be on meta device. Otherwise, + the model must fit on GPU or CPU memory. + + """ + + dp_mesh = device_mesh["data_parallel"] + tp_mesh = device_mesh["tensor_parallel"] + + if tp_mesh.size() > 1: + # 1. Parallelize the first embedding and the last linear proj layer + # 2. Parallelize the root norm layer over the sequence dim + # 3. 
Shard the first transformer block's inputs + # Parallelize the first embedding and the last linear out projection + plan = { + "tok_embeddings": RowwiseParallel( + input_layouts=Replicate(), + ), + "output": ColwiseParallel(input_layouts=Shard(1), output_layouts=Replicate()), + "norm": SequenceParallel(), + "layers.0": PrepareModuleInput( + input_layouts=(Replicate(), None), + desired_input_layouts=(Shard(1), None), + use_local_output=True, + ), + } + model = parallelize_module(model, tp_mesh, plan) + + # Parallelize each transformer block + for transformer_block in model.layers: + plan = { + "attention": PrepareModuleInput( + input_layouts=(Shard(1), None), + desired_input_layouts=(Replicate(), None), + ), + "attention.wq": ColwiseParallel(), + "attention.wk": ColwiseParallel(), + "attention.wv": ColwiseParallel(), + "attention.wo": RowwiseParallel(output_layouts=Shard(1)), + "attention_norm": SequenceParallel(), + "feed_forward": PrepareModuleInput( + input_layouts=(Shard(1),), + desired_input_layouts=(Replicate(),), + ), + "feed_forward.w1": ColwiseParallel(), + "feed_forward.w2": RowwiseParallel(output_layouts=Shard(1)), + "feed_forward.w3": ColwiseParallel(), + "ffn_norm": SequenceParallel(), + } + + # Adjust attention module to use the local number of heads + attn_layer = transformer_block.attention + attn_layer.n_heads = attn_layer.n_heads // tp_mesh.size() + attn_layer.n_kv_heads = attn_layer.n_kv_heads // tp_mesh.size() + + # Apply the plan for the current transformer block + parallelize_module(transformer_block, tp_mesh, plan) + + if dp_mesh.size() > 1: + assert dp_mesh.ndim == 1 # Hybrid-sharding not supported + + # NOTE: Currently, the user is required to manually handle precision settings such as the `mp_policy` here + # because the model parallel strategy does not respect all settings of `Fabric(precision=...)` at the moment. + mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) + + fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} + for layer_id, transformer_block in enumerate(model.layers): + # Apply activation checkpointing + transformer_block = checkpoint_wrapper(transformer_block) + # As an optimization, do not reshard after forward for the last + # transformer block since FSDP would prefetch it immediately + reshard_after_forward = layer_id < len(model.layers) - 1 + fully_shard( + transformer_block, + **fsdp_config, + reshard_after_forward=reshard_after_forward, + ) + model.layers[layer_id] = transformer_block + model = fully_shard(model, **fsdp_config) + + return model diff --git a/examples/fabric/tensor_parallel/train.py b/examples/fabric/tensor_parallel/train.py new file mode 100644 index 0000000000000..cfae4ca90250e --- /dev/null +++ b/examples/fabric/tensor_parallel/train.py @@ -0,0 +1,78 @@ +import lightning as L +import torch +import torch.nn.functional as F +from data import RandomTokenDataset +from lightning.fabric.strategies import ModelParallelStrategy +from model import ModelArgs, Transformer +from parallelism import parallelize +from torch.distributed.tensor.parallel import loss_parallel +from torch.utils.data import DataLoader + + +def train(): + strategy = ModelParallelStrategy( + # User-defined function that applies the desired parallelizations specific to the model + # (TP, FSDP2, activation checkpointing, ...) 
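+        # The strategy calls this function with the model (still on the meta device when
+        # `empty_init=True` is used below) and the 2D device mesh, and expects the
+        # parallelized module to be returned (see parallelism.py)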
+ parallelize_fn=parallelize, + # Define the size of the 2D parallelism + # Set to "auto" to apply TP intra-node and DP inter-node + data_parallel_size=2, + tensor_parallel_size=2, + ) + + fabric = L.Fabric(accelerator="cuda", devices=4, strategy=strategy) + fabric.launch() + + # Initialize the model + model_args = ModelArgs(vocab_size=32000) + with fabric.init_module(empty_init=True): + model = Transformer(model_args) + + fabric.print(f"Number of model parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.1f} B") + + # Define the optimizer + optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, foreach=True) + + # Set up model and optimizer + model, optimizer = fabric.setup(model, optimizer) + + model.init_weights() + + # Define dataset/dataloader + dataset = RandomTokenDataset(vocab_size=model_args.vocab_size, seq_length=128) + dataloader = DataLoader(dataset, batch_size=8) + + # Fabric configures the sampler automatically for you such that + # all batches in a tensor-parallel group are identical + dataloader = fabric.setup_dataloaders(dataloader) + + # Simplified training loop + fabric.print("Starting training ...") + + for i, batch in enumerate(dataloader): + inputs = batch[:, :-1] + labels = batch[:, 1:] + + output = model(inputs) + + with loss_parallel(): + loss = F.cross_entropy(output.reshape(-1, output.size(-1)), labels.reshape(-1)) + + fabric.backward(loss) + optimizer.step() + optimizer.zero_grad() + fabric.print(f"Iteration {i} complete") + + # See `fabric consolidate --help` if you need to convert the checkpoint to a single file + fabric.print("Saving a (distributed) checkpoint ...") + state = {"model": model, "optimizer": optimizer, "iteration": i} + fabric.save("checkpoint.pt", state) + + fabric.print("Training successfully completed!") + fabric.print(f"Peak memory usage: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") + + +if __name__ == "__main__": + assert torch.cuda.device_count() >= 4, "This example requires at least 4 GPUs with 24 GB of memory each." 
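+    # "high" allows TF32 tensor cores for float32 matmuls: faster on recent GPUs at a small cost in precision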
+ torch.set_float32_matmul_precision("high") + train() diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py index aa67d2e7ce9ac..71d8f623dcee4 100644 --- a/src/lightning/fabric/fabric.py +++ b/src/lightning/fabric/fabric.py @@ -53,7 +53,6 @@ Strategy, XLAStrategy, ) -from lightning.fabric.strategies.fsdp import _has_meta_device_parameters from lightning.fabric.strategies.launchers import _MultiProcessingLauncher, _XLALauncher from lightning.fabric.strategies.strategy import TBroadcast, _Sharded from lightning.fabric.utilities import move_data_to_device @@ -66,6 +65,7 @@ ) from lightning.fabric.utilities.device_dtype_mixin import _update_properties from lightning.fabric.utilities.distributed import DistributedSamplerWrapper, _InfiniteBarrier +from lightning.fabric.utilities.init import _has_meta_device_parameters_or_buffers from lightning.fabric.utilities.rank_zero import rank_zero_deprecation, rank_zero_warn from lightning.fabric.utilities.registry import _load_external_callbacks from lightning.fabric.utilities.seed import seed_everything @@ -1016,7 +1016,7 @@ def _validate_setup(self, module: nn.Module, optimizers: Sequence[Optimizer]) -> raise ValueError("An optimizer should be passed only once to the `setup` method.") if isinstance(self._strategy, FSDPStrategy) and any( - _has_meta_device_parameters(optimizer) for optimizer in optimizers + _has_meta_device_parameters_or_buffers(optimizer) for optimizer in optimizers ): raise RuntimeError( "The optimizer has references to the model's meta-device parameters. Materializing them is" @@ -1044,7 +1044,7 @@ def _validate_setup_optimizers(self, optimizers: Sequence[Optimizer]) -> None: if any(isinstance(opt, _FabricOptimizer) for opt in optimizers): raise ValueError("An optimizer should be passed only once to the `setup_optimizers` method.") - if any(_has_meta_device_parameters(optimizer) for optimizer in optimizers): + if any(_has_meta_device_parameters_or_buffers(optimizer) for optimizer in optimizers): raise RuntimeError( "The optimizer has references to the model's meta-device parameters. Materializing them is" " is currently not supported. 
Create the optimizer after setting up the model, then call" diff --git a/src/lightning/fabric/strategies/__init__.py b/src/lightning/fabric/strategies/__init__.py index ff48b152750ef..f561c4f426aac 100644 --- a/src/lightning/fabric/strategies/__init__.py +++ b/src/lightning/fabric/strategies/__init__.py @@ -17,6 +17,7 @@ from lightning.fabric.strategies.deepspeed import DeepSpeedStrategy # noqa: F401 from lightning.fabric.strategies.dp import DataParallelStrategy # noqa: F401 from lightning.fabric.strategies.fsdp import FSDPStrategy # noqa: F401 +from lightning.fabric.strategies.model_parallel import ModelParallelStrategy # noqa: F401 from lightning.fabric.strategies.parallel import ParallelStrategy # noqa: F401 from lightning.fabric.strategies.registry import _StrategyRegistry from lightning.fabric.strategies.single_device import SingleDeviceStrategy # noqa: F401 diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 30251a9315cd4..1bd470ff56bcd 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -36,7 +36,7 @@ from lightning_utilities.core.imports import RequirementCache from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only from torch import Tensor -from torch.nn import Module, Parameter +from torch.nn import Module from torch.optim import Optimizer from typing_extensions import TypeGuard, override @@ -67,7 +67,7 @@ _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_3, ) -from lightning.fabric.utilities.init import _EmptyInit +from lightning.fabric.utilities.init import _EmptyInit, _has_meta_device_parameters_or_buffers from lightning.fabric.utilities.load import _METADATA_FILENAME, _lazy_load, _materialize_tensors, _move_state_into from lightning.fabric.utilities.rank_zero import rank_zero_deprecation, rank_zero_only, rank_zero_warn from lightning.fabric.utilities.seed import reset_seed @@ -271,7 +271,7 @@ def setup_module(self, module: Module) -> Module: if any(isinstance(mod, FullyShardedDataParallel) for mod in module.modules()): # The user has wrapped their submodules manually, don't apply the auto wrap policy. - if _has_meta_device_parameters(module): + if _has_meta_device_parameters_or_buffers(module): rank_zero_warn( "The model is already wrapped in `FSDP` but there are still parameters on the meta device." ) @@ -870,16 +870,6 @@ def _load_raw_module_state(state_dict: Dict[str, Any], module: Module, world_siz module.load_state_dict(state_dict, strict=strict) -def _has_meta_device_parameters(obj: Union[Module, Optimizer]) -> bool: - if isinstance(obj, Optimizer): - return any( - t.is_meta for param_group in obj.param_groups for t in param_group["params"] if isinstance(t, Parameter) - ) - if isinstance(obj, Module): - return any(t.is_meta for t in obj.parameters()) - raise TypeError(f"Expected `torch.nn.Module` or `torch.optim.Optimizer`, got: {type(obj).__name__}") - - def _move_torchmetrics_to_device(module: torch.nn.Module, device: torch.device) -> None: # FSDP doesn't move modules without parameters (e.g. Metrics) to the device # https://github.com/pytorch/pytorch/issues/113113 diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py new file mode 100644 index 0000000000000..5679fa7b5ac06 --- /dev/null +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -0,0 +1,312 @@ +# Copyright The Lightning AI team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from contextlib import ExitStack +from datetime import timedelta +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Dict, Literal, Optional, TypeVar, Union + +import torch +from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only +from torch import Tensor +from torch.nn import Module +from torch.optim import Optimizer +from typing_extensions import override + +from lightning.fabric.plugins import CheckpointIO +from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout +from lightning.fabric.strategies.fsdp import ( + _distributed_checkpoint_load, + _distributed_checkpoint_save, +) +from lightning.fabric.strategies.launchers.subprocess_script import _SubprocessScriptLauncher +from lightning.fabric.strategies.parallel import ParallelStrategy +from lightning.fabric.strategies.strategy import TBroadcast, _BackwardSyncControl +from lightning.fabric.utilities.distributed import ( + ReduceOp, + _distributed_is_initialized, + _get_default_process_group_backend_for_device, + _init_dist_connection, + _sync_ddp_if_available, +) +from lightning.fabric.utilities.distributed import group as _group +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 +from lightning.fabric.utilities.init import _materialize_distributed_module +from lightning.fabric.utilities.rank_zero import rank_zero_only +from lightning.fabric.utilities.seed import reset_seed +from lightning.fabric.utilities.types import _PATH + +if TYPE_CHECKING: + from torch.distributed.device_mesh import DeviceMesh + +TModel = TypeVar("TModel", bound=Module) + + +class ModelParallelStrategy(ParallelStrategy): + """Enables user-defined parallelism applied to a model. + + .. warning:: This is an :ref:`experimental ` feature. + + Currently supports up to 2D parallelism. Specifically, it supports the combination of + Fully Sharded Data-Parallel 2 (FSDP2) with Tensor Parallelism (DTensor). These PyTorch APIs are currently still + experimental in PyTorch. Requires PyTorch 2.3 or newer. + + Arguments: + parallelize_fn: A function that applies parallelisms to a module. The strategy will provide the + model and device mesh as input. + data_parallel_size: The number of devices within a data-parallel group. Defaults to ``"auto"``, which + sets this size to the number of nodes in the cluster. + tensor_parallel_size: The number of devices within a tensor-parallel group. Defaults to ``"auto"``, which + sets this size to the number of GPUs in a single node. 
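+        process_group_backend: The distributed process group backend to use, e.g. ``"nccl"`` or ``"gloo"``.
+            By default, it is selected based on the device the strategy runs on.
+        timeout: The timeout for collective operations in the distributed process group.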
+ + """ + + def __init__( + self, + parallelize_fn: Callable[[TModel, "DeviceMesh"], TModel], + data_parallel_size: Union[Literal["auto"], int] = "auto", + tensor_parallel_size: Union[Literal["auto"], int] = "auto", + process_group_backend: Optional[str] = None, + timeout: Optional[timedelta] = default_pg_timeout, + ) -> None: + super().__init__() + if not _TORCH_GREATER_EQUAL_2_3: + raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.3 or higher.") + self._parallelize_fn = parallelize_fn + self._data_parallel_size = data_parallel_size + self._tensor_parallel_size = tensor_parallel_size + self._num_nodes = 1 + self._process_group_backend: Optional[str] = process_group_backend + self._timeout: Optional[timedelta] = timeout + self._backward_sync_control = _ParallelBackwardSyncControl() + + self._device_mesh: Optional["DeviceMesh"] = None + + @property + def device_mesh(self) -> "DeviceMesh": + if self._device_mesh is None: + raise RuntimeError("Accessing the device mesh before processes have initialized is not allowed.") + return self._device_mesh + + @property + @override + def checkpoint_io(self) -> CheckpointIO: + raise NotImplementedError(f"The `{type(self).__name__}` does not use the `CheckpointIO` plugin interface.") + + @checkpoint_io.setter + @override + def checkpoint_io(self, io: CheckpointIO) -> None: + raise NotImplementedError(f"The `{type(self).__name__}` does not support setting a `CheckpointIO` plugin.") + + @property + @override + def root_device(self) -> torch.device: + assert self.parallel_devices is not None + return self.parallel_devices[self.local_rank] + + @property + def num_nodes(self) -> int: + return self._num_nodes + + @num_nodes.setter + def num_nodes(self, num_nodes: int) -> None: + self._num_nodes = num_nodes + + @property + def num_processes(self) -> int: + return len(self.parallel_devices) if self.parallel_devices is not None else 0 + + @property + @override + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + assert self.device_mesh is not None + data_parallel_mesh = self.device_mesh["data_parallel"] + return {"num_replicas": data_parallel_mesh.size(), "rank": data_parallel_mesh.get_local_rank()} + + @property + def process_group_backend(self) -> Optional[str]: + return self._process_group_backend + + @override + def _configure_launcher(self) -> None: + assert self.cluster_environment is not None + if not self.cluster_environment.creates_processes_externally: + self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes) + + @override + def setup_environment(self) -> None: + super().setup_environment() + self._setup_distributed() + self._setup_device_mesh() + + @override + def setup_module(self, module: TModel) -> TModel: + from torch.distributed.fsdp import FullyShardedDataParallel + + if any(isinstance(mod, FullyShardedDataParallel) for mod in module.modules()): + raise TypeError( + "Found modules that are wrapped with `torch.distributed.fsdp.FullyShardedDataParallel`." + f" The `{self.__class__.__name__}` only supports the new FSDP2 APIs in PyTorch >= 2.3." 
+            )
+
+        module = self._parallelize_fn(module, self.device_mesh)
+        if not isinstance(module, Module):
+            raise TypeError(
+                f"The `parallelize_fn` must return a `nn.Module` instance, but got: {type(module).__name__}"
+            )
+        _materialize_distributed_module(module, self.root_device)
+        return module
+
+    @override
+    def module_to_device(self, module: Module) -> None:
+        pass
+
+    @override
+    def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManager:
+        precision_init_ctx = self.precision.module_init_context()
+        stack = ExitStack()
+        if empty_init:
+            # Materialization happens in `setup_module`
+            # TODO: Introduce `Fabric.materialize(module)` to give user control over materialization
+            stack.enter_context(torch.device("meta"))
+        stack.enter_context(precision_init_ctx)
+        return stack
+
+    @override
+    def all_reduce(
+        self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"
+    ) -> Tensor:
+        if isinstance(tensor, Tensor):
+            return _sync_ddp_if_available(tensor, group, reduce_op=reduce_op)
+        return tensor
+
+    @override
+    def barrier(self, *args: Any, **kwargs: Any) -> None:
+        if not _distributed_is_initialized():
+            return
+        if torch.distributed.get_backend() == "nccl":
+            torch.distributed.barrier(device_ids=[self.root_device.index])
+        else:
+            torch.distributed.barrier()
+
+    @override
+    def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
+        if not _distributed_is_initialized():
+            return obj
+
+        obj = [obj]
+        torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
+        return obj[0]
+
+    @override
+    def save_checkpoint(
+        self,
+        path: _PATH,
+        state: Dict[str, Union[Module, Optimizer, Any]],
+        storage_options: Optional[Any] = None,
+        filter: Optional[Dict[str, Callable[[str, Any], bool]]] = None,
+    ) -> None:
+        """Save model, optimizer, and other state to a checkpoint on disk."""
+        if storage_options is not None:
+            raise TypeError(
+                f"`{self.__class__.__name__}.save_checkpoint(..., storage_options=...)` is not supported because"
+                f" `{self.__class__.__name__}` does not use the `CheckpointIO`."
+            )
+        if filter is not None:
+            raise NotImplementedError(f"{self.__class__.__name__} does not yet support the `filter` argument.")
+
+        # broadcast the path from rank 0 to ensure all the states are saved in a common path
+        path = Path(self.broadcast(path))
+        _distributed_checkpoint_save(state, path)
+
+    @override
+    def load_checkpoint(
+        self,
+        path: _PATH,
+        state: Optional[Union[Module, Optimizer, Dict[str, Union[Module, Optimizer, Any]]]] = None,
+        strict: bool = True,
+    ) -> Dict[str, Any]:
+        if isinstance(state, (Module, Optimizer)):
+            raise NotImplementedError(
+                "Loading a module or optimizer object from a checkpoint directly is not yet supported."
+ ) + if strict is False: + raise NotImplementedError(f"Non-strict loading is not yet supported in {self.__class__.__name__}.") + + # broadcast the path from rank 0 to ensure all the states are loaded from a common path + path = Path(self.broadcast(path)) + _distributed_checkpoint_load(state, path) # type: ignore[arg-type] + return {} + + def _setup_distributed(self) -> None: + reset_seed() + self._set_world_ranks() + self._process_group_backend = self._get_process_group_backend() + assert self.cluster_environment is not None + _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + + def _setup_device_mesh(self) -> None: + from torch.distributed.device_mesh import init_device_mesh + + if self._data_parallel_size == "auto": + self._data_parallel_size = self.num_nodes + if self._tensor_parallel_size == "auto": + self._tensor_parallel_size = self.num_processes + if self._data_parallel_size * self._tensor_parallel_size != self.world_size: + raise RuntimeError( + f"The sizes `data_parallel_size={self._data_parallel_size}` and" + f" `tensor_parallel_size={self._tensor_parallel_size}` multiplied should equal the world size" + f" ({self.world_size})." + ) + self._device_mesh = init_device_mesh( + device_type=self.root_device.type, + mesh_shape=(self._data_parallel_size, self._tensor_parallel_size), + mesh_dim_names=("data_parallel", "tensor_parallel"), + ) + + def _get_process_group_backend(self) -> str: + return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) + + def _set_world_ranks(self) -> None: + if self.cluster_environment is not None: + self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) + self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) + # `LightningEnvironment.set_global_rank` will do this too, but we cannot rely on that implementation detail + # additionally, for some implementations, the setter is a no-op, so it's safer to access the getter + rank_zero_only.rank = utils_rank_zero_only.rank = self.global_rank + + +class _ParallelBackwardSyncControl(_BackwardSyncControl): + @override + def no_backward_sync(self, module: Module, enabled: bool) -> ContextManager: + """Blocks gradient synchronization inside the FSDP2 modules.""" + return _FSDPNoSync(module=module, enabled=enabled) + + +class _FSDPNoSync(ContextManager): + def __init__(self, module: Module, enabled: bool) -> None: + self._module = module + self._enabled = enabled + + def _set_requires_grad_sync(self, requires_grad_sync: bool) -> None: + from torch.distributed._composable.fsdp import FSDP + + for mod in self._module.modules(): + if isinstance(mod, FSDP): + mod.set_requires_gradient_sync(requires_grad_sync, recurse=False) + + def __enter__(self) -> None: + self._set_requires_grad_sync(not self._enabled) + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + self._set_requires_grad_sync(self._enabled) diff --git a/src/lightning/fabric/utilities/init.py b/src/lightning/fabric/utilities/init.py index e1b80f7a55fe1..fccdce7aa813d 100644 --- a/src/lightning/fabric/utilities/init.py +++ b/src/lightning/fabric/utilities/init.py @@ -12,13 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import itertools -from typing import Any, Callable, Dict, Optional, Sequence +from typing import Any, Callable, Dict, Optional, Sequence, Union import torch +from torch.nn import Module, Parameter +from torch.optim import Optimizer from torch.overrides import TorchFunctionMode from typing_extensions import override from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 +from lightning.fabric.utilities.rank_zero import rank_zero_warn from lightning.fabric.utilities.types import _DEVICE @@ -56,7 +59,7 @@ def __torch_function__( return func(*args, **kwargs) -def _materialize(module: torch.nn.Module, device: _DEVICE) -> None: +def _materialize(module: Module, device: _DEVICE) -> None: """Materialize a module.""" if not _TORCH_GREATER_EQUAL_2_1: raise RuntimeError("recurse=False requires torch 2.1") @@ -69,8 +72,45 @@ def _materialize(module: torch.nn.Module, device: _DEVICE) -> None: module.reset_parameters() -def _materialize_meta_tensors(module: torch.nn.Module, device: _DEVICE) -> None: +def _materialize_meta_tensors(module: Module, device: _DEVICE) -> None: """Materialize all tensors in a given module.""" for module in module.modules(): - if any(t.is_meta for t in itertools.chain(module.parameters(recurse=False), module.buffers(recurse=False))): + if _has_meta_device_parameters_or_buffers(module, recurse=False): _materialize(module, device) + + +def _materialize_distributed_module(module: Module, device: torch.device) -> None: + # Reference: https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md#meta-device-initialization + # TODO: Introduce `Fabric.materialize(module)` to give user control when materialization should happen + # TODO: Make `torchmetrics.Metric` compatible with the `to_empty()` + `reset_parameters()` semantics + if not _has_meta_device_parameters_or_buffers(module): + return + + module.to_empty(device=device) # has to be called on the root module + + uninitialized_modules = set() + for submodule in module.modules(): + if all(False for _ in itertools.chain(submodule.parameters(recurse=False), submodule.buffers(recurse=False))): + # module has no parameters or buffers + continue + if callable(reset_method := getattr(submodule, "reset_parameters", None)): + reset_method() + else: + uninitialized_modules.add(type(submodule).__name__) + + if uninitialized_modules: + rank_zero_warn( + "Parameter initialization incomplete. 
The following modules have parameters or buffers with uninitialized" + " memory because they don't define a `reset_parameters()` method for re-initialization:" + f" {', '.join(uninitialized_modules)}" + ) + + +def _has_meta_device_parameters_or_buffers(obj: Union[Module, Optimizer], recurse: bool = True) -> bool: + if isinstance(obj, Optimizer): + return any( + t.is_meta for param_group in obj.param_groups for t in param_group["params"] if isinstance(t, Parameter) + ) + if isinstance(obj, Module): + return any(t.is_meta for t in itertools.chain(obj.parameters(recurse=recurse), obj.buffers(recurse=recurse))) + raise TypeError(f"Expected `torch.nn.Module` or `torch.optim.Optimizer`, got: {type(obj).__name__}") diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 657fb438c1e8a..1e7341c907bf9 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -37,7 +37,6 @@ _distributed_checkpoint_save, _get_full_state_dict_context, _get_sharded_state_dict_context, - _has_meta_device_parameters, _init_cpu_offload, _init_sharding_strategy, _is_full_checkpoint, @@ -55,7 +54,7 @@ ) from lightning.fabric.utilities.distributed import group as _group from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 -from lightning.fabric.utilities.init import _EmptyInit +from lightning.fabric.utilities.init import _EmptyInit, _has_meta_device_parameters_or_buffers from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed @@ -269,7 +268,7 @@ def _setup_model(self, model: Module) -> Module: from torch.distributed.fsdp import FullyShardedDataParallel if any(isinstance(mod, FullyShardedDataParallel) for mod in model.modules()): - if _has_meta_device_parameters(model): + if _has_meta_device_parameters_or_buffers(model): rank_zero_warn( "The model is already wrapped in `FSDP` but there are still parameters on the meta device." 
) diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index 5eeed7065fb5f..01e70a27a7cc9 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -26,7 +26,6 @@ from lightning.fabric.strategies.fsdp import ( _FSDPBackwardSyncControl, _get_full_state_dict_context, - _has_meta_device_parameters, _is_sharded_checkpoint, ) from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1, _TORCH_GREATER_EQUAL_2_2 @@ -34,14 +33,14 @@ from torch.optim import Adam -def test_fsdp_custom_mixed_precision(): +def test_custom_mixed_precision(): """Test that passing a custom mixed precision config works.""" config = MixedPrecision() strategy = FSDPStrategy(mixed_precision=config) assert strategy.mixed_precision_config == config -def test_fsdp_cpu_offload(): +def test_cpu_offload(): """Test the different ways cpu offloading can be enabled.""" # bool strategy = FSDPStrategy(cpu_offload=True) @@ -53,7 +52,7 @@ def test_fsdp_cpu_offload(): assert strategy.cpu_offload == config -def test_fsdp_sharding_strategy(): +def test_sharding_strategy(): """Test the different ways the sharding strategy can be set.""" from torch.distributed.fsdp import ShardingStrategy @@ -73,7 +72,7 @@ def test_fsdp_sharding_strategy(): @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"]) -def test_fsdp_hybrid_shard_configuration(sharding_strategy): +def test_hybrid_shard_configuration(sharding_strategy): """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg.""" with pytest.raises(RuntimeError, match="The hybrid sharding strategy requires you to pass at least one of"): FSDPStrategy(sharding_strategy=sharding_strategy) @@ -95,7 +94,7 @@ def test_fsdp_hybrid_shard_configuration(sharding_strategy): FSDPStrategy(sharding_strategy=sharding_strategy, process_group=process_group, device_mesh=device_mesh) -def test_fsdp_checkpoint_io_unsupported(): +def test_checkpoint_io_unsupported(): """Test that the FSDP strategy does not support the `CheckpointIO` plugin.""" strategy = FSDPStrategy() with pytest.raises(NotImplementedError, match="does not use the `CheckpointIO` plugin"): @@ -106,7 +105,7 @@ def test_fsdp_checkpoint_io_unsupported(): @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.setup_module") -def test_fsdp_setup_use_orig_params(_): +def test_setup_use_orig_params(_): module = nn.Linear(2, 2) optimizer = Adam(module.parameters()) @@ -122,7 +121,7 @@ def test_fsdp_setup_use_orig_params(_): assert strategy._fsdp_kwargs["use_orig_params"] -def test_fsdp_no_backward_sync(): +def test_no_backward_sync(): """Test that the backward sync control calls `.no_sync()`, and only on a module wrapped in FullyShardedDataParallel.""" @@ -143,14 +142,14 @@ def test_fsdp_no_backward_sync(): module.no_sync.assert_called_once() -def test_fsdp_activation_checkpointing_support(monkeypatch): +def test_activation_checkpointing_support(monkeypatch): """Test that we error out if activation checkpointing requires a newer PyTorch version.""" monkeypatch.setattr(lightning.fabric.strategies.fsdp, "_TORCH_GREATER_EQUAL_2_1", False) with pytest.raises(ValueError, match="activation_checkpointing_policy` requires torch >= 2.1.0"): FSDPStrategy(activation_checkpointing_policy=Mock()) -def test_fsdp_activation_checkpointing(): +def test_activation_checkpointing(): """Test that the FSDP strategy can apply activation checkpointing to the given layers.""" class 
Block1(nn.Linear): @@ -197,7 +196,7 @@ def __init__(self): apply_mock.assert_called_with(wrapped, checkpoint_wrapper_fn=ANY, **strategy._activation_checkpointing_kwargs) -def test_fsdp_forbidden_precision_raises(): +def test_forbidden_precision_raises(): with pytest.raises(TypeError, match="can only work with the `FSDPPrecision"): FSDPStrategy(precision=HalfPrecision()) @@ -206,7 +205,7 @@ def test_fsdp_forbidden_precision_raises(): strategy.precision = HalfPrecision() -def test_fsdp_grad_clipping_norm_error(): +def test_grad_clipping_norm_error(): strategy = FSDPStrategy() with pytest.raises( TypeError, @@ -215,7 +214,7 @@ def test_fsdp_grad_clipping_norm_error(): strategy.clip_gradients_norm(Mock(), Mock(), Mock()) -def test_fsdp_save_checkpoint_storage_options(tmp_path): +def test_save_checkpoint_storage_options(tmp_path): """Test that the FSDP strategy does not accept storage options for saving checkpoints.""" strategy = FSDPStrategy() with pytest.raises(TypeError, match=escape("FSDPStrategy.save_checkpoint(..., storage_options=...)` is not")): @@ -227,7 +226,7 @@ def test_fsdp_save_checkpoint_storage_options(tmp_path): @mock.patch("lightning.fabric.strategies.fsdp._get_sharded_state_dict_context") @mock.patch("lightning.fabric.strategies.fsdp.torch.save") @mock.patch("lightning.fabric.strategies.fsdp.shutil") -def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, tmp_path): +def test_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, tmp_path): strategy = FSDPStrategy(state_dict_type="full") # state_dict_type='full', path exists, path is not a sharded checkpoint: error @@ -285,7 +284,7 @@ def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) -def test_fsdp_save_checkpoint_one_fsdp_module_required(tmp_path): +def test_save_checkpoint_one_fsdp_module_required(tmp_path): """Test that the FSDP strategy can only save one FSDP model per checkpoint.""" strategy = FSDPStrategy() @@ -304,7 +303,7 @@ def test_fsdp_save_checkpoint_one_fsdp_module_required(tmp_path): strategy.save_checkpoint(path=tmp_path, state={"model1": model1, "model2": model2}) -def test_fsdp_load_checkpoint_no_state(tmp_path): +def test_load_checkpoint_no_state(tmp_path): """Test that the FSDP strategy can't load the full state without access to a model instance from the user.""" strategy = FSDPStrategy() with pytest.raises(ValueError, match=escape("Got FSDPStrategy.load_checkpoint(..., state=None")): @@ -315,7 +314,7 @@ def test_fsdp_load_checkpoint_no_state(tmp_path): @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.fabric.strategies.fsdp._lazy_load", Mock()) -def test_fsdp_load_checkpoint_one_fsdp_module_required(tmp_path): +def test_load_checkpoint_one_fsdp_module_required(tmp_path): """Test that the FSDP strategy can only load one FSDP model per checkpoint.""" strategy = FSDPStrategy() @@ -341,7 +340,7 @@ def test_fsdp_load_checkpoint_one_fsdp_module_required(tmp_path): @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) -def test_fsdp_save_checkpoint_unknown_state_dict_type(tmp_path): +def test_save_checkpoint_unknown_state_dict_type(tmp_path): strategy = FSDPStrategy(state_dict_type="invalid") model = Mock(spec=FullyShardedDataParallel) model.modules.return_value = [model] @@ -349,7 +348,7 @@ def test_fsdp_save_checkpoint_unknown_state_dict_type(tmp_path): 
strategy.save_checkpoint(path=tmp_path, state={"model": model}) -def test_fsdp_load_unknown_checkpoint_type(tmp_path): +def test_load_unknown_checkpoint_type(tmp_path): """Test that the strategy validates the contents at the checkpoint path.""" strategy = FSDPStrategy() model = Mock(spec=FullyShardedDataParallel) @@ -360,7 +359,7 @@ def test_fsdp_load_unknown_checkpoint_type(tmp_path): strategy.load_checkpoint(path=path, state={"model": model}) -def test_fsdp_load_raw_checkpoint_validate_single_file(tmp_path): +def test_load_raw_checkpoint_validate_single_file(tmp_path): """Test that we validate the given checkpoint is a single file when loading a raw PyTorch state-dict checkpoint.""" strategy = FSDPStrategy() model = Mock(spec=nn.Module) @@ -370,7 +369,7 @@ def test_fsdp_load_raw_checkpoint_validate_single_file(tmp_path): strategy.load_checkpoint(path=path, state=model) -def test_fsdp_load_raw_checkpoint_optimizer_unsupported(tmp_path): +def test_load_raw_checkpoint_optimizer_unsupported(tmp_path): """Validate that the FSDP strategy does not yet support loading the raw PyTorch state-dict for an optimizer.""" strategy = FSDPStrategy() optimizer = Mock(spec=torch.optim.Optimizer) @@ -396,25 +395,6 @@ def test_set_timeout(init_process_group_mock): ) -def test_has_meta_device_parameters(): - """Test that the `_has_meta_device_parameters` function can find meta-device parameters in models and - optimizers.""" - # nn.Module - module = nn.Linear(2, 2) - meta_module = nn.Linear(2, 2, device="meta") - assert not _has_meta_device_parameters(module) - assert _has_meta_device_parameters(meta_module) - assert _has_meta_device_parameters(nn.Sequential(module, meta_module, nn.ReLU())) - # optim.Optimizer - optimizer = torch.optim.SGD(module.parameters(), lr=0.1) - meta_optimizer = torch.optim.SGD(meta_module.parameters(), lr=0.1) - assert not _has_meta_device_parameters(optimizer) - assert _has_meta_device_parameters(meta_optimizer) - # unsupported objects - with pytest.raises(TypeError, match="Expected `torch.nn.Module` or `torch.optim.Optimizer`"): - _has_meta_device_parameters(None) - - @pytest.mark.parametrize("torch_ge_2_1", [True, False]) @mock.patch("torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.set_state_dict_type") def test_get_full_state_dict_context_offload(set_type_mock, monkeypatch, torch_ge_2_1): diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index 88d015c5cd138..b2246892156f1 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -122,7 +122,7 @@ def get_model(self): @RunIf(min_cuda_gpus=2, standalone=True) @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))]) @pytest.mark.parametrize("manual_wrapping", [True, False]) -def test_fsdp_train_save_load(tmp_path, manual_wrapping, precision): +def test_train_save_load(tmp_path, manual_wrapping, precision): """Test FSDP training, saving and loading with different wrapping and precision settings.""" trainer_cls = _TrainerManualWrapping if manual_wrapping else _Trainer fabric = Fabric( @@ -175,7 +175,7 @@ def test_fsdp_train_save_load(tmp_path, manual_wrapping, precision): @RunIf(min_cuda_gpus=2, standalone=True) -def test_fsdp_save_full_state_dict(tmp_path): +def test_save_full_state_dict(tmp_path): """Test that FSDP saves the full state into a single file with `state_dict_type="full"`.""" fabric = Fabric( 
accelerator="cuda", @@ -289,7 +289,7 @@ def test_fsdp_save_full_state_dict(tmp_path): @RunIf(min_cuda_gpus=2, standalone=True) -def test_fsdp_load_full_state_dict_into_sharded_model(tmp_path): +def test_load_full_state_dict_into_sharded_model(tmp_path): """Test that the strategy can load a full-state checkpoint into a FSDP sharded model.""" from torch.distributed.fsdp import FullyShardedDataParallel as FSDP @@ -475,7 +475,7 @@ def _run_setup_assertions(empty_init, expected_device): @RunIf(min_cuda_gpus=2, standalone=True) -def test_fsdp_save_filter(tmp_path): +def test_save_filter(tmp_path): fabric = Fabric(accelerator="cuda", strategy=FSDPStrategy(state_dict_type="full"), devices=2) fabric.launch() model = nn.Linear(32, 2) @@ -498,7 +498,7 @@ def test_fsdp_save_filter(tmp_path): @RunIf(min_cuda_gpus=1) -def test_fsdp_manual_activation_checkpointing(): +def test_manual_activation_checkpointing(): model = torch.nn.Sequential(torch.nn.Linear(1, 1), torch.nn.Linear(1, 1)) strategy = FSDPStrategy(activation_checkpointing_policy={torch.nn.Linear}) fabric = Fabric(devices=1, accelerator="cuda", strategy=strategy) diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py new file mode 100644 index 0000000000000..c2a31e2489d7c --- /dev/null +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -0,0 +1,228 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
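+
+# Note: throughout these unit tests the identity function `lambda m, _: m` stands in for a
+# real `parallelize_fn(module, device_mesh)`; the strategy only requires that the callable
+# returns an `nn.Module`.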
+from datetime import timedelta +from re import escape +from unittest import mock +from unittest.mock import Mock + +import pytest +import torch +import torch.nn as nn +from lightning.fabric.plugins.environments import LightningEnvironment +from lightning.fabric.strategies import ModelParallelStrategy +from lightning.fabric.strategies.model_parallel import _ParallelBackwardSyncControl +from torch.optim import Adam + +from tests_fabric.helpers.runif import RunIf + + +@mock.patch("lightning.fabric.strategies.model_parallel._TORCH_GREATER_EQUAL_2_3", False) +def test_torch_greater_equal_2_3(): + with pytest.raises(ImportError, match="ModelParallelStrategy requires PyTorch 2.3 or higher"): + ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + + +@RunIf(min_torch="2.3") +def test_device_mesh_access(): + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + with pytest.raises(RuntimeError, match="Accessing the device mesh .* not allowed"): + _ = strategy.device_mesh + + +@RunIf(min_torch="2.3") +@pytest.mark.parametrize( + ("num_nodes", "devices", "invalid_dp_size", "invalid_tp_size"), + [ + (1, 4, 1, 1), + (1, 4, 2, 3), + (1, 4, 4, 2), + (2, 4, 1, 4), + (2, 4, 2, 1), + ], +) +def test_validate_device_mesh_dimensions(num_nodes, devices, invalid_dp_size, invalid_tp_size): + """Test passing sizes that don't multiply to the world size raises an error.""" + strategy = ModelParallelStrategy( + parallelize_fn=(lambda m, _: m), + data_parallel_size=invalid_dp_size, + tensor_parallel_size=invalid_tp_size, + ) + strategy._setup_distributed = Mock() + strategy._accelerator = Mock() + strategy.cluster_environment = Mock( + world_size=Mock(return_value=(num_nodes * devices)), local_rank=Mock(return_value=1) + ) + strategy.parallel_devices = [torch.device("cpu")] * devices + strategy.num_nodes = num_nodes + with pytest.raises(RuntimeError, match="multiplied should equal the world size"): + strategy.setup_environment() + + +@RunIf(min_torch="2.3") +def test_checkpoint_io_unsupported(): + """Test that the ModelParallel strategy does not support the `CheckpointIO` plugin.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + with pytest.raises(NotImplementedError, match="does not use the `CheckpointIO` plugin"): + _ = strategy.checkpoint_io + + with pytest.raises(NotImplementedError, match="does not support setting a `CheckpointIO` plugin"): + strategy.checkpoint_io = Mock() + + +@RunIf(min_torch="2.3") +def test_save_filter_unsupported(tmp_path): + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + with pytest.raises(NotImplementedError, match="does not yet support the `filter` argument"): + strategy.save_checkpoint(tmp_path / "checkpoint.pth", state={}, filter=Mock()) + + +@RunIf(min_torch="2.3") +def test_load_raw_unsupported(tmp_path): + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + model = nn.Linear(2, 2) + optimizer = Adam(model.parameters()) + with pytest.raises(NotImplementedError, match="object from a checkpoint directly is not yet supported"): + strategy.load_checkpoint(tmp_path / "checkpoint.pth", state=model) + with pytest.raises(NotImplementedError, match="object from a checkpoint directly is not yet supported"): + strategy.load_checkpoint(tmp_path / "checkpoint.pth", state=optimizer) + + +@RunIf(min_torch="2.3") +def test_load_non_strict_unsupported(tmp_path): + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + with pytest.raises(NotImplementedError, match="Non-strict loading is not yet supported"): + 
strategy.load_checkpoint(tmp_path / "checkpoint.pth", state={}, strict=False) + + +@RunIf(min_torch="2.3") +def test_fsdp_v1_modules_unsupported(): + """Test that the strategy won't allow setting up a module wrapped with the legacy FSDP API.""" + from torch.distributed.fsdp import FullyShardedDataParallel + + module = Mock(modules=Mock(return_value=[Mock(spec=FullyShardedDataParallel)])) + strategy = ModelParallelStrategy(parallelize_fn=(lambda x, _: x)) + with pytest.raises(TypeError, match="only supports the new FSDP2 APIs in PyTorch >= 2.3"): + strategy.setup_module(module) + + +@RunIf(min_torch="2.3") +def test_parallelize_fn_call(): + model = nn.Linear(2, 2) + optimizer = Adam(model.parameters()) + + parallel_model_mock = Mock(spec=nn.Module, parameters=Mock(return_value=[]), buffers=Mock(return_value=[])) + parallelize_fn = Mock(return_value=parallel_model_mock) + strategy = ModelParallelStrategy(parallelize_fn=parallelize_fn) + strategy._device_mesh = Mock() + strategy.parallel_devices = [torch.device("cpu")] + model_setup, [optimizer_setup] = strategy.setup_module_and_optimizers(model, [optimizer]) + assert model_setup is parallel_model_mock + assert optimizer_setup is optimizer + parallelize_fn.assert_called_with(model, strategy.device_mesh) + + # Raises an error if parallelize_fn does not return a module + parallelize_fn = Mock(return_value=None) + strategy = ModelParallelStrategy(parallelize_fn=parallelize_fn) + strategy._device_mesh = Mock() + strategy.parallel_devices = [torch.device("cpu")] + with pytest.raises(TypeError, match="The `parallelize_fn` must return a `nn.Module` instance"): + strategy.setup_module_and_optimizers(model, [optimizer]) + + +@RunIf(min_torch="2.3") +def test_no_backward_sync(): + """Test that the backward sync control calls `.no_sync()`, and only on a module wrapped in + FullyShardedDataParallel.""" + from torch.distributed._composable.fsdp import FSDP + + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + assert isinstance(strategy._backward_sync_control, _ParallelBackwardSyncControl) + + fsdp_layer = Mock(spec=FSDP) + other_layer = nn.Linear(2, 2) + module = Mock() + module.modules = Mock(return_value=[fsdp_layer, other_layer]) + + with strategy._backward_sync_control.no_backward_sync(module, True): + fsdp_layer.set_requires_gradient_sync.assert_called_with(False, recurse=False) + fsdp_layer.set_requires_gradient_sync.assert_called_with(True, recurse=False) + + with strategy._backward_sync_control.no_backward_sync(module, False): + fsdp_layer.set_requires_gradient_sync.assert_called_with(True, recurse=False) + fsdp_layer.set_requires_gradient_sync.assert_called_with(False, recurse=False) + + +@RunIf(min_torch="2.3") +def test_save_checkpoint_storage_options(tmp_path): + """Test that the FSDP strategy does not accept storage options for saving checkpoints.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + with pytest.raises( + TypeError, match=escape("ModelParallelStrategy.save_checkpoint(..., storage_options=...)` is not") + ): + strategy.save_checkpoint(path=tmp_path, state=Mock(), storage_options=Mock()) + + +@RunIf(min_torch="2.3") +@mock.patch("lightning.fabric.strategies.ModelParallelStrategy._setup_device_mesh") +@mock.patch("torch.distributed.init_process_group") +def test_set_timeout(init_process_group_mock, _): + """Test that the timeout gets passed to the ``torch.distributed.init_process_group`` function.""" + test_timedelta = timedelta(seconds=30) + strategy = 
ModelParallelStrategy(parallelize_fn=(lambda m, _: m), timeout=test_timedelta) + strategy.parallel_devices = [torch.device("cpu")] + strategy.cluster_environment = LightningEnvironment() + strategy.accelerator = Mock() + strategy.setup_environment() + process_group_backend = strategy._get_process_group_backend() + global_rank = strategy.cluster_environment.global_rank() + world_size = strategy.cluster_environment.world_size() + init_process_group_mock.assert_called_with( + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + ) + + +@RunIf(min_torch="2.3") +def test_meta_device_materialization(): + """Test that the `setup_module()` method materializes meta-device tensors in the module.""" + + class NoResetParameters(nn.Module): + def __init__(self): + super().__init__() + self.weight = nn.Parameter(torch.ones(4, 4)) + + class CustomModel(nn.Module): + def __init__(self): + super().__init__() + # nn.Sequential as a parameterless module + self.layer1 = nn.Sequential(NoResetParameters(), NoResetParameters()) + self.layer2 = nn.Linear(4, 4) + self.register_buffer("buffer", torch.rand(2)) + + def reset_parameters(self): + self.buffer.fill_(1.0) + + strategy = ModelParallelStrategy(parallelize_fn=(lambda x, _: x)) + strategy._device_mesh = Mock() + strategy._parallel_devices = [torch.device("cpu")] + + with torch.device("meta"): + model = CustomModel() + assert model.layer1[0].weight.is_meta + assert model.layer2.weight.is_meta + assert model.buffer.is_meta + + with pytest.warns(UserWarning, match=r"`reset_parameters\(\)` method for re-initialization: NoResetParameters"): + model = strategy.setup_module(model) + assert all(not p.is_meta for p in model.parameters()) + assert all(not b.is_meta for b in model.buffers()) diff --git a/tests/tests_fabric/strategies/test_model_parallel_integration.py b/tests/tests_fabric/strategies/test_model_parallel_integration.py new file mode 100644 index 0000000000000..589560e7fea4f --- /dev/null +++ b/tests/tests_fabric/strategies/test_model_parallel_integration.py @@ -0,0 +1,488 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
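+
+# These integration tests exercise the strategy end to end: each `_parallelize_*` helper below
+# receives the strategy's 2D ("data_parallel", "tensor_parallel") device mesh and returns the
+# module with tensor-parallel and/or FSDP2 sharding applied.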
+import os +from pathlib import Path +from unittest import mock + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from lightning.fabric import Fabric +from lightning.fabric.strategies import ModelParallelStrategy +from lightning.fabric.utilities.load import _load_distributed_checkpoint +from torch.utils.data import DataLoader, DistributedSampler + +from tests_fabric.helpers.datasets import RandomDataset +from tests_fabric.helpers.runif import RunIf + + +@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=4) +def test_setup_device_mesh(): + from torch.distributed.device_mesh import DeviceMesh + + for dp_size, tp_size in ((1, 4), (4, 1), (2, 2)): + strategy = ModelParallelStrategy( + parallelize_fn=(lambda m, _: m), + data_parallel_size=dp_size, + tensor_parallel_size=tp_size, + ) + fabric = Fabric(accelerator="auto", devices=4, strategy=strategy) + fabric.launch() + + device_mesh = fabric.strategy.device_mesh + assert isinstance(device_mesh, DeviceMesh) + assert device_mesh.device_type == fabric.device.type + assert device_mesh.mesh_dim_names == ("data_parallel", "tensor_parallel") + assert device_mesh.size(0) == dp_size + assert device_mesh.size(1) == tp_size + assert device_mesh.ndim == 2 + + fabric.barrier() + + # Passing "auto" will select internode and intranode dimensions automatically + strategy = ModelParallelStrategy( + parallelize_fn=(lambda m, _: m), + data_parallel_size="auto", + tensor_parallel_size="auto", + ) + fabric = Fabric(accelerator="auto", devices=4, num_nodes=1, strategy=strategy) + fabric.launch() + assert fabric.strategy.device_mesh.mesh_dim_names == ("data_parallel", "tensor_parallel") + assert fabric.strategy.device_mesh.size(0) == 1 + assert fabric.strategy.device_mesh.size(1) == 4 + + +class FeedForward(nn.Module): + def __init__(self): + super().__init__() + self.w1 = nn.Linear(32, 64) + self.w2 = nn.Linear(32, 64) + self.w3 = nn.Linear(64, 32) + + def forward(self, x): + return self.w3(F.silu(self.w1(x)) * self.w2(x)) + + +def _parallelize_feed_forward_tp(model, device_mesh): + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module + + tp_mesh = device_mesh["tensor_parallel"] + tp_plan = { + "w1": ColwiseParallel(), + "w2": ColwiseParallel(), + "w3": RowwiseParallel(), + } + parallelize_module(model, tp_mesh, tp_plan) + return model + + +def _parallelize_feed_forward_fsdp2(model, device_mesh): + from torch.distributed._composable.fsdp.fully_shard import fully_shard + from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper + + dp_mesh = device_mesh["data_parallel"] + assert dp_mesh.ndim == 1 # Hybrid-sharding not supported + + # Fully-shard each layer + fully_shard(model.w1, mesh=dp_mesh) + fully_shard(model.w2, mesh=dp_mesh) + fully_shard(model.w3, mesh=dp_mesh) + + # Activation checkpointing + model = checkpoint_wrapper(model) + + return model + + +def _parallelize_feed_forward_fsdp2_tp(model, device_mesh): + model = _parallelize_feed_forward_tp(model, device_mesh) + model = _parallelize_feed_forward_fsdp2(model, device_mesh) + return model + + +@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=2) +def test_tensor_parallel(): + from torch.distributed._tensor import DTensor + + strategy = ModelParallelStrategy(parallelize_fn=_parallelize_feed_forward_tp) + fabric = Fabric(accelerator="auto", devices=2, strategy=strategy) + fabric.launch() + + fabric.seed_everything(0) + + with fabric.init_module(empty_init=True): + model = FeedForward() 
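+        # With `empty_init=True` the module is created on the meta device, so no real memory is
+        # allocated here; `fabric.setup()` applies the parallelization plan and materializes it.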
+ + model = fabric.setup(model) + optimizer = torch.optim.AdamW(model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) + + device_mesh = fabric.strategy.device_mesh + assert all(tensor.device_mesh == device_mesh["tensor_parallel"] for tensor in optimizer.param_groups[0]["params"]) + assert all(isinstance(weight, DTensor) for weight in model.parameters()) + assert model.w1.weight.device_mesh == device_mesh["tensor_parallel"] + + dataset_size = 6 + dataset = RandomDataset(32, dataset_size) + dataloader = DataLoader(dataset, batch_size=2) + dataloader = fabric.setup_dataloaders(dataloader) + + # No data sharding, all GPUs get the same input inside a TP group + assert len(dataloader) == dataset_size // dataloader.batch_size + assert isinstance(dataloader.sampler, DistributedSampler) + + for _, batch in enumerate(dataloader): + # All batches must be identical across TP group + batches = fabric.all_gather(batch) + assert all(torch.equal(batches[0], batches[i]) for i in range(1, len(batches))) + + output = model(batch) + fabric.backward(output.sum()) + assert isinstance(model.w1.weight.grad, DTensor) + assert model.w1.weight.grad.device_mesh == device_mesh["tensor_parallel"] + optimizer.step() + optimizer.zero_grad() + + +@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=4) +def test_fsdp2_tensor_parallel(): + from torch.distributed._tensor import DTensor + + strategy = ModelParallelStrategy( + parallelize_fn=_parallelize_feed_forward_fsdp2_tp, + data_parallel_size=2, + tensor_parallel_size=2, + ) + fabric = Fabric(accelerator="auto", devices=4, strategy=strategy) + fabric.launch() + + fabric.seed_everything(0) + + with fabric.init_module(empty_init=True): + model = FeedForward() + + model = fabric.setup(model) + optimizer = torch.optim.AdamW(model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) + + assert all(isinstance(weight, DTensor) for weight in model.parameters()) + assert all(isinstance(tensor, DTensor) for tensor in optimizer.param_groups[0]["params"]) + assert model.w1.weight.device_mesh.ndim == 2 + assert model.w1.weight.device_mesh.size(0) == 2 + assert model.w1.weight.device_mesh.size(1) == 2 + assert all(weight.device.type != "meta" for weight in model.parameters()) + assert all(tensor.device_mesh.ndim == 2 for tensor in optimizer.param_groups[0]["params"]) + assert all(tensor.device.type != "meta" for tensor in optimizer.param_groups[0]["params"]) + + dataset_size = 8 + dataset = RandomDataset(32, dataset_size) + dataloader = DataLoader(dataset, batch_size=2) + dataloader = fabric.setup_dataloaders(dataloader) + + # No data sharding across TP dimension, sharding across data-parallel dimension only + device_mesh = fabric.strategy.device_mesh + dp_mesh = device_mesh["data_parallel"] + tp_mesh = device_mesh["tensor_parallel"] + assert len(dataloader) == dataset_size // dataloader.batch_size // dp_mesh.size() + assert isinstance(dataloader.sampler, DistributedSampler) + + for _, batch in enumerate(dataloader): + batches = fabric.all_gather(batch) + # Batches across the TP dimension must be identical + batches_tp = batches[tp_mesh.mesh] + assert all(torch.equal(batches_tp[0], batches_tp[i]) for i in range(1, len(batches_tp))) + # Batches across the DP dimension must be different + batches_dp = batches[dp_mesh.mesh] + assert all(not torch.equal(batches_dp[0], batches_dp[i]) for i in range(1, len(batches_dp))) + + output = model(batch) + fabric.backward(output.sum()) + assert isinstance(model.w1.weight.grad, DTensor) + assert 
model.w1.weight.grad.device_mesh == device_mesh + optimizer.step() + optimizer.zero_grad() + + +@RunIf(min_torch="2.3", min_cuda_gpus=4, standalone=True) +@pytest.mark.parametrize( + "precision", + [ + pytest.param( + "16-mixed", marks=pytest.mark.xfail(reason="Precision plugin does not implement ShardedGradScaler yet") + ), + pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True)), + ], +) +def test_train_save_load(precision, tmp_path): + """Test 2D-parallel training, saving and loading precision settings.""" + strategy = ModelParallelStrategy( + _parallelize_feed_forward_fsdp2_tp, + data_parallel_size=2, + tensor_parallel_size=2, + ) + fabric = Fabric(accelerator="cuda", devices=4, strategy=strategy, precision=precision) + fabric.launch() + + fabric.seed_everything(0) + with fabric.init_module(empty_init=True): + model = FeedForward() + model = fabric.setup(model) + optimizer = torch.optim.AdamW(model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) + output = model(torch.rand(2, 32, device=fabric.device)) + fabric.backward(output.sum()) + optimizer.step() + optimizer.zero_grad() + + checkpoint_path = fabric.broadcast(str(tmp_path / "dist-checkpoint")) + + params_before = [p.full_tensor().clone() for p in model.parameters()] + state = {"model": model, "optimizer": optimizer, "steps": 1} + fabric.save(checkpoint_path, state) + assert set(os.listdir(checkpoint_path)) == { + ".metadata", + "__0_0.distcp", + "__1_0.distcp", + "__2_0.distcp", + "__3_0.distcp", + } + + # re-init all objects and resume + strategy = ModelParallelStrategy( + _parallelize_feed_forward_fsdp2_tp, + data_parallel_size=2, + tensor_parallel_size=2, + ) + fabric = Fabric(accelerator="cuda", devices=4, strategy=strategy, precision=precision) + fabric.launch() + + fabric.seed_everything(0) + with fabric.init_module(empty_init=True): + model = FeedForward() + model = fabric.setup(model) + optimizer = torch.optim.AdamW(model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) + output = model(torch.rand(2, 32, device=fabric.device)) + fabric.backward(output.sum()) + optimizer.step() + optimizer.zero_grad() + + # check correctness with loaded state + state = {"model": model, "optimizer": optimizer, "steps": 0} + metadata = fabric.load(checkpoint_path, state) + for p0, p1 in zip(params_before, (p.full_tensor() for p in model.parameters())): + torch.testing.assert_close(p0, p1, atol=0, rtol=0, equal_nan=True) + + # check user data in state reloaded + # TODO: This should be 1, torch.distributed.checkpoint only supports tensor data + assert state["steps"] == 0 + assert not metadata + + # TODO: Test strict and non-strict loading here once supported + # attempt to load a key not in the metadata checkpoint + # state = {"model": model, "coconut": 11} + # with pytest.raises(KeyError, match="The requested state contains a key 'coconut' that does not exist"): + # fabric.load(checkpoint_path, state) + + # # `strict=False` ignores the missing key + # state = {"model": trainer.model, "coconut": 11} + # fabric.load(checkpoint_path, state, strict=False) + # assert state["coconut"] == 11 + + +@RunIf(min_torch="2.3", min_cuda_gpus=2, skip_windows=True, standalone=True) +@pytest.mark.parametrize("move_to_device", [True, False]) +@mock.patch("lightning.fabric.wrappers._FabricModule") +def test_setup_module_move_to_device(fabric_module_mock, move_to_device): + """Test that `move_to_device` does nothing, ModelParallel decides which device parameters get moved to which device + (sharding).""" + from 
torch.distributed._tensor import DTensor + + strategy = ModelParallelStrategy(parallelize_fn=_parallelize_feed_forward_fsdp2) + fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy) + fabric.launch() + + model = FeedForward() + fabric_model = fabric.setup_module(model, move_to_device=move_to_device) + fabric_module_mock.assert_not_called() + + # the linear layer got sharded and each part is on the expected device + assert fabric_model.w1.weight.device == torch.device("cuda", fabric.local_rank) + assert isinstance(fabric_model.w1.weight, DTensor) + + # The _DeviceDtypeModuleMixin currently can't represent the device in a meaningful way for models with pieces on + # different devices + assert fabric_model.device == torch.device("cuda", fabric.local_rank) + assert fabric.device == torch.device("cuda", fabric.local_rank) + + +@RunIf(min_torch="2.3", min_cuda_gpus=2, skip_windows=True, standalone=True) +@pytest.mark.parametrize( + ("precision", "expected_dtype"), + [ + ("32-true", torch.float32), + ("16-true", torch.float16), + pytest.param("bf16-true", torch.bfloat16, marks=RunIf(bf16_cuda=True)), + ], +) +def test_module_init_context(precision, expected_dtype): + """Test that the module under the init-context gets moved to the right device and dtype.""" + strategy = ModelParallelStrategy(parallelize_fn=_parallelize_feed_forward_fsdp2) + fabric = Fabric(accelerator="cuda", devices=2, strategy=strategy, precision=precision) + fabric.launch() + + def _run_setup_assertions(empty_init, expected_device): + with fabric.init_module(empty_init=empty_init): + model = FeedForward() + + # The model is on the CPU/meta-device until after `.setup()`` + assert all(weight.device == expected_device for weight in model.parameters()) + assert all(weight.dtype == expected_dtype for weight in model.parameters()) + model = fabric.setup(model) + # Parameters get sharded in `.setup()` and moved to the target device + assert all(weight.device == torch.device("cuda", fabric.local_rank) for weight in model.parameters()) + assert all(weight.dtype == expected_dtype for weight in model.parameters()) + + _run_setup_assertions(empty_init=False, expected_device=torch.device("cpu")) + _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) + + +def _parallelize_single_linear_tp_fsdp2(model, device_mesh): + from torch.distributed._composable.fsdp.fully_shard import fully_shard + from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module + + dp_mesh = device_mesh["data_parallel"] + tp_mesh = device_mesh["tensor_parallel"] + + parallelize_module(model, tp_mesh, ColwiseParallel()) + fully_shard(model, mesh=dp_mesh) + return model + + +@RunIf(min_torch="2.3", min_cuda_gpus=2, standalone=True) +@pytest.mark.parametrize( + "precision", + [ + "32-true", + pytest.param("16-mixed"), + pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True)), + ], +) +@pytest.mark.parametrize( + "clip_type", + [ + pytest.param("norm", marks=pytest.mark.skip("Gradient clipping by norm is not correct.")), + pytest.param( + "val", + marks=pytest.mark.xfail( + raises=RecursionError, strict=False, reason="Recursion error when clipping DTensor" + ), + ), + ], +) +def test_clip_gradients(clip_type, precision): + if clip_type == "norm" and precision == "16-mixed": + pytest.skip(reason="Clipping by norm with 16-mixed is numerically unstable.") + + strategy = ModelParallelStrategy(_parallelize_single_linear_tp_fsdp2) + fabric = Fabric(accelerator="auto", devices=2, precision=precision, strategy=strategy) + 
fabric.launch() + + in_features, out_features = 32, 2 + model = torch.nn.Linear(in_features, out_features, bias=False) + model.weight.data.fill_(0.01) + + model = fabric.setup(model) + optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + optimizer = fabric.setup_optimizers(optimizer) + + batch = torch.full((1, in_features), 0.1, device=fabric.device) + loss = model(batch).sum() + + # The example is constructed such that the gradients are all the same + fabric.backward(loss) + + if clip_type == "norm": + norm = torch.linalg.vector_norm(model.weight.grad.full_tensor().detach().cpu(), 2, dtype=torch.float32).item() + new_norm = norm / 10 + fabric.clip_gradients(model, optimizer, max_norm=new_norm * 10) + assert torch.allclose( + torch.linalg.vector_norm(model.weight.grad.full_tensor().detach().cpu(), 2, dtype=torch.float32), + torch.tensor(new_norm), + ) + elif clip_type == "val": + val = model.weight.grad.full_tensor()[0, 0].item() + new_val = val / 2.0 + fabric.clip_gradients(model, optimizer, clip_val=new_val) + assert torch.allclose( + model.weight.grad.full_tensor(), torch.full_like(model.weight.grad.full_tensor(), new_val) + ) + else: + raise AssertionError(f"Unknown clip type: {clip_type}") + + optimizer.step() + optimizer.zero_grad() + + +# TODO: Support loading full checkpoint +@pytest.mark.xfail(raises=NotADirectoryError, reason="Loading from full checkpoint not supported yet.") +@RunIf(min_torch="2.3", min_cuda_gpus=4, standalone=True) +def test_save_sharded_and_consolidate_and_load(tmp_path): + """Test the consolidation of a distributed (DTensor) checkpoint into a single file.""" + strategy = ModelParallelStrategy( + _parallelize_feed_forward_fsdp2_tp, + data_parallel_size=2, + tensor_parallel_size=2, + ) + fabric = Fabric(accelerator="cuda", devices=4, strategy=strategy) + fabric.launch() + + model = FeedForward() + optimizer = torch.optim.Adam(model.parameters()) + model, optimizer = fabric.setup(model, optimizer) + state = {"model": model, "optimizer": optimizer, "steps": 1} + + # run one iteration to init the state of the optimizer + loss = model(torch.rand(1, 32, device=fabric.device)).sum() + fabric.backward(loss) + optimizer.step() + + checkpoint_path_sharded = fabric.broadcast(str(tmp_path / "checkpoint_sharded")) + fabric.save(checkpoint_path_sharded, state) + assert set(os.listdir(checkpoint_path_sharded)) == { + ".metadata", + "__0_0.distcp", + "__1_0.distcp", + "__2_0.distcp", + "__3_0.distcp", + } + + # consolidate the checkpoint to a single file + checkpoint_path_full = fabric.broadcast(str(tmp_path / "checkpoint_full.pt")) + if fabric.global_rank == 0: + checkpoint = _load_distributed_checkpoint(Path(checkpoint_path_sharded)) + torch.save(checkpoint, checkpoint_path_full) + fabric.barrier() + + # re-init and load from full checkpoint + strategy = ModelParallelStrategy(_parallelize_feed_forward_fsdp2_tp) + fabric = Fabric(accelerator="cuda", devices=4, strategy=strategy) + fabric.launch() + + model = FeedForward() + optimizer = torch.optim.Adam(model.parameters()) + model, optimizer = fabric.setup(model, optimizer) + state = {"model": model, "optimizer": optimizer, "steps": 1} + fabric.load(checkpoint_path_full, state) diff --git a/tests/tests_fabric/utilities/test_init.py b/tests/tests_fabric/utilities/test_init.py index 25d53abd7d261..bdbca90495561 100644 --- a/tests/tests_fabric/utilities/test_init.py +++ b/tests/tests_fabric/utilities/test_init.py @@ -16,7 +16,11 @@ import pytest import torch.nn -from lightning.fabric.utilities.init import _EmptyInit, 
_materialize_meta_tensors +from lightning.fabric.utilities.init import ( + _EmptyInit, + _has_meta_device_parameters_or_buffers, + _materialize_meta_tensors, +) from tests_fabric.helpers.runif import RunIf @@ -85,3 +89,30 @@ def reset_parameters(self): assert model.buf.device.type == "cpu" assert len(list(model.parameters())) == 4 assert all(p.device.type == "cpu" for p in model.parameters()) + + +def test_has_meta_device_parameters_or_buffers(): + """Test that the `_has_meta_device_parameters_or_buffers` function can find meta-device parameters in models and + optimizers.""" + + class BufferModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.register_buffer("buffer", torch.ones(2, device="meta")) + + # nn.Module + module = torch.nn.Linear(2, 2) + meta_module = torch.nn.Linear(2, 2, device="meta") + buffer_meta_module = BufferModule() + assert not _has_meta_device_parameters_or_buffers(module) + assert _has_meta_device_parameters_or_buffers(meta_module) + assert _has_meta_device_parameters_or_buffers(torch.nn.Sequential(module, meta_module, torch.nn.ReLU())) + assert _has_meta_device_parameters_or_buffers(buffer_meta_module) + # optim.Optimizer + optimizer = torch.optim.SGD(module.parameters(), lr=0.1) + meta_optimizer = torch.optim.SGD(meta_module.parameters(), lr=0.1) + assert not _has_meta_device_parameters_or_buffers(optimizer) + assert _has_meta_device_parameters_or_buffers(meta_optimizer) + # unsupported objects + with pytest.raises(TypeError, match="Expected `torch.nn.Module` or `torch.optim.Optimizer`"): + _has_meta_device_parameters_or_buffers(None) From e0307277a03c0822c26b525c1cdfa71425ed0214 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 8 May 2024 22:58:33 +0200 Subject: [PATCH 023/179] Add function to explicitly mark forward methods in Fabric (#19690) Co-authored-by: Sebastian Raschka --- docs/source-fabric/api/fabric_methods.rst | 2 +- docs/source-fabric/api/wrappers.rst | 147 ++++++++++++++++++++++ docs/source-fabric/glossary/index.rst | 6 + src/lightning/fabric/CHANGELOG.md | 4 +- src/lightning/fabric/wrappers.py | 28 ++++- tests/tests_fabric/test_wrappers.py | 80 ++++++++++-- 6 files changed, 250 insertions(+), 17 deletions(-) create mode 100644 docs/source-fabric/api/wrappers.rst diff --git a/docs/source-fabric/api/fabric_methods.rst b/docs/source-fabric/api/fabric_methods.rst index 568c92808ad9b..87b22578c1202 100644 --- a/docs/source-fabric/api/fabric_methods.rst +++ b/docs/source-fabric/api/fabric_methods.rst @@ -49,7 +49,7 @@ Moves the model and optimizer to the correct device automatically. The setup method also prepares the model for the selected precision choice so that operations during ``forward()`` get -cast automatically. +cast automatically. Advanced users should read :doc:`the notes on models wrapped by Fabric <../api/wrappers>`. setup_dataloaders ================= diff --git a/docs/source-fabric/api/wrappers.rst b/docs/source-fabric/api/wrappers.rst new file mode 100644 index 0000000000000..e87874eb08666 --- /dev/null +++ b/docs/source-fabric/api/wrappers.rst @@ -0,0 +1,147 @@ +######################## +Models wrapped by Fabric +######################## + +When you :doc:`set up <../api/fabric_methods>` a model in Fabric, it gets automatically wrapped by a new module, the ``FabricModule``: + +.. 
code-block:: python
+
+    import torch
+    import lightning as L
+
+    fabric = L.Fabric()
+    model = torch.nn.Linear(10, 2)
+    model = fabric.setup(model)
+
+    print(type(model))  # <class 'lightning.fabric.wrappers._FabricModule'>
+
+This wrapper module takes care of a few things for you, notably:
+
+- Strategy: Handles strategy-specific logic for the forward method (DDP, FSDP, etc.).
+- Precision: Inputs and outputs passed through ``forward`` get automatically converted to the right precision depending on the ``Fabric(precision=...)`` setting.
+- Device: The wrapper remembers which device the model is on. You can access it with ``model.device``.
+
+.. note::
+    The ``FabricModule`` wrapper is completely transparent and most users will never need to interact with it directly.
+
+Below we describe a few functions and properties of the wrapper for advanced use cases.
+This might be useful if you are building a custom Trainer using Fabric as the core.
+
+
+----
+
+
+********************************
+Accessing methods and attributes
+********************************
+
+Access to methods and attributes gets redirected to the original model automatically:
+
+.. code-block:: python
+
+    import torch
+    import lightning as L
+
+    fabric = L.Fabric()
+    model = torch.nn.Linear(10, 2)
+    fabric_model = fabric.setup(model)
+
+    # You can access attributes and methods normally
+    print(fabric_model.weight is model.weight)  # True
+
+
+----
+
+
+********************
+Unwrapping the model
+********************
+
+You can check whether a model is wrapped in a ``FabricModule`` with the ``is_wrapped`` utility function:
+
+.. code-block:: python
+
+    import torch
+    import lightning as L
+    from lightning.fabric import is_wrapped
+
+    fabric = L.Fabric()
+    model = torch.nn.Linear(10, 2)
+    fabric_model = fabric.setup(model)
+
+    print(is_wrapped(model))  # False
+    print(is_wrapped(fabric_model))  # True
+
+
+If you ever need to, you can access the original model explicitly via ``.module``:
+
+.. code-block:: python
+
+    # Access the original model explicitly
+    original_model = fabric_model.module
+
+    print(original_model is model)  # True
+
+
+----
+
+
+************************************************
+Using methods other than forward for computation
+************************************************
+
+PyTorch's ``nn.Modules`` have a special contract you need to follow when using them for training: your forward computation has to be defined in the **forward** method, and you should call this forward method directly.
+But sometimes your model may need to define different flavors of ``forward``, like in this example below where the regular ``forward`` is used for training, but the ``generate`` method does something slightly different for inference:
+
+.. code-block:: python
+
+    import torch
+    import lightning as L
+
+
+    class MyModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.layer = torch.nn.Linear(10, 2)
+
+        def forward(self, x):
+            return self.layer(x)
+
+        def generate(self):
+            sample = torch.randn(10)
+            return self(sample)
+
+
+If you were to run this model in Fabric with multiple devices (DDP or FSDP), you would get an error:
+
+.. code-block:: python
+
+    fabric = L.Fabric(accelerator="cpu", devices=2)
+    fabric.launch()
+    model = MyModel()
+    model = fabric.setup(model)
+
+    # OK: Calling the model directly
+    output = model(torch.randn(10))
+
+    # OK: Calling the model's forward (equivalent to the above)
+    output = model.forward(torch.randn(10))
+
+    # ERROR: Calling another method that calls forward indirectly
+    output = model.generate()
+
+Fabric raises an error here to inform the user of the incorrect usage, because indirect calls like this are normally not allowed in PyTorch and could lead to silent correctness bugs.
+If you want to use such methods, you need to mark them explicitly with ``.mark_forward_method()`` so that Fabric can do the rerouting through ``forward`` behind the scenes for you:
+
+.. code-block:: python
+
+    # You must mark special forward methods explicitly:
+    model.mark_forward_method(model.generate)
+
+    # Passing just the name is also sufficient
+    model.mark_forward_method("generate")
+
+    # OK: Fabric will do some rerouting behind the scenes now
+    output = model.generate()
+
+|
diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst
index b08bc4f830163..e229b4fe5c998 100644
--- a/docs/source-fabric/glossary/index.rst
+++ b/docs/source-fabric/glossary/index.rst
@@ -8,6 +8,7 @@ Glossary
 
    Checkpoint <../guide/checkpoint/index>
    Weights and Biases <../guide/loggers/wandb>
+   Wrappers <../api/wrappers>
 
 .. raw:: html
 
@@ -80,6 +81,11 @@ Glossary
    :button_link: ../fundamentals/accelerators.html
    :col_css: col-md-4
 
+.. displayitem::
+   :header: FabricModule
+   :button_link: ../api/wrappers.html
+   :col_css: col-md-4
+
 .. displayitem::
    :header: FSDP
    :button_link: ../advanced/model_parallel/fsdp.html
diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md
index 15e8ba16b6c72..3e31d4c8ed811 100644
--- a/src/lightning/fabric/CHANGELOG.md
+++ b/src/lightning/fabric/CHANGELOG.md
@@ -9,7 +9,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added -- Enabled consolidating distributed checkpoints through `fabric consolidate` in the new CLI [#19560](https://github.com/Lightning-AI/pytorch-lightning/pull/19560)) +- Enabled consolidating distributed checkpoints through `fabric consolidate` in the new CLI ([#19560](https://github.com/Lightning-AI/pytorch-lightning/pull/19560)) + +- Added the ability to explicitly mark forward methods in Fabric via `_FabricModule.mark_forward_method()` ([#19690](https://github.com/Lightning-AI/pytorch-lightning/pull/19690)) - Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) diff --git a/src/lightning/fabric/wrappers.py b/src/lightning/fabric/wrappers.py index f932750e14239..c57f1974a6bba 100644 --- a/src/lightning/fabric/wrappers.py +++ b/src/lightning/fabric/wrappers.py @@ -14,6 +14,7 @@ import inspect from copy import deepcopy from functools import partial, wraps +from types import MethodType from typing import ( Any, Callable, @@ -123,6 +124,7 @@ def __init__( self._forward_module = forward_module self._original_module = original_module or forward_module self._strategy = strategy + self._forward_methods = set(_LIGHTNING_MODULE_STEP_METHODS) self._fabric_module_initialized = True @property @@ -165,6 +167,20 @@ def load_state_dict( # type: ignore[override] ) -> _IncompatibleKeys: return self._original_module.load_state_dict(state_dict=state_dict, strict=strict, **kwargs) + def mark_forward_method(self, method: Union[MethodType, str]) -> None: + """Mark a method as a 'forward' method to prevent it bypassing the strategy wrapper (e.g., DDP).""" + if not isinstance(method, (MethodType, str)): + raise TypeError(f"Expected a method or a string, but got: {type(method).__name__}") + name = method if isinstance(method, str) else method.__name__ + if name == "forward": + raise ValueError("You cannot mark the forward method itself as a forward method.") + if not isinstance(getattr(self._original_module, name, None), MethodType): + raise AttributeError( + f"You marked '{name}' as a forward method, but `{type(self._original_module).__name__}.{name}` does not" + f" exist or is not a method." + ) + self._forward_methods.add(name) + def _redirection_through_forward(self, method_name: str) -> Callable: assert method_name != "forward" original_forward = self._original_module.forward @@ -207,8 +223,8 @@ def _wrapped_method(*args: Any, **kwargs: Any) -> Any: if module_called: raise RuntimeError( f"You are calling the method `{type(self._original_module).__name__}.{name}()` from outside the" - " model. This will bypass the wrapper from the strategy and result in incorrect behavior in" - " `.backward()`. You should pass your inputs through `forward()`.", + " model. To avoid issues with the currently selected strategy, explicitly mark it as a" + f" forward method with `fabric_model.mark_forward_method({name!r})` after `fabric.setup()`." 
) for handle in handles: handle.remove() @@ -231,8 +247,12 @@ def _register_backward_hook(self, tensor: Tensor) -> Tensor: @override def __getattr__(self, item: Any) -> Any: - if item in _LIGHTNING_MODULE_STEP_METHODS and self._forward_module != self._original_module: - # Special support for `LightningModule`, to prevent bypassing DDP's forward + if ( + item != "_forward_methods" + and item in self._forward_methods + and self._forward_module != self._original_module + ): + # Special support for methods marked by `mark_forward_method` to prevent bypassing DDP's forward return self._redirection_through_forward(item) try: diff --git a/tests/tests_fabric/test_wrappers.py b/tests/tests_fabric/test_wrappers.py index 599d8f085d16c..91f516d03a00c 100644 --- a/tests/tests_fabric/test_wrappers.py +++ b/tests/tests_fabric/test_wrappers.py @@ -102,15 +102,20 @@ def __init__(self, module): super().__init__() self.wrapped = module + def forward(self, *args, **kwargs): + return self.wrapped(*args, **kwargs) + # Regular case: forward_module == original_module -> no warnings original_module = OriginalModule() fabric_module = _FabricModule(forward_module=original_module, strategy=Mock(), original_module=original_module) assert fabric_module.method_without_module_invocation() == 100 - # Special case: original module wrapped by forward module: -> warn if method accepts args + # Special case: original module wrapped by forward module: -> error if method requires rerouting original_module = OriginalModule() wrapped_module = ModuleWrapper(original_module) - fabric_module = _FabricModule(forward_module=wrapped_module, strategy=Mock(), original_module=original_module) + fabric_module = _FabricModule( + forward_module=wrapped_module, strategy=Mock(precision=Precision()), original_module=original_module + ) assert fabric_module.method_without_module_invocation() == 100 with pytest.raises( RuntimeError, match=r"You are calling the method `OriginalModule.method_with_submodule_invocation\(\)` from" @@ -121,6 +126,51 @@ def __init__(self, module): ): assert fabric_module.method_with_self_invocation() == 102 + # No error if explicitly marked as forward method + fabric_module.mark_forward_method("method_with_self_invocation") + assert fabric_module.method_with_self_invocation() == 102 + + +def test_fabric_module_mark_forward_method(): + class OriginalModule(torch.nn.Module): + attribute = 1 + + def forward(self, x): + return x + + def special(self): + pass + + original_module = OriginalModule() + fabric_module = _FabricModule(original_module, Mock(), original_module=original_module) + + with pytest.raises(ValueError, match="You cannot mark the forward method itself"): + fabric_module.mark_forward_method("forward") + + with pytest.raises(AttributeError, match="`OriginalModule.not_exist` does not exist or is not a method."): + fabric_module.mark_forward_method("not_exist") + + with pytest.raises(AttributeError, match="`OriginalModule.attribute` does not exist or is not a method."): + fabric_module.mark_forward_method("attribute") + + def special(x): + return x + + with pytest.raises(TypeError, match="Expected a method or a string"): + fabric_module.mark_forward_method(special) + + lightning_module_methods = {"training_step", "validation_step", "test_step", "predict_step"} + assert fabric_module._forward_methods == lightning_module_methods + + # Mark via name + fabric_module.mark_forward_method("special") + assert fabric_module._forward_methods == {"special"} | lightning_module_methods + + # Mark by passing in the method 
itself + fabric_module = _FabricModule(original_module, Mock(), original_module=original_module) + fabric_module.mark_forward_method(original_module.special) + assert fabric_module._forward_methods == {"special"} | lightning_module_methods + def test_fabric_module_setattr(): """Test that setattr sets attributes on the original module.""" @@ -549,8 +599,8 @@ def test_unwrap_objects(compile): def test_step_method_redirection(): - """Test that the FabricModule redirects the special `LightningModule.*_step` methods through the forward- - module.""" + """Test that the FabricModule redirects methods marked as 'forward methods' through forward to avoid bypassing the + DDP/FSDP wrappers.""" class DDP(torch.nn.Module): def __init__(self, module): @@ -570,11 +620,11 @@ def training_step(self, arg, kwarg=None): assert kwarg == "train_kwarg" return "training_step_return" - def validation_step(self, arg, kwarg=None): + def marked_method(self, arg, kwarg=None): assert self() == "forward_return" - assert arg == "val_arg" - assert kwarg == "val_kwarg" - return "validation_step_return" + assert arg == "marked_arg" + assert kwarg == "marked_kwarg" + return "marked_method_return" def normal_method(self): pass @@ -602,10 +652,18 @@ def normal_method(self): assert original_module.forward.__name__ == "forward" # The special methods get redirected correctly to produce the expected output + strategy.precision.forward_context.reset_mock() assert fabric_module.training_step("train_arg", kwarg="train_kwarg") == "training_step_return" assert fabric_module.training_step("train_arg", kwarg="train_kwarg") == "training_step_return" # call 2nd time - assert fabric_module.validation_step("val_arg", kwarg="val_kwarg") == "validation_step_return" - strategy.precision.forward_context.assert_called() + assert strategy.precision.forward_context.call_count == 2 + + # Other methods must be marked explicitly to be redirected + strategy.precision.forward_context.reset_mock() + with pytest.raises(RuntimeError, match="You are calling the method .* from outside the model"): + fabric_module.marked_method("marked_arg", kwarg="marked_kwarg") + fabric_module.mark_forward_method("marked_method") + assert fabric_module.marked_method("marked_arg", kwarg="marked_kwarg") == "marked_method_return" + strategy.precision.forward_context.assert_called_once() # The forward method remains untouched/unpatched after the special methods have been called assert original_module.forward.__name__ == "forward" @@ -613,7 +671,7 @@ def normal_method(self): # Special case: forward_module == original_module -> no special treatment applied fabric_module = _FabricModule(forward_module=original_module, strategy=Mock(), original_module=original_module) assert fabric_module.training_step == original_module.training_step - assert fabric_module.validation_step == original_module.validation_step + assert fabric_module.marked_method == original_module.marked_method @RunIf(dynamo=True) From 8453e3102862307ed5b8fc271d30c15003708728 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 9 May 2024 12:46:27 +0100 Subject: [PATCH 024/179] Reduce queue fetching (#19856) * update * update --- src/lightning/app/core/app.py | 12 +++++++---- src/lightning/app/core/constants.py | 2 ++ src/lightning/app/runners/multiprocess.py | 21 ++++++++++--------- tests/tests_app/core/test_lightning_app.py | 24 ++++++++++++++++++++++ 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/lightning/app/core/app.py b/src/lightning/app/core/app.py index b9ff54f9a8852..c29a43ba9db0a 
100644 --- a/src/lightning/app/core/app.py +++ b/src/lightning/app/core/app.py @@ -30,6 +30,7 @@ from lightning.app.api.request_types import _APIRequest, _CommandRequest, _DeltaRequest from lightning.app.core.constants import ( BATCH_DELTA_COUNT, + CHECK_ERROR_QUEUE_INTERVAL, DEBUG_ENABLED, FLOW_DURATION_SAMPLES, FLOW_DURATION_THRESHOLD, @@ -165,6 +166,7 @@ def __init__( self._last_run_time: float = 0.0 self._run_times: list = [] + self._last_check_error_queue: float = 0.0 # Path attributes can't get properly attached during the initialization, because the full name # is only available after all Flows and Works have been instantiated. @@ -318,10 +320,12 @@ def batch_get_state_changed_from_queue(q: BaseQueue, timeout: Optional[float] = return [] def check_error_queue(self) -> None: - exception: Exception = self.get_state_changed_from_queue(self.error_queue) # type: ignore[assignment,arg-type] - if isinstance(exception, Exception): - self.exception = exception - self.stage = AppStage.FAILED + if (time() - self._last_check_error_queue) > CHECK_ERROR_QUEUE_INTERVAL: + exception: Exception = self.get_state_changed_from_queue(self.error_queue) # type: ignore[assignment,arg-type] + if isinstance(exception, Exception): + self.exception = exception + self.stage = AppStage.FAILED + self._last_check_error_queue = time() @property def flows(self) -> List[Union[LightningWork, "LightningFlow"]]: diff --git a/src/lightning/app/core/constants.py b/src/lightning/app/core/constants.py index f33278e5bf5ca..64b159e57fea8 100644 --- a/src/lightning/app/core/constants.py +++ b/src/lightning/app/core/constants.py @@ -70,6 +70,7 @@ def get_lightning_cloud_url() -> str: LIGHTNING_COMPONENT_PUBLIC_REGISTRY = "https://lightning.ai/v1/components" LIGHTNING_APPS_PUBLIC_REGISTRY = "https://lightning.ai/v1/apps" LIGHTNING_MODELS_PUBLIC_REGISTRY = "https://lightning.ai/v1/models" +ENABLE_ORCHESTRATOR = bool(int(os.getenv("ENABLE_ORCHESTRATOR", "1"))) LIGHTNING_CLOUDSPACE_HOST = os.getenv("LIGHTNING_CLOUDSPACE_HOST") LIGHTNING_CLOUDSPACE_EXPOSED_PORT_COUNT = int(os.getenv("LIGHTNING_CLOUDSPACE_EXPOSED_PORT_COUNT", "0")) @@ -99,6 +100,7 @@ def get_lightning_cloud_url() -> str: SYS_CUSTOMIZATIONS_SYNC_PATH = ".sys-customizations-sync" BATCH_DELTA_COUNT = int(os.getenv("BATCH_DELTA_COUNT", "128")) +CHECK_ERROR_QUEUE_INTERVAL = float(os.getenv("CHECK_ERROR_QUEUE_INTERVAL", "30")) def enable_multiple_works_in_default_container() -> bool: diff --git a/src/lightning/app/runners/multiprocess.py b/src/lightning/app/runners/multiprocess.py index c3217197a6a33..94d627e95fc7b 100644 --- a/src/lightning/app/runners/multiprocess.py +++ b/src/lightning/app/runners/multiprocess.py @@ -81,16 +81,17 @@ def dispatch(self, *args: Any, open_ui: bool = True, **kwargs: Any): _set_flow_context() - storage_orchestrator = StorageOrchestrator( - self.app, - self.app.request_queues, - self.app.response_queues, - self.app.copy_request_queues, - self.app.copy_response_queues, - ) - self.threads.append(storage_orchestrator) - storage_orchestrator.setDaemon(True) - storage_orchestrator.start() + if constants.ENABLE_ORCHESTRATOR: + storage_orchestrator = StorageOrchestrator( + self.app, + self.app.request_queues, + self.app.response_queues, + self.app.copy_request_queues, + self.app.copy_response_queues, + ) + self.threads.append(storage_orchestrator) + storage_orchestrator.setDaemon(True) + storage_orchestrator.start() if self.start_server: self.app.should_publish_changes_to_api = True diff --git a/tests/tests_app/core/test_lightning_app.py 
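The interval check above trades error-detection latency for fewer queue round trips. The same pattern in isolation, as a standalone sketch with illustrative names rather than the app's actual classes:

```python
import os
import queue
import time
from typing import Optional

# Overridable via the environment, mirroring the new constant above
CHECK_ERROR_QUEUE_INTERVAL = float(os.getenv("CHECK_ERROR_QUEUE_INTERVAL", "30"))


class ThrottledErrorChecker:
    """Fetches from an error queue at most once per interval."""

    def __init__(self, error_queue: "queue.Queue") -> None:
        self.error_queue = error_queue
        self._last_check = 0.0  # epoch seconds of the previous fetch

    def check(self) -> Optional[Exception]:
        now = time.time()
        if (now - self._last_check) <= CHECK_ERROR_QUEUE_INTERVAL:
            return None  # skip the queue fetch entirely
        self._last_check = now
        try:
            item = self.error_queue.get_nowait()
        except queue.Empty:
            return None
        return item if isinstance(item, Exception) else None
```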
b/tests/tests_app/core/test_lightning_app.py index 49d6bdbf44f4e..08a2d7c641b29 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -1188,3 +1188,27 @@ def run(self): def test_lightning_work_stopped(): app = LightningApp(SimpleWork2()) MultiProcessRuntime(app, start_server=False).dispatch() + + +class FailedWork(LightningWork): + def run(self): + raise Exception + + +class CheckErrorQueueLightningApp(LightningApp): + def check_error_queue(self): + super().check_error_queue() + + +def test_error_queue_check(monkeypatch): + import sys + + from lightning.app.core import app as app_module + + sys_mock = mock.MagicMock() + monkeypatch.setattr(app_module, "CHECK_ERROR_QUEUE_INTERVAL", 0) + monkeypatch.setattr(sys, "exit", sys_mock) + app = LightningApp(FailedWork()) + MultiProcessRuntime(app, start_server=False).dispatch() + assert app.stage == AppStage.FAILED + assert app._last_check_error_queue != 0.0 From 90d04b5b86f37994cdceccc6de32f0e93b1cc7f0 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 9 May 2024 16:12:30 +0100 Subject: [PATCH 025/179] Update Lightning Cloud 0.5.69 (#19857) --- requirements/app/app.txt | 2 +- src/lightning/app/runners/cloud.py | 4 ++-- tests/tests_app/runners/test_cloud.py | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements/app/app.txt b/requirements/app/app.txt index 587c0538e1a81..d2bffc5cecfd3 100644 --- a/requirements/app/app.txt +++ b/requirements/app/app.txt @@ -1,4 +1,4 @@ -lightning-cloud == 0.5.68 # Must be pinned to ensure compatibility +lightning-cloud == 0.5.69 # Must be pinned to ensure compatibility packaging typing-extensions >=4.4.0, <4.10.0 deepdiff >=5.7.0, <6.6.0 diff --git a/src/lightning/app/runners/cloud.py b/src/lightning/app/runners/cloud.py index c488014450b9b..80fb03499e678 100644 --- a/src/lightning/app/runners/cloud.py +++ b/src/lightning/app/runners/cloud.py @@ -34,7 +34,7 @@ CloudspaceIdRunsBody, Externalv1LightningappInstance, Gridv1ImageSpec, - IdGetBody1, + IdGetBody, ProjectIdCloudspacesBody, V1BuildSpec, V1CloudSpace, @@ -1027,7 +1027,7 @@ def _api_create_run_instance( project_id=project_id, cloudspace_id=cloudspace_id, id=run_id, - body=IdGetBody1( + body=IdGetBody( cluster_id=cluster_id, name=run_name, desired_state=desired_state, diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index 74b74c99a8049..5f397284ebeaa 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -24,7 +24,7 @@ Externalv1Cluster, Externalv1LightningappInstance, Gridv1ImageSpec, - IdGetBody1, + IdGetBody, ProjectIdProjectclustersbindingsBody, V1BuildSpec, V1CloudSpace, @@ -508,7 +508,7 @@ def test_basic_auth_enabled(self, tmpdir, monkeypatch): project_id="test-project-id", cloudspace_id=mock.ANY, id=mock.ANY, - body=IdGetBody1( + body=IdGetBody( desired_state=mock.ANY, name=mock.ANY, env=mock.ANY, @@ -712,7 +712,7 @@ def test_call_with_queue_server_type_specified(self, tmpdir, lightningapps, monk cloud_runtime.dispatch() # calling with no env variable set - body = IdGetBody1( + body = IdGetBody( desired_state=V1LightningappInstanceState.STOPPED, env=[], name=mock.ANY, @@ -727,7 +727,7 @@ def test_call_with_queue_server_type_specified(self, tmpdir, lightningapps, monk monkeypatch.setitem(os.environ, "LIGHTNING_CLOUD_QUEUE_TYPE", "http") cloud_runtime.backend.client.reset_mock() cloud_runtime.dispatch() - body = IdGetBody1( + body = IdGetBody( 
desired_state=V1LightningappInstanceState.STOPPED, env=mock.ANY, name=mock.ANY, @@ -998,7 +998,7 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin project_id="test-project-id", cloudspace_id=mock.ANY, id=mock.ANY, - body=IdGetBody1( + body=IdGetBody( desired_state=V1LightningappInstanceState.STOPPED, name=mock.ANY, env=[V1EnvVar(name="ENABLE_APP_COMMENT_COMMAND_EXECUTION", value="1")], From 9455871c9356b840ae35421b5e74b689b42f5e1a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 15 May 2024 14:19:08 +0200 Subject: [PATCH 026/179] (2/n) Support 2D Parallelism - Distributed Checkpoints (#19852) * distributed checkpoints * use decorator * refactor if-strict * update example * filter non-persistent buffers (todo, add test) * simplify checkpoint loading for model --- .../fabric/tensor_parallel/parallelism.py | 5 +- examples/fabric/tensor_parallel/train.py | 10 +- src/lightning/fabric/CHANGELOG.md | 3 +- src/lightning/fabric/strategies/fsdp.py | 38 +-- .../fabric/strategies/model_parallel.py | 273 ++++++++++++++++-- src/lightning/fabric/utilities/imports.py | 4 +- src/lightning/pytorch/strategies/fsdp.py | 2 +- tests/tests_fabric/strategies/test_fsdp.py | 7 +- .../strategies/test_fsdp_integration.py | 2 +- .../strategies/test_model_parallel.py | 202 +++++++++++-- .../test_model_parallel_integration.py | 263 ++++++++++++++--- 11 files changed, 682 insertions(+), 127 deletions(-) diff --git a/examples/fabric/tensor_parallel/parallelism.py b/examples/fabric/tensor_parallel/parallelism.py index 38a091d1b8859..088d3f2ef1bd0 100644 --- a/examples/fabric/tensor_parallel/parallelism.py +++ b/examples/fabric/tensor_parallel/parallelism.py @@ -31,11 +31,10 @@ def parallelize(model: Transformer, device_mesh: DeviceMesh) -> Transformer: # 1. Parallelize the first embedding and the last linear proj layer # 2. Parallelize the root norm layer over the sequence dim # 3. Shard the first transformer block's inputs + # Parallelize the first embedding and the last linear out projection plan = { - "tok_embeddings": RowwiseParallel( - input_layouts=Replicate(), - ), + "tok_embeddings": RowwiseParallel(input_layouts=Replicate()), "output": ColwiseParallel(input_layouts=Shard(1), output_layouts=Replicate()), "norm": SequenceParallel(), "layers.0": PrepareModuleInput( diff --git a/examples/fabric/tensor_parallel/train.py b/examples/fabric/tensor_parallel/train.py index cfae4ca90250e..2c3ab3819830c 100644 --- a/examples/fabric/tensor_parallel/train.py +++ b/examples/fabric/tensor_parallel/train.py @@ -30,14 +30,14 @@ def train(): fabric.print(f"Number of model parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.1f} B") - # Define the optimizer - optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, foreach=True) - # Set up model and optimizer - model, optimizer = fabric.setup(model, optimizer) - + model = fabric.setup(model) model.init_weights() + # Define the optimizer + optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, foreach=True) + optimizer = fabric.setup_optimizers(optimizer) + # Define dataset/dataloader dataset = RandomTokenDataset(vocab_size=model_args.vocab_size, seq_length=128) dataloader = DataLoader(dataset, batch_size=8) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 3e31d4c8ed811..c5b883320bf2e 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -15,7 +15,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
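The reordering in `train.py` above is deliberate: with DTensor-based strategies, the optimizer should be created from the already-parallelized parameters, so model setup and optimizer setup are split instead of fused into one `fabric.setup(model, optimizer)` call. A condensed sketch of the required order (the no-op `parallelize` and the `Linear` model are placeholders):

```python
import torch
from lightning.fabric import Fabric
from lightning.fabric.strategies import ModelParallelStrategy


def parallelize(model: torch.nn.Module, device_mesh) -> torch.nn.Module:
    # Placeholder: a real function would apply parallelize_module()/fully_shard()
    # over the tensor-parallel and data-parallel mesh dimensions here.
    return model


# With the defaults, data_parallel_size="auto" maps to the number of nodes and
# tensor_parallel_size="auto" to the GPUs per node (e.g., 2 nodes x 8 GPUs -> a 2x8 mesh).
fabric = Fabric(accelerator="cuda", devices=8, strategy=ModelParallelStrategy(parallelize_fn=parallelize))
fabric.launch()

model = fabric.setup(torch.nn.Linear(32, 32))       # 1) shard first: params become DTensors
optimizer = torch.optim.AdamW(model.parameters())   # 2) optimizer sees the sharded params
optimizer = fabric.setup_optimizers(optimizer)      # 3) only then register it with Fabric
```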
- Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) -- +- Added `ModelParallelStrategy` to support 2D parallelism ([#19846](https://github.com/Lightning-AI/pytorch-lightning/pull/19846), [#19852](https://github.com/Lightning-AI/pytorch-lightning/pull/19852)) + ### Changed diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index 1bd470ff56bcd..bd3548a22be9b 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -393,7 +393,7 @@ def clip_gradients_norm( # the root must be wrapped raise TypeError( "Gradient clipping with FSDP is only possible if the module passed to" - f" `{self.__class__.__name__}.clip_gradients_norm` is wrapped in `FullyShardedDataParallel`." + f" `{type(self).__name__}.clip_gradients_norm` is wrapped in `FullyShardedDataParallel`." f" Got: {module.__class__.__name__}." ) self.precision.unscale_gradients(optimizer) @@ -506,12 +506,7 @@ def load_checkpoint( state: Optional[Union[Module, Optimizer, Dict[str, Union[Module, Optimizer, Any]]]] = None, strict: bool = True, ) -> Dict[str, Any]: - """Load the contents from a checkpoint and restore the state of the given objects. - - The strategy currently only supports saving and loading sharded checkpoints which are stored in form of a - directory of multiple files rather than a single file. - - """ + """Load the contents from a checkpoint and restore the state of the given objects.""" if not state: raise ValueError( f"Got FSDPStrategy.load_checkpoint(..., state={state!r}) but a state with at least " @@ -522,6 +517,8 @@ def load_checkpoint( path = Path(self.broadcast(path)) if isinstance(state, Module): + from lightning.fabric.strategies.model_parallel import _load_raw_module_state_from_path + _load_raw_module_state_from_path(path, module=state, world_size=self.world_size, strict=strict) return {} @@ -592,6 +589,9 @@ def load_checkpoint( if _is_full_checkpoint(path): checkpoint = _lazy_load(path) + + from lightning.fabric.strategies.model_parallel import _load_raw_module_state + _load_raw_module_state(checkpoint.pop(module_key), module=module, world_size=self.world_size, strict=strict) if isinstance(state, Module): @@ -755,7 +755,7 @@ def no_backward_sync(self, module: Module, enabled: bool) -> ContextManager: # the root must be wrapped raise TypeError( "Blocking backward sync is only possible if the module passed to" - f" `{self.__class__.__name__}.no_backward_sync` is wrapped in `FullyShardedDataParallel`." + f" `{type(self).__name__}.no_backward_sync` is wrapped in `FullyShardedDataParallel`." f" Got: {module.__class__.__name__}." ) return module.no_sync() @@ -848,28 +848,6 @@ def _has_fsdp_modules(module: object) -> TypeGuard[Module]: return isinstance(module, Module) and any(isinstance(m, FullyShardedDataParallel) for m in module.modules()) -def _load_raw_module_state_from_path(path: Path, module: Module, world_size: int, strict: bool = True) -> None: - """Loads the state dict from a file path into the FSDP module.""" - if not _is_full_checkpoint(path): - raise ValueError( - "Failed to load checkpoint directly into the model. 
The given path must be a single file containing the" - f" full state dict: {path}" - ) - # Use `lazy_load` instead of `torch.load` here to avoid storing a copy of the full checkpoint per rank - _load_raw_module_state(state_dict=_lazy_load(path), module=module, world_size=world_size, strict=strict) - - -def _load_raw_module_state(state_dict: Dict[str, Any], module: Module, world_size: int, strict: bool = True) -> None: - """Loads the state dict into the module by gathering all weights first and then and writing back to each shard.""" - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - - if not isinstance(module, FSDP): - module.load_state_dict(state_dict, strict=strict) - else: - with _get_full_state_dict_context(module, world_size=world_size, rank0_only=False): - module.load_state_dict(state_dict, strict=strict) - - def _move_torchmetrics_to_device(module: torch.nn.Module, device: torch.device) -> None: # FSDP doesn't move modules without parameters (e.g. Metrics) to the device # https://github.com/pytorch/pytorch/issues/113113 diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 5679fa7b5ac06..9cd721f930d1b 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import shutil from contextlib import ExitStack from datetime import timedelta from pathlib import Path @@ -21,17 +22,25 @@ from torch import Tensor from torch.nn import Module from torch.optim import Optimizer -from typing_extensions import override +from typing_extensions import TypeGuard, override from lightning.fabric.plugins import CheckpointIO from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout from lightning.fabric.strategies.fsdp import ( _distributed_checkpoint_load, _distributed_checkpoint_save, + _get_full_state_dict_context, + _is_full_checkpoint, + _is_sharded_checkpoint, ) from lightning.fabric.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from lightning.fabric.strategies.parallel import ParallelStrategy -from lightning.fabric.strategies.strategy import TBroadcast, _BackwardSyncControl +from lightning.fabric.strategies.strategy import ( + TBroadcast, + _apply_filter, + _BackwardSyncControl, + _validate_keys_for_strict_loading, +) from lightning.fabric.utilities.distributed import ( ReduceOp, _distributed_is_initialized, @@ -40,11 +49,12 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3, _TORCH_GREATER_EQUAL_2_4 from lightning.fabric.utilities.init import _materialize_distributed_module +from lightning.fabric.utilities.load import _METADATA_FILENAME, _lazy_load, _move_state_into from lightning.fabric.utilities.rank_zero import rank_zero_only from lightning.fabric.utilities.seed import reset_seed -from lightning.fabric.utilities.types import _PATH +from lightning.fabric.utilities.types import _PATH, _Stateful if TYPE_CHECKING: from torch.distributed.device_mesh import DeviceMesh @@ -68,6 +78,9 @@ class ModelParallelStrategy(ParallelStrategy): sets this size to the number of nodes in the cluster. 
tensor_parallel_size: The number of devices within a tensor-parallel group. Defaults to ``"auto"``, which sets this size to the number of GPUs in a single node. + save_distributed_checkpoint: If ``True``, each rank saves its shard of weights and optimizer states to a file. + The checkpoint is a folder with as many files as the world size. + If ``False``, the full weights and optimizer states get assembled on rank 0 and saved to a single file. """ @@ -76,16 +89,18 @@ def __init__( parallelize_fn: Callable[[TModel, "DeviceMesh"], TModel], data_parallel_size: Union[Literal["auto"], int] = "auto", tensor_parallel_size: Union[Literal["auto"], int] = "auto", + save_distributed_checkpoint: bool = True, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, ) -> None: super().__init__() if not _TORCH_GREATER_EQUAL_2_3: - raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.3 or higher.") + raise ImportError(f"{type(self).__name__} requires PyTorch 2.3 or higher.") self._parallelize_fn = parallelize_fn self._data_parallel_size = data_parallel_size self._tensor_parallel_size = tensor_parallel_size self._num_nodes = 1 + self._save_distributed_checkpoint = save_distributed_checkpoint self._process_group_backend: Optional[str] = process_group_backend self._timeout: Optional[timedelta] = timeout self._backward_sync_control = _ParallelBackwardSyncControl() @@ -216,18 +231,35 @@ def save_checkpoint( storage_options: Optional[Any] = None, filter: Optional[Dict[str, Callable[[str, Any], bool]]] = None, ) -> None: - """Save model, optimizer, and other state to a checkpoint on disk.""" + """Save model, optimizer, and other state to a checkpoint on disk. + + If distributed checkpointing is enabled (default), the checkpoint gets saved as a directory containing one file + per process, with model- and optimizer shards stored per file. Additionally, it creates a metadata file + `meta.pt` with the rest of the user's state (only saved from rank 0). + If distributed checkpointing is disabled (``save_distributed_checkpoint=False``), the checkpoint will be + written to a single file containing the weights, optimizer state and other metadata. + + """ if storage_options is not None: raise TypeError( - f"`{self.__class__.__name__}.save_checkpoint(..., storage_options=...)` is not supported because" - f" `{self.__class__.__name__}` does not use the `CheckpointIO`." + f"`{type(self).__name__}.save_checkpoint(..., storage_options=...)` is not supported because" + f" `{type(self).__name__}` does not use the `CheckpointIO`." + ) + if filter is not None and self._save_distributed_checkpoint: + # https://github.com/pytorch/pytorch/issues/105379 + raise NotImplementedError( + f"{type(self).__name__} doesn't support loading distributed filtered checkpoints," + " so saving them is disabled." 
) - if filter is not None: - raise NotImplementedError(f"{self.__class__.__name__} does not yet support the `filter` argument.") - # broadcast the path from rank 0 to ensure all the states are saved in a common path path = Path(self.broadcast(path)) - _distributed_checkpoint_save(state, path) + _save_checkpoint( + path=path, + state=state, + full_state_dict=(not self._save_distributed_checkpoint), + rank=self.global_rank, + filter=filter, + ) @override def load_checkpoint( @@ -236,17 +268,32 @@ def load_checkpoint( state: Optional[Union[Module, Optimizer, Dict[str, Union[Module, Optimizer, Any]]]] = None, strict: bool = True, ) -> Dict[str, Any]: - if isinstance(state, (Module, Optimizer)): - raise NotImplementedError( - "Loading a module or optimizer object from a checkpoint directly is not yet supported." - ) - if strict is False: - raise NotImplementedError(f"Non-strict loading is not yet supported in {self.__class__.__name__}.") + """Load the contents from a checkpoint and restore the state of the given objects. + Currently does not support loading the optimizer state if the model is distributed but the checkpoint is a full, + non-distributed checkpoint. + + """ + if not state: + raise ValueError( + f"Got {type(self).__name__}.load_checkpoint(..., state={state!r}) but a state with at least " + " a model instance to reload is required. Pass it in like so:" + f" {type(self).__name__}.load_checkpoint(..., state={{'model': model, ...}})" + ) # broadcast the path from rank 0 to ensure all the states are loaded from a common path path = Path(self.broadcast(path)) - _distributed_checkpoint_load(state, path) # type: ignore[arg-type] - return {} + + if isinstance(state, Module): + _load_raw_module_state_from_path(path, module=state, world_size=self.world_size, strict=strict) + return {} + + if isinstance(state, Optimizer): + raise NotImplementedError( + f"Loading a single optimizer object from a checkpoint is not supported yet with" + f" {type(self).__name__}." + ) + + return _load_checkpoint(path=path, state=state, strict=strict) def _setup_distributed(self) -> None: reset_seed() @@ -310,3 +357,189 @@ def __enter__(self) -> None: def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: self._set_requires_grad_sync(self._enabled) + + +def _save_checkpoint( + path: Path, + state: Dict[str, Union[Module, Optimizer, Any]], + full_state_dict: bool, + rank: int, + filter: Optional[Dict[str, Callable[[str, Any], bool]]] = None, +) -> None: + if path.is_dir() and full_state_dict and not _is_sharded_checkpoint(path): + raise IsADirectoryError(f"The checkpoint path exists and is a directory: {path}") + + modules = [module for module in state.values() if _has_dtensor_modules(module)] + if len(modules) == 0: + raise ValueError( + "Could not find a distributed model in the provided checkpoint state. Please provide the model as" + " part of the state like so: `save_checkpoint(..., state={'model': model, ...})`. Make sure" + " you set up the model (and optimizers if any) through the strategy before saving the checkpoint." + ) + if len(modules) > 1: + raise ValueError( + "Found multiple distributed models in the given state. Saving distributed checkpoints is" + " currently limited to a single model per checkpoint. To save multiple models, call the" + " save method for each model separately with a different path." 
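To make the two checkpoint formats described above concrete, the user-facing flow looks roughly like this (a sketch; the trivial `parallelize` stand-in, the paths, and the model are illustrative):

```python
import torch
from lightning.fabric import Fabric
from lightning.fabric.strategies import ModelParallelStrategy


def parallelize(model, device_mesh):
    return model  # placeholder parallelization, as in the earlier sketch


# Default: distributed checkpoint, i.e. a folder of per-rank .distcp shards plus meta.pt
fabric = Fabric(accelerator="cuda", devices=4, strategy=ModelParallelStrategy(parallelize_fn=parallelize))
fabric.launch()
model = fabric.setup(torch.nn.Linear(32, 32))
optimizer = fabric.setup_optimizers(torch.optim.AdamW(model.parameters()))

state = {"model": model, "optimizer": optimizer, "steps": 0}
fabric.save("checkpoints/step-100", state)               # rank 0 additionally writes meta.pt
remainder = fabric.load("checkpoints/step-100", state)   # restores in place, returns extras

# Alternative: assemble the full state on rank 0 and write a single file instead
single_file = ModelParallelStrategy(parallelize_fn=parallelize, save_distributed_checkpoint=False)
```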
+ ) + module = modules[0] + + from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict, get_optimizer_state_dict + + state_dict_options = StateDictOptions(full_state_dict=full_state_dict, cpu_offload=True) + + # replace the modules and optimizer objects in the state with their local state dict + # and separate the user's metadata + converted_state: Dict[str, Any] = {} + metadata: Dict[str, Any] = {} + for key, obj in state.items(): + converted: Any + if isinstance(obj, Module): + converted = get_model_state_dict(obj, options=state_dict_options) + target_dict = converted_state + elif isinstance(obj, Optimizer): + converted = get_optimizer_state_dict(module, obj, options=state_dict_options) + target_dict = converted_state + else: # everything not a module or optimizer is considered metadata + converted = obj.state_dict() if isinstance(obj, _Stateful) else obj + target_dict = metadata + _apply_filter(key, filter or {}, converted, target_dict) + + if full_state_dict: + if _is_sharded_checkpoint(path): + shutil.rmtree(path) + converted_state.update(metadata) + if rank == 0: + torch.save(converted_state, path) + else: + if path.is_file(): + path.unlink() + path.mkdir(parents=True, exist_ok=True) + _distributed_checkpoint_save(converted_state, path) + if rank == 0: + torch.save(metadata, path / _METADATA_FILENAME) + + +def _load_checkpoint( + path: Path, + state: Dict[str, Union[Module, Optimizer, Any]], + strict: bool = True, +) -> Dict[str, Any]: + from torch.distributed.checkpoint.state_dict import ( + StateDictOptions, + get_model_state_dict, + get_optimizer_state_dict, + set_model_state_dict, + set_optimizer_state_dict, + ) + + modules = {key: module for key, module in state.items() if _has_dtensor_modules(module)} + if len(modules) == 0: + raise ValueError( + "Could not find a distributed model in the provided checkpoint state. Please provide the model as" + " part of the state like so: `load_checkpoint(..., state={'model': model, ...})`. Make sure" + " you set up the model (and optimizers if any) through the strategy before loading the checkpoint." + ) + optimizers = {key: optim for key, optim in state.items() if isinstance(optim, Optimizer)} + if len(modules) > 1: + raise ValueError( + "Found multiple distributed models in the given state. Loading distributed checkpoints is" + " currently limited to a single model per checkpoint. To load multiple models, call the" + " load method for each model separately with a different path." 
+ ) + module_key, module = list(modules.items())[0] + + if _is_sharded_checkpoint(path): + state_dict_options = StateDictOptions(cpu_offload=True) + + module_state = {module_key: get_model_state_dict(module)} + _distributed_checkpoint_load(module_state, path) + module.load_state_dict(module_state[module_key], strict=strict) + + # the optimizer states must be loaded separately + for optim_key, optim in optimizers.items(): + optim_state = {optim_key: get_optimizer_state_dict(module, optim)} + _distributed_checkpoint_load(optim_state, path) + set_optimizer_state_dict(module, optim, optim_state_dict=optim_state[optim_key], options=state_dict_options) + + # Load metadata (anything not a module or optimizer) + metadata = torch.load(path / _METADATA_FILENAME) + requested_metadata_keys = state.keys() - modules.keys() - optimizers.keys() + _validate_keys_for_strict_loading(requested_metadata_keys, metadata.keys(), strict=strict) + for key in requested_metadata_keys: + if key not in metadata: + continue + state[key] = metadata.pop(key) + + # return the remaining metadata that wasn't requested as part of `state` + return metadata + + if _is_full_checkpoint(path): + # TODO: Support loading optimizer states + if any(isinstance(obj, Optimizer) for obj in state.values()): + raise NotImplementedError( + "Loading the optimizer states from a non-distributed checkpoint into a distributed model" + " is currently not supported." + ) + if not _TORCH_GREATER_EQUAL_2_4: + raise ImportError("Loading a non-distributed checkpoint into a distributed model requires PyTorch >= 2.4.") + + state_dict_options = StateDictOptions( + broadcast_from_rank0=True, # type: ignore[call-arg] + full_state_dict=True, + strict=strict, + ) + checkpoint = torch.load(path, mmap=True, map_location="cpu") + set_model_state_dict(module, checkpoint.pop(module_key), options=state_dict_options) + + requested_metadata_keys = state.keys() - modules.keys() - optimizers.keys() + _validate_keys_for_strict_loading(requested_metadata_keys, checkpoint.keys(), strict=strict) + + # Load metadata (anything not a module or optimizer) + _move_state_into(source=checkpoint, destination=state, keys=requested_metadata_keys) + + # return the remaining metadata that wasn't requested as part of `state` + return checkpoint + + raise ValueError( + f"The path {str(path)!r} does not point to a valid checkpoint. Make sure the path points to either a" + " directory with distributed checkpoint shards, or a single file with a full checkpoint." + ) + + +def _has_dtensor_modules(module: object) -> TypeGuard[Module]: + from torch.distributed._tensor import DTensor + + return isinstance(module, Module) and any(isinstance(t, DTensor) for t in module.parameters()) + + +def _load_raw_module_state_from_path(path: Path, module: Module, world_size: int, strict: bool = True) -> None: + """Loads the state dict from a file path into the FSDP module.""" + if not _is_full_checkpoint(path): + raise ValueError( + "Failed to load checkpoint directly into the model. 
The given path must be a single file containing the" + f" full state dict: {path}" + ) + # Use `lazy_load`/`mmap` instead to avoid storing a copy of the full checkpoint per rank + state_dict = torch.load(path, mmap=True, map_location="cpu") if _TORCH_GREATER_EQUAL_2_3 else _lazy_load(path) + _load_raw_module_state(state_dict=state_dict, module=module, world_size=world_size, strict=strict) + + +def _load_raw_module_state(state_dict: Dict[str, Any], module: Module, world_size: int, strict: bool = True) -> None: + """Loads the state dict into the module by gathering all weights first and then writing back to each shard.""" + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + + if _has_dtensor_modules(module): + if not _TORCH_GREATER_EQUAL_2_4: + raise ImportError("Loading a non-distributed checkpoint into a distributed model requires PyTorch >= 2.4.") + + from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict + + state_dict_options = StateDictOptions(broadcast_from_rank0=True, full_state_dict=True) # type: ignore[call-arg] + set_model_state_dict(module, state_dict, options=state_dict_options) + + elif isinstance(module, FSDP): + with _get_full_state_dict_context(module, world_size=world_size, rank0_only=False): + module.load_state_dict(state_dict, strict=strict) + else: + module.load_state_dict(state_dict, strict=strict) diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py index bcfeadf3298ca..46374e23ad2b5 100644 --- a/src/lightning/fabric/utilities/imports.py +++ b/src/lightning/fabric/utilities/imports.py @@ -28,7 +28,9 @@ _TORCH_GREATER_EQUAL_2_1 = compare_version("torch", operator.ge, "2.1.0") _TORCH_GREATER_EQUAL_2_2 = compare_version("torch", operator.ge, "2.2.0") -_TORCH_GREATER_EQUAL_2_3 = compare_version("torch", operator.ge, "2.3.0", use_base_version=True) +_TORCH_GREATER_EQUAL_2_3 = compare_version("torch", operator.ge, "2.3.0") +_TORCH_GREATER_EQUAL_2_4 = compare_version("torch", operator.ge, "2.4.0", use_base_version=True) + _TORCH_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0") and not _TORCH_GREATER_EQUAL_2_1 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 1e7341c907bf9..1aae8b678b674 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -41,11 +41,11 @@ _init_sharding_strategy, _is_full_checkpoint, _is_sharded_checkpoint, - _load_raw_module_state, _move_torchmetrics_to_device, _optimizer_has_flat_params, _setup_activation_checkpointing, ) +from lightning.fabric.strategies.model_parallel import _load_raw_module_state from lightning.fabric.utilities.distributed import ( _distributed_is_initialized, _get_default_process_group_backend_for_device, diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index 01e70a27a7cc9..ed0dda85ffaef 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -283,7 +283,6 @@ def test_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, tmp_ assert path.is_dir() -@mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) def test_save_checkpoint_one_fsdp_module_required(tmp_path): """Test that the FSDP strategy can only save one FSDP model per checkpoint.""" strategy = FSDPStrategy() @@ -292,7 +291,7 @@ def
test_save_checkpoint_one_fsdp_module_required(tmp_path): with pytest.raises(ValueError, match="Could not find a FSDP model in the provided checkpoint state."): strategy.save_checkpoint(path=tmp_path, state={}) with pytest.raises(ValueError, match="Could not find a FSDP model in the provided checkpoint state."): - strategy.load_checkpoint(path=tmp_path, state={"model": torch.nn.Linear(3, 3)}) + strategy.save_checkpoint(path=tmp_path, state={"model": torch.nn.Linear(3, 3)}) # multiple FSDP models model1 = Mock(spec=FullyShardedDataParallel) @@ -313,7 +312,8 @@ def test_load_checkpoint_no_state(tmp_path): @mock.patch("lightning.fabric.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) -@mock.patch("lightning.fabric.strategies.fsdp._lazy_load", Mock()) +@mock.patch("lightning.fabric.strategies.model_parallel._lazy_load", Mock()) +@mock.patch("lightning.fabric.strategies.model_parallel.torch.load", Mock()) def test_load_checkpoint_one_fsdp_module_required(tmp_path): """Test that the FSDP strategy can only load one FSDP model per checkpoint.""" strategy = FSDPStrategy() @@ -334,6 +334,7 @@ def test_load_checkpoint_one_fsdp_module_required(tmp_path): # A raw nn.Module instead of a dictionary is ok model = Mock(spec=nn.Module) + model.parameters.return_value = [torch.zeros(2, 1)] path = tmp_path / "full.ckpt" path.touch() strategy.load_checkpoint(path=path, state=model) diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index b2246892156f1..16e4910c7ec33 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -489,7 +489,7 @@ def test_save_filter(tmp_path): fabric.save(checkpoint_path, state, filter=filter) checkpoint = torch.load(checkpoint_path)["model"] assert set(checkpoint) == {"bias"} - assert isinstance(checkpoint["bias"], torch.Tensor) + assert type(checkpoint["bias"]) is torch.Tensor fabric.strategy._state_dict_type = "sharded" checkpoint_path = tmp_path / "sharded" diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py index c2a31e2489d7c..54efb999a8cf1 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -21,6 +21,7 @@ import torch.nn as nn from lightning.fabric.plugins.environments import LightningEnvironment from lightning.fabric.strategies import ModelParallelStrategy +from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint from lightning.fabric.strategies.model_parallel import _ParallelBackwardSyncControl from torch.optim import Adam @@ -80,31 +81,6 @@ def test_checkpoint_io_unsupported(): strategy.checkpoint_io = Mock() -@RunIf(min_torch="2.3") -def test_save_filter_unsupported(tmp_path): - strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) - with pytest.raises(NotImplementedError, match="does not yet support the `filter` argument"): - strategy.save_checkpoint(tmp_path / "checkpoint.pth", state={}, filter=Mock()) - - -@RunIf(min_torch="2.3") -def test_load_raw_unsupported(tmp_path): - strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) - model = nn.Linear(2, 2) - optimizer = Adam(model.parameters()) - with pytest.raises(NotImplementedError, match="object from a checkpoint directly is not yet supported"): - strategy.load_checkpoint(tmp_path / "checkpoint.pth", state=model) - with pytest.raises(NotImplementedError, match="object from a checkpoint 
directly is not yet supported"): - strategy.load_checkpoint(tmp_path / "checkpoint.pth", state=optimizer) - - -@RunIf(min_torch="2.3") -def test_load_non_strict_unsupported(tmp_path): - strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) - with pytest.raises(NotImplementedError, match="Non-strict loading is not yet supported"): - strategy.load_checkpoint(tmp_path / "checkpoint.pth", state={}, strict=False) - - @RunIf(min_torch="2.3") def test_fsdp_v1_modules_unsupported(): """Test that the strategy won't allow setting up a module wrapped with the legacy FSDP API.""" @@ -173,6 +149,182 @@ def test_save_checkpoint_storage_options(tmp_path): strategy.save_checkpoint(path=tmp_path, state=Mock(), storage_options=Mock()) +@RunIf(min_torch="2.3") +@mock.patch("lightning.fabric.strategies.model_parallel.ModelParallelStrategy.broadcast", lambda _, x: x) +@mock.patch("lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True) +@mock.patch("torch.distributed.checkpoint.state_dict.get_model_state_dict", return_value={}) +@mock.patch("torch.distributed.checkpoint.state_dict.get_optimizer_state_dict", return_value={}) +@mock.patch("lightning.fabric.strategies.model_parallel.torch.save") +@mock.patch("lightning.fabric.strategies.model_parallel.shutil") +def test_save_checkpoint_path_exists(shutil_mock, torch_save_mock, _, __, ___, tmp_path): + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m), save_distributed_checkpoint=False) + + # save_distributed_checkpoint=False, path exists, path is not a sharded checkpoint: error + path = tmp_path / "not-empty" + path.mkdir() + (path / "file").touch() + assert not _is_sharded_checkpoint(path) + with pytest.raises(IsADirectoryError, match="exists and is a directory"): + strategy.save_checkpoint(path=path, state=Mock()) + + # save_distributed_checkpoint=False, path exists, path is a sharded checkpoint: no error (overwrite) + path = tmp_path / "sharded-checkpoint" + path.mkdir() + (path / "meta.pt").touch() + assert _is_sharded_checkpoint(path) + model = Mock() + model.modules.return_value = [model] + strategy.save_checkpoint(path=path, state={"model": model}) + shutil_mock.rmtree.assert_called_once_with(path) + + # save_distributed_checkpoint=False, path exists, path is a file: no error (overwrite) + path = tmp_path / "file.pt" + path.touch() + model = Mock(spec=nn.Module) + torch_save_mock.reset_mock() + strategy.save_checkpoint(path=path, state={"model": model}) + torch_save_mock.assert_called_once() + + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m), save_distributed_checkpoint=True) + save_mock = mock.patch("torch.distributed.checkpoint.save") + + # save_distributed_checkpoint=True, path exists, path is a folder: no error (overwrite) + path = tmp_path / "not-empty-2" + path.mkdir() + (path / "file").touch() + model = Mock(spec=nn.Module) + with save_mock: + strategy.save_checkpoint(path=path, state={"model": model}) + assert (path / "file").exists() + + # save_distributed_checkpoint=True, path exists, path is a file: no error (overwrite) + path = tmp_path / "file-2.pt" + path.touch() + model = Mock(spec=nn.Module) + with save_mock: + strategy.save_checkpoint(path=path, state={"model": model}) + assert path.is_dir() + + +@RunIf(min_torch="2.3") +def test_save_checkpoint_one_dist_module_required(tmp_path): + """Test that the ModelParallelStrategy strategy can only save one distributed model per checkpoint.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + + # missing 
DTensor model + with pytest.raises(ValueError, match="Could not find a distributed model in the provided checkpoint state."): + strategy.save_checkpoint(path=tmp_path, state={}) + with pytest.raises(ValueError, match="Could not find a distributed model in the provided checkpoint state."): + strategy.save_checkpoint(path=tmp_path, state={"model": torch.nn.Linear(3, 3)}) + + # multiple DTensor models + with mock.patch("lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True): + model1 = Mock(spec=nn.Module) + model1.modules.return_value = [model1] + model2 = Mock(spec=nn.Module) + model2.modules.return_value = [model2] + with pytest.raises(ValueError, match="Found multiple distributed models in the given state."): + strategy.save_checkpoint(path=tmp_path, state={"model1": model1, "model2": model2}) + + +@RunIf(min_torch="2.3") +@mock.patch("lightning.fabric.strategies.model_parallel.torch.load", Mock()) +@mock.patch("lightning.fabric.strategies.model_parallel._TORCH_GREATER_EQUAL_2_4", False) +def test_load_full_checkpoint_support(tmp_path): + """Test that loading non-distributed checkpoints into distributed models requires PyTorch >= 2.4.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + model = Mock(spec=nn.Module) + model.parameters.return_value = [torch.zeros(2, 1)] + path = tmp_path / "full.ckpt" + path.touch() + + with pytest.raises(ImportError, match="Loading .* into a distributed model requires PyTorch >= 2.4"), mock.patch( + "lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True + ): + strategy.load_checkpoint(path=path, state={"model": model}) + + with pytest.raises(ImportError, match="Loading .* into a distributed model requires PyTorch >= 2.4"), mock.patch( + "lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True + ): + strategy.load_checkpoint(path=path, state=model) + + +@RunIf(min_torch="2.3") +def test_load_checkpoint_no_state(tmp_path): + """Test that the ModelParallelStrategy strategy can't load the full state without access to a model instance from + the user.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + with pytest.raises(ValueError, match=escape("Got ModelParallelStrategy.load_checkpoint(..., state=None")): + strategy.load_checkpoint(path=tmp_path, state=None) + with pytest.raises(ValueError, match=escape("Got ModelParallelStrategy.load_checkpoint(..., state={})")): + strategy.load_checkpoint(path=tmp_path, state={}) + + +@RunIf(min_torch="2.3") +@mock.patch("lightning.fabric.strategies.model_parallel.ModelParallelStrategy.broadcast", lambda _, x: x) +@mock.patch("lightning.fabric.strategies.model_parallel.torch.load", Mock()) +def test_load_checkpoint_one_dist_module_required(tmp_path): + """Test that the ModelParallelStrategy strategy can only load one distributed model per checkpoint.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + + # missing DTensor model + with pytest.raises(ValueError, match="Could not find a distributed model in the provided checkpoint state."): + strategy.load_checkpoint(path=tmp_path, state={"other": "data"}) + with pytest.raises(ValueError, match="Could not find a distributed model in the provided checkpoint state."): + strategy.load_checkpoint(path=tmp_path, state={"model": torch.nn.Linear(3, 3)}) + + # multiple DTensor models + with mock.patch("lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True): + model1 = Mock(spec=nn.Module) + 
model1.modules.return_value = [model1] + model2 = Mock(spec=nn.Module) + model2.modules.return_value = [model2] + with pytest.raises(ValueError, match="Found multiple distributed models in the given state."): + strategy.load_checkpoint(path=tmp_path, state={"model1": model1, "model2": model2}) + + # A raw nn.Module instead of a dictionary is ok + model = Mock(spec=nn.Module) + model.parameters.return_value = [torch.zeros(2, 1)] + path = tmp_path / "full.ckpt" + path.touch() + strategy.load_checkpoint(path=path, state=model) + + +@RunIf(min_torch="2.3") +@mock.patch("lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True) +def test_load_unknown_checkpoint_type(_, tmp_path): + """Test that the strategy validates the contents at the checkpoint path.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + model = Mock() + path = tmp_path / "empty_dir" # neither a single file nor a directory with meta file + path.mkdir() + with pytest.raises(ValueError, match="does not point to a valid checkpoint"): + strategy.load_checkpoint(path=path, state={"model": model}) + + +@RunIf(min_torch="2.3") +def test_load_raw_checkpoint_validate_single_file(tmp_path): + """Test that we validate the given checkpoint is a single file when loading a raw PyTorch state-dict checkpoint.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + model = Mock(spec=nn.Module) + path = tmp_path / "folder" + path.mkdir() + with pytest.raises(ValueError, match="The given path must be a single file containing the full state dict"): + strategy.load_checkpoint(path=path, state=model) + + +@RunIf(min_torch="2.3") +def test_load_raw_checkpoint_optimizer_unsupported(tmp_path): + """Validate that the ModelParallelStrategy strategy does not yet support loading the raw PyTorch state-dict for an + optimizer.""" + strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) + optimizer = Mock(spec=torch.optim.Optimizer) + with pytest.raises( + NotImplementedError, match="Loading a single optimizer object from a checkpoint is not supported" + ): + strategy.load_checkpoint(path=tmp_path, state=optimizer) + + @RunIf(min_torch="2.3") @mock.patch("lightning.fabric.strategies.ModelParallelStrategy._setup_device_mesh") @mock.patch("torch.distributed.init_process_group") diff --git a/tests/tests_fabric/strategies/test_model_parallel_integration.py b/tests/tests_fabric/strategies/test_model_parallel_integration.py index 589560e7fea4f..d864a9687ebb5 100644 --- a/tests/tests_fabric/strategies/test_model_parallel_integration.py +++ b/tests/tests_fabric/strategies/test_model_parallel_integration.py @@ -215,6 +215,24 @@ def test_fsdp2_tensor_parallel(): optimizer.zero_grad() +def _train(fabric, model=None, optimizer=None): + fabric.seed_everything(0) + + if model is None: + with fabric.init_module(empty_init=True): + model = FeedForward() + model = fabric.setup(model) + if optimizer is None: + optimizer = torch.optim.AdamW(model.parameters()) + optimizer = fabric.setup_optimizers(optimizer) + + output = model(torch.rand(2, 32, device=fabric.device)) + fabric.backward(output.sum()) + optimizer.step() + optimizer.zero_grad() + return model, optimizer + + @RunIf(min_torch="2.3", min_cuda_gpus=4, standalone=True) @pytest.mark.parametrize( "precision", @@ -234,17 +252,7 @@ def test_train_save_load(precision, tmp_path): ) fabric = Fabric(accelerator="cuda", devices=4, strategy=strategy, precision=precision) fabric.launch() - - fabric.seed_everything(0) - with 
fabric.init_module(empty_init=True): - model = FeedForward() - model = fabric.setup(model) - optimizer = torch.optim.AdamW(model.parameters()) - optimizer = fabric.setup_optimizers(optimizer) - output = model(torch.rand(2, 32, device=fabric.device)) - fabric.backward(output.sum()) - optimizer.step() - optimizer.zero_grad() + model, optimizer = _train(fabric) checkpoint_path = fabric.broadcast(str(tmp_path / "dist-checkpoint")) @@ -257,6 +265,7 @@ def test_train_save_load(precision, tmp_path): "__1_0.distcp", "__2_0.distcp", "__3_0.distcp", + "meta.pt", } # re-init all objects and resume @@ -267,17 +276,7 @@ def test_train_save_load(precision, tmp_path): ) fabric = Fabric(accelerator="cuda", devices=4, strategy=strategy, precision=precision) fabric.launch() - - fabric.seed_everything(0) - with fabric.init_module(empty_init=True): - model = FeedForward() - model = fabric.setup(model) - optimizer = torch.optim.AdamW(model.parameters()) - optimizer = fabric.setup_optimizers(optimizer) - output = model(torch.rand(2, 32, device=fabric.device)) - fabric.backward(output.sum()) - optimizer.step() - optimizer.zero_grad() + model, optimizer = _train(fabric) # check correctness with loaded state state = {"model": model, "optimizer": optimizer, "steps": 0} @@ -286,20 +285,172 @@ def test_train_save_load(precision, tmp_path): torch.testing.assert_close(p0, p1, atol=0, rtol=0, equal_nan=True) # check user data in state reloaded - # TODO: This should be 1, torch.distributed.checkpoint only supports tensor data - assert state["steps"] == 0 + assert state["steps"] == 1 assert not metadata - # TODO: Test strict and non-strict loading here once supported # attempt to load a key not in the metadata checkpoint - # state = {"model": model, "coconut": 11} - # with pytest.raises(KeyError, match="The requested state contains a key 'coconut' that does not exist"): - # fabric.load(checkpoint_path, state) + state = {"model": model, "coconut": 11} + with pytest.raises(KeyError, match="The requested state contains a key 'coconut' that does not exist"): + fabric.load(checkpoint_path, state) + + # `strict=False` ignores the missing key + state = {"model": model, "coconut": 11} + fabric.load(checkpoint_path, state, strict=False) + assert state["coconut"] == 11 - # # `strict=False` ignores the missing key - # state = {"model": trainer.model, "coconut": 11} - # fabric.load(checkpoint_path, state, strict=False) - # assert state["coconut"] == 11 + +@RunIf(min_torch="2.4", min_cuda_gpus=2, standalone=True) +def test_save_full_state_dict(tmp_path): + """Test that ModelParallelStrategy saves the full state into a single file with + `save_distributed_checkpoint=False`.""" + from torch.distributed.checkpoint.state_dict import get_optimizer_state_dict + + strategy = ModelParallelStrategy( + _parallelize_feed_forward_fsdp2, + data_parallel_size=2, + tensor_parallel_size=1, + save_distributed_checkpoint=False, + ) + fabric = Fabric(accelerator="cuda", strategy=strategy, devices=2) + fabric.launch() + model, optimizer = _train(fabric) + + checkpoint_path = Path(fabric.broadcast(str(tmp_path / "fsdp-checkpoint.pt"))) + state = {"model": model, "optimizer": optimizer, "steps": 1} + fabric.save(checkpoint_path, state) + + checkpoint = torch.load(checkpoint_path) + assert checkpoint["steps"] == 1 + loaded_state_dict = checkpoint["model"] + + # assert the correct state model was saved + state_dict = model.state_dict() + assert set(loaded_state_dict.keys()) == set(state_dict.keys()) + for param_name in state_dict: + assert 
torch.equal(loaded_state_dict[param_name], state_dict[param_name].full_tensor().cpu()) + params_before = [p.full_tensor().cpu() for p in model.parameters()] + + # assert the correct optimizer state was saved + optimizer_state_before = get_optimizer_state_dict(model, optimizer) + assert set(checkpoint["optimizer"].keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} + + # 1. verify the FSDP state can be loaded back into a FSDP model/strategy directly + strategy = ModelParallelStrategy(_parallelize_feed_forward_fsdp2, data_parallel_size=2, tensor_parallel_size=1) + fabric = Fabric(accelerator="cuda", strategy=strategy, devices=2) + fabric.launch() + model, optimizer = _train(fabric) + + # TODO: Support loading optimizer states from full checkpoint + with pytest.raises(NotImplementedError, match="Loading the optimizer states .* not supported"): + fabric.load(checkpoint_path, {"model": model, "optimizer": optimizer}) + + metadata = fabric.load(checkpoint_path, {"model": model}) + assert metadata == {"steps": 1, "optimizer": mock.ANY} + + params_after = [p.full_tensor() for p in model.parameters()] + assert all(torch.equal(p0.cpu(), p1.cpu()) for p0, p1 in zip(params_before, params_after)) + + # TODO: assert the correct optimizer state was loaded + # optimizer_state_after = get_optimizer_state_dict(model, optimizer) + # assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} + # torch.testing.assert_close(optimizer_state_after["state"], optimizer_state_before["state"], atol=0, rtol=0) + # assert optimizer_state_after["param_groups"] == optimizer_state_before["param_groups"] + + # run a step to verify the optimizer state is correct + _train(fabric, model, optimizer) + + # 2. verify the FSDP state can be loaded back into a single-device model/strategy + fabric = Fabric(accelerator="cpu", devices=1) + model, optimizer = _train(fabric) + metadata = fabric.load(checkpoint_path, {"model": model, "optimizer": optimizer}) + assert metadata == {"steps": 1} + params_after = list(model.parameters()) + assert all(torch.equal(p0, p1) for p0, p1 in zip(params_before, params_after)) + + # get optimizer state after loading + normal_checkpoint_path = Path(fabric.broadcast(str(tmp_path / "normal-checkpoint.pt"))) + fabric.save(normal_checkpoint_path, {"model": model, "optimizer": optimizer, "steps": 2}) + + # TODO: assert the correct optimizer state was loaded + # optimizer_state_after = torch.load(normal_checkpoint_path)["optimizer"] + # assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} + # torch.testing.assert_close(optimizer_state_after["state"], optimizer_state_before["state"], atol=0, rtol=0) + + # run a step to verify the optimizer state is correct + _train(fabric, model, optimizer) + + # 3. 
verify that a single-device model/strategy states can be loaded into a FSDP model/strategy + strategy = ModelParallelStrategy(_parallelize_feed_forward_fsdp2, data_parallel_size=2, tensor_parallel_size=1) + fabric = Fabric(accelerator="cuda", strategy=strategy, devices=2) + fabric.launch() + model, optimizer = _train(fabric) + + # TODO: Support loading optimizer states from full checkpoint + with pytest.raises(NotImplementedError, match="Loading the optimizer states .* not supported"): + fabric.load(checkpoint_path, {"model": model, "optimizer": optimizer}) + metadata = fabric.load(normal_checkpoint_path, {"model": model}) + assert metadata == {"steps": 2, "optimizer": mock.ANY} + + params_after = [p.full_tensor() for p in model.parameters()] + assert all(torch.equal(p0.cpu(), p1.cpu()) for p0, p1 in zip(params_before, params_after)) + + # TODO: assert the correct optimizer state was loaded + # optimizer_state_after = get_optimizer_state_dict(model, optimizer) + # assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} + # torch.testing.assert_close(optimizer_state_after["state"], optimizer_state_before["state"], atol=0, rtol=0) + # assert optimizer_state_after["param_groups"] == optimizer_state_before["param_groups"] + + # run a step to verify the optimizer state is correct + _train(fabric, model, optimizer) + + +@RunIf(min_torch="2.4", min_cuda_gpus=2, standalone=True) +def test_load_full_state_dict_into_sharded_model(tmp_path): + """Test that the strategy can load a full-state checkpoint into a distributed model.""" + fabric = Fabric(accelerator="cuda", devices=1) + fabric.seed_everything(0) + model, optimizer = _train(fabric) + + # Save a full-state-dict checkpoint + checkpoint_path = Path(fabric.broadcast(str(tmp_path / "full-checkpoint.pt"))) + state = {"model": model, "optimizer": optimizer, "steps": 1} + fabric.save(checkpoint_path, state) + + # Gather all weights and store a copy manually + params_before = torch.cat([p.cpu().view(-1) for p in model.parameters()]) + + # Create a FSDP sharded model + strategy = ModelParallelStrategy(_parallelize_feed_forward_fsdp2, data_parallel_size=2, tensor_parallel_size=1) + fabric = Fabric(accelerator="cuda", strategy=strategy, devices=2) + fabric.launch() + model, optimizer = _train(fabric) + + # TODO: Support loading optimizer states from full checkpoint + with pytest.raises(NotImplementedError, match="Loading the optimizer states .* not supported"): + state = {"model": model, "optimizer": optimizer, "steps": 44} + fabric.load(checkpoint_path, state) + + state = {"model": model, "steps": 44} + fabric.load(checkpoint_path, state) + assert state["steps"] == 1 + + # Gather all weights and compare + params_after = torch.cat([p.full_tensor().cpu().view(-1) for p in model.parameters()]) + assert torch.equal(params_before, params_after) + + # Create a raw state-dict checkpoint to test `Fabric.load_raw` too + raw_checkpoint_path = checkpoint_path.with_name("model-state-dict") + if fabric.global_rank == 0: + checkpoint = torch.load(checkpoint_path) + torch.save(checkpoint["model"], raw_checkpoint_path) + fabric.barrier() + + _train(fabric, model, optimizer) + fabric.load_raw(raw_checkpoint_path, model) + + # Gather all weights and compare + params_after = torch.cat([p.full_tensor().cpu().view(-1) for p in model.parameters()]) + assert torch.equal(params_before, params_after) @RunIf(min_torch="2.3", min_cuda_gpus=2, skip_windows=True, standalone=True) @@ -359,6 +510,33 @@ def 
_run_setup_assertions(empty_init, expected_device): _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) +@RunIf(min_torch="2.3", min_cuda_gpus=2, standalone=True) +def test_save_filter(tmp_path): + strategy = ModelParallelStrategy( + parallelize_fn=_parallelize_feed_forward_fsdp2, + save_distributed_checkpoint=False, + ) + fabric = Fabric(accelerator="cuda", strategy=strategy, devices=2) + fabric.launch() + model = FeedForward() + model = fabric.setup_module(model) + + tmp_path = Path(fabric.broadcast(str(tmp_path))) + state = {"model": model} + filter = {"model": lambda k, v: "bias" in k} + + checkpoint_path = tmp_path / "full.pth" + fabric.save(checkpoint_path, state, filter=filter) + checkpoint = torch.load(checkpoint_path)["model"] + assert set(checkpoint) == {"w1.bias", "w2.bias", "w3.bias"} + assert type(checkpoint["w1.bias"]) is torch.Tensor + + fabric.strategy._save_distributed_checkpoint = True + checkpoint_path = tmp_path / "distributed" + with pytest.raises(NotImplementedError, match="doesn't support loading distributed filtered"): + fabric.save(checkpoint_path, state, filter=filter) + + def _parallelize_single_linear_tp_fsdp2(model, device_mesh): from torch.distributed._composable.fsdp.fully_shard import fully_shard from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module @@ -436,8 +614,6 @@ def test_clip_gradients(clip_type, precision): optimizer.zero_grad() -# TODO: Support loading full checkpoint -@pytest.mark.xfail(raises=NotADirectoryError, reason="Loading from full checkpoint not supported yet.") @RunIf(min_torch="2.3", min_cuda_gpus=4, standalone=True) def test_save_sharded_and_consolidate_and_load(tmp_path): """Test the consolidation of a distributed (DTensor) checkpoint into a single file.""" @@ -450,8 +626,9 @@ def test_save_sharded_and_consolidate_and_load(tmp_path): fabric.launch() model = FeedForward() + model = fabric.setup(model) optimizer = torch.optim.Adam(model.parameters()) - model, optimizer = fabric.setup(model, optimizer) + optimizer = fabric.setup_optimizers(optimizer) state = {"model": model, "optimizer": optimizer, "steps": 1} # run one iteration to init the state of the optimizer @@ -467,6 +644,7 @@ def test_save_sharded_and_consolidate_and_load(tmp_path): "__1_0.distcp", "__2_0.distcp", "__3_0.distcp", + "meta.pt", } # consolidate the checkpoint to a single file @@ -477,12 +655,23 @@ def test_save_sharded_and_consolidate_and_load(tmp_path): fabric.barrier() # re-init and load from full checkpoint - strategy = ModelParallelStrategy(_parallelize_feed_forward_fsdp2_tp) + strategy = ModelParallelStrategy( + _parallelize_feed_forward_fsdp2_tp, + data_parallel_size=2, + tensor_parallel_size=2, + ) fabric = Fabric(accelerator="cuda", devices=4, strategy=strategy) fabric.launch() model = FeedForward() + model = fabric.setup(model) optimizer = torch.optim.Adam(model.parameters()) - model, optimizer = fabric.setup(model, optimizer) + optimizer = fabric.setup_optimizers(optimizer) state = {"model": model, "optimizer": optimizer, "steps": 1} + + # TODO: Support loading optimizer states from full checkpoint + with pytest.raises(NotImplementedError, match="Loading the optimizer states .* not supported"): + fabric.load(checkpoint_path_full, state) + + state = {"model": model, "steps": 1} fabric.load(checkpoint_path_full, state) From cd8acc26c3dd4d37a2b9aca458bd0d511d77df5c Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 15 May 2024 19:07:31 +0200 Subject: [PATCH 027/179] (3/n) Support 2D Parallelism - Efficient 
loading of full-state checkpoints (#19870) * memory-optimized loading of full checkpoints into dist model * simplify * handle buffers * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * handle strict loading, buffers, and add test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * chlog --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/lightning/fabric/CHANGELOG.md | 2 +- .../fabric/strategies/model_parallel.py | 47 ++++++++++++++----- .../test_model_parallel_integration.py | 46 +++++++++++++++++- 3 files changed, 82 insertions(+), 13 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index c5b883320bf2e..b74d9c34ea546 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -15,7 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) -- Added `ModelParallelStrategy` to support 2D parallelism ([#19846](https://github.com/Lightning-AI/pytorch-lightning/pull/19846), [#19852](https://github.com/Lightning-AI/pytorch-lightning/pull/19852)) +- Added `ModelParallelStrategy` to support 2D parallelism ([#19846](https://github.com/Lightning-AI/pytorch-lightning/pull/19846), [#19852](https://github.com/Lightning-AI/pytorch-lightning/pull/19852), [#19870](https://github.com/Lightning-AI/pytorch-lightning/pull/19870)) ### Changed diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 9cd721f930d1b..4141ea454ca51 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -11,11 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
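
The hunks below implement the memory optimization described in the commit message: memory-map the full checkpoint on CPU and copy it into the sharded model one parameter at a time, instead of materializing the whole state dict at once. A minimal standalone sketch of the same idea, assuming PyTorch >= 2.4 and an already-initialized process group (the helper name is hypothetical and persistent-buffer handling is omitted for brevity; it is not part of the patch):

    import torch
    from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict

    def load_full_checkpoint_low_memory(path, module):
        # mmap=True keeps tensor storage on disk until each tensor is actually read,
        # and map_location="cpu" avoids a large allocation on the GPU
        state_dict = torch.load(path, mmap=True, map_location="cpu")
        options = StateDictOptions(broadcast_from_rank0=True, full_state_dict=True)
        # Copy one parameter at a time so only a single full tensor
        # is resident in memory at any point
        for name, submodule in module.named_modules():
            for param_name, _ in submodule.named_parameters(recurse=False):
                key = f"{name}.{param_name}" if name else param_name
                set_model_state_dict(submodule, {param_name: state_dict[key]}, options=options)
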
+import itertools
 import shutil
 from contextlib import ExitStack
 from datetime import timedelta
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Dict, Literal, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Dict, Generator, Literal, Optional, TypeVar, Union
 
 import torch
 from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only
@@ -429,7 +430,6 @@ def _load_checkpoint(
         StateDictOptions,
         get_model_state_dict,
         get_optimizer_state_dict,
-        set_model_state_dict,
         set_optimizer_state_dict,
     )
 
@@ -484,13 +484,8 @@ def _load_checkpoint(
         if not _TORCH_GREATER_EQUAL_2_4:
             raise ImportError("Loading a non-distributed checkpoint into a distributed model requires PyTorch >= 2.4.")
-        state_dict_options = StateDictOptions(
-            broadcast_from_rank0=True,  # type: ignore[call-arg]
-            full_state_dict=True,
-            strict=strict,
-        )
         checkpoint = torch.load(path, mmap=True, map_location="cpu")
-        set_model_state_dict(module, checkpoint.pop(module_key), options=state_dict_options)
+        _load_raw_module_state(checkpoint.pop(module_key), module, strict=strict)
 
         requested_metadata_keys = state.keys() - modules.keys() - optimizers.keys()
         _validate_keys_for_strict_loading(requested_metadata_keys, checkpoint.keys(), strict=strict)
@@ -525,7 +520,9 @@ def _load_raw_module_state_from_path(path: Path, module: Module, world_size: int
     _load_raw_module_state(state_dict=state_dict, module=module, world_size=world_size, strict=strict)
 
 
-def _load_raw_module_state(state_dict: Dict[str, Any], module: Module, world_size: int, strict: bool = True) -> None:
+def _load_raw_module_state(
+    state_dict: Dict[str, Any], module: Module, world_size: int = 1, strict: bool = True
+) -> None:
     """Loads the state dict into the module by gathering all weights first and then writing them back to each shard."""
     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 
@@ -535,11 +532,39 @@ def _load_raw_module_state(state_dict: Dict[str, Any], module: Module, world_size: int, strict: bool = True) -> None:
 
         from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict
 
-        state_dict_options = StateDictOptions(broadcast_from_rank0=True, full_state_dict=True)  # type: ignore[call-arg]
-        set_model_state_dict(module, state_dict, options=state_dict_options)
+        state_dict_options = StateDictOptions(
+            broadcast_from_rank0=True,  # type: ignore[call-arg]
+            full_state_dict=True,
+            strict=strict,  # gets ignored at the moment
+        )
+
+        for submodule_name, submodule in module.named_modules():
+            for param_name, _ in _named_parameters_and_buffers_to_load(submodule):
+                full_param_name = f"{submodule_name}{'.' if submodule_name else ''}{param_name}"
+                if full_param_name not in state_dict:
+                    # Note: PyTorch does not currently respect the `strict` setting in state_dict_options!
+                    if not strict:
+                        continue
+                    raise KeyError(
+                        f"The model contains a key '{full_param_name}' that does not exist in the loaded checkpoint."
+                        " To disable strict loading, set `strict=False`."
+ ) + local_state_dict = {param_name: state_dict[full_param_name]} + set_model_state_dict(submodule, local_state_dict, options=state_dict_options) elif isinstance(module, FSDP): with _get_full_state_dict_context(module, world_size=world_size, rank0_only=False): module.load_state_dict(state_dict, strict=strict) else: module.load_state_dict(state_dict, strict=strict) + + +def _named_parameters_and_buffers_to_load(module: Module) -> Generator: + """Returns parameters and buffers, with non-persistent buffers excluded.""" + for param_name, param in itertools.chain( + module.named_buffers(recurse=False), + module.named_parameters(recurse=False), + ): + if param_name in module._non_persistent_buffers_set: + continue + yield param_name, param diff --git a/tests/tests_fabric/strategies/test_model_parallel_integration.py b/tests/tests_fabric/strategies/test_model_parallel_integration.py index d864a9687ebb5..1f12822c69ee6 100644 --- a/tests/tests_fabric/strategies/test_model_parallel_integration.py +++ b/tests/tests_fabric/strategies/test_model_parallel_integration.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from copy import deepcopy from pathlib import Path from unittest import mock @@ -20,7 +21,7 @@ import torch.nn as nn import torch.nn.functional as F from lightning.fabric import Fabric -from lightning.fabric.strategies import ModelParallelStrategy +from lightning.fabric.strategies.model_parallel import ModelParallelStrategy, _load_raw_module_state from lightning.fabric.utilities.load import _load_distributed_checkpoint from torch.utils.data import DataLoader, DistributedSampler @@ -675,3 +676,46 @@ def test_save_sharded_and_consolidate_and_load(tmp_path): state = {"model": model, "steps": 1} fabric.load(checkpoint_path_full, state) + + +@RunIf(min_torch="2.4", min_cuda_gpus=2, standalone=True) +def test_load_raw_module_state(): + from torch.distributed.device_mesh import init_device_mesh + from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module + + class CustomModel(nn.Module): + def __init__(self): + super().__init__() + self.parameter = nn.Parameter(torch.rand(2, 2)) + self.layer1 = nn.Linear(4, 4) + self.layer2 = nn.Linear(4, 4) + self.register_buffer("persistent_buffer", torch.rand(2), persistent=True) + self.register_buffer("non_persistent_buffer", torch.rand(2), persistent=False) + + fabric = Fabric(accelerator="cuda", devices=2) + fabric.launch() + fabric.seed_everything(0) + + with fabric.init_module(): + model = CustomModel() + + state_dict = deepcopy(model.state_dict()) + + with fabric.init_module(): + model = CustomModel() + + device_mesh = init_device_mesh("cuda", mesh_shape=(2,), mesh_dim_names=("tp",)) + plan = {"layer1": ColwiseParallel()} + parallelize_module(model, device_mesh, plan) + _load_raw_module_state(state_dict, model, strict=True) + + assert torch.equal(model.parameter, state_dict["parameter"]) + assert torch.equal(model.layer1.weight.full_tensor(), state_dict["layer1.weight"]) + assert torch.equal(model.layer2.weight, state_dict["layer2.weight"]) + assert torch.equal(model.persistent_buffer, state_dict["persistent_buffer"]) + + state_dict.pop("parameter") + with pytest.raises(KeyError, match="The model contains a key 'parameter' that does not exist"): + _load_raw_module_state(state_dict, model, strict=True) + + _load_raw_module_state(state_dict, model, strict=False) From 1d0c6aae9634da61bbf58262fd85edd08438c6cc Mon Sep 17 00:00:00 2001 From: awaelchli 
Date: Fri, 17 May 2024 23:17:32 +0200 Subject: [PATCH 028/179] (4/n) Support 2D Parallelism - Loading optimizer states correctly (#19872) * Load optimizer state * move to utility * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/fabric/strategies/fsdp.py | 15 ++--- .../fabric/strategies/model_parallel.py | 31 +++++++-- .../test_model_parallel_integration.py | 67 ++++++++----------- 4 files changed, 58 insertions(+), 57 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index b74d9c34ea546..6dc4101c8f3d9 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -15,7 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) -- Added `ModelParallelStrategy` to support 2D parallelism ([#19846](https://github.com/Lightning-AI/pytorch-lightning/pull/19846), [#19852](https://github.com/Lightning-AI/pytorch-lightning/pull/19852), [#19870](https://github.com/Lightning-AI/pytorch-lightning/pull/19870)) +- Added `ModelParallelStrategy` to support 2D parallelism ([#19846](https://github.com/Lightning-AI/pytorch-lightning/pull/19846), [#19852](https://github.com/Lightning-AI/pytorch-lightning/pull/19852), [#19870](https://github.com/Lightning-AI/pytorch-lightning/pull/19870), [#19872](https://github.com/Lightning-AI/pytorch-lightning/pull/19872)) ### Changed diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index bd3548a22be9b..fdd842209a0e3 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -529,7 +529,6 @@ def load_checkpoint( from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp import OptimStateKeyType modules = {key: module for key, module in state.items() if _has_fsdp_modules(module)} if len(modules) == 0: @@ -590,7 +589,10 @@ def load_checkpoint( if _is_full_checkpoint(path): checkpoint = _lazy_load(path) - from lightning.fabric.strategies.model_parallel import _load_raw_module_state + from lightning.fabric.strategies.model_parallel import ( + _load_raw_module_state, + _rekey_optimizer_state_if_needed, + ) _load_raw_module_state(checkpoint.pop(module_key), module=module, world_size=self.world_size, strict=strict) @@ -605,14 +607,7 @@ def load_checkpoint( for optim_key, optim in optimizers.items(): # rank0_only should be false because we need to load the optimizer state on all ranks with _get_full_state_dict_context(module, world_size=self.world_size, rank0_only=False): - temp_state_dict = checkpoint.pop(optim_key) - - # Handling the case where the optimizer state is saved from a normal optimizer - if isinstance(list(temp_state_dict["state"].keys())[0], int): - temp_state_dict = FSDP.rekey_optim_state_dict( - temp_state_dict, OptimStateKeyType.PARAM_NAME, module - ) - + temp_state_dict = _rekey_optimizer_state_if_needed(checkpoint.pop(optim_key), module) optim_state_dict = FSDP.optim_state_dict_to_load( optim_state_dict=temp_state_dict, model=module, diff --git a/src/lightning/fabric/strategies/model_parallel.py 
b/src/lightning/fabric/strategies/model_parallel.py index 4141ea454ca51..fd927ed26fbfb 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -475,18 +475,26 @@ def _load_checkpoint( return metadata if _is_full_checkpoint(path): - # TODO: Support loading optimizer states - if any(isinstance(obj, Optimizer) for obj in state.values()): - raise NotImplementedError( - "Loading the optimizer states from a non-distributed checkpoint into a distributed model" - " is currently not supported." - ) if not _TORCH_GREATER_EQUAL_2_4: raise ImportError("Loading a non-distributed checkpoint into a distributed model requires PyTorch >= 2.4.") checkpoint = torch.load(path, mmap=True, map_location="cpu") _load_raw_module_state(checkpoint.pop(module_key), module, strict=strict) + state_dict_options = StateDictOptions( + broadcast_from_rank0=True, # type: ignore[call-arg] + full_state_dict=True, + strict=strict, + ) + for optimizer_name, optimizer in optimizers.items(): + optimizer_state = _rekey_optimizer_state_if_needed(checkpoint.pop(optimizer_name), module) + set_optimizer_state_dict( + module, + optimizer, + optim_state_dict=optimizer_state, + options=state_dict_options, + ) + requested_metadata_keys = state.keys() - modules.keys() - optimizers.keys() _validate_keys_for_strict_loading(requested_metadata_keys, checkpoint.keys(), strict=strict) @@ -568,3 +576,14 @@ def _named_parameters_and_buffers_to_load(module: Module) -> Generator: if param_name in module._non_persistent_buffers_set: continue yield param_name, param + + +def _rekey_optimizer_state_if_needed(optimizer_state_dict: Dict[str, Any], module: Module) -> Dict[str, Any]: + """Handles the case where the optimizer state is saved from a normal optimizer and converts the keys to parameter + names.""" + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp import OptimStateKeyType + + if isinstance(list(optimizer_state_dict["state"].keys())[0], int): + optimizer_state_dict = FSDP.rekey_optim_state_dict(optimizer_state_dict, OptimStateKeyType.PARAM_NAME, module) + return optimizer_state_dict diff --git a/tests/tests_fabric/strategies/test_model_parallel_integration.py b/tests/tests_fabric/strategies/test_model_parallel_integration.py index 1f12822c69ee6..a023cbc56811b 100644 --- a/tests/tests_fabric/strategies/test_model_parallel_integration.py +++ b/tests/tests_fabric/strategies/test_model_parallel_integration.py @@ -91,7 +91,6 @@ def _parallelize_feed_forward_tp(model, device_mesh): def _parallelize_feed_forward_fsdp2(model, device_mesh): from torch.distributed._composable.fsdp.fully_shard import fully_shard - from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper dp_mesh = device_mesh["data_parallel"] assert dp_mesh.ndim == 1 # Hybrid-sharding not supported @@ -101,8 +100,12 @@ def _parallelize_feed_forward_fsdp2(model, device_mesh): fully_shard(model.w2, mesh=dp_mesh) fully_shard(model.w3, mesh=dp_mesh) - # Activation checkpointing - model = checkpoint_wrapper(model) + # TODO: Re-enable activation checkpointing + # Currently, state dict keys get prefixed with '_checkpoint_wrapper' in the keys + # which leads to mismatches when loading weights into a checkpoint-wrapped module. + # PyTorch should handle this automatically. 
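
For context on the `_rekey_optimizer_state_if_needed` helper added above: a state dict produced by a plain `torch.optim` optimizer keys its `state` entries by integer parameter index, while the distributed loading path expects fully qualified parameter names. A small illustrative sketch of the difference (model, shapes, and printed keys are examples, not taken from the patch):

    import torch
    import torch.nn as nn
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp import OptimStateKeyType

    model = nn.Linear(2, 2)
    optimizer = torch.optim.Adam(model.parameters())
    model(torch.randn(1, 2)).sum().backward()
    optimizer.step()  # Adam creates per-parameter state lazily on the first step

    plain = optimizer.state_dict()
    print(list(plain["state"]))  # integer ids, e.g. [0, 1]

    # Rekey integer ids to parameter names, the format the distributed
    # set_optimizer_state_dict() path expects when broadcasting a full state dict
    named = FSDP.rekey_optim_state_dict(plain, OptimStateKeyType.PARAM_NAME, model)
    print(list(named["state"]))  # e.g. ['weight', 'bias']
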
+ + # model = checkpoint_wrapper(model) return model @@ -341,21 +344,17 @@ def test_save_full_state_dict(tmp_path): fabric.launch() model, optimizer = _train(fabric) - # TODO: Support loading optimizer states from full checkpoint - with pytest.raises(NotImplementedError, match="Loading the optimizer states .* not supported"): - fabric.load(checkpoint_path, {"model": model, "optimizer": optimizer}) - - metadata = fabric.load(checkpoint_path, {"model": model}) - assert metadata == {"steps": 1, "optimizer": mock.ANY} + metadata = fabric.load(checkpoint_path, {"model": model, "optimizer": optimizer}) + assert metadata == {"steps": 1} params_after = [p.full_tensor() for p in model.parameters()] assert all(torch.equal(p0.cpu(), p1.cpu()) for p0, p1 in zip(params_before, params_after)) - # TODO: assert the correct optimizer state was loaded - # optimizer_state_after = get_optimizer_state_dict(model, optimizer) - # assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} - # torch.testing.assert_close(optimizer_state_after["state"], optimizer_state_before["state"], atol=0, rtol=0) - # assert optimizer_state_after["param_groups"] == optimizer_state_before["param_groups"] + optimizer_state_after = get_optimizer_state_dict(model, optimizer) + optimizer_state_after["param_groups"][0]["betas"] = tuple(optimizer_state_after["param_groups"][0]["betas"]) + assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} + torch.testing.assert_close(optimizer_state_after["state"], optimizer_state_before["state"], atol=0, rtol=0) + assert optimizer_state_after["param_groups"] == optimizer_state_before["param_groups"] # run a step to verify the optimizer state is correct _train(fabric, model, optimizer) @@ -372,10 +371,12 @@ def test_save_full_state_dict(tmp_path): normal_checkpoint_path = Path(fabric.broadcast(str(tmp_path / "normal-checkpoint.pt"))) fabric.save(normal_checkpoint_path, {"model": model, "optimizer": optimizer, "steps": 2}) - # TODO: assert the correct optimizer state was loaded - # optimizer_state_after = torch.load(normal_checkpoint_path)["optimizer"] - # assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} - # torch.testing.assert_close(optimizer_state_after["state"], optimizer_state_before["state"], atol=0, rtol=0) + optimizer_state_after = torch.load(normal_checkpoint_path)["optimizer"] + assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} + assert torch.equal( + optimizer_state_after["state"][0]["exp_avg"], + optimizer_state_before["state"]["_forward_module.w1.weight"]["exp_avg"].full_tensor().cpu(), + ) # run a step to verify the optimizer state is correct _train(fabric, model, optimizer) @@ -386,20 +387,17 @@ def test_save_full_state_dict(tmp_path): fabric.launch() model, optimizer = _train(fabric) - # TODO: Support loading optimizer states from full checkpoint - with pytest.raises(NotImplementedError, match="Loading the optimizer states .* not supported"): - fabric.load(checkpoint_path, {"model": model, "optimizer": optimizer}) - metadata = fabric.load(normal_checkpoint_path, {"model": model}) - assert metadata == {"steps": 2, "optimizer": mock.ANY} + metadata = fabric.load(normal_checkpoint_path, {"model": model, "optimizer": optimizer}) + assert metadata == {"steps": 2} params_after = [p.full_tensor() for p in model.parameters()] assert all(torch.equal(p0.cpu(), p1.cpu()) for p0, 
p1 in zip(params_before, params_after)) - # TODO: assert the correct optimizer state was loaded - # optimizer_state_after = get_optimizer_state_dict(model, optimizer) - # assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} - # torch.testing.assert_close(optimizer_state_after["state"], optimizer_state_before["state"], atol=0, rtol=0) - # assert optimizer_state_after["param_groups"] == optimizer_state_before["param_groups"] + optimizer_state_after = get_optimizer_state_dict(model, optimizer) + optimizer_state_after["param_groups"][0]["betas"] = tuple(optimizer_state_after["param_groups"][0]["betas"]) + assert set(optimizer_state_after.keys()) == set(optimizer_state_before.keys()) == {"state", "param_groups"} + torch.testing.assert_close(optimizer_state_after["state"], optimizer_state_before["state"], atol=0, rtol=0) + assert optimizer_state_after["param_groups"] == optimizer_state_before["param_groups"] # run a step to verify the optimizer state is correct _train(fabric, model, optimizer) @@ -426,12 +424,7 @@ def test_load_full_state_dict_into_sharded_model(tmp_path): fabric.launch() model, optimizer = _train(fabric) - # TODO: Support loading optimizer states from full checkpoint - with pytest.raises(NotImplementedError, match="Loading the optimizer states .* not supported"): - state = {"model": model, "optimizer": optimizer, "steps": 44} - fabric.load(checkpoint_path, state) - - state = {"model": model, "steps": 44} + state = {"model": model, "optimizer": optimizer, "steps": 44} fabric.load(checkpoint_path, state) assert state["steps"] == 1 @@ -669,12 +662,6 @@ def test_save_sharded_and_consolidate_and_load(tmp_path): optimizer = torch.optim.Adam(model.parameters()) optimizer = fabric.setup_optimizers(optimizer) state = {"model": model, "optimizer": optimizer, "steps": 1} - - # TODO: Support loading optimizer states from full checkpoint - with pytest.raises(NotImplementedError, match="Loading the optimizer states .* not supported"): - fabric.load(checkpoint_path_full, state) - - state = {"model": model, "steps": 1} fabric.load(checkpoint_path_full, state) From 32e241870b61a872a6e61e4503229397cb34b49a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 18 May 2024 01:03:31 +0200 Subject: [PATCH 029/179] (5/n) Support 2D Parallelism in Lightning Trainer (#19878) * ModelParallelStrategy for Lightning Trainer * mypy * import fix * fix torchscript errors * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix docs issue * fix test execution * Update src/lightning/pytorch/strategies/model_parallel.py --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Luca Antiga --- .../fabric/strategies/model_parallel.py | 48 +-- src/lightning/pytorch/core/module.py | 14 + src/lightning/pytorch/strategies/__init__.py | 2 + .../pytorch/strategies/model_parallel.py | 321 +++++++++++++++++ .../connectors/accelerator_connector.py | 2 + .../strategies/test_model_parallel.py | 7 +- .../test_model_parallel_integration.py | 72 ++-- .../strategies/test_model_parallel.py | 211 +++++++++++ .../test_model_parallel_integration.py | 340 ++++++++++++++++++ 9 files changed, 957 insertions(+), 60 deletions(-) create mode 100644 src/lightning/pytorch/strategies/model_parallel.py create mode 100644 tests/tests_pytorch/strategies/test_model_parallel.py create mode 100644 tests/tests_pytorch/strategies/test_model_parallel_integration.py diff --git 
a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index fd927ed26fbfb..a143d168cdede 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -163,7 +163,13 @@ def _configure_launcher(self) -> None: def setup_environment(self) -> None: super().setup_environment() self._setup_distributed() - self._setup_device_mesh() + if self._data_parallel_size == "auto": + self._data_parallel_size = self.num_nodes + if self._tensor_parallel_size == "auto": + self._tensor_parallel_size = self.num_processes + self._device_mesh = _setup_device_mesh( + self._data_parallel_size, self._tensor_parallel_size, self.world_size, self.root_device + ) @override def setup_module(self, module: TModel) -> TModel: @@ -303,25 +309,6 @@ def _setup_distributed(self) -> None: assert self.cluster_environment is not None _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) - def _setup_device_mesh(self) -> None: - from torch.distributed.device_mesh import init_device_mesh - - if self._data_parallel_size == "auto": - self._data_parallel_size = self.num_nodes - if self._tensor_parallel_size == "auto": - self._tensor_parallel_size = self.num_processes - if self._data_parallel_size * self._tensor_parallel_size != self.world_size: - raise RuntimeError( - f"The sizes `data_parallel_size={self._data_parallel_size}` and" - f" `tensor_parallel_size={self._tensor_parallel_size}` multiplied should equal the world size" - f" ({self.world_size})." - ) - self._device_mesh = init_device_mesh( - device_type=self.root_device.type, - mesh_shape=(self._data_parallel_size, self._tensor_parallel_size), - mesh_dim_names=("data_parallel", "tensor_parallel"), - ) - def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) @@ -510,6 +497,27 @@ def _load_checkpoint( ) +def _setup_device_mesh( + data_parallel_size: int, + tensor_parallel_size: int, + world_size: int, + device: torch.device, +) -> "DeviceMesh": + from torch.distributed.device_mesh import init_device_mesh + + if data_parallel_size * tensor_parallel_size != world_size: + raise RuntimeError( + f"The sizes `data_parallel_size={data_parallel_size}` and" + f" `tensor_parallel_size={tensor_parallel_size}` multiplied should equal the world size" + f" ({world_size})." 
+        )
+    return init_device_mesh(
+        device_type=device.type,
+        mesh_shape=(data_parallel_size, tensor_parallel_size),
+        mesh_dim_names=("data_parallel", "tensor_parallel"),
+    )
+
+
 def _has_dtensor_modules(module: object) -> TypeGuard[Module]:
     from torch.distributed._tensor import DTensor
 
diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py
index 3cb55566fb8b7..5a4f8d4e1bbb1 100644
--- a/src/lightning/pytorch/core/module.py
+++ b/src/lightning/pytorch/core/module.py
@@ -20,6 +20,7 @@ from pathlib import Path
 from typing import (
     IO,
+    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -76,6 +77,9 @@
     OptimizerLRScheduler,
 )
 
+if TYPE_CHECKING:
+    from torch.distributed.device_mesh import DeviceMesh
+
 _ONNX_AVAILABLE = RequirementCache("onnx")
 
 warning_cache = WarningCache()
@@ -110,6 +114,7 @@ class LightningModule(
         "trainer",
         "fabric",
         "strict_loading",
+        "device_mesh",
     ]
     + _DeviceDtypeModuleMixin.__jit_unused_properties__
     + HyperparametersMixin.__jit_unused_properties__
@@ -142,6 +147,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
         self._fabric: Optional["lf.Fabric"] = None
         self._fabric_optimizers: List[_FabricOptimizer] = []
 
+        # access to device mesh in `configure_model()` hook
+        self._device_mesh: Optional["DeviceMesh"] = None
+
     @overload
     def optimizers(
         self, use_pl_optimizer: Literal[True] = True
@@ -319,6 +327,12 @@ def loggers(self) -> Union[List[Logger], List[FabricLogger]]:
             return self._trainer.loggers
         return []
 
+    @property
+    def device_mesh(self) -> Optional["DeviceMesh"]:
+        """Strategies like ``ModelParallelStrategy`` will create a device mesh that can be accessed in the
+        :meth:`~lightning.pytorch.core.hooks.ModelHooks.configure_model` hook to parallelize the LightningModule."""
+        return self._device_mesh
+
     def _call_batch_hook(self, hook_name: str, *args: Any) -> Any:
         trainer = self._trainer
         if trainer:
diff --git a/src/lightning/pytorch/strategies/__init__.py b/src/lightning/pytorch/strategies/__init__.py
index 14ffe52870ba5..9c2b2a6a3a621 100644
--- a/src/lightning/pytorch/strategies/__init__.py
+++ b/src/lightning/pytorch/strategies/__init__.py
@@ -18,6 +18,7 @@ from lightning.pytorch.strategies.ddp import DDPStrategy
 from lightning.pytorch.strategies.deepspeed import DeepSpeedStrategy
 from lightning.pytorch.strategies.fsdp import FSDPStrategy
+from lightning.pytorch.strategies.model_parallel import ModelParallelStrategy
 from lightning.pytorch.strategies.parallel import ParallelStrategy
 from lightning.pytorch.strategies.single_device import SingleDeviceStrategy
 from lightning.pytorch.strategies.single_xla import SingleDeviceXLAStrategy  # noqa: F401
@@ -31,6 +32,7 @@
     "DDPStrategy",
     "DeepSpeedStrategy",
     "FSDPStrategy",
+    "ModelParallelStrategy",
     "ParallelStrategy",
     "SingleDeviceStrategy",
     "Strategy",
diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py
new file mode 100644
index 0000000000000..304b9bc04fc2d
--- /dev/null
+++ b/src/lightning/pytorch/strategies/model_parallel.py
@@ -0,0 +1,321 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +from contextlib import contextmanager, nullcontext +from datetime import timedelta +from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Mapping, Optional, Union + +import torch +from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only +from torch import Tensor +from torch.optim import Optimizer +from typing_extensions import override + +import lightning.pytorch as pl +from lightning.fabric.plugins import CheckpointIO +from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout +from lightning.fabric.strategies.model_parallel import _setup_device_mesh +from lightning.fabric.utilities.distributed import ( + _distributed_is_initialized, + _get_default_process_group_backend_for_device, + _init_dist_connection, + _sync_ddp_if_available, +) +from lightning.fabric.utilities.distributed import group as _group +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 +from lightning.fabric.utilities.init import _materialize_distributed_module +from lightning.fabric.utilities.optimizer import _optimizers_to_device +from lightning.fabric.utilities.seed import reset_seed +from lightning.fabric.utilities.types import _PATH, ReduceOp +from lightning.pytorch.core.optimizer import LightningOptimizer +from lightning.pytorch.strategies.launchers.subprocess_script import _SubprocessScriptLauncher +from lightning.pytorch.strategies.parallel import ParallelStrategy +from lightning.pytorch.strategies.strategy import TBroadcast +from lightning.pytorch.trainer.states import TrainerFn +from lightning.pytorch.utilities.model_helpers import is_overridden +from lightning.pytorch.utilities.rank_zero import rank_zero_only + +if TYPE_CHECKING: + from torch.distributed.device_mesh import DeviceMesh + + +class ModelParallelStrategy(ParallelStrategy): + """Enables user-defined parallelism applied to a model. + + .. warning:: This is an :ref:`experimental ` feature. + + Currently supports up to 2D parallelism. Specifically, it supports the combination of + Fully Sharded Data-Parallel 2 (FSDP2) with Tensor Parallelism (DTensor). These PyTorch APIs are currently still + experimental in PyTorch (see https://pytorch.org/docs/stable/distributed.tensor.parallel.html). + Requires PyTorch 2.3 or newer. + + Arguments: + data_parallel_size: The number of devices within a data-parallel group. Defaults to ``"auto"``, which + sets this size to the number of nodes in the cluster. + tensor_parallel_size: The number of devices within a tensor-parallel group. Defaults to ``"auto"``, which + sets this size to the number of GPUs in a single node. + save_distributed_checkpoint: If ``True``, each rank saves its shard of weights and optimizer states to a file. + The checkpoint is a folder with as many files as the world size. + If ``False``, the full weights and optimizer states get assembled on rank 0 and saved to a single file. 
+ + """ + + def __init__( + self, + data_parallel_size: Union[Literal["auto"], int] = "auto", + tensor_parallel_size: Union[Literal["auto"], int] = "auto", + save_distributed_checkpoint: bool = True, + process_group_backend: Optional[str] = None, + timeout: Optional[timedelta] = default_pg_timeout, + ) -> None: + super().__init__() + if not _TORCH_GREATER_EQUAL_2_3: + raise ImportError(f"{type(self).__name__} requires PyTorch 2.3 or higher.") + self._data_parallel_size = data_parallel_size + self._tensor_parallel_size = tensor_parallel_size + self._save_distributed_checkpoint = save_distributed_checkpoint + self._process_group_backend: Optional[str] = process_group_backend + self._timeout: Optional[timedelta] = timeout + self._device_mesh: Optional["DeviceMesh"] = None + self.num_nodes = 1 + + @property + def device_mesh(self) -> "DeviceMesh": + if self._device_mesh is None: + raise RuntimeError("Accessing the device mesh before processes have initialized is not allowed.") + return self._device_mesh + + @property + @override + def checkpoint_io(self) -> CheckpointIO: + raise NotImplementedError(f"The `{type(self).__name__}` does not use the `CheckpointIO` plugin interface.") + + @checkpoint_io.setter + @override + def checkpoint_io(self, io: CheckpointIO) -> None: + raise NotImplementedError(f"The `{type(self).__name__}` does not support setting a `CheckpointIO` plugin.") + + @property + @override + def root_device(self) -> torch.device: + assert self.parallel_devices is not None + return self.parallel_devices[self.local_rank] + + @property + def num_processes(self) -> int: + return len(self.parallel_devices) if self.parallel_devices is not None else 0 + + @property + @override + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + assert self.device_mesh is not None + data_parallel_mesh = self.device_mesh["data_parallel"] + return {"num_replicas": data_parallel_mesh.size(), "rank": data_parallel_mesh.get_local_rank()} + + @property + def process_group_backend(self) -> Optional[str]: + return self._process_group_backend + + @property + @override + def restore_checkpoint_after_setup(self) -> bool: + return True + + @property + @override + def lightning_restore_optimizer(self) -> bool: + return False + + @override + def _configure_launcher(self) -> None: + assert self.cluster_environment is not None + if not self.cluster_environment.creates_processes_externally: + self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes) + + @override + def setup_environment(self) -> None: + super().setup_environment() + self._setup_distributed() + if self._data_parallel_size == "auto": + self._data_parallel_size = self.num_nodes + if self._tensor_parallel_size == "auto": + self._tensor_parallel_size = self.num_processes + self._device_mesh = _setup_device_mesh( + self._data_parallel_size, self._tensor_parallel_size, self.world_size, self.root_device + ) + # Users can access device mesh in `LightningModule.configure_model()` + assert self.lightning_module is not None + self.lightning_module._device_mesh = self._device_mesh + + @override + def setup(self, trainer: "pl.Trainer") -> None: + from torch.distributed.fsdp import FullyShardedDataParallel + + assert self.model is not None + assert self.accelerator is not None + self.accelerator.setup(trainer) + + if not is_overridden("configure_model", self.lightning_module): + raise TypeError( + f"When using the {type(self).__name__}, you are required to override the `configure_model()` hook in" + f" the 
LightningModule and apply parallelization there." + ) + if any(isinstance(mod, FullyShardedDataParallel) for mod in self.model.modules()): + raise TypeError( + "Found modules that are wrapped with `torch.distributed.fsdp.FullyShardedDataParallel`." + f" The `{self.__class__.__name__}` only supports the new FSDP2 APIs in PyTorch >= 2.3." + ) + + _materialize_distributed_module(self.model, self.root_device) + + self.model = self.precision_plugin.convert_module(self.model) + self.model_to_device() # move all remaining layers if any left on CPU. + + self.barrier() + + if trainer.state.fn == TrainerFn.FITTING: + self.setup_optimizers(trainer) + self.setup_precision_plugin() + if trainer.state.fn == TrainerFn.FITTING: + _optimizers_to_device(self.optimizers, self.root_device) + + @override + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + # If we're setting up for evaluation after fitting, we need to discard the optimizers + # since we're rewrapping the model, otherwise optimizer param references are no longer valid + # and subsequent checkpoint saving can fail + self._reset_optimizers_and_schedulers() + + return super().setup_optimizers(trainer) + + @override + def model_to_device(self) -> None: + assert self.model is not None + self.model.to(self.root_device) + + @contextmanager + @override + def tensor_init_context(self, empty_init: Optional[bool] = None) -> Generator[None, None, None]: + # Materializaton happens in `setup()` + empty_init_context = torch.device("meta") if empty_init else nullcontext() + with empty_init_context, self.precision_plugin.tensor_init_context(): + yield + + @override + def barrier(self, name: Optional[str] = None) -> None: + if not _distributed_is_initialized(): + return + if torch.distributed.get_backend() == "nccl": + torch.distributed.barrier(device_ids=self._determine_device_ids()) + else: + torch.distributed.barrier() + + @override + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: + if not _distributed_is_initialized(): + return obj + + obj = [obj] + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + return obj[0] + + @override + def reduce( + self, + tensor: Union[Tensor, Any], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = "mean", + ) -> Tensor: + if isinstance(tensor, Tensor): + return _sync_ddp_if_available(tensor, group, reduce_op=reduce_op) + return tensor + + def _determine_device_ids(self) -> List[int]: + return [self.root_device.index] + + @override + def teardown(self) -> None: + assert self.cluster_environment is not None + assert self.accelerator is not None + self.cluster_environment.teardown() + self.precision_plugin.teardown() + self.accelerator.teardown() + + @override + def lightning_module_state_dict(self) -> Dict[str, Any]: + from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict + + state_dict_options = StateDictOptions(full_state_dict=(not self._save_distributed_checkpoint), cpu_offload=True) + assert self.model is not None + return get_model_state_dict(self.model, options=state_dict_options) + + @override + def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = True) -> None: + # Override to do nothing, the strategy already loaded the states in `load_checkpoint()` + pass + + @override + def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Any]: + from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict + from torch.distributed.fsdp import 
FullyShardedDataParallel as FSDP + from torch.distributed.fsdp import OptimStateKeyType + + state_dict_options = StateDictOptions(full_state_dict=(not self._save_distributed_checkpoint), cpu_offload=True) + if isinstance(optimizer, LightningOptimizer): + optimizer = optimizer._optimizer + + assert self.model is not None + state_dict = get_optimizer_state_dict(self.model, optimizer, options=state_dict_options) + if not self._save_distributed_checkpoint: + # Store the optimizer state dict in standard format + state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_ID, self.model) + return state_dict + + @override + def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + # Override to do nothing, the strategy already loaded the states in `load_checkpoint()` + pass + + @override + def save_checkpoint( + self, checkpoint: Dict[str, Any], filepath: _PATH, storage_options: Optional[Any] = None + ) -> None: + if storage_options is not None: + raise TypeError( + f"`{type(self).__name__}.save_checkpoint(..., storage_options=...)` is not supported because" + f" `{type(self).__name__}` does not use the `CheckpointIO`." + ) + raise NotImplementedError("Checkpoint saving is not yet implemented.") + + @override + def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: + raise NotImplementedError("Checkpoint loading is not yet implemented.") + + def _setup_distributed(self) -> None: + super().setup_environment() + reset_seed() + self.set_world_ranks() + self._process_group_backend = self._get_process_group_backend() + assert self.cluster_environment is not None + _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + + def _get_process_group_backend(self) -> str: + return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) + + def set_world_ranks(self) -> None: + if self.cluster_environment is not None: + self.cluster_environment.set_global_rank(self.node_rank * self.num_processes + self.local_rank) + self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) + # `LightningEnvironment.set_global_rank` will do this too, but we cannot rely on that implementation detail + # additionally, for some implementations, the setter is a no-op, so it's safer to access the getter + rank_zero_only.rank = utils_rank_zero_only.rank = self.global_rank diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py index a191859c06c43..6a350030ea0f7 100644 --- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py +++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py @@ -53,6 +53,7 @@ DDPStrategy, DeepSpeedStrategy, FSDPStrategy, + ModelParallelStrategy, ParallelStrategy, SingleDeviceStrategy, SingleDeviceXLAStrategy, @@ -600,6 +601,7 @@ def is_distributed(self) -> bool: DDPStrategy, FSDPStrategy, DeepSpeedStrategy, + ModelParallelStrategy, XLAStrategy, ] if _habana_available_and_importable(): diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py index 54efb999a8cf1..03b9268b3158e 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -118,8 +118,7 @@ def test_parallelize_fn_call(): @RunIf(min_torch="2.3") def test_no_backward_sync(): - """Test that the backward sync control calls `.no_sync()`, and only on a 
module wrapped in - FullyShardedDataParallel.""" + """Test that the backward sync control disables gradient sync on modules that benefit from it.""" from torch.distributed._composable.fsdp import FSDP strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) @@ -141,7 +140,7 @@ def test_no_backward_sync(): @RunIf(min_torch="2.3") def test_save_checkpoint_storage_options(tmp_path): - """Test that the FSDP strategy does not accept storage options for saving checkpoints.""" + """Test that the strategy does not accept storage options for saving checkpoints.""" strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) with pytest.raises( TypeError, match=escape("ModelParallelStrategy.save_checkpoint(..., storage_options=...)` is not") @@ -326,7 +325,7 @@ def test_load_raw_checkpoint_optimizer_unsupported(tmp_path): @RunIf(min_torch="2.3") -@mock.patch("lightning.fabric.strategies.ModelParallelStrategy._setup_device_mesh") +@mock.patch("lightning.fabric.strategies.model_parallel._setup_device_mesh") @mock.patch("torch.distributed.init_process_group") def test_set_timeout(init_process_group_mock, _): """Test that the timeout gets passed to the ``torch.distributed.init_process_group`` function.""" diff --git a/tests/tests_fabric/strategies/test_model_parallel_integration.py b/tests/tests_fabric/strategies/test_model_parallel_integration.py index a023cbc56811b..75fb1003cfad7 100644 --- a/tests/tests_fabric/strategies/test_model_parallel_integration.py +++ b/tests/tests_fabric/strategies/test_model_parallel_integration.py @@ -29,42 +29,6 @@ from tests_fabric.helpers.runif import RunIf -@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=4) -def test_setup_device_mesh(): - from torch.distributed.device_mesh import DeviceMesh - - for dp_size, tp_size in ((1, 4), (4, 1), (2, 2)): - strategy = ModelParallelStrategy( - parallelize_fn=(lambda m, _: m), - data_parallel_size=dp_size, - tensor_parallel_size=tp_size, - ) - fabric = Fabric(accelerator="auto", devices=4, strategy=strategy) - fabric.launch() - - device_mesh = fabric.strategy.device_mesh - assert isinstance(device_mesh, DeviceMesh) - assert device_mesh.device_type == fabric.device.type - assert device_mesh.mesh_dim_names == ("data_parallel", "tensor_parallel") - assert device_mesh.size(0) == dp_size - assert device_mesh.size(1) == tp_size - assert device_mesh.ndim == 2 - - fabric.barrier() - - # Passing "auto" will select internode and intranode dimensions automatically - strategy = ModelParallelStrategy( - parallelize_fn=(lambda m, _: m), - data_parallel_size="auto", - tensor_parallel_size="auto", - ) - fabric = Fabric(accelerator="auto", devices=4, num_nodes=1, strategy=strategy) - fabric.launch() - assert fabric.strategy.device_mesh.mesh_dim_names == ("data_parallel", "tensor_parallel") - assert fabric.strategy.device_mesh.size(0) == 1 - assert fabric.strategy.device_mesh.size(1) == 4 - - class FeedForward(nn.Module): def __init__(self): super().__init__() @@ -116,6 +80,42 @@ def _parallelize_feed_forward_fsdp2_tp(model, device_mesh): return model +@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=4) +def test_setup_device_mesh(): + from torch.distributed.device_mesh import DeviceMesh + + for dp_size, tp_size in ((1, 4), (4, 1), (2, 2)): + strategy = ModelParallelStrategy( + parallelize_fn=(lambda m, _: m), + data_parallel_size=dp_size, + tensor_parallel_size=tp_size, + ) + fabric = Fabric(accelerator="auto", devices=4, strategy=strategy) + fabric.launch() + + device_mesh = fabric.strategy.device_mesh + assert 
isinstance(device_mesh, DeviceMesh) + assert device_mesh.device_type == fabric.device.type + assert device_mesh.mesh_dim_names == ("data_parallel", "tensor_parallel") + assert device_mesh.size(0) == dp_size + assert device_mesh.size(1) == tp_size + assert device_mesh.ndim == 2 + + fabric.barrier() + + # Passing "auto" will select internode and intranode dimensions automatically + strategy = ModelParallelStrategy( + parallelize_fn=(lambda m, _: m), + data_parallel_size="auto", + tensor_parallel_size="auto", + ) + fabric = Fabric(accelerator="auto", devices=4, num_nodes=1, strategy=strategy) + fabric.launch() + assert fabric.strategy.device_mesh.mesh_dim_names == ("data_parallel", "tensor_parallel") + assert fabric.strategy.device_mesh.size(0) == 1 + assert fabric.strategy.device_mesh.size(1) == 4 + + @RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=2) def test_tensor_parallel(): from torch.distributed._tensor import DTensor diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py new file mode 100644 index 0000000000000..4b9b0887c85bf --- /dev/null +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -0,0 +1,211 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from datetime import timedelta +from re import escape +from unittest import mock +from unittest.mock import Mock + +import pytest +import torch +import torch.nn as nn +from lightning.pytorch import LightningModule +from lightning.pytorch.plugins.environments import LightningEnvironment +from lightning.pytorch.strategies import ModelParallelStrategy + +from tests_pytorch.helpers.runif import RunIf + + +@mock.patch("lightning.pytorch.strategies.model_parallel._TORCH_GREATER_EQUAL_2_3", False) +def test_torch_greater_equal_2_3(): + with pytest.raises(ImportError, match="ModelParallelStrategy requires PyTorch 2.3 or higher"): + ModelParallelStrategy() + + +@RunIf(min_torch="2.3") +def test_device_mesh_access(): + strategy = ModelParallelStrategy() + with pytest.raises(RuntimeError, match="Accessing the device mesh .* not allowed"): + _ = strategy.device_mesh + + +@RunIf(min_torch="2.3") +@pytest.mark.parametrize( + ("num_nodes", "devices", "invalid_dp_size", "invalid_tp_size"), + [ + (1, 4, 1, 1), + (1, 4, 2, 3), + (1, 4, 4, 2), + (2, 4, 1, 4), + (2, 4, 2, 1), + ], +) +def test_validate_device_mesh_dimensions(num_nodes, devices, invalid_dp_size, invalid_tp_size): + """Test passing sizes that don't multiply to the world size raises an error.""" + strategy = ModelParallelStrategy( + data_parallel_size=invalid_dp_size, + tensor_parallel_size=invalid_tp_size, + ) + strategy._setup_distributed = Mock() + strategy._accelerator = Mock() + strategy.cluster_environment = Mock( + world_size=Mock(return_value=(num_nodes * devices)), local_rank=Mock(return_value=1) + ) + strategy.parallel_devices = [torch.device("cpu")] * devices + strategy.num_nodes = num_nodes + with pytest.raises(RuntimeError, match="multiplied should equal the world size"): + strategy.setup_environment() + + +@RunIf(min_torch="2.3") +def test_checkpoint_io_unsupported(): + """Test that the ModelParallel strategy does not support the `CheckpointIO` plugin.""" + strategy = ModelParallelStrategy() + with pytest.raises(NotImplementedError, match="does not use the `CheckpointIO` plugin"): + _ = strategy.checkpoint_io + + with pytest.raises(NotImplementedError, match="does not support setting a `CheckpointIO` plugin"): + strategy.checkpoint_io = Mock() + + +@RunIf(min_torch="2.3") +def test_fsdp_v1_modules_unsupported(): + """Test that the strategy won't allow setting up a module wrapped with the legacy FSDP API.""" + from torch.distributed.fsdp import FullyShardedDataParallel + + class Model(LightningModule): + def configure_model(self): + pass + + model = Model() + model.modules = Mock(return_value=[Mock(spec=FullyShardedDataParallel)]) + strategy = ModelParallelStrategy() + strategy.model = model + strategy._lightning_module = model + strategy._accelerator = Mock() + + with pytest.raises(TypeError, match="only supports the new FSDP2 APIs in PyTorch >= 2.3"): + strategy.setup(Mock()) + + +@RunIf(min_torch="2.3") +def test_configure_model_required(): + class Model1(LightningModule): + pass + + class Model2(LightningModule): + def configure_model(self): + pass + + model = Model1() + strategy = ModelParallelStrategy() + strategy.model = model + strategy._lightning_module = model + strategy._accelerator = Mock() + strategy._parallel_devices = [torch.device("cpu")] + + with pytest.raises(TypeError, match="you are required to override the `configure_model"): + strategy.setup(Mock()) + + model = Model2() + strategy.model = model + strategy._lightning_module = model + strategy.setup(Mock()) + + +@RunIf(min_torch="2.3") +def 
test_save_checkpoint_storage_options(tmp_path): + """Test that the strategy does not accept storage options for saving checkpoints.""" + strategy = ModelParallelStrategy() + with pytest.raises( + TypeError, match=escape("ModelParallelStrategy.save_checkpoint(..., storage_options=...)` is not") + ): + strategy.save_checkpoint(checkpoint=Mock(), filepath=tmp_path, storage_options=Mock()) + + +@RunIf(min_torch="2.3") +def test_save_checkpoint_path_exists(): + pytest.skip("Checkpoint saving and loading not implemented") + + +@RunIf(min_torch="2.3") +def test_load_full_checkpoint_support(): + pytest.skip("Checkpoint saving and loading not implemented") + + +@RunIf(min_torch="2.3") +def test_load_unknown_checkpoint_type(): + pytest.skip("Checkpoint saving and loading not implemented") + + +@RunIf(min_torch="2.3") +@mock.patch("lightning.pytorch.strategies.model_parallel._setup_device_mesh") +@mock.patch("torch.distributed.init_process_group") +def test_set_timeout(init_process_group_mock, _): + """Test that the timeout gets passed to the ``torch.distributed.init_process_group`` function.""" + test_timedelta = timedelta(seconds=30) + strategy = ModelParallelStrategy(timeout=test_timedelta) + strategy._lightning_module = Mock() + strategy.parallel_devices = [torch.device("cpu")] + strategy.cluster_environment = LightningEnvironment() + strategy.accelerator = Mock() + strategy.setup_environment() + process_group_backend = strategy._get_process_group_backend() + global_rank = strategy.cluster_environment.global_rank() + world_size = strategy.cluster_environment.world_size() + init_process_group_mock.assert_called_with( + process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta + ) + + +@RunIf(min_torch="2.3") +def test_meta_device_materialization(): + """Test that the `setup()` method materializes meta-device tensors in the LightningModule.""" + + class NoResetParameters(nn.Module): + def __init__(self): + super().__init__() + self.weight = nn.Parameter(torch.ones(4, 4)) + + class CustomModel(LightningModule): + def __init__(self): + super().__init__() + # nn.Sequential as a parameterless module + self.layer1 = nn.Sequential(NoResetParameters(), NoResetParameters()) + self.layer2 = nn.Linear(4, 4) + self.register_buffer("buffer", torch.rand(2)) + + def reset_parameters(self): + self.buffer.fill_(1.0) + + def configure_model(self) -> None: + pass + + with torch.device("meta"): + model = CustomModel() + assert model.layer1[0].weight.is_meta + assert model.layer2.weight.is_meta + assert model.buffer.is_meta + + strategy = ModelParallelStrategy() + strategy._accelerator = Mock() + strategy._device_mesh = Mock() + strategy._parallel_devices = [torch.device("cpu")] + strategy._lightning_module = model + strategy.model = model + + with pytest.warns(UserWarning, match=r"`reset_parameters\(\)` method for re-initialization: NoResetParameters"): + strategy.setup(Mock()) + assert all(not p.is_meta for p in model.parameters()) + assert all(not b.is_meta for b in model.buffers()) diff --git a/tests/tests_pytorch/strategies/test_model_parallel_integration.py b/tests/tests_pytorch/strategies/test_model_parallel_integration.py new file mode 100644 index 0000000000000..bbac2a6078f9c --- /dev/null +++ b/tests/tests_pytorch/strategies/test_model_parallel_integration.py @@ -0,0 +1,340 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from lightning.pytorch import LightningModule, Trainer, seed_everything +from lightning.pytorch.demos.boring_classes import BoringModel, RandomDataset +from lightning.pytorch.strategies import ModelParallelStrategy +from torch.utils.data import DataLoader, DistributedSampler +from torchmetrics.classification import Accuracy + +from tests_pytorch.helpers.runif import RunIf + + +class FeedForward(nn.Module): + def __init__(self): + super().__init__() + self.w1 = nn.Linear(32, 64) + self.w2 = nn.Linear(32, 64) + self.w3 = nn.Linear(64, 32) + + def forward(self, x): + return self.w3(F.silu(self.w1(x)) * self.w2(x)) + + +def _parallelize_feed_forward_tp(model, device_mesh): + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module + + tp_mesh = device_mesh["tensor_parallel"] + tp_plan = { + "w1": ColwiseParallel(), + "w2": ColwiseParallel(), + "w3": RowwiseParallel(), + } + parallelize_module(model, tp_mesh, tp_plan) + return model + + +def _parallelize_feed_forward_fsdp2(model, device_mesh): + from torch.distributed._composable.fsdp.fully_shard import fully_shard + + dp_mesh = device_mesh["data_parallel"] + assert dp_mesh.ndim == 1 # Hybrid-sharding not supported + + # Fully-shard each layer + fully_shard(model.w1, mesh=dp_mesh) + fully_shard(model.w2, mesh=dp_mesh) + fully_shard(model.w3, mesh=dp_mesh) + + # TODO: Re-enable activation checkpointing + # Currently, state dict keys get prefixed with '_checkpoint_wrapper' in the keys + # which leads to mismatches when loading weights into a checkpoint-wrapped module. + # PyTorch should handle this automatically. 
+ + # model = checkpoint_wrapper(model) + + return model + + +def _parallelize_feed_forward_fsdp2_tp(model, device_mesh): + model = _parallelize_feed_forward_tp(model, device_mesh) + model = _parallelize_feed_forward_fsdp2(model, device_mesh) + return model + + +class TemplateModel(LightningModule): + def __init__(self): + super().__init__() + self.model = FeedForward() + + def training_step(self, batch): + output = self.model(batch) + return output.sum() + + def train_dataloader(self): + dataset_size = 8 + dataset = RandomDataset(32, dataset_size) + return DataLoader(dataset, batch_size=2) + + def configure_optimizers(self): + return torch.optim.AdamW(self.model.parameters()) + + +class FSDP2Model(TemplateModel): + def configure_model(self): + _parallelize_feed_forward_fsdp2(self.model, device_mesh=self.device_mesh) + + +class TensorParallelModel(TemplateModel): + def configure_model(self): + _parallelize_feed_forward_tp(self.model, device_mesh=self.device_mesh) + + +class FSDP2TensorParallelModel(TemplateModel): + def configure_model(self): + _parallelize_feed_forward_fsdp2_tp(self.model, device_mesh=self.device_mesh) + + +@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=4) +def test_setup_device_mesh(): + from torch.distributed.device_mesh import DeviceMesh + + for dp_size, tp_size in ((1, 4), (4, 1), (2, 2)): + strategy = ModelParallelStrategy( + data_parallel_size=dp_size, + tensor_parallel_size=tp_size, + ) + trainer = Trainer( + accelerator="auto", + devices=4, + strategy=strategy, + logger=False, + enable_checkpointing=False, + max_steps=1, + ) + + class Model(BoringModel): + def configure_model(self): + device_mesh = self.device_mesh + assert isinstance(device_mesh, DeviceMesh) + assert device_mesh.device_type == model.device.type + assert device_mesh.mesh_dim_names == ("data_parallel", "tensor_parallel") + assert device_mesh.size(0) == dp_size + assert device_mesh.size(1) == tp_size + assert device_mesh.ndim == 2 + + model = Model() + trainer.fit(model) + + # Passing "auto" will select internode and intranode dimensions automatically + strategy = ModelParallelStrategy( + data_parallel_size="auto", + tensor_parallel_size="auto", + ) + trainer = Trainer( + accelerator="auto", + devices=4, + num_nodes=1, + strategy=strategy, + logger=False, + enable_checkpointing=False, + max_steps=1, + ) + + class Model(BoringModel): + def configure_model(self): + device_mesh = self.device_mesh + assert device_mesh.mesh_dim_names == ("data_parallel", "tensor_parallel") + assert device_mesh.size(0) == 1 + assert device_mesh.size(1) == 4 + + model = Model() + trainer.fit(model) + + +@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=2) +def test_tensor_parallel(): + from torch.distributed._tensor import DTensor + + class Model(TensorParallelModel): + def on_train_start(self): + device_mesh = self.device_mesh + optimizer = self.optimizers() + assert all( + tensor.device_mesh == device_mesh["tensor_parallel"] for tensor in optimizer.param_groups[0]["params"] + ) + assert all(isinstance(weight, DTensor) for weight in self.model.parameters()) + assert self.model.w1.weight.device_mesh == device_mesh["tensor_parallel"] + + # No data sharding, all GPUs get the same input inside a TP group + dataloader = self.trainer.train_dataloader + assert len(dataloader) == 8 // dataloader.batch_size + assert isinstance(dataloader.sampler, DistributedSampler) + + def training_step(self, batch): + # All batches must be identical across TP group + batches = self.all_gather(batch) + assert 
all(torch.equal(batches[0], batches[i]) for i in range(1, len(batches))) + return super().training_step(batch) + + trainer = Trainer( + accelerator="auto", + devices=2, + strategy=ModelParallelStrategy(), + max_steps=2, + enable_checkpointing=False, + logger=False, + ) + + seed_everything(0) + with trainer.init_module(empty_init=True): + model = Model() + + trainer.fit(model) + + +@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=4) +def test_fsdp2_tensor_parallel(): + from torch.distributed._tensor import DTensor + + class Model(FSDP2TensorParallelModel): + def on_train_start(self): + optimizer = self.optimizers() + assert all(isinstance(weight, DTensor) for weight in self.model.parameters()) + assert all(isinstance(tensor, DTensor) for tensor in optimizer.param_groups[0]["params"]) + assert self.model.w1.weight.device_mesh.ndim == 2 + assert self.model.w1.weight.device_mesh.size(0) == 2 + assert self.model.w1.weight.device_mesh.size(1) == 2 + assert all(weight.device.type != "meta" for weight in self.model.parameters()) + assert all(tensor.device_mesh.ndim == 2 for tensor in optimizer.param_groups[0]["params"]) + assert all(tensor.device.type != "meta" for tensor in optimizer.param_groups[0]["params"]) + + # No data sharding across TP dimension, sharding across data-parallel dimension only + device_mesh = self.device_mesh + dp_mesh = device_mesh["data_parallel"] + dataloader = self.trainer.train_dataloader + assert len(dataloader) == 8 // dataloader.batch_size // dp_mesh.size() + assert isinstance(dataloader.sampler, DistributedSampler) + + def training_step(self, batch): + batches = self.all_gather(batch) + dp_mesh = self.device_mesh["data_parallel"] + tp_mesh = self.device_mesh["tensor_parallel"] + + # Batches across the TP dimension must be identical + batches_tp = batches[tp_mesh.mesh] + assert all(torch.equal(batches_tp[0], batches_tp[i]) for i in range(1, len(batches_tp))) + # Batches across the DP dimension must be different + batches_dp = batches[dp_mesh.mesh] + assert all(not torch.equal(batches_dp[0], batches_dp[i]) for i in range(1, len(batches_dp))) + + return super().training_step(batch) + + strategy = ModelParallelStrategy( + data_parallel_size=2, + tensor_parallel_size=2, + ) + trainer = Trainer( + accelerator="auto", + devices=4, + strategy=strategy, + max_steps=2, + enable_checkpointing=False, + logger=False, + ) + + seed_everything(0) + with trainer.init_module(empty_init=True): + model = Model() + + trainer.fit(model) + + +@RunIf(min_torch="2.3", min_cuda_gpus=2, standalone=True) +def test_modules_without_parameters(tmp_path): + """Test that TorchMetrics get moved to the device despite not having any parameters.""" + + class MetricsModel(TensorParallelModel): + def __init__(self): + super().__init__() + self.metric = Accuracy("multiclass", num_classes=10) + assert self.metric.device == self.metric.tp.device == torch.device("cpu") + + def setup(self, stage) -> None: + assert self.metric.device == self.metric.tp.device == torch.device("cpu") + + def training_step(self, batch): + assert self.metric.device.type == self.metric.tp.device.type == "cuda" + self.metric(torch.rand(2, 10, device=self.device), torch.randint(0, 10, size=(2,), device=self.device)) + return super().training_step(batch) + + model = MetricsModel() + trainer = Trainer( + default_root_dir=tmp_path, + accelerator="cuda", + devices=2, + strategy=ModelParallelStrategy(), + max_steps=1, + enable_checkpointing=False, + logger=False, + ) + trainer.fit(model) + + +@RunIf(min_torch="2.3", min_cuda_gpus=2, 
standalone=True) +@pytest.mark.parametrize( + ("precision", "expected_dtype"), + [ + ("32-true", torch.float32), + ("16-true", torch.float16), + pytest.param("bf16-true", torch.bfloat16, marks=RunIf(bf16_cuda=True)), + ], +) +def test_module_init_context(precision, expected_dtype, tmp_path): + """Test that the module under the init-context gets moved to the right device and dtype.""" + + class Model(FSDP2Model): + def on_train_start(self): + assert self.model.w1.weight.device == torch.device("cuda", self.local_rank) + assert self.model.w1.weight.dtype == expected_dtype + optimizer = self.optimizers(use_pl_optimizer=False) + assert optimizer.param_groups[0]["params"][0].device.type == "cuda" + + def _run_setup_assertions(empty_init, expected_device): + trainer = Trainer( + default_root_dir=tmp_path, + accelerator="cuda", + devices=2, + strategy=ModelParallelStrategy(), + precision=precision, + max_steps=1, + barebones=True, + enable_checkpointing=False, + logger=False, + ) + with trainer.init_module(empty_init=empty_init): + model = Model() + + # The model is on the CPU/meta-device until after `ModelParallelStrategy.setup()` + assert model.model.w1.weight.device == expected_device + assert model.model.w1.weight.dtype == expected_dtype + trainer.fit(model) + + # Case 1: No empty init + _run_setup_assertions(empty_init=False, expected_device=torch.device("cpu")) + + # Case 2: Empty-init with PyTorch >= 2.1 supports meta device + _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) From c8059d7bfd0ccd4d3d29efe9a57a41b110d8a02a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 19 May 2024 02:35:58 +0200 Subject: [PATCH 030/179] (6/n) Support 2D Parallelism - Trainer example (#19879) * Add 2D parallel example * replace with torchtitan code --- examples/fabric/tensor_parallel/model.py | 102 ++-- .../fabric/tensor_parallel/parallelism.py | 6 +- examples/pytorch/tensor_parallel/README.md | 49 ++ examples/pytorch/tensor_parallel/data.py | 21 + examples/pytorch/tensor_parallel/model.py | 456 ++++++++++++++++++ .../pytorch/tensor_parallel/parallelism.py | 100 ++++ examples/pytorch/tensor_parallel/train.py | 75 +++ 7 files changed, 765 insertions(+), 44 deletions(-) create mode 100644 examples/pytorch/tensor_parallel/README.md create mode 100644 examples/pytorch/tensor_parallel/data.py create mode 100644 examples/pytorch/tensor_parallel/model.py create mode 100644 examples/pytorch/tensor_parallel/parallelism.py create mode 100644 examples/pytorch/tensor_parallel/train.py diff --git a/examples/fabric/tensor_parallel/model.py b/examples/fabric/tensor_parallel/model.py index ad8dbd99e1c08..3c9e7de472b90 100644 --- a/examples/fabric/tensor_parallel/model.py +++ b/examples/fabric/tensor_parallel/model.py @@ -1,5 +1,12 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Llama 2 is licensed under the LLAMA 2 Community License, +# Copyright (c) Meta Platforms, Inc. All Rights Reserved. 
+ from dataclasses import dataclass from typing import Optional, Tuple @@ -19,15 +26,16 @@ class ModelArgs: multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 ffn_dim_multiplier: Optional[float] = None norm_eps: float = 1e-5 + rope_theta: float = 10000 max_batch_size: int = 32 - max_seq_len: int = 32768 + max_seq_len: int = 2048 # If `True`, then each transformer block init uses its layer ID, and if # `False`, each uses the total number of transformer blocks depth_init: bool = True -def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor: """Precompute the frequency tensor for complex exponentials (cis) with given dimensions. This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' @@ -44,17 +52,20 @@ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): """ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) - t = torch.arange(end, device=freqs.device) # type: ignore - freqs = torch.outer(t, freqs).float() # type: ignore + t = torch.arange(end, device=freqs.device) + freqs = torch.outer(t, freqs).float() return torch.polar(torch.ones_like(freqs), freqs) # complex64 -def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor: """Reshape frequency tensor for broadcasting it with another tensor. This function reshapes the frequency tensor to have the same shape as the target tensor 'x' for the purpose of broadcasting the frequency tensor during element-wise operations. + The input freqs_cis tensor is assumed to be of shape (max_seqlen, dim), + and the first seqlen elements will be sliced, but dim must match x. + Args: freqs_cis (torch.Tensor): Frequency tensor to be reshaped. x (torch.Tensor): Target tensor for broadcasting compatibility. @@ -65,7 +76,9 @@ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): """ ndim = x.ndim assert 0 <= 1 < ndim - assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + seqlen = x.shape[1] + freqs_cis = freqs_cis[0:seqlen] + assert freqs_cis.shape == (seqlen, x.shape[-1]) shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] return freqs_cis.view(*shape) @@ -149,7 +162,6 @@ class Attention(nn.Module): Attributes: n_kv_heads (int): Number of key and value heads. n_heads (int): Number of query heads. - n_local_kv_heads (int): Number of local key and value heads. n_rep (int): Number of repetitions for local heads. head_dim (int): Dimension size of each attention head. wq (Linear): Linear transformation for queries. @@ -191,15 +203,16 @@ def forward( torch.Tensor: Output tensor after attention. 
""" - bsz, seqlen, _ = x.shape + bs, seqlen, _ = x.shape xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) - xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim) - xk = xk.view(bsz, seqlen, self.n_kv_heads, self.head_dim) - xv = xv.view(bsz, seqlen, self.n_kv_heads, self.head_dim) + xq = xq.view(bs, seqlen, self.n_heads, self.head_dim) + xk = xk.view(bs, seqlen, self.n_kv_heads, self.head_dim) + xv = xv.view(bs, seqlen, self.n_kv_heads, self.head_dim) xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + # repeat k/v heads if n_kv_heads < n_heads keys = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) values = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim) @@ -210,7 +223,7 @@ def forward( # we use casual mask for training output = F.scaled_dot_product_attention(xq, xk, xv, is_causal=True) output = output.transpose(1, 2).contiguous() # (bs, seqlen, n_local_heads, head_dim) - output = output.view(bsz, seqlen, -1) + output = output.view(bs, seqlen, -1) return self.wo(output) @@ -346,33 +359,30 @@ def __init__(self, model_args: ModelArgs): self.model_args = model_args self.vocab_size = model_args.vocab_size self.n_layers = model_args.n_layers - self.model_dim = model_args.dim self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim) - self.register_buffer( - "freqs_cis", - precompute_freqs_cis( - model_args.dim // model_args.n_heads, - # Need to compute until at least the max token limit for generation - # (use 2x max sequence length to be safe) - model_args.max_seq_len * 2, - ), - ) - self.layers = torch.nn.ModuleList() + + # TODO persistent should be set to false, since this buffer can be recomputed. + # however, we set it to true for 2 reasons. (1) due to pytorch/pytorch#123411, + # compile or pipeline-tracer will not correctly handle non-persistent buffers, + # so we need to fix that. (2) if we initialize pipeline-parallel models from + # a seed checkpoint rather than calling init_weights, we need freqs_cis to be + # initialized by the checkpoint, or we need to add a separate initializer for + # just the non-persistent buffers that is called after loading checkpoints. + self.register_buffer("freqs_cis", self._precompute_freqs_cis(), persistent=True) + + self.layers = torch.nn.ModuleDict() for layer_id in range(model_args.n_layers): - self.layers.append(TransformerBlock(layer_id, model_args)) + self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args) self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps) + self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False) + self.init_weights() def reset_parameters(self): with torch.device(self.freqs_cis.device): - self.freqs_cis = precompute_freqs_cis( - self.model_args.dim // self.model_args.n_heads, - # Need to compute until at least the max token limit for generation - # (use 2x max sequence length to be safe) - self.model_args.max_seq_len * 2, - ) + self.freqs_cis = self._precompute_freqs_cis() def init_weights(self): """[Note: On ``init_weights`` vs. @@ -388,8 +398,10 @@ def init_weights(self): ``Transformer`` root module to avoid reinitializing tensors. 
""" + with torch.device(self.freqs_cis.device): + self.freqs_cis = self._precompute_freqs_cis() nn.init.normal_(self.tok_embeddings.weight) - for layer in self.layers: + for layer in self.layers.values(): layer.init_weights() self.norm.reset_parameters() final_out_std = self.model_args.dim**-0.5 @@ -402,6 +414,15 @@ def init_weights(self): b=cutoff_factor * final_out_std, ) + def _precompute_freqs_cis(self) -> torch.Tensor: + return precompute_freqs_cis( + self.model_args.dim // self.model_args.n_heads, + # Need to compute until at least the max token limit for generation + # (use 2x max sequence length to be safe) + self.model_args.max_seq_len * 2, + self.model_args.rope_theta, + ) + def forward(self, tokens: torch.Tensor): """Perform a forward pass through the Transformer model. @@ -412,15 +433,14 @@ def forward(self, tokens: torch.Tensor): torch.Tensor: Output logits after applying the Transformer model. """ - _bsz, seqlen = tokens.shape - h = self.tok_embeddings(tokens) - self.freqs_cis = self.freqs_cis.to(h.device) - freqs_cis = self.freqs_cis[0:seqlen] - - for layer in self.layers: - h = layer(h, freqs_cis) - h = self.norm(h) - return self.output(h).float() + # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages + h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens + + for layer in self.layers.values(): + h = layer(h, self.freqs_cis) + + h = self.norm(h) if self.norm else h + return self.output(h).float() if self.output else h @classmethod def from_model_args(cls, model_args: ModelArgs) -> "Transformer": diff --git a/examples/fabric/tensor_parallel/parallelism.py b/examples/fabric/tensor_parallel/parallelism.py index 088d3f2ef1bd0..f6f38aa499efa 100644 --- a/examples/fabric/tensor_parallel/parallelism.py +++ b/examples/fabric/tensor_parallel/parallelism.py @@ -46,7 +46,7 @@ def parallelize(model: Transformer, device_mesh: DeviceMesh) -> Transformer: model = parallelize_module(model, tp_mesh, plan) # Parallelize each transformer block - for transformer_block in model.layers: + for transformer_block in model.layers.values(): plan = { "attention": PrepareModuleInput( input_layouts=(Shard(1), None), @@ -83,12 +83,12 @@ def parallelize(model: Transformer, device_mesh: DeviceMesh) -> Transformer: mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32) fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy} - for layer_id, transformer_block in enumerate(model.layers): + for layer_id, transformer_block in model.layers.items(): # Apply activation checkpointing transformer_block = checkpoint_wrapper(transformer_block) # As an optimization, do not reshard after forward for the last # transformer block since FSDP would prefetch it immediately - reshard_after_forward = layer_id < len(model.layers) - 1 + reshard_after_forward = int(layer_id) < len(model.layers) - 1 fully_shard( transformer_block, **fsdp_config, diff --git a/examples/pytorch/tensor_parallel/README.md b/examples/pytorch/tensor_parallel/README.md new file mode 100644 index 0000000000000..97675976148f8 --- /dev/null +++ b/examples/pytorch/tensor_parallel/README.md @@ -0,0 +1,49 @@ +## Tensor Parallel and 2D Parallel + +This example shows how to apply tensor-parallelism to your model (here Llama 2 7B) with the `ModelParallelStrategy`, and how it can be combined with FSDP (2D parallelism). +PyTorch 2.3+ and a machine with at least 4 GPUs and 24 GB memory each are required to run this example. 
+
+```bash
+pip install 'torch>=2.3'
+```
+
+Navigate to this example folder and run the training script:
+
+```bash
+cd examples/pytorch/tensor_parallel
+python train.py
+```
+
+You should see an output like this:
+
+```
+GPU available: True (cuda), used: True
+TPU available: False, using: 0 TPU cores
+HPU available: False, using: 0 HPUs
+
+Number of model parameters: 6.7 B
+Starting training ...
+
+Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
+Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4
+Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4
+Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4
+----------------------------------------------------------------------------------------------------
+distributed_backend=nccl
+All distributed processes registered. Starting with 4 processes
+----------------------------------------------------------------------------------------------------
+
+LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
+LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
+LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
+LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
+
+Epoch 0: 100%|█████████████████████████████████████████████| 10/10 [01:49<00:00, 0.09it/s, v_num=2]
+`Trainer.fit` stopped: `max_epochs=1` reached.
+Saving a (distributed) checkpoint ...
+Training successfully completed!
+Peak memory usage: 36.73 GB
+```
+
+> \[!NOTE\]
+> The `ModelParallelStrategy` is experimental and subject to change. Report issues on [GitHub](https://github.com/Lightning-AI/pytorch-lightning/issues).
diff --git a/examples/pytorch/tensor_parallel/data.py b/examples/pytorch/tensor_parallel/data.py
new file mode 100644
index 0000000000000..ba36987283ffd
--- /dev/null
+++ b/examples/pytorch/tensor_parallel/data.py
@@ -0,0 +1,21 @@
+import torch
+from torch.utils.data import Dataset
+
+
+class RandomTokenDataset(Dataset):
+    def __init__(self, vocab_size: int, seq_length: int):
+        self.vocab_size = vocab_size
+        self.seq_length = seq_length
+        self.tokens = torch.randint(
+            self.vocab_size,
+            size=(len(self), self.seq_length + 1),
+            # Set a seed to make this toy dataset the same on each rank
+            # The Trainer will add a `DistributedSampler` to shard the data correctly
+            generator=torch.Generator().manual_seed(42),
+        )
+
+    def __len__(self) -> int:
+        return 128
+
+    def __getitem__(self, item: int):
+        return self.tokens[item]
diff --git a/examples/pytorch/tensor_parallel/model.py b/examples/pytorch/tensor_parallel/model.py
new file mode 100644
index 0000000000000..3c9e7de472b90
--- /dev/null
+++ b/examples/pytorch/tensor_parallel/model.py
@@ -0,0 +1,456 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Llama 2 is licensed under the LLAMA 2 Community License,
+# Copyright (c) Meta Platforms, Inc. All Rights Reserved.
+ + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn + + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + rope_theta: float = 10000 + + max_batch_size: int = 32 + max_seq_len: int = 2048 + # If `True`, then each transformer block init uses its layer ID, and if + # `False`, each uses the total number of transformer blocks + depth_init: bool = True + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor: + """Precompute the frequency tensor for complex exponentials (cis) with given dimensions. + + This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' + and the end index 'end'. The 'theta' parameter scales the frequencies. + The returned tensor contains complex values in complex64 data type. + + Args: + dim (int): Dimension of the frequency tensor. + end (int): End index for precomputing frequencies. + theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0. + + Returns: + torch.Tensor: Precomputed frequency tensor with complex exponentials. + + """ + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) + freqs = torch.outer(t, freqs).float() + return torch.polar(torch.ones_like(freqs), freqs) # complex64 + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor: + """Reshape frequency tensor for broadcasting it with another tensor. + + This function reshapes the frequency tensor to have the same shape as the target tensor 'x' + for the purpose of broadcasting the frequency tensor during element-wise operations. + + The input freqs_cis tensor is assumed to be of shape (max_seqlen, dim), + and the first seqlen elements will be sliced, but dim must match x. + + Args: + freqs_cis (torch.Tensor): Frequency tensor to be reshaped. + x (torch.Tensor): Target tensor for broadcasting compatibility. + + Returns: + torch.Tensor: Reshaped frequency tensor. + + """ + ndim = x.ndim + assert 0 <= 1 < ndim + seqlen = x.shape[1] + freqs_cis = freqs_cis[0:seqlen] + assert freqs_cis.shape == (seqlen, x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Apply rotary embeddings to input tensors using the given frequency tensor. + + This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided + frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor + is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are + returned as real tensors. + + Args: + xq (torch.Tensor): Query tensor to apply rotary embeddings. + xk (torch.Tensor): Key tensor to apply rotary embeddings. + freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings. 
+ + """ + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + x[:, :, :, None, :] + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + + +class RMSNorm(nn.Module): + """Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. + + """ + + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x: torch.Tensor): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x: torch.Tensor): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) # type: ignore + + +class Attention(nn.Module): + """Multi-head attention module. + + Args: + model_args (ModelArgs): Model configuration arguments. + + Attributes: + n_kv_heads (int): Number of key and value heads. + n_heads (int): Number of query heads. + n_rep (int): Number of repetitions for local heads. + head_dim (int): Dimension size of each attention head. + wq (Linear): Linear transformation for queries. + wk (Linear): Linear transformation for keys. + wv (Linear): Linear transformation for values. + wo (Linear): Linear transformation for output. + + """ + + def __init__(self, model_args: ModelArgs): + super().__init__() + self.n_heads = model_args.n_heads + self.n_kv_heads = model_args.n_heads if model_args.n_kv_heads is None else model_args.n_kv_heads + self.n_rep = self.n_heads // self.n_kv_heads + self.head_dim = model_args.dim // model_args.n_heads + + self.wq = nn.Linear(model_args.dim, model_args.n_heads * self.head_dim, bias=False) + self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False) + self.wo = nn.Linear(model_args.n_heads * self.head_dim, model_args.dim, bias=False) + + def init_weights(self, init_std: float): + for linear in (self.wq, self.wk, self.wv): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02) + nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std) + + def forward( + self, + x: torch.Tensor, + freqs_cis: torch.Tensor, + ): + """Forward pass of the attention module. + + Args: + x (torch.Tensor): Input tensor. + freqs_cis (torch.Tensor): Precomputed frequency tensor. + + Returns: + torch.Tensor: Output tensor after attention. 
+
+        """
+        bs, seqlen, _ = x.shape
+        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+
+        xq = xq.view(bs, seqlen, self.n_heads, self.head_dim)
+        xk = xk.view(bs, seqlen, self.n_kv_heads, self.head_dim)
+        xv = xv.view(bs, seqlen, self.n_kv_heads, self.head_dim)
+
+        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        keys = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+        values = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+
+        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+        xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+        xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+
+        # we use causal mask for training
+        output = F.scaled_dot_product_attention(xq, xk, xv, is_causal=True)
+        output = output.transpose(1, 2).contiguous()  # (bs, seqlen, n_local_heads, head_dim)
+        output = output.view(bs, seqlen, -1)
+        return self.wo(output)
+
+
+class FeedForward(nn.Module):
+    """FeedForward module.
+
+    Args:
+        dim (int): Input dimension.
+        hidden_dim (int): Hidden dimension of the feedforward layer.
+        multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+        ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None.
+
+    Attributes:
+        w1 (Linear): Linear transformation for the first layer.
+        w2 (Linear): Linear transformation for the second layer.
+        w3 (Linear): Linear transformation for the third layer.
+
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int,
+        ffn_dim_multiplier: Optional[float],
+    ):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        # custom dim factor multiplier
+        if ffn_dim_multiplier is not None:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+    def init_weights(self, init_std: float):
+        nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
+        for linear in (self.w2, self.w3):
+            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
+
+
+class TransformerBlock(nn.Module):
+    """TransformerBlock Module.
+
+    Args:
+        layer_id (int): Identifier for the layer.
+        model_args (ModelArgs): Model configuration arguments.
+
+    Attributes:
+        n_heads (int): Number of attention heads.
+        dim (int): Dimension size of the model.
+        head_dim (int): Dimension size of each attention head.
+        attention (Attention): Attention module.
+        feed_forward (FeedForward): FeedForward module.
+        layer_id (int): Identifier for the layer.
+        attention_norm (RMSNorm): Layer normalization for attention output.
+        ffn_norm (RMSNorm): Layer normalization for feedforward output.
+
+    """
+
+    def __init__(self, layer_id: int, model_args: ModelArgs):
+        super().__init__()
+        self.n_heads = model_args.n_heads
+        self.dim = model_args.dim
+        self.attention = Attention(model_args)
+        self.feed_forward = FeedForward(
+            dim=model_args.dim,
+            hidden_dim=4 * model_args.dim,
+            multiple_of=model_args.multiple_of,
+            ffn_dim_multiplier=model_args.ffn_dim_multiplier,
+        )
+        self.layer_id = layer_id
+        self.num_layers = model_args.n_layers
+
+        self.attention_norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)
+        self.ffn_norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)
+
+        if model_args.depth_init:
+            self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5
+        else:
+            self.weight_init_std = 0.02 / (2 * self.num_layers) ** 0.5
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        freqs_cis: torch.Tensor,
+    ):
+        """Perform a forward pass through the TransformerBlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
+
+        Returns:
+            torch.Tensor: Output tensor after applying attention and feedforward layers.
+
+        """
+        h = x + self.attention(self.attention_norm(x), freqs_cis)
+        return h + self.feed_forward(self.ffn_norm(h))
+
+    def init_weights(self):
+        for norm in (self.attention_norm, self.ffn_norm):
+            norm.reset_parameters()
+        self.attention.init_weights(self.weight_init_std)
+        self.feed_forward.init_weights(self.weight_init_std)
+
+
+class Transformer(nn.Module):
+    """Transformer Module.
+
+    Args:
+        model_args (ModelArgs): Model configuration arguments.
+
+    Attributes:
+        model_args (ModelArgs): Model configuration arguments.
+        vocab_size (int): Vocabulary size.
+        n_layers (int): Number of layers in the model.
+        tok_embeddings (nn.Embedding): Token embeddings.
+        layers (torch.nn.ModuleDict): Mapping of layer IDs to Transformer blocks.
+        norm (RMSNorm): Layer normalization for the model output.
+        output (nn.Linear): Linear layer for final output.
+        freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
+
+    """
+
+    def __init__(self, model_args: ModelArgs):
+        super().__init__()
+        self.model_args = model_args
+        self.vocab_size = model_args.vocab_size
+        self.n_layers = model_args.n_layers
+
+        self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
+
+        # TODO persistent should be set to false, since this buffer can be recomputed.
+        # however, we set it to true for 2 reasons. (1) due to pytorch/pytorch#123411,
+        # compile or pipeline-tracer will not correctly handle non-persistent buffers,
+        # so we need to fix that. (2) if we initialize pipeline-parallel models from
+        # a seed checkpoint rather than calling init_weights, we need freqs_cis to be
+        # initialized by the checkpoint, or we need to add a separate initializer for
+        # just the non-persistent buffers that is called after loading checkpoints.
+        self.register_buffer("freqs_cis", self._precompute_freqs_cis(), persistent=True)
+
+        self.layers = torch.nn.ModuleDict()
+        for layer_id in range(model_args.n_layers):
+            self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args)
+
+        self.norm = RMSNorm(dim=model_args.dim, eps=model_args.norm_eps)
+
+        self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
+        self.init_weights()
+
+    def reset_parameters(self):
+        with torch.device(self.freqs_cis.device):
+            self.freqs_cis = self._precompute_freqs_cis()
+
+    def init_weights(self):
+        """[Note: On ``init_weights`` vs.
+
+        ``reset_parameters``]
+        Modules may define ``reset_parameters`` to initialize parameter values.
+ ``reset_parameters`` is meant to only initialize directly owned + parameters/buffers, not those of their child modules, and it can be + used to give the initial values for these tensors. + Separately, users may want custom initialization for their modules, + different from that in ``reset_parameters``. For this, we define + ``init_weights``. We only call it in the constructor of this + ``Transformer`` root module to avoid reinitializing tensors. + + """ + with torch.device(self.freqs_cis.device): + self.freqs_cis = self._precompute_freqs_cis() + nn.init.normal_(self.tok_embeddings.weight) + for layer in self.layers.values(): + layer.init_weights() + self.norm.reset_parameters() + final_out_std = self.model_args.dim**-0.5 + cutoff_factor = 3 + nn.init.trunc_normal_( + self.output.weight, + mean=0.0, + std=final_out_std, + a=-cutoff_factor * final_out_std, + b=cutoff_factor * final_out_std, + ) + + def _precompute_freqs_cis(self) -> torch.Tensor: + return precompute_freqs_cis( + self.model_args.dim // self.model_args.n_heads, + # Need to compute until at least the max token limit for generation + # (use 2x max sequence length to be safe) + self.model_args.max_seq_len * 2, + self.model_args.rope_theta, + ) + + def forward(self, tokens: torch.Tensor): + """Perform a forward pass through the Transformer model. + + Args: + tokens (torch.Tensor): Input token indices. + + Returns: + torch.Tensor: Output logits after applying the Transformer model. + + """ + # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages + h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens + + for layer in self.layers.values(): + h = layer(h, self.freqs_cis) + + h = self.norm(h) if self.norm else h + return self.output(h).float() if self.output else h + + @classmethod + def from_model_args(cls, model_args: ModelArgs) -> "Transformer": + """Initialize a Transformer model from a ModelArgs object. + + Args: + model_args (ModelArgs): Model configuration arguments. + + Returns: + Transformer: Transformer model. + + """ + return cls(model_args) diff --git a/examples/pytorch/tensor_parallel/parallelism.py b/examples/pytorch/tensor_parallel/parallelism.py new file mode 100644 index 0000000000000..f6f38aa499efa --- /dev/null +++ b/examples/pytorch/tensor_parallel/parallelism.py @@ -0,0 +1,100 @@ +import torch +from model import Transformer +from torch.distributed._composable.fsdp import MixedPrecisionPolicy +from torch.distributed._composable.fsdp.fully_shard import fully_shard +from torch.distributed._tensor import Replicate, Shard +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, + SequenceParallel, + parallelize_module, +) + + +# Taken and modified from torchtitan +# https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py +def parallelize(model: Transformer, device_mesh: DeviceMesh) -> Transformer: + """Apply parallelisms and activation checkpointing to the model. + + NOTE: The passed-in model preferably should be on meta device. Otherwise, + the model must fit on GPU or CPU memory. + + """ + + dp_mesh = device_mesh["data_parallel"] + tp_mesh = device_mesh["tensor_parallel"] + + if tp_mesh.size() > 1: + # 1. Parallelize the first embedding and the last linear proj layer + # 2. Parallelize the root norm layer over the sequence dim + # 3. 
Shard the first transformer block's inputs
+
+        # Parallelize the first embedding and the last linear out projection
+        plan = {
+            "tok_embeddings": RowwiseParallel(input_layouts=Replicate()),
+            "output": ColwiseParallel(input_layouts=Shard(1), output_layouts=Replicate()),
+            "norm": SequenceParallel(),
+            "layers.0": PrepareModuleInput(
+                input_layouts=(Replicate(), None),
+                desired_input_layouts=(Shard(1), None),
+                use_local_output=True,
+            ),
+        }
+        model = parallelize_module(model, tp_mesh, plan)
+
+        # Parallelize each transformer block
+        for transformer_block in model.layers.values():
+            plan = {
+                "attention": PrepareModuleInput(
+                    input_layouts=(Shard(1), None),
+                    desired_input_layouts=(Replicate(), None),
+                ),
+                "attention.wq": ColwiseParallel(),
+                "attention.wk": ColwiseParallel(),
+                "attention.wv": ColwiseParallel(),
+                "attention.wo": RowwiseParallel(output_layouts=Shard(1)),
+                "attention_norm": SequenceParallel(),
+                "feed_forward": PrepareModuleInput(
+                    input_layouts=(Shard(1),),
+                    desired_input_layouts=(Replicate(),),
+                ),
+                "feed_forward.w1": ColwiseParallel(),
+                "feed_forward.w2": RowwiseParallel(output_layouts=Shard(1)),
+                "feed_forward.w3": ColwiseParallel(),
+                "ffn_norm": SequenceParallel(),
+            }
+
+            # Adjust attention module to use the local number of heads
+            attn_layer = transformer_block.attention
+            attn_layer.n_heads = attn_layer.n_heads // tp_mesh.size()
+            attn_layer.n_kv_heads = attn_layer.n_kv_heads // tp_mesh.size()
+
+            # Apply the plan for the current transformer block
+            parallelize_module(transformer_block, tp_mesh, plan)
+
+    if dp_mesh.size() > 1:
+        assert dp_mesh.ndim == 1  # Hybrid-sharding not supported
+
+        # NOTE: Currently, the user is required to manually handle precision settings such as the `mp_policy` here
+        # because the model parallel strategy does not respect all settings of `Trainer(precision=...)` at the moment.
+        mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+
+        fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
+        for layer_id, transformer_block in model.layers.items():
+            # Apply activation checkpointing
+            transformer_block = checkpoint_wrapper(transformer_block)
+            # As an optimization, do not reshard after forward for the last
+            # transformer block since FSDP would prefetch it immediately
+            reshard_after_forward = int(layer_id) < len(model.layers) - 1
+            fully_shard(
+                transformer_block,
+                **fsdp_config,
+                reshard_after_forward=reshard_after_forward,
+            )
+            model.layers[layer_id] = transformer_block
+        model = fully_shard(model, **fsdp_config)
+
+    return model
diff --git a/examples/pytorch/tensor_parallel/train.py b/examples/pytorch/tensor_parallel/train.py
new file mode 100644
index 0000000000000..ad4220a3fc864
--- /dev/null
+++ b/examples/pytorch/tensor_parallel/train.py
@@ -0,0 +1,75 @@
+import lightning as L
+import torch
+import torch.nn.functional as F
+from data import RandomTokenDataset
+from lightning.pytorch.strategies import ModelParallelStrategy
+from model import ModelArgs, Transformer
+from parallelism import parallelize
+from torch.distributed.tensor.parallel import loss_parallel
+from torch.utils.data import DataLoader
+
+
+class Llama2(L.LightningModule):
+    def __init__(self):
+        super().__init__()
+        self.model_args = ModelArgs(vocab_size=32000)
+        self.model = Transformer(self.model_args)
+
+    def configure_model(self):
+        # User-defined function that applies the desired parallelizations specific to the model
+        # (TP, FSDP2, activation checkpointing, ...)
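+        # `self.device_mesh` is the 2D ("data_parallel", "tensor_parallel") mesh
+        # that `ModelParallelStrategy` creates before `configure_model` is called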
+ parallelize(self.model, device_mesh=self.device_mesh) + + def on_train_start(self) -> None: + self.model.init_weights() + + def training_step(self, batch): + inputs = batch[:, :-1] + labels = batch[:, 1:] + output = self.model(inputs) + with loss_parallel(): + return F.cross_entropy(output.reshape(-1, output.size(-1)), labels.reshape(-1)) + + def configure_optimizers(self): + return torch.optim.AdamW(self.model.parameters(), lr=3e-3, foreach=True) + + def train_dataloader(self): + dataset = RandomTokenDataset(vocab_size=self.model_args.vocab_size, seq_length=128) + # Trainer configures the sampler automatically for you such that + # all batches in a tensor-parallel group are identical + return DataLoader(dataset, batch_size=8, num_workers=4) + + +def train(): + strategy = ModelParallelStrategy( + # Define the size of the 2D parallelism + # Set to "auto" to apply TP intra-node and DP inter-node + data_parallel_size=2, + tensor_parallel_size=2, + ) + + trainer = L.Trainer( + accelerator="cuda", + devices=4, + strategy=strategy, + limit_train_batches=10, + max_epochs=1, + ) + + # Initialize the model + with trainer.init_module(empty_init=True): + model = Llama2() + + trainer.print(f"Number of model parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.1f} B") + trainer.print("Starting training ...") + + trainer.fit(model) + + trainer.print("Training successfully completed!") + trainer.print(f"Peak memory usage: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") + + +if __name__ == "__main__": + assert torch.cuda.device_count() >= 4, "This example requires at least 4 GPUs with 24 GB of memory each." + torch.set_float32_matmul_precision("high") + train() From d5bf4b9ed3866633c32cf53a5c9920adab78cdc7 Mon Sep 17 00:00:00 2001 From: Luca Antiga Date: Sat, 18 May 2024 22:03:16 -0400 Subject: [PATCH 031/179] [App] Extend retry to 4xx except 400, 401, 403, 404 (#19842) * Extend retry to 4xx except 400, 401, 403, 404 * Remove unused intersphinx mapping for app --------- Co-authored-by: awaelchli --- src/lightning/app/utilities/network.py | 10 +++++++--- tests/tests_app/utilities/test_network.py | 8 ++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/lightning/app/utilities/network.py b/src/lightning/app/utilities/network.py index 04afdb0b4f92c..a7cc00fde52b7 100644 --- a/src/lightning/app/utilities/network.py +++ b/src/lightning/app/utilities/network.py @@ -96,10 +96,14 @@ def create_retry_strategy(): # are going to be alive for a very long time (~ 4 days) but retries every 120 seconds total=_CONNECTION_RETRY_TOTAL, backoff_factor=_CONNECTION_RETRY_BACKOFF_FACTOR, + # Any 4xx and 5xx statuses except + # 400 Bad Request + # 401 Unauthorized + # 403 Forbidden + # 404 Not Found status_forcelist={ - 408, # Request Timeout - 429, # Too Many Requests - *range(500, 600), # Any 5xx Server Error status + 402, + *range(405, 600), }, allowed_methods={ "POST", # Default methods are idempotent, add POST here diff --git a/tests/tests_app/utilities/test_network.py b/tests/tests_app/utilities/test_network.py index 38c8961919db6..3a14c0301ef1e 100644 --- a/tests/tests_app/utilities/test_network.py +++ b/tests/tests_app/utilities/test_network.py @@ -49,7 +49,8 @@ def test_find_free_network_port_cloudspace(_, patch_constants): def test_http_client_retry_post(getconn_mock): getconn_mock.return_value.getresponse.side_effect = [ mock.Mock(status=500, msg=HTTPMessage()), - mock.Mock(status=429, msg=HTTPMessage()), + mock.Mock(status=599, msg=HTTPMessage()), + mock.Mock(status=405, 
msg=HTTPMessage()), mock.Mock(status=200, msg=HTTPMessage()), ] @@ -61,6 +62,7 @@ def test_http_client_retry_post(getconn_mock): mock.call("POST", "/test", body=None, headers=mock.ANY), mock.call("POST", "/test", body=None, headers=mock.ANY), mock.call("POST", "/test", body=None, headers=mock.ANY), + mock.call("POST", "/test", body=None, headers=mock.ANY), ] @@ -68,7 +70,8 @@ def test_http_client_retry_post(getconn_mock): def test_http_client_retry_get(getconn_mock): getconn_mock.return_value.getresponse.side_effect = [ mock.Mock(status=500, msg=HTTPMessage()), - mock.Mock(status=429, msg=HTTPMessage()), + mock.Mock(status=599, msg=HTTPMessage()), + mock.Mock(status=405, msg=HTTPMessage()), mock.Mock(status=200, msg=HTTPMessage()), ] @@ -80,4 +83,5 @@ def test_http_client_retry_get(getconn_mock): mock.call("GET", "/test", body=None, headers=mock.ANY), mock.call("GET", "/test", body=None, headers=mock.ANY), mock.call("GET", "/test", body=None, headers=mock.ANY), + mock.call("GET", "/test", body=None, headers=mock.ANY), ] From 82e6e61bea85c27d869b594f7791383c7a4efaf3 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 20 May 2024 06:29:37 +0200 Subject: [PATCH 032/179] Remove redundant code to set the device on the LightningModule (#19877) Co-authored-by: Luca Antiga --- src/lightning/pytorch/strategies/deepspeed.py | 4 ---- src/lightning/pytorch/strategies/fsdp.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/lightning/pytorch/strategies/deepspeed.py b/src/lightning/pytorch/strategies/deepspeed.py index 6be3d3f8ba590..382f8070898f8 100644 --- a/src/lightning/pytorch/strategies/deepspeed.py +++ b/src/lightning/pytorch/strategies/deepspeed.py @@ -337,10 +337,6 @@ def setup(self, trainer: "pl.Trainer") -> None: assert self.accelerator is not None self.accelerator.setup(trainer) - # we set the device so that optimizers can be created with distributed comms. - assert self.lightning_module is not None - self.lightning_module._device = self.root_device - assert self.model is not None self.model = self.precision_plugin.convert_module(self.model) self.model = self._setup_model(self.model) diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 1aae8b678b674..70590d2f254e2 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -305,10 +305,6 @@ def setup(self, trainer: "pl.Trainer") -> None: if trainer.state.fn == TrainerFn.FITTING and self._layer_sync: self.model = self._layer_sync.apply(self.model) - # we set the device so that optimizers can be created with distributed comms. 
- assert self.lightning_module is not None - self.lightning_module._device = self.root_device - self.model = self.precision_plugin.convert_module(self.model) if is_overridden("configure_sharded_model", self.lightning_module): From d76feef0d6c4f46c1e01be49c13c965f4ce942ef Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 20 May 2024 13:19:38 +0200 Subject: [PATCH 033/179] Enable loss-parallel in example (#19882) --- examples/fabric/tensor_parallel/parallelism.py | 8 +++++++- examples/fabric/tensor_parallel/train.py | 2 +- examples/pytorch/tensor_parallel/parallelism.py | 8 +++++++- examples/pytorch/tensor_parallel/train.py | 5 +++++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/examples/fabric/tensor_parallel/parallelism.py b/examples/fabric/tensor_parallel/parallelism.py index f6f38aa499efa..44d55c8da1cc9 100644 --- a/examples/fabric/tensor_parallel/parallelism.py +++ b/examples/fabric/tensor_parallel/parallelism.py @@ -35,7 +35,13 @@ def parallelize(model: Transformer, device_mesh: DeviceMesh) -> Transformer: # Parallelize the first embedding and the last linear out projection plan = { "tok_embeddings": RowwiseParallel(input_layouts=Replicate()), - "output": ColwiseParallel(input_layouts=Shard(1), output_layouts=Replicate()), + "output": ColwiseParallel( + input_layouts=Shard(1), + # Optional: Shard the output along the class dimension to compute the loss in parallel. + # See `loss_parallel` in `train.py` + output_layouts=Shard(-1), + use_local_output=False, + ), "norm": SequenceParallel(), "layers.0": PrepareModuleInput( input_layouts=(Replicate(), None), diff --git a/examples/fabric/tensor_parallel/train.py b/examples/fabric/tensor_parallel/train.py index 2c3ab3819830c..ce48fe341fb85 100644 --- a/examples/fabric/tensor_parallel/train.py +++ b/examples/fabric/tensor_parallel/train.py @@ -57,8 +57,8 @@ def train(): with loss_parallel(): loss = F.cross_entropy(output.reshape(-1, output.size(-1)), labels.reshape(-1)) + fabric.backward(loss) - fabric.backward(loss) optimizer.step() optimizer.zero_grad() fabric.print(f"Iteration {i} complete") diff --git a/examples/pytorch/tensor_parallel/parallelism.py b/examples/pytorch/tensor_parallel/parallelism.py index f6f38aa499efa..44d55c8da1cc9 100644 --- a/examples/pytorch/tensor_parallel/parallelism.py +++ b/examples/pytorch/tensor_parallel/parallelism.py @@ -35,7 +35,13 @@ def parallelize(model: Transformer, device_mesh: DeviceMesh) -> Transformer: # Parallelize the first embedding and the last linear out projection plan = { "tok_embeddings": RowwiseParallel(input_layouts=Replicate()), - "output": ColwiseParallel(input_layouts=Shard(1), output_layouts=Replicate()), + "output": ColwiseParallel( + input_layouts=Shard(1), + # Optional: Shard the output along the class dimension to compute the loss in parallel. 
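+            # Keeping the output as a DTensor (`use_local_output=False` below) is what
+            # lets `loss_parallel` compute the loss on the sharded logits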
+ # See `loss_parallel` in `train.py` + output_layouts=Shard(-1), + use_local_output=False, + ), "norm": SequenceParallel(), "layers.0": PrepareModuleInput( input_layouts=(Replicate(), None), diff --git a/examples/pytorch/tensor_parallel/train.py b/examples/pytorch/tensor_parallel/train.py index ad4220a3fc864..6efbadf175988 100644 --- a/examples/pytorch/tensor_parallel/train.py +++ b/examples/pytorch/tensor_parallel/train.py @@ -27,9 +27,14 @@ def training_step(self, batch): inputs = batch[:, :-1] labels = batch[:, 1:] output = self.model(inputs) + # Optional: Parallelize loss computation across class dimension (see parallelism.py) with loss_parallel(): return F.cross_entropy(output.reshape(-1, output.size(-1)), labels.reshape(-1)) + def backward(self, *args, **kwargs): + with loss_parallel(): + super().backward(*args, **kwargs) + def configure_optimizers(self): return torch.optim.AdamW(self.model.parameters(), lr=3e-3, foreach=True) From b1bb3f31735e7df22097a8c15dd5323d75fda45d Mon Sep 17 00:00:00 2001 From: Gilles Peiffer Date: Tue, 21 May 2024 19:31:54 +0200 Subject: [PATCH 034/179] Update `LearningRateMonitor` docs and tests for `log_weight_decay` (#19805) --- src/lightning/pytorch/callbacks/lr_monitor.py | 4 +++- tests/tests_pytorch/callbacks/test_lr_monitor.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/lightning/pytorch/callbacks/lr_monitor.py b/src/lightning/pytorch/callbacks/lr_monitor.py index 357cfceefa03e..6a94c7ece70a3 100644 --- a/src/lightning/pytorch/callbacks/lr_monitor.py +++ b/src/lightning/pytorch/callbacks/lr_monitor.py @@ -44,6 +44,8 @@ class LearningRateMonitor(Callback): according to the ``interval`` key of each scheduler. Defaults to ``None``. log_momentum: option to also log the momentum values of the optimizer, if the optimizer has the ``momentum`` or ``betas`` attribute. Defaults to ``False``. + log_weight_decay: option to also log the weight decay values of the optimizer. Defaults to + ``False``. Raises: MisconfigurationException: @@ -58,7 +60,7 @@ class LearningRateMonitor(Callback): Logging names are automatically determined based on optimizer class name. In case of multiple optimizers of same type, they will be named ``Adam``, - ``Adam-1`` etc. If a optimizer has multiple parameter groups they will + ``Adam-1`` etc. If an optimizer has multiple parameter groups they will be named ``Adam/pg1``, ``Adam/pg2`` etc. To control naming, pass in a ``name`` keyword in the construction of the learning rate schedulers. 
A ``name`` keyword can also be used for parameter groups in the diff --git a/tests/tests_pytorch/callbacks/test_lr_monitor.py b/tests/tests_pytorch/callbacks/test_lr_monitor.py index ebe21e272aac8..4aedb4f23fa14 100644 --- a/tests/tests_pytorch/callbacks/test_lr_monitor.py +++ b/tests/tests_pytorch/callbacks/test_lr_monitor.py @@ -44,6 +44,9 @@ def test_lr_monitor_single_lr(tmp_path): assert lr_monitor.lrs, "No learning rates logged" assert all(v is None for v in lr_monitor.last_momentum_values.values()), "Momentum should not be logged by default" + assert all( + v is None for v in lr_monitor.last_weight_decay_values.values() + ), "Weight decay should not be logged by default" assert len(lr_monitor.lrs) == len(trainer.lr_scheduler_configs) assert list(lr_monitor.lrs) == ["lr-SGD"] From 7e87ce05c8cd5c4ebc0ab3ddea0295a1c15bcc09 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 21 May 2024 19:46:01 +0200 Subject: [PATCH 035/179] Fix state dict loading in bitsandbytes plugin when checkpoint is already quantized (#19886) * bugfix * add test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update * add chlog --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/lightning/fabric/CHANGELOG.md | 2 +- .../fabric/plugins/precision/bitsandbytes.py | 12 +++---- src/lightning/pytorch/CHANGELOG.md | 2 +- .../plugins/precision/test_bitsandbytes.py | 34 +++++++++++++++++++ 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 6dc4101c8f3d9..b4076e6e23b67 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -50,7 +50,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- +- Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886)) - diff --git a/src/lightning/fabric/plugins/precision/bitsandbytes.py b/src/lightning/fabric/plugins/precision/bitsandbytes.py index 12a0ac3998b6e..0f524dd67fad9 100644 --- a/src/lightning/fabric/plugins/precision/bitsandbytes.py +++ b/src/lightning/fabric/plugins/precision/bitsandbytes.py @@ -233,9 +233,9 @@ def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torc """Inplace quantize.""" if weight is None: weight = self.weight.data - if weight.data.type == torch.int8: - # already quantized - return + if weight.data.dtype == torch.int8: + # already quantized + return assert isinstance(self.weight, bnb.nn.Int8Params) self.weight = self.quantize(self.weight, weight, device) @@ -317,9 +317,9 @@ def quantize_(self, weight: Optional[torch.Tensor] = None, device: Optional[torc """Inplace quantize.""" if weight is None: weight = self.weight.data - if weight.data.type == torch.uint8: - # already quantized - return + if weight.data.dtype == torch.uint8: + # already quantized + return assert isinstance(self.weight, bnb.nn.Params4bit) self.weight = self.quantize(self.weight, weight, device) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 11c9238a6d409..6792f979871a6 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -45,7 +45,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- +- Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886)) - Fixed `WandbLogger.log_hyperparameters()` raising an error if hyperparameters are not JSON serializable ([#19769](https://github.com/Lightning-AI/pytorch-lightning/pull/19769)) diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index ec02796b4b51c..a88e7c2be7b3a 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -230,3 +230,37 @@ def __init__(self): assert not keys.missing_keys assert model.l.weight.device.type == "cuda" assert model.l.weight.dtype == expected + + +@RunIf(min_cuda_gpus=1, min_torch="2.1") +@pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable") +def test_load_quantized_checkpoint(tmp_path): + """Test that a checkpoint saved from a quantized model can be loaded back into a quantized model.""" + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(16, 16, bias=False) + + def forward(self, x): + return self.linear(x) + + fabric = Fabric(accelerator="cuda", devices=1, plugins=BitsandbytesPrecision("nf4-dq")) + model = Model() + model = fabric.setup(model) + model(torch.randn(2, 16, device=fabric.device)) + state_dict = model.state_dict() + # The checkpoint contains quantized weights + assert state_dict["linear.weight"].dtype == torch.uint8 + assert state_dict["linear.weight"].shape == (128, 1) + torch.save(state_dict, tmp_path / "checkpoint.pt") + + fabric = Fabric(accelerator="cuda", devices=1, plugins=BitsandbytesPrecision("nf4-dq")) + model = Model() + model = fabric.setup(model) + state_dict = torch.load(tmp_path / "checkpoint.pt") + model.load_state_dict(state_dict) + assert model.linear.weight.dtype == torch.uint8 + assert model.linear.weight.shape == (128, 1) + # Shapes match during forward (weight is being dequantized during forward) + model(torch.randn(2, 16, device=fabric.device)) From 987c2c4093ea4dbebc0fd41e503fd1743f054933 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 22 May 2024 12:20:40 +0200 Subject: [PATCH 036/179] (7/n) Support 2D Parallelism - TP Fabric Docs (#19884) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sebastian Raschka Co-authored-by: Carlos Mocholí --- docs/source-fabric/_static/main.css | 10 + .../advanced/model_parallel/fsdp.rst | 13 +- .../advanced/model_parallel/index.rst | 163 ++++++++++ .../advanced/model_parallel/tp.rst | 279 ++++++++++++++++++ .../advanced/model_parallel/tp_fsdp.rst | 5 + docs/source-fabric/api/strategies.rst | 1 + docs/source-fabric/fundamentals/launch.rst | 2 +- docs/source-fabric/glossary/index.rst | 13 + docs/source-fabric/guide/index.rst | 4 +- docs/source-fabric/levels/advanced.rst | 4 +- .../advanced/model_parallel/fsdp.rst | 5 +- examples/fabric/tensor_parallel/train.py | 2 +- examples/pytorch/tensor_parallel/train.py | 2 +- src/lightning/fabric/strategies/fsdp.py | 2 - src/lightning/pytorch/strategies/fsdp.py | 2 - 15 files changed, 484 insertions(+), 23 deletions(-) create mode 100644 docs/source-fabric/advanced/model_parallel/index.rst create mode 100644 docs/source-fabric/advanced/model_parallel/tp.rst create mode 100644 docs/source-fabric/advanced/model_parallel/tp_fsdp.rst diff --git 
a/docs/source-fabric/_static/main.css b/docs/source-fabric/_static/main.css index c1bd8ad0305b7..e3f427599d34e 100644 --- a/docs/source-fabric/_static/main.css +++ b/docs/source-fabric/_static/main.css @@ -1,3 +1,13 @@ col { width: 50% !important; } + +ul.no-bullets { + list-style-type: none; /* Remove default bullets */ + padding-left: 0; /* Remove default padding */ +} + +ul.no-bullets li { + padding-left: 0.5em; + text-indent: -2em; +} diff --git a/docs/source-fabric/advanced/model_parallel/fsdp.rst b/docs/source-fabric/advanced/model_parallel/fsdp.rst index 3d1efa5563d0f..51a2d7cc2d953 100644 --- a/docs/source-fabric/advanced/model_parallel/fsdp.rst +++ b/docs/source-fabric/advanced/model_parallel/fsdp.rst @@ -1,14 +1,11 @@ -########################################### -Training models with billions of parameters -########################################### +##################################################### +Training models with billions of parameters with FSDP +##################################################### Use Fully Sharded Data Parallel (FSDP) to train large models with billions of parameters efficiently on multiple GPUs and across multiple machines. -.. note:: This is an experimental feature. - - Today, large models with billions of parameters are trained with many GPUs across several machines in parallel. -Even a single H100 GPU with 80 GB of VRAM (the biggest today) is not enough to train just a 30B parameter model (even with batch size 1 and 16-bit precision). +Even a single H100 GPU with 80 GB of VRAM (one of the biggest today) is not enough to train just a 30B parameter model (even with batch size 1 and 16-bit precision). The memory consumption for training is generally made up of 1. the model parameters, @@ -19,7 +16,7 @@ The memory consumption for training is generally made up of | When the sum of these memory components exceed the VRAM of a single GPU, regular data-parallel training (DDP) can no longer be employed. -One of the methods that can alleviate this limitation is called **model-parallel** training, and known as **FSDP** in PyTorch, and in this guide, you will learn how to effectively scale large models with it. +One of the methods that can alleviate this limitation is called **Fully Sharded Data Parallel (FSDP)**, and in this guide, you will learn how to effectively scale large models with it. ---- diff --git a/docs/source-fabric/advanced/model_parallel/index.rst b/docs/source-fabric/advanced/model_parallel/index.rst new file mode 100644 index 0000000000000..59b8acc72dd00 --- /dev/null +++ b/docs/source-fabric/advanced/model_parallel/index.rst @@ -0,0 +1,163 @@ +########################################### +Training models with billions of parameters +########################################### + +Today, large models with billions of parameters are trained with many GPUs across several machines in parallel. +Even a single H100 GPU with 80 GB of VRAM (one of the biggest today) is not enough to train just a 30B parameter model (even with batch size 1 and 16-bit precision). +The memory consumption for training is generally made up of + +1. the model parameters, +2. the layer activations (forward), +3. the gradients (backward), +4. the optimizer states (e.g., Adam has two additional exponential averages per parameter) and +5. model outputs and loss. + +| + +When the sum of these memory components exceed the VRAM of a single GPU, regular data-parallel training (DDP) can no longer be employed. 
+To alleviate this limitation, we need to introduce **Model Parallelism**. + + +---- + + +************************** +What is Model Parallelism? +************************** + +There are different types of model parallelism, each with its own trade-offs. + +**Fully Sharded Data Parallelism (FSDP)** shards both model parameters and optimizer states across multiple GPUs, significantly reducing memory usage per GPU. +This method, while highly memory-efficient, involves frequent synchronization between GPUs, introducing communication overhead and complexity in implementation. +FSDP is advantageous when memory constraints are the primary issue, provided there are high-bandwidth interconnects to minimize latency. + +**Tensor Parallelism (TP)** splits individual tensors across GPUs, enabling fine-grained distribution of computation and memory. +It scales well to a large number of GPUs but requires synchronization of tensor slices after each operation, which adds communication overhead. +TP is most effective with models that have many linear layers (LLMs), offering a balance between memory distribution and computational efficiency. + +**Pipeline Parallelism (PP)** divides model layers into segments, each processed by different GPUs, reducing memory load per GPU and minimizing inter-GPU communication to pipeline stage boundaries. +While this reduces communication overhead, it can introduce pipeline bubbles where some GPUs idle, leading to potential inefficiencies. +PP is ideal for deep models with sequential architectures (LLMs), though it requires careful management to minimize idle times. + +Choosing a model parallelism style involves considering model architecture, hardware interconnects, and training efficiency. +In practice, hybrid approaches combining FSDP, TP, and PP are often used to leverage the strengths of each method while mitigating their weaknesses. + + +---- + + +*********** +Get started +*********** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Fully-Sharded Data Parallel (FSDP) + :description: Get started training large multi-billion parameter models with minimal code changes + :col_css: col-md-4 + :button_link: fsdp.html + :height: 180 + :tag: advanced + +.. displayitem:: + :header: Tensor Parallel (TP) + :description: Learn the principles behind tensor parallelism and how to apply it to your model + :col_css: col-md-4 + :button_link: tp.html + :height: 180 + :tag: advanced + +.. displayitem:: + :header: 2D Parallel (FSDP + TP) + :description: Combine Tensor Parallelism with FSDP (2D Parallel) to train efficiently on 100s of GPUs + :button_link: tp_fsdp.html + :col_css: col-md-4 + :height: 180 + :tag: advanced + +.. displayitem:: + :header: Pipeline Parallelism + :description: Coming soon + :button_link: + :col_css: col-md-4 + :height: 180 + :tag: advanced + +.. raw:: html + +
+
 +
                                                                                                                                                                                                                                                                                                                + + +---- + + +********************* +Parallelisms compared +********************* + + +**Distributed Data Parallel (DDP)** + +.. raw:: html + +
                                                                                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                                                                                • • ✅   No model code changes required
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Training with very large batch sizes (batch size scales with number of GPUs)
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ❗   Model (weights, optimizer state, activations / gradients) must fit into a GPU
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                + +| + +**Fully-Sharded Data Parallel (FSDP)** + +.. raw:: html + +
                                                                                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                                                                                • • ✅   No model code changes required
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Training with very large batch sizes (batch size scales with number of GPUs)
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Model (weights, optimizer state, gradients) gets distributed across all GPUs
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ❗   A single FSDP layer, when gathered during forward/backward, must fit into the GPU
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ❗   Requires some knowledge about model architecture to set configuration options correctly
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ❗   Requires very fast networking (multi-node); data transfers between GPUs often become a bottleneck
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                + +| + +**Tensor Parallel (TP)** + +.. raw:: html + +
                                                                                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                                                                                • • ❗   Model code changes required
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • 🤔   Fixed global batch size (does not scale with number of GPUs)
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Model (weights, optimizer state, activations) gets distributed across all GPUs
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Parallelizes the computation of layers that are too large to fit onto a single GPU
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ❗   Requires lots of knowledge about model architecture to set configuration options correctly
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • 🤔   Fewer GPU data transfers required, but data transfers don't overlap with computation like in FSDP
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                + +| + +**2D Parallel (FSDP + TP)** + +.. raw:: html + +
                                                                                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                                                                                • • ❗   Model code changes required
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Training with very large batch sizes (batch size scales across the data-parallel dimension)
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Model (weights, optimizer state, activations) gets distributed across all GPUs
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Parallelizes the computation of layers that are too large to fit onto a single GPU
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ❗   Requires lots of knowledge about model architecture to set configuration options correctly
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                • • ✅   Tensor-parallel within machines and FSDP across machines reduces data transfer bottlenecks
                                                                                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                                                                                
+ +| + +Lightning Fabric supports all the parallelisms mentioned above natively through PyTorch, with the exception of pipeline parallelism (PP) which is not yet supported. + +| diff --git a/docs/source-fabric/advanced/model_parallel/tp.rst b/docs/source-fabric/advanced/model_parallel/tp.rst new file mode 100644 index 0000000000000..4d5fb26181dc6 --- /dev/null +++ b/docs/source-fabric/advanced/model_parallel/tp.rst @@ -0,0 +1,279 @@ +################## +Tensor Parallelism +################## + +Tensor parallelism is a technique for training large models by distributing layers across multiple devices, improving memory management and efficiency by reducing inter-device communication. +However, for smaller models, the communication overhead may outweigh its benefits. +This method is most effective for models with very large layers, significantly enhancing performance and memory efficiency. + +.. note:: This is an experimental feature. + + +---- + + +******************************************* +How to exploit parallelism in linear layers +******************************************* + +In tensor parallelism, the computation of a linear layer can be split up across GPUs. +This saves memory because each GPU only needs to hold a portion of the weight matrix. +There are two ways a linear layer can be split up: row-wise or column-wise. + +Column-wise Parallel +==================== + +In a column-wise parallel layer, the weight matrix is split evenly along the column dimension. +Each GPU is sent the same input, and computes a regular matrix multiplication with its portion of the weight matrix. +At the end, the outputs from each GPU can be concatenated to form the final output. + + +.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/fabric/tp-colwise.jpeg + :alt: Left: Regular matrix multiplication. Right: Column-wise parallel matrix multiplication split across two GPUs. + :width: 100% + +Row-wise Parallel +================= + +Row-wise parallelism divides the rows of the weight matrix evenly across devices. +In addition, the input gets split the same way along the inner dimension (because the weight matrix now has fewer rows). +Each GPU then performs a regular matrix multiplication with its portion of the weight matrix and inputs. +At the end, the outputs from each GPU can be summed up element-wise (all-reduce) to form the final output. + +.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/fabric/tp-rowwise.jpeg + :alt: Left: Regular matrix multiplication. Right: Row-wise parallel matrix multiplication split across two GPUs. + :width: 100% + + +Combined Column- and Row-wise Parallel +====================================== + +When there are multiple linear layers in sequence, e.g., in a MLP or a Transformer, the column-wise and row-wise parallel styles can be combined for maximum effect. +Instead of concatenating the output of the column-wise parallel layer, we keep the outputs separate and feed them directly to the row-wise parallel layer. +This way, we avoid costly data transfers between GPUs. + +.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/fabric/tp-combined.jpeg + :alt: Top: Two regular matrix multiplications in sequence. Bottom: Combined column-wise and row-wise parallel matrix multiplications across two GPUs. + :width: 100% + +Note that activation functions between the layers can still be applied without additional communication because they are element-wise, but are not shown in the figures for simplicity. 
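 + +The equivalence is easy to verify numerically. Below is a minimal single-process sketch (an illustration added for this guide, not part of the Lightning example files) in which plain tensor slicing stands in for the per-GPU shards: + +.. code-block:: python + + import torch + + # Full linear layer: y = x @ W.T with W of shape (out_features, in_features) + x = torch.randn(4, 8) + W = torch.randn(6, 8) + y_full = x @ W.T + + # Column-wise parallel: each "GPU" holds half of the output features + W1, W2 = W.chunk(2, dim=0) + y_col = torch.cat([x @ W1.T, x @ W2.T], dim=-1) # concatenate the partial outputs + torch.testing.assert_close(y_full, y_col) + + # Row-wise parallel: split the inner dimension of W, and split the input the same way + Wa, Wb = W.chunk(2, dim=1) + xa, xb = x.chunk(2, dim=-1) + y_row = xa @ Wa.T + xb @ Wb.T # the element-wise sum plays the role of the all-reduce + torch.testing.assert_close(y_full, y_row) 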
+ + +---- + + +*********************************** +Apply tensor parallelism to a model +*********************************** + +To apply tensor parallelism to a model with Fabric, you need a good understanding of your model's architecture to make the decision of where to apply the parallel styles you've seen above. +Let's start with a simple MLP toy example: + +.. code-block:: python + + import torch.nn as nn + import torch.nn.functional as F + + + class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +This model has three linear layers. Layers ``w1`` and ``w3`` produce an output that is later multiplied element-wise. +That output is then fed into layer ``w2``. +Therefore, ``w1`` and ``w3`` are suitable candidates for column-wise parallelism, because their output(s) can easily be combined with ``w2`` in row-wise fashion. + +In Fabric, define a function that applies the tensor parallelism to the model: + +.. code-block:: python + + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel + from torch.distributed.tensor.parallel import parallelize_module + + + def parallelize_feedforward(model, device_mesh): + # Lightning will set up a device mesh for you + tp_mesh = device_mesh["tensor_parallel"] + # Use PyTorch's distributed tensor APIs to parallelize the model + plan = { + "w1": ColwiseParallel(), + "w2": RowwiseParallel(), + "w3": ColwiseParallel(), + } + parallelize_module(model, tp_mesh, plan) + return model + +Next, configure the :class:`~lightning.fabric.strategies.model_parallel.ModelParallelStrategy` in Fabric: + +.. code-block:: python + + import lightning as L + from lightning.fabric.strategies import ModelParallelStrategy + + # 1. Pass the parallelization function to the strategy + strategy = ModelParallelStrategy(parallelize_fn=parallelize_feedforward) + + # 2. Configure devices and set the strategy in Fabric + fabric = L.Fabric(accelerator="cuda", devices=2, strategy=strategy) + fabric.launch() + +The strategy takes the custom parallelization function as input. +No other changes to your training code are necessary at this point. +Later in the code, when you call ``fabric.setup(model)``, Fabric will apply the ``parallelize_feedforward`` function to the model automatically. + +.. collapse:: Full training example (requires at least 2 GPUs). + + .. 
code-block:: python + + import torch + import torch.nn as nn + import torch.nn.functional as F + + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel + from torch.distributed.tensor.parallel import parallelize_module + + import lightning as L + from lightning.pytorch.demos.boring_classes import RandomDataset + from lightning.fabric.strategies import ModelParallelStrategy + + + class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + + def parallelize_feedforward(model, device_mesh): + # Lightning will set up a device mesh for you + tp_mesh = device_mesh["tensor_parallel"] + # Use PyTorch's distributed tensor APIs to parallelize the model + plan = { + "w1": ColwiseParallel(), + "w2": RowwiseParallel(), + "w3": ColwiseParallel(), + } + parallelize_module(model, tp_mesh, plan) + return model + + + strategy = ModelParallelStrategy(parallelize_fn=parallelize_feedforward) + fabric = L.Fabric(accelerator="cuda", devices=2, strategy=strategy) + fabric.launch() + + # Initialize the model + model = FeedForward(8192, 8192) + model = fabric.setup(model) + + # Define the optimizer + optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3) + optimizer = fabric.setup_optimizers(optimizer) + + # Define dataset/dataloader + dataset = RandomDataset(8192, 64) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=8) + dataloader = fabric.setup_dataloaders(dataloader) + + # Simplified training loop + for i, batch in enumerate(dataloader): + output = model(batch) + loss = output.sum() + fabric.backward(loss) + optimizer.step() + optimizer.zero_grad() + fabric.print(f"Iteration {i} complete") + + fabric.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + +| + +When measuring the peak memory consumption, we should see that doubling the number of GPUs reduces the memory consuption roughly by half: + + +.. list-table:: + :widths: 20 20 20 20 20 + :header-rows: 1 + + * - + - 1 GPU (no TP) + - 2 GPUs + - 4 GPUs + - 8 GPUs + * - Memory per GPU + - 4.04 GB + - 2.03 GB + - 1.02 GB + - 0.60 GB + +Beyond this toy example, we recommend you study our `LLM Tensor Parallel Example (Llama 3) `_. + + +---- + + +*************************** +Data-loading considerations +*************************** + +In a tensor-parallelized model, it is important that the model receives an identical input on each GPU. +Otherwise, training won't converge. +Therefore, when you shuffle data in your dataset or data loader, or when applying randomized transformations/augmentations in your data, ensure that the seed is set appropriately. + +Given this requirement, your global batch size will be limited by the memory of a single GPU. +To scale the batch size and accelerate training further, you can combine :doc:`tensor parallelism with data parallelism (in particular, FSDP) `. + + +---- + + +********** +Next steps +********** + +.. raw:: html + +
+
 + +.. displayitem:: + :header: LLM Tensor Parallel Example + :description: Full example of how to apply tensor parallelism to a large language model (Llama 3) + :col_css: col-md-4 + :button_link: https://github.com/Lightning-AI/pytorch-lightning/tree/master/examples/fabric/tensor_parallel + :height: 160 + :tag: advanced + +.. displayitem:: + :header: 2D Parallel (FSDP + TP) + :description: Combine Tensor Parallelism with FSDP (2D Parallel) to train efficiently on 100s of GPUs + :button_link: tp_fsdp.html + :col_css: col-md-4 + :height: 160 + :tag: advanced + +.. displayitem:: + :header: PyTorch API Reference + :description: Explore the official PyTorch Tensor Parallel APIs + :button_link: https://pytorch.org/docs/stable/distributed.tensor.parallel.html + :col_css: col-md-4 + :height: 160 + :tag: advanced + + +.. raw:: html + +
                                                                                                                                                                                                                                                                                                                
+
+ +| diff --git a/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst b/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst new file mode 100644 index 0000000000000..606d9619f63d0 --- /dev/null +++ b/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst @@ -0,0 +1,5 @@ +########################################## +2D Parallelism (Tensor Parallelism + FSDP) +########################################## + +Content will be available soon. diff --git a/docs/source-fabric/api/strategies.rst b/docs/source-fabric/api/strategies.rst index b018118e3c02d..63b573d0a4438 100644 --- a/docs/source-fabric/api/strategies.rst +++ b/docs/source-fabric/api/strategies.rst @@ -25,3 +25,4 @@ Strategies ParallelStrategy SingleDeviceStrategy SingleDeviceXLAStrategy + ModelParallelStrategy diff --git a/docs/source-fabric/fundamentals/launch.rst b/docs/source-fabric/fundamentals/launch.rst index efde3f54fe846..f8c0deecf4e25 100644 --- a/docs/source-fabric/fundamentals/launch.rst +++ b/docs/source-fabric/fundamentals/launch.rst @@ -223,7 +223,7 @@ Next steps .. displayitem:: :header: Mixed Precision Training - :description: Save memory and speed up training using mixed precision + :description: Save memory and speed up training using mixed precision :col_css: col-md-4 :button_link: ../fundamentals/precision.html :height: 160 diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst index e229b4fe5c998..aec3396b36e7e 100644 --- a/docs/source-fabric/glossary/index.rst +++ b/docs/source-fabric/glossary/index.rst @@ -9,6 +9,9 @@ Glossary Checkpoint <../guide/checkpoint/index> Weights and Biases <../guide/loggers/wandb> Wrappers <../api/wrappers> + Model Parallel <../advanced/model_parallel/index> + Tensor Parallel <../advanced/model_parallel/tp> + 2D Parallelism <../advanced/model_parallel/tp_fsdp> .. raw:: html @@ -136,6 +139,11 @@ Glossary :button_link: ../fundamentals/precision.html :col_css: col-md-4 +.. displayitem:: + :header: Model Parallelism + :button_link: ../advanced/model_parallel/index.html + :col_css: col-md-4 + .. displayitem:: :header: MPI :button_link: ../guide/multi_node/other.html @@ -191,6 +199,11 @@ Glossary :button_link: ../guide/logging.html :col_css: col-md-4 +.. displayitem:: + :header: Tensor Parallelism + :button_link: ../advanced/model_parallel/tp.html + :col_css: col-md-4 + .. displayitem:: :header: TorchElastic :button_link: ../guide/multi_node/barebones.html diff --git a/docs/source-fabric/guide/index.rst b/docs/source-fabric/guide/index.rst index 795d756d33549..669501ec09864 100644 --- a/docs/source-fabric/guide/index.rst +++ b/docs/source-fabric/guide/index.rst @@ -167,8 +167,8 @@ Advanced Topics .. displayitem:: :header: Train models with billions of parameters - :description: Train the largest models with FSDP across multiple GPUs and machines - :button_link: ../advanced/model_parallel/fsdp.html + :description: Train the largest models with FSDP/TP across multiple GPUs and machines + :button_link: ../advanced/model_parallel/index.html :col_css: col-md-4 :height: 160 :tag: advanced diff --git a/docs/source-fabric/levels/advanced.rst b/docs/source-fabric/levels/advanced.rst index 965e848c7c993..0e4590cc76f01 100644 --- a/docs/source-fabric/levels/advanced.rst +++ b/docs/source-fabric/levels/advanced.rst @@ -53,8 +53,8 @@ Advanced skills .. 
displayitem:: :header: Train models with billions of parameters - :description: Train the largest models with FSDP across multiple GPUs and machines - :button_link: ../advanced/model_parallel/fsdp.html + :description: Train the largest models with FSDP/TP across multiple GPUs and machines + :button_link: ../advanced/model_parallel/index.html :col_css: col-md-4 :height: 170 :tag: advanced diff --git a/docs/source-pytorch/advanced/model_parallel/fsdp.rst b/docs/source-pytorch/advanced/model_parallel/fsdp.rst index af6670f9eba01..4e4c288e9ea0b 100644 --- a/docs/source-pytorch/advanced/model_parallel/fsdp.rst +++ b/docs/source-pytorch/advanced/model_parallel/fsdp.rst @@ -8,11 +8,8 @@ Train models with billions of parameters using FSDP Use Fully Sharded Data Parallel (FSDP) to train large models with billions of parameters efficiently on multiple GPUs and across multiple machines. -.. note:: This is an experimental feature. - - Today, large models with billions of parameters are trained with many GPUs across several machines in parallel. -Even a single H100 GPU with 80 GB of VRAM (the biggest today) is not enough to train just a 30B parameter model (even with batch size 1 and 16-bit precision). +Even a single H100 GPU with 80 GB of VRAM (one of the biggest today) is not enough to train just a 30B parameter model (even with batch size 1 and 16-bit precision). The memory consumption for training is generally made up of 1. the model parameters, diff --git a/examples/fabric/tensor_parallel/train.py b/examples/fabric/tensor_parallel/train.py index ce48fe341fb85..4a98f12cf6168 100644 --- a/examples/fabric/tensor_parallel/train.py +++ b/examples/fabric/tensor_parallel/train.py @@ -69,7 +69,7 @@ def train(): fabric.save("checkpoint.pt", state) fabric.print("Training successfully completed!") - fabric.print(f"Peak memory usage: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") + fabric.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") if __name__ == "__main__": diff --git a/examples/pytorch/tensor_parallel/train.py b/examples/pytorch/tensor_parallel/train.py index 6efbadf175988..37556cd087a7b 100644 --- a/examples/pytorch/tensor_parallel/train.py +++ b/examples/pytorch/tensor_parallel/train.py @@ -71,7 +71,7 @@ def train(): trainer.fit(model) trainer.print("Training successfully completed!") - trainer.print(f"Peak memory usage: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") + trainer.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") if __name__ == "__main__": diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index fdd842209a0e3..eb125d191df94 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -86,8 +86,6 @@ class FSDPStrategy(ParallelStrategy, _Sharded): r"""Strategy for Fully Sharded Data Parallel provided by torch.distributed. - .. warning:: This is an :ref:`experimental ` feature. - Fully Sharded Training shards the entire model across all available GPUs, allowing you to scale model size, whilst using efficient communication to reduce overhead. In practice, this means we can remain at parity with PyTorch DDP, whilst scaling our model sizes dramatically. 
The technique is similar diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 70590d2f254e2..3c352e8174ddc 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -83,8 +83,6 @@ class FSDPStrategy(ParallelStrategy): r"""Strategy for Fully Sharded Data Parallel provided by torch.distributed. - .. warning:: This is an :ref:`experimental ` feature. - Fully Sharded Training shards the entire model across all available GPUs, allowing you to scale model size, whilst using efficient communication to reduce overhead. In practice, this means we can remain at parity with PyTorch DDP, whilst scaling our model sizes dramatically. The technique is similar From 8fc7b4ae945c6d0204976ff3c45d5b546b7a475a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 22 May 2024 18:31:40 +0200 Subject: [PATCH 037/179] Remove the requirement for FSDPStrategy subclasses to only support GPU (#19894) --- src/lightning/fabric/connector.py | 2 +- tests/tests_fabric/test_connector.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/lightning/fabric/connector.py b/src/lightning/fabric/connector.py index edbfd77721a95..3a7334b3505ab 100644 --- a/src/lightning/fabric/connector.py +++ b/src/lightning/fabric/connector.py @@ -429,7 +429,7 @@ def _check_strategy_and_fallback(self) -> None: f" platform. We recommed `Fabric(strategy='ddp_spawn')` instead." ) if ( - strategy_flag in _FSDP_ALIASES or isinstance(self._strategy_flag, FSDPStrategy) + strategy_flag in _FSDP_ALIASES or type(self._strategy_flag) is FSDPStrategy ) and self._accelerator_flag not in ("cuda", "gpu"): raise ValueError( "You selected the FSDP strategy but FSDP is only available on GPU. Set `Fabric(accelerator='gpu', ...)`" diff --git a/tests/tests_fabric/test_connector.py b/tests/tests_fabric/test_connector.py index fba4c34c07d63..a45abd097f966 100644 --- a/tests/tests_fabric/test_connector.py +++ b/tests/tests_fabric/test_connector.py @@ -978,6 +978,16 @@ def test_fsdp_unsupported_on_cpu(_): with pytest.raises(ValueError, match="You selected the FSDP strategy but FSDP is only available on GPU"): _Connector(accelerator="cpu", strategy="fsdp") + class FSDPStrategySubclass(FSDPStrategy): + pass + + class AcceleratorSubclass(CPUAccelerator): + pass + + # we allow subclasses of FSDPStrategy to be used with other accelerators + _Connector(accelerator="cpu", strategy=FSDPStrategySubclass()) + _Connector(accelerator=AcceleratorSubclass(), strategy=FSDPStrategySubclass()) + def test_connector_defaults_match_fabric_defaults(): """Test that the default values for the init arguments of Connector match the ones in Fabric.""" From 341474aaac186987bfae538f7c4865f988d71d1a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 22 May 2024 19:47:55 +0200 Subject: [PATCH 038/179] (8/n) Support 2D Parallelism - 2D Parallel Fabric Docs (#19887) --- docs/source-fabric/advanced/model_init.rst | 12 +- .../advanced/model_parallel/index.rst | 1 - .../advanced/model_parallel/tp.rst | 3 + .../advanced/model_parallel/tp_fsdp.rst | 284 +++++++++++++++++- 4 files changed, 292 insertions(+), 8 deletions(-) diff --git a/docs/source-fabric/advanced/model_init.rst b/docs/source-fabric/advanced/model_init.rst index f1e5cf846b5e7..4b31df036fe78 100644 --- a/docs/source-fabric/advanced/model_init.rst +++ b/docs/source-fabric/advanced/model_init.rst @@ -61,15 +61,15 @@ When loading a model from a checkpoint, for example when fine-tuning, set ``empt ---- 
-******************************************** -Model-parallel training (FSDP and DeepSpeed) -******************************************** +*************************************************** +Model-parallel training (FSDP, TP, DeepSpeed, etc.) +*************************************************** -When training sharded models with :doc:`FSDP ` or DeepSpeed, using :meth:`~lightning.fabric.fabric.Fabric.init_module` is necessary in most cases because otherwise model initialization gets very slow (minutes) or (and that's more likely) you run out of CPU memory due to the size of the model. +When training distributed models with :doc:`FSDP/TP ` or DeepSpeed, using :meth:`~lightning.fabric.fabric.Fabric.init_module` is necessary in most cases because otherwise model initialization gets very slow (minutes) or (and that's more likely) you run out of CPU memory due to the size of the model. .. code-block:: python - # Recommended for FSDP and DeepSpeed + # Recommended for FSDP, TP and DeepSpeed with fabric.init_module(empty_init=True): model = GPT3() # parameters are placed on the meta-device @@ -81,4 +81,4 @@ When training sharded models with :doc:`FSDP ` or DeepSpeed .. note:: Empty-init is experimental and the behavior may change in the future. - For FSDP on PyTorch 2.1+, it is required that all user-defined modules that manage parameters implement a ``reset_parameters()`` method (all PyTorch built-in modules have this too). + For distributed models on PyTorch 2.1+, it is required that all user-defined modules that manage parameters implement a ``reset_parameters()`` method (all PyTorch built-in modules have this too). diff --git a/docs/source-fabric/advanced/model_parallel/index.rst b/docs/source-fabric/advanced/model_parallel/index.rst index 59b8acc72dd00..78649279a29b5 100644 --- a/docs/source-fabric/advanced/model_parallel/index.rst +++ b/docs/source-fabric/advanced/model_parallel/index.rst @@ -82,7 +82,6 @@ Get started .. displayitem:: :header: Pipeline Parallelism :description: Coming soon - :button_link: :col_css: col-md-4 :height: 180 :tag: advanced diff --git a/docs/source-fabric/advanced/model_parallel/tp.rst b/docs/source-fabric/advanced/model_parallel/tp.rst index 4d5fb26181dc6..fdb05da121932 100644 --- a/docs/source-fabric/advanced/model_parallel/tp.rst +++ b/docs/source-fabric/advanced/model_parallel/tp.rst @@ -110,6 +110,7 @@ In Fabric, define a function that applies the tensor parallelism to the model: parallelize_module(model, tp_mesh, plan) return model +By writing the parallelization code in a separate function rather than hardcoding it into the model, we keep the original source code clean and maintainable. Next, configure the :class:`~lightning.fabric.strategies.model_parallel.ModelParallelStrategy` in Fabric: .. code-block:: python @@ -222,6 +223,8 @@ Beyond this toy example, we recommend you study our `LLM Tensor Parallel Example ---- +.. _tp-data-loading: + *************************** Data-loading considerations *************************** diff --git a/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst b/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst index 606d9619f63d0..7bab5e88a3b19 100644 --- a/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst +++ b/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst @@ -2,4 +2,286 @@ 2D Parallelism (Tensor Parallelism + FSDP) ########################################## -Content will be available soon. 
+2D Parallelism combines Tensor Parallelism (TP) and Fully Sharded Data Parallelism (FSDP) to leverage the memory efficiency of FSDP and the computational scalability of TP. +This hybrid approach balances the trade-offs of each method, optimizing memory usage and minimizing communication overhead, enabling the training of extremely large models on large GPU clusters. + +The :doc:`Tensor Parallelism documentation ` and a general understanding of `FSDP `_ are a prerequisite for this tutorial. + +.. note:: This is an experimental feature. + + +---- + + +********************* +Enable 2D parallelism +********************* + +We will start off with the same feed forward example model as in the :doc:`Tensor Parallelism tutorial `. + +.. code-block:: python + + import torch + import torch.nn as nn + import torch.nn.functional as F + + + class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + +Next, we define a function that applies the desired parallelism to our model. +The function must take as first argument the model and as second argument the a :class:`~torch.distributed.device_mesh.DeviceMesh`. +More on how the device mesh works later. + +.. code-block:: python + + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel + from torch.distributed.tensor.parallel import parallelize_module + from torch.distributed._composable.fsdp.fully_shard import fully_shard + + def parallelize_feedforward(model, device_mesh): + # Lightning will set up a device mesh for you + # Here, it is 2-dimensional + tp_mesh = device_mesh["tensor_parallel"] + dp_mesh = device_mesh["data_parallel"] + + if tp_mesh.size() > 1: + # Use PyTorch's distributed tensor APIs to parallelize the model + plan = { + "w1": ColwiseParallel(), + "w2": RowwiseParallel(), + "w3": ColwiseParallel(), + } + parallelize_module(model, tp_mesh, plan) + + if dp_mesh.size() > 1: + # Use PyTorch's FSDP2 APIs to parallelize the model + fully_shard(model.w1, mesh=dp_mesh) + fully_shard(model.w2, mesh=dp_mesh) + fully_shard(model.w3, mesh=dp_mesh) + fully_shard(model, mesh=dp_mesh) + + return model + +By writing the parallelization code in a separate function rather than hardcoding it into the model, we keep the original source code clean and maintainable. +In addition to the tensor-parallel code from the :doc:`Tensor Parallelism tutorial `, this function also shards the model's parameters using FSDP along the data-parallel dimension. + +Finally, pass the parallelization function to the :class:`~lightning.fabric.strategies.model_parallel.ModelParallelStrategy` and configure the data-parallel and tensor-parallel sizes: + +.. code-block:: python + + import lightning as L + from lightning.fabric.strategies import ModelParallelStrategy + + strategy = ModelParallelStrategy( + parallelize_fn=parallelize_feedforward, + # Define the size of the 2D parallelism + # Set these to "auto" (default) to apply TP intra-node and FSDP inter-node + data_parallel_size=2, + tensor_parallel_size=2, + ) + + fabric = L.Fabric(accelerator="cuda", devices=4, strategy=strategy) + fabric.launch() + + +In this example with 4 GPUs, Fabric will create a device mesh that groups GPU 0-1 and GPU 2-3 (2 groups because ``data_parallel_size=2``, and 2 GPUs per group because ``tensor_parallel_size=2``). 
+Later on when ``fabric.setup(model)`` is called, each layer wrapped with FSDP (``fully_shard``) will be split into two shards, one for the GPU 0-1 group, and one for the GPU 2-3 group. +Finally, the tensor parallelism will apply to each group, splitting the sharded tensor across the GPUs within each group. + + +.. collapse:: Full training example (requires at least 4 GPUs). + + .. code-block:: python + + import torch + import torch.nn as nn + import torch.nn.functional as F + + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel + from torch.distributed.tensor.parallel import parallelize_module + from torch.distributed._composable.fsdp.fully_shard import fully_shard + + import lightning as L + from lightning.pytorch.demos.boring_classes import RandomDataset + from lightning.fabric.strategies import ModelParallelStrategy + + + class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + + def parallelize_feedforward(model, device_mesh): + # Lightning will set up a device mesh for you + # Here, it is 2-dimensional + tp_mesh = device_mesh["tensor_parallel"] + dp_mesh = device_mesh["data_parallel"] + + if tp_mesh.size() > 1: + # Use PyTorch's distributed tensor APIs to parallelize the model + plan = { + "w1": ColwiseParallel(), + "w2": RowwiseParallel(), + "w3": ColwiseParallel(), + } + parallelize_module(model, tp_mesh, plan) + + if dp_mesh.size() > 1: + # Use PyTorch's FSDP2 APIs to parallelize the model + fully_shard(model.w1, mesh=dp_mesh) + fully_shard(model.w2, mesh=dp_mesh) + fully_shard(model.w3, mesh=dp_mesh) + fully_shard(model, mesh=dp_mesh) + + return model + + + strategy = ModelParallelStrategy( + parallelize_fn=parallelize_feedforward, + data_parallel_size=2, + tensor_parallel_size=2, + ) + + fabric = L.Fabric(accelerator="cuda", devices=4, strategy=strategy) + fabric.launch() + + # Initialize the model + model = FeedForward(8192, 8192) + model = fabric.setup(model) + + # Define the optimizer + optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, foreach=True) + optimizer = fabric.setup_optimizers(optimizer) + + # Define dataset/dataloader + dataset = RandomDataset(8192, 128) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=8) + dataloader = fabric.setup_dataloaders(dataloader) + + # Simplified training loop + for i, batch in enumerate(dataloader): + output = model(batch) + loss = output.sum() + fabric.backward(loss) + optimizer.step() + optimizer.zero_grad() + fabric.print(f"Iteration {i} complete") + + fabric.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + +| + +Beyond this toy example, we recommend you study our `LLM 2D Parallel Example (Llama 3) `_. + + +---- + + +******************* +Effective use cases +******************* + +In the toy example above, the parallelization is configured to work within a single machine across multiple GPUs. +However, in practice the main use case for 2D parallelism is in multi-node training, where one can effectively combine both methods to maximize throughput and model scale. +Since tensor-parallelism requires blocking collective calls, fast GPU data transfers are essential to keep throughput high and therefore TP is typically applied across GPUs within a machine. 
+On the other hand, FSDP by design has the advantage that it can overlap GPU transfers with the computation (it can prefetch layers). +Hence, combining FSDP for inter-node parallelism and TP for intra-node parallelism is generally a good strategy to minimize both the latency and network bandwidth usage, making it possible to scale to much larger models than is possible with FSDP alone. + + +.. code-block:: python + + from lightning.fabric.strategies import ModelParallelStrategy + + strategy = ModelParallelStrategy( + # Default is "auto" + # Applies TP intra-node and DP inter-node + data_parallel_size="auto", + tensor_parallel_size="auto", + ) + + +---- + + +*************************** +Data-loading considerations +*************************** + +In a tensor-parallelized model, it is important that the model receives an identical input on each GPU that participates in the same tensor-parallel group. +However, across the data-parallel dimension, the inputs should be different. +In other words, if TP is applied within a node, and FSDP across nodes, each node must receive a different batch, but every GPU within the node gets the same batch of data. + +If you use a PyTorch data loader and set it up using :meth:`~lightning.fabric.fabric.Fabric.setup_dataloaders`, Fabric will automatically handle this for you by configuring the distributed sampler. +However, when you shuffle data in your dataset or data loader, or when applying randomized transformations/augmentations in your data, you must still ensure that the seed is set appropriately. + + +.. code-block:: python + + import lightning as L + + fabric = L.Fabric(...) + + # Define dataset/dataloader + # If there is randomness/augmentation in the dataset, fix the seed + dataset = MyDataset(seed=42) + dataloader = DataLoader(dataset, batch_size=8, shuffle=True) + + # Fabric configures the sampler automatically for you such that + # all batches in a tensor-parallel group are identical, + # while still sharding the dataset across the data-parallel group + dataloader = fabric.setup_dataloaders(dataloader) + + for i, batch in enumerate(dataloader): + ... + + + + +---- + + +********** +Next steps +********** + +.. raw:: html + +
+
 + +.. displayitem:: + :header: LLM 2D Parallel Example + :description: Full example of how to combine TP + FSDP in a large language model (Llama 3) + :col_css: col-md-4 + :button_link: https://github.com/Lightning-AI/pytorch-lightning/tree/master/examples/fabric/tensor_parallel + :height: 160 + :tag: advanced + +.. displayitem:: + :header: Pipeline Parallelism + :description: Coming soon + :col_css: col-md-4 + :height: 160 + :tag: advanced + + +.. raw:: html + +
                                                                                                                                                                                                                                                                                                                
+
+ +| From fa1126ea5323b251ef9fe5d114111067781a37d0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Wed, 22 May 2024 23:46:51 +0200 Subject: [PATCH 039/179] docs: fix link to CLIP (#19896) * docs: fix link to CLIP * www * ignore --- docs/source-app/conf.py | 6 ++++++ docs/source-app/get_started/what_app_can_do.rst | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py index 9d130172a3e61..e139e88365e64 100644 --- a/docs/source-app/conf.py +++ b/docs/source-app/conf.py @@ -456,3 +456,9 @@ def find_source(): # ignore all links in any CHANGELOG file linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"] + + +# ignore the following relative links (false positive errors during linkcheck) +linkcheck_ignore = [ + "https://www.openai.com/index/clip/", +] diff --git a/docs/source-app/get_started/what_app_can_do.rst b/docs/source-app/get_started/what_app_can_do.rst index dc940c342ca61..b4033dd40d594 100644 --- a/docs/source-app/get_started/what_app_can_do.rst +++ b/docs/source-app/get_started/what_app_can_do.rst @@ -85,7 +85,7 @@ Find the `ScratchPad App `_. +This App lets you find anything you're looking for inside a video. The engine is powered by `Open AI CLIP `_. Find the `InVideo Search App `_ on the App Gallery and the `InVideo Search App codebase. `_ in GitHub. From 414c86332e7469157c57fb9ec5baf83f2d7253bc Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 23 May 2024 00:13:41 +0200 Subject: [PATCH 040/179] (9/n) Support 2D Parallelism - Remaining Checkpoint Logic (#19888) Co-authored-by: Luca Antiga --- .../fabric/strategies/model_parallel.py | 12 +- src/lightning/pytorch/CHANGELOG.md | 3 + .../pytorch/strategies/model_parallel.py | 72 ++++++-- tests/tests_pytorch/strategies/test_fsdp.py | 46 ++--- .../strategies/test_model_parallel.py | 93 ++++++++-- .../test_model_parallel_integration.py | 169 ++++++++++++++++++ 6 files changed, 334 insertions(+), 61 deletions(-) diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index a143d168cdede..df84978486b19 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -412,6 +412,7 @@ def _load_checkpoint( path: Path, state: Dict[str, Union[Module, Optimizer, Any]], strict: bool = True, + optimizer_states_from_list: bool = False, ) -> Dict[str, Any]: from torch.distributed.checkpoint.state_dict import ( StateDictOptions, @@ -473,8 +474,15 @@ def _load_checkpoint( full_state_dict=True, strict=strict, ) - for optimizer_name, optimizer in optimizers.items(): - optimizer_state = _rekey_optimizer_state_if_needed(checkpoint.pop(optimizer_name), module) + for optimizer_idx, (optimizer_name, optimizer) in enumerate(optimizers.items()): + if optimizer_states_from_list: + # This code path is only used by `lightning.pytorch`, which saves optimizer states as a list + # rather than individual states at the top level. 
+ optimizer_state = checkpoint["optimizer_states"][optimizer_idx] + else: + optimizer_state = checkpoint.pop(optimizer_name) + + optimizer_state = _rekey_optimizer_state_if_needed(optimizer_state, module) set_optimizer_state_dict( module, optimizer, diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 6792f979871a6..297f54b4b2949 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -18,6 +18,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) +- Added `ModelParallelStrategy` to support 2D parallelism ([#19878](https://github.com/Lightning-AI/pytorch-lightning/pull/19878), [#19888](https://github.com/Lightning-AI/pytorch-lightning/pull/19888)) + + ### Changed diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py index 304b9bc04fc2d..e0fdcd4a236cb 100644 --- a/src/lightning/pytorch/strategies/model_parallel.py +++ b/src/lightning/pytorch/strategies/model_parallel.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import shutil from contextlib import contextmanager, nullcontext from datetime import timedelta +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Mapping, Optional, Union import torch @@ -22,9 +24,13 @@ from typing_extensions import override import lightning.pytorch as pl -from lightning.fabric.plugins import CheckpointIO from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout -from lightning.fabric.strategies.model_parallel import _setup_device_mesh +from lightning.fabric.strategies.model_parallel import ( + _distributed_checkpoint_save, + _is_sharded_checkpoint, + _load_checkpoint, + _setup_device_mesh, +) from lightning.fabric.utilities.distributed import ( _distributed_is_initialized, _get_default_process_group_backend_for_device, @@ -34,6 +40,7 @@ from lightning.fabric.utilities.distributed import group as _group from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3 from lightning.fabric.utilities.init import _materialize_distributed_module +from lightning.fabric.utilities.load import _METADATA_FILENAME from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed from lightning.fabric.utilities.types import _PATH, ReduceOp @@ -95,16 +102,6 @@ def device_mesh(self) -> "DeviceMesh": raise RuntimeError("Accessing the device mesh before processes have initialized is not allowed.") return self._device_mesh - @property - @override - def checkpoint_io(self) -> CheckpointIO: - raise NotImplementedError(f"The `{type(self).__name__}` does not use the `CheckpointIO` plugin interface.") - - @checkpoint_io.setter - @override - def checkpoint_io(self, io: CheckpointIO) -> None: - raise NotImplementedError(f"The `{type(self).__name__}` does not support setting a `CheckpointIO` plugin.") - @property @override def root_device(self) -> torch.device: @@ -253,6 +250,11 @@ def teardown(self) -> None: @override def lightning_module_state_dict(self) -> Dict[str, Any]: + """Collects the state dict of the model. + + Only returns a non-empty state dict on rank 0 if ``save_distributed_checkpoint=False``. 
+ + """ from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict state_dict_options = StateDictOptions(full_state_dict=(not self._save_distributed_checkpoint), cpu_offload=True) @@ -266,6 +268,11 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr @override def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Any]: + """Collects the state of the given optimizer. + + Only returns a non-empty state dict on rank 0 if ``save_distributed_checkpoint=False``. + + """ from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp import OptimStateKeyType @@ -275,8 +282,9 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Any]: optimizer = optimizer._optimizer assert self.model is not None + state_dict = get_optimizer_state_dict(self.model, optimizer, options=state_dict_options) - if not self._save_distributed_checkpoint: + if not self._save_distributed_checkpoint and self.global_rank == 0: # Store the optimizer state dict in standard format state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_ID, self.model) return state_dict @@ -295,11 +303,45 @@ def save_checkpoint( f"`{type(self).__name__}.save_checkpoint(..., storage_options=...)` is not supported because" f" `{type(self).__name__}` does not use the `CheckpointIO`." ) - raise NotImplementedError("Checkpoint saving is not yet implemented.") + # broadcast the path from rank 0 to ensure all the checkpoints are saved to a common path + path = Path(self.broadcast(filepath)) + if path.is_dir() and not self._save_distributed_checkpoint and not _is_sharded_checkpoint(path): + raise IsADirectoryError(f"The checkpoint path exists and is a directory: {path}") + + if self._save_distributed_checkpoint: + if path.is_file(): + path.unlink() + path.mkdir(parents=True, exist_ok=True) + + converted_state = {"state_dict": checkpoint.pop("state_dict")} + converted_state.update({ + f"optimizer_{idx}": optim_state + for idx, optim_state in enumerate(checkpoint.pop("optimizer_states", [])) + }) + _distributed_checkpoint_save(converted_state, path) + + if self.global_rank == 0: + torch.save(checkpoint, path / _METADATA_FILENAME) + else: + if _is_sharded_checkpoint(path): + shutil.rmtree(path) + return super().save_checkpoint(checkpoint=checkpoint, filepath=path) @override def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: - raise NotImplementedError("Checkpoint loading is not yet implemented.") + # broadcast the path from rank 0 to ensure all the states are loaded from a common path + path = Path(self.broadcast(checkpoint_path)) + state = { + "state_dict": self.model, + **{f"optimizer_{idx}": optimizer for idx, optimizer in enumerate(self.optimizers)}, + } + assert self.lightning_module is not None + return _load_checkpoint( + path=path, + state=state, + strict=self.lightning_module.strict_loading, + optimizer_states_from_list=True, + ) def _setup_distributed(self) -> None: super().setup_environment() diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 76c336fecf107..5557c07df9960 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -210,7 +210,7 @@ def test_invalid_on_cpu(tmp_path, cuda_count_0): trainer.strategy.setup_environment() -def test_fsdp_custom_mixed_precision(): +def 
test_custom_mixed_precision(): """Test to ensure that passing a custom mixed precision config works.""" config = MixedPrecision() strategy = FSDPStrategy(mixed_precision=config) @@ -218,7 +218,7 @@ def test_fsdp_custom_mixed_precision(): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) -def test_fsdp_strategy_sync_batchnorm(tmp_path): +def test_strategy_sync_batchnorm(tmp_path): """Test to ensure that sync_batchnorm works when using FSDP and GPU, and all stages can be run.""" model = TestFSDPModel() trainer = Trainer( @@ -234,7 +234,7 @@ def test_fsdp_strategy_sync_batchnorm(tmp_path): @RunIf(min_cuda_gpus=1, skip_windows=True) -def test_fsdp_modules_without_parameters(tmp_path): +def test_modules_without_parameters(tmp_path): """Test that TorchMetrics get moved to the device despite not having any parameters.""" class MetricsModel(BoringModel): @@ -266,7 +266,7 @@ def training_step(self, batch, batch_idx): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))]) @pytest.mark.parametrize("state_dict_type", ["sharded", "full"]) -def test_fsdp_strategy_checkpoint(state_dict_type, precision, tmp_path): +def test_strategy_checkpoint(state_dict_type, precision, tmp_path): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" model = TestFSDPModel() strategy = FSDPStrategy(state_dict_type=state_dict_type) @@ -286,7 +286,7 @@ def custom_auto_wrap_policy( @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.parametrize("wrap_min_params", [2, 1024, 100000000]) -def test_fsdp_strategy_full_state_dict(tmp_path, wrap_min_params): +def test_strategy_full_state_dict(tmp_path, wrap_min_params): """Test to ensure that the full state dict is extracted when using FSDP strategy. Based on `wrap_min_params`, the model will be fully wrapped, half wrapped, and not wrapped at all. 
@@ -342,7 +342,7 @@ def test_fsdp_strategy_full_state_dict(tmp_path, wrap_min_params): ), ], ) -def test_fsdp_checkpoint_multi_gpus(tmp_path, model, strategy, strategy_cfg): +def test_checkpoint_multi_gpus(tmp_path, model, strategy, strategy_cfg): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run.""" ck = ModelCheckpoint(save_last=True) @@ -410,7 +410,7 @@ def configure_optimizers(self): trainer.fit(model) -def test_fsdp_forbidden_precision_raises(): +def test_forbidden_precision_raises(): with pytest.raises(TypeError, match="can only work with the `FSDPPrecision"): FSDPStrategy(precision_plugin=HalfPrecision()) @@ -419,7 +419,7 @@ def test_fsdp_forbidden_precision_raises(): strategy.precision_plugin = HalfPrecision() -def test_fsdp_activation_checkpointing(): +def test_activation_checkpointing(): """Test that the FSDP strategy can apply activation checkpointing to the given layers.""" class Block1(nn.Linear): @@ -469,7 +469,7 @@ def __init__(self): apply_mock.assert_called_with(wrapped, checkpoint_wrapper_fn=ANY, **strategy._activation_checkpointing_kwargs) -def test_fsdp_strategy_cpu_offload(): +def test_strategy_cpu_offload(): """Test the different ways cpu offloading can be enabled.""" # bool strategy = FSDPStrategy(cpu_offload=True) @@ -481,7 +481,7 @@ def test_fsdp_strategy_cpu_offload(): assert strategy.cpu_offload == config -def test_fsdp_sharding_strategy(): +def test_sharding_strategy(): """Test the different ways the sharding strategy can be set.""" from torch.distributed.fsdp import ShardingStrategy @@ -501,7 +501,7 @@ def test_fsdp_sharding_strategy(): @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"]) -def test_fsdp_hybrid_sharding_strategy(sharding_strategy): +def test_hybrid_sharding_strategy(sharding_strategy): """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg.""" with pytest.raises(RuntimeError, match="The hybrid sharding strategy requires you to pass at least one of"): FSDPStrategy(sharding_strategy=sharding_strategy) @@ -523,7 +523,7 @@ def test_fsdp_hybrid_sharding_strategy(sharding_strategy): FSDPStrategy(sharding_strategy=sharding_strategy, process_group=process_group, device_mesh=device_mesh) -def test_fsdp_use_orig_params(): +def test_use_orig_params(): """Test that Lightning enables `use_orig_params` automatically.""" strategy = FSDPStrategy() assert strategy.kwargs["use_orig_params"] @@ -548,7 +548,7 @@ def test_set_timeout(init_process_group_mock): @mock.patch("lightning.pytorch.strategies.fsdp._load_raw_module_state") -def test_fsdp_strategy_load_optimizer_states_multiple(_, tmp_path): +def test_strategy_load_optimizer_states_multiple(_, tmp_path): strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")], state_dict_type="full") trainer = Trainer() trainer.state.fn = TrainerFn.FITTING @@ -572,7 +572,7 @@ def test_fsdp_strategy_load_optimizer_states_multiple(_, tmp_path): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.parametrize("wrap_min_params", [2, 1024, 100000000]) -def test_fsdp_strategy_save_optimizer_states(tmp_path, wrap_min_params): +def test_strategy_save_optimizer_states(tmp_path, wrap_min_params): """Test to ensure that the full state dict and optimizer states is saved when using FSDP strategy. Based on `wrap_min_params`, the model will be fully wrapped, half wrapped, and not wrapped at all. 
If the model can @@ -630,7 +630,7 @@ def test_fsdp_strategy_save_optimizer_states(tmp_path, wrap_min_params): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.parametrize("wrap_min_params", [2, 1024, 100000000]) -def test_fsdp_strategy_load_optimizer_states(wrap_min_params, tmp_path): +def test_strategy_load_optimizer_states(wrap_min_params, tmp_path): """Test to ensure that the full state dict and optimizer states can be load when using FSDP strategy. Based on `wrap_min_params`, the model will be fully wrapped, half wrapped, and not wrapped at all. If the DDP model @@ -741,7 +741,7 @@ def test_save_checkpoint_storage_options(tmp_path): @mock.patch("lightning.pytorch.strategies.fsdp._get_sharded_state_dict_context") @mock.patch("lightning.fabric.plugins.io.torch_io._atomic_save") @mock.patch("lightning.pytorch.strategies.fsdp.shutil") -def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, tmp_path): +def test_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, tmp_path): strategy = FSDPStrategy(state_dict_type="full") # state_dict_type='full', path exists, path is not a sharded checkpoint: error @@ -757,16 +757,12 @@ def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, path.mkdir() (path / "meta.pt").touch() assert _is_sharded_checkpoint(path) - model = Mock(spec=FullyShardedDataParallel) - model.modules.return_value = [model] strategy.save_checkpoint(Mock(), filepath=path) shutil_mock.rmtree.assert_called_once_with(path) # state_dict_type='full', path exists, path is a file: no error (overwrite) path = tmp_path / "file.pt" path.touch() - model = Mock(spec=FullyShardedDataParallel) - model.modules.return_value = [model] torch_save_mock.reset_mock() strategy.save_checkpoint(Mock(), filepath=path) torch_save_mock.assert_called_once() @@ -783,8 +779,6 @@ def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, path = tmp_path / "not-empty-2" path.mkdir() (path / "file").touch() - model = Mock(spec=FullyShardedDataParallel) - model.modules.return_value = [model] with save_mock: strategy.save_checkpoint({"state_dict": {}, "optimizer_states": {"": {}}}, filepath=path) assert (path / "file").exists() @@ -792,21 +786,19 @@ def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, # state_dict_type='sharded', path exists, path is a file: no error (overwrite) path = tmp_path / "file-2.pt" path.touch() - model = Mock(spec=FullyShardedDataParallel) - model.modules.return_value = [model] with save_mock: strategy.save_checkpoint({"state_dict": {}, "optimizer_states": {"": {}}}, filepath=path) assert path.is_dir() @mock.patch("lightning.pytorch.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x) -def test_fsdp_save_checkpoint_unknown_state_dict_type(tmp_path): +def test_save_checkpoint_unknown_state_dict_type(tmp_path): strategy = FSDPStrategy(state_dict_type="invalid") with pytest.raises(ValueError, match="Unknown state_dict_type"): strategy.save_checkpoint(checkpoint=Mock(), filepath=tmp_path) -def test_fsdp_load_unknown_checkpoint_type(tmp_path): +def test_load_unknown_checkpoint_type(tmp_path): """Test that the strategy validates the contents at the checkpoint path.""" strategy = FSDPStrategy() strategy.model = Mock() @@ -874,7 +866,7 @@ def test_save_load_sharded_state_dict(tmp_path): @mock.patch("lightning.pytorch.strategies.fsdp.torch.load") @mock.patch("lightning.pytorch.strategies.fsdp._lazy_load") 
@mock.patch("lightning.pytorch.strategies.fsdp._load_raw_module_state") -def test_fsdp_lazy_load_full_state_dict(_, lazy_load_mock, torch_load_mock, tmp_path): +def test_lazy_load_full_state_dict(_, lazy_load_mock, torch_load_mock, tmp_path): """Test that loading a single file (full state) is lazy to reduce peak CPU memory usage.""" model = BoringModel() checkpoint = {"state_dict": model.state_dict()} diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index 4b9b0887c85bf..e22593f391811 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -20,6 +20,7 @@ import pytest import torch import torch.nn as nn +from lightning.fabric.strategies.model_parallel import _is_sharded_checkpoint from lightning.pytorch import LightningModule from lightning.pytorch.plugins.environments import LightningEnvironment from lightning.pytorch.strategies import ModelParallelStrategy @@ -68,17 +69,6 @@ def test_validate_device_mesh_dimensions(num_nodes, devices, invalid_dp_size, in strategy.setup_environment() -@RunIf(min_torch="2.3") -def test_checkpoint_io_unsupported(): - """Test that the ModelParallel strategy does not support the `CheckpointIO` plugin.""" - strategy = ModelParallelStrategy() - with pytest.raises(NotImplementedError, match="does not use the `CheckpointIO` plugin"): - _ = strategy.checkpoint_io - - with pytest.raises(NotImplementedError, match="does not support setting a `CheckpointIO` plugin"): - strategy.checkpoint_io = Mock() - - @RunIf(min_torch="2.3") def test_fsdp_v1_modules_unsupported(): """Test that the strategy won't allow setting up a module wrapped with the legacy FSDP API.""" @@ -135,18 +125,87 @@ def test_save_checkpoint_storage_options(tmp_path): @RunIf(min_torch="2.3") -def test_save_checkpoint_path_exists(): - pytest.skip("Checkpoint saving and loading not implemented") +@mock.patch("lightning.pytorch.strategies.model_parallel.ModelParallelStrategy.broadcast", lambda _, x: x) +@mock.patch("lightning.fabric.plugins.io.torch_io._atomic_save") +@mock.patch("lightning.pytorch.strategies.model_parallel.shutil") +def test_save_checkpoint_path_exists(shutil_mock, torch_save_mock, tmp_path): + strategy = ModelParallelStrategy(save_distributed_checkpoint=False) + + # save_distributed_checkpoint=False, path exists, path is not a sharded checkpoint: error + path = tmp_path / "not-empty" + path.mkdir() + (path / "file").touch() + assert not _is_sharded_checkpoint(path) + with pytest.raises(IsADirectoryError, match="exists and is a directory"): + strategy.save_checkpoint(Mock(), filepath=path) + + # save_distributed_checkpoint=False, path exists, path is a sharded checkpoint: no error (overwrite) + path = tmp_path / "sharded-checkpoint" + path.mkdir() + (path / "meta.pt").touch() + assert _is_sharded_checkpoint(path) + strategy.save_checkpoint(Mock(), filepath=path) + shutil_mock.rmtree.assert_called_once_with(path) + + # save_distributed_checkpoint=False, path exists, path is a file: no error (overwrite) + path = tmp_path / "file.pt" + path.touch() + torch_save_mock.reset_mock() + strategy.save_checkpoint(Mock(), filepath=path) + torch_save_mock.assert_called_once() + + strategy = ModelParallelStrategy(save_distributed_checkpoint=True) + + save_mock = mock.patch("torch.distributed.checkpoint.save") + + # save_distributed_checkpoint=True, path exists, path is a folder: no error (overwrite) + path = tmp_path / "not-empty-2" + path.mkdir() + (path / 
"file").touch() + with save_mock: + strategy.save_checkpoint({"state_dict": {}, "optimizer_states": {"": {}}}, filepath=path) + assert (path / "file").exists() + + # save_distributed_checkpoint=True, path exists, path is a file: no error (overwrite) + path = tmp_path / "file-2.pt" + path.touch() + with save_mock: + strategy.save_checkpoint({"state_dict": {}, "optimizer_states": {"": {}}}, filepath=path) + assert path.is_dir() @RunIf(min_torch="2.3") -def test_load_full_checkpoint_support(): - pytest.skip("Checkpoint saving and loading not implemented") +@mock.patch("lightning.fabric.strategies.model_parallel._TORCH_GREATER_EQUAL_2_4", False) +def test_load_full_checkpoint_support(tmp_path): + """Test that loading non-distributed checkpoints into distributed models requires PyTorch >= 2.4.""" + strategy = ModelParallelStrategy() + strategy.model = Mock() + strategy._lightning_module = Mock(strict_loading=True) + path = tmp_path / "full.ckpt" + path.touch() + + with pytest.raises(ImportError, match="Loading .* into a distributed model requires PyTorch >= 2.4"), mock.patch( + "lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True + ): + strategy.load_checkpoint(checkpoint_path=path) + + with pytest.raises(ImportError, match="Loading .* into a distributed model requires PyTorch >= 2.4"), mock.patch( + "lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True + ): + strategy.load_checkpoint(checkpoint_path=path) @RunIf(min_torch="2.3") -def test_load_unknown_checkpoint_type(): - pytest.skip("Checkpoint saving and loading not implemented") +@mock.patch("lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True) +def test_load_unknown_checkpoint_type(_, tmp_path): + """Test that the strategy validates the contents at the checkpoint path.""" + strategy = ModelParallelStrategy() + strategy.model = Mock() + strategy._lightning_module = Mock(strict_loading=True) + path = tmp_path / "empty_dir" # neither a single file nor a directory with meta file + path.mkdir() + with pytest.raises(ValueError, match="does not point to a valid checkpoint"): + strategy.load_checkpoint(checkpoint_path=path) @RunIf(min_torch="2.3") diff --git a/tests/tests_pytorch/strategies/test_model_parallel_integration.py b/tests/tests_pytorch/strategies/test_model_parallel_integration.py index bbac2a6078f9c..bb8d7c719f821 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel_integration.py +++ b/tests/tests_pytorch/strategies/test_model_parallel_integration.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os +from pathlib import Path + import pytest import torch import torch.nn as nn @@ -338,3 +341,169 @@ def _run_setup_assertions(empty_init, expected_device): # Case 2: Empty-init with PyTorch >= 2.1 supports meta device _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) + + +@RunIf(min_torch="2.3", min_cuda_gpus=2, skip_windows=True, standalone=True) +@pytest.mark.parametrize("save_distributed_checkpoint", [True, False]) +def test_strategy_state_dict(tmp_path, save_distributed_checkpoint): + """Test that the strategy returns the correct state dict of the LightningModule.""" + model = FSDP2Model() + correct_state_dict = model.state_dict() # State dict before wrapping + + strategy = ModelParallelStrategy(save_distributed_checkpoint=save_distributed_checkpoint) + trainer = Trainer( + default_root_dir=tmp_path, + accelerator="cuda", + devices=2, + strategy=strategy, + max_epochs=1, + barebones=True, + ) + trainer.fit(model) + + state_dict = trainer.strategy.lightning_module_state_dict() + + if save_distributed_checkpoint: + # All ranks return a state dict + assert len(state_dict) > 0 + # State dict should contain same keys as non-distributed state dict + assert list(state_dict.keys()) == list(correct_state_dict.keys()) + else: + if trainer.global_rank != 0: + # The full state-dict is only returned on rank 0 + assert len(state_dict) == 0 + return + # State dict should contain same keys as non-distributed state dict + assert list(state_dict.keys()) == list(correct_state_dict.keys()) + + +@RunIf(min_torch="2.3", min_cuda_gpus=2, skip_windows=True, standalone=True) +def test_load_full_state_checkpoint_into_regular_model(tmp_path): + """Test that a full-state checkpoint saved from a distributed model can be loaded back into a regular model.""" + + # Save a regular full-state checkpoint from a distributed model + model = FSDP2Model() + strategy = ModelParallelStrategy(save_distributed_checkpoint=False) + trainer = Trainer( + default_root_dir=tmp_path, + accelerator="gpu", + devices=2, + strategy=strategy, + max_epochs=1, + barebones=True, + ) + trainer.fit(model) + model_path = tmp_path / "last.ckpt" + model_path = trainer.strategy.broadcast(model_path) + trainer.save_checkpoint(model_path) + model_state_dict = trainer.strategy.lightning_module_state_dict() + optimizer_state_dict = trainer.strategy.optimizer_state(model.optimizers()) + + if trainer.global_rank != 0: + assert len(model_state_dict) == 0 + assert len(optimizer_state_dict) == 0 + + # Create a regular model and load the checkpoint into it + model = TemplateModel() + trainer = Trainer(default_root_dir=tmp_path, accelerator="gpu", devices=2, strategy="ddp", max_epochs=1) + trainer.fit(model, ckpt_path=model_path) + restored_model_state_dict = trainer.strategy.lightning_module_state_dict() + restored_optimizer_state_dict = trainer.strategy.optimizer_state(model.optimizers()) + + if trainer.global_rank == 0: + assert len(model_state_dict) == len(restored_model_state_dict) + assert len(optimizer_state_dict) == len(restored_optimizer_state_dict) + torch.testing.assert_close(model_state_dict, restored_model_state_dict, atol=0, rtol=0) + torch.testing.assert_close(optimizer_state_dict, restored_optimizer_state_dict, atol=0, rtol=0) + trainer.strategy.barrier() + + +@RunIf(min_torch="2.4", min_cuda_gpus=2, skip_windows=True, standalone=True) +def test_load_standard_checkpoint_into_distributed_model(tmp_path): + """Test that a regular checkpoint (weights and optimizer states) can be loaded into a distributed 
model.""" + + # Save a regular DDP checkpoint + model = TemplateModel() + trainer = Trainer(default_root_dir=tmp_path, accelerator="gpu", devices=2, strategy="ddp", max_epochs=1) + trainer.fit(model) + model_path = tmp_path / "last.ckpt" + model_path = trainer.strategy.broadcast(model_path) + trainer.save_checkpoint(model_path) + model_state_dict = trainer.strategy.lightning_module_state_dict() + optimizer_state_dict = trainer.strategy.optimizer_state(model.optimizers()) + + # Create a distributed model and load the checkpoint into it + model = FSDP2Model() + strategy = ModelParallelStrategy(save_distributed_checkpoint=False) + trainer = Trainer( + default_root_dir=tmp_path, + accelerator="gpu", + devices=2, + strategy=strategy, + max_epochs=1, + barebones=True, + ) + trainer.fit(model, ckpt_path=model_path) + restored_model_state_dict = trainer.strategy.lightning_module_state_dict() + restored_optimizer_state_dict = trainer.strategy.optimizer_state(model.optimizers()) + + if trainer.global_rank != 0: + assert len(restored_model_state_dict) == 0 + assert len(restored_optimizer_state_dict) == 0 + if trainer.global_rank == 0: + assert len(model_state_dict) == len(restored_model_state_dict) + assert len(optimizer_state_dict) == len(restored_optimizer_state_dict) + torch.testing.assert_close(model_state_dict, restored_model_state_dict, atol=0, rtol=0) + torch.testing.assert_close(optimizer_state_dict, restored_optimizer_state_dict, atol=0, rtol=0) + trainer.strategy.barrier() + + +@RunIf(min_torch="2.4", min_cuda_gpus=2, standalone=True) +def test_save_load_sharded_state_dict(tmp_path): + """Test saving and loading with the distributed state dict format.""" + + class CheckpointModel(FSDP2Model): + def __init__(self, params_to_compare=None): + super().__init__() + self.params_to_compare = params_to_compare + + def on_train_start(self): + if self.params_to_compare is None: + return + for p0, p1 in zip(self.params_to_compare, self.trainer.model.parameters()): + assert torch.equal(p0, p1.full_tensor()) + + seed_everything(0) + + strategy = ModelParallelStrategy(save_distributed_checkpoint=True) + trainer_kwargs = { + "default_root_dir": tmp_path, + "accelerator": "cuda", + "devices": 2, + "max_epochs": 1, + "enable_progress_bar": False, + "enable_model_summary": False, + "logger": False, + } + + # Initial training + model = CheckpointModel() + trainer = Trainer(**trainer_kwargs, strategy=strategy) + trainer.fit(model) + params_before = [p.full_tensor() for p in trainer.model.parameters()] + + checkpoint_path = Path(trainer.strategy.broadcast(trainer.checkpoint_callback.best_model_path)) + assert set(os.listdir(checkpoint_path)) == {"meta.pt", ".metadata", "__0_0.distcp", "__1_0.distcp"} + + metadata = torch.load(checkpoint_path / "meta.pt") + assert "pytorch-lightning_version" in metadata + assert len(metadata["callbacks"]) == 1 # model checkpoint callback + assert "state_dict" not in metadata + assert "optimizer_states" not in metadata + + # Load checkpoint and continue training + trainer_kwargs.update(max_epochs=2) + model = CheckpointModel(params_to_compare=params_before) + strategy = ModelParallelStrategy(save_distributed_checkpoint=True) + trainer = Trainer(**trainer_kwargs, strategy=strategy) + trainer.fit(model, ckpt_path=checkpoint_path) From e0d7ede64301b5eb7b971ac8e5ea257fe31f5e5f Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Thu, 23 May 2024 11:35:53 +0200 Subject: [PATCH 041/179] docs: prune unused `linkcode` (#19897) --- 
docs/source-app/conf.py | 54 +------------------------------------- docs/source-fabric/conf.py | 43 ------------------------------ 2 files changed, 1 insertion(+), 96 deletions(-) diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py index e139e88365e64..5399d8205cd49 100644 --- a/docs/source-app/conf.py +++ b/docs/source-app/conf.py @@ -41,11 +41,6 @@ # The full version, including alpha/beta/rc tags release = lightning.__version__ -# Options for the linkcode extension -# ---------------------------------- -github_user = "Lightning-AI" -github_repo = project - # -- Project documents ------------------------------------------------------- if _FETCH_S3_ASSETS: @@ -71,7 +66,7 @@ "sphinx_toolbox.collapse", "sphinx.ext.todo", "sphinx.ext.coverage", - "sphinx.ext.linkcode", + # "sphinx.ext.linkcode", "sphinx.ext.autosummary", "sphinx.ext.napoleon", # 'sphinxcontrib.mockautodoc', # raises error: directive 'automodule' is already registered ... @@ -324,15 +319,6 @@ def setup(app): app.add_js_file("copybutton.js") app.add_css_file("main.css") - -# copy all notebooks to local folder -path_nbs = os.path.join(_PATH_HERE, "notebooks") -if not os.path.isdir(path_nbs): - os.mkdir(path_nbs) -for path_ipynb in glob.glob(os.path.join(_PATH_ROOT, "notebooks", "*.ipynb")): - path_ipynb2 = os.path.join(path_nbs, os.path.basename(path_ipynb)) - shutil.copy(path_ipynb, path_ipynb2) - # copy all examples to local folder path_examples = os.path.join(_PATH_HERE, "..", "examples") if not os.path.isdir(path_examples): @@ -370,44 +356,6 @@ def _package_list_from_file(file): autodoc_mock_imports = MOCK_PACKAGES -# Resolve function -# This function is used to populate the (source-app) links in the API -def linkcode_resolve(domain, info): - def find_source(): - # try to find the file and line number, based on code from numpy: - # https://github.com/numpy/numpy/blob/master/doc/source/conf.py#L286 - obj = sys.modules[info["module"]] - for part in info["fullname"].split("."): - obj = getattr(obj, part) - fname = inspect.getsourcefile(obj) - # https://github.com/rtfd/readthedocs.org/issues/5735 - if any(s in fname for s in ("readthedocs", "rtfd", "checkouts")): - # /home/docs/checkouts/readthedocs.org/user_builds/pytorch_lightning/checkouts/ - # devel/pytorch_lightning/utilities/cls_experiment.py#L26-L176 - path_top = os.path.abspath(os.path.join("..", "..", "..")) - fname = os.path.relpath(fname, start=path_top) - else: - # Local build, imitate master - fname = "master/" + os.path.relpath(fname, start=os.path.abspath("..")) - source, lineno = inspect.getsourcelines(obj) - return fname, lineno, lineno + len(source) - 1 - - if domain != "py" or not info["module"]: - return None - try: - filename = "%s#L%d-L%d" % find_source() - except Exception: - filename = info["module"].replace(".", "/") + ".py" - # import subprocess - # tag = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE, - # universal_newlines=True).communicate()[0][:-1] - branch = filename.split("/")[0] - # do mapping from latest tags to master - branch = {"latest": "master", "stable": "master"}.get(branch, branch) - filename = "/".join([branch] + filename.split("/")[1:]) - return f"https://github.com/{github_user}/{github_repo}/blob/{filename}" - - autosummary_generate = True autodoc_member_order = "groupwise" diff --git a/docs/source-fabric/conf.py b/docs/source-fabric/conf.py index 2428ed46ec80f..847ca4dd403e4 100644 --- a/docs/source-fabric/conf.py +++ b/docs/source-fabric/conf.py @@ -39,11 +39,6 @@ # The full version, including 
alpha/beta/rc tags release = lightning.__version__ -# Options for the linkcode extension -# ---------------------------------- -github_user = "Lightning-AI" -github_repo = project - # -- Project documents ------------------------------------------------------- if _FETCH_S3_ASSETS: @@ -339,44 +334,6 @@ def _package_list_from_file(file): autodoc_mock_imports = MOCK_PACKAGES -# Resolve function -# This function is used to populate the (source) links in the API -def linkcode_resolve(domain, info): - def find_source(): - # try to find the file and line number, based on code from numpy: - # https://github.com/numpy/numpy/blob/master/doc/source/conf.py#L286 - obj = sys.modules[info["module"]] - for part in info["fullname"].split("."): - obj = getattr(obj, part) - fname = inspect.getsourcefile(obj) - # https://github.com/rtfd/readthedocs.org/issues/5735 - if any(s in fname for s in ("readthedocs", "rtfd", "checkouts")): - # /home/docs/checkouts/readthedocs.org/user_builds/pytorch_lightning/checkouts/ - # devel/pytorch_lightning/utilities/cls_experiment.py#L26-L176 - path_top = os.path.abspath(os.path.join("..", "..", "..")) - fname = os.path.relpath(fname, start=path_top) - else: - # Local build, imitate master - fname = "master/" + os.path.relpath(fname, start=os.path.abspath("..")) - source, lineno = inspect.getsourcelines(obj) - return fname, lineno, lineno + len(source) - 1 - - if domain != "py" or not info["module"]: - return None - try: - filename = "%s#L%d-L%d" % find_source() - except Exception: - filename = info["module"].replace(".", "/") + ".py" - # import subprocess - # tag = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE, - # universal_newlines=True).communicate()[0][:-1] - branch = filename.split("/")[0] - # do mapping from latest tags to master - branch = {"latest": "master", "stable": "master"}.get(branch, branch) - filename = "/".join([branch] + filename.split("/")[1:]) - return f"https://github.com/{github_user}/{github_repo}/blob/{filename}" - - autosummary_generate = True autodoc_member_order = "groupwise" From 7874cd08ec50d78e3cc89b92963eda342cf6f35f Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 23 May 2024 13:11:28 +0200 Subject: [PATCH 042/179] [TPU] Fix test assertion error from artifacts (#19825) --- tests/tests_pytorch/accelerators/test_xla.py | 2 +- .../trainer/properties/test_estimated_stepping_batches.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_pytorch/accelerators/test_xla.py b/tests/tests_pytorch/accelerators/test_xla.py index 16fd5491d2ddd..23060486468bd 100644 --- a/tests/tests_pytorch/accelerators/test_xla.py +++ b/tests/tests_pytorch/accelerators/test_xla.py @@ -56,7 +56,7 @@ def test_resume_training_on_cpu(tmp_path): """Checks if training can be resumed from a saved checkpoint on CPU.""" # Train a model on TPU model = BoringModel() - trainer = Trainer(max_epochs=1, accelerator="tpu", devices="auto") + trainer = Trainer(max_epochs=1, accelerator="tpu", devices="auto", default_root_dir=tmp_path) trainer.fit(model) if trainer.world_size != trainer.num_devices: diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 0f9d80424528a..76c0c695b3c02 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -152,6 +152,6 @@ def on_train_start(self): @mock.patch.dict(os.environ, 
os.environ.copy(), clear=True) def test_num_stepping_batches_with_tpu_multi(): """Test stepping batches with the TPU strategy across multiple devices.""" - trainer = Trainer(accelerator="tpu", devices="auto", max_epochs=1) + trainer = Trainer(accelerator="tpu", devices="auto", max_epochs=1, logger=False, enable_checkpointing=False) model = MultiprocessModel() trainer.fit(model) From c09356db1e1ef9da3faedee551a8ba2d8a732d11 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 23 May 2024 14:55:52 +0200 Subject: [PATCH 043/179] (10/10) Support 2D Parallelism - Port Fabric docs to PL (#19899) --- .../advanced/model_parallel/tp_fsdp.rst | 3 +- docs/source-fabric/glossary/index.rst | 5 + docs/source-pytorch/_static/main.css | 10 + .../accelerators/gpu_advanced.rst | 2 +- .../advanced/model_parallel/fsdp.rst | 2 +- .../advanced/model_parallel/index.rst | 162 ++++++++++ .../advanced/model_parallel/tp.rst | 298 ++++++++++++++++++ .../advanced/model_parallel/tp_fsdp.rst | 292 +++++++++++++++++ docs/source-pytorch/api_references.rst | 1 + docs/source-pytorch/common/index.rst | 2 +- docs/source-pytorch/common_usecases.rst | 2 +- docs/source-pytorch/glossary/index.rst | 20 +- .../levels/advanced_level_21.rst | 4 +- examples/pytorch/tensor_parallel/train.py | 4 +- 14 files changed, 795 insertions(+), 12 deletions(-) create mode 100644 docs/source-pytorch/advanced/model_parallel/index.rst create mode 100644 docs/source-pytorch/advanced/model_parallel/tp.rst create mode 100644 docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst diff --git a/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst b/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst index 7bab5e88a3b19..e9a305b56da48 100644 --- a/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst +++ b/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst @@ -21,7 +21,6 @@ We will start off with the same feed forward example model as in the :doc:`Tenso .. code-block:: python - import torch import torch.nn as nn import torch.nn.functional as F @@ -164,7 +163,7 @@ Finally, the tensor parallelism will apply to each group, splitting the sharded model = fabric.setup(model) # Define the optimizer - optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, foreach=True) + optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3) optimizer = fabric.setup_optimizers(optimizer) # Define dataset/dataloader diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst index aec3396b36e7e..f526f5382c448 100644 --- a/docs/source-fabric/glossary/index.rst +++ b/docs/source-fabric/glossary/index.rst @@ -19,6 +19,11 @@ Glossary
+.. displayitem:: + :header: 2D Parallelism + :button_link: ../advanced/model_parallel/tp_fsdp.html + :col_css: col-md-4 + .. displayitem:: :header: Accelerator :button_link: ../fundamentals/accelerators.html diff --git a/docs/source-pytorch/_static/main.css b/docs/source-pytorch/_static/main.css index c1bd8ad0305b7..e3f427599d34e 100644 --- a/docs/source-pytorch/_static/main.css +++ b/docs/source-pytorch/_static/main.css @@ -1,3 +1,13 @@ col { width: 50% !important; } + +ul.no-bullets { + list-style-type: none; /* Remove default bullets */ + padding-left: 0; /* Remove default padding */ +} + +ul.no-bullets li { + padding-left: 0.5em; + text-indent: -2em; +} diff --git a/docs/source-pytorch/accelerators/gpu_advanced.rst b/docs/source-pytorch/accelerators/gpu_advanced.rst index 1c35fa5385f0c..dc1500421812f 100644 --- a/docs/source-pytorch/accelerators/gpu_advanced.rst +++ b/docs/source-pytorch/accelerators/gpu_advanced.rst @@ -22,7 +22,7 @@ For experts pushing the state-of-the-art in model development, Lightning offers :header: Train models with billions of parameters :description: :col_css: col-md-4 - :button_link: ../advanced/model_parallel.html + :button_link: ../advanced/model_parallel/index.html :height: 150 :tag: advanced diff --git a/docs/source-pytorch/advanced/model_parallel/fsdp.rst b/docs/source-pytorch/advanced/model_parallel/fsdp.rst index 4e4c288e9ea0b..5be8043c5d548 100644 --- a/docs/source-pytorch/advanced/model_parallel/fsdp.rst +++ b/docs/source-pytorch/advanced/model_parallel/fsdp.rst @@ -20,7 +20,7 @@ The memory consumption for training is generally made up of | When the sum of these memory components exceed the VRAM of a single GPU, regular data-parallel training (DDP) can no longer be employed. -One of the methods that can alleviate this limitation is called **model-parallel** training, and known as **FSDP** in PyTorch, and in this guide, you will learn how to effectively scale large models with it. +One of the methods that can alleviate this limitation is called **Fully Sharded Data Parallel (FSDP)**, and in this guide, you will learn how to effectively scale large models with it. ---- diff --git a/docs/source-pytorch/advanced/model_parallel/index.rst b/docs/source-pytorch/advanced/model_parallel/index.rst new file mode 100644 index 0000000000000..75c97bfb7c98e --- /dev/null +++ b/docs/source-pytorch/advanced/model_parallel/index.rst @@ -0,0 +1,162 @@ +########################################### +Training models with billions of parameters +########################################### + +Today, large models with billions of parameters are trained with many GPUs across several machines in parallel. +Even a single H100 GPU with 80 GB of VRAM (one of the biggest today) is not enough to train just a 30B parameter model (even with batch size 1 and 16-bit precision). +The memory consumption for training is generally made up of + +1. the model parameters, +2. the layer activations (forward), +3. the gradients (backward), +4. the optimizer states (e.g., Adam has two additional exponential averages per parameter) and +5. model outputs and loss. + +| + +When the sum of these memory components exceed the VRAM of a single GPU, regular data-parallel training (DDP) can no longer be employed. +To alleviate this limitation, we need to introduce **Model Parallelism**. + + +---- + + +************************** +What is Model Parallelism? +************************** + +There are different types of model parallelism, each with its own trade-offs. 
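Before comparing the different styles, it helps to quantify the problem they solve. The following is a rough back-of-the-envelope sketch for the 30B-parameter claim above, assuming plain Adam and 16-bit values everywhere (activations ignored; real setups often keep optimizer states in fp32, which is even larger):

.. code-block:: python

    # Rough VRAM estimate for training a 30B-parameter model in 16-bit precision
    params = 30e9
    bytes_per_value = 2  # bf16/fp16

    weights = params * bytes_per_value          # 60 GB
    gradients = params * bytes_per_value        # 60 GB
    adam_states = 2 * params * bytes_per_value  # two exponential averages: 120 GB

    total = (weights + gradients + adam_states) / 1e9
    print(f"{total:.0f} GB")  # 240 GB, already 3x the 80 GB of a single H100

Even before counting activations, the parameter, gradient, and optimizer states alone far exceed a single GPU, which is why these states themselves must be distributed.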
+ +**Fully Sharded Data Parallelism (FSDP)** shards both model parameters and optimizer states across multiple GPUs, significantly reducing memory usage per GPU. +This method, while highly memory-efficient, involves frequent synchronization between GPUs, introducing communication overhead and complexity in implementation. +FSDP is advantageous when memory constraints are the primary issue, provided there are high-bandwidth interconnects to minimize latency. + +**Tensor Parallelism (TP)** splits individual tensors across GPUs, enabling fine-grained distribution of computation and memory. +It scales well to a large number of GPUs but requires synchronization of tensor slices after each operation, which adds communication overhead. +TP is most effective with models that have many linear layers (LLMs), offering a balance between memory distribution and computational efficiency. + +**Pipeline Parallelism (PP)** divides model layers into segments, each processed by different GPUs, reducing memory load per GPU and minimizing inter-GPU communication to pipeline stage boundaries. +While this reduces communication overhead, it can introduce pipeline bubbles where some GPUs idle, leading to potential inefficiencies. +PP is ideal for deep models with sequential architectures (LLMs), though it requires careful management to minimize idle times. + +Choosing a model parallelism style involves considering model architecture, hardware interconnects, and training efficiency. +In practice, hybrid approaches combining FSDP, TP, and PP are often used to leverage the strengths of each method while mitigating their weaknesses. + + +---- + + +*********** +Get started +*********** + +.. raw:: html + +
+    <div class="display-card-container">
+        <div class="row">
+
+.. displayitem::
+   :header: Fully-Sharded Data Parallel (FSDP)
+   :description: Get started training large multi-billion parameter models with minimal code changes
+   :col_css: col-md-4
+   :button_link: fsdp.html
+   :height: 180
+   :tag: advanced
+
+.. displayitem::
+   :header: Tensor Parallel (TP)
+   :description: Learn the principles behind tensor parallelism and how to apply it to your model
+   :col_css: col-md-4
+   :button_link: tp.html
+   :height: 180
+   :tag: advanced
+
+.. displayitem::
+   :header: 2D Parallel (FSDP + TP)
+   :description: Combine Tensor Parallelism with FSDP (2D Parallel) to train efficiently on 100s of GPUs
+   :button_link: tp_fsdp.html
+   :col_css: col-md-4
+   :height: 180
+   :tag: advanced
+
+.. displayitem::
+   :header: Pipeline Parallelism
+   :description: Coming soon
+   :col_css: col-md-4
+   :height: 180
+   :tag: advanced
+
+.. raw:: html
+
+        </div>
+    </div>
+ + +---- + + +********************* +Parallelisms compared +********************* + + +**Distributed Data Parallel (DDP)** + +.. raw:: html + +
+    <ul class="no-bullets">
+      <li>✅ &nbsp; No model code changes required </li>
+      <li>✅ &nbsp; Training with very large batch sizes (batch size scales with number of GPUs) </li>
+      <li>❗ &nbsp; Model (weights, optimizer state, activations / gradients) must fit into a GPU </li>
+    </ul>
+
+|
+
+**Fully-Sharded Data Parallel (FSDP)**
+
+.. raw:: html
+
+    <ul class="no-bullets">
+      <li>✅ &nbsp; No model code changes required </li>
+      <li>✅ &nbsp; Training with very large batch sizes (batch size scales with number of GPUs) </li>
+      <li>✅ &nbsp; Model (weights, optimizer state, gradients) gets distributed across all GPUs </li>
+      <li>❗ &nbsp; A single FSDP layer when gathered during forward/backward must fit into the GPU </li>
+      <li>❗ &nbsp; Requires some knowledge about model architecture to set configuration options correctly </li>
+      <li>❗ &nbsp; Requires very fast networking (multi-node), data transfers between GPUs often become a bottleneck </li>
+    </ul>
+
+|
+
+**Tensor Parallel (TP)**
+
+.. raw:: html
+
+    <ul class="no-bullets">
+      <li>❗ &nbsp; Model code changes required </li>
+      <li>🤔 &nbsp; Fixed global batch size (does not scale with number of GPUs) </li>
+      <li>✅ &nbsp; Model (weights, optimizer state, activations) gets distributed across all GPUs </li>
+      <li>✅ &nbsp; Parallelizes the computation of layers that are too large to fit onto a single GPU </li>
+      <li>❗ &nbsp; Requires lots of knowledge about model architecture to set configuration options correctly </li>
+      <li>🤔 &nbsp; Less GPU data transfers required, but data transfers don't overlap with computation like in FSDP </li>
+    </ul>
+
+|
+
+**2D Parallel (FSDP + TP)**
+
+.. raw:: html
+
+    <ul class="no-bullets">
+      <li>❗ &nbsp; Model code changes required </li>
+      <li>✅ &nbsp; Training with very large batch sizes (batch size scales across data-parallel dimension) </li>
+      <li>✅ &nbsp; Model (weights, optimizer state, activations) gets distributed across all GPUs </li>
+      <li>✅ &nbsp; Parallelizes the computation of layers that are too large to fit onto a single GPU </li>
+      <li>❗ &nbsp; Requires lots of knowledge about model architecture to set configuration options correctly </li>
+      <li>✅ &nbsp; Tensor-parallel within machines and FSDP across machines reduces data transfer bottlenecks </li>
+    </ul>
+ +| + +PyTorch Lightning supports all the parallelisms mentioned above natively through PyTorch, with the exception of pipeline parallelism (PP) which is not yet supported. + +| diff --git a/docs/source-pytorch/advanced/model_parallel/tp.rst b/docs/source-pytorch/advanced/model_parallel/tp.rst new file mode 100644 index 0000000000000..1f0de56298b7b --- /dev/null +++ b/docs/source-pytorch/advanced/model_parallel/tp.rst @@ -0,0 +1,298 @@ +################## +Tensor Parallelism +################## + +Tensor parallelism is a technique for training large models by distributing layers across multiple devices, improving memory management and efficiency by reducing inter-device communication. +However, for smaller models, the communication overhead may outweigh its benefits. +This method is most effective for models with very large layers, significantly enhancing performance and memory efficiency. + +.. note:: This is an experimental feature. + + +---- + + +******************************************* +How to exploit parallelism in linear layers +******************************************* + +In tensor parallelism, the computation of a linear layer can be split up across GPUs. +This saves memory because each GPU only needs to hold a portion of the weight matrix. +There are two ways a linear layer can be split up: row-wise or column-wise. + +Column-wise Parallel +==================== + +In a column-wise parallel layer, the weight matrix is split evenly along the column dimension. +Each GPU is sent the same input, and computes a regular matrix multiplication with its portion of the weight matrix. +At the end, the outputs from each GPU can be concatenated to form the final output. + + +.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/fabric/tp-colwise.jpeg + :alt: Left: Regular matrix multiplication. Right: Column-wise parallel matrix multiplication split across two GPUs. + :width: 100% + +Row-wise Parallel +================= + +Row-wise parallelism divides the rows of the weight matrix evenly across devices. +In addition, the input gets split the same way along the inner dimension (because the weight matrix now has fewer rows). +Each GPU then performs a regular matrix multiplication with its portion of the weight matrix and inputs. +At the end, the outputs from each GPU can be summed up element-wise (all-reduce) to form the final output. + +.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/fabric/tp-rowwise.jpeg + :alt: Left: Regular matrix multiplication. Right: Row-wise parallel matrix multiplication split across two GPUs. + :width: 100% + + +Combined Column- and Row-wise Parallel +====================================== + +When there are multiple linear layers in sequence, e.g., in a MLP or a Transformer, the column-wise and row-wise parallel styles can be combined for maximum effect. +Instead of concatenating the output of the column-wise parallel layer, we keep the outputs separate and feed them directly to the row-wise parallel layer. +This way, we avoid costly data transfers between GPUs. + +.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/fabric/tp-combined.jpeg + :alt: Top: Two regular matrix multiplications in sequence. Bottom: Combined column-wise and row-wise parallel matrix multiplications across two GPUs. + :width: 100% + +Note that activation functions between the layers can still be applied without additional communication because they are element-wise, but are not shown in the figures for simplicity. 
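For intuition, the equivalence of the combined split can be checked numerically on a single process. The following is a minimal sketch in plain PyTorch, with the two GPUs simulated by slicing the weight matrices (the element-wise activation is omitted, matching the figures):

.. code-block:: python

    import torch

    dim, hidden = 8, 16
    x = torch.randn(4, dim)
    w1 = torch.randn(hidden, dim)  # candidate for column-wise parallelism
    w2 = torch.randn(dim, hidden)  # candidate for row-wise parallelism

    # Reference: two regular matmuls in sequence (nn.Linear computes x @ w.T)
    reference = x @ w1.T @ w2.T

    # Column-wise: split w1 along its output dimension, one shard per "GPU"
    w1_a, w1_b = w1.chunk(2, dim=0)
    h_a = x @ w1_a.T  # stays on "GPU 0"
    h_b = x @ w1_b.T  # stays on "GPU 1"

    # Row-wise: split w2 along its input dimension so each shard consumes
    # the partial output produced on the same "GPU"
    w2_a, w2_b = w2.chunk(2, dim=1)
    output = h_a @ w2_a.T + h_b @ w2_b.T  # the sum plays the role of the all-reduce

    torch.testing.assert_close(output, reference)

No data moves between the two simulated devices until the final sum, which is exactly the property the combined column-/row-wise scheme exploits.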
+ + +---- + + +*********************************** +Apply tensor parallelism to a model +*********************************** + +To apply tensor parallelism to a LightningModule, you need a good understanding of your model's architecture to make the decision of where to apply the parallel styles you've seen above. +Let's start with a simple MLP toy example: + +.. code-block:: python + + import torch.nn as nn + import torch.nn.functional as F + + class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +This model has three linear layers. Layers ``w1`` and ``w3`` produce an output that is later multiplied element-wise. +That output is then fed into layer ``w2``. +Therefore, ``w1`` and ``w3`` are suitable candidates for column-wise parallelism, because their output(s) can easily be combined with ``w2`` in row-wise fashion. + +Now, when implementing the LightningModule, override the :meth:`~lightning.pytorch.core.hooks.ModelHooks.configure_model` hook and apply the tensor parallelism to the model: + +.. code-block:: python + + import lightning as L + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel + from torch.distributed.tensor.parallel import parallelize_module + + + class LitModel(L.LightningModule): + def __init__(self): + super().__init__() + self.model = FeedForward(8192, 8192) + + def configure_model(self): + # Lightning will set up a `self.device_mesh` for you + tp_mesh = self.device_mesh["tensor_parallel"] + # Use PyTorch's distributed tensor APIs to parallelize the model + plan = { + "w1": ColwiseParallel(), + "w2": RowwiseParallel(), + "w3": ColwiseParallel(), + } + parallelize_module(self.model, tp_mesh, plan) + + def training_step(self, batch): + ... + + def configure_optimizers(self): + ... + + def train_dataloader(self): + ... + +By writing the parallelization code in this special hook rather than hardcoding it into the model, we keep the original source code clean and maintainable. +Next, configure the :class:`~lightning.pytorch.strategies.model_parallel.ModelParallelStrategy` in the Trainer: + +.. code-block:: python + + import lightning as L + from lightning.pytorch.strategies import ModelParallelStrategy + + # 1. Create the strategy + strategy = ModelParallelStrategy() + + # 2. Configure devices and set the strategy in Trainer + trainer = L.Trainer(accelerator="cuda", devices=2, strategy=strategy) + trainer.fit(...) + +No other changes to your training code are necessary at this point. +When ``trainer.fit(...)`` (or ``validate()``, ``test``, etc.) gets called, the Trainer will call your :meth:`~lightning.pytorch.core.hooks.ModelHooks.configure_model` hook before the training loop starts. + +.. collapse:: Full training example (requires at least 2 GPUs). + + .. 
code-block:: python

+
+        import torch
+        import torch.nn as nn
+        import torch.nn.functional as F
+
+        from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel
+        from torch.distributed.tensor.parallel import parallelize_module
+
+        import lightning as L
+        from lightning.pytorch.demos.boring_classes import RandomDataset
+        from lightning.pytorch.strategies import ModelParallelStrategy
+
+
+        class FeedForward(nn.Module):
+            def __init__(self, dim, hidden_dim):
+                super().__init__()
+                self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+                self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+                self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+            def forward(self, x):
+                return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+        class LitModel(L.LightningModule):
+            def __init__(self):
+                super().__init__()
+                self.model = FeedForward(8192, 8192)
+
+            def configure_model(self):
+                if self.device_mesh is None:
+                    return
+
+                # Lightning will set up a `self.device_mesh` for you
+                tp_mesh = self.device_mesh["tensor_parallel"]
+                # Use PyTorch's distributed tensor APIs to parallelize the model
+                plan = {
+                    "w1": ColwiseParallel(),
+                    "w2": RowwiseParallel(),
+                    "w3": ColwiseParallel(),
+                }
+                parallelize_module(self.model, tp_mesh, plan)
+
+            def training_step(self, batch):
+                output = self.model(batch)
+                loss = output.sum()
+                return loss
+
+            def configure_optimizers(self):
+                return torch.optim.AdamW(self.model.parameters(), lr=3e-3)
+
+            def train_dataloader(self):
+                # Trainer configures the sampler automatically for you such that
+                # all batches in a tensor-parallel group are identical
+                dataset = RandomDataset(8192, 64)
+                return torch.utils.data.DataLoader(dataset, batch_size=8, num_workers=2)
+
+
+        strategy = ModelParallelStrategy()
+        trainer = L.Trainer(
+            accelerator="cuda",
+            devices=2,
+            strategy=strategy,
+            max_epochs=1,
+        )
+
+        model = LitModel()
+        trainer.fit(model)
+
+        trainer.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")
+
+|
+
+When measuring the peak memory consumption, we should see that doubling the number of GPUs reduces the memory consumption roughly by half:
+
+
+.. list-table::
+   :widths: 20 20 20 20 20
+   :header-rows: 1
+
+   * -
+     - 1 GPU (no TP)
+     - 2 GPUs
+     - 4 GPUs
+     - 8 GPUs
+   * - Memory per GPU
+     - 4.04 GB
+     - 2.03 GB
+     - 1.02 GB
+     - 0.60 GB
+
+Beyond this toy example, we recommend you study our `LLM Tensor Parallel Example (Llama 3) <https://github.com/Lightning-AI/pytorch-lightning/tree/master/examples/pytorch/tensor_parallel>`_.
+
+
+----
+
+
+***************************
+Data-loading considerations
+***************************
+
+In a tensor-parallelized model, it is important that the model receives an identical input on each GPU.
+Otherwise, training won't converge.
+Therefore, when you shuffle data in your dataset or data loader, or when applying randomized transformations/augmentations in your data, ensure that the seed is set appropriately.
+
+Given this requirement, your global batch size will be limited by the memory of a single GPU.
+To scale the batch size and accelerate training further, you can combine :doc:`tensor parallelism with data parallelism (in particular, FSDP) <tp_fsdp>`.
+
+
+----
+
+
+**********
+Next steps
+**********
+
+.. raw:: html
+
+    <div class="display-card-container">
+        <div class="row">
+
+.. displayitem::
+   :header: LLM Tensor Parallel Example
+   :description: Full example how to apply tensor parallelism to a large language model (Llama 3)
+   :col_css: col-md-4
+   :button_link: https://github.com/Lightning-AI/pytorch-lightning/tree/master/examples/pytorch/tensor_parallel
+   :height: 160
+   :tag: advanced
+
+.. displayitem::
+   :header: 2D Parallel (FSDP + TP)
+   :description: Combine Tensor Parallelism with FSDP (2D Parallel) to train efficiently on 100s of GPUs
+   :button_link: tp_fsdp.html
+   :col_css: col-md-4
+   :height: 160
+   :tag: advanced
+
+.. displayitem::
+   :header: PyTorch API Reference
+   :description: Explore the official PyTorch Tensor Parallel APIs
+   :button_link: https://pytorch.org/docs/stable/distributed.tensor.parallel.html
+   :col_css: col-md-4
+   :height: 160
+   :tag: advanced
+
+
+.. raw:: html
+
+        </div>
+    </div>
+ +| diff --git a/docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst b/docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst new file mode 100644 index 0000000000000..956863a61e523 --- /dev/null +++ b/docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst @@ -0,0 +1,292 @@ +########################################## +2D Parallelism (Tensor Parallelism + FSDP) +########################################## + +2D Parallelism combines Tensor Parallelism (TP) and Fully Sharded Data Parallelism (FSDP) to leverage the memory efficiency of FSDP and the computational scalability of TP. +This hybrid approach balances the trade-offs of each method, optimizing memory usage and minimizing communication overhead, enabling the training of extremely large models on large GPU clusters. + +The :doc:`Tensor Parallelism documentation ` and a general understanding of `FSDP `_ are a prerequisite for this tutorial. + +.. note:: This is an experimental feature. + + +---- + + +********************* +Enable 2D parallelism +********************* + +We will start off with the same feed forward example model as in the :doc:`Tensor Parallelism tutorial `. + +.. code-block:: python + + import torch.nn as nn + import torch.nn.functional as F + + + class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + +Next, we implement the LightningModule and override the :meth:`~lightning.pytorch.core.hooks.ModelHooks.configure_model` that applies the desired parallelism to our model. + +.. code-block:: python + + import lightning as L + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel + from torch.distributed.tensor.parallel import parallelize_module + from torch.distributed._composable.fsdp.fully_shard import fully_shard + + + class LitModel(L.LightningModule): + def __init__(self): + super().__init__() + self.model = FeedForward(8192, 8192) + + def configure_model(self): + # Lightning will set up a `self.device_mesh` for you + # Here, it is 2-dimensional + tp_mesh = self.device_mesh["tensor_parallel"] + dp_mesh = self.device_mesh["data_parallel"] + + if tp_mesh.size() > 1: + # Use PyTorch's distributed tensor APIs to parallelize the model + plan = { + "w1": ColwiseParallel(), + "w2": RowwiseParallel(), + "w3": ColwiseParallel(), + } + parallelize_module(self.model, tp_mesh, plan) + + if dp_mesh.size() > 1: + # Use PyTorch's FSDP2 APIs to parallelize the model + fully_shard(self.model.w1, mesh=dp_mesh) + fully_shard(self.model.w2, mesh=dp_mesh) + fully_shard(self.model.w3, mesh=dp_mesh) + fully_shard(self.model, mesh=dp_mesh) + +By writing the parallelization code in this special hook rather than hardcoding it into the model, we keep the original source code clean and maintainable. +In addition to the tensor-parallel code from the :doc:`Tensor Parallelism tutorial `, this implementation now also shards the model's parameters using FSDP along the data-parallel dimension. + +Finally, configure the :class:`~lightning.pytorch.strategies.model_parallel.ModelParallelStrategy` and configure the data-parallel and tensor-parallel sizes: + +.. code-block:: python + + import lightning as L + from lightning.pytorch.strategies import ModelParallelStrategy + + # 1. 
Create the strategy + strategy = ModelParallelStrategy( + # Define the size of the 2D parallelism + # Set these to "auto" (default) to apply TP intra-node and FSDP inter-node + data_parallel_size=2, + tensor_parallel_size=2, + ) + + # 2. Configure devices and set the strategy in Trainer + trainer = L.Trainer(accelerator="cuda", devices=4, strategy=strategy) + trainer.fit(...) + + +In this example with 4 GPUs, the Trainer will create a device mesh that groups GPU 0-1 and GPU 2-3 (2 groups because ``data_parallel_size=2``, and 2 GPUs per group because ``tensor_parallel_size=2``). +Later on when ``trainer.fit(model)`` is called, each layer wrapped with FSDP (``fully_shard``) will be split into two shards, one for the GPU 0-1 group, and one for the GPU 2-3 group. +Finally, the tensor parallelism will apply to each group, splitting the sharded tensor across the GPUs within each group. + +.. collapse:: Full training example (requires at least 4 GPUs). + + .. code-block:: python + + import torch + import torch.nn as nn + import torch.nn.functional as F + + from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel + from torch.distributed.tensor.parallel import parallelize_module + from torch.distributed._composable.fsdp.fully_shard import fully_shard + + import lightning as L + from lightning.pytorch.demos.boring_classes import RandomDataset + from lightning.pytorch.strategies import ModelParallelStrategy + + + class FeedForward(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + + class LitModel(L.LightningModule): + def __init__(self): + super().__init__() + self.model = FeedForward(8192, 8192) + + def configure_model(self): + if self.device_mesh is None: + return + + # Lightning will set up a `self.device_mesh` for you + # Here, it is 2-dimensional + tp_mesh = self.device_mesh["tensor_parallel"] + dp_mesh = self.device_mesh["data_parallel"] + + if tp_mesh.size() > 1: + # Use PyTorch's distributed tensor APIs to parallelize the model + plan = { + "w1": ColwiseParallel(), + "w2": RowwiseParallel(), + "w3": ColwiseParallel(), + } + parallelize_module(self.model, tp_mesh, plan) + + if dp_mesh.size() > 1: + # Use PyTorch's FSDP2 APIs to parallelize the model + fully_shard(self.model.w1, mesh=dp_mesh) + fully_shard(self.model.w2, mesh=dp_mesh) + fully_shard(self.model.w3, mesh=dp_mesh) + fully_shard(self.model, mesh=dp_mesh) + + + def training_step(self, batch): + output = self.model(batch) + loss = output.sum() + return loss + + def configure_optimizers(self): + return torch.optim.AdamW(self.model.parameters(), lr=3e-3) + + def train_dataloader(self): + # Trainer configures the sampler automatically for you such that + # all batches in a tensor-parallel group are identical + dataset = RandomDataset(8192, 64) + return torch.utils.data.DataLoader(dataset, batch_size=8, num_workers=2) + + + strategy = ModelParallelStrategy( + data_parallel_size=2, + tensor_parallel_size=2, + ) + trainer = L.Trainer( + accelerator="cuda", + devices=4, + strategy=strategy, + max_epochs=1, + ) + + model = LitModel() + trainer.fit(model) + + trainer.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") + + +| + +Beyond this toy example, we recommend you study our `LLM 2D Parallel Example (Llama 3) `_. 
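To make the 4-GPU layout above concrete, here is a sketch of the 2-dimensional device mesh the strategy sets up for ``data_parallel_size=2`` and ``tensor_parallel_size=2``, shown with PyTorch's public mesh API for illustration (run under ``torchrun --nproc_per_node=4``):

.. code-block:: python

    from torch.distributed.device_mesh import init_device_mesh

    # 4 ranks arranged as a 2x2 grid: [[0, 1], [2, 3]]
    mesh = init_device_mesh("cuda", (2, 2), mesh_dim_names=("data_parallel", "tensor_parallel"))

    # Tensor-parallel groups are the rows: {0, 1} and {2, 3}
    tp_mesh = mesh["tensor_parallel"]
    # FSDP shards across the columns: {0, 2} and {1, 3}
    dp_mesh = mesh["data_parallel"]

These correspond to the submeshes that ``self.device_mesh["tensor_parallel"]`` and ``self.device_mesh["data_parallel"]`` expose inside ``configure_model``.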
+ + +---- + + +******************* +Effective use cases +******************* + +In the toy example above, the parallelization is configured to work within a single machine across multiple GPUs. +However, in practice the main use case for 2D parallelism is in multi-node training, where one can effectively combine both methods to maximize throughput and model scale. +Since tensor-parallelism requires blocking collective calls, fast GPU data transfers are essential to keep throughput high and therefore TP is typically applied across GPUs within a machine. +On the other hand, FSDP by design has the advantage that it can overlap GPU transfers with the computation (it can prefetch layers). +Hence, combining FSDP for inter-node parallelism and TP for intra-node parallelism is generally a good strategy to minimize both the latency and network bandwidth usage, making it possible to scale to much larger models than is possible with FSDP alone. + + +.. code-block:: python + + from lightning.pytorch.strategies import ModelParallelStrategy + + strategy = ModelParallelStrategy( + # Default is "auto" + # Applies TP intra-node and DP inter-node + data_parallel_size="auto", + tensor_parallel_size="auto", + ) + + +---- + + +*************************** +Data-loading considerations +*************************** + +In a tensor-parallelized model, it is important that the model receives an identical input on each GPU that participates in the same tensor-parallel group. +However, across the data-parallel dimension, the inputs should be different. +In other words, if TP is applied within a node, and FSDP across nodes, each node must receive a different batch, but every GPU within the node gets the same batch of data. + +If you use a PyTorch data loader, the Trainer will automatically handle this for you by configuring the distributed sampler. +However, when you shuffle data in your dataset or data loader, or when applying randomized transformations/augmentations in your data, you must still ensure that the seed is set appropriately. + + +.. code-block:: python + + import lightning as L + + trainer = L.Trainer(...) + + # Define dataset/dataloader + # If there is randomness/augmentation in the dataset, fix the seed + dataset = MyDataset(seed=42) + dataloader = DataLoader(dataset, batch_size=8, shuffle=True) + + # PyTorch Lightning configures the sampler automatically for you such that + # all batches in a tensor-parallel group are identical, + # while still sharding the dataset across the data-parallel group + trainer.fit(model, dataloader) + + for i, batch in enumerate(dataloader): + ... + + +---- + + +********** +Next steps +********** + +.. raw:: html + +
+

.. displayitem::
    :header: LLM 2D Parallel Example
    :description: Full example showing how to combine TP + FSDP in a large language model (Llama 3)
    :col_css: col-md-4
    :button_link: https://github.com/Lightning-AI/pytorch-lightning/tree/master/examples/pytorch/tensor_parallel
    :height: 160
    :tag: advanced

.. displayitem::
    :header: Pipeline Parallelism
    :description: Coming soon
    :col_css: col-md-4
    :height: 160
    :tag: advanced


.. raw:: html
+
+ +| diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index 3542001a6e973..1f58f6ac23dd5 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -214,6 +214,7 @@ strategies DDPStrategy DeepSpeedStrategy FSDPStrategy + ModelParallelStrategy ParallelStrategy SingleDeviceStrategy SingleDeviceXLAStrategy diff --git a/docs/source-pytorch/common/index.rst b/docs/source-pytorch/common/index.rst index 17bab965be751..738e971aec532 100644 --- a/docs/source-pytorch/common/index.rst +++ b/docs/source-pytorch/common/index.rst @@ -142,7 +142,7 @@ How-to Guides .. displayitem:: :header: Train models with billions of parameters :description: Scale GPU training for models with billions of parameters - :button_link: ../advanced/model_parallel.html + :button_link: ../advanced/model_parallel/index.html :col_css: col-md-4 :height: 180 diff --git a/docs/source-pytorch/common_usecases.rst b/docs/source-pytorch/common_usecases.rst index 0b9447a414911..2891d264d885d 100644 --- a/docs/source-pytorch/common_usecases.rst +++ b/docs/source-pytorch/common_usecases.rst @@ -109,7 +109,7 @@ Customize and extend Lightning for things like custom hardware or distributed st :header: Train models with billions of parameters :description: Scale GPU training to models with billions of parameters :col_css: col-md-12 - :button_link: advanced/model_parallel.html + :button_link: advanced/model_parallel/index.html :height: 100 .. displayitem:: diff --git a/docs/source-pytorch/glossary/index.rst b/docs/source-pytorch/glossary/index.rst index 2a5fbbfe0afce..6b5e4b12b307f 100644 --- a/docs/source-pytorch/glossary/index.rst +++ b/docs/source-pytorch/glossary/index.rst @@ -3,6 +3,7 @@ :maxdepth: 1 :hidden: + 2D Parallelism <../advanced/model_parallel/tp_fsdp> Accelerators <../extensions/accelerator> Callback <../extensions/callbacks> Checkpointing <../common/checkpointing> @@ -28,7 +29,7 @@ TPU <../accelerators/tpu> Metrics Model <../model/build_model.rst> - Model Parallel <../advanced/model_parallel> + Model Parallel <../advanced/model_parallel/index> Plugins <../extensions/plugins> Progress bar <../common/progress_bar> Production <../deploy/production_advanced> @@ -43,6 +44,7 @@ Style guide <../starter/style_guide> SWA <../advanced/training_tricks> SLURM <../clouds/cluster_advanced> + Tensor Parallel <../advanced/model_parallel/tp> Transfer learning <../advanced/transfer_learning> Trainer <../common/trainer> TorchRun (TorchElastic) <../clouds/cluster_intermediate_2> @@ -58,6 +60,13 @@ Glossary
+.. displayitem:: + :header: 2D Parallelism + :description: Combine Tensor Parallelism with FSDP (2D Parallel) to train efficiently on 100s of GPUs + :col_css: col-md-12 + :button_link: ../advanced/model_parallel/tp_fsdp.html + :height: 100 + .. displayitem:: :header: Accelerators :description: Accelerators connect the Trainer to hardware to train faster @@ -230,7 +239,7 @@ Glossary :header: Model Parallelism :description: A way to scale training that splits a model between multiple devices. :col_css: col-md-12 - :button_link: ../advanced/model_parallel.html + :button_link: ../advanced/model_parallel/index.html :height: 100 .. displayitem:: @@ -331,6 +340,13 @@ Glossary :button_link: ../clouds/cluster_advanced.html :height: 100 +.. displayitem:: + :header: Tensor Parallelism + :description: Parallelize the computation of model layers across multiple GPUs, reducing memory usage and communication overhead + :col_css: col-md-12 + :button_link: ../advanced/tp.html + :height: 100 + .. displayitem:: :header: Transfer learning :description: Using pre-trained models to improve learning diff --git a/docs/source-pytorch/levels/advanced_level_21.rst b/docs/source-pytorch/levels/advanced_level_21.rst index 6c07d1d037465..68262ca9e223c 100644 --- a/docs/source-pytorch/levels/advanced_level_21.rst +++ b/docs/source-pytorch/levels/advanced_level_21.rst @@ -25,9 +25,9 @@ Scale to billions of parameters with multiple distributed strategies. .. displayitem:: :header: Train models with billions of parameters - :description: Scale to billions of params on GPUs with FSDP or Deepspeed. + :description: Scale to billions of params on GPUs with FSDP, TP or Deepspeed. :col_css: col-md-6 - :button_link: ../advanced/model_parallel.html + :button_link: ../advanced/model_parallel/index.html :height: 150 :tag: advanced diff --git a/examples/pytorch/tensor_parallel/train.py b/examples/pytorch/tensor_parallel/train.py index 37556cd087a7b..6a91e1242e4af 100644 --- a/examples/pytorch/tensor_parallel/train.py +++ b/examples/pytorch/tensor_parallel/train.py @@ -9,7 +9,7 @@ from torch.utils.data import DataLoader -class Llama2(L.LightningModule): +class Llama3(L.LightningModule): def __init__(self): super().__init__() self.model_args = ModelArgs(vocab_size=32000) @@ -63,7 +63,7 @@ def train(): # Initialize the model with trainer.init_module(empty_init=True): - model = Llama2() + model = Llama3() trainer.print(f"Number of model parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.1f} B") trainer.print("Starting training ...") From 896c2a656ad2db3278ec11520aed04e378f4462b Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 23 May 2024 19:43:46 +0200 Subject: [PATCH 044/179] Error for unsupported precision types with ModelParallelStrategy (#19902) --- src/lightning/fabric/connector.py | 7 +++++++ .../trainer/connectors/accelerator_connector.py | 10 ++++++++++ .../strategies/test_model_parallel_integration.py | 15 ++------------- tests/tests_fabric/test_connector.py | 14 ++++++++++++++ .../connectors/test_accelerator_connector.py | 13 +++++++++++++ 5 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/lightning/fabric/connector.py b/src/lightning/fabric/connector.py index 3a7334b3505ab..f677893100351 100644 --- a/src/lightning/fabric/connector.py +++ b/src/lightning/fabric/connector.py @@ -62,6 +62,7 @@ ) from lightning.fabric.strategies.ddp import _DDP_FORK_ALIASES from lightning.fabric.strategies.fsdp import _FSDP_ALIASES, FSDPStrategy +from lightning.fabric.strategies.model_parallel import 
ModelParallelStrategy
 from lightning.fabric.utilities import rank_zero_info, rank_zero_warn
 from lightning.fabric.utilities.device_parser import _determine_root_gpu_device
 from lightning.fabric.utilities.imports import _IS_INTERACTIVE
@@ -460,6 +461,12 @@ def _check_and_init_precision(self) -> Precision:
             return DeepSpeedPrecision(self._precision_input)  # type: ignore
         if isinstance(self.strategy, FSDPStrategy):
             return FSDPPrecision(precision=self._precision_input)  # type: ignore[arg-type]
+        mp_precision_supported = ("32-true", "bf16-mixed", "bf16-true", "16-true")
+        if isinstance(self.strategy, ModelParallelStrategy) and self._precision_input not in mp_precision_supported:
+            raise ValueError(
+                f"The `ModelParallelStrategy` does not support `Fabric(..., precision={self._precision_input!r})`."
+                f" Choose a different precision among: {', '.join(mp_precision_supported)}."
+            )
         if self._precision_input in ("16-true", "bf16-true"):
             return HalfPrecision(self._precision_input)  # type: ignore
         if self._precision_input == "32-true":
diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
index 6a350030ea0f7..1c97a223b129e 100644
--- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
+++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py
@@ -529,6 +529,16 @@ def _validate_precision_choice(self) -> None:
             self.accelerator, CUDAAccelerator
         ):
             raise RuntimeError("Bitsandbytes is only supported on CUDA GPUs.")
+        mp_precision_supported = ("32-true", "bf16-mixed", "bf16-true", "16-true")
+        if (
+            isinstance(self._strategy_flag, ModelParallelStrategy)
+            and self._precision_flag not in mp_precision_supported
+        ):
+            raise ValueError(
+                f"The `ModelParallelStrategy` does not support `Trainer(..., precision={self._precision_flag!r})`."
+                f" Choose a different precision among: {', '.join(mp_precision_supported)}."
+ ) + if _habana_available_and_importable(): from lightning_habana import HPUAccelerator diff --git a/tests/tests_fabric/strategies/test_model_parallel_integration.py b/tests/tests_fabric/strategies/test_model_parallel_integration.py index 75fb1003cfad7..6db31d00f7397 100644 --- a/tests/tests_fabric/strategies/test_model_parallel_integration.py +++ b/tests/tests_fabric/strategies/test_model_parallel_integration.py @@ -241,9 +241,7 @@ def _train(fabric, model=None, optimizer=None): @pytest.mark.parametrize( "precision", [ - pytest.param( - "16-mixed", marks=pytest.mark.xfail(reason="Precision plugin does not implement ShardedGradScaler yet") - ), + pytest.param("32-true"), pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True)), ], ) @@ -548,7 +546,6 @@ def _parallelize_single_linear_tp_fsdp2(model, device_mesh): "precision", [ "32-true", - pytest.param("16-mixed"), pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True)), ], ) @@ -556,18 +553,10 @@ def _parallelize_single_linear_tp_fsdp2(model, device_mesh): "clip_type", [ pytest.param("norm", marks=pytest.mark.skip("Gradient clipping by norm is not correct.")), - pytest.param( - "val", - marks=pytest.mark.xfail( - raises=RecursionError, strict=False, reason="Recursion error when clipping DTensor" - ), - ), + "val", ], ) def test_clip_gradients(clip_type, precision): - if clip_type == "norm" and precision == "16-mixed": - pytest.skip(reason="Clipping by norm with 16-mixed is numerically unstable.") - strategy = ModelParallelStrategy(_parallelize_single_linear_tp_fsdp2) fabric = Fabric(accelerator="auto", devices=2, precision=precision, strategy=strategy) fabric.launch() diff --git a/tests/tests_fabric/test_connector.py b/tests/tests_fabric/test_connector.py index a45abd097f966..b6f6b03b37605 100644 --- a/tests/tests_fabric/test_connector.py +++ b/tests/tests_fabric/test_connector.py @@ -14,6 +14,7 @@ import inspect import os import sys +from contextlib import nullcontext from typing import Any, Dict from unittest import mock from unittest.mock import Mock @@ -53,6 +54,7 @@ DDPStrategy, DeepSpeedStrategy, FSDPStrategy, + ModelParallelStrategy, SingleDeviceStrategy, SingleDeviceXLAStrategy, XLAFSDPStrategy, @@ -866,6 +868,18 @@ def test_precision_selection_amp_ddp(strategy, devices, is_custom_plugin, plugin assert isinstance(connector.precision, plugin_cls) +@RunIf(min_torch="2.3") +@pytest.mark.parametrize( + ("precision", "raises"), + [("32-true", False), ("16-true", False), ("bf16-true", False), ("16-mixed", True), ("bf16-mixed", False)], +) +@mock.patch("lightning.fabric.accelerators.mps.MPSAccelerator.is_available", return_value=False) +def test_precision_selection_model_parallel(_, precision, raises): + error_context = pytest.raises(ValueError, match=f"does not support .*{precision}") if raises else nullcontext() + with error_context: + _Connector(precision=precision, strategy=ModelParallelStrategy(lambda x, _: x)) + + def test_bitsandbytes_precision_cuda_required(monkeypatch): monkeypatch.setattr(lightning.fabric.plugins.precision.bitsandbytes, "_BITSANDBYTES_AVAILABLE", True) monkeypatch.setitem(sys.modules, "bitsandbytes", Mock()) diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index 977e297b42665..32f16c92f1c16 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -14,6 +14,7 @@ import inspect import os import sys +from 
contextlib import nullcontext from typing import Any, Dict from unittest import mock from unittest.mock import Mock @@ -48,6 +49,7 @@ DDPStrategy, DeepSpeedStrategy, FSDPStrategy, + ModelParallelStrategy, SingleDeviceStrategy, SingleDeviceXLAStrategy, XLAStrategy, @@ -1063,3 +1065,14 @@ def test_bitsandbytes_precision_cuda_required(monkeypatch): monkeypatch.setitem(sys.modules, "bitsandbytes", Mock()) with pytest.raises(RuntimeError, match="Bitsandbytes is only supported on CUDA GPUs"): _AcceleratorConnector(accelerator="cpu", plugins=BitsandbytesPrecision(mode="int8")) + + +@RunIf(min_torch="2.3") +@pytest.mark.parametrize( + ("precision", "raises"), + [("32-true", False), ("16-true", False), ("bf16-true", False), ("16-mixed", True), ("bf16-mixed", False)], +) +def test_precision_selection_model_parallel(precision, raises, mps_count_0): + error_context = pytest.raises(ValueError, match=f"does not support .*{precision}") if raises else nullcontext() + with error_context: + _AcceleratorConnector(precision=precision, strategy=ModelParallelStrategy()) From 98005bbed0b7ded09a4b88c6fb6f72527a451d33 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 28 May 2024 15:04:55 +0200 Subject: [PATCH 045/179] Add Studio badge to tensor parallel docs (#19913) --- docs/source-fabric/advanced/model_parallel/tp.rst | 11 ++++++++--- .../source-fabric/advanced/model_parallel/tp_fsdp.rst | 8 ++++++-- docs/source-pytorch/advanced/model_parallel/tp.rst | 10 +++++++--- .../advanced/model_parallel/tp_fsdp.rst | 8 ++++++-- examples/fabric/tensor_parallel/README.md | 2 +- examples/pytorch/tensor_parallel/README.md | 2 +- 6 files changed, 29 insertions(+), 12 deletions(-) diff --git a/docs/source-fabric/advanced/model_parallel/tp.rst b/docs/source-fabric/advanced/model_parallel/tp.rst index fdb05da121932..d85d3c36249a1 100644 --- a/docs/source-fabric/advanced/model_parallel/tp.rst +++ b/docs/source-fabric/advanced/model_parallel/tp.rst @@ -6,7 +6,11 @@ Tensor parallelism is a technique for training large models by distributing laye However, for smaller models, the communication overhead may outweigh its benefits. This method is most effective for models with very large layers, significantly enhancing performance and memory efficiency. -.. note:: This is an experimental feature. +.. raw:: html + + + Open In Studio + ---- @@ -197,9 +201,10 @@ Later in the code, when you call ``fabric.setup(model)``, Fabric will apply the fabric.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") -| -When measuring the peak memory consumption, we should see that doubling the number of GPUs reduces the memory consuption roughly by half: +.. note:: Tensor Parallelism in Lightning Fabric as well as PyTorch is experimental. The APIs may change in the future. + +When measuring the peak memory consumption, we should see that doubling the number of GPUs reduces the memory consumption roughly by half: .. list-table:: diff --git a/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst b/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst index e9a305b56da48..f87645e0c11c6 100644 --- a/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst +++ b/docs/source-fabric/advanced/model_parallel/tp_fsdp.rst @@ -7,7 +7,11 @@ This hybrid approach balances the trade-offs of each method, optimizing memory u The :doc:`Tensor Parallelism documentation ` and a general understanding of `FSDP `_ are a prerequisite for this tutorial. -.. note:: This is an experimental feature. +.. 
raw:: html + + + Open In Studio + ---- @@ -182,7 +186,7 @@ Finally, the tensor parallelism will apply to each group, splitting the sharded fabric.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") -| +.. note:: 2D Parallelism in Lightning Fabric as well as PyTorch is experimental. The APIs may change in the future. Beyond this toy example, we recommend you study our `LLM 2D Parallel Example (Llama 3) `_. diff --git a/docs/source-pytorch/advanced/model_parallel/tp.rst b/docs/source-pytorch/advanced/model_parallel/tp.rst index 1f0de56298b7b..e857f1f974828 100644 --- a/docs/source-pytorch/advanced/model_parallel/tp.rst +++ b/docs/source-pytorch/advanced/model_parallel/tp.rst @@ -6,7 +6,11 @@ Tensor parallelism is a technique for training large models by distributing laye However, for smaller models, the communication overhead may outweigh its benefits. This method is most effective for models with very large layers, significantly enhancing performance and memory efficiency. -.. note:: This is an experimental feature. +.. raw:: html + + + Open In Studio + ---- @@ -215,9 +219,9 @@ When ``trainer.fit(...)`` (or ``validate()``, ``test``, etc.) gets called, the T trainer.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") -| +.. note:: Tensor Parallelism in PyTorch Lightning as well as PyTorch is experimental. The APIs may change in the future. -When measuring the peak memory consumption, we should see that doubling the number of GPUs reduces the memory consuption roughly by half: +When measuring the peak memory consumption, we should see that doubling the number of GPUs reduces the memory consumption roughly by half: .. list-table:: diff --git a/docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst b/docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst index 956863a61e523..dae23bd4ee0c0 100644 --- a/docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst +++ b/docs/source-pytorch/advanced/model_parallel/tp_fsdp.rst @@ -7,7 +7,11 @@ This hybrid approach balances the trade-offs of each method, optimizing memory u The :doc:`Tensor Parallelism documentation ` and a general understanding of `FSDP `_ are a prerequisite for this tutorial. -.. note:: This is an experimental feature. +.. raw:: html + + + Open In Studio + ---- @@ -190,7 +194,7 @@ Finally, the tensor parallelism will apply to each group, splitting the sharded trainer.print(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB") -| +.. note:: 2D Parallelism in PyTorch Lightning as well as PyTorch is experimental. The APIs may change in the future. Beyond this toy example, we recommend you study our `LLM 2D Parallel Example (Llama 3) `_. diff --git a/examples/fabric/tensor_parallel/README.md b/examples/fabric/tensor_parallel/README.md index 4abd1fc058515..e66d9acd2848b 100644 --- a/examples/fabric/tensor_parallel/README.md +++ b/examples/fabric/tensor_parallel/README.md @@ -1,6 +1,6 @@ ## Tensor Parallel and 2D Parallel -This example shows how to apply tensor-parallelism to your model (here Llama 2 7B) with the `ModelParallelStrategy`, and how it can be combined with FSDP (2D parallelism). +This example shows how to apply tensor-parallelism to your model (here Llama 3 7B) with the `ModelParallelStrategy`, and how it can be combined with FSDP (2D parallelism). PyTorch 2.3+ and a machine with at least 4 GPUs and 24 GB memory each are required to run this example. 
```bash diff --git a/examples/pytorch/tensor_parallel/README.md b/examples/pytorch/tensor_parallel/README.md index 97675976148f8..d8b81b6de1bff 100644 --- a/examples/pytorch/tensor_parallel/README.md +++ b/examples/pytorch/tensor_parallel/README.md @@ -1,6 +1,6 @@ ## Tensor Parallel and 2D Parallel -This example shows how to apply tensor-parallelism to your model (here Llama 2 7B) with the `ModelParallelStrategy`, and how it can be combined with FSDP (2D parallelism). +This example shows how to apply tensor-parallelism to your model (here Llama 3 7B) with the `ModelParallelStrategy`, and how it can be combined with FSDP (2D parallelism). PyTorch 2.3+ and a machine with at least 4 GPUs and 24 GB memory each are required to run this example. ```bash From 014cdd84ed727ba4160f768589996facdd52ee6b Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 30 May 2024 12:12:41 +0200 Subject: [PATCH 046/179] Update code owners file (#19922) * update code owners * update * Update .github/CODEOWNERS Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --------- Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- .github/CODEOWNERS | 55 ++++++++------------ docs/source-pytorch/community/governance.rst | 2 +- 2 files changed, 24 insertions(+), 33 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 01d89f760eff2..96f43f384119f 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -8,46 +8,37 @@ * @lantiga @borda @tchaton @awaelchli # CI/CD and configs -/.actions/ @borda @carmocca @ethanwharris @justusschock -/.github/ @borda @carmocca @ethanwharris @justusschock -/.azure/ @borda @carmocca @ethanwharris @justusschock +/.actions/ @borda @ethanwharris @justusschock +/.github/ @borda @ethanwharris @justusschock +/.azure/ @borda @ethanwharris @justusschock /.azure/app-cloud-e2e.yml @awaelchli @ethanwharris @lantiga -/dockers/ @borda @carmocca @ethanwharris @justusschock -*.yml @borda @carmocca @ethanwharris @justusschock +/dockers/ @borda @ethanwharris @justusschock +*.yml @borda @ethanwharris @justusschock # Docs -/docs/ @edenlightning @lantiga @borda @awaelchli -/docs/*/conf.py @borda @awaelchli @carmocca -/.github/*.md @edenlightning @williamfalcon @lantiga @borda -/.github/ISSUE_TEMPLATE/ @edenlightning @borda @tchaton @awaelchli -/docs/source-fabric/conf.py @borda @awaelchli @carmocca -/docs/source-fabric/index.rst @awaelchli @lantiga @carmocca -/docs/source-pytorch/conf.py @borda @awaelchli @carmocca +/docs/ @lantiga @borda @awaelchli +/docs/*/conf.py @borda @awaelchli +/.github/*.md @williamfalcon @lantiga @borda +/.github/ISSUE_TEMPLATE/ @borda @tchaton @awaelchli +/docs/source-fabric/conf.py @borda @awaelchli +/docs/source-fabric/index.rst @awaelchli @lantiga +/docs/source-pytorch/conf.py @borda @awaelchli /docs/source-pytorch/index.rst @williamfalcon @lantiga /docs/source-pytorch/levels @williamfalcon @lantiga -/docs/source-app/ @williamfalcon @lantiga @tchaton @awaelchli -/docs/source-app/index.rst @williamfalcon @lantiga -/docs/source-app/expertise_levels @williamfalcon @lantiga +/docs/source-app/ @williamfalcon @lantiga @tchaton # PyTorch Lightning -/src/lightning/pytorch @williamfalcon @awaelchli @carmocca @justusschock -/src/pytorch_lightning @williamfalcon @awaelchli @carmocca @justusschock -/tests/tests_pytorch @awaelchli @carmocca @justusschock @borda - -# Core APIs -/src/lightning/pytorch/callbacks/callback.py @williamfalcon @awaelchli @carmocca -/src/lightning/pytorch/core/datamodule.py @williamFalcon @awaelchli @carmocca 
-/src/lightning/pytorch/trainer/trainer.py @williamfalcon @tchaton @awaelchli @carmocca -/src/lightning/pytorch/core/hooks.py @williamfalcon @tchaton @awaelchli @carmocca -/src/lightning/pytorch/core/module.py @williamfalcon @tchaton @awaelchli @carmocca +/src/lightning/pytorch @williamfalcon @awaelchli @justusschock +/src/pytorch_lightning @williamfalcon @awaelchli @justusschock +/tests/tests_pytorch @awaelchli @justusschock @borda # Lightning Data -/src/lightning/data/ @tchaton +/src/lightning/data/ @tchaton @lantiga # Lightning Fabric -/src/lightning/fabric @awaelchli @carmocca @justusschock -/src/lightning_fabric @awaelchli @carmocca @justusschock -/tests/tests_fabric @awaelchli @carmocca @justusschock +/src/lightning/fabric @awaelchli @justusschock +/src/lightning_fabric @awaelchli @justusschock +/tests/tests_fabric @awaelchli @justusschock # Lightning App /src/lightning/app @tchaton @lantiga @awaelchli @ethanwharris @@ -59,8 +50,8 @@ /.github/CODEOWNERS @williamfalcon /SECURITY.md @williamfalcon @lantiga /README.md @williamfalcon @lantiga -/setup.py @williamfalcon @borda @carmocca -/src/pytorch_lightning/__about__.py @williamfalcon @borda @carmocca +/setup.py @williamfalcon @borda +/src/pytorch_lightning/__about__.py @williamfalcon @borda /src/lightning_app/__about__.py @williamfalcon @lantiga @borda /src/lightning_fabric/__about__.py @williamfalcon @borda @awaelchli -/src/*/__setup__.py @borda @carmocca @justusschock +/src/*/__setup__.py @borda @justusschock diff --git a/docs/source-pytorch/community/governance.rst b/docs/source-pytorch/community/governance.rst index 35306643c5396..36b613d32d5be 100644 --- a/docs/source-pytorch/community/governance.rst +++ b/docs/source-pytorch/community/governance.rst @@ -19,7 +19,6 @@ Role: All final decisions related to Lightning. Maintainers ----------- - Adrian Wälchli (`awaelchli `_) -- Carlos Mocholí (`carmocca `_) - Jirka Borovec (`Borda `_) - Justus Schock (`justusschock `_) @@ -33,6 +32,7 @@ Emeritus Maintainers Alumni ------ +- Carlos Mocholí (`carmocca `_) - Akihiro Nitta (`akihironitta `_) - Ananth Subramaniam (`ananthsub `_) - Danielle Pintz (`daniellepintz `_) From 5d7932546d4a3a2b251d972e656559bb62436697 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 30 May 2024 17:50:02 +0200 Subject: [PATCH 047/179] Update code owners file (#19925) update --- .github/CODEOWNERS | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 96f43f384119f..821543cea4438 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,7 +5,7 @@ # the repo. Unless a later match takes precedence, # @global-owner1 and @global-owner2 will be requested for # review when someone opens a pull request. 
-* @lantiga @borda @tchaton @awaelchli +* @lantiga @borda @tchaton @awaelchli @justusschock # CI/CD and configs /.actions/ @borda @ethanwharris @justusschock @@ -28,17 +28,13 @@ /docs/source-app/ @williamfalcon @lantiga @tchaton # PyTorch Lightning -/src/lightning/pytorch @williamfalcon @awaelchli @justusschock -/src/pytorch_lightning @williamfalcon @awaelchli @justusschock -/tests/tests_pytorch @awaelchli @justusschock @borda +/src/lightning/pytorch @lantiga @borda @tchaton @awaelchli @justusschock # Lightning Data /src/lightning/data/ @tchaton @lantiga # Lightning Fabric -/src/lightning/fabric @awaelchli @justusschock -/src/lightning_fabric @awaelchli @justusschock -/tests/tests_fabric @awaelchli @justusschock +/src/lightning/fabric @lantiga @borda @tchaton @awaelchli @justusschock # Lightning App /src/lightning/app @tchaton @lantiga @awaelchli @ethanwharris From 95d6b6b9da20d1f07e05c3806dd6b87dae48f485 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 30 May 2024 17:54:48 +0200 Subject: [PATCH 048/179] Disable skipping training step in distributed training (#19918) --- src/lightning/pytorch/CHANGELOG.md | 3 ++- .../pytorch/loops/optimization/automatic.py | 8 +++++- ...timizer_loop.py => test_automatic_loop.py} | 26 ++++++++++++++++++- tests/tests_pytorch/models/test_hooks.py | 2 ++ .../tests_pytorch/trainer/test_dataloaders.py | 6 ++++- 5 files changed, 41 insertions(+), 4 deletions(-) rename tests/tests_pytorch/loops/optimization/{test_optimizer_loop.py => test_automatic_loop.py} (76%) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 297f54b4b2949..d12a943e53489 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -28,7 +28,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Relaxed the requirement for custom batch samplers to expose `drop_last` for prediction ([#19678](https://github.com/Lightning-AI/pytorch-lightning/pull/19678)) -- +- It is no longer allowed to skip `training_step()` by returning `None` in distributed training ([#19918](https://github.com/Lightning-AI/pytorch-lightning/pull/19918)) + ### Deprecated diff --git a/src/lightning/pytorch/loops/optimization/automatic.py b/src/lightning/pytorch/loops/optimization/automatic.py index 82666a0e21e5f..2ce6acab11a37 100644 --- a/src/lightning/pytorch/loops/optimization/automatic.py +++ b/src/lightning/pytorch/loops/optimization/automatic.py @@ -314,8 +314,14 @@ def _training_step(self, kwargs: OrderedDict) -> ClosureResult: """ trainer = self.trainer - # manually capture logged metrics training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values()) self.trainer.strategy.post_training_step() # unused hook - call anyway for backward compatibility + if training_step_output is None and trainer.world_size > 1: + raise RuntimeError( + "Skipping the `training_step` by returning None in distributed training is not supported." + " It is recommended that you rewrite your training logic to avoid having to skip the step in the first" + " place." 
+ ) + return self.output_result_cls.from_training_step_output(training_step_output, trainer.accumulate_grad_batches) diff --git a/tests/tests_pytorch/loops/optimization/test_optimizer_loop.py b/tests/tests_pytorch/loops/optimization/test_automatic_loop.py similarity index 76% rename from tests/tests_pytorch/loops/optimization/test_optimizer_loop.py rename to tests/tests_pytorch/loops/optimization/test_automatic_loop.py index 2111212de8901..0ea6290586f55 100644 --- a/tests/tests_pytorch/loops/optimization/test_optimizer_loop.py +++ b/tests/tests_pytorch/loops/optimization/test_automatic_loop.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +from contextlib import nullcontext from typing import Dict, Generic, Iterator, Mapping, TypeVar import pytest @@ -82,3 +82,27 @@ def training_step(self, batch, batch_idx): with pytest.raises(MisconfigurationException, match=match): trainer.fit(model) + + +@pytest.mark.parametrize("world_size", [1, 2]) +def test_skip_training_step_not_allowed(world_size, tmp_path): + """Test that skipping the training_step in distributed training is not allowed.""" + + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + return None + + model = TestModel() + trainer = Trainer( + default_root_dir=tmp_path, + max_steps=1, + barebones=True, + ) + trainer.strategy.world_size = world_size # mock world size without launching processes + error_context = ( + pytest.raises(RuntimeError, match="Skipping the `training_step` .* is not supported") + if world_size > 1 + else nullcontext() + ) + with error_context: + trainer.fit(model) diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py index 763a140982a8d..5a175e181dd9e 100644 --- a/tests/tests_pytorch/models/test_hooks.py +++ b/tests/tests_pytorch/models/test_hooks.py @@ -178,6 +178,8 @@ class TestModel(BoringModel): def training_step(self, batch, batch_idx): assert batch.samples.device == self.device assert isinstance(batch_idx, int) + # the actual training step is not needed for the assertions + return super().training_step(torch.rand(1, 32, device=self.device), batch_idx) def train_dataloader(self): return torch.utils.data.DataLoader(RandomDataset(32, 64), collate_fn=collate_fn) diff --git a/tests/tests_pytorch/trainer/test_dataloaders.py b/tests/tests_pytorch/trainer/test_dataloaders.py index 75cb3cd45fcf0..a2d29baa9fa6f 100644 --- a/tests/tests_pytorch/trainer/test_dataloaders.py +++ b/tests/tests_pytorch/trainer/test_dataloaders.py @@ -641,6 +641,8 @@ def __init__(self): def training_step(self, batch, batch_idx): self.batches_seen.append(batch) + # the actual training step is not needed for the assertions below + return super().training_step(torch.rand(1, 32, device=self.device), batch_idx) def on_train_epoch_end(self): world_size = 2 @@ -810,8 +812,10 @@ def __init__(self): super().__init__() self.seen_samples = [] - def training_step(self, batch): + def training_step(self, batch, batch_idx): self.seen_samples.extend(batch.tolist()) + # the actual training step is not needed for the test + return super().training_step(torch.rand(1, 32, device=self.device), batch_idx) def on_train_end(self): seen_samples = self.all_gather(self.seen_samples) From dffc0f96ec76fc957a64be2ba828003e2ce09df9 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Thu, 30 May 2024 19:14:56 +0300 Subject: [PATCH 049/179] Update 
FlopCounterMode usage in throughput.py (#19926) `mods` argument is not needed anymore for `FlopCounterMode`: https://github.com/pytorch/pytorch/blob/ffe506e85350a505be5698c871d50b2fc614406d/torch/utils/flop_counter.py#L595-L596 --- src/lightning/fabric/utilities/throughput.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/throughput.py b/src/lightning/fabric/utilities/throughput.py index c340686346c0f..f483c274c35b8 100644 --- a/src/lightning/fabric/utilities/throughput.py +++ b/src/lightning/fabric/utilities/throughput.py @@ -296,7 +296,7 @@ def measure_flops( raise ImportError("`measure_flops` requires PyTorch >= 2.1.") from torch.utils.flop_counter import FlopCounterMode - flop_counter = FlopCounterMode(model, display=False) + flop_counter = FlopCounterMode(display=False) with flop_counter: if loss_fn is None: forward_fn() From 427fdfaf6e2cf0adf840271839b4ea63d04e5523 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 30 May 2024 19:47:48 +0200 Subject: [PATCH 050/179] Update docstring for `self.log` about keys in distributed training (#19917) --- src/lightning/pytorch/core/module.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py index 5a4f8d4e1bbb1..68395ce97d9f3 100644 --- a/src/lightning/pytorch/core/module.py +++ b/src/lightning/pytorch/core/module.py @@ -405,7 +405,7 @@ def log( The default behavior per hook is documented here: :ref:`extensions/logging:Automatic Logging`. Args: - name: key to log. + name: key to log. Must be identical across all processes if using DDP or any other distributed strategy. value: value to log. Can be a ``float``, ``Tensor``, or a ``Metric``. prog_bar: if ``True`` logs to the progress bar. logger: if ``True`` logs to the logger. @@ -569,6 +569,7 @@ def log_dict( Args: dictionary: key value pairs. + Keys must be identical across all processes if using DDP or any other distributed strategy. The values can be a ``float``, ``Tensor``, ``Metric``, or ``MetricCollection``. prog_bar: if ``True`` logs to the progress base. logger: if ``True`` logs to the logger. From a99a6d3af1e9b8090d892dfc24b4f616853a8a40 Mon Sep 17 00:00:00 2001 From: PL Ghost <75324987+pl-ghost@users.noreply.github.com> Date: Fri, 31 May 2024 18:53:54 +0200 Subject: [PATCH 051/179] Adding test for legacy checkpoint created with 2.2.5 (#19806) --- tests/legacy/back-compatible-versions.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/legacy/back-compatible-versions.txt b/tests/legacy/back-compatible-versions.txt index a6b218707420c..1243a2fad62f8 100644 --- a/tests/legacy/back-compatible-versions.txt +++ b/tests/legacy/back-compatible-versions.txt @@ -99,3 +99,4 @@ 2.2.0.post0 2.2.1 2.2.2 +2.2.5 From fd86ea7356f842a32cd7eeca160f390e60ebca77 Mon Sep 17 00:00:00 2001 From: Gilles Peiffer Date: Mon, 3 Jun 2024 21:20:01 +0200 Subject: [PATCH 052/179] Fix typos in CONTRIBUTING.md (#19937) --- .github/CONTRIBUTING.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 29bf114bfbf43..db6543b8cb40e 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -121,11 +121,11 @@ To build the documentation locally, simply execute the following commands from p All added or edited code shall be the own original work of the particular contributor. 
If you use some third-party implementation, all such blocks/functions/modules shall be properly referred and if possible also agreed by code's author. For example - `This code is inspired from http://...`. -In case you adding new dependencies, make sure that they are compatible with the actual PyTorch Lightning license (ie. dependencies should be _at least_ as permissive as the PyTorch Lightning license). +In case you are adding new dependencies, make sure that they are compatible with the actual PyTorch Lightning license (i.e. dependencies should be _at least_ as permissive as the PyTorch Lightning license). ### Coding Style -1. Use f-strings for output formation (except logging when we stay with lazy `logging.info("Hello %s!", name)`. +1. Use f-strings for output formation (except logging when we stay with lazy `logging.info("Hello %s!", name)`). 1. You can use [pre-commit](https://pre-commit.com/) to make sure your code style is correct. ### Documentation @@ -234,9 +234,9 @@ Here are tutorials: Here is the process to create a new test -- 0. Optional: Follow tutorials ! -- 1. Find a file in tests/ which match what you want to test. If none, create one. -- 2. Use this template to get started ! +- 0. Optional: Follow tutorials! +- 1. Find a file in tests/ which matches what you want to test. If none, create one. +- 2. Use this template to get started! - 3. Use **BoringModel and derivates to test out your code**. ```python From bac82b83a89cf4958dad52c94d9ec8eb1a8b94d1 Mon Sep 17 00:00:00 2001 From: Matthew Hoffman Date: Tue, 4 Jun 2024 16:43:18 -0700 Subject: [PATCH 053/179] Remove unknown `[metadata]` table from `pyproject.toml` (#19904) --- pyproject.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ecfc70736994a..dc77740823c9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -[metadata] -name = "lightning" -author = "Lightning-AI et al." -url = "https://github.com/Lightning-AI/lightning" - [build-system] requires = [ "setuptools", From 785f15d1488fa7951367654576c1650b41710c5c Mon Sep 17 00:00:00 2001 From: Gilles Peiffer Date: Wed, 5 Jun 2024 01:45:05 +0200 Subject: [PATCH 054/179] Remove `numpy` dependencies in `src/lightning/pytorch` (#19841) --- src/lightning/pytorch/loggers/logger.py | 6 +++--- src/lightning/pytorch/tuner/lr_finder.py | 2 +- src/lightning/pytorch/utilities/__init__.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lightning/pytorch/loggers/logger.py b/src/lightning/pytorch/loggers/logger.py index c3051d34a7b09..40e8ed8c4a13e 100644 --- a/src/lightning/pytorch/loggers/logger.py +++ b/src/lightning/pytorch/loggers/logger.py @@ -15,11 +15,11 @@ import functools import operator +import statistics from abc import ABC from collections import defaultdict from typing import Any, Callable, Dict, Mapping, Optional, Sequence -import numpy as np from typing_extensions import override from lightning.fabric.loggers import Logger as FabricLogger @@ -100,7 +100,7 @@ def method(*args: Any, **kwargs: Any) -> None: def merge_dicts( # pragma: no cover dicts: Sequence[Mapping], agg_key_funcs: Optional[Mapping] = None, - default_func: Callable[[Sequence[float]], float] = np.mean, + default_func: Callable[[Sequence[float]], float] = statistics.mean, ) -> Dict: """Merge a sequence with dictionaries into one dictionary by aggregating the same keys with some given function. 
@@ -126,7 +126,7 @@ def merge_dicts( # pragma: no cover >>> d2 = {'a': 1.1, 'b': 2.2, 'v': 1, 'd': {'d1': 2, 'd2': 3}} >>> d3 = {'a': 1.1, 'v': 2.3, 'd': {'d3': 3, 'd4': {'d5': 1}}} >>> dflt_func = min - >>> agg_funcs = {'a': np.mean, 'v': max, 'd': {'d1': sum}} + >>> agg_funcs = {'a': statistics.mean, 'v': max, 'd': {'d1': sum}} >>> pprint.pprint(merge_dicts([d1, d2, d3], agg_funcs, dflt_func)) {'a': 1.3, 'b': 2.0, diff --git a/src/lightning/pytorch/tuner/lr_finder.py b/src/lightning/pytorch/tuner/lr_finder.py index 8eebd3cd7f974..4997e23070df7 100644 --- a/src/lightning/pytorch/tuner/lr_finder.py +++ b/src/lightning/pytorch/tuner/lr_finder.py @@ -190,7 +190,7 @@ def suggestion(self, skip_begin: int = 10, skip_end: int = 1) -> Optional[float] losses = losses[torch.isfinite(losses)] if len(losses) < 2: - # computing np.gradient requires at least 2 points + # computing torch.gradient requires at least 2 points log.error( "Failed to compute suggestion for learning rate because there are not enough points. Increase the loop" " iteration limits or the size of your dataset/dataloader." diff --git a/src/lightning/pytorch/utilities/__init__.py b/src/lightning/pytorch/utilities/__init__.py index 5cd0af5ac7813..c3ba77b46e8b7 100644 --- a/src/lightning/pytorch/utilities/__init__.py +++ b/src/lightning/pytorch/utilities/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. """General utilities.""" -import numpy +import torch from lightning.fabric.utilities import ( LightningEnum, @@ -55,6 +55,6 @@ "suggested_max_num_workers", ] -FLOAT16_EPSILON = numpy.finfo(numpy.float16).eps -FLOAT32_EPSILON = numpy.finfo(numpy.float32).eps -FLOAT64_EPSILON = numpy.finfo(numpy.float64).eps +FLOAT16_EPSILON = torch.finfo(torch.float16).eps +FLOAT32_EPSILON = torch.finfo(torch.float32).eps +FLOAT64_EPSILON = torch.finfo(torch.float64).eps From 351bec76259c30d44d5d74d68f4b19ed5e30bb35 Mon Sep 17 00:00:00 2001 From: Alex Spies Date: Wed, 5 Jun 2024 10:06:16 +0900 Subject: [PATCH 055/179] Fix typo on `estimated_stepping_batches` property (#19847) --- src/lightning/pytorch/trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py index bf7d47a880da3..d9d13602700cf 100644 --- a/src/lightning/pytorch/trainer/trainer.py +++ b/src/lightning/pytorch/trainer/trainer.py @@ -1654,8 +1654,8 @@ def _results(self) -> Optional[_ResultCollection]: def estimated_stepping_batches(self) -> Union[int, float]: r"""The estimated number of batches that will ``optimizer.step()`` during training. - This accounts for gradient accumulation and the current trainer configuration. This might sets up your training - dataloader if hadn't been set up already. + This accounts for gradient accumulation and the current trainer configuration. This might be used when setting + up your training dataloader, if it hasn't been set up already. .. 
code-block:: python From 19f0fb978c3c1d5e14fc904d678b10ed9798d5ea Mon Sep 17 00:00:00 2001 From: Federico Berto Date: Wed, 5 Jun 2024 10:12:27 +0900 Subject: [PATCH 056/179] Set `_choose_auto_accelerator` to `staticmethod` (#19822) --- src/lightning/fabric/connector.py | 3 ++- .../pytorch/trainer/connectors/accelerator_connector.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/connector.py b/src/lightning/fabric/connector.py index f677893100351..9fb66255830c6 100644 --- a/src/lightning/fabric/connector.py +++ b/src/lightning/fabric/connector.py @@ -313,7 +313,8 @@ def _check_device_config_and_set_final_flags(self, devices: Union[List[int], str f" using {accelerator_name} accelerator." ) - def _choose_auto_accelerator(self) -> str: + @staticmethod + def _choose_auto_accelerator() -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" if XLAAccelerator.is_available(): return "tpu" diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py index 1c97a223b129e..06f3ee366bcaa 100644 --- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py +++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py @@ -328,7 +328,8 @@ def _check_device_config_and_set_final_flags(self, devices: Union[List[int], str f" using {accelerator_name} accelerator." ) - def _choose_auto_accelerator(self) -> str: + @staticmethod + def _choose_auto_accelerator() -> str: """Choose the accelerator type (str) based on availability.""" if XLAAccelerator.is_available(): return "tpu" From 8bfbe0c90821c55e03a4aa72a41069f73a15bc88 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 5 Jun 2024 04:09:01 +0200 Subject: [PATCH 057/179] Fix strict loading from distributed checkpoints vs PyTorch nightly (#19946) * strict loading * docstring --- src/lightning/fabric/strategies/model_parallel.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index df84978486b19..629113b291a2a 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -275,12 +275,7 @@ def load_checkpoint( state: Optional[Union[Module, Optimizer, Dict[str, Union[Module, Optimizer, Any]]]] = None, strict: bool = True, ) -> Dict[str, Any]: - """Load the contents from a checkpoint and restore the state of the given objects. - - Currently does not support loading the optimizer state if the model is distributed but the checkpoint is a full, - non-distributed checkpoint. - - """ + """Load the contents from a checkpoint and restore the state of the given objects.""" if not state: raise ValueError( f"Got {type(self).__name__}.load_checkpoint(..., state={state!r}) but a state with at least " @@ -559,14 +554,14 @@ def _load_raw_module_state( state_dict_options = StateDictOptions( broadcast_from_rank0=True, # type: ignore[call-arg] full_state_dict=True, - strict=strict, # gets ignored at the moment + # must be set False to allow loading each param separately below + strict=False, ) for submodule_name, submodule in module.named_modules(): for param_name, _ in _named_parameters_and_buffers_to_load(submodule): full_param_name = f"{submodule_name}{'.' 
if submodule_name else ''}{param_name}" if full_param_name not in state_dict: - # Note: PyTorch does not currently respect the `strict` setting in state_dict_options! if not strict: continue raise KeyError( From 5aadfa62508ee20735083900273c8e3ff5867602 Mon Sep 17 00:00:00 2001 From: Yurij Mikhalevich Date: Tue, 4 Jun 2024 19:11:20 -0700 Subject: [PATCH 058/179] fix(docs): fix broken link to ensure the docs can be built (#19941) * fix(docs): fix broken link to ensure the docs can be built * nit --- docs/source-app/glossary/restful_api/restful_api.rst | 2 +- docs/source-app/workflows/build_rest_api/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source-app/glossary/restful_api/restful_api.rst b/docs/source-app/glossary/restful_api/restful_api.rst index a1128f2234558..6e04f60c75f1c 100644 --- a/docs/source-app/glossary/restful_api/restful_api.rst +++ b/docs/source-app/glossary/restful_api/restful_api.rst @@ -25,7 +25,7 @@ These methods are guidelines to organize your RESTful Services and help users un * **`PUT`:** Updates/replaces existing resources. * **`DELETE`:** Deletes resources. -Learn more about `HTTP Methods for RESTful Services here `_. +Learn more about `HTTP Methods for RESTful Services here `_. The Lightning App framework uses the popular `FastAPI `_ and `Pydantic `_ frameworks under the hood. This means you can use all their features while building your App. diff --git a/docs/source-app/workflows/build_rest_api/index.rst b/docs/source-app/workflows/build_rest_api/index.rst index 8b9192fb350fc..590f3d03d538d 100644 --- a/docs/source-app/workflows/build_rest_api/index.rst +++ b/docs/source-app/workflows/build_rest_api/index.rst @@ -25,7 +25,7 @@ These methods are guidelines to organize your RESTful Services and help users un * **`PUT`:** Updates/replaces existing resources. * **`DELETE`:** Deletes resources. -Learn more about `HTTP Methods for RESTful Services here `_. +Learn more about `HTTP Methods for RESTful Services here `_. The Lightning App framework uses the popular `FastAPI `_ and `Pydantic `_ frameworks under the hood. This means you can use all their features while building your App. 
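
As a quick illustration of those four HTTP methods with FastAPI and Pydantic (a generic, hypothetical sketch for context only, not part of the Lightning App API):

.. code-block:: python

    # minimal_api.py - a generic FastAPI/Pydantic resource with the four verbs
    from fastapi import FastAPI
    from pydantic import BaseModel

    app = FastAPI()


    class Item(BaseModel):
        name: str


    items = {}  # in-memory store, just for the example


    @app.get("/items/{item_id}")  # GET: request a resource
    def read_item(item_id: int) -> Item:
        return items[item_id]


    @app.post("/items/{item_id}")  # POST: create a resource
    def create_item(item_id: int, item: Item) -> Item:
        items[item_id] = item
        return item


    @app.put("/items/{item_id}")  # PUT: update/replace a resource
    def update_item(item_id: int, item: Item) -> Item:
        items[item_id] = item
        return item


    @app.delete("/items/{item_id}")  # DELETE: delete a resource
    def delete_item(item_id: int) -> dict:
        items.pop(item_id, None)
        return {"deleted": item_id}

Serve it with, for example, ``uvicorn minimal_api:app``.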
From e0b7c04e6358fe62b762a491eecc7185f432850b Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Wed, 5 Jun 2024 18:32:36 +0200 Subject: [PATCH 059/179] ci/docs: enable dispatch build without warning as errors (#19948) --- .github/workflows/docs-build.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml index 64510b45c0c4b..7f7a09ce44007 100644 --- a/.github/workflows/docs-build.yml +++ b/.github/workflows/docs-build.yml @@ -108,11 +108,14 @@ jobs: - name: Full build for deployment if: github.event_name != 'pull_request' run: echo "DOCS_FETCH_ASSETS=1" >> $GITHUB_ENV + - name: Build without warnings + if: github.event_name != 'workflow_dispatch' + run: echo "BUILD_SPHINX_OPTS=-W --keep-going" >> $GITHUB_ENV - name: Make ${{ matrix.target }} working-directory: ./docs/source-${{ matrix.pkg-name }} # allow failing link check and doctest if you run with dispatch - continue-on-error: ${{ (matrix.target == 'doctest' || matrix.target == 'linkcheck') && github.event_name == 'workflow_dispatch' }} - run: make ${{ matrix.target }} --debug --jobs $(nproc) SPHINXOPTS="-W --keep-going" + continue-on-error: ${{ (matrix.target == 'doctest' || matrix.target == 'linkcheck') && github.event_name == 'workflow_dispatch' }} + run: make ${{ matrix.target }} --debug --jobs $(nproc) SPHINXOPTS="$BUILD_SPHINX_OPTS" - name: Keep artifact if: github.event_name == 'pull_request' From b9f215d7fdddb4dc765e92afd468de7a55b801d7 Mon Sep 17 00:00:00 2001 From: Gilles Peiffer Date: Wed, 5 Jun 2024 18:32:56 +0200 Subject: [PATCH 060/179] Replace usage of `grep -P` with `perl` in `run_standalone_tests.sh` (#19942) --- .azure/gpu-tests-pytorch.yml | 1 + .github/checkgroup.yml | 3 ++- tests/run_standalone_tests.sh | 9 ++++----- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 156513d604210..dd1a3d4abcf6a 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -31,6 +31,7 @@ pr: - "src/lightning/pytorch/**" - "src/pytorch_lightning/*" - "tests/tests_pytorch/**" + - "tests/run_standalone_*.sh" - "pyproject.toml" # includes pytest config - "requirements/fabric/**" - "src/lightning/fabric/**" diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 0c5e7e4579ccb..c4c88b12a2598 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -60,6 +60,7 @@ subprojects: - "src/lightning/pytorch/**" - "src/pytorch_lightning/*" - "tests/tests_pytorch/**" + - "tests/run_standalone_*.sh" - "pyproject.toml" # includes pytest config - "requirements/fabric/**" - "src/lightning/fabric/**" @@ -201,7 +202,6 @@ subprojects: - ".azure/gpu-tests-fabric.yml" - "examples/fabric/**" - "examples/run_fabric_examples.sh" - - "tests/run_standalone_*.sh" - "requirements/fabric/**" - "src/lightning/__init__.py" - "src/lightning/__setup__.py" @@ -209,6 +209,7 @@ subprojects: - "src/lightning/fabric/**" - "src/lightning_fabric/*" - "tests/tests_fabric/**" + - "tests/run_standalone_*.sh" - "pyproject.toml" # includes pytest config - "!requirements/*/docs.txt" - "!*.md" diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh index 0de781b0c47c7..0aa0bacff168a 100755 --- a/tests/run_standalone_tests.sh +++ b/tests/run_standalone_tests.sh @@ -26,18 +26,17 @@ export PL_RUN_STANDALONE_TESTS=1 defaults=" -m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120 " echo 
"Using defaults: ${defaults}" -# get the testing location as the fist argument +# get the testing location as the first argument test_path=$1 printf "source path: $test_path\n" # collect all tests with parametrization based filtering with PL_RUN_STANDALONE_TESTS standalone_tests=$(python3 -m pytest $test_path -q --collect-only --pythonwarnings ignore) -printf "Collected tests: \n $standalone_tests" +printf "Collected tests: \n $standalone_tests\n" # match only lines with tests -parametrizations=$(grep -oP '\S+::test_\S+' <<< "$standalone_tests") +parametrizations=$(perl -nle 'print $& while m{\S+::test_\S+}g' <<< "$standalone_tests") # convert the list to be array parametrizations_arr=($parametrizations) - report='' rm -f standalone_test_output.txt # in case it exists, remove it @@ -47,7 +46,7 @@ function show_batched_output { if [ -f standalone_test_output.txt ]; then # if exists cat standalone_test_output.txt # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail - if grep -iE 'error|exception|traceback|failed' standalone_test_output.txt | grep -vE 'on_exception|xfailed' | grep -qv -f testnames.txt; then + if perl -nle 'print if /error|(? Date: Thu, 6 Jun 2024 01:31:43 +0200 Subject: [PATCH 061/179] Destroy process group in atexit handler (#19931) --- src/lightning/fabric/CHANGELOG.md | 2 ++ src/lightning/fabric/utilities/distributed.py | 10 ++++++++++ src/lightning/pytorch/CHANGELOG.md | 1 + tests/tests_fabric/conftest.py | 5 ++--- tests/tests_fabric/utilities/test_distributed.py | 12 ++++++++++++ tests/tests_pytorch/conftest.py | 5 ++--- 6 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index b4076e6e23b67..ea595b2635138 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -17,6 +17,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `ModelParallelStrategy` to support 2D parallelism ([#19846](https://github.com/Lightning-AI/pytorch-lightning/pull/19846), [#19852](https://github.com/Lightning-AI/pytorch-lightning/pull/19852), [#19870](https://github.com/Lightning-AI/pytorch-lightning/pull/19870), [#19872](https://github.com/Lightning-AI/pytorch-lightning/pull/19872)) +- Added a call to `torch.distributed.destroy_process_group` in atexit handler if process group needs destruction ([#19931](https://github.com/Lightning-AI/pytorch-lightning/pull/19931)) + ### Changed diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index 30bfe4e254a07..bb20b889ec626 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -1,3 +1,4 @@ +import atexit import contextlib import logging import os @@ -291,6 +292,10 @@ def _init_dist_connection( log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) + if torch_distributed_backend == "nccl": + # PyTorch >= 2.4 warns about undestroyed NCCL process group, so we need to do it at program exit + atexit.register(_destroy_dist_connection) + # On rank=0 let everyone know training is starting rank_zero_info( f"{'-' * 100}\n" @@ -300,6 +305,11 @@ def _init_dist_connection( ) +def _destroy_dist_connection() -> None: + if _distributed_is_initialized(): + torch.distributed.destroy_process_group() + + def _get_default_process_group_backend_for_device(device: torch.device) -> str: return "nccl" if device.type == "cuda" else "gloo" diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index d12a943e53489..1e3ae02dd2c1b 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `ModelParallelStrategy` to support 2D parallelism ([#19878](https://github.com/Lightning-AI/pytorch-lightning/pull/19878), [#19888](https://github.com/Lightning-AI/pytorch-lightning/pull/19888)) +- Added a call to `torch.distributed.destroy_process_group` in atexit handler if process group needs destruction ([#19931](https://github.com/Lightning-AI/pytorch-lightning/pull/19931)) ### Changed diff --git a/tests/tests_fabric/conftest.py b/tests/tests_fabric/conftest.py index 4a4371eb9d3a1..c92754833836e 100644 --- a/tests/tests_fabric/conftest.py +++ b/tests/tests_fabric/conftest.py @@ -23,7 +23,7 @@ import torch.distributed from lightning.fabric.accelerators import XLAAccelerator from lightning.fabric.strategies.launchers.subprocess_script import _ChildProcessObserver -from lightning.fabric.utilities.distributed import _distributed_is_initialized +from lightning.fabric.utilities.distributed import _destroy_dist_connection if sys.version_info >= (3, 9): from concurrent.futures.process import _ExecutorManagerThread @@ -78,8 +78,7 @@ def restore_env_variables(): def teardown_process_group(): """Ensures that the distributed process group gets closed before the next test runs.""" yield - if _distributed_is_initialized(): - torch.distributed.destroy_process_group() + _destroy_dist_connection() @pytest.fixture(autouse=True) diff --git a/tests/tests_fabric/utilities/test_distributed.py b/tests/tests_fabric/utilities/test_distributed.py index 0d8f03fcdd120..5331a6f9be611 100644 --- a/tests/tests_fabric/utilities/test_distributed.py +++ b/tests/tests_fabric/utilities/test_distributed.py @@ -11,8 +11,10 @@ from lightning.fabric.strategies import DDPStrategy, SingleDeviceStrategy from lightning.fabric.strategies.launchers.multiprocessing import _MultiProcessingLauncher from lightning.fabric.utilities.distributed import ( + _destroy_dist_connection, _gather_all_tensors, _InfiniteBarrier, + _init_dist_connection, _set_num_threads_if_needed, _suggested_max_num_threads, _sync_ddp, @@ -217,3 +219,13 @@ def test_infinite_barrier(): barrier.__exit__(None, None, None) assert barrier.barrier.call_count == 2 dist_mock.destroy_process_group.assert_called_once() + + +@mock.patch("lightning.fabric.utilities.distributed.atexit") +@mock.patch("lightning.fabric.utilities.distributed.torch.distributed.init_process_group") +def test_init_dist_connection_registers_destruction_handler(_, atexit_mock): + _init_dist_connection(LightningEnvironment(), "nccl") + atexit_mock.register.assert_called_once_with(_destroy_dist_connection) + atexit_mock.reset_mock() + _init_dist_connection(LightningEnvironment(), "gloo") + atexit_mock.register.assert_not_called() diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 8b9ca14684db0..c0319e873be08 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -27,7 +27,7 @@ import torch.distributed from lightning.fabric.plugins.environments.lightning import find_free_network_port from lightning.fabric.strategies.launchers.subprocess_script import _ChildProcessObserver -from lightning.fabric.utilities.distributed import _distributed_is_initialized +from lightning.fabric.utilities.distributed import _destroy_dist_connection, _distributed_is_initialized from lightning.fabric.utilities.imports import _IS_WINDOWS from lightning.pytorch.accelerators import XLAAccelerator from lightning.pytorch.trainer.connectors.signal_connector import _SignalConnector @@ -123,8 +123,7 @@ def restore_signal_handlers(): def teardown_process_group(): 
"""Ensures that the distributed process group gets closed before the next test runs.""" yield - if _distributed_is_initialized(): - torch.distributed.destroy_process_group() + _destroy_dist_connection() @pytest.fixture(autouse=True) From 7668a6bf598d27c85c8859bbcc5e39bccf1277ca Mon Sep 17 00:00:00 2001 From: Liyang90 Date: Wed, 5 Jun 2024 17:15:03 -0700 Subject: [PATCH 062/179] Flexible and easy to use HSDP setting (#19504) Co-authored-by: awaelchli --- src/lightning/fabric/CHANGELOG.md | 2 + src/lightning/fabric/strategies/fsdp.py | 20 +++++++++- src/lightning/pytorch/CHANGELOG.md | 2 + src/lightning/pytorch/strategies/fsdp.py | 41 +++++++++++++++++++-- tests/tests_fabric/strategies/test_fsdp.py | 7 +++- tests/tests_pytorch/strategies/test_fsdp.py | 7 +++- 6 files changed, 73 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index ea595b2635138..2ee0243a0d5a5 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -19,6 +19,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added a call to `torch.distributed.destroy_process_group` in atexit handler if process group needs destruction ([#19931](https://github.com/Lightning-AI/pytorch-lightning/pull/19931)) +- Added support for configuring hybrid-sharding by passing a tuple for the `FSDPStrategy(device_mesh=...)` argument ([#19504](https://github.com/Lightning-AI/pytorch-lightning/pull/19504)) + ### Changed diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index eb125d191df94..9a711b8449c3e 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -74,12 +74,14 @@ from lightning.fabric.utilities.types import _PATH, _Stateful if TYPE_CHECKING: + from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision, ShardingStrategy from torch.distributed.fsdp.wrap import ModuleWrapPolicy _POLICY = Union[Set[Type[Module]], Callable[[Module, bool, int], bool], ModuleWrapPolicy] _SHARDING_STRATEGY = Union[ShardingStrategy, Literal["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD"]] + _FSDP_ALIASES = ("fsdp", "fsdp_cpu_offload") @@ -117,10 +119,14 @@ class FSDPStrategy(ParallelStrategy, _Sharded): - ``"SHARD_GRAD_OP"``: Shards gradients and optimizer states only. Model parameters get replicated. - ``"NO_SHARD"``: No sharding (identical to regular DDP). - ``"HYBRID_SHARD"``: Shards model parameters, gradients, and optimizer states within a single machine, but - replicates across machines. + replicates across machines. See also the `device_mesh` parameter below. Also accepts a :class:`torch.distributed.fsdp.ShardingStrategy` enum value. + device_mesh: A tuple `(replication size, sharding size)` that defines over how many devices to shard and + replicate the model. The product of the two numbers must equal the world size. Only valid in combination + with the `HYBRID_SHARD` sharding strategy. + state_dict_type: The format in which the state of the model and optimizers gets saved into the checkpoint. - ``"full"``: The full weights and optimizer states get assembled on rank 0 and saved to a single file. 
@@ -146,6 +152,7 @@ def __init__( activation_checkpointing_policy: Optional["_POLICY"] = None, sharding_strategy: "_SHARDING_STRATEGY" = "FULL_SHARD", state_dict_type: Literal["full", "sharded"] = "sharded", + device_mesh: Optional[Union[Tuple[int], "DeviceMesh"]] = None, **kwargs: Any, ) -> None: super().__init__( @@ -163,6 +170,11 @@ def __init__( # Enables joint setup of model and optimizer, multiple optimizer param groups, and `torch.compile()` self._fsdp_kwargs.setdefault("use_orig_params", True) + if device_mesh is not None: + if not _TORCH_GREATER_EQUAL_2_2: + raise ValueError("The `device_mesh` argument is only supported in torch >= 2.2.") + self._fsdp_kwargs["device_mesh"] = device_mesh + self._activation_checkpointing_kwargs = _activation_checkpointing_kwargs( activation_checkpointing, activation_checkpointing_policy ) @@ -244,6 +256,12 @@ def setup_environment(self) -> None: super().setup_environment() self._setup_distributed() + # if 'device_mesh' in the `_fsdp_kwargs` is provided as a tuple, update it into the `DeviceMesh` object here + if isinstance(self._fsdp_kwargs.get("device_mesh"), tuple): + from torch.distributed.device_mesh import init_device_mesh + + self._fsdp_kwargs["device_mesh"] = init_device_mesh("cuda", self._fsdp_kwargs["device_mesh"]) + @override def setup_module_and_optimizers( self, module: Module, optimizers: List[Optimizer] diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 1e3ae02dd2c1b..b47c01592882d 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -22,6 +22,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added a call to `torch.distributed.destroy_process_group` in atexit handler if process group needs destruction ([#19931](https://github.com/Lightning-AI/pytorch-lightning/pull/19931)) +- Added support for configuring hybrid-sharding by passing a tuple for the `FSDPStrategy(device_mesh=...)` argument ([#19504](https://github.com/Lightning-AI/pytorch-lightning/pull/19504)) + ### Changed diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 3c352e8174ddc..90f6c1febdccb 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -16,7 +16,21 @@ from contextlib import contextmanager, nullcontext from datetime import timedelta from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Literal, Mapping, Optional, Set, Type, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + Generator, + List, + Literal, + Mapping, + Optional, + Set, + Tuple, + Type, + Union, +) import torch from lightning_utilities.core.rank_zero import rank_zero_only as utils_rank_zero_only @@ -53,7 +67,10 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1 +from lightning.fabric.utilities.imports import ( + _TORCH_GREATER_EQUAL_2_1, + _TORCH_GREATER_EQUAL_2_2, +) from lightning.fabric.utilities.init import _EmptyInit, _has_meta_device_parameters_or_buffers from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors from lightning.fabric.utilities.optimizer import _optimizers_to_device @@ -70,6 +87,7 @@ from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn if TYPE_CHECKING: + from torch.distributed.device_mesh import DeviceMesh from 
torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision, ShardingStrategy from torch.distributed.fsdp.wrap import ModuleWrapPolicy @@ -114,10 +132,14 @@ class FSDPStrategy(ParallelStrategy): - ``"SHARD_GRAD_OP"``: Shards gradients and optimizer states only. Model parameters get replicated. - ``"NO_SHARD"``: No sharding (identical to regular DDP). - ``"HYBRID_SHARD"``: Shards model parameters, gradients, and optimizer states within a single machine, but - replicates across machines. + replicates across machines. See also the `device_mesh` parameter below. Also accepts a :class:`torch.distributed.fsdp.ShardingStrategy` enum value. + device_mesh: A tuple `(replication size, sharding size)` that defines over how many devices to shard and + replicate the model. The product of the two numbers must equal the world size. Only valid in combination + with the `HYBRID_SHARD` sharding strategy. + state_dict_type: The format in which the state of the model and optimizers gets saved into the checkpoint. - ``"full"``: The full weights and optimizer states get assembled on rank 0 and saved to a single file. @@ -147,6 +169,7 @@ def __init__( activation_checkpointing_policy: Optional["_POLICY"] = None, sharding_strategy: "_SHARDING_STRATEGY" = "FULL_SHARD", state_dict_type: Literal["full", "sharded"] = "full", + device_mesh: Optional[Union[Tuple[int], "DeviceMesh"]] = None, **kwargs: Any, ) -> None: super().__init__( @@ -162,6 +185,12 @@ def __init__( self.cpu_offload = _init_cpu_offload(cpu_offload) self.mixed_precision = mixed_precision self.kwargs = _auto_wrap_policy_kwargs(auto_wrap_policy, kwargs) + + if device_mesh is not None: + if not _TORCH_GREATER_EQUAL_2_2: + raise ValueError("The `device_mesh` argument is only supported in torch >= 2.2.") + self.kwargs["device_mesh"] = device_mesh + self.sharding_strategy = _init_sharding_strategy(sharding_strategy, self.kwargs) # Avoids the need for user to reference params in `configure_optimizers` via @@ -242,6 +271,12 @@ def setup_environment(self) -> None: assert self.cluster_environment is not None _init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) + # if 'device_mesh' in the `kwargs` is provided as a tuple, update it into the `DeviceMesh` object here + if isinstance(self.kwargs.get("device_mesh"), tuple): + from torch.distributed.device_mesh import init_device_mesh + + self.kwargs["device_mesh"] = init_device_mesh("cuda", self.kwargs["device_mesh"]) + def _get_process_group_backend(self) -> str: return self._process_group_backend or _get_default_process_group_backend_for_device(self.root_device) diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py index ed0dda85ffaef..1cf2a4d2f1f63 100644 --- a/tests/tests_fabric/strategies/test_fsdp.py +++ b/tests/tests_fabric/strategies/test_fsdp.py @@ -72,7 +72,7 @@ def test_sharding_strategy(): @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"]) -def test_hybrid_shard_configuration(sharding_strategy): +def test_hybrid_shard_configuration(sharding_strategy, monkeypatch): """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg.""" with pytest.raises(RuntimeError, match="The hybrid sharding strategy requires you to pass at least one of"): FSDPStrategy(sharding_strategy=sharding_strategy) @@ -85,6 +85,11 @@ def test_hybrid_shard_configuration(sharding_strategy): assert strategy.sharding_strategy.name == 
sharding_strategy assert strategy._fsdp_kwargs["process_group"] is process_group + monkeypatch.setattr("lightning.fabric.strategies.fsdp._TORCH_GREATER_EQUAL_2_2", False) + with pytest.raises(ValueError, match="`device_mesh` argument is only supported in torch >= 2.2."): + FSDPStrategy(device_mesh=Mock()) + + monkeypatch.setattr("lightning.fabric.strategies.fsdp._TORCH_GREATER_EQUAL_2_2", True) device_mesh = Mock() strategy = FSDPStrategy(sharding_strategy=sharding_strategy, device_mesh=device_mesh) assert strategy.sharding_strategy.name == sharding_strategy diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py index 5557c07df9960..04eeabbbd7c49 100644 --- a/tests/tests_pytorch/strategies/test_fsdp.py +++ b/tests/tests_pytorch/strategies/test_fsdp.py @@ -501,7 +501,7 @@ def test_sharding_strategy(): @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"]) -def test_hybrid_sharding_strategy(sharding_strategy): +def test_hybrid_shard_configuration(sharding_strategy, monkeypatch): """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg.""" with pytest.raises(RuntimeError, match="The hybrid sharding strategy requires you to pass at least one of"): FSDPStrategy(sharding_strategy=sharding_strategy) @@ -514,6 +514,11 @@ def test_hybrid_sharding_strategy(sharding_strategy): assert strategy.sharding_strategy.name == sharding_strategy assert strategy.kwargs["process_group"] is process_group + monkeypatch.setattr("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_2", False) + with pytest.raises(ValueError, match="`device_mesh` argument is only supported in torch >= 2.2."): + FSDPStrategy(device_mesh=Mock()) + + monkeypatch.setattr("lightning.pytorch.strategies.fsdp._TORCH_GREATER_EQUAL_2_2", True) device_mesh = Mock() strategy = FSDPStrategy(sharding_strategy=sharding_strategy, device_mesh=device_mesh) assert strategy.sharding_strategy.name == sharding_strategy From 812ffdec84189bdee560d4b629df01dd2a6bfb53 Mon Sep 17 00:00:00 2001 From: Mario Vasilev <66969704+mariovas3@users.noreply.github.com> Date: Thu, 6 Jun 2024 01:24:45 +0100 Subject: [PATCH 063/179] Fix `save_last` type annotation for ModelCheckpoint (#19808) --- src/lightning/pytorch/CHANGELOG.md | 3 ++- .../pytorch/callbacks/model_checkpoint.py | 4 ++-- .../checkpointing/test_model_checkpoint.py | 23 +++++++++++++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index b47c01592882d..54ce68c696187 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -57,7 +57,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `WandbLogger.log_hyperparameters()` raising an error if hyperparameters are not JSON serializable ([#19769](https://github.com/Lightning-AI/pytorch-lightning/pull/19769)) -- + +- Fixed an issue with the LightningCLI not being able to set the `ModelCheckpoint(save_last=...)` argument ([#19808](https://github.com/Lightning-AI/pytorch-lightning/pull/19808)) ## [2.2.2] - 2024-04-11 diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py index 6c5dd01df15c7..ba3014274b9b8 100644 --- a/src/lightning/pytorch/callbacks/model_checkpoint.py +++ b/src/lightning/pytorch/callbacks/model_checkpoint.py @@ -27,7 +27,7 @@ from copy import deepcopy from datetime import timedelta from pathlib import Path -from typing import Any, Dict, Literal, Optional, Set +from typing import Any, Dict, Literal, Optional, Set, Union from weakref import proxy import torch @@ -216,7 +216,7 @@ def __init__( filename: Optional[str] = None, monitor: Optional[str] = None, verbose: bool = False, - save_last: Optional[Literal[True, False, "link"]] = None, + save_last: Optional[Union[bool, Literal["link"]]] = None, save_top_k: int = 1, save_weights_only: bool = False, mode: str = "min", diff --git a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py index c911885117e29..006736e0865e3 100644 --- a/tests/tests_pytorch/checkpointing/test_model_checkpoint.py +++ b/tests/tests_pytorch/checkpointing/test_model_checkpoint.py @@ -18,6 +18,7 @@ import time from argparse import Namespace from datetime import timedelta +from inspect import signature from pathlib import Path from typing import Union from unittest import mock @@ -28,6 +29,7 @@ import pytest import torch import yaml +from jsonargparse import ArgumentParser from lightning.fabric.utilities.cloud_io import _load as pl_load from lightning.pytorch import Trainer, seed_everything from lightning.pytorch.callbacks import ModelCheckpoint @@ -1601,3 +1603,24 @@ def test_expand_home(): # it is possible to have a folder with the name `~` checkpoint = ModelCheckpoint(dirpath="./~/checkpoints") assert checkpoint.dirpath == str(Path.cwd() / "~" / "checkpoints") + + +@pytest.mark.parametrize( + ("val", "expected"), + [ + ("yes", True), + ("True", True), + ("true", True), + ("no", False), + ("false", False), + ("False", False), + ("link", "link"), + ], +) +def test_save_last_cli(val, expected): + """Test that the CLI can parse the `save_last` argument correctly (composed type).""" + annot = signature(ModelCheckpoint).parameters["save_last"].annotation + parser = ArgumentParser() + parser.add_argument("--a", type=annot) + args = parser.parse_args(["--a", val]) + assert args.a == expected From a611de0c15c4ef0b30d8cc014bbe5f0383b33e27 Mon Sep 17 00:00:00 2001 From: Bhavay Malhotra <56443877+Bhavay-2001@users.noreply.github.com> Date: Thu, 6 Jun 2024 20:32:01 +0530 Subject: [PATCH 064/179] Removing numpy requirement from all files in examples/pytorch/domain_templates (#19947) --- .../generative_adversarial_net.py | 6 +++--- .../domain_templates/reinforce_learn_Qnet.py | 16 ++++++++-------- .../domain_templates/semantic_segmentation.py | 5 ++--- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/examples/pytorch/domain_templates/generative_adversarial_net.py b/examples/pytorch/domain_templates/generative_adversarial_net.py index 311d1c38771b8..417e167df0d93 100644 --- a/examples/pytorch/domain_templates/generative_adversarial_net.py +++ 
b/examples/pytorch/domain_templates/generative_adversarial_net.py @@ -19,9 +19,9 @@ """ +import math from argparse import ArgumentParser, Namespace -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -59,7 +59,7 @@ def block(in_feat, out_feat, normalize=True): *block(128, 256), *block(256, 512), *block(512, 1024), - nn.Linear(1024, int(np.prod(img_shape))), + nn.Linear(1024, int(math.prod(img_shape))), nn.Tanh(), ) @@ -80,7 +80,7 @@ def __init__(self, img_shape): super().__init__() self.model = nn.Sequential( - nn.Linear(int(np.prod(img_shape)), 512), + nn.Linear(int(math.prod(img_shape)), 512), nn.LeakyReLU(0.2, inplace=True), nn.Linear(512, 256), nn.LeakyReLU(0.2, inplace=True), diff --git a/examples/pytorch/domain_templates/reinforce_learn_Qnet.py b/examples/pytorch/domain_templates/reinforce_learn_Qnet.py index 9b065db8173fe..497cb658c275f 100644 --- a/examples/pytorch/domain_templates/reinforce_learn_Qnet.py +++ b/examples/pytorch/domain_templates/reinforce_learn_Qnet.py @@ -33,11 +33,11 @@ """ import argparse +import random from collections import OrderedDict, deque, namedtuple from typing import Iterator, List, Tuple import gym -import numpy as np import torch import torch.nn as nn import torch.optim as optim @@ -103,15 +103,15 @@ def append(self, experience: Experience) -> None: self.buffer.append(experience) def sample(self, batch_size: int) -> Tuple: - indices = np.random.choice(len(self.buffer), batch_size, replace=False) + indices = random.sample(range(len(self.buffer)), batch_size) states, actions, rewards, dones, next_states = zip(*(self.buffer[idx] for idx in indices)) return ( - np.array(states), - np.array(actions), - np.array(rewards, dtype=np.float32), - np.array(dones, dtype=np.bool), - np.array(next_states), + torch.tensor(states), + torch.tensor(actions), + torch.tensor(rewards, dtype=torch.float32), + torch.tensor(dones, dtype=torch.bool), + torch.tensor(next_states), ) @@ -175,7 +175,7 @@ def get_action(self, net: nn.Module, epsilon: float, device: str) -> int: action """ - if np.random.random() < epsilon: + if random.random() < epsilon: action = self.env.action_space.sample() else: state = torch.tensor([self.state]) diff --git a/examples/pytorch/domain_templates/semantic_segmentation.py b/examples/pytorch/domain_templates/semantic_segmentation.py index c60518e229cbf..12ecbeeb5f0a9 100644 --- a/examples/pytorch/domain_templates/semantic_segmentation.py +++ b/examples/pytorch/domain_templates/semantic_segmentation.py @@ -16,7 +16,6 @@ import random from argparse import ArgumentParser, Namespace -import numpy as np import torch import torch.nn.functional as F import torchvision.transforms as transforms @@ -107,11 +106,11 @@ def __len__(self): def __getitem__(self, idx): img = Image.open(self.img_list[idx]) img = img.resize(self.img_size) - img = np.array(img) + img = torch.tensor(img) mask = Image.open(self.mask_list[idx]).convert("L") mask = mask.resize(self.img_size) - mask = np.array(mask) + mask = torch.tensor(mask) mask = self.encode_segmap(mask) if self.transform: From 4f96c83ba0b5ad74b204f54b78668168886c1622 Mon Sep 17 00:00:00 2001 From: Douwe den Blanken Date: Thu, 6 Jun 2024 20:51:48 +0200 Subject: [PATCH 065/179] Sanitize argument-free object params before logging (#19771) Co-authored-by: awaelchli --- src/lightning/fabric/CHANGELOG.md | 2 ++ src/lightning/fabric/utilities/logger.py | 7 ++++++- tests/tests_fabric/utilities/test_logger.py | 14 +++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git 
a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 2ee0243a0d5a5..419b663e55c25 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added sanitization for classes before logging them as hyperparameters ([#19771](https://github.com/Lightning-AI/pytorch-lightning/pull/19771)) + - Enabled consolidating distributed checkpoints through `fabric consolidate` in the new CLI ([#19560](https://github.com/Lightning-AI/pytorch-lightning/pull/19560)) - Added the ability to explicitly mark forward methods in Fabric via `_FabricModule.mark_forward_method()` ([#19690](https://github.com/Lightning-AI/pytorch-lightning/pull/19690)) diff --git a/src/lightning/fabric/utilities/logger.py b/src/lightning/fabric/utilities/logger.py index abe5816deda1a..839cbbaa225b3 100644 --- a/src/lightning/fabric/utilities/logger.py +++ b/src/lightning/fabric/utilities/logger.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +import inspect import json from argparse import Namespace from dataclasses import asdict, is_dataclass @@ -52,8 +54,11 @@ def _sanitize_callable_params(params: Dict[str, Any]) -> Dict[str, Any]: """ def _sanitize_callable(val: Any) -> Any: - # Give them one chance to return a value. Don't go rabbit hole of recursive call + if inspect.isclass(val): + # If it's a class, don't try to instantiate it, just return the name + return val.__name__ if callable(val): + # Callables get a chance to return a name try: _val = val() if callable(_val): diff --git a/tests/tests_fabric/utilities/test_logger.py b/tests/tests_fabric/utilities/test_logger.py index 33681c65f7fa2..0f6500cb42be1 100644 --- a/tests/tests_fabric/utilities/test_logger.py +++ b/tests/tests_fabric/utilities/test_logger.py @@ -92,7 +92,7 @@ class B: def test_sanitize_callable_params(): - """Callback function are not serializiable. + """Callback functions are not serializable. Therefore, we get them a chance to return something and if the returned type is not accepted, return None. 
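The net effect of the sanitization added above: bare classes passed as hyperparameters are logged by name instead of being called in an attempt to obtain a value. A rough sketch of the expected behavior (illustrative only; the parameter names are made up):

```python
from lightning.fabric.utilities.logger import _sanitize_callable_params

class Encoder:  # a hypothetical class handed over as a hyperparameter
    pass

params = {"encoder_cls": Encoder, "backbone": lambda: "resnet50"}
sanitized = _sanitize_callable_params(params)
assert sanitized["encoder_cls"] == "Encoder"  # classes are reported by name, never instantiated
assert sanitized["backbone"] == "resnet50"  # other callables still get one chance to return a value
```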
@@ -104,11 +104,21 @@ def return_something():
     def wrapper_something():
         return return_something
 
+    class ClassNoArgs:
+        def __init__(self):
+            pass
+
+    class ClassWithCall:
+        def __call__(self):
+            return "name"
+
     params = Namespace(
         foo="bar",
         something=return_something,
         wrapper_something_wo_name=(lambda: lambda: "1"),
         wrapper_something=wrapper_something,
+        class_no_args=ClassNoArgs,
+        class_with_call=ClassWithCall,
     )
 
     params = _convert_params(params)
@@ -118,6 +128,8 @@ def wrapper_something():
     assert params["something"] == "something"
     assert params["wrapper_something"] == "wrapper_something"
     assert params["wrapper_something_wo_name"] == "<lambda>"
+    assert params["class_no_args"] == "ClassNoArgs"
+    assert params["class_with_call"] == "ClassWithCall"
 
 
 def test_sanitize_params():
From 5fa32d95e3c28354c360d23dbee85ddd8507e5f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Barz?=
Date: Fri, 7 Jun 2024 00:36:28 +0200
Subject: [PATCH 066/179] Ignore parameters causing ValueError when dumping to YAML (#19804)

---
 src/lightning/pytorch/CHANGELOG.md         |  2 ++
 src/lightning/pytorch/core/saving.py       |  2 +-
 tests/tests_pytorch/models/test_hparams.py | 10 +++++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
index 54ce68c696187..2b76b36902977 100644
--- a/src/lightning/pytorch/CHANGELOG.md
+++ b/src/lightning/pytorch/CHANGELOG.md
@@ -60,6 +60,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed an issue with the LightningCLI not being able to set the `ModelCheckpoint(save_last=...)` argument ([#19808](https://github.com/Lightning-AI/pytorch-lightning/pull/19808))
 
+- Fixed an issue causing ValueError for certain object such as TorchMetrics when dumping hyperparameters to YAML ([#19804](https://github.com/Lightning-AI/pytorch-lightning/pull/19804))
+
 
 ## [2.2.2] - 2024-04-11
 
diff --git a/src/lightning/pytorch/core/saving.py b/src/lightning/pytorch/core/saving.py
index f8e9c8300337a..521192f500b53 100644
--- a/src/lightning/pytorch/core/saving.py
+++ b/src/lightning/pytorch/core/saving.py
@@ -359,7 +359,7 @@ def save_hparams_to_yaml(config_yaml: _PATH, hparams: Union[dict, Namespace], us
         try:
             v = v.name if isinstance(v, Enum) else v
             yaml.dump(v)
-        except TypeError:
+        except (TypeError, ValueError):
             warn(f"Skipping '{k}' parameter because it is not possible to safely dump to YAML.")
             hparams[k] = type(v).__name__
         else:
diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py
index 0d7fced3b8197..e8a3cf680170b 100644
--- a/tests/tests_pytorch/models/test_hparams.py
+++ b/tests/tests_pytorch/models/test_hparams.py
@@ -552,7 +552,7 @@ def test_hparams_pickle_warning(tmp_path):
     trainer.fit(model)
 
 
-def test_hparams_save_yaml(tmp_path):
+def test_save_hparams_to_yaml(tmp_path):
     class Options(str, Enum):
         option1name = "option1val"
         option2name = "option2val"
@@ -590,6 +590,14 @@ def _compare_params(loaded_params, default_params: dict):
     _compare_params(load_hparams_from_yaml(path_yaml), hparams)
 
 
+def test_save_hparams_to_yaml_warning(tmp_path):
+    """Test that we warn about unserializable parameters that need to be dropped."""
+    path_yaml = tmp_path / "hparams.yaml"
+    hparams = {"torch_type": torch.float32}
+    with pytest.warns(UserWarning, match="Skipping 'torch_type' parameter"):
+        save_hparams_to_yaml(path_yaml, hparams)
+
+
 class NoArgsSubClassBoringModel(CustomBoringModel):
     def __init__(self):
         super().__init__()
From
06ea3a05716a6d1f4a96cfb25021accdd18d8146 Mon Sep 17 00:00:00 2001 From: Alexander Jipa Date: Fri, 7 Jun 2024 10:52:58 -0400 Subject: [PATCH 067/179] Fix resetting epoch loop restarting flag in LearningRateFinder (#19819) --- src/lightning/pytorch/CHANGELOG.md | 3 +++ src/lightning/pytorch/tuner/lr_finder.py | 1 + tests/tests_pytorch/tuner/test_lr_finder.py | 1 + 3 files changed, 5 insertions(+) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 2b76b36902977..1d6c6609109e4 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -63,6 +63,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an issue causing ValueError for certain object such as TorchMetrics when dumping hyperparameters to YAML ([#19804](https://github.com/Lightning-AI/pytorch-lightning/pull/19804)) +- Fixed resetting `epoch_loop.restarting` to avoid full validation run after `LearningRateFinder` ([#19818](https://github.com/Lightning-AI/pytorch-lightning/issues/19818)) + + ## [2.2.2] - 2024-04-11 ### Fixed diff --git a/src/lightning/pytorch/tuner/lr_finder.py b/src/lightning/pytorch/tuner/lr_finder.py index 4997e23070df7..17a2063e50212 100644 --- a/src/lightning/pytorch/tuner/lr_finder.py +++ b/src/lightning/pytorch/tuner/lr_finder.py @@ -301,6 +301,7 @@ def _lr_find( trainer._checkpoint_connector.restore(ckpt_path) trainer.strategy.remove_checkpoint(ckpt_path) trainer.fit_loop.restarting = False # reset restarting flag as checkpoint restoring sets it to True + trainer.fit_loop.epoch_loop.restarting = False # reset restarting flag as checkpoint restoring sets it to True trainer.fit_loop.epoch_loop.val_loop._combined_loader = None return lr_finder diff --git a/tests/tests_pytorch/tuner/test_lr_finder.py b/tests/tests_pytorch/tuner/test_lr_finder.py index a0d1d70aa36c4..a31be67911409 100644 --- a/tests/tests_pytorch/tuner/test_lr_finder.py +++ b/tests/tests_pytorch/tuner/test_lr_finder.py @@ -434,6 +434,7 @@ def lr_find(self, trainer, pl_module) -> None: super().lr_find(trainer, pl_module) pl_module._expected_max_steps = None assert not trainer.fit_loop.restarting + assert not trainer.fit_loop.epoch_loop.restarting def on_train_epoch_start(self, trainer, pl_module): if trainer.current_epoch in self.milestones or trainer.current_epoch == 0: From fa5da26e39bc9b19afa827b16789644ca3271dea Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 11 Jun 2024 10:04:51 -0400 Subject: [PATCH 068/179] Update README.md (#19968) --- README.md | 90 +++++++++++++++---------------------------------------- 1 file changed, 24 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index e9d5666c170eb..40ae9714d63a9 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,10 @@ ______________________________________________________________________

- Lightning.ai • + Lightning AI • + ExamplesPyTorch LightningFabric • - Lightning AppsDocsCommunityContribute • @@ -92,15 +92,11 @@ pip install -iU https://test.pypi.org/simple/ pytorch-lightning ______________________________________________________________________ -## Lightning has 4 core packages +## Lightning has 2 core packages [PyTorch Lightning: Train and deploy PyTorch at scale](#pytorch-lightning-train-and-deploy-pytorch-at-scale).
[Lightning Fabric: Expert control](#lightning-fabric-expert-control). -
-[Lightning Data: Blazing fast, distributed streaming of training data from cloud storage](https://github.com/Lightning-AI/pytorch-lightning/tree/master/src/lightning/data). -
-[Lightning Apps: Build AI products and ML workflows](#lightning-apps-build-ai-products-and-ml-workflows). Lightning gives you granular control over how much abstraction you want to add over PyTorch. @@ -108,7 +104,9 @@ Lightning gives you granular control over how much abstraction you want to add o

-______________________________________________________________________ +  +  + # PyTorch Lightning: Train and Deploy PyTorch at Scale @@ -118,6 +116,15 @@ PyTorch Lightning is just organized PyTorch - Lightning disentangles PyTorch cod ______________________________________________________________________ +### Examples +Explore various types of training possible with PyTorch Lightning. Pretrain and finetune ANY kind of model to perform ANY task like classification, segmentation, summarization and more: + +| Task | Description | Run | +|---|---|---| +| [Hello world](#hello-simple-model) | Pretrain - Hello world example | Open In Studio | +| [Image segmentation](https://lightning.ai/lightning-ai/studios/image-segmentation-with-pytorch-lightning) | Finetune - ResNet-50 model to segment images | Open In Studio | +| [Text classification](https://lightning.ai/lightning-ai/studios/text-classification-with-pytorch-lightning) | Finetune - text classifier (BERT model) | Open In Studio | + ### Hello simple model ```python @@ -319,6 +326,9 @@ ______________________________________________________________________ ______________________________________________________________________ +  +  + # Lightning Fabric: Expert control. Run on any device at any scale with expert-level control over PyTorch training loop and scaling strategy. You can even write your own Trainer. @@ -501,62 +511,8 @@ ______________________________________________________________________ ______________________________________________________________________ -# Lightning Apps: Build AI products and ML workflows - -Lightning Apps remove the cloud infrastructure boilerplate so you can focus on solving the research or business problems. Lightning Apps can run on the Lightning Cloud, your own cluster or a private cloud. - -
- -
- -## Hello Lightning app world - -```python -# app.py -import lightning as L - - -class TrainComponent(L.LightningWork): - def run(self, x): - print(f"train a model on {x}") - - -class AnalyzeComponent(L.LightningWork): - def run(self, x): - print(f"analyze model on {x}") - - -class WorkflowOrchestrator(L.LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent(cloud_compute=L.CloudCompute("cpu")) - self.analyze = AnalyzeComponent(cloud_compute=L.CloudCompute("gpu")) - - def run(self): - self.train.run("CPU machine 1") - self.analyze.run("GPU machine 2") - - -app = L.LightningApp(WorkflowOrchestrator()) -``` - -Run on the cloud or locally - -```bash -# run on the cloud -lightning run app app.py --setup --cloud - -# run locally -lightning run app app.py -``` - -______________________________________________________________________ - - - -______________________________________________________________________ +  +  ## Examples @@ -587,7 +543,8 @@ ______________________________________________________________________ - [Logistic Regression](https://lightning-bolts.readthedocs.io/en/stable/models/classic_ml.html#logistic-regression) - [Linear Regression](https://lightning-bolts.readthedocs.io/en/stable/models/classic_ml.html#linear-regression) -______________________________________________________________________ +  +  ## Continuous Integration @@ -611,7 +568,8 @@ Lightning is rigorously tested across multiple CPUs, GPUs and TPUs and against m -______________________________________________________________________ +  +  ## Community From a97814af135f1f3e393dc9fff7279596e5446d29 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 11 Jun 2024 11:01:22 -0400 Subject: [PATCH 069/179] Update README.md --- README.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 40ae9714d63a9..8af11d2a54167 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ ______________________________________________________________________

Lightning AIExamples • - PyTorch Lightning • - Fabric • + PyTorch Lightning • + FabricDocsCommunityContribute • @@ -39,6 +39,20 @@ ______________________________________________________________________

+
+ +

+ +  + + + Get started + + +

+ +
+ ## Install Lightning Simple installation from PyPI @@ -329,7 +343,7 @@ ______________________________________________________________________     -# Lightning Fabric: Expert control. +# Lightning Fabric: Expert control Run on any device at any scale with expert-level control over PyTorch training loop and scaling strategy. You can even write your own Trainer. From f6fd046552a1504023cb3386a8a0df418a810e4f Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 11 Jun 2024 18:38:56 +0200 Subject: [PATCH 070/179] Release 2.3.0 (#19954) --- src/lightning/data/README.md | 4 ++-- src/lightning/fabric/CHANGELOG.md | 32 +----------------------------- src/lightning/pytorch/CHANGELOG.md | 31 +---------------------------- src/version.info | 2 +- 4 files changed, 5 insertions(+), 64 deletions(-) diff --git a/src/lightning/data/README.md b/src/lightning/data/README.md index efd51a37e48a0..525a7e14f894d 100644 --- a/src/lightning/data/README.md +++ b/src/lightning/data/README.md @@ -15,7 +15,7 @@ We developed `StreamingDataset` to optimize training of large datasets stored on Specifically crafted for multi-gpu & multi-node (with [DDP](https://lightning.ai/docs/pytorch/stable/accelerators/gpu_intermediate.html), [FSDP](https://lightning.ai/docs/pytorch/stable/advanced/model_parallel/fsdp.html), etc...), distributed training with large models, it enhances accuracy, performance, and user-friendliness. Now, training efficiently is possible regardless of the data's location. Simply stream in the required data when needed. -The `StreamingDataset` is compatible with any data type, including **images, text, video, audio, geo-spatial, and multimodal data** and it is a drop-in replacement for your PyTorch [IterableDataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset) class. For example, it is used by [Lit-GPT](https://github.com/Lightning-AI/lit-gpt/blob/main/pretrain/tinyllama.py) to pretrain LLMs. +The `StreamingDataset` is compatible with any data type, including **images, text, video, audio, geo-spatial, and multimodal data** and it is a drop-in replacement for your PyTorch [IterableDataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset) class. For example, it is used by [Lit-GPT](https://github.com/Lightning-AI/lit-gpt/blob/main/litgpt/data/tinyllama.py) to pretrain LLMs.
@@ -284,7 +284,7 @@ for batch in tqdm(train_dataloader): Lightning Data provides a stateful `StreamingDataLoader`. This simplifies resuming training over large datasets. -Note: The `StreamingDataLoader` is used by [Lit-GPT](https://github.com/Lightning-AI/lit-gpt/blob/main/pretrain/tinyllama.py) to pretrain LLMs. The statefulness still works when using a mixture of datasets with the `CombinedStreamingDataset`. +Note: The `StreamingDataLoader` is used by [Lit-GPT](https://github.com/Lightning-AI/lit-gpt/blob/main/litgpt/data/tinyllama.py) to pretrain LLMs. The statefulness still works when using a mixture of datasets with the `CombinedStreamingDataset`. ```python import os diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 419b663e55c25..102783ea81849 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -5,63 +5,33 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unReleased] - 2024-MM-DD +## [2.3.0] - 2024-06-13 ### Added - Added sanitization for classes before logging them as hyperparameters ([#19771](https://github.com/Lightning-AI/pytorch-lightning/pull/19771)) - - Enabled consolidating distributed checkpoints through `fabric consolidate` in the new CLI ([#19560](https://github.com/Lightning-AI/pytorch-lightning/pull/19560)) - - Added the ability to explicitly mark forward methods in Fabric via `_FabricModule.mark_forward_method()` ([#19690](https://github.com/Lightning-AI/pytorch-lightning/pull/19690)) - - Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) - - Added `ModelParallelStrategy` to support 2D parallelism ([#19846](https://github.com/Lightning-AI/pytorch-lightning/pull/19846), [#19852](https://github.com/Lightning-AI/pytorch-lightning/pull/19852), [#19870](https://github.com/Lightning-AI/pytorch-lightning/pull/19870), [#19872](https://github.com/Lightning-AI/pytorch-lightning/pull/19872)) - - Added a call to `torch.distributed.destroy_process_group` in atexit handler if process group needs destruction ([#19931](https://github.com/Lightning-AI/pytorch-lightning/pull/19931)) - - Added support for configuring hybrid-sharding by passing a tuple for the `FSDPStrategy(device_mesh=...)` argument ([#19504](https://github.com/Lightning-AI/pytorch-lightning/pull/19504)) - ### Changed - Renamed `lightning run model` to `fabric run` ([#19442](https://github.com/Lightning-AI/pytorch-lightning/pull/19442), [#19527](https://github.com/Lightning-AI/pytorch-lightning/pull/19527)) - - - The `Fabric.rank_zero_first` context manager now uses a barrier without timeout to avoid long-running tasks to be interrupted ([#19448](https://github.com/Lightning-AI/lightning/pull/19448)) - - - Fabric now raises an error if you forget to call `fabric.backward()` when it is needed by the strategy or precision selection ([#19447](https://github.com/Lightning-AI/lightning/pull/19447), [#19493](https://github.com/Lightning-AI/lightning/pull/19493)) - - - `_BackwardSyncControl` can now control what to do when gradient accumulation is disabled ([#19577](https://github.com/Lightning-AI/lightning/pull/19577)) - -### Deprecated - -- - -- - -- - ### Removed - Removed support for PyTorch 1.13 ([#19706](https://github.com/Lightning-AI/lightning/pull/19706)) -- - -- - ### Fixed - Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) 
([#19886](https://github.com/Lightning-AI/lightning/pull/19886)) -- - -- - ## [2.2.2] - 2024-04-11 diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 1d6c6609109e4..34ef2aa42189a 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -4,65 +4,36 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unReleased] - 2024-MM-DD +## [2.3.0] - 2024-06-13 ### Added - The `ModelSummary` and `RichModelSummary` callbacks now display the training mode of each layer in the column "Mode" ([#19468](https://github.com/Lightning-AI/lightning/pull/19468)) - - Added `load_from_checkpoint` support for `LightningCLI` when using dependency injection ([#18105](https://github.com/Lightning-AI/lightning/pull/18105)) - - Added robust timer duration parsing with an informative error message when parsing fails ([#19513](https://github.com/Lightning-AI/pytorch-lightning/pull/19513)) - - Added `on_exception` hook to `LightningDataModule` ([#19601](https://github.com/Lightning-AI/pytorch-lightning/pull/19601)) - - Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708)) - - Added `ModelParallelStrategy` to support 2D parallelism ([#19878](https://github.com/Lightning-AI/pytorch-lightning/pull/19878), [#19888](https://github.com/Lightning-AI/pytorch-lightning/pull/19888)) - - Added a call to `torch.distributed.destroy_process_group` in atexit handler if process group needs destruction ([#19931](https://github.com/Lightning-AI/pytorch-lightning/pull/19931)) - - Added support for configuring hybrid-sharding by passing a tuple for the `FSDPStrategy(device_mesh=...)` argument ([#19504](https://github.com/Lightning-AI/pytorch-lightning/pull/19504)) - ### Changed - The `prepare_data()` hook in `LightningModule` and `LightningDataModule` is now subject to a barrier without timeout to avoid long-running tasks to be interrupted ([#19448](https://github.com/Lightning-AI/lightning/pull/19448)) - - Relaxed the requirement for custom batch samplers to expose `drop_last` for prediction ([#19678](https://github.com/Lightning-AI/pytorch-lightning/pull/19678)) - - It is no longer allowed to skip `training_step()` by returning `None` in distributed training ([#19918](https://github.com/Lightning-AI/pytorch-lightning/pull/19918)) - -### Deprecated - -- - -- - -- - ### Removed - Removed the Bagua integration (`Trainer(strategy="bagua")`) ([#19445](https://github.com/Lightning-AI/lightning/pull/19445)) - - Removed support for PyTorch 1.13 ([#19706](https://github.com/Lightning-AI/lightning/pull/19706)) -- - ### Fixed - Fixed a matrix shape mismatch issue when running a model loaded from a quantized checkpoint (bitsandbytes) ([#19886](https://github.com/Lightning-AI/lightning/pull/19886)) - - - Fixed `WandbLogger.log_hyperparameters()` raising an error if hyperparameters are not JSON serializable ([#19769](https://github.com/Lightning-AI/pytorch-lightning/pull/19769)) - - - Fixed an issue with the LightningCLI not being able to set the `ModelCheckpoint(save_last=...)` argument ([#19808](https://github.com/Lightning-AI/pytorch-lightning/pull/19808)) - - Fixed an issue causing ValueError for certain object such as TorchMetrics when dumping hyperparameters to YAML ([#19804](https://github.com/Lightning-AI/pytorch-lightning/pull/19804)) - - - Fixed resetting `epoch_loop.restarting` to avoid full validation run after 
`LearningRateFinder` ([#19818](https://github.com/Lightning-AI/pytorch-lightning/issues/19818)) diff --git a/src/version.info b/src/version.info index c1ffed065aef0..276cbf9e2858c 100644 --- a/src/version.info +++ b/src/version.info @@ -1 +1 @@ -2.3.0dev +2.3.0 From a42484cf8eaa02e45921205bb5dfbae1a0d14cb9 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 13 Jun 2024 21:58:34 +0200 Subject: [PATCH 071/179] Fix failing app tests (#19971) --- requirements/app/app.txt | 2 +- tests/tests_app/cli/test_cmd_launch.py | 2 ++ tests/tests_app/core/test_lightning_api.py | 2 +- tests/tests_app/core/test_lightning_app.py | 1 + tests/tests_app/utilities/test_network.py | 16 ++++++++-------- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/requirements/app/app.txt b/requirements/app/app.txt index d2bffc5cecfd3..a59e0b5ca5c28 100644 --- a/requirements/app/app.txt +++ b/requirements/app/app.txt @@ -23,7 +23,7 @@ Jinja2 <3.2.0 PyYAML <=6.0.1 requests <2.32.0 rich >=12.3.0, <13.6.0 -urllib3 <2.1.0 +urllib3 <2.0.0 uvicorn <0.24.0 websocket-client <1.7.0 websockets <11.1.0 diff --git a/tests/tests_app/cli/test_cmd_launch.py b/tests/tests_app/cli/test_cmd_launch.py index 4b75c08de5dce..167e896fba11c 100644 --- a/tests/tests_app/cli/test_cmd_launch.py +++ b/tests/tests_app/cli/test_cmd_launch.py @@ -7,6 +7,7 @@ from unittest import mock from unittest.mock import ANY, MagicMock, Mock +import pytest from click.testing import CliRunner from lightning.app.cli.lightning_cli_launch import run_flow, run_flow_and_servers, run_frontend, run_server from lightning.app.core.queues import QueuingSystem @@ -189,6 +190,7 @@ def start_processes(**functions): @_RunIf(skip_windows=True) +@pytest.mark.flaky(reruns=3) def test_manage_server_processes_one_process_gets_killed(capfd): functions = {"p1": run_forever_process, "p2": run_for_2_seconds_and_raise} p = Process(target=start_processes, kwargs=functions) diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py index 65ac6fcab2bf7..9b80d540c17e0 100644 --- a/tests/tests_app/core/test_lightning_api.py +++ b/tests/tests_app/core/test_lightning_api.py @@ -404,7 +404,7 @@ async def test_frontend_routes(path, expected_status_code): assert response.status_code == expected_status_code -@pytest.mark.xfail(sys.platform == "linux", reason="No idea why... need to be fixed") # fixme +@pytest.mark.xfail(sys.platform == "linux", strict=False, reason="No idea why... 
need to be fixed") # fixme def test_start_server_started(): """This test ensures has_started_queue receives a signal when the REST API has started.""" api_publish_state_queue = mp.Queue() diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index 08a2d7c641b29..a70cbb853e437 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -1084,6 +1084,7 @@ def run_once(self): return res +@pytest.mark.flaky(reruns=3) def test_lightning_app_has_updated(): app = TestLightningHasUpdatedApp(FlowPath()) MultiProcessRuntime(app, start_server=False).dispatch() diff --git a/tests/tests_app/utilities/test_network.py b/tests/tests_app/utilities/test_network.py index 3a14c0301ef1e..999c75da57dd7 100644 --- a/tests/tests_app/utilities/test_network.py +++ b/tests/tests_app/utilities/test_network.py @@ -48,10 +48,10 @@ def test_find_free_network_port_cloudspace(_, patch_constants): @mock.patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") def test_http_client_retry_post(getconn_mock): getconn_mock.return_value.getresponse.side_effect = [ - mock.Mock(status=500, msg=HTTPMessage()), - mock.Mock(status=599, msg=HTTPMessage()), - mock.Mock(status=405, msg=HTTPMessage()), - mock.Mock(status=200, msg=HTTPMessage()), + mock.Mock(status=500, msg=HTTPMessage(), headers={}), + mock.Mock(status=599, msg=HTTPMessage(), headers={}), + mock.Mock(status=405, msg=HTTPMessage(), headers={}), + mock.Mock(status=200, msg=HTTPMessage(), headers={}), ] client = HTTPClient(base_url="http://test.url") @@ -69,10 +69,10 @@ def test_http_client_retry_post(getconn_mock): @mock.patch("urllib3.connectionpool.HTTPConnectionPool._get_conn") def test_http_client_retry_get(getconn_mock): getconn_mock.return_value.getresponse.side_effect = [ - mock.Mock(status=500, msg=HTTPMessage()), - mock.Mock(status=599, msg=HTTPMessage()), - mock.Mock(status=405, msg=HTTPMessage()), - mock.Mock(status=200, msg=HTTPMessage()), + mock.Mock(status=500, msg=HTTPMessage(), headers={}), + mock.Mock(status=599, msg=HTTPMessage(), headers={}), + mock.Mock(status=405, msg=HTTPMessage(), headers={}), + mock.Mock(status=200, msg=HTTPMessage(), headers={}), ] client = HTTPClient(base_url="http://test.url") From bb511b0bafce2826346fdb84fbe4f80ec198ce60 Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Thu, 13 Jun 2024 18:26:46 -0400 Subject: [PATCH 072/179] Fix minor typo in Trainer's documentation (#19969) --- src/lightning/pytorch/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py index d9d13602700cf..ce9801593e397 100644 --- a/src/lightning/pytorch/trainer/trainer.py +++ b/src/lightning/pytorch/trainer/trainer.py @@ -206,7 +206,7 @@ def __init__( across epochs or during iteration-based training. Default: ``1.0``. - check_val_every_n_epoch: Perform a validation loop every after every `N` training epochs. If ``None``, + check_val_every_n_epoch: Perform a validation loop after every `N` training epochs. If ``None``, validation will be done solely based on the number of training batches, requiring ``val_check_interval`` to be an integer value. Default: ``1``. 
From b16e998a6e950c3d77a433ada28d2c9b2534431d Mon Sep 17 00:00:00 2001 From: PL Ghost <75324987+pl-ghost@users.noreply.github.com> Date: Sun, 16 Jun 2024 15:37:39 +0200 Subject: [PATCH 073/179] Adding test for legacy checkpoint created with 2.3.0 (#19974) --- tests/legacy/back-compatible-versions.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/legacy/back-compatible-versions.txt b/tests/legacy/back-compatible-versions.txt index 1243a2fad62f8..2495d0f84ea95 100644 --- a/tests/legacy/back-compatible-versions.txt +++ b/tests/legacy/back-compatible-versions.txt @@ -100,3 +100,4 @@ 2.2.1 2.2.2 2.2.5 +2.3.0 From c1af4d05279af7a4630d2a27b57bd85699797465 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 16 Jun 2024 16:43:42 +0200 Subject: [PATCH 074/179] Better graceful shutdown for KeyboardInterrupt (#19976) --- src/lightning/fabric/CHANGELOG.md | 34 ++++++++++++++++++ src/lightning/fabric/utilities/distributed.py | 4 +++ src/lightning/pytorch/CHANGELOG.md | 35 +++++++++++++++++++ .../strategies/launchers/multiprocessing.py | 2 +- .../strategies/launchers/subprocess_script.py | 2 +- src/lightning/pytorch/trainer/call.py | 20 +++++++---- .../trainer/connectors/signal_connector.py | 11 +++--- .../progress/test_rich_progress_bar.py | 2 +- .../callbacks/test_lambda_function.py | 9 +++-- tests/tests_pytorch/trainer/test_states.py | 3 +- tests/tests_pytorch/trainer/test_trainer.py | 34 +++++++++++++++--- 11 files changed, 134 insertions(+), 22 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 102783ea81849..37322981c503e 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -5,6 +5,40 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [unreleased] - YYYY-MM-DD + +### Added + +- + +- + +### Changed + +- + +- + +### Deprecated + +- + +- + +### Removed + +- + +- + +### Fixed + +- + +- + + + ## [2.3.0] - 2024-06-13 ### Added diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index bb20b889ec626..75b2f7c580b6f 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -2,6 +2,7 @@ import contextlib import logging import os +import signal import time from contextlib import nullcontext from datetime import timedelta @@ -306,8 +307,11 @@ def _init_dist_connection( def _destroy_dist_connection() -> None: + # Don't allow Ctrl+C to interrupt this handler + signal.signal(signal.SIGINT, signal.SIG_IGN) if _distributed_is_initialized(): torch.distributed.destroy_process_group() + signal.signal(signal.SIGINT, signal.SIG_DFL) def _get_default_process_group_backend_for_device(device: torch.device) -> str: diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 34ef2aa42189a..08562a9eb8dca 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -4,6 +4,41 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+ +## [unreleased] - YYYY-MM-DD + +### Added + +- + +- + +### Changed + +- Triggering KeyboardInterrupt (Ctrl+C) during `.fit()`, `.evaluate()`, `.test()` or `.predict()` now terminates all processes launched by the Trainer and exits the program ([#19976](https://github.com/Lightning-AI/pytorch-lightning/pull/19976)) + +- + +### Deprecated + +- + +- + +### Removed + +- + +- + +### Fixed + +- + +- + + + ## [2.3.0] - 2024-06-13 ### Added diff --git a/src/lightning/pytorch/strategies/launchers/multiprocessing.py b/src/lightning/pytorch/strategies/launchers/multiprocessing.py index aa96da63adb65..58d9f2b16e0f3 100644 --- a/src/lightning/pytorch/strategies/launchers/multiprocessing.py +++ b/src/lightning/pytorch/strategies/launchers/multiprocessing.py @@ -259,7 +259,7 @@ def update_main_process_results(self, trainer: "pl.Trainer", extra: Dict[str, An def kill(self, signum: _SIGNUM) -> None: for proc in self.procs: if proc.is_alive() and proc.pid is not None: - log.info(f"pid {os.getpid()} killing {proc.pid} with {signum}") + log.debug(f"Process {os.getpid()} is terminating {proc.pid} with {signum}") with suppress(ProcessLookupError): os.kill(proc.pid, signum) diff --git a/src/lightning/pytorch/strategies/launchers/subprocess_script.py b/src/lightning/pytorch/strategies/launchers/subprocess_script.py index 03dbbc52365fb..d2035d03d2589 100644 --- a/src/lightning/pytorch/strategies/launchers/subprocess_script.py +++ b/src/lightning/pytorch/strategies/launchers/subprocess_script.py @@ -107,7 +107,7 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] @override def kill(self, signum: _SIGNUM) -> None: for proc in self.procs: - log.info(f"pid {os.getpid()} killing {proc.pid} with {signum}") + log.debug(f"Process {os.getpid()} is terminating {proc.pid} with {signum}") # this skips subprocesses already terminated proc.send_signal(signum) diff --git a/src/lightning/pytorch/trainer/call.py b/src/lightning/pytorch/trainer/call.py index befd7f0df84dc..4c3bc5ef41bdd 100644 --- a/src/lightning/pytorch/trainer/call.py +++ b/src/lightning/pytorch/trainer/call.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +import signal from copy import deepcopy from typing import Any, Callable, Dict, Optional, Type, Union @@ -20,10 +21,12 @@ import lightning.pytorch as pl from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin from lightning.pytorch.callbacks import Checkpoint, EarlyStopping +from lightning.pytorch.strategies.launchers import _SubprocessScriptLauncher +from lightning.pytorch.trainer.connectors.signal_connector import _get_sigkill_signal from lightning.pytorch.trainer.states import TrainerStatus from lightning.pytorch.utilities.exceptions import _TunerExitException from lightning.pytorch.utilities.model_helpers import is_overridden -from lightning.pytorch.utilities.rank_zero import rank_zero_warn +from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn log = logging.getLogger(__name__) @@ -49,12 +52,17 @@ def _call_and_handle_interrupt(trainer: "pl.Trainer", trainer_fn: Callable, *arg trainer.state.status = TrainerStatus.FINISHED trainer.state.stage = None - # TODO: Unify both exceptions below, where `KeyboardError` doesn't re-raise except KeyboardInterrupt as exception: - rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...") - # user could press Ctrl+c many times... 
only shutdown once - if not trainer.interrupted: - _interrupt(trainer, exception) + rank_zero_info("\nDetected KeyboardInterrupt, attempting graceful shutdown ...") + # user could press Ctrl+C many times, disable KeyboardInterrupt for shutdown + signal.signal(signal.SIGINT, signal.SIG_IGN) + _interrupt(trainer, exception) + trainer._teardown() + launcher = trainer.strategy.launcher + if isinstance(launcher, _SubprocessScriptLauncher): + launcher.kill(_get_sigkill_signal()) + exit(1) + except BaseException as exception: _interrupt(trainer, exception) trainer._teardown() diff --git a/src/lightning/pytorch/trainer/connectors/signal_connector.py b/src/lightning/pytorch/trainer/connectors/signal_connector.py index 728d8b6b6ee43..ca9e3eb249474 100644 --- a/src/lightning/pytorch/trainer/connectors/signal_connector.py +++ b/src/lightning/pytorch/trainer/connectors/signal_connector.py @@ -2,7 +2,6 @@ import os import re import signal -import sys import threading from subprocess import call from types import FrameType @@ -54,7 +53,7 @@ def register_signal_handlers(self) -> None: sigterm_handlers.append(self._sigterm_handler_fn) # Windows seems to have signal incompatibilities - if not self._is_on_windows(): + if not _IS_WINDOWS: sigusr = environment.requeue_signal if isinstance(environment, SLURMEnvironment) else signal.SIGUSR1 assert sigusr is not None if sigusr_handlers and not self._has_already_handler(sigusr): @@ -155,10 +154,6 @@ def _valid_signals() -> Set[signal.Signals]: } return set(signal.Signals) - @staticmethod - def _is_on_windows() -> bool: - return sys.platform == "win32" - @staticmethod def _has_already_handler(signum: _SIGNUM) -> bool: return signal.getsignal(signum) not in (None, signal.SIG_DFL) @@ -172,3 +167,7 @@ def __getstate__(self) -> Dict: state = self.__dict__.copy() state["_original_handlers"] = {} return state + + +def _get_sigkill_signal() -> _SIGNUM: + return signal.SIGTERM if _IS_WINDOWS else signal.SIGKILL diff --git a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py index 22e83443ef9cf..de41035d4d832 100644 --- a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py @@ -143,7 +143,7 @@ def on_train_start(self) -> None: with mock.patch( "lightning.pytorch.callbacks.progress.rich_progress.Progress.stop", autospec=True - ) as mock_progress_stop: + ) as mock_progress_stop, pytest.raises(SystemExit): progress_bar = RichProgressBar() trainer = Trainer( default_root_dir=tmp_path, diff --git a/tests/tests_pytorch/callbacks/test_lambda_function.py b/tests/tests_pytorch/callbacks/test_lambda_function.py index 483c8c73e99e2..40d694bb35ebc 100644 --- a/tests/tests_pytorch/callbacks/test_lambda_function.py +++ b/tests/tests_pytorch/callbacks/test_lambda_function.py @@ -13,6 +13,7 @@ # limitations under the License. 
from functools import partial +import pytest from lightning.pytorch import Trainer, seed_everything from lightning.pytorch.callbacks import Callback, LambdaCallback from lightning.pytorch.demos.boring_classes import BoringModel @@ -23,10 +24,13 @@ def test_lambda_call(tmp_path): seed_everything(42) + class CustomException(Exception): + pass + class CustomModel(BoringModel): def on_train_epoch_start(self): if self.current_epoch > 1: - raise KeyboardInterrupt + raise CustomException("Custom exception to trigger `on_exception` hooks") checker = set() @@ -59,7 +63,8 @@ def call(hook, *_, **__): limit_predict_batches=1, callbacks=[LambdaCallback(**hooks_args)], ) - trainer.fit(model, ckpt_path=ckpt_path) + with pytest.raises(CustomException): + trainer.fit(model, ckpt_path=ckpt_path) trainer.test(model) trainer.predict(model) diff --git a/tests/tests_pytorch/trainer/test_states.py b/tests/tests_pytorch/trainer/test_states.py index bd5fd1c67e7b6..d89e99c9319c6 100644 --- a/tests/tests_pytorch/trainer/test_states.py +++ b/tests/tests_pytorch/trainer/test_states.py @@ -84,5 +84,6 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): trainer = Trainer(callbacks=[InterruptCallback()], default_root_dir=tmp_path, **extra_params) - trainer.fit(model) + with pytest.raises(SystemExit): + trainer.fit(model) assert trainer.interrupted diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index 1791f498d5512..802c1a17bc448 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -28,6 +28,7 @@ import torch import torch.nn as nn from lightning.fabric.utilities.cloud_io import _load as pl_load +from lightning.fabric.utilities.imports import _IS_WINDOWS from lightning.fabric.utilities.seed import seed_everything from lightning.pytorch import Callback, LightningDataModule, LightningModule, Trainer from lightning.pytorch.accelerators import CPUAccelerator, CUDAAccelerator @@ -45,7 +46,7 @@ from lightning.pytorch.loggers import TensorBoardLogger from lightning.pytorch.overrides.distributed import UnrepeatedDistributedSampler, _IndexBatchSamplerWrapper from lightning.pytorch.strategies import DDPStrategy, SingleDeviceStrategy -from lightning.pytorch.strategies.launchers import _MultiProcessingLauncher +from lightning.pytorch.strategies.launchers import _MultiProcessingLauncher, _SubprocessScriptLauncher from lightning.pytorch.trainer.states import RunningStage, TrainerFn from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _OMEGACONF_AVAILABLE @@ -1007,7 +1008,8 @@ def on_exception(self, trainer, pl_module, exception): ) assert not trainer.interrupted assert handle_interrupt_callback.exception is None - trainer.fit(model) + with pytest.raises(SystemExit): + trainer.fit(model) assert trainer.interrupted assert isinstance(handle_interrupt_callback.exception, KeyboardInterrupt) with pytest.raises(MisconfigurationException): @@ -1016,6 +1018,30 @@ def on_exception(self, trainer, pl_module, exception): assert isinstance(handle_interrupt_callback.exception, MisconfigurationException) +def test_keyboard_interrupt(tmp_path): + class InterruptCallback(Callback): + def __init__(self): + super().__init__() + + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): + raise KeyboardInterrupt + + model = BoringModel() + trainer = Trainer( + callbacks=[InterruptCallback()], + barebones=True, + default_root_dir=tmp_path, + ) 
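+
+    # The strategy's launcher is replaced with a mock below so the test can assert
+    # the `kill` call without spawning real subprocesses; `launch` is stubbed to
+    # run the wrapped function inline in this process.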
+ + trainer.strategy._launcher = Mock(spec=_SubprocessScriptLauncher) + trainer.strategy._launcher.launch = lambda function, *args, trainer, **kwargs: function(*args, **kwargs) + + with pytest.raises(SystemExit) as exc_info: + trainer.fit(model) + assert exc_info.value.args[0] == 1 + trainer.strategy._launcher.kill.assert_called_once_with(15 if _IS_WINDOWS else 9) + + @pytest.mark.parametrize("precision", ["32-true", pytest.param("16-mixed", marks=RunIf(min_cuda_gpus=1))]) @RunIf(sklearn=True) def test_gradient_clipping_by_norm(tmp_path, precision): @@ -2042,7 +2068,7 @@ def on_fit_start(self): trainer = Trainer(default_root_dir=tmp_path) with mock.patch("lightning.pytorch.strategies.strategy.Strategy.on_exception") as on_exception_mock, suppress( - Exception + Exception, SystemExit ): trainer.fit(ExceptionModel()) on_exception_mock.assert_called_once_with(exception) @@ -2061,7 +2087,7 @@ def on_fit_start(self): datamodule.on_exception = Mock() trainer = Trainer(default_root_dir=tmp_path) - with suppress(Exception): + with suppress(Exception, SystemExit): trainer.fit(ExceptionModel(), datamodule=datamodule) datamodule.on_exception.assert_called_once_with(exception) From 394c42aaf6f5f67ad4cde338428c2b1cdd03c882 Mon Sep 17 00:00:00 2001 From: liambsmith Date: Tue, 18 Jun 2024 13:14:32 -0400 Subject: [PATCH 075/179] Fix callback call in Fabric Trainer example (#19986) --- examples/fabric/build_your_own_trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fabric/build_your_own_trainer/trainer.py b/examples/fabric/build_your_own_trainer/trainer.py index c5e6836bfeddf..a225bf5556ae3 100644 --- a/examples/fabric/build_your_own_trainer/trainer.py +++ b/examples/fabric/build_your_own_trainer/trainer.py @@ -227,7 +227,7 @@ def train_loop( should_optim_step = self.global_step % self.grad_accum_steps == 0 if should_optim_step: # currently only supports a single optimizer - self.fabric.call("on_before_optimizer_step", optimizer, 0) + self.fabric.call("on_before_optimizer_step", optimizer) # optimizer step runs train step internally through closure optimizer.step(partial(self.training_step, model=model, batch=batch, batch_idx=batch_idx)) From 1e83a1bd32a29e1c8802d0593dbcd6e6b173a8d3 Mon Sep 17 00:00:00 2001 From: Etay Livne <53942171+EtayLivne@users.noreply.github.com> Date: Tue, 18 Jun 2024 20:15:12 +0300 Subject: [PATCH 076/179] Check if CometLogger experiment is alive (#19915) Co-authored-by: Etay Livne --- src/lightning/pytorch/loggers/comet.py | 2 +- tests/tests_pytorch/loggers/test_comet.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/lightning/pytorch/loggers/comet.py b/src/lightning/pytorch/loggers/comet.py index ccb6d62de866a..277af5c85f539 100644 --- a/src/lightning/pytorch/loggers/comet.py +++ b/src/lightning/pytorch/loggers/comet.py @@ -268,7 +268,7 @@ def experiment(self) -> Union["Experiment", "ExistingExperiment", "OfflineExperi self.logger.experiment.some_comet_function() """ - if self._experiment is not None: + if self._experiment is not None and self._experiment.alive: return self._experiment if self._future_experiment_key is not None: diff --git a/tests/tests_pytorch/loggers/test_comet.py b/tests/tests_pytorch/loggers/test_comet.py index 791089c47cbbe..e467c63543ede 100644 --- a/tests/tests_pytorch/loggers/test_comet.py +++ b/tests/tests_pytorch/loggers/test_comet.py @@ -66,6 +66,20 @@ def test_comet_logger_online(comet_mock): api.assert_called_once_with("rest") +@mock.patch.dict(os.environ, {}) +def 
test_comet_experiment_resets_if_not_alive(comet_mock): + """Test that the CometLogger creates a new experiment if the old one is not alive anymore.""" + logger = CometLogger() + assert logger._experiment is None + alive_experiment = Mock(alive=True) + logger._experiment = alive_experiment + assert logger.experiment is alive_experiment + + unalive_experiment = Mock(alive=False) + logger._experiment = unalive_experiment + assert logger.experiment is not unalive_experiment + + @mock.patch.dict(os.environ, {}) def test_comet_logger_no_api_key_given(comet_mock): """Test that CometLogger fails to initialize if both api key and save_dir are missing.""" From cec6ae123de749927ca42d616e98bb030d54cad7 Mon Sep 17 00:00:00 2001 From: elmuz <9587977+elmuz@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:57:35 +0200 Subject: [PATCH 077/179] Fix typo `scrict` -> `strict` in types.py (#19998) --- src/lightning/pytorch/utilities/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/pytorch/utilities/types.py b/src/lightning/pytorch/utilities/types.py index bc75e0f50aeb0..c1b971e924a52 100644 --- a/src/lightning/pytorch/utilities/types.py +++ b/src/lightning/pytorch/utilities/types.py @@ -107,7 +107,7 @@ class LRSchedulerConfigType(TypedDict, total=False): frequency: int reduce_on_plateau: bool monitor: Optional[str] - scrict: bool + strict: bool class OptimizerLRSchedulerConfig(TypedDict): From d3a0ada4ffac76c09f3198b4e35d60b08d6c6069 Mon Sep 17 00:00:00 2001 From: SW Yoo <71536965+swyo@users.noreply.github.com> Date: Fri, 21 Jun 2024 23:36:10 +0900 Subject: [PATCH 078/179] Fix dtype for MPS in reinforcement learning example (#19982) --- examples/fabric/reinforcement_learning/train_fabric.py | 2 +- .../fabric/reinforcement_learning/train_fabric_decoupled.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/fabric/reinforcement_learning/train_fabric.py b/examples/fabric/reinforcement_learning/train_fabric.py index 1f3f83f3f2025..74b9b378371d3 100644 --- a/examples/fabric/reinforcement_learning/train_fabric.py +++ b/examples/fabric/reinforcement_learning/train_fabric.py @@ -146,7 +146,7 @@ def main(args: argparse.Namespace): # Single environment step next_obs, reward, done, truncated, info = envs.step(action.cpu().numpy()) done = torch.logical_or(torch.tensor(done), torch.tensor(truncated)) - rewards[step] = torch.tensor(reward, device=device).view(-1) + rewards[step] = torch.tensor(reward, device=device, dtype=torch.float32).view(-1) next_obs, next_done = torch.tensor(next_obs, device=device), done.to(device) if "final_info" in info: diff --git a/examples/fabric/reinforcement_learning/train_fabric_decoupled.py b/examples/fabric/reinforcement_learning/train_fabric_decoupled.py index bbc09c977efcf..3849ae0f96a3c 100644 --- a/examples/fabric/reinforcement_learning/train_fabric_decoupled.py +++ b/examples/fabric/reinforcement_learning/train_fabric_decoupled.py @@ -135,7 +135,7 @@ def player(args, world_collective: TorchCollective, player_trainer_collective: T # Single environment step next_obs, reward, done, truncated, info = envs.step(action.cpu().numpy()) done = torch.logical_or(torch.tensor(done), torch.tensor(truncated)) - rewards[step] = torch.tensor(reward, device=device).view(-1) + rewards[step] = torch.tensor(reward, device=device, dtype=torch.float32).view(-1) next_obs, next_done = torch.tensor(next_obs, device=device), done.to(device) if "final_info" in info: From 5981aebfcc6070f064cbda1178beaa4d821c0ecf Mon Sep 17 00:00:00 2001 From: 
Mauricio Villegas <5780272+mauvilsa@users.noreply.github.com> Date: Fri, 21 Jun 2024 16:38:42 +0200 Subject: [PATCH 079/179] Update `test_lightning_cli_help` for future change in jsonargparse (#20002) --- tests/tests_pytorch/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index 4c28d6588cea2..8a77fd1440f50 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -466,7 +466,7 @@ def test_lightning_cli_help(): ), pytest.raises(SystemExit): any_model_any_data_cli() - assert "--data.init_args.data_dir" in out.getvalue() + assert ("--data.data_dir" in out.getvalue()) or ("--data.init_args.data_dir" in out.getvalue()) def test_lightning_cli_print_config(): From 709a2a9d3b79b0a436eb2d271fbeecf8a7ba1352 Mon Sep 17 00:00:00 2001 From: liambsmith Date: Fri, 21 Jun 2024 10:43:30 -0400 Subject: [PATCH 080/179] Updated Fabric trainer example to not call `self.trainer.model` during validation (#19993) --- examples/fabric/build_your_own_trainer/trainer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/fabric/build_your_own_trainer/trainer.py b/examples/fabric/build_your_own_trainer/trainer.py index a225bf5556ae3..7af01ede054a8 100644 --- a/examples/fabric/build_your_own_trainer/trainer.py +++ b/examples/fabric/build_your_own_trainer/trainer.py @@ -264,7 +264,7 @@ def val_loop( val_loader: Optional[torch.utils.data.DataLoader], limit_batches: Union[int, float] = float("inf"), ): - """The validation loop ruunning a single validation epoch. + """The validation loop running a single validation epoch. Args: model: the LightningModule to evaluate @@ -285,7 +285,10 @@ def val_loop( ) return - self.fabric.call("on_validation_model_eval") # calls `model.eval()` + if not is_overridden("on_validation_model_eval", _unwrap_objects(model)): + model.eval() + else: + self.fabric.call("on_validation_model_eval") # calls `model.eval()` torch.set_grad_enabled(False) @@ -311,7 +314,10 @@ def val_loop( self.fabric.call("on_validation_epoch_end") - self.fabric.call("on_validation_model_train") + if not is_overridden("on_validation_model_train", _unwrap_objects(model)): + model.train() + else: + self.fabric.call("on_validation_model_train") torch.set_grad_enabled(True) def training_step(self, model: L.LightningModule, batch: Any, batch_idx: int) -> torch.Tensor: From e330da5870fae34339170b942095a2600fa7a95e Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 21 Jun 2024 17:20:59 -0700 Subject: [PATCH 081/179] Fix torch-numpy compatibility conflict in tests (#20004) --- requirements/fabric/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- src/lightning/fabric/utilities/testing/_runif.py | 4 ++-- tests/tests_fabric/strategies/test_ddp_integration.py | 5 +++++ tests/tests_fabric/utilities/test_distributed.py | 5 +++++ 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 7487dd9b754b3..aac884d9c6f43 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -6,4 +6,4 @@ torch >=2.0.0, <2.4.0 fsspec[http] >=2022.5.0, <2024.4.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 -lightning-utilities >=0.8.0, <0.12.0 +lightning-utilities >=0.10.0, <0.12.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 4993a918af099..6372357b6d290 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -9,4 +9,4 @@ 
fsspec[http] >=2022.5.0, <2024.4.0 torchmetrics >=0.7.0, <1.3.0 # needed for using fixed compare_version packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 -lightning-utilities >=0.8.0, <0.12.0 +lightning-utilities >=0.10.0, <0.12.0 diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py index 6ab2ff730eec9..9a6f5554baa19 100644 --- a/src/lightning/fabric/utilities/testing/_runif.py +++ b/src/lightning/fabric/utilities/testing/_runif.py @@ -17,7 +17,7 @@ from typing import Dict, List, Optional, Tuple import torch -from lightning_utilities.core.imports import compare_version +from lightning_utilities.core.imports import RequirementCache, compare_version from packaging.version import Version from lightning.fabric.accelerators import XLAAccelerator @@ -112,7 +112,7 @@ def _runif_reasons( reasons.append("Standalone execution") kwargs["standalone"] = True - if deepspeed and not _DEEPSPEED_AVAILABLE: + if deepspeed and not (_DEEPSPEED_AVAILABLE and RequirementCache(module="deepspeed.utils")): reasons.append("Deepspeed") if dynamo: diff --git a/tests/tests_fabric/strategies/test_ddp_integration.py b/tests/tests_fabric/strategies/test_ddp_integration.py index 6f003748b9cce..281f0d47bae0c 100644 --- a/tests/tests_fabric/strategies/test_ddp_integration.py +++ b/tests/tests_fabric/strategies/test_ddp_integration.py @@ -19,6 +19,7 @@ import pytest import torch from lightning.fabric import Fabric +from lightning_utilities.core.imports import RequirementCache from torch._dynamo import OptimizedModule from torch.nn.parallel.distributed import DistributedDataParallel @@ -27,6 +28,10 @@ from tests_fabric.test_fabric import BoringModel +@pytest.mark.skipif( + RequirementCache("torch<2.4") and RequirementCache("numpy>=2.0"), + reason="torch.distributed not compatible with numpy>=2.0", +) @pytest.mark.parametrize( "accelerator", [ diff --git a/tests/tests_fabric/utilities/test_distributed.py b/tests/tests_fabric/utilities/test_distributed.py index 5331a6f9be611..2c30b3aa62ddf 100644 --- a/tests/tests_fabric/utilities/test_distributed.py +++ b/tests/tests_fabric/utilities/test_distributed.py @@ -20,6 +20,7 @@ _sync_ddp, is_shared_filesystem, ) +from lightning_utilities.core.imports import RequirementCache from tests_fabric.helpers.runif import RunIf @@ -121,6 +122,10 @@ def test_collective_operations(devices, process): spawn_launch(process, devices) +@pytest.mark.skipif( + RequirementCache("torch<2.4") and RequirementCache("numpy>=2.0"), + reason="torch.distributed not compatible with numpy>=2.0", +) @pytest.mark.flaky(reruns=3) # flaky with "process 0 terminated with signal SIGABRT" (GLOO) def test_is_shared_filesystem(tmp_path, monkeypatch): # In the non-distributed case, every location is interpreted as 'shared' From 9304a2c72eeb4b9ffa80390f5428ae33afd5b85f Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 23 Jun 2024 19:36:57 +0200 Subject: [PATCH 082/179] Convert tensors to bytes instead of numpy in multiprocessing result-queue (#20005) --- .../strategies/launchers/multiprocessing.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/lightning/pytorch/strategies/launchers/multiprocessing.py b/src/lightning/pytorch/strategies/launchers/multiprocessing.py index 58d9f2b16e0f3..7431927df21f5 100644 --- a/src/lightning/pytorch/strategies/launchers/multiprocessing.py +++ b/src/lightning/pytorch/strategies/launchers/multiprocessing.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import io import logging import os import queue @@ -19,7 +20,6 @@ from dataclasses import dataclass from typing import Any, Callable, Dict, List, Literal, NamedTuple, Optional, Union -import numpy as np import torch import torch.backends.cudnn import torch.multiprocessing as mp @@ -226,7 +226,7 @@ def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Opt def get_extra_results(self, trainer: "pl.Trainer") -> Dict[str, Any]: """Gather extra state from the Trainer and return it as a dictionary for sending back to the main process. To - avoid issues with memory sharing, we cast the data to numpy. + avoid issues with memory sharing, we convert tensors to bytes. Args: trainer: reference to the Trainer. @@ -236,14 +236,15 @@ def get_extra_results(self, trainer: "pl.Trainer") -> Dict[str, Any]: process this output. """ - callback_metrics: dict = apply_to_collection( - trainer.callback_metrics, Tensor, lambda x: x.cpu().numpy() - ) # send as numpy to avoid issues with memory sharing - return {"callback_metrics": callback_metrics} + callback_metrics = apply_to_collection(trainer.callback_metrics, Tensor, lambda t: t.cpu()) + buffer = io.BytesIO() + torch.save(callback_metrics, buffer) + # send tensors as bytes to avoid issues with memory sharing + return {"callback_metrics_bytes": buffer.getvalue()} def update_main_process_results(self, trainer: "pl.Trainer", extra: Dict[str, Any]) -> None: """Retrieve the :attr:`trainer.callback_metrics` dictionary from the given queue. To preserve consistency, we - cast back the data to ``torch.Tensor``. + convert bytes back to ``torch.Tensor``. Args: trainer: reference to the Trainer. @@ -252,8 +253,9 @@ def update_main_process_results(self, trainer: "pl.Trainer", extra: Dict[str, An """ # NOTE: `get_extra_results` needs to be called before - callback_metrics = extra["callback_metrics"] - trainer.callback_metrics.update(apply_to_collection(callback_metrics, np.ndarray, lambda x: torch.tensor(x))) + callback_metrics_bytes = extra["callback_metrics_bytes"] + callback_metrics = torch.load(io.BytesIO(callback_metrics_bytes)) + trainer.callback_metrics.update(callback_metrics) @override def kill(self, signum: _SIGNUM) -> None: From 55b95f26adf7156036f6450d33001c2aaa7a0fbb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jun 2024 07:46:29 +0200 Subject: [PATCH 083/179] build(deps): bump docker/build-push-action from 5 to 6 (#20007) Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 5 to 6. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/v5...v6) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/docker-build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 73c6e7496f9fa..d917ebc407143 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -83,7 +83,7 @@ jobs: gh_env.write("DOCKER_TAGS=" + ",".join(tags)) shell: python - - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v6 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -119,7 +119,7 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v6 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -151,7 +151,7 @@ jobs: - name: Build Conda Docker # publish master/release continue-on-error: true - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: file: dockers/nvidia/Dockerfile push: false From d0d01d3ff926a987398978159a9c9c6aa32c3c50 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 26 Jun 2024 00:44:29 +0200 Subject: [PATCH 084/179] Fix package build dependencies (#20015) --- requirements/app/app.txt | 3 ++- requirements/ci.txt | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/requirements/app/app.txt b/requirements/app/app.txt index a59e0b5ca5c28..accb7baaf0d8f 100644 --- a/requirements/app/app.txt +++ b/requirements/app/app.txt @@ -6,7 +6,7 @@ fsspec[http] >=2022.5.0, <2023.11.0 croniter >=1.3.0, <1.5.0 # strict; TODO: for now until we find something more robust. traitlets >=5.3.0, <5.12.0 arrow >=1.2.0, <1.3.0 -lightning-utilities >=0.8.0, <0.12.0 +lightning-utilities >=0.10.0, <0.12.0 beautifulsoup4 >=4.8.0, <4.13.0 inquirer >=2.10.0, <3.2.0 psutil <5.9.6 @@ -27,3 +27,4 @@ urllib3 <2.0.0 uvicorn <0.24.0 websocket-client <1.7.0 websockets <11.1.0 +numpy >=1.17.2, <2.0 diff --git a/requirements/ci.txt b/requirements/ci.txt index 08c2bd41148ec..cdebc301790e9 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -1,6 +1,7 @@ -setuptools -wheel +setuptools <70.1.1 +wheel <0.44.0 awscli >=1.30.0, <1.31.0 twine ==4.0.1 +importlib-metadata <8.0.0 wget -packaging +packaging <24.2 From d53e107fb5ac91fb5f27aef19a9377c1a4badc96 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 26 Jun 2024 11:53:41 +0100 Subject: [PATCH 085/179] Scale mmt (#19984) --- pyproject.toml | 1 + requirements/app/app.txt | 1 + src/lightning/app/cli/lightning_cli.py | 110 +++++++++++++- src/lightning/app/core/app.py | 5 + src/lightning/app/core/constants.py | 26 ++++ src/lightning/app/core/queues.py | 57 +++++--- src/lightning/app/core/work.py | 3 + src/lightning/app/launcher/launcher.py | 138 ++++++++++++------ .../app/launcher/lightning_backend.py | 133 +++++++++-------- .../app/launcher/lightning_hybrid_backend.py | 38 ++--- src/lightning/app/launcher/utils.py | 97 ++++++++++++ src/lightning/app/runners/backends/backend.py | 7 +- src/lightning/app/utilities/proxies.py | 37 ++++- src/lightning/fabric/cli.py | 22 ++- src/lightning_app/__main__.py | 2 +- tests/tests_app/cli/test_cli.py | 36 +---- tests/tests_app/cli/test_cmd_launch.py | 2 +- .../components/multi_node/test_trainer.py | 1 + tests/tests_app/core/test_constants.py | 15 +- tests/tests_app/core/test_lightning_app.py | 4 +- tests/tests_app/core/test_lightning_work.py | 4 +- 
tests/tests_app/core/test_queues.py           | 126 +++++++---------
 tests/tests_app/launcher/test_running_flow.py |  16 +-
 tests/tests_app/storage/test_path.py          |   1 +
 tests/tests_app/utilities/test_proxies.py     |  15 +-
 25 files changed, 602 insertions(+), 295 deletions(-)
 create mode 100644 src/lightning/app/launcher/utils.py

diff --git a/pyproject.toml b/pyproject.toml
index dc77740823c9b..c24f27828fdd6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -140,6 +140,7 @@ exclude = [
   "src/lightning/app/cli/component-template",
   "src/lightning/app/cli/pl-app-template",
   "src/lightning/app/cli/react-ui-template",
+  "src/lightning/app/launcher/utils.py",
 ]
 install_types = "True"
 non_interactive = "True"
diff --git a/requirements/app/app.txt b/requirements/app/app.txt
index accb7baaf0d8f..bdf9e51d7370d 100644
--- a/requirements/app/app.txt
+++ b/requirements/app/app.txt
@@ -28,3 +28,4 @@ uvicorn <0.24.0
 websocket-client <1.7.0
 websockets <11.1.0
 numpy >=1.17.2, <2.0
+msgpack
diff --git a/src/lightning/app/cli/lightning_cli.py b/src/lightning/app/cli/lightning_cli.py
index 8f61554019652..6aa84063ab93f 100644
--- a/src/lightning/app/cli/lightning_cli.py
+++ b/src/lightning/app/cli/lightning_cli.py
@@ -40,7 +40,19 @@ from lightning.app.cli.lightning_cli_delete import delete
 from lightning.app.cli.lightning_cli_launch import launch
 from lightning.app.cli.lightning_cli_list import get_list
-from lightning.app.core.constants import ENABLE_APP_COMMENT_COMMAND_EXECUTION, get_lightning_cloud_url
+from lightning.app.core.constants import (
+    APP_SERVER_HOST,
+    APP_SERVER_PORT,
+    ENABLE_APP_COMMENT_COMMAND_EXECUTION,
+    get_lightning_cloud_url,
+)
+from lightning.app.launcher.launcher import (
+    run_lightning_flow,
+    run_lightning_work,
+    serve_frontend,
+    start_application_server,
+    start_flow_and_servers,
+)
 from lightning.app.runners.cloud import CloudRuntime
 from lightning.app.runners.runtime import dispatch
 from lightning.app.runners.runtime_type import RuntimeType
@@ -393,3 +405,99 @@ def _prepare_file(file: str) -> str:
         return file
 
     raise FileNotFoundError(f"The provided file {file} hasn't been found.")
+
+
+@run.command("server")
+@click.argument("file", type=click.Path(exists=True))
+@click.option("--queue-id", help="ID for identifying queue", default="", type=str)
+@click.option("--host", help="Application running host", default=APP_SERVER_HOST, type=str)
+@click.option("--port", help="Application running port", default=APP_SERVER_PORT, type=int)
+def run_server(file: str, queue_id: str, host: str, port: int) -> None:
+    """It takes the application file as input, builds the application object and then uses it to run the application
+    server.
+
+    This is used by the cloud runners to start the status server for the application.
+
+    """
+    logger.debug(f"Run Server: {file} {queue_id} {host} {port}")
+    start_application_server(file, host, port, queue_id=queue_id)
+
+
+@run.command("flow")
+@click.argument("file", type=click.Path(exists=True))
+@click.option("--queue-id", help="ID for identifying queue", default="", type=str)
+@click.option("--base-url", help="Base url at which the app server is hosted", default="")
+def run_flow(file: str, queue_id: str, base_url: str) -> None:
+    """It takes the application file as input, builds the application object, proxies all the work components and then
+    runs the application flow defined in the root component.
+
+    It does exactly what a single-process dispatcher would do but with proxied work components.
+
+    """
+    logger.debug(f"Run Flow: {file} {queue_id} {base_url}")
+    run_lightning_flow(file, queue_id=queue_id, base_url=base_url)
+
+
+@run.command("work")
+@click.argument("file", type=click.Path(exists=True))
+@click.option("--work-name", type=str)
+@click.option("--queue-id", help="ID for identifying queue", default="", type=str)
+def run_work(file: str, work_name: str, queue_id: str) -> None:
+    """Unlike other entrypoints, this command takes the file path or module details for a work component and runs it
+    by fetching the states from the queues."""
+    logger.debug(f"Run Work: {file} {work_name} {queue_id}")
+    run_lightning_work(
+        file=file,
+        work_name=work_name,
+        queue_id=queue_id,
+    )
+
+
+@run.command("frontend")
+@click.argument("file", type=click.Path(exists=True))
+@click.option("--flow-name")
+@click.option("--host")
+@click.option("--port", type=int)
+def run_frontend(file: str, flow_name: str, host: str, port: int) -> None:
+    """Serve the frontend specified by the given flow."""
+    logger.debug(f"Run Frontend: {file} {flow_name} {host}")
+    serve_frontend(file=file, flow_name=flow_name, host=host, port=port)
+
+
+@run.command("flow-and-servers")
+@click.argument("file", type=click.Path(exists=True))
+@click.option("--queue-id", help="ID for identifying queue", default="", type=str)
+@click.option("--base-url", help="Base url at which the app server is hosted", default="")
+@click.option("--host", help="Application running host", default=APP_SERVER_HOST, type=str)
+@click.option("--port", help="Application running port", default=APP_SERVER_PORT, type=int)
+@click.option(
+    "--flow-port",
+    help="Pair of flow name and frontend port",
+    type=(str, int),
+    multiple=True,
+)
+def run_flow_and_servers(
+    file: str,
+    base_url: str,
+    queue_id: str,
+    host: str,
+    port: int,
+    flow_port: Tuple[Tuple[str, int]],
+) -> None:
+    """It takes the application file as input, builds the application object and then uses it to run the application
+    flow defined in the root component, the application server and all the flow frontends.
+
+    This is used by the cloud runners to start the flow, the status server and all frontends for the application.
+
+    """
+    logger.debug(f"Run Flow: {file} {queue_id} {base_url}")
+    logger.debug(f"Run Server: {file} {queue_id} {host} {port}.")
+    logger.debug(f"Run Frontends: {flow_port}")
+    start_flow_and_servers(
+        entrypoint_file=file,
+        base_url=base_url,
+        queue_id=queue_id,
+        host=host,
+        port=port,
+        flow_names_and_ports=flow_port,
+    )
diff --git a/src/lightning/app/core/app.py b/src/lightning/app/core/app.py
index c29a43ba9db0a..e1da6adee32ba 100644
--- a/src/lightning/app/core/app.py
+++ b/src/lightning/app/core/app.py
@@ -35,6 +35,7 @@
     FLOW_DURATION_SAMPLES,
     FLOW_DURATION_THRESHOLD,
     FRONTEND_DIR,
+    SHOULD_START_WORKS_WITH_FLOW,
     STATE_ACCUMULATE_WAIT,
 )
 from lightning.app.core.queues import BaseQueue
@@ -144,6 +145,7 @@ def __init__(
         self.threads: List[threading.Thread] = []
         self.exception = None
         self.collect_changes: bool = True
+        self._should_start_works_with_flow: bool = SHOULD_START_WORKS_WITH_FLOW
 
         self.status: Optional[AppStatus] = None
         # TODO: Enable ready locally for opening the UI.
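For illustration, a minimal standalone sketch of what the `SHOULD_START_WORKS_WITH_FLOW` switch wired through these `app.py` hunks amounts to; the helper function and the `start_with_flow` attribute lookup below are made up for the example, not part of the patch:

```python
# With SHOULD_START_WORKS_WITH_FLOW=0 the flow skips starting its
# "start with flow" works, leaving their creation to an external launcher
# (e.g. the distributed plugin handled later in this commit).
import os

SHOULD_START_WORKS_WITH_FLOW = bool(int(os.getenv("SHOULD_START_WORKS_WITH_FLOW", "1")))


def start_with_flow_works(works) -> None:
    if not SHOULD_START_WORKS_WITH_FLOW:
        return
    for work in works:
        if getattr(work, "start_with_flow", False):
            work.start()
```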
@@ -733,6 +735,9 @@ def _send_flow_to_work_deltas(self, state: dict) -> None: self.flow_to_work_delta_queues[w.name].put(deep_diff) def _start_with_flow_works(self) -> None: + if not self._should_start_works_with_flow: + return + for w in self.works: if w._start_with_flow: parallel = w.parallel diff --git a/src/lightning/app/core/constants.py b/src/lightning/app/core/constants.py index 64b159e57fea8..449f4f078ce77 100644 --- a/src/lightning/app/core/constants.py +++ b/src/lightning/app/core/constants.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os from pathlib import Path from typing import Optional @@ -101,6 +102,31 @@ def get_lightning_cloud_url() -> str: BATCH_DELTA_COUNT = int(os.getenv("BATCH_DELTA_COUNT", "128")) CHECK_ERROR_QUEUE_INTERVAL = float(os.getenv("CHECK_ERROR_QUEUE_INTERVAL", "30")) +SHOULD_START_WORKS_WITH_FLOW = bool(int(os.getenv("SHOULD_START_WORKS_WITH_FLOW", "1"))) +IS_RUNNING_IN_FLOW = os.getenv("LIGHTNING_CLOUD_WORK_NAME", None) is None + + +class DistributedPluginChecker: + def __init__(self) -> None: + distributed_arguments = os.getenv("DISTRIBUTED_ARGUMENTS", None) + work_name = os.getenv("LIGHTNING_CLOUD_WORK_NAME") + + self.running_distributed_plugin = False + + if distributed_arguments and work_name: + distributed_arguments = json.loads(distributed_arguments) + assert distributed_arguments + num_nodes = distributed_arguments.get("num_instances", 0) + node_rank = int(work_name.split(".")[-1]) + + # Only the start with flow works are skipped for performance purposes + self.running_distributed_plugin = node_rank < num_nodes + + def __bool__(self) -> bool: + return self.running_distributed_plugin + + +IS_DISTRIBUTED_PLUGIN = DistributedPluginChecker() def enable_multiple_works_in_default_container() -> bool: diff --git a/src/lightning/app/core/queues.py b/src/lightning/app/core/queues.py index d37251c824616..f04447320cc3f 100644 --- a/src/lightning/app/core/queues.py +++ b/src/lightning/app/core/queues.py @@ -17,7 +17,6 @@ import pickle import queue # needed as import instead from/import for mocking in tests import time -import warnings from abc import ABC, abstractmethod from enum import Enum from pathlib import Path @@ -25,6 +24,7 @@ from urllib.parse import urljoin import backoff +import msgpack import requests from requests.exceptions import ConnectionError, ConnectTimeout, ReadTimeout @@ -34,6 +34,7 @@ HTTP_QUEUE_REQUESTS_PER_SECOND, HTTP_QUEUE_TOKEN, HTTP_QUEUE_URL, + IS_RUNNING_IN_FLOW, LIGHTNING_DIR, QUEUE_DEBUG_ENABLED, REDIS_HOST, @@ -41,7 +42,6 @@ REDIS_PORT, REDIS_QUEUES_READ_DEFAULT_TIMEOUT, STATE_UPDATE_TIMEOUT, - WARNING_QUEUE_SIZE, ) from lightning.app.utilities.app_helpers import Logger from lightning.app.utilities.imports import _is_redis_available, requires @@ -80,9 +80,14 @@ def get_queue(self, queue_name: str) -> "BaseQueue": return MultiProcessQueue(queue_name, default_timeout=STATE_UPDATE_TIMEOUT) if self == QueuingSystem.REDIS: return RedisQueue(queue_name, default_timeout=REDIS_QUEUES_READ_DEFAULT_TIMEOUT) - return RateLimitedQueue( - HTTPQueue(queue_name, default_timeout=STATE_UPDATE_TIMEOUT), HTTP_QUEUE_REQUESTS_PER_SECOND - ) + + queue = HTTPQueue(queue_name, default_timeout=STATE_UPDATE_TIMEOUT) + + # In the flow, don't rate limit the caller queue. Otherwise, startup time would be slow with lot of works. 
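+        # The app id was parsed out of the queue name at construction time; without
+        # it the pop request below cannot be routed to the right app.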
+ if CALLER_QUEUE_CONSTANT in queue_name and IS_RUNNING_IN_FLOW: + return queue + + return RateLimitedQueue(queue, HTTP_QUEUE_REQUESTS_PER_SECOND) def get_api_response_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": queue_name = f"{queue_id}_{API_RESPONSE_QUEUE_CONSTANT}" if queue_id else API_RESPONSE_QUEUE_CONSTANT @@ -284,14 +289,6 @@ def put(self, item: Any) -> None: item._backend = None value = pickle.dumps(item) - queue_len = self.length() - if queue_len >= WARNING_QUEUE_SIZE: - warnings.warn( - f"The Redis Queue {self.name} length is larger than the " - f"recommended length of {WARNING_QUEUE_SIZE}. " - f"Found {queue_len}. This might cause your application to crash, " - "please investigate this." - ) try: self.redis.rpush(self.name, value) except redis.exceptions.ConnectionError: @@ -451,7 +448,11 @@ def is_running(self) -> bool: return False return False + @backoff.on_exception( + backoff.expo, (RuntimeError, requests.exceptions.HTTPError, requests.exceptions.ChunkedEncodingError) + ) def get(self, timeout: Optional[float] = None) -> Any: + logger.debug(f"get {self.name}") if not self.app_id: raise ValueError(f"App ID couldn't be extracted from the queue name: {self.name}") @@ -498,13 +499,17 @@ def _get(self) -> Any: resp = self.client.post(f"v1/{self.app_id}/{self._name_suffix}", query_params={"action": "pop"}) if resp.status_code == 204: raise queue.Empty - return pickle.loads(resp.content) + + if self._use_pickle(): + return pickle.loads(resp.content) + return msgpack.unpackb(resp.content) except ConnectionError: # Note: If the Http Queue service isn't available, # we consider the queue is empty to avoid failing the app. raise queue.Empty def batch_get(self, timeout: Optional[float] = None, count: Optional[int] = None) -> List[Any]: + logger.debug(f"batch_get {self.name}") try: resp = self.client.post( f"v1/{self.app_id}/{self._name_suffix}", @@ -512,24 +517,24 @@ def batch_get(self, timeout: Optional[float] = None, count: Optional[int] = None ) if resp.status_code == 204: raise queue.Empty - return [pickle.loads(base64.b64decode(data)) for data in resp.json()] + + if self._use_pickle(): + return [pickle.loads(base64.b64decode(data)) for data in resp.json()] + return [msgpack.unpackb(base64.b64decode(data)) for data in resp.json()] except ConnectionError: # Note: If the Http Queue service isn't available, # we consider the queue is empty to avoid failing the app. raise queue.Empty - @backoff.on_exception(backoff.expo, (RuntimeError, requests.exceptions.HTTPError)) + @backoff.on_exception( + backoff.expo, (RuntimeError, requests.exceptions.HTTPError, requests.exceptions.ChunkedEncodingError) + ) def put(self, item: Any) -> None: + logger.debug(f"put {self.name}") if not self.app_id: raise ValueError(f"The Lightning App ID couldn't be extracted from the queue name: {self.name}") - value = pickle.dumps(item) - queue_len = self.length() - if queue_len >= WARNING_QUEUE_SIZE: - warnings.warn( - f"The Queue {self._name_suffix} length is larger than the recommended length of {WARNING_QUEUE_SIZE}. " - f"Found {queue_len}. This might cause your application to crash, please investigate this." 
- ) + value = pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL) if self._use_pickle() else msgpack.packb(item) resp = self.client.post(f"v1/{self.app_id}/{self._name_suffix}", data=value, query_params={"action": "push"}) if resp.status_code != 201: raise RuntimeError(f"Failed to push to queue: {self._name_suffix}") @@ -568,6 +573,12 @@ def to_dict(self) -> dict: def from_dict(cls, state: dict) -> "HTTPQueue": return cls(**state) + def _use_pickle(self) -> bool: + # Note: msgpack is faster than pickle to serialize and deserialize simple JSON + return ( + WORK_QUEUE_CONSTANT in self.name or DELTA_QUEUE_CONSTANT in self.name or ERROR_QUEUE_CONSTANT in self.name + ) + def debug_log_callback(message: str, *args: Any, **kwargs: Any) -> None: if QUEUE_DEBUG_ENABLED or (Path(LIGHTNING_DIR) / "QUEUE_DEBUG_ENABLED").exists(): diff --git a/src/lightning/app/core/work.py b/src/lightning/app/core/work.py index 9b4ada4144649..8764c118e15ee 100644 --- a/src/lightning/app/core/work.py +++ b/src/lightning/app/core/work.py @@ -630,6 +630,9 @@ def start(self) -> None: # This enables to start the run method with a phony input and exit. self.run(Action(method="start")) + def on_start(self) -> None: + """Define actions to perform when the work has started.""" + def run(self, *args: Any, **kwargs: Any) -> None: """Override to add your own logic. diff --git a/src/lightning/app/launcher/launcher.py b/src/lightning/app/launcher/launcher.py index 7dc9fca11db42..696ac2100f186 100644 --- a/src/lightning/app/launcher/launcher.py +++ b/src/lightning/app/launcher/launcher.py @@ -9,31 +9,37 @@ from multiprocessing import Process from typing import Callable, Dict, List, Optional, Tuple, TypedDict -ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER = bool(int(os.getenv("ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER", "0"))) - -if True: # ToDo: Avoid Module level import not at top of file - from lightning.app.core import constants - from lightning.app.core.api import start_server - from lightning.app.core.flow import LightningFlow - from lightning.app.core.queues import MultiProcessQueue, QueuingSystem - from lightning.app.storage.orchestrator import StorageOrchestrator +from lightning.app import LightningFlow +from lightning.app.core import constants +from lightning.app.core.api import start_server +from lightning.app.core.constants import ( + CHECK_ERROR_QUEUE_INTERVAL, + ENABLE_ORCHESTRATOR, + IS_DISTRIBUTED_PLUGIN, + enable_multiple_works_in_default_container, +) +from lightning.app.core.queues import MultiProcessQueue, QueuingSystem +from lightning.app.storage.orchestrator import StorageOrchestrator +from lightning.app.utilities.cloud import _sigterm_flow_handler +from lightning.app.utilities.component import _set_flow_context, _set_frontend_context +from lightning.app.utilities.enum import AppStage +from lightning.app.utilities.exceptions import ExitAppException +from lightning.app.utilities.load_app import extract_metadata_from_app, load_app_from_file +from lightning.app.utilities.proxies import WorkRunner +from lightning.app.utilities.redis import check_if_redis_running + +try: from lightning.app.utilities.app_commands import run_app_commands - from lightning.app.utilities.cloud import _sigterm_flow_handler - from lightning.app.utilities.component import _set_flow_context, _set_frontend_context - from lightning.app.utilities.enum import AppStage - from lightning.app.utilities.exceptions import ExitAppException - from lightning.app.utilities.load_app import extract_metadata_from_app, load_app_from_file - from 
lightning.app.utilities.proxies import WorkRunner - from lightning.app.utilities.redis import check_if_redis_running - -if ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER: + + ABLE_TO_RUN_APP_COMMANDS = True +except (ImportError, ModuleNotFoundError): + ABLE_TO_RUN_APP_COMMANDS = False + +if enable_multiple_works_in_default_container(): from lightning.app.launcher.lightning_hybrid_backend import CloudHybridBackend as CloudBackend else: from lightning.app.launcher.lightning_backend import CloudBackend - -if True: # Avoid Module level import not at top of file - from lightning.app.utilities.app_helpers import convert_print_to_logger_info - from lightning.app.utilities.packaging.lightning_utils import enable_debugging +from lightning.app.launcher.utils import LIGHTNING_VERSION, convert_print_to_logger_info, enable_debugging, exit_app if hasattr(constants, "get_cloud_queue_type"): CLOUD_QUEUE_TYPE = constants.get_cloud_queue_type() or "redis" @@ -48,6 +54,22 @@ class FlowRestAPIQueues(TypedDict): api_response_queue: MultiProcessQueue +def check_error_queue(self) -> None: + if not getattr(self, "_last_check_error_queue", None): + self._last_check_error_queue = 0.0 + + if (time.time() - self._last_check_error_queue) > CHECK_ERROR_QUEUE_INTERVAL: + exception: Exception = self.get_state_changed_from_queue(self.error_queue) # type: ignore[assignment,arg-type] + if isinstance(exception, Exception): + self.exception = exception + self.stage = AppStage.FAILED + self._last_check_error_queue = time.time() + + +def patch_app(app): + app.check_error_queue = partial(check_error_queue, self=app) + + @convert_print_to_logger_info @enable_debugging def start_application_server( @@ -72,6 +94,7 @@ def start_application_server( }) app = load_app_from_file(entrypoint_file) + patch_app(app) from lightning.app.api.http_methods import _add_tags_to_api, _validate_api from lightning.app.utilities.app_helpers import is_overridden @@ -124,12 +147,34 @@ def run_lightning_work( copy_request_queues = queues.get_orchestrator_copy_request_queue(work_name=work_name, queue_id=queue_id) copy_response_queues = queues.get_orchestrator_copy_response_queue(work_name=work_name, queue_id=queue_id) - run_app_commands(file) + if ABLE_TO_RUN_APP_COMMANDS: + run_app_commands(file) load_app_from_file(file) - queue = queues.get_work_queue(work_name=work_name, queue_id=queue_id) - work = queue.get() + if IS_DISTRIBUTED_PLUGIN: + import json + + from multi_node.launcher import ScriptLauncher + + from lightning.app import CloudCompute + + script_command = os.environ["COMMAND"] + distributed_arguments = os.environ["DISTRIBUTED_ARGUMENTS"] + distributed_arguments = json.loads(distributed_arguments) + cloud_compute = distributed_arguments["cloud_compute"] + disk_size = int(distributed_arguments.get("disk_size", 400)) + + work = ScriptLauncher( + cloud_compute=CloudCompute(cloud_compute, disk_size=disk_size), + parallel=True, + command=script_command, + ) + work_name = os.getenv("LIGHTNING_CLOUD_WORK_NAME", "") + work._name = work_name + else: + queue = queues.get_work_queue(work_name=work_name, queue_id=queue_id) + work = queue.get() extras = {} @@ -179,15 +224,17 @@ def run_lightning_flow(entrypoint_file: str, queue_id: str, base_url: str, queue app.should_publish_changes_to_api = True - storage_orchestrator = StorageOrchestrator( - app, - app.request_queues, - app.response_queues, - app.copy_request_queues, - app.copy_response_queues, - ) - storage_orchestrator.setDaemon(True) - storage_orchestrator.start() + # reduces the number of requests to the 
CP
+    if ENABLE_ORCHESTRATOR:
+        storage_orchestrator = StorageOrchestrator(
+            app,
+            app.request_queues,
+            app.response_queues,
+            app.copy_request_queues,
+            app.copy_response_queues,
+        )
+        storage_orchestrator.setDaemon(True)
+        storage_orchestrator.start()
 
     # refresh the layout with the populated urls.
     app._update_layout()
@@ -211,14 +258,16 @@ def run_lightning_flow(entrypoint_file: str, queue_id: str, base_url: str, queue
         app.stage = AppStage.FAILED
         print(traceback.format_exc())
 
-    storage_orchestrator.join(0)
+    if ENABLE_ORCHESTRATOR:
+        storage_orchestrator.join(0)
+
     app.backend.stop_all_works(app.works)
 
     exit_code = 1 if app.stage == AppStage.FAILED else 0
     print(f"Finishing the App with exit_code: {str(exit_code)}...")
 
     if not exit_code:
-        app.backend.stop_app(app)
+        exit_app(app)
 
     sys.exit(exit_code)
@@ -385,12 +434,13 @@ def start_flow_and_servers(
         "api_response_queue": queue_system.get_api_response_queue(queue_id=queue_id),
     }
 
-    # In order to avoid running this function 3 seperate times while executing the
-    # `run_lightning_flow`, `start_application_server`, & `serve_frontend` functions
-    # in a subprocess we extract this to the top level. If we intend to make changes
-    # to be able to start these components in seperate containers, the implementation
-    # will have to move a call to this function within the initialization process.
-    run_app_commands(entrypoint_file)
+    if ABLE_TO_RUN_APP_COMMANDS:
+        # In order to avoid running this function 3 separate times while executing the
+        # `run_lightning_flow`, `start_application_server`, & `serve_frontend` functions
+        # in a subprocess, we extract this to the top level. If we intend to make changes
+        # to be able to start these components in separate containers, the implementation
+        # will have to move a call to this function within the initialization process.
+        run_app_commands(entrypoint_file)
 
     flow_process = start_server_in_process(
         run_lightning_flow,
@@ -434,6 +484,12 @@ def wait_for_queues(queue_system: QueuingSystem) -> None:
             logger.warning("Waiting for http queues to start...")
             time.sleep(1)
     else:
+        if CLOUD_QUEUE_TYPE != "redis":
+            raise ValueError(
+                f"Queue system {queue_system} is not correctly configured. You seem to have requested HTTP queues, "
+                f"but are using an old version of the lightning framework ({LIGHTNING_VERSION}) that doesn't support "
+                f"HTTP queues. Try upgrading the lightning framework to the latest version."
+ ) while not check_if_redis_running(): if (int(time.time()) - queue_check_start_time) % 10 == 0: logger.warning("Waiting for redis queues to start...") diff --git a/src/lightning/app/launcher/lightning_backend.py b/src/lightning/app/launcher/lightning_backend.py index 0a974057ef070..2d0876472755e 100644 --- a/src/lightning/app/launcher/lightning_backend.py +++ b/src/lightning/app/launcher/lightning_backend.py @@ -1,13 +1,29 @@ +import contextlib import inspect import json import logging import os import random import string -import urllib from time import monotonic, sleep, time from typing import List, Optional +from lightning.app import LightningApp, LightningWork +from lightning.app.core.queues import QueuingSystem +from lightning.app.runners.backends.backend import Backend +from lightning.app.storage import Drive +from lightning.app.utilities.enum import WorkStageStatus, WorkStopReasons, make_status +from lightning.app.utilities.network import LightningClient + +with contextlib.suppress(ImportError, ModuleNotFoundError): + # TODO: remove try block and just import after lighting_app > 0.6.3 is released + from lightning.app.storage import Mount + +try: + from lightning.app.utilities.exceptions import LightningPlatformException +except ImportError: + LightningPlatformException = Exception + from lightning_cloud.openapi import ( AppinstancesIdBody, Externalv1LightningappInstance, @@ -34,36 +50,31 @@ ) from lightning_cloud.openapi.rest import ApiException -from lightning.app.core import LightningApp, LightningWork -from lightning.app.core.queues import QueuingSystem -from lightning.app.runners.backends.backend import Backend -from lightning.app.storage import Drive, Mount -from lightning.app.utilities.enum import WorkStageStatus, WorkStopReasons, make_status -from lightning.app.utilities.exceptions import LightningPlatformException -from lightning.app.utilities.network import LightningClient, _check_service_url_is_ready +from lightning.app.launcher.utils import LIGHTNING_VERSION, cloud_work_stage_to_work_status_stage logger = logging.getLogger(__name__) -from lightning_cloud.openapi import SpecLightningappInstanceIdWorksBody, WorksIdBody # noqa: E402 +# TODO: For future travelers: This backward incompatible change is being introduced when lightning app is at 0.6.0 +# Once we are safe to remove the support for 0.6.0, remove this ugly import +try: + from lightning_cloud.openapi import SpecLightningappInstanceIdWorksBody, WorksIdBody +except ImportError: + logger.warning( + f"You are using an old version of lightning ({LIGHTNING_VERSION}). " f"Please upgrade to the latest version." + ) + from lightning_cloud.openapi import Body5 as SpecLightningappInstanceIdWorksBody + from lightning_cloud.openapi import Body6 as WorksIdBody +except Exception as e: + logger.warning( + f"You are using an old version of lightning ({LIGHTNING_VERSION}). " + f"Please upgrade to the latest version. 
{e}" + ) + from lightning_cloud.openapi import Body5 as SpecLightningappInstanceIdWorksBody + from lightning_cloud.openapi import Body6 as WorksIdBody LIGHTNING_STOP_TIMEOUT = int(os.getenv("LIGHTNING_STOP_TIMEOUT", 2 * 60)) -def cloud_work_stage_to_work_status_stage(stage: V1LightningworkState) -> str: - """Maps the Work stage names from the cloud backend to the status names in the Lightning framework.""" - mapping = { - V1LightningworkState.STOPPED: WorkStageStatus.STOPPED, - V1LightningworkState.PENDING: WorkStageStatus.PENDING, - V1LightningworkState.NOT_STARTED: WorkStageStatus.PENDING, - V1LightningworkState.IMAGE_BUILDING: WorkStageStatus.PENDING, - V1LightningworkState.RUNNING: WorkStageStatus.RUNNING, - V1LightningworkState.FAILED: WorkStageStatus.FAILED, - } - if stage not in mapping: - raise ValueError(f"Cannot map the lightning-cloud work state {stage} to the lightning status stage.") - return mapping[stage] - - class CloudBackend(Backend): def __init__( self, @@ -116,10 +127,11 @@ def _work_to_spec(work: LightningWork) -> V1LightningworkSpec: ), ) - # this should really be part of the work.cloud_compute struct, but to save - # time we are not going to modify the backend in this set of PRs & instead - # use the same s3 drives API which we used before. - if work.cloud_compute.mounts is not None: + # TODO: remove after we move lighting_app past v0.6.3 + if hasattr(work.cloud_compute, "mounts") and work.cloud_compute.mounts is not None: + # this should really be part of the work.cloud_compute struct, but to save + # time we are not going to modify the backend in this set of PRs & instead + # use the same s3 drives API which we used before. if isinstance(work.cloud_compute.mounts, Mount): drive_specs.append( _create_mount_drive_spec( @@ -250,8 +262,8 @@ def update_work_statuses(self, works: List[LightningWork]) -> None: """Pulls the status of each Work instance in the cloud. Normally, the Lightning frameworks communicates statuses through the queues, but while the Work instance is - being provisionied, the queues don't exist yet and hence we need to make API calls directly to the backend to - fetch the status and update it in the states. + being provisionied, the queues don't exist yet and hence we need to make API calls directly to the Grid backend + to fetch the status and update it in the states. """ if not works: @@ -334,33 +346,34 @@ def all_works_stopped(works: List[Externalv1Lightningwork]) -> bool: break def resolve_url(self, app, base_url: Optional[str] = None) -> None: - if not self.base_url: - self.base_url = base_url - - for flow in app.flows: - if self.base_url: - # Replacing the path with complete URL - if not (self.base_url.startswith("http://") or self.base_url.startswith("https://")): - raise ValueError( - "Base URL doesn't have a valid scheme, expected it to start with 'http://' or 'https://' " - ) - if isinstance(flow._layout, dict) and "target" not in flow._layout: - # FIXME: Why _check_service_url_is_ready doesn't work ? 
- frontend_url = urllib.parse.urljoin(self.base_url, flow.name + "/") - flow._layout["target"] = frontend_url - - for work in app.works: - if ( - work._url == "" - and work.status.stage - in ( - WorkStageStatus.RUNNING, - WorkStageStatus.SUCCEEDED, - ) - and work._internal_ip != "" - and _check_service_url_is_ready(f"http://{work._internal_ip}:{work._port}") - ): - work._url = work._future_url + pass + # if not self.base_url: + # self.base_url = base_url + + # for flow in app.flows: + # if self.base_url: + # # Replacing the path with complete URL + # if not (self.base_url.startswith("http://") or self.base_url.startswith("https://")): + # raise ValueError( + # "Base URL doesn't have a valid scheme, expected it to start with 'http://' or 'https://' " + # ) + # if isinstance(flow._layout, dict) and "target" not in flow._layout: + # # FIXME: Why _check_service_url_is_ready doesn't work ? + # frontend_url = urllib.parse.urljoin(self.base_url, flow.name + "/") + # flow._layout["target"] = frontend_url + + # for work in app.works: + # if ( + # work._url == "" + # and work.status.stage + # in ( + # WorkStageStatus.RUNNING, + # WorkStageStatus.SUCCEEDED, + # ) + # and work._internal_ip != "" + # and _check_service_url_is_ready(f"http://{work._internal_ip}:{work._port}") + # ): + # work._url = work._future_url @staticmethod def _get_proxy_scheme() -> str: @@ -399,7 +412,7 @@ def _handle_idle_timeout(self, idle_timeout: float, work: LightningWork, resp: E def _register_queues(self, app, work): super()._register_queues(app, work) - kw = {"queue_id": self.queue_id, "work_name": work.name} + kw = dict(queue_id=self.queue_id, work_name=work.name) # noqa: C408 app.work_queues.update({work.name: self.queues.get_work_queue(**kw)}) def stop_work(self, app: LightningApp, work: LightningWork) -> None: @@ -448,7 +461,7 @@ def _delete_work(self, work_resp: Externalv1Lightningwork) -> None: ) print(f"Deleting {work_resp.name} ...") - def update_lightning_app_frontend(self, app: "lightning.LightningApp"): # noqa: F821 + def update_lightning_app_frontend(self, app): """Used to create frontend's if the app couldn't be loaded locally.""" if not len(app.frontends.keys()): return @@ -479,7 +492,7 @@ def update_lightning_app_frontend(self, app: "lightning.LightningApp"): # noqa: body=AppinstancesIdBody(spec=spec), ) - def stop_app(self, app: "lightning.LightningApp"): # noqa: F821 + def stop_app(self, app): """Used to mark the App has stopped if everything has fine.""" external_app_spec: "Externalv1LightningappInstance" = ( diff --git a/src/lightning/app/launcher/lightning_hybrid_backend.py b/src/lightning/app/launcher/lightning_hybrid_backend.py index a5b82cd602601..249f9690811cf 100644 --- a/src/lightning/app/launcher/lightning_hybrid_backend.py +++ b/src/lightning/app/launcher/lightning_hybrid_backend.py @@ -39,15 +39,15 @@ def _prepare_work_creation(self, app, work) -> None: client = LightningClient() list_apps_resp = client.lightningapp_instance_service_list_lightningapp_instances(project_id=project_id) - lit_app: Optional[Externalv1LightningappInstance] = None + lightning_app: Optional[Externalv1LightningappInstance] = None - for lapp in list_apps_resp.lightningapps: - if lapp.id == app_id: - lit_app = lapp + for lightningapp in list_apps_resp.lightningapps: + if lightningapp.id == app_id: + lightning_app = lightningapp - assert lit_app + assert lightning_app - network_configs = lit_app.spec.network_config + network_configs = lightning_app.spec.network_config index = len(self.work_to_network_configs) @@ -55,12 
+55,12 @@ def _prepare_work_creation(self, app, work) -> None: self.work_to_network_configs[work.name] = network_configs[index] # Enable Ingress and update the specs. - lit_app.spec.network_config[index].enable = True + lightning_app.spec.network_config[index].enable = True client.lightningapp_instance_service_update_lightningapp_instance( project_id=project_id, - id=lit_app.id, - body=AppinstancesIdBody(name=lit_app.name, spec=lit_app.spec), + id=lightning_app.id, + body=AppinstancesIdBody(name=lightning_app.name, spec=lightning_app.spec), ) work_network_config = self.work_to_network_configs[work.name] @@ -85,7 +85,7 @@ def resolve_url(self, app, base_url: Optional[str] = None) -> None: backend = self._get_backend(works[0]) backend.resolve_url(app, base_url) - def update_lightning_app_frontend(self, app: "lightning.LightningApp"): # noqa: F821 + def update_lightning_app_frontend(self, app): self.backends["cloud"].update_lightning_app_frontend(app) def stop_work(self, app, work) -> None: @@ -107,24 +107,24 @@ def _prepare_work_stop(self, app, work): client = LightningClient() list_apps_resp = client.lightningapp_instance_service_list_lightningapp_instances(project_id=project_id) - lit_app: Optional[Externalv1LightningappInstance] = None + lightning_app: Optional[Externalv1LightningappInstance] = None - for lapp in list_apps_resp.lightningapps: - if lapp.id == app_id: - lit_app = lapp + for lightningapp in list_apps_resp.lightningapps: + if lightningapp.id == app_id: + lightning_app = lightningapp - assert lit_app + assert lightning_app network_config = self.work_to_network_configs[work.name] - for nc in lit_app.spec.network_config: + for nc in lightning_app.spec.network_config: if nc.host == network_config.host: nc.enable = False client.lightningapp_instance_service_update_lightningapp_instance( project_id=project_id, - id=lit_app.id, - body=AppinstancesIdBody(name=lit_app.name, spec=lit_app.spec), + id=lightning_app.id, + body=AppinstancesIdBody(name=lightning_app.name, spec=lightning_app.spec), ) del self.work_to_network_configs[work.name] @@ -150,6 +150,6 @@ def _get_app_id() -> str: def _get_project_id() -> str: return os.environ["LIGHTNING_CLOUD_PROJECT_ID"] - def stop_app(self, app: "lightning.LightningApp"): # noqa: F821 + def stop_app(self, app): """Used to mark the App has stopped if everything has fine.""" self.backends["cloud"].stop_app(app) diff --git a/src/lightning/app/launcher/utils.py b/src/lightning/app/launcher/utils.py new file mode 100644 index 0000000000000..b6a3859830a49 --- /dev/null +++ b/src/lightning/app/launcher/utils.py @@ -0,0 +1,97 @@ +import functools +import logging +import os +import signal +from typing import Any, Callable + +import psutil +from lightning_cloud.openapi import V1LightningworkState + +from lightning.app import LightningApp, _logger, _root_logger +from lightning.app import __version__ as LIGHTNING_VERSION +from lightning.app.utilities.enum import WorkStageStatus + + +def cloud_work_stage_to_work_status_stage(stage: V1LightningworkState) -> str: + """Maps the Work stage names from the Grid cloud backend to the status names in the Lightning framework.""" + mapping = { + V1LightningworkState.STOPPED: WorkStageStatus.STOPPED, + V1LightningworkState.PENDING: WorkStageStatus.PENDING, + V1LightningworkState.NOT_STARTED: WorkStageStatus.PENDING, + V1LightningworkState.IMAGE_BUILDING: WorkStageStatus.PENDING, + V1LightningworkState.RUNNING: WorkStageStatus.RUNNING, + V1LightningworkState.FAILED: WorkStageStatus.FAILED, + } + if stage not in mapping: + 
raise ValueError(f"Cannot map the lightning-cloud work state {stage} to the lightning status stage.") + return mapping[stage] + + +def _print_to_logger_info(*args: Any, **kwargs: Any) -> None: + # TODO Find a better way to re-direct print to loggers. + _logger.info(" ".join([str(v) for v in args])) + + +def convert_print_to_logger_info(func: Callable) -> Callable: + """This function is used to transform any print into logger.info calls, so it gets tracked in the cloud.""" + + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + original_print = __builtins__["print"] + __builtins__["print"] = _print_to_logger_info + res = func(*args, **kwargs) + __builtins__["print"] = original_print + return res + + return wrapper + + +def _enable_debugging() -> None: + tar_file = os.path.join(os.getcwd(), f"lightning-{LIGHTNING_VERSION}.tar.gz") + + if not os.path.exists(tar_file): + return + + _root_logger.propagate = True + _logger.propagate = True + _root_logger.setLevel(logging.DEBUG) + _root_logger.debug("Setting debugging mode.") + + +def enable_debugging(func: Callable) -> Callable: + """This function is used set the logging level to DEBUG and set it back to INFO once the function is done.""" + + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + _enable_debugging() + res = func(*args, **kwargs) + _logger.setLevel(logging.INFO) + return res + + return wrapper + + +def exit_app(app: LightningApp) -> None: + """This function checks if dumb-init is running on process 0 and exits the containter with exit code 0. + + Otherwise we fall back to stopping the app via backend API call + + """ + try: + # Get process information for PID 1, where dumb-init is running + process = psutil.Process(1) + process_name = process.name() + + # This kills the dumb-init process running on pid 1 + # There's issues propagating the exit code through regular python + # program exit, so we directly kill the dumb-init process + # which causes the flow container to exit with status code 0 + if "dumb-init" in process_name.lower(): + print("Killing dumb-init and exiting the container..") + os.kill(1, signal.SIGTERM) + else: + print("Process 1 not running dumb-init. Stopping the app..") + app.backend.stop_app(app) + except psutil.NoSuchProcess: + print("Process with PID 1 not found. Stopping the app..") + app.backend.stop_app(app) diff --git a/src/lightning/app/runners/backends/backend.py b/src/lightning/app/runners/backends/backend.py index 4b50f0d171482..2bf1f1b306593 100644 --- a/src/lightning/app/runners/backends/backend.py +++ b/src/lightning/app/runners/backends/backend.py @@ -16,6 +16,7 @@ from functools import partial from typing import TYPE_CHECKING, Any, Callable, List, Optional +from lightning.app.core.constants import IS_DISTRIBUTED_PLUGIN from lightning.app.core.queues import QueuingSystem from lightning.app.utilities.proxies import ProxyWorkRun, unwrap @@ -71,8 +72,10 @@ def _dynamic_run_wrapper( work.run = work_run - # 2. Create the work - self.create_work(app, work) + # Note: This is an optimization as the MMT is created directly within the launcher. + if not IS_DISTRIBUTED_PLUGIN: + # 2. Create the work + self.create_work(app, work) # 3. 
         work._backend = self
diff --git a/src/lightning/app/utilities/proxies.py b/src/lightning/app/utilities/proxies.py
index bce7d661cbb03..f16cf9086c97a 100644
--- a/src/lightning/app/utilities/proxies.py
+++ b/src/lightning/app/utilities/proxies.py
@@ -401,6 +401,7 @@ class WorkRunner:
     copy_response_queue: "BaseQueue"
     flow_to_work_delta_queue: Optional["BaseQueue"] = None
     run_executor_cls: Type[WorkRunExecutor] = WorkRunExecutor
+    enable_copier: bool = constants.ENABLE_ORCHESTRATOR
 
     def __post_init__(self):
         self.parallel = self.work.parallel
@@ -417,7 +418,8 @@ def __call__(self):
             if self.state_observer.started:
                 self.state_observer.join(0)
             self.state_observer = None
-            self.copier.join(0)
+            if self.copier:
+                self.copier.join(0)
         except LightningSigtermStateException as ex:
             logger.debug("Exiting")
             os._exit(ex.exit_code)
@@ -429,7 +431,8 @@ def __call__(self):
             if self.state_observer.started:
                 self.state_observer.join(0)
             self.state_observer = None
-            self.copier.join(0)
+            if self.copier:
+                self.copier.join(0)
             raise ex
 
     def setup(self):
@@ -448,17 +451,34 @@ def setup(self):
 
         # 3. Starts the Copier thread. This thread enables transferring files using
         #    the Path object between works.
-        self.copier = _Copier(self.work, self.copy_request_queue, self.copy_response_queue)
-        self.copier.setDaemon(True)
-        self.copier.start()
+        if self.enable_copier:
+            self.copier = _Copier(self.work, self.copy_request_queue, self.copy_response_queue)
+            self.copier.setDaemon(True)
+            self.copier.start()
 
         # 4. If the work is restarting, reload the latest state.
        # TODO (tchaton) Add support for capturing the latest state.
         if self.work._restarting:
             self.work.load_state_dict(self.work.state)
 
-        # 5. Inform the flow that the work is ready to receive data through the caller queue.
-        self.readiness_queue.put(True)
+        # 7. Deepcopy the work state and send the first `RUNNING` status delta to the flow.
+        reference_state = deepcopy(self.work.state)
+
+        # Set the internal IP address.
+        # Set this here after the state observer is initialized, since it needs to record it as a change and send
+        # it back to the flow.
+        default_internal_ip = "127.0.0.1" if constants.LIGHTNING_CLOUDSPACE_HOST is None else "0.0.0.0"  # noqa: S104
+        self.work._internal_ip = os.environ.get("LIGHTNING_NODE_PRIVATE_IP", default_internal_ip)
+        self.work._public_ip = os.environ.get("LIGHTNING_NODE_IP", "")
+
+        self.work.on_start()
+
+        delta = Delta(DeepDiff(reference_state, self.work.state))
+        logger.debug(f"Sending delta_queue {delta}")
+        self.delta_queue.put(ComponentDelta(id=self.work_name, delta=delta))
+
+        # # 8. Inform the flow that the work is ready to receive data through the caller queue.
+        # self.readiness_queue.put(True)
 
     def run_once(self):
         # 1. Wait for the caller queue data.
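
For readers following the `setup()` changes above: the runner never ships the full
work state to the flow, only a diff. A minimal sketch of that mechanism, using just
the public `deepdiff` API (the state layout below is a made-up example, not the real
work state):

    from copy import deepcopy

    from deepdiff import DeepDiff, Delta

    reference_state = {"vars": {"counter": 0}}
    state = deepcopy(reference_state)
    state["vars"]["counter"] = 1  # the work mutates its own copy of the state

    # Only the diff travels over the delta queue; applying it with `+` restores
    # the updated state on the flow side.
    delta = Delta(DeepDiff(reference_state, state, verbose_level=2))
    assert reference_state + delta == {"vars": {"counter": 1}}
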
@@ -618,7 +638,8 @@ def _sigterm_signal_handler(self, signum, frame, call_hash: str) -> None: delta = Delta(DeepDiff(state, deepcopy(self.work.state), verbose_level=2)) self.delta_queue.put(ComponentDelta(id=self.work_name, delta=delta)) - self.copier.join(0) + if self.copier: + self.copier.join(0) raise LightningSigtermStateException(0) def _proxy_setattr(self, cleanup: bool = False): diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index d8c6fe47b6630..0af94fb3ac922 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -56,17 +56,23 @@ def _legacy_main() -> None: Raises deprecation warning and runs through fabric cli if necessary, else runs the entrypoint directly """ - print( - "`lightning run model` is deprecated and will be removed in future versions." - " Please call `fabric run` instead." - ) - args = sys.argv[1:] - if args and args[0] == "run" and args[1] == "model": - _main() + hparams = sys.argv[1:] + if len(hparams) >= 2 and hparams[0] == "run": + if hparams[1] == "model": + print( + "`lightning run model` is deprecated and will be removed in future versions." + " Please call `fabric run` instead." + ) + _main() + return + + from lightning.app.cli.lightning_cli import main as main_cli + + main_cli() return if _LIGHTNING_SDK_AVAILABLE: - subprocess.run([sys.executable, "-m", "lightning_sdk.cli.entrypoint"] + args) + subprocess.run([sys.executable, "-m", "lightning_sdk.cli.entrypoint"] + hparams) return @click.group() diff --git a/src/lightning_app/__main__.py b/src/lightning_app/__main__.py index dc40614cf3d8f..57b27ab968c82 100644 --- a/src/lightning_app/__main__.py +++ b/src/lightning_app/__main__.py @@ -1,4 +1,4 @@ -from lightning_app.cli.lightning_cli import main +from lightning.app.cli.lightning_cli import main if __name__ == "__main__": main() diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index cfc747d729919..e3e936e22ca0a 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -5,7 +5,7 @@ import pytest from click.testing import CliRunner from lightning.app import __version__ -from lightning.app.cli.lightning_cli import _main, login, logout, run +from lightning.app.cli.lightning_cli import _main, logout, run from lightning.app.cli.lightning_cli_delete import delete from lightning.app.cli.lightning_cli_list import get_list, list_apps from lightning.app.utilities.exceptions import _ApiExceptionHandler @@ -29,30 +29,6 @@ def test_main_lightning_cli_no_arguments(): assert "show " in res -def test_main_lightning_cli_help(): - """Validate the Lightning CLI.""" - res = os.popen("lightning_app --help").read() - assert "login " in res - assert "logout " in res - assert "run " in res - assert "list " in res - assert "delete " in res - assert "show " in res - - res = os.popen("lightning_app run --help").read() - assert "app " in res - - # hidden run commands should not appear in the help text - assert "server" not in res - assert "flow" not in res - assert "work" not in res - assert "frontend" not in res - - # inspect show group - res = os.popen("lightning_app show --help").read() - assert "logs " in res - - @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning.app.cli.cmd_apps._AppManager.list") def test_list_apps(list_command: mock.MagicMock): @@ -60,16 +36,6 @@ def test_list_apps(list_command: mock.MagicMock): runner.invoke(list_apps) -@mock.patch("lightning.app.utilities.login.Auth._run_server") 
-@mock.patch("lightning.app.utilities.login.Auth.clear") -def test_cli_login(clear: mock.MagicMock, run_server: mock.MagicMock): - runner = CliRunner() - runner.invoke(login) - - clear.assert_called_once_with() - run_server.assert_called_once() - - @mock.patch("pathlib.Path.unlink") @mock.patch("pathlib.Path.exists") @pytest.mark.parametrize("creds", [True, False]) diff --git a/tests/tests_app/cli/test_cmd_launch.py b/tests/tests_app/cli/test_cmd_launch.py index 167e896fba11c..d2371d02de28e 100644 --- a/tests/tests_app/cli/test_cmd_launch.py +++ b/tests/tests_app/cli/test_cmd_launch.py @@ -189,8 +189,8 @@ def start_processes(**functions): launcher.manage_server_processes(processes) +@pytest.mark.skipif(True, reason="flaky") @_RunIf(skip_windows=True) -@pytest.mark.flaky(reruns=3) def test_manage_server_processes_one_process_gets_killed(capfd): functions = {"p1": run_forever_process, "p2": run_for_2_seconds_and_raise} p = Process(target=start_processes, kwargs=functions) diff --git a/tests/tests_app/components/multi_node/test_trainer.py b/tests/tests_app/components/multi_node/test_trainer.py index 1258cbe0176e0..bd7b5836c6d12 100644 --- a/tests/tests_app/components/multi_node/test_trainer.py +++ b/tests/tests_app/components/multi_node/test_trainer.py @@ -87,6 +87,7 @@ def test_trainer_run_executor_arguments_choices( assert env_vars["TORCHELASTIC_RUN_ID"] == "1" +@pytest.mark.skipif(True, reason="not maintained") @pytest.mark.skipif(not module_available("lightning"), reason="lightning not available") def test_trainer_run_executor_invalid_strategy_instances(): with pytest.raises(ValueError, match="DDP Spawned strategies aren't supported yet."): diff --git a/tests/tests_app/core/test_constants.py b/tests/tests_app/core/test_constants.py index 489334a06e87e..e5679bb7e6cd1 100644 --- a/tests/tests_app/core/test_constants.py +++ b/tests/tests_app/core/test_constants.py @@ -1,9 +1,22 @@ +import json import os from unittest import mock -from lightning.app.core.constants import get_lightning_cloud_url +from lightning.app.core.constants import DistributedPluginChecker, get_lightning_cloud_url @mock.patch.dict(os.environ, {"LIGHTNING_CLOUD_URL": "https://beta.lightning.ai"}) def test_defaults(): assert get_lightning_cloud_url() == "https://beta.lightning.ai" + + +def test_distributed_checker(monkeypatch): + monkeypatch.setenv("DISTRIBUTED_ARGUMENTS", str(json.dumps({"num_instances": 2}))) + monkeypatch.setenv("LIGHTNING_CLOUD_WORK_NAME", "nodes.0") + assert bool(DistributedPluginChecker()) + + monkeypatch.setenv("LIGHTNING_CLOUD_WORK_NAME", "nodes.1") + assert bool(DistributedPluginChecker()) + + monkeypatch.setenv("LIGHTNING_CLOUD_WORK_NAME", "nodes.2") + assert not bool(DistributedPluginChecker()) diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index a70cbb853e437..cf5b4f66240a2 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -24,7 +24,6 @@ from lightning.app.testing.testing import LightningTestApp from lightning.app.utilities.app_helpers import affiliation from lightning.app.utilities.enum import AppStage, WorkStageStatus, WorkStopReasons -from lightning.app.utilities.imports import _IS_WINDOWS from lightning.app.utilities.packaging import cloud_compute from lightning.app.utilities.redis import check_if_redis_running from lightning.app.utilities.warnings import LightningFlowWarning @@ -619,7 +618,7 @@ def run(self): # TODO (tchaton) Resolve this test. 
-@pytest.mark.skipif(_IS_WINDOWS, reason="timeout with system crash")
+@pytest.mark.skipif(True, reason="timeout with system crash")
 @pytest.mark.xfail(strict=False, reason="flaky test which never terminates")
 @pytest.mark.parametrize("runtime_cls", [MultiProcessRuntime])
 @pytest.mark.parametrize("use_same_args", [True])
@@ -679,6 +678,7 @@ def test_lightning_app_checkpointing_with_nested_flows():
     assert app.root.flow.flow.flow.flow.flow.flow.flow.flow.flow.flow.work.counter == 5
 
 
+@pytest.mark.skipif(True, reason="deprecated")
 @pytest.mark.xfail(strict=False, reason="test is skipped because CI was blocking all the PRs.")
 def test_load_state_dict_from_checkpoint_dir(tmpdir):
     work = CheckpointCounter()
diff --git a/tests/tests_app/core/test_lightning_work.py b/tests/tests_app/core/test_lightning_work.py
index 443851d97990f..d3af9d5516d4a 100644
--- a/tests/tests_app/core/test_lightning_work.py
+++ b/tests/tests_app/core/test_lightning_work.py
@@ -197,11 +197,11 @@ def run(self):
     with contextlib.suppress(Exception, Empty):
         work_runner()
 
-    res = delta_queue._queue[0].delta.to_dict()["iterable_item_added"]
+    res = delta_queue._queue[1].delta.to_dict()["iterable_item_added"]
     L = len(delta_queue._queue) - 1
     if enable_exception:
         exception_cls = Exception if raise_exception else Empty
-        assert isinstance(error_queue._queue[0], exception_cls)
+        assert isinstance(error_queue._queue[-1], exception_cls)
         res_end = delta_queue._queue[L].delta.to_dict()["iterable_item_added"]
         res_end[f"root['calls']['{call_hash}']['statuses'][1]"]["stage"] == "failed"
         res_end[f"root['calls']['{call_hash}']['statuses'][1]"]["message"] == "Custom Exception"
diff --git a/tests/tests_app/core/test_queues.py b/tests/tests_app/core/test_queues.py
index 0f68d8aa1ff98..328302838ba98 100644
--- a/tests/tests_app/core/test_queues.py
+++ b/tests/tests_app/core/test_queues.py
@@ -6,10 +6,9 @@
 from unittest import mock
 
 import pytest
-import requests_mock
 from lightning.app import LightningFlow
 from lightning.app.core import queues
-from lightning.app.core.constants import HTTP_QUEUE_URL, STATE_UPDATE_TIMEOUT
+from lightning.app.core.constants import STATE_UPDATE_TIMEOUT
 from lightning.app.core.queues import (
     READINESS_QUEUE_CONSTANT,
     BaseQueue,
@@ -168,82 +167,63 @@ def test_redis_raises_error_if_failing(redis_mock):
         my_queue.length()
 
 
-class TestHTTPQueue:
-    def test_http_queue_failure_on_queue_name(self):
-        test_queue = HTTPQueue("test", STATE_UPDATE_TIMEOUT)
-        with pytest.raises(ValueError, match="App ID couldn't be extracted"):
-            test_queue.put("test")
-
-        with pytest.raises(ValueError, match="App ID couldn't be extracted"):
-            test_queue.get()
-
-        with pytest.raises(ValueError, match="App ID couldn't be extracted"):
-            test_queue.length()
-
-    def test_http_queue_put(self, monkeypatch):
-        monkeypatch.setattr(queues, "HTTP_QUEUE_TOKEN", "test-token")
-        test_queue = HTTPQueue("test_http_queue", STATE_UPDATE_TIMEOUT)
-        test_obj = LightningFlow()
-
-        # mocking requests and responses
-        adapter = requests_mock.Adapter()
-        test_queue.client.session.mount("http://", adapter)
-        adapter.register_uri(
-            "GET",
-            f"{HTTP_QUEUE_URL}/v1/test/http_queue/length",
-            request_headers={"Authorization": "Bearer test-token"},
-            status_code=200,
-            content=b"1",
-        )
-        adapter.register_uri(
-            "POST",
-            f"{HTTP_QUEUE_URL}/v1/test/http_queue?action=push",
-            status_code=201,
-            additional_matcher=lambda req: pickle.dumps(test_obj) == req._request.body,
-            request_headers={"Authorization": "Bearer test-token"},
-            content=b"data pushed",
-        )
-
-
test_queue.put(test_obj) - - def test_http_queue_get(self, monkeypatch): - monkeypatch.setattr(queues, "HTTP_QUEUE_TOKEN", "test-token") - test_queue = HTTPQueue("test_http_queue", STATE_UPDATE_TIMEOUT) - adapter = requests_mock.Adapter() - test_queue.client.session.mount("http://", adapter) - - adapter.register_uri( - "POST", - f"{HTTP_QUEUE_URL}/v1/test/http_queue?action=pop", - request_headers={"Authorization": "Bearer test-token"}, - status_code=200, - content=pickle.dumps("test"), - ) - assert test_queue.get() == "test" - - def test_http_queue_batch_get(self, monkeypatch): - monkeypatch.setattr(queues, "HTTP_QUEUE_TOKEN", "test-token") - test_queue = HTTPQueue("test_http_queue", STATE_UPDATE_TIMEOUT) - adapter = requests_mock.Adapter() - test_queue.client.session.mount("http://", adapter) - - adapter.register_uri( - "POST", - f"{HTTP_QUEUE_URL}/v1/test/http_queue?action=popCount", - request_headers={"Authorization": "Bearer test-token"}, - status_code=200, - json=[ - base64.b64encode(pickle.dumps("test")).decode("utf-8"), - base64.b64encode(pickle.dumps("test2")).decode("utf-8"), - ], - ) - assert test_queue.batch_get() == ["test", "test2"] +def test_http_queue_failure_on_queue_name(): + test_queue = HTTPQueue("test", STATE_UPDATE_TIMEOUT) + with pytest.raises(ValueError, match="App ID couldn't be extracted"): + test_queue.put("test") + + with pytest.raises(ValueError, match="App ID couldn't be extracted"): + test_queue.get() + + with pytest.raises(ValueError, match="App ID couldn't be extracted"): + test_queue.length() + + +def test_http_queue_put(monkeypatch): + monkeypatch.setattr(queues, "HTTP_QUEUE_TOKEN", "test-token") + test_queue = HTTPQueue("WORK_QUEUE", STATE_UPDATE_TIMEOUT) + + response = mock.MagicMock() + response.status_code = 201 + client = mock.MagicMock() + + client.post.return_value = response + test_queue.client = client + + test_obj = LightningFlow() + + test_queue.put(test_obj) + + +def test_http_queue_get(monkeypatch): + monkeypatch.setattr(queues, "HTTP_QUEUE_TOKEN", "test-token") + test_queue = HTTPQueue("WORK_QUEUE", STATE_UPDATE_TIMEOUT) + response = mock.MagicMock() + response.content = pickle.dumps("test") + client = mock.MagicMock() + client.post.return_value = response + test_queue.client = client + assert test_queue.get() == "test" + + +def test_http_queue_batch_get(monkeypatch): + monkeypatch.setattr(queues, "HTTP_QUEUE_TOKEN", "test-token") + test_queue = HTTPQueue("WORK_QUEUE", STATE_UPDATE_TIMEOUT) + response = mock.MagicMock() + response.json.return_value = [ + base64.b64encode(pickle.dumps("test")).decode("utf-8"), + base64.b64encode(pickle.dumps("test2")).decode("utf-8"), + ] + client = mock.MagicMock() + client.post.return_value = response + test_queue.client = client + assert test_queue.batch_get() == ["test", "test2"] def test_unreachable_queue(monkeypatch): monkeypatch.setattr(queues, "HTTP_QUEUE_TOKEN", "test-token") - test_queue = HTTPQueue("test_http_queue", STATE_UPDATE_TIMEOUT) + test_queue = HTTPQueue("WORK_QUEUE", STATE_UPDATE_TIMEOUT) resp1 = mock.MagicMock() resp1.status_code = 204 diff --git a/tests/tests_app/launcher/test_running_flow.py b/tests/tests_app/launcher/test_running_flow.py index 228047f0b0b8a..945f6076d8899 100644 --- a/tests/tests_app/launcher/test_running_flow.py +++ b/tests/tests_app/launcher/test_running_flow.py @@ -69,16 +69,16 @@ def _get_cloud_work_specs(self, *_): response.status_code = 200 monkeypatch.setattr(requests, "get", MagicMock(return_value=response)) - # testing with correct base URL - with 
pytest.raises(SystemExit, match="0"):
-        launcher.run_lightning_flow("file.py", queue_id="", base_url="http://localhost:8080")
-    assert flow._layout["target"] == "http://localhost:8080/flowname/"
+    # # testing with correct base URL
+    # with pytest.raises(SystemExit, match="0"):
+    #     launcher.run_lightning_flow("file.py", queue_id="", base_url="http://localhost:8080")
+    # assert flow._layout["target"] == "http://localhost:8080/flowname/"
 
-    app._run.assert_called_once()
+    # app._run.assert_called_once()
 
-    # testing with invalid base URL
-    with pytest.raises(ValueError, match="Base URL doesn't have a valid scheme"):
-        launcher.run_lightning_flow("file.py", queue_id="", base_url="localhost:8080")
+    # # testing with invalid base URL
+    # with pytest.raises(ValueError, match="Base URL doesn't have a valid scheme"):
+    #     launcher.run_lightning_flow("file.py", queue_id="", base_url="localhost:8080")
 
     app.flows = []
diff --git a/tests/tests_app/storage/test_path.py b/tests/tests_app/storage/test_path.py
index 2ba617d195ffc..5156dcb16743d 100644
--- a/tests/tests_app/storage/test_path.py
+++ b/tests/tests_app/storage/test_path.py
@@ -553,6 +553,7 @@ def run(self):
         self.stop()
 
 
+@pytest.mark.skipif(True, reason="deprecated")
 def test_path_get_overwrite(tmpdir):
     """Test that .get(overwrite=True) overwrites the entire directory and replaces all files."""
     root = OverwriteFolderFlow(tmpdir)
diff --git a/tests/tests_app/utilities/test_proxies.py b/tests/tests_app/utilities/test_proxies.py
index 3c5d830e30e02..1c883f935004b 100644
--- a/tests/tests_app/utilities/test_proxies.py
+++ b/tests/tests_app/utilities/test_proxies.py
@@ -149,18 +149,17 @@ def get(self, timeout: int = 0):
     with contextlib.suppress(Empty, Exception):
         work_runner()
 
-    assert readiness_queue._queue[0]
     if parallel:
         assert isinstance(error_queue._queue[0], Exception)
     else:
         assert isinstance(error_queue._queue[0], Empty)
-    assert len(delta_queue._queue) in [3, 4]
-    res = delta_queue._queue[0].delta.to_dict()["iterable_item_added"]
+    assert len(delta_queue._queue) in [3, 4, 5]
+    res = delta_queue._queue[1].delta.to_dict()["iterable_item_added"]
     assert res[f"root['calls']['{call_hash}']['statuses'][0]"]["stage"] == "running"
-    assert delta_queue._queue[1].delta.to_dict() == {
+    assert delta_queue._queue[2].delta.to_dict() == {
         "values_changed": {"root['vars']['counter']": {"new_value": 1}}
     }
-    index = 3 if len(delta_queue._queue) == 4 else 2
+    index = 4 if len(delta_queue._queue) == 5 else 2
     res = delta_queue._queue[index].delta.to_dict()["dictionary_item_added"]
     assert res[f"root['calls']['{call_hash}']['ret']"] is None
 
@@ -667,6 +666,7 @@ def run(self):
         response_queue=Mock(),
         copy_request_queue=Mock(),
         copy_response_queue=Mock(),
+        enable_copier=False,
     )
 
     # Make a fake call
@@ -687,11 +687,6 @@ def run(self):
     with mock.patch.dict(os.environ, environment, clear=True):
         work_runner.setup()
 
-    # The public ip address only becomes available once the hardware is up / the work is running.
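
(The removals above and below reflect the behavior change in `WorkRunner.setup()`:
the work's IPs are now resolved during setup instead of on the first `run_once()`.
A sketch of the resolution rule, with env names taken from the patch; reading the
cloudspace flag straight from the environment here is a simplification of
`constants.LIGHTNING_CLOUDSPACE_HOST`:)

    import os

    in_cloudspace = os.environ.get("LIGHTNING_CLOUDSPACE_HOST") is not None
    default_internal_ip = "0.0.0.0" if in_cloudspace else "127.0.0.1"

    internal_ip = os.environ.get("LIGHTNING_NODE_PRIVATE_IP", default_internal_ip)
    public_ip = os.environ.get("LIGHTNING_NODE_IP", "")
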
- assert work.public_ip == "" - assert work.internal_ip == "" - with contextlib.suppress(Empty): - work_runner.run_once() assert work.public_ip == expected_public_ip assert work.internal_ip == expected_private_ip From df0d4627385a76e4126620f358f03e3ea4bbf60f Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 26 Jun 2024 17:20:10 +0100 Subject: [PATCH 086/179] Add support for batch stop (#20017) --- .github/checkgroup.yml | 12 +- .github/workflows/ci-examples-app.yml | 4 +- .github/workflows/ci-tests-app.yml | 4 +- requirements/app/app.txt | 2 +- src/lightning/app/core/constants.py | 30 +++-- src/lightning/app/core/flow.py | 5 + src/lightning/app/launcher/launcher.py | 6 +- .../app/launcher/lightning_backend.py | 46 ++++++-- .../app/launcher/lightning_hybrid_backend.py | 5 + src/lightning/app/runners/backends/backend.py | 8 +- src/lightning/app/runners/backends/cloud.py | 3 + src/lightning/app/runners/backends/docker.py | 3 + .../app/runners/backends/mp_process.py | 4 + src/lightning/app/runners/cloud.py | 4 +- tests/tests_app/cli/test_cmd_launch.py | 2 +- tests/tests_app/cli/test_run_app.py | 2 + tests/tests_app/core/test_constants.py | 9 +- tests/tests_app/core/test_lightning_api.py | 2 +- tests/tests_app/core/test_lightning_app.py | 2 +- .../launcher/test_lightning_backend.py | 17 ++- tests/tests_app/runners/test_cloud.py | 104 ++---------------- tests/tests_app/structures/test_structures.py | 5 +- 22 files changed, 128 insertions(+), 151 deletions(-) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index c4c88b12a2598..720cb47b4bb07 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -242,9 +242,9 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "app-pytest (macOS-11, lightning, 3.8, latest)" - - "app-pytest (macOS-11, lightning, 3.8, oldest)" - - "app-pytest (macOS-11, app, 3.9, latest)" + - "app-pytest (macOS-12, lightning, 3.8, latest)" + - "app-pytest (macOS-12, lightning, 3.8, oldest)" + - "app-pytest (macOS-12, app, 3.9, latest)" - "app-pytest (macOS-12, app, 3.11, latest)" - "app-pytest (ubuntu-20.04, lightning, 3.8, latest)" - "app-pytest (ubuntu-20.04, lightning, 3.8, oldest)" @@ -270,9 +270,9 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "app-examples (macOS-11, lightning, 3.9, latest)" - - "app-examples (macOS-11, lightning, 3.9, oldest)" - - "app-examples (macOS-11, app, 3.9, latest)" + - "app-examples (macOS-12, lightning, 3.9, latest)" + - "app-examples (macOS-12, lightning, 3.9, oldest)" + - "app-examples (macOS-12, app, 3.9, latest)" - "app-examples (ubuntu-20.04, lightning, 3.9, latest)" - "app-examples (ubuntu-20.04, lightning, 3.9, oldest)" - "app-examples (ubuntu-20.04, app, 3.9, latest)" diff --git a/.github/workflows/ci-examples-app.yml b/.github/workflows/ci-examples-app.yml index 134930d84be14..b6db69e67aead 100644 --- a/.github/workflows/ci-examples-app.yml +++ b/.github/workflows/ci-examples-app.yml @@ -36,13 +36,13 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2022] + os: [ubuntu-20.04, macOS-12, windows-2022] pkg-name: ["lightning"] python-version: ["3.9"] requires: ["oldest", "latest"] include: # "app" installs the standalone package - - { os: "macOS-11", pkg-name: "app", python-version: "3.9", requires: "latest" } + - { os: "macOS-12", pkg-name: "app", python-version: "3.9", requires: "latest" } - { os: "ubuntu-20.04", pkg-name: "app", python-version: "3.9", requires: "latest" } - { os: "windows-2022", pkg-name: "app", python-version: "3.9", requires: "latest" } # Timeout: 
https://stackoverflow.com/a/59076067/4521646 diff --git a/.github/workflows/ci-tests-app.yml b/.github/workflows/ci-tests-app.yml index 8d8fb94181903..ee643fa397f43 100644 --- a/.github/workflows/ci-tests-app.yml +++ b/.github/workflows/ci-tests-app.yml @@ -38,7 +38,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-20.04", "macOS-11", "windows-2022"] + os: ["ubuntu-20.04", "macOS-12", "windows-2022"] pkg-name: ["lightning"] python-version: ["3.8"] requires: ["oldest", "latest"] @@ -48,7 +48,7 @@ jobs: - { os: "ubuntu-22.04", pkg-name: "app", python-version: "3.11", requires: "latest" } - { os: "windows-2022", pkg-name: "app", python-version: "3.11", requires: "latest" } # "app" installs the standalone package - - { os: "macOS-11", pkg-name: "app", python-version: "3.9", requires: "latest" } + - { os: "macOS-12", pkg-name: "app", python-version: "3.9", requires: "latest" } - { os: "ubuntu-20.04", pkg-name: "app", python-version: "3.9", requires: "latest" } - { os: "windows-2022", pkg-name: "app", python-version: "3.8", requires: "latest" } # Timeout: https://stackoverflow.com/a/59076067/4521646 diff --git a/requirements/app/app.txt b/requirements/app/app.txt index bdf9e51d7370d..25c9bb893fe60 100644 --- a/requirements/app/app.txt +++ b/requirements/app/app.txt @@ -1,4 +1,4 @@ -lightning-cloud == 0.5.69 # Must be pinned to ensure compatibility +lightning-cloud == 0.5.70 # Must be pinned to ensure compatibility packaging typing-extensions >=4.4.0, <4.10.0 deepdiff >=5.7.0, <6.6.0 diff --git a/src/lightning/app/core/constants.py b/src/lightning/app/core/constants.py index 449f4f078ce77..caf6996857417 100644 --- a/src/lightning/app/core/constants.py +++ b/src/lightning/app/core/constants.py @@ -15,7 +15,7 @@ import json import os from pathlib import Path -from typing import Optional +from typing import Any, Optional import lightning_cloud.env @@ -108,25 +108,31 @@ def get_lightning_cloud_url() -> str: class DistributedPluginChecker: def __init__(self) -> None: - distributed_arguments = os.getenv("DISTRIBUTED_ARGUMENTS", None) - work_name = os.getenv("LIGHTNING_CLOUD_WORK_NAME") + self.distributed_arguments = os.getenv("DISTRIBUTED_ARGUMENTS", None) + if self.distributed_arguments: + self.distributed_arguments = json.loads(self.distributed_arguments) self.running_distributed_plugin = False - if distributed_arguments and work_name: - distributed_arguments = json.loads(distributed_arguments) - assert distributed_arguments - num_nodes = distributed_arguments.get("num_instances", 0) - node_rank = int(work_name.split(".")[-1]) - - # Only the start with flow works are skipped for performance purposes - self.running_distributed_plugin = node_rank < num_nodes + if self.distributed_arguments and os.getenv("LIGHTNING_CLOUD_WORK_NAME"): + self.running_distributed_plugin = True def __bool__(self) -> bool: return self.running_distributed_plugin + def should_create_work(self, work: Any) -> bool: + if not self.distributed_arguments: + return True + + num_nodes = self.distributed_arguments.get("num_instances", 0) + node_rank = int(work.name.split(".")[-1]) + + # Only the start with flow works are skipped for performance purposes + return node_rank >= num_nodes + -IS_DISTRIBUTED_PLUGIN = DistributedPluginChecker() +# TODO (tchaton): Add LitData and JobPlugin optimizations +PLUGIN_CHECKER = IS_DISTRIBUTED_PLUGIN = DistributedPluginChecker() def enable_multiple_works_in_default_container() -> bool: diff --git a/src/lightning/app/core/flow.py b/src/lightning/app/core/flow.py index 
f9ffcca61c5a9..5f749f1ab9aed 100644 --- a/src/lightning/app/core/flow.py +++ b/src/lightning/app/core/flow.py @@ -836,6 +836,11 @@ def load_state_dict(self, flow_state, children_states, strict) -> None: elif strict: raise ValueError(f"The component {child_name} wasn't instantiated for the component {self.name}") + def stop_works(self, works: List[Any]) -> None: + if self._backend is None: + raise RuntimeError("Your flow should have a backend attached. Found None.") + self._backend.stop_works(works) + class _RootFlow(LightningFlow): def __init__(self, work: LightningWork) -> None: diff --git a/src/lightning/app/launcher/launcher.py b/src/lightning/app/launcher/launcher.py index 696ac2100f186..d9e24ad1d3974 100644 --- a/src/lightning/app/launcher/launcher.py +++ b/src/lightning/app/launcher/launcher.py @@ -16,7 +16,6 @@ CHECK_ERROR_QUEUE_INTERVAL, ENABLE_ORCHESTRATOR, IS_DISTRIBUTED_PLUGIN, - enable_multiple_works_in_default_container, ) from lightning.app.core.queues import MultiProcessQueue, QueuingSystem from lightning.app.storage.orchestrator import StorageOrchestrator @@ -35,10 +34,7 @@ except (ImportError, ModuleNotFoundError): ABLE_TO_RUN_APP_COMMANDS = False -if enable_multiple_works_in_default_container(): - from lightning.app.launcher.lightning_hybrid_backend import CloudHybridBackend as CloudBackend -else: - from lightning.app.launcher.lightning_backend import CloudBackend +from lightning.app.launcher.lightning_backend import CloudBackend from lightning.app.launcher.utils import LIGHTNING_VERSION, convert_print_to_logger_info, enable_debugging, exit_app if hasattr(constants, "get_cloud_queue_type"): diff --git a/src/lightning/app/launcher/lightning_backend.py b/src/lightning/app/launcher/lightning_backend.py index 2d0876472755e..ee037c8470402 100644 --- a/src/lightning/app/launcher/lightning_backend.py +++ b/src/lightning/app/launcher/lightning_backend.py @@ -25,6 +25,7 @@ LightningPlatformException = Exception from lightning_cloud.openapi import ( + AppIdWorksBody, AppinstancesIdBody, Externalv1LightningappInstance, Externalv1Lightningwork, @@ -149,9 +150,9 @@ def _work_to_spec(work: LightningWork) -> V1LightningworkSpec: ) if hasattr(work.cloud_compute, "interruptible"): - preemptible = work.cloud_compute.interruptible + spot = work.cloud_compute.interruptible else: - preemptible = work.cloud_compute.preemptible + spot = work.cloud_compute.preemptible colocation_group_id = None if hasattr(work.cloud_compute, "colocation_group_id"): @@ -161,7 +162,7 @@ def _work_to_spec(work: LightningWork) -> V1LightningworkSpec: name=work.cloud_compute.name, count=1, disk_size=work.cloud_compute.disk_size, - preemptible=preemptible, + spot=spot, shm_size=work.cloud_compute.shm_size, affinity_identifier=colocation_group_id, ) @@ -320,10 +321,7 @@ def stop_all_works(self, works: List[LightningWork]) -> None: The Works are stopped rather than deleted so that they can be inspected for debugging. 
""" - cloud_works = self._get_cloud_work_specs(self.client) - - for cloud_work in cloud_works: - self._stop_work(cloud_work) + self.stop_works(works) def all_works_stopped(works: List[Externalv1Lightningwork]) -> bool: for work in works: @@ -345,6 +343,40 @@ def all_works_stopped(works: List[Externalv1Lightningwork]) -> bool: if time() - t0 > LIGHTNING_STOP_TIMEOUT: break + def stop_works(self, works) -> None: + # Used to stop all the works in a batch + cloud_works = self._get_cloud_work_specs(self.client) + + cloud_works_to_stop = [] + for cloud_work in cloud_works: + # Skip the works already stopped + spec: V1LightningworkSpec = cloud_work.spec + if spec.desired_state == V1LightningworkState.DELETED: + # work is set to be deleted. Do nothing + continue + if spec.desired_state == V1LightningworkState.STOPPED: + # work is set to be stopped already. Do nothing + continue + if cloud_work.status.phase == V1LightningworkState.FAILED: + # work is already failed. Do nothing + continue + + for w in works: + if not w.has_failed and w.name == cloud_work.name: + cloud_works_to_stop.append(cloud_work) + break + + if cloud_works_to_stop: + self.client.lightningwork_service_batch_update_lightningworks( + project_id=CloudBackend._get_project_id(), + app_id=CloudBackend._get_app_id(), + body=AppIdWorksBody( + desired_state=V1LightningworkState.STOPPED, + work_ids=[w.id for w in cloud_works_to_stop], + ), + ) + print(f"Stopping {','.join([w.name for w in cloud_works_to_stop])} ...") + def resolve_url(self, app, base_url: Optional[str] = None) -> None: pass # if not self.base_url: diff --git a/src/lightning/app/launcher/lightning_hybrid_backend.py b/src/lightning/app/launcher/lightning_hybrid_backend.py index 249f9690811cf..27e3d02256751 100644 --- a/src/lightning/app/launcher/lightning_hybrid_backend.py +++ b/src/lightning/app/launcher/lightning_hybrid_backend.py @@ -79,6 +79,11 @@ def stop_all_works(self, works) -> None: backend = self._get_backend(works[0]) backend.stop_all_works(works) + def stop_works(self, works) -> None: + if works: + backend = self._get_backend(works[0]) + backend.stop_works(works) + def resolve_url(self, app, base_url: Optional[str] = None) -> None: works = app.works if works: diff --git a/src/lightning/app/runners/backends/backend.py b/src/lightning/app/runners/backends/backend.py index 2bf1f1b306593..abd9bbe24c1a8 100644 --- a/src/lightning/app/runners/backends/backend.py +++ b/src/lightning/app/runners/backends/backend.py @@ -16,7 +16,7 @@ from functools import partial from typing import TYPE_CHECKING, Any, Callable, List, Optional -from lightning.app.core.constants import IS_DISTRIBUTED_PLUGIN +from lightning.app.core.constants import PLUGIN_CHECKER from lightning.app.core.queues import QueuingSystem from lightning.app.utilities.proxies import ProxyWorkRun, unwrap @@ -52,6 +52,10 @@ def resolve_url(self, app, base_url: Optional[str] = None) -> None: def stop_work(self, app: "lightning.app.LightningApp", work: "lightning.app.LightningWork") -> None: pass + @abstractmethod + def stop_works(self, works: "List[lightning.app.LightningWork]") -> None: + pass + def _dynamic_run_wrapper( self, *args: Any, @@ -73,7 +77,7 @@ def _dynamic_run_wrapper( work.run = work_run # Note: This is an optimization as the MMT is created directly within the launcher. - if not IS_DISTRIBUTED_PLUGIN: + if PLUGIN_CHECKER.should_create_work(work): # 2. 
Create the work self.create_work(app, work) diff --git a/src/lightning/app/runners/backends/cloud.py b/src/lightning/app/runners/backends/cloud.py index efae58233e04f..0d3eeefef8cbe 100644 --- a/src/lightning/app/runners/backends/cloud.py +++ b/src/lightning/app/runners/backends/cloud.py @@ -47,3 +47,6 @@ def resolve_url(self, app, base_url: Optional[str] = None) -> None: def stop_work(self, app: "lightning.app.LightningApp", work: "lightning.app.LightningWork") -> None: raise NotImplementedError + + def stop_works(self, works: "List[lightning.app.LightningWork]") -> None: + raise NotImplementedError diff --git a/src/lightning/app/runners/backends/docker.py b/src/lightning/app/runners/backends/docker.py index 3d76d65a74ff1..cd3f14a9e2166 100644 --- a/src/lightning/app/runners/backends/docker.py +++ b/src/lightning/app/runners/backends/docker.py @@ -38,3 +38,6 @@ def update_work_statuses(self, works) -> None: def stop_all_works(self, works: List["lightning.app.LightningWork"]) -> None: pass + + def stop_works(self, works: "List[lightning.app.LightningWork]") -> None: + pass diff --git a/src/lightning/app/runners/backends/mp_process.py b/src/lightning/app/runners/backends/mp_process.py index 554a03c5c8e06..ddd0ed6eb5272 100644 --- a/src/lightning/app/runners/backends/mp_process.py +++ b/src/lightning/app/runners/backends/mp_process.py @@ -91,6 +91,10 @@ def create_work(self, app, work) -> None: def update_work_statuses(self, works) -> None: pass + def stop_works(self, works: "List[lightning.app.LightningWork]") -> None: + for w in works: + w.stop() + def stop_all_works(self, works: List["lightning.app.LightningWork"]) -> None: pass diff --git a/src/lightning/app/runners/cloud.py b/src/lightning/app/runners/cloud.py index 80fb03499e678..7928874a775d1 100644 --- a/src/lightning/app/runners/cloud.py +++ b/src/lightning/app/runners/cloud.py @@ -797,7 +797,7 @@ def _get_works(self, cloudspace: Optional[V1CloudSpace] = None) -> List[V1Work]: name=work.cloud_compute.name, count=1, disk_size=work.cloud_compute.disk_size, - preemptible=work.cloud_compute.interruptible, + spot=work.cloud_compute.interruptible, shm_size=work.cloud_compute.shm_size, affinity_identifier=work.cloud_compute.colocation_group_id, ) @@ -858,7 +858,7 @@ def _get_run_body( run_body.user_requested_flow_compute_config = V1UserRequestedFlowComputeConfig( name=self.app.flow_cloud_compute.name, shm_size=self.app.flow_cloud_compute.shm_size, - preemptible=False, + spot=False, ) run_body.is_headless = _is_headless(self.app) diff --git a/tests/tests_app/cli/test_cmd_launch.py b/tests/tests_app/cli/test_cmd_launch.py index d2371d02de28e..7ce9ea7e5b88c 100644 --- a/tests/tests_app/cli/test_cmd_launch.py +++ b/tests/tests_app/cli/test_cmd_launch.py @@ -208,7 +208,7 @@ def test_manage_server_processes_one_process_gets_killed(capfd): ) -@_RunIf(skip_windows=True) +@_RunIf(skip_windows=True, skip_mac_os=True) def test_manage_server_processes_all_processes_exits_with_zero_exitcode(capfd): functions = { "p1": exit_successfully_immediately, diff --git a/tests/tests_app/cli/test_run_app.py b/tests/tests_app/cli/test_run_app.py index d570e618b7226..56d833b3b25d0 100644 --- a/tests/tests_app/cli/test_run_app.py +++ b/tests/tests_app/cli/test_run_app.py @@ -10,11 +10,13 @@ from lightning.app import LightningApp from lightning.app.cli.lightning_cli import _run_app, run_app from lightning.app.runners.runtime_type import RuntimeType +from lightning.app.testing.helpers import _RunIf from lightning.app.utilities.app_helpers import 
convert_print_to_logger_info
 from tests_app import _PROJECT_ROOT
 
 
+@_RunIf(skip_windows=True, skip_mac_os=True)
 @mock.patch("click.launch")
 @pytest.mark.parametrize("open_ui", [True, False])
 def test_lightning_run_app(lauch_mock: mock.MagicMock, open_ui, caplog, monkeypatch):
diff --git a/tests/tests_app/core/test_constants.py b/tests/tests_app/core/test_constants.py
index e5679bb7e6cd1..df407a7d5ed71 100644
--- a/tests/tests_app/core/test_constants.py
+++ b/tests/tests_app/core/test_constants.py
@@ -19,4 +19,11 @@ def test_distributed_checker(monkeypatch):
     assert bool(DistributedPluginChecker())
 
     monkeypatch.setenv("LIGHTNING_CLOUD_WORK_NAME", "nodes.2")
-    assert not bool(DistributedPluginChecker())
+    assert bool(DistributedPluginChecker())
+
+    mock_work = mock.MagicMock()
+    mock_work.name = "nodes.1"
+    assert not DistributedPluginChecker().should_create_work(mock_work)
+
+    mock_work.name = "nodes.2"
+    assert DistributedPluginChecker().should_create_work(mock_work)
diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py
index 9b80d540c17e0..3b47f02f6e208 100644
--- a/tests/tests_app/core/test_lightning_api.py
+++ b/tests/tests_app/core/test_lightning_api.py
@@ -74,7 +74,7 @@ def run(self):
         self.work_a.run()
 
 
-@pytest.mark.skipif(sys.platform == "win32" or sys.platform == "darwin", reason="too slow on Windows or macOS")
+@pytest.mark.skipif(sys.platform == "win32", reason="too slow on Windows")
 def test_app_state_api():
     """This test validates the AppState can properly broadcast changes from work within its own process."""
     app = LightningApp(_A(), log_level="debug")
diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py
index cf5b4f66240a2..70426ee152bb4 100644
--- a/tests/tests_app/core/test_lightning_app.py
+++ b/tests/tests_app/core/test_lightning_app.py
@@ -497,7 +497,7 @@ def get(self, timeout):
     t0 = time()
     assert app._collect_deltas_from_ui_and_work_queues() == []
     delta = time() - t0
-    assert delta < app.state_accumulate_wait + 0.01, delta
+    assert delta < app.state_accumulate_wait + 0.05, delta
 
 
 class SimpleFlow2(LightningFlow):
diff --git a/tests/tests_app/launcher/test_lightning_backend.py b/tests/tests_app/launcher/test_lightning_backend.py
index 5e60d7930bd7f..9138408b6e53a 100644
--- a/tests/tests_app/launcher/test_lightning_backend.py
+++ b/tests/tests_app/launcher/test_lightning_backend.py
@@ -141,7 +141,7 @@ def test_stop_all_works(mock_client):
 
     spec1 = Mock()
     spec1.name = "root.work_a"
-    spec1.spec.desired_state = V1LightningworkState.RUNNING
+    spec1.spec.desired_state = V1LightningworkState.STOPPED
     spec1.status.phase = V1LightningworkState.FAILED
     spec2 = Mock()
     spec2.name = "root.work_b"
@@ -157,15 +157,14 @@ def _get_cloud_work_specs(self, *_):
         return value
 
     cloud_backend._get_cloud_work_specs = BackendMock()._get_cloud_work_specs
-    cloud_backend.stop_all_works([work_a, work_b])
 
-    mock_client().lightningwork_service_update_lightningwork.assert_called_with(
-        project_id="project_id",
-        id=ANY,
-        spec_lightningapp_instance_id="app_id",
-        body=ANY,
-    )
-    assert spec1.spec.desired_state == V1LightningworkState.RUNNING
+    def lightningwork_service_batch_update_lightningworks(*args, **kwargs):
+        spec2.spec.desired_state = V1LightningworkState.STOPPED
+
+    mock_client().lightningwork_service_batch_update_lightningworks = lightningwork_service_batch_update_lightningworks
+
+    cloud_backend.stop_all_works([work_a, work_b])
+    assert spec1.spec.desired_state == V1LightningworkState.STOPPED
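    # (spec1 is already in the desired STOPPED state, so `stop_works` skips it;
    # spec2 is stopped through the single batched
    # `lightningwork_service_batch_update_lightningworks` call mocked above,
    # which replaces the previous one-update-per-work API.)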
assert spec2.spec.desired_state == V1LightningworkState.STOPPED diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index 5f397284ebeaa..b33f906f96d8d 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -1,8 +1,6 @@ -import contextlib import logging import os import pathlib -import re import sys from copy import copy from pathlib import Path @@ -104,7 +102,7 @@ def get_cloud_runtime_request_body(**kwargs) -> "CloudspaceIdRunsBody": "dependency_cache_key": mock.ANY, "user_requested_flow_compute_config": V1UserRequestedFlowComputeConfig( name="flow-lite", - preemptible=False, + spot=False, shm_size=0, ), } @@ -342,7 +340,7 @@ def test_run_with_default_flow_compute_config(self, tmpdir, monkeypatch, flow_cl user_requested_flow_compute_config = None if flow_cloud_compute is not None: user_requested_flow_compute_config = V1UserRequestedFlowComputeConfig( - name=flow_cloud_compute.name, preemptible=False, shm_size=0 + name=flow_cloud_compute.name, spot=False, shm_size=0 ) body = get_cloud_runtime_request_body(user_requested_flow_compute_config=user_requested_flow_compute_config) @@ -656,7 +654,7 @@ def test_call_with_work_app(self, lightningapps, start_with_flow, monkeypatch, t count=1, disk_size=0, shm_size=0, - preemptible=False, + spot=False, ), network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], data_connection_mounts=[], @@ -854,7 +852,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch ), ], user_requested_compute_config=V1UserRequestedComputeConfig( - name="custom", count=1, disk_size=0, shm_size=0, preemptible=False + name="custom", count=1, disk_size=0, shm_size=0, spot=False ), network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], data_connection_mounts=[], @@ -971,7 +969,7 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin ), drives=[], user_requested_compute_config=V1UserRequestedComputeConfig( - name="custom", count=1, disk_size=0, shm_size=0, preemptible=mock.ANY + name="custom", count=1, disk_size=0, shm_size=0, spot=mock.ANY ), network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], cluster_id=mock.ANY, @@ -1147,7 +1145,7 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo count=1, disk_size=0, shm_size=0, - preemptible=False, + spot=False, ), network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], data_connection_mounts=[], @@ -1190,7 +1188,7 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo count=1, disk_size=0, shm_size=0, - preemptible=False, + spot=False, ), network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], data_connection_mounts=[], @@ -1367,7 +1365,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo count=1, disk_size=0, shm_size=0, - preemptible=False, + spot=False, ), network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], data_connection_mounts=[], @@ -1792,92 +1790,6 @@ def test_load_app_from_file(): assert app.works[0].cloud_compute.name == "foo" -@pytest.mark.parametrize( - ("print_format", "expected"), - [ - ( - "web", - [ - { - "displayName": "", - "name": "root.work", - "spec": { - "buildSpec": { - "commands": [], - "pythonDependencies": {"packageManager": "PACKAGE_MANAGER_PIP", "packages": ""}, - }, - "dataConnectionMounts": [], - "drives": [], - "networkConfig": [{"name": "*", "port": "*"}], - 
"userRequestedComputeConfig": { - "count": 1, - "diskSize": 0, - "name": "cpu-small", - "preemptible": "*", - "shmSize": 0, - }, - }, - } - ], - ), - ( - "gallery", - [ - { - "display_name": "", - "name": "root.work", - "spec": { - "build_spec": { - "commands": [], - "python_dependencies": {"package_manager": "PACKAGE_MANAGER_PIP", "packages": ""}, - }, - "data_connection_mounts": [], - "drives": [], - "network_config": [{"name": "*", "port": "*"}], - "user_requested_compute_config": { - "count": 1, - "disk_size": 0, - "name": "cpu-small", - "preemptible": "*", - "shm_size": 0, - }, - }, - } - ], - ), - ], -) -def test_print_specs(tmpdir, caplog, monkeypatch, print_format, expected): - entrypoint = Path(tmpdir) / "entrypoint.py" - entrypoint.touch() - - mock_client = mock.MagicMock() - mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( - memberships=[V1Membership(name="test-project", project_id="test-project-id")] - ) - mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( - V1ListLightningappInstancesResponse(lightningapps=[]) - ) - cloud_backend = mock.MagicMock(client=mock_client) - monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) - - cloud_runtime = cloud.CloudRuntime(app=LightningApp(EmptyWork()), entrypoint=entrypoint) - - cloud.LIGHTNING_CLOUD_PRINT_SPECS = print_format - - try: - with caplog.at_level(logging.INFO), contextlib.suppress(SystemExit): - cloud_runtime.dispatch() - - lines = caplog.text.split("\n") - - expected = re.escape(str(expected).replace("'", '"').replace(" ", "")).replace('"\\*"', "(.*)") - expected = "INFO(.*)works: " + expected - assert any(re.fullmatch(expected, line) for line in lines) - finally: - cloud.LIGHTNING_CLOUD_PRINT_SPECS = None - - def test_incompatible_cloud_compute_and_build_config(monkeypatch): """Test that an exception is raised when a build config has a custom image defined, but the cloud compute is the default. 
diff --git a/tests/tests_app/structures/test_structures.py b/tests/tests_app/structures/test_structures.py index 2d0c5e58005f0..cd6fc7ae9571f 100644 --- a/tests/tests_app/structures/test_structures.py +++ b/tests/tests_app/structures/test_structures.py @@ -8,7 +8,6 @@ from lightning.app.structures import Dict, List from lightning.app.testing.helpers import EmptyFlow from lightning.app.utilities.enum import CacheCallsKeys, WorkStageStatus -from lightning.app.utilities.imports import _IS_WINDOWS def test_dict(): @@ -332,7 +331,7 @@ def run(self): self.counter += 1 -@pytest.mark.skipif(_IS_WINDOWS, reason="strange TimeOut exception") +@pytest.mark.skipif(True, reason="out-dated") @pytest.mark.xfail(strict=False, reason="tchaton: Resolve this test.") @pytest.mark.parametrize("run_once_iterable", [False, True]) @pytest.mark.parametrize("cache_calls", [False, True]) @@ -510,7 +509,7 @@ def run(self): self.stop() -@pytest.mark.xfail(strict=False, reason="flaky") +@pytest.mark.skipif(True, reason="out-dated") def test_structures_with_payload(): app = LightningApp(FlowPayload(), log_level="debug") MultiProcessRuntime(app, start_server=False).dispatch() From 3f69134479140682f3dd02abacbe25fd08ead97c Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 27 Jun 2024 15:29:13 +0200 Subject: [PATCH 087/179] Fix seed in test to avoid interactions on global random state (#20014) --- tests/tests_pytorch/core/test_lightning_optimizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/core/test_lightning_optimizer.py b/tests/tests_pytorch/core/test_lightning_optimizer.py index 89e7c5ee5c5c7..4cc079c4cef3a 100644 --- a/tests/tests_pytorch/core/test_lightning_optimizer.py +++ b/tests/tests_pytorch/core/test_lightning_optimizer.py @@ -16,7 +16,7 @@ import pytest import torch -from lightning.pytorch import Trainer +from lightning.pytorch import Trainer, seed_everything from lightning.pytorch.core.optimizer import LightningOptimizer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.loops.optimization.automatic import Closure @@ -239,6 +239,8 @@ def test_lightning_optimizer_automatic_optimization_lbfgs_zero_grad(tmp_path): """Test zero_grad is called the same number of times as LBFGS requires for reevaluation of the loss in automatic_optimization.""" + seed_everything(0) + class TestModel(BoringModel): def configure_optimizers(self): return torch.optim.LBFGS(self.parameters()) From 967413a5b9b1a3fc10d16a7c26d7f26d7a7625d2 Mon Sep 17 00:00:00 2001 From: Corwin Joy Date: Thu, 27 Jun 2024 07:19:01 -0700 Subject: [PATCH 088/179] Add atomic save to checkpoint routine (#20011) --- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/fabric/utilities/cloud_io.py | 5 ++++- src/lightning/pytorch/CHANGELOG.md | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 37322981c503e..6155644ed6709 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added -- +- Made saving non-distributed checkpoints fully atomic ([#20011](https://github.com/Lightning-AI/pytorch-lightning/pull/20011)) - diff --git a/src/lightning/fabric/utilities/cloud_io.py b/src/lightning/fabric/utilities/cloud_io.py index af795a4801ef0..49b02cd0952d1 100644 --- a/src/lightning/fabric/utilities/cloud_io.py +++ b/src/lightning/fabric/utilities/cloud_io.py @@ -76,7 +76,10 @@ def _atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None bytesbuffer = io.BytesIO() log.debug(f"Saving checkpoint: {filepath}") torch.save(checkpoint, bytesbuffer) - with fsspec.open(filepath, "wb") as f: + + # We use a transaction here to avoid file corruption if the save gets interrupted + fs, urlpath = fsspec.core.url_to_fs(str(filepath)) + with fs.transaction, fs.open(urlpath, "wb") as f: f.write(bytesbuffer.getvalue()) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 08562a9eb8dca..39dd56e8e73aa 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -9,7 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- Made saving non-distributed checkpoints fully atomic ([#20011](https://github.com/Lightning-AI/pytorch-lightning/pull/20011)) - From c2a96e88ba761f392049ede7466c1c117dd87016 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 27 Jun 2024 19:16:43 +0200 Subject: [PATCH 089/179] Set development version for 2.4 (#20022) --- src/version.info | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.info b/src/version.info index 276cbf9e2858c..283c1ce9938fe 100644 --- a/src/version.info +++ b/src/version.info @@ -1 +1 @@ -2.3.0 +2.4.0dev From b6be13ce309c427add2486901c187911432caf05 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 28 Jun 2024 10:43:26 +0200 Subject: [PATCH 090/179] Fix dependency issues with omegaconf and hydra (#20025) --- requirements/pytorch/extra.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 8df61e4834769..6962da858c4ab 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -3,8 +3,8 @@ # extended list of package dependencies to reach full functionality matplotlib>3.1, <3.9.0 -omegaconf >=2.0.5, <2.4.0 -hydra-core >=1.0.5, <1.4.0 +omegaconf >=2.2.3, <2.4.0 +hydra-core >=1.2.0, <1.4.0 jsonargparse[signatures] >=4.27.7, <4.28.0 rich >=12.3.0, <13.6.0 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute From aa2da72ab9c3e24a6e47c1c731a6a75210a46cb2 Mon Sep 17 00:00:00 2001 From: PL Ghost <75324987+pl-ghost@users.noreply.github.com> Date: Fri, 28 Jun 2024 14:28:01 +0200 Subject: [PATCH 091/179] docs: Bump HPU ref `1.5.0` (#19843) * bumping HPU version -> (1.5.0) * fix build warning * the HPU also need some images * Apply suggestions from code review --------- Co-authored-by: jerome-habana Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: awaelchli Co-authored-by: Jirka --- docs/source-pytorch/conf.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index 0baba58ee0a38..d8572bdc6552e 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -14,6 +14,7 @@ import glob import os import shutil +import urllib import warnings from importlib.util import module_from_spec, spec_from_file_location from types import 
ModuleType @@ -91,8 +92,14 @@ def _load_py_module(name: str, location: str) -> ModuleType: assist_local.AssistantCLI.pull_docs_files( gh_user_repo="Lightning-AI/lightning-Habana", target_dir="docs/source-pytorch/integrations/hpu", - checkout="refs/tags/1.4.0", + checkout="refs/tags/1.5.0", ) +# the HPU also need some images +URL_RAW_DOCS_GRAPHCORE = "https://raw.githubusercontent.com/Lightning-AI/lightning-Habana/1.5.0/docs/source" +for img in ["_images/HPUProfiler.png", "_images/IGP.png"]: + img_ = os.path.join(_PATH_HERE, "integrations", "hpu", img) + os.makedirs(os.path.dirname(img_), exist_ok=True) + urllib.request.urlretrieve(f"{URL_RAW_DOCS_GRAPHCORE}/{img}", img_) # Copy strategies docs as single pages assist_local.AssistantCLI.pull_docs_files( @@ -459,6 +466,7 @@ def _load_py_module(name: str, location: str) -> ModuleType: ("py:class", "lightning.pytorch.utilities.types.LRSchedulerConfig"), ("py:class", "lightning.pytorch.utilities.types.OptimizerLRSchedulerConfig"), ("py:class", "lightning_habana.pytorch.plugins.precision.HPUPrecisionPlugin"), + ("py:class", "lightning_habana.pytorch.strategies.HPUDDPStrategy"), ("py:class", "lightning_habana.pytorch.strategies.HPUParallelStrategy"), ("py:class", "lightning_habana.pytorch.strategies.SingleHPUStrategy"), ("py:obj", "logger.experiment"), From fa5af164246f7cca589dfd3f1c86419eadb61254 Mon Sep 17 00:00:00 2001 From: PL Ghost <75324987+pl-ghost@users.noreply.github.com> Date: Fri, 28 Jun 2024 14:47:45 +0200 Subject: [PATCH 092/179] docs: Bump HPU ref `1.6.0` (#20026) --------- Co-authored-by: jerome-habana Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> Co-authored-by: awaelchli --- docs/source-pytorch/conf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source-pytorch/conf.py b/docs/source-pytorch/conf.py index d8572bdc6552e..82fa3e1f6a2d8 100644 --- a/docs/source-pytorch/conf.py +++ b/docs/source-pytorch/conf.py @@ -92,14 +92,14 @@ def _load_py_module(name: str, location: str) -> ModuleType: assist_local.AssistantCLI.pull_docs_files( gh_user_repo="Lightning-AI/lightning-Habana", target_dir="docs/source-pytorch/integrations/hpu", - checkout="refs/tags/1.5.0", + checkout="refs/tags/1.6.0", ) # the HPU also need some images -URL_RAW_DOCS_GRAPHCORE = "https://raw.githubusercontent.com/Lightning-AI/lightning-Habana/1.5.0/docs/source" +URL_RAW_DOCS_HABANA = "https://raw.githubusercontent.com/Lightning-AI/lightning-Habana/1.5.0/docs/source" for img in ["_images/HPUProfiler.png", "_images/IGP.png"]: img_ = os.path.join(_PATH_HERE, "integrations", "hpu", img) os.makedirs(os.path.dirname(img_), exist_ok=True) - urllib.request.urlretrieve(f"{URL_RAW_DOCS_GRAPHCORE}/{img}", img_) + urllib.request.urlretrieve(f"{URL_RAW_DOCS_HABANA}/{img}", img_) # Copy strategies docs as single pages assist_local.AssistantCLI.pull_docs_files( From 2524864b3cc8dae3552a4b5c3c819c2ce6f278af Mon Sep 17 00:00:00 2001 From: PL Ghost <75324987+pl-ghost@users.noreply.github.com> Date: Fri, 28 Jun 2024 14:48:15 +0200 Subject: [PATCH 093/179] Adding test for legacy checkpoint created with 2.3.1 (#20023) --- tests/legacy/back-compatible-versions.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/legacy/back-compatible-versions.txt b/tests/legacy/back-compatible-versions.txt index 2495d0f84ea95..06ebce5658d0f 100644 --- a/tests/legacy/back-compatible-versions.txt +++ b/tests/legacy/back-compatible-versions.txt @@ -101,3 +101,4 @@ 2.2.2 2.2.5 2.3.0 +2.3.1 From 5636fe4a9c8059bc9662925139cd6ed57ed86708 Mon Sep 17 
From 5636fe4a9c8059bc9662925139cd6ed57ed86708 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Sun, 30 Jun 2024 22:19:38 +0200
Subject: [PATCH 094/179] CI: replace macOS-11 with macOS-14 (#20029)

---
 .github/checkgroup.yml                 | 22 +++++++++++-----------
 .github/workflows/ci-tests-fabric.yml  | 10 +++++-----
 .github/workflows/ci-tests-pytorch.yml | 10 +++++-----
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index 720cb47b4bb07..22b2dee5dd059 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -19,10 +19,10 @@ subprojects:
       - "!*.md"
      - "!**/*.md"
     checks:
-      - "pl-cpu (macOS-11, lightning, 3.8, 2.0, oldest)"
-      - "pl-cpu (macOS-11, lightning, 3.10, 2.0)"
-      - "pl-cpu (macOS-11, lightning, 3.10, 2.1)"
-      - "pl-cpu (macOS-11, lightning, 3.10, 2.2)"
+      - "pl-cpu (macOS-13, lightning, 3.8, 2.0, oldest)"
+      - "pl-cpu (macOS-14, lightning, 3.10, 2.0)"
+      - "pl-cpu (macOS-14, lightning, 3.10, 2.1)"
+      - "pl-cpu (macOS-14, lightning, 3.10, 2.2)"
       - "pl-cpu (macOS-14, lightning, 3.10, 2.3)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)"
       - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
@@ -34,7 +34,7 @@ subprojects:
       - "pl-cpu (windows-2022, lightning, 3.10, 2.1)"
       - "pl-cpu (windows-2022, lightning, 3.10, 2.2)"
       - "pl-cpu (windows-2022, lightning, 3.10, 2.3)"
-      - "pl-cpu (macOS-11, pytorch, 3.8, 2.0)"
+      - "pl-cpu (macOS-14, pytorch, 3.8, 2.0)"
       - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 2.0)"
       - "pl-cpu (windows-2022, pytorch, 3.8, 2.0)"
       - "pl-cpu (macOS-12, pytorch, 3.11, 2.0)"
       - "pl-cpu (macOS-12, pytorch, 3.11, 2.1)"
       - "pl-cpu (ubuntu-22.04, pytorch, 3.11, 2.0)"
       - "pl-cpu (ubuntu-22.04, pytorch, 3.11, 2.1)"
       - "pl-cpu (windows-2022, pytorch, 3.11, 2.0)"
       - "pl-cpu (windows-2022, pytorch, 3.11, 2.1)"

   - id: "pytorch_lightning: Azure GPU"
@@ -171,10 +171,10 @@ subprojects:
       - "!*.md"
      - "!**/*.md"
     checks:
-      - "fabric-cpu (macOS-11, lightning, 3.8, 2.0, oldest)"
-      - "fabric-cpu (macOS-11, lightning, 3.10, 2.0)"
-      - "fabric-cpu (macOS-11, lightning, 3.11, 2.1)"
-      - "fabric-cpu (macOS-11, lightning, 3.11, 2.2)"
+      - "fabric-cpu (macOS-13, lightning, 3.8, 2.0, oldest)"
+      - "fabric-cpu (macOS-14, lightning, 3.10, 2.0)"
+      - "fabric-cpu (macOS-14, lightning, 3.11, 2.1)"
+      - "fabric-cpu (macOS-14, lightning, 3.11, 2.2)"
       - "fabric-cpu (macOS-14, lightning, 3.10, 2.3)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)"
       - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
@@ -186,7 +186,7 @@ subprojects:
       - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)"
       - "fabric-cpu (windows-2022, lightning, 3.11, 2.2)"
       - "fabric-cpu (windows-2022, lightning, 3.11, 2.3)"
-      - "fabric-cpu (macOS-11, fabric, 3.8, 2.0)"
+      - "fabric-cpu (macOS-14, fabric, 3.8, 2.0)"
       - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 2.0)"
       - "fabric-cpu (windows-2022, fabric, 3.8, 2.0)"
       - "fabric-cpu (macOS-12, fabric, 3.11, 2.0)"
       - "fabric-cpu (macOS-12, fabric, 3.11, 2.1)"
       - "fabric-cpu (ubuntu-22.04, fabric, 3.11, 2.0)"
       - "fabric-cpu (ubuntu-22.04, fabric, 3.11, 2.1)"
       - "fabric-cpu (windows-2022, fabric, 3.11, 2.0)"
       - "fabric-cpu (windows-2022, fabric, 3.11, 2.1)"

   - id: "lightning_fabric: Azure GPU"
@@ -298,7 +298,7 @@ subprojects:
       - "src/lightning/store/**"
      - "tests/tests_store/**"
     checks:
-      - "store-cpu (macOS-11, lightning, 3.10, 2.0)"
+      - "store-cpu (macOS-14, lightning, 3.10, 2.0)"
       - "store-cpu (ubuntu-20.04, lightning, 3.10, 2.0)"
       - "store-cpu (windows-2022, lightning, 3.10, 2.0)"

diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml
index 082636a617227..009e03f38c7bd 100644
--- a/.github/workflows/ci-tests-fabric.yml
+++ b/.github/workflows/ci-tests-fabric.yml
@@ -39,14 +39,14 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           # only run PyTorch latest
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
           - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" }
@@ -60,7 +60,7 @@ jobs:
           - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" }
           - { os: "windows-2022", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" }
           # "oldest" versions tests, only on minimum Python
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" }
+          - { os: "macOS-13", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" }
           - {
               os: "ubuntu-20.04",
               pkg-name: "lightning",
               python-version: "3.8",
               pytorch-version: "2.0",
               requires: "oldest",
             }
           - {
               os: "windows-2022",
               pkg-name: "lightning",
               python-version: "3.8",
               pytorch-version: "2.0",
               requires: "oldest",
             }
           # "fabric" installs the standalone package
-          - { os: "macOS-11", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" }
+          - { os: "macOS-14", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" }
           - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" }
           - { os: "windows-2022", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" }
     timeout-minutes: 25 # because of building grpcio on Mac

diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml
index b0b10ef4acea8..967369976ed23 100644
--- a/.github/workflows/ci-tests-pytorch.yml
+++ b/.github/workflows/ci-tests-pytorch.yml
@@ -43,14 +43,14 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" }
           # only run PyTorch latest
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
+          - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
           - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" }
@@ -64,7 +64,7 @@ jobs:
           - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" }
           - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" }
           # "oldest" versions tests, only on minimum Python
-          - { os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" }
+          - { os: "macOS-13", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" }
           - {
               os: "ubuntu-20.04",
               pkg-name: "lightning",
               python-version: "3.8",
               pytorch-version: "2.0",
               requires: "oldest",
             }
           - {
               os: "windows-2022",
               pkg-name: "lightning",
               python-version: "3.8",
               pytorch-version: "2.0",
               requires: "oldest",
             }
           # "pytorch" installs the standalone package
-          - { os: "macOS-11", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" }
+          - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" }
           - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" }
           - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" }
     timeout-minutes: 50

From 14493c06850efa8f27fd959554be105b23497942 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Mon, 1 Jul 2024 00:02:00 +0200
Subject: [PATCH 095/179] Drop PyTorch 2.0 from the test matrix (#20009)

---
 .github/checkgroup.yml                         | 38 +++++--------
 .github/workflows/ci-tests-fabric.yml          | 18 +++----
 .github/workflows/ci-tests-pytorch.yml         | 18 +++----
 .github/workflows/docker-build.yml             |  2 -
 docs/source-fabric/advanced/compile.rst        |  4 --
 docs/source-fabric/advanced/model_init.rst     |  2 +-
 docs/source-pytorch/advanced/compile.rst       |  4 --
 requirements/fabric/base.txt                   |  4 +-
 requirements/fabric/examples.txt               |  2 +-
 requirements/pytorch/base.txt                  |  4 +-
 requirements/pytorch/examples.txt              |  2 +-
 requirements/pytorch/test.txt                  |  4 +-
 src/lightning/fabric/CHANGELOG.md              |  3 +-
 src/lightning/fabric/__init__.py               |  2 +-
 src/lightning/fabric/fabric.py                 |  6 +--
 src/lightning/fabric/strategies/fsdp.py        | 41 +++++---------
 src/lightning/fabric/utilities/imports.py      |  3 --
 src/lightning/fabric/utilities/init.py         |  3 --
 .../fabric/utilities/testing/_runif.py         |  9 +---
 src/lightning/fabric/utilities/throughput.py   |  3 --
 src/lightning/pytorch/CHANGELOG.md             |  3 +-
 src/lightning/pytorch/core/module.py           | 26 +--------
 src/lightning/pytorch/demos/transformer.py     |  3 +-
 src/lightning/pytorch/loops/utilities.py       |  4 --
 src/lightning/pytorch/strategies/fsdp.py       | 25 ++++-----
 .../connectors/logger_connector/result.py      |  5 +-
 src/lightning/pytorch/utilities/compile.py     |  5 --
 tests/tests_fabric/conftest.py                 |  5 +-
 .../plugins/precision/test_bitsandbytes.py     |  4 +-
 .../strategies/test_ddp_integration.py         |  2 +-
 tests/tests_fabric/strategies/test_fsdp.py     | 48 +++++------------
 .../strategies/test_fsdp_integration.py        | 14 ++---
 tests/tests_fabric/test_fabric.py              |  4 +-
 tests/tests_fabric/test_wrappers.py            | 16 +++---
 tests/tests_fabric/utilities/test_init.py      |  1 -
 .../tests_fabric/utilities/test_throughput.py  |  2 -
 .../callbacks/test_throughput_monitor.py       |  3 --
 tests/tests_pytorch/conftest.py                |  5 +-
 .../deprecated_api/test_no_removal_version.py  |  6 +--
 tests/tests_pytorch/strategies/test_fsdp.py    | 53 +++++--------------
 .../test_model_parallel_integration.py         |  2 +-
 .../trainer/flags/test_inference_mode.py       |  5 +-
 42 files changed, 118 insertions(+), 295 deletions(-)

diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index 22b2dee5dd059..79b65664d2eb8 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -19,29 +19,23 @@ subprojects:
       - "!*.md"
      - "!**/*.md"
     checks:
-      - "pl-cpu (macOS-13, lightning, 3.8, 2.0, oldest)"
-      - "pl-cpu (macOS-14, lightning, 3.10, 2.0)"
+      - "pl-cpu (macOS-13, lightning, 3.8, 2.1, oldest)"
       - "pl-cpu (macOS-14, lightning, 3.10, 2.1)"
"pl-cpu (macOS-14, lightning, 3.10, 2.2)" - "pl-cpu (macOS-14, lightning, 3.10, 2.3)" - - "pl-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" - - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" + - "pl-cpu (ubuntu-20.04, lightning, 3.8, 2.1, oldest)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.2)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.3)" - - "pl-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" - - "pl-cpu (windows-2022, lightning, 3.10, 2.0)" + - "pl-cpu (windows-2022, lightning, 3.8, 2.1, oldest)" - "pl-cpu (windows-2022, lightning, 3.10, 2.1)" - "pl-cpu (windows-2022, lightning, 3.10, 2.2)" - "pl-cpu (windows-2022, lightning, 3.10, 2.3)" - - "pl-cpu (macOS-14, pytorch, 3.8, 2.0)" - - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 2.0)" - - "pl-cpu (windows-2022, pytorch, 3.8, 2.0)" - - "pl-cpu (macOS-12, pytorch, 3.11, 2.0)" + - "pl-cpu (macOS-14, pytorch, 3.8, 2.1)" + - "pl-cpu (ubuntu-20.04, pytorch, 3.8, 2.1)" + - "pl-cpu (windows-2022, pytorch, 3.8, 2.1)" - "pl-cpu (macOS-12, pytorch, 3.11, 2.1)" - - "pl-cpu (ubuntu-22.04, pytorch, 3.11, 2.0)" - "pl-cpu (ubuntu-22.04, pytorch, 3.11, 2.1)" - - "pl-cpu (windows-2022, pytorch, 3.11, 2.0)" - "pl-cpu (windows-2022, pytorch, 3.11, 2.1)" - id: "pytorch_lightning: Azure GPU" @@ -144,13 +138,11 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "build-cuda (3.10, 2.0, 11.8.0)" - "build-cuda (3.10, 2.1, 12.1.0)" - "build-cuda (3.10, 2.2, 12.1.0)" - "build-cuda (3.11, 2.1, 12.1.0)" - "build-cuda (3.11, 2.2, 12.1.0)" #- "build-NGC" - - "build-pl (3.10, 2.0, 11.8.0)" - "build-pl (3.10, 2.1, 12.1.0)" - "build-pl (3.10, 2.2, 12.1.0)" - "build-pl (3.11, 2.1, 12.1.0)" @@ -171,29 +163,23 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "fabric-cpu (macOS-13, lightning, 3.8, 2.0, oldest)" - - "fabric-cpu (macOS-14, lightning, 3.10, 2.0)" + - "fabric-cpu (macOS-13, lightning, 3.8, 2.1, oldest)" - "fabric-cpu (macOS-14, lightning, 3.11, 2.1)" - "fabric-cpu (macOS-14, lightning, 3.11, 2.2)" - "fabric-cpu (macOS-14, lightning, 3.10, 2.3)" - - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 2.0, oldest)" - - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" + - "fabric-cpu (ubuntu-20.04, lightning, 3.8, 2.1, oldest)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.1)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.3)" - - "fabric-cpu (windows-2022, lightning, 3.8, 2.0, oldest)" - - "fabric-cpu (windows-2022, lightning, 3.10, 2.0)" + - "fabric-cpu (windows-2022, lightning, 3.8, 2.1, oldest)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.1)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.2)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.3)" - - "fabric-cpu (macOS-14, fabric, 3.8, 2.0)" - - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 2.0)" - - "fabric-cpu (windows-2022, fabric, 3.8, 2.0)" - - "fabric-cpu (macOS-12, fabric, 3.11, 2.0)" + - "fabric-cpu (macOS-14, fabric, 3.8, 2.1)" + - "fabric-cpu (ubuntu-20.04, fabric, 3.8, 2.1)" + - "fabric-cpu (windows-2022, fabric, 3.8, 2.1)" - "fabric-cpu (macOS-12, fabric, 3.11, 2.1)" - - "fabric-cpu (ubuntu-22.04, fabric, 3.11, 2.0)" - "fabric-cpu (ubuntu-22.04, fabric, 3.11, 2.1)" - - "fabric-cpu (windows-2022, fabric, 3.11, 2.0)" - "fabric-cpu (windows-2022, fabric, 3.11, 2.1)" - id: "lightning_fabric: Azure GPU" diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 009e03f38c7bd..2c0d8d16b89ad 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ 
b/.github/workflows/ci-tests-fabric.yml @@ -39,9 +39,6 @@ jobs: fail-fast: false matrix: include: - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } # only run PyTorch latest - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" } @@ -53,32 +50,29 @@ jobs: - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues - - { os: "macOS-12", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" } - - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" } - { os: "macOS-12", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } # "oldest" versions tests, only on minimum Python - - { os: "macOS-13", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" } + - { os: "macOS-13", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.1", requires: "oldest" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "2.0", + pytorch-version: "2.1", requires: "oldest", } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "2.0", + pytorch-version: "2.1", requires: "oldest", } # "fabric" installs the standalone package - - { os: "macOS-14", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } - - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } - - { os: "windows-2022", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.0" } + - { os: "macOS-14", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.1" } + - { os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.1" } + - { os: "windows-2022", pkg-name: "fabric", python-version: "3.8", pytorch-version: "2.1" } timeout-minutes: 25 # because of building grpcio on Mac env: PACKAGE_NAME: ${{ matrix.pkg-name }} diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index 967369976ed23..b75b6e73d987f 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -43,9 +43,6 @@ jobs: fail-fast: false matrix: include: - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.0" } # only run PyTorch latest - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } @@ -57,32 +54,29 @@ jobs: - { os: 
"ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues - - { os: "macOS-12", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" } - - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" } - { os: "macOS-12", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" } - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.1" } # "oldest" versions tests, only on minimum Python - - { os: "macOS-13", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.0", requires: "oldest" } + - { os: "macOS-13", pkg-name: "lightning", python-version: "3.8", pytorch-version: "2.1", requires: "oldest" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "2.0", + pytorch-version: "2.1", requires: "oldest", } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.8", - pytorch-version: "2.0", + pytorch-version: "2.1", requires: "oldest", } # "pytorch" installs the standalone package - - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } - - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } - - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.0" } + - { os: "macOS-14", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.1" } + - { os: "ubuntu-20.04", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.1" } + - { os: "windows-2022", pkg-name: "pytorch", python-version: "3.8", pytorch-version: "2.1" } timeout-minutes: 50 env: PACKAGE_NAME: ${{ matrix.pkg-name }} diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index d917ebc407143..0891205421550 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -43,7 +43,6 @@ jobs: include: # We only release one docker image per PyTorch version. # Make sure the matrix here matches the one below. - - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" } - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } @@ -104,7 +103,6 @@ jobs: include: # These are the base images for PL release docker images. # Make sure the matrix here matches the one above. 
- - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" } - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } diff --git a/docs/source-fabric/advanced/compile.rst b/docs/source-fabric/advanced/compile.rst index a8e1cc2db243c..ed46a1f822643 100644 --- a/docs/source-fabric/advanced/compile.rst +++ b/docs/source-fabric/advanced/compile.rst @@ -5,10 +5,6 @@ Speed up models by compiling them Compiling your PyTorch model can result in significant speedups, especially on the latest generations of GPUs. This guide shows you how to apply `torch.compile `_ correctly in your code. -.. note:: - - This requires PyTorch >= 2.0. - ---- diff --git a/docs/source-fabric/advanced/model_init.rst b/docs/source-fabric/advanced/model_init.rst index 4b31df036fe78..f5f76e8aa087b 100644 --- a/docs/source-fabric/advanced/model_init.rst +++ b/docs/source-fabric/advanced/model_init.rst @@ -81,4 +81,4 @@ When training distributed models with :doc:`FSDP/TP ` or D .. note:: Empty-init is experimental and the behavior may change in the future. - For distributed models on PyTorch 2.1+, it is required that all user-defined modules that manage parameters implement a ``reset_parameters()`` method (all PyTorch built-in modules have this too). + For distributed models, it is required that all user-defined modules that manage parameters implement a ``reset_parameters()`` method (all PyTorch built-in modules have this too). diff --git a/docs/source-pytorch/advanced/compile.rst b/docs/source-pytorch/advanced/compile.rst index 73d5f4fbc2af4..484559e111a52 100644 --- a/docs/source-pytorch/advanced/compile.rst +++ b/docs/source-pytorch/advanced/compile.rst @@ -5,10 +5,6 @@ Speed up models by compiling them Compiling your LightningModule can result in significant speedups, especially on the latest generations of GPUs. This guide shows you how to apply `torch.compile `_ correctly in your code. -.. note:: - - This requires PyTorch >= 2.0. 
- ---- diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index aac884d9c6f43..7ca4556821613 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -1,8 +1,8 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -numpy >=1.17.2, <1.27.0 -torch >=2.0.0, <2.4.0 +numpy >=1.21.0, <1.27.0 +torch >=2.1.0, <2.4.0 fsspec[http] >=2022.5.0, <2024.4.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt index 0e2feb97eccc4..49ffde9d0f2f0 100644 --- a/requirements/fabric/examples.txt +++ b/requirements/fabric/examples.txt @@ -1,6 +1,6 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torchvision >=0.15.0, <0.19.0 +torchvision >=0.16.0, <0.19.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 6372357b6d290..cd71466551f01 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -1,8 +1,8 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -numpy >=1.17.2, <1.27.0 -torch >=2.0.0, <2.4.0 +numpy >=1.21.0, <1.27.0 +torch >=2.1.0, <2.4.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 fsspec[http] >=2022.5.0, <2024.4.0 diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 55b85025bddb2..e4b1bc31e9aea 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment requests <2.32.0 -torchvision >=0.15.0, <0.19.0 +torchvision >=0.16.0, <0.19.0 ipython[all] <8.15.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index 94a06630df61b..472c4157dfb39 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -8,8 +8,8 @@ pytest-random-order ==1.1.0 # needed in tests cloudpickle >=1.3, <2.3.0 scikit-learn >0.22.1, <1.4.0 -onnx >=0.14.0, <1.15.0 -onnxruntime >=0.15.0, <1.17.0 +onnx >=1.12.0, <1.15.0 +onnxruntime >=1.12.0, <1.17.0 psutil <5.9.6 # for `DeviceStatsMonitor` pandas >1.0, <2.2.0 # needed in benchmarks fastapi # for `ServableModuleValidator` # not setting version as re-defined in App diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 6155644ed6709..985759830769d 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -27,7 +27,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
 ### Removed

--
+- Removed support for PyTorch 2.0 ([#20009](https://github.com/Lightning-AI/lightning/pull/20009))
+
 -

diff --git a/src/lightning/fabric/__init__.py b/src/lightning/fabric/__init__.py
index 75752d8b94884..26f01aad64b17 100644
--- a/src/lightning/fabric/__init__.py
+++ b/src/lightning/fabric/__init__.py
@@ -21,7 +21,7 @@
 _logger.propagate = False


-# In PyTorch 2.0+, setting this variable will force `torch.cuda.is_available()` and `torch.cuda.device_count()`
+# Setting this variable will force `torch.cuda.is_available()` and `torch.cuda.device_count()`
 # to use an NVML-based implementation that doesn't poison forks.
 # https://github.com/pytorch/pytorch/issues/83973
 os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"

diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py
index 71d8f623dcee4..b9032fe7a9d93 100644
--- a/src/lightning/fabric/fabric.py
+++ b/src/lightning/fabric/fabric.py
@@ -225,8 +225,7 @@ def setup(
             _reapply_compile: If ``True`` (default), and the model was ``torch.compile``d before, the corresponding
                 :class:`~torch._dynamo.OptimizedModule` wrapper will be removed and reapplied with the same settings
                 after the model was set up by the strategy (e.g., after the model was wrapped by DDP,
-                FSDP etc.). Only applies on PyTorch >= 2.1. Set it to ``False`` if compiling DDP/FSDP is causing
-                issues.
+                FSDP etc.). Set it to ``False`` if compiling DDP/FSDP is causing issues.

         Returns:
             The tuple containing wrapped module and the optimizers, in the same order they were passed in.
@@ -292,8 +291,7 @@ def setup_module(
             _reapply_compile: If ``True`` (default), and the model was ``torch.compile``d before, the corresponding
                 :class:`~torch._dynamo.OptimizedModule` wrapper will be removed and reapplied with the same settings
                 after the model was set up by the strategy (e.g., after the model was wrapped by DDP,
-                FSDP etc.). Only applies on PyTorch >= 2.1. Set it to ``False`` if compiling DDP/FSDP is causing
-                issues.
+                FSDP etc.). Set it to ``False`` if compiling DDP/FSDP is causing issues.

         Returns:
             The wrapped model.

diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py
index 9a711b8449c3e..b8a3a268478dd 100644
--- a/src/lightning/fabric/strategies/fsdp.py
+++ b/src/lightning/fabric/strategies/fsdp.py
@@ -63,11 +63,10 @@
 )
 from lightning.fabric.utilities.distributed import group as _group
 from lightning.fabric.utilities.imports import (
-    _TORCH_GREATER_EQUAL_2_1,
     _TORCH_GREATER_EQUAL_2_2,
     _TORCH_GREATER_EQUAL_2_3,
 )
-from lightning.fabric.utilities.init import _EmptyInit, _has_meta_device_parameters_or_buffers
+from lightning.fabric.utilities.init import _has_meta_device_parameters_or_buffers
 from lightning.fabric.utilities.load import _METADATA_FILENAME, _lazy_load, _materialize_tensors, _move_state_into
 from lightning.fabric.utilities.rank_zero import rank_zero_deprecation, rank_zero_only, rank_zero_warn
 from lightning.fabric.utilities.seed import reset_seed
@@ -325,7 +324,7 @@ def setup_optimizer(self, optimizer: Optimizer) -> Optimizer:
         if self._fsdp_kwargs.get("use_orig_params"):
             return super().setup_optimizer(optimizer)
         if not _optimizer_has_flat_params(optimizer):
-            # We avoid this limitation in PyTorch >= 2.0 by setting `use_orig_params=True`
+            # We avoid this limitation by setting `use_orig_params=True`
             raise ValueError(
                 "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the optimizer"
                 " after setting up the model."
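[Editor's illustration] The `use_orig_params` hint above matters for user code: with `use_orig_params=False`, FSDP replaces the module's parameters with flat parameters, so an optimizer built from the unwrapped module holds stale references. A minimal sketch of the required ordering with Fabric (layer sizes and the learning rate are placeholders):

import torch
from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy

fabric = Fabric(strategy=FSDPStrategy(use_orig_params=False), devices=2)
fabric.launch()

model = torch.nn.Linear(32, 2)
model = fabric.setup_module(model)  # shard/wrap first ...
optimizer = torch.optim.Adam(model.parameters())  # ... then create the optimizer
optimizer = fabric.setup_optimizers(optimizer)

Creating the optimizer before `setup_module` is exactly what triggers the `ValueError` raised above.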
@@ -340,15 +339,12 @@ def module_to_device(self, module: Module) -> None:
     def module_init_context(self, empty_init: Optional[bool] = None) -> ContextManager:
         precision_init_ctx = self.precision.module_init_context()
         module_sharded_ctx = self.module_sharded_context()
-        empty_ctx = _EmptyInit(enabled=bool(empty_init))
         stack = ExitStack()
-        if _TORCH_GREATER_EQUAL_2_1 and empty_init:
+        if empty_init:
             # Materialization happens in `setup`. When modules get wrapped by FSDP, the sequence of operations is:
             # 1) materialize module 2) call `reset_parameters()` 3) shard the module.
             # These operations are applied to each submodule 'bottom up' in the module hierarchy.
             stack.enter_context(torch.device("meta"))
-        else:
-            stack.enter_context(empty_ctx)
         stack.enter_context(precision_init_ctx)
         stack.enter_context(module_sharded_ctx)
         return stack
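[Editor's illustration] The meta-device path above defers allocation: parameters are created without storage and only materialized (followed by `reset_parameters()`) when FSDP shards each submodule. A sketch of what `empty_init=True` looks like from user code (the layer size is a placeholder):

import torch
from lightning.fabric import Fabric

fabric = Fabric(strategy="fsdp", devices=2)
fabric.launch()

with fabric.init_module(empty_init=True):
    # Created on the meta device: no CPU or GPU memory is allocated yet
    model = torch.nn.Linear(4096, 4096)

assert model.weight.is_meta  # materialization happens later, inside setup
model = fabric.setup_module(model)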
@@ -697,18 +693,13 @@ def _activation_checkpointing_kwargs(
             classes = tuple(activation_checkpointing)
         else:
             classes = (activation_checkpointing,)
-        if _TORCH_GREATER_EQUAL_2_1:
-            rank_zero_deprecation(
-                f"`FSDPStrategy(activation_checkpointing={activation_checkpointing})` is deprecated, use "
-                f"`FSDPStrategy(activation_checkpointing_policy={set(classes)})` instead."
-            )
+        rank_zero_deprecation(
+            f"`FSDPStrategy(activation_checkpointing={activation_checkpointing})` is deprecated, use "
+            f"`FSDPStrategy(activation_checkpointing_policy={set(classes)})` instead."
+        )
         return {"check_fn": lambda submodule: isinstance(submodule, classes)}
     if isinstance(activation_checkpointing_policy, set):
-        if _TORCH_GREATER_EQUAL_2_1:
-            return _auto_wrap_policy_kwargs(activation_checkpointing_policy, {})
-        return {"check_fn": lambda submodule: isinstance(submodule, tuple(activation_checkpointing_policy))}
-    if not _TORCH_GREATER_EQUAL_2_1:
-        raise ValueError("`activation_checkpointing_policy` requires torch >= 2.1.0. HINT: `pip install -U torch`")
+        return _auto_wrap_policy_kwargs(activation_checkpointing_policy, {})
     return {"auto_wrap_policy": activation_checkpointing_policy}

@@ -716,15 +707,10 @@ def _auto_wrap_policy_kwargs(policy: Optional["_POLICY"], kwargs: Dict) -> Dict:
     if policy is None:
         return kwargs
     if isinstance(policy, set):
-        if _TORCH_GREATER_EQUAL_2_1:
-            from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+        from torch.distributed.fsdp.wrap import ModuleWrapPolicy

-            policy = ModuleWrapPolicy(policy)
-        else:
-            from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+        policy = ModuleWrapPolicy(policy)

-            # this is not transformer specific despite the name
-            policy = partial(transformer_auto_wrap_policy, transformer_layer_cls=policy)
     kwargs["auto_wrap_policy"] = policy
     return kwargs

@@ -829,11 +815,8 @@ def _get_full_state_dict_context(
     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
     from torch.distributed.fsdp.api import FullOptimStateDictConfig

-    # In PyTorch < 2.1, offload to CPU in combination with `world_size=1` is not possible
-    offload_to_cpu = world_size > 1 or _TORCH_GREATER_EQUAL_2_1
-    state_dict_config = FullStateDictConfig(offload_to_cpu=offload_to_cpu, rank0_only=rank0_only)
-
-    optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=offload_to_cpu, rank0_only=rank0_only)
+    state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=rank0_only)
+    optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=rank0_only)
     state_dict_type_context = FSDP.state_dict_type(
         module=module,
         state_dict_type=StateDictType.FULL_STATE_DICT,
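[Editor's illustration] After this cleanup, the policy-based API is the only activation-checkpointing entry point. For reference, it is used like this (`Block` is a stand-in for a real transformer block class):

import torch.nn as nn
from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy


class Block(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(8, 8)

    def forward(self, x):
        return self.linear(x)


# Recompute activations of every `Block` during backward instead of storing them
strategy = FSDPStrategy(activation_checkpointing_policy={Block})
fabric = Fabric(strategy=strategy, devices=2)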
diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py
index 46374e23ad2b5..fc40175ff53a4 100644
--- a/src/lightning/fabric/utilities/imports.py
+++ b/src/lightning/fabric/utilities/imports.py
@@ -26,13 +26,10 @@
 # 2. The inspection mode via `python -i`: https://stackoverflow.com/a/6879085/1162383
 _IS_INTERACTIVE = hasattr(sys, "ps1") or bool(sys.flags.interactive)

-_TORCH_GREATER_EQUAL_2_1 = compare_version("torch", operator.ge, "2.1.0")
 _TORCH_GREATER_EQUAL_2_2 = compare_version("torch", operator.ge, "2.2.0")
 _TORCH_GREATER_EQUAL_2_3 = compare_version("torch", operator.ge, "2.3.0")
 _TORCH_GREATER_EQUAL_2_4 = compare_version("torch", operator.ge, "2.4.0", use_base_version=True)

-_TORCH_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0") and not _TORCH_GREATER_EQUAL_2_1
-
 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8)
 _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)

diff --git a/src/lightning/fabric/utilities/init.py b/src/lightning/fabric/utilities/init.py
index fccdce7aa813d..c92dfd8c2e82b 100644
--- a/src/lightning/fabric/utilities/init.py
+++ b/src/lightning/fabric/utilities/init.py
@@ -20,7 +20,6 @@
 from torch.overrides import TorchFunctionMode
 from typing_extensions import override

-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1
 from lightning.fabric.utilities.rank_zero import rank_zero_warn
 from lightning.fabric.utilities.types import _DEVICE

@@ -61,8 +60,6 @@ def __torch_function__(

 def _materialize(module: Module, device: _DEVICE) -> None:
     """Materialize a module."""
-    if not _TORCH_GREATER_EQUAL_2_1:
-        raise RuntimeError("recurse=False requires torch 2.1")
     module.to_empty(device=device, recurse=False)
     if not hasattr(module, "reset_parameters"):
         raise TypeError(

diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py
index 9a6f5554baa19..f28265104878d 100644
--- a/src/lightning/fabric/utilities/testing/_runif.py
+++ b/src/lightning/fabric/utilities/testing/_runif.py
@@ -24,7 +24,6 @@
 from lightning.fabric.accelerators.cuda import num_cuda_devices
 from lightning.fabric.accelerators.mps import MPSAccelerator
 from lightning.fabric.strategies.deepspeed import _DEEPSPEED_AVAILABLE
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1


 def _runif_reasons(
@@ -116,13 +115,9 @@ def _runif_reasons(
         reasons.append("Deepspeed")

     if dynamo:
-        if _TORCH_GREATER_EQUAL_2_1:
-            from torch._dynamo.eval_frame import is_dynamo_supported
+        from torch._dynamo.eval_frame import is_dynamo_supported

-            cond = not is_dynamo_supported()
-        else:
-            cond = sys.platform == "win32" or sys.version_info >= (3, 11)
-        if cond:
+        if not is_dynamo_supported():
             reasons.append("torch.dynamo")

     return reasons, kwargs

diff --git a/src/lightning/fabric/utilities/throughput.py b/src/lightning/fabric/utilities/throughput.py
index f483c274c35b8..6743da7b34085 100644
--- a/src/lightning/fabric/utilities/throughput.py
+++ b/src/lightning/fabric/utilities/throughput.py
@@ -18,7 +18,6 @@
 import torch
 from typing_extensions import override

-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1
 from lightning.fabric.utilities.rank_zero import rank_zero_only, rank_zero_warn

 if TYPE_CHECKING:
@@ -292,8 +291,6 @@ def measure_flops(
         FLOPs will be included in the result.

     """
-    if not _TORCH_GREATER_EQUAL_2_1:
-        raise ImportError("`measure_flops` requires PyTorch >= 2.1.")
     from torch.utils.flop_counter import FlopCounterMode

     flop_counter = FlopCounterMode(display=False)
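[Editor's illustration] The `measure_flops` utility touched above is typically combined with meta-device instantiation so that no real memory is allocated while counting (the model and input shapes below are placeholders):

import torch
from lightning.fabric.utilities.throughput import measure_flops

with torch.device("meta"):
    model = torch.nn.Linear(128, 64)
    x = torch.randn(4, 128)

fwd_flops = measure_flops(model, lambda: model(x))
# Passing a loss function additionally counts the backward pass
fwd_and_bwd_flops = measure_flops(model, lambda: model(x), lambda y: y.sum())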
""" - if not _TORCH_GREATER_EQUAL_2_1: - raise ImportError("`measure_flops` requires PyTorch >= 2.1.") from torch.utils.flop_counter import FlopCounterMode flop_counter = FlopCounterMode(display=False) diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 39dd56e8e73aa..8e026f485fb65 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -27,7 +27,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Removed -- +- Removed support for PyTorch 2.1 ([#20009](https://github.com/Lightning-AI/lightning/pull/20009)) + - diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py index 68395ce97d9f3..c78cb87bb9b4d 100644 --- a/src/lightning/pytorch/core/module.py +++ b/src/lightning/pytorch/core/module.py @@ -51,7 +51,6 @@ from lightning.fabric.utilities.apply_func import convert_to_tensors from lightning.fabric.utilities.cloud_io import get_filesystem from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin -from lightning.fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_2_1 from lightning.fabric.utilities.types import _MAP_LOCATION_TYPE, _PATH from lightning.fabric.wrappers import _FabricOptimizer from lightning.pytorch.callbacks.callback import Callback @@ -67,7 +66,7 @@ from lightning.pytorch.utilities.exceptions import MisconfigurationException from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_0_9_1 from lightning.pytorch.utilities.model_helpers import _restricted_classmethod -from lightning.pytorch.utilities.rank_zero import WarningCache, rank_zero_debug, rank_zero_warn +from lightning.pytorch.utilities.rank_zero import WarningCache, rank_zero_warn from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature from lightning.pytorch.utilities.types import ( _METRIC, @@ -140,7 +139,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._current_fx_name: Optional[str] = None self._param_requires_grad_state: Dict[str, bool] = {} self._metric_attributes: Optional[Dict[int, str]] = None - self._register_sharded_tensor_state_dict_hooks_if_available() self._compiler_ctx: Optional[Dict[str, Any]] = None # attributes only used when using fabric @@ -1390,9 +1388,7 @@ def forward(self, x): """ if not _ONNX_AVAILABLE: - raise ModuleNotFoundError( - f"`torch>=2.0` requires `onnx` to be installed to use `{type(self).__name__}.to_onnx()`" - ) + raise ModuleNotFoundError(f"`{type(self).__name__}.to_onnx()` requires `onnx` to be installed.") mode = self.training @@ -1599,24 +1595,6 @@ def __getstate__(self) -> Dict[str, Any]: state["_trainer"] = None return state - def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: - """Adds ShardedTensor state dict hooks if ShardedTensors are supported. - - These hooks ensure that ShardedTensors are included when saving, and are loaded the LightningModule correctly. 
- - """ - if _TORCH_GREATER_EQUAL_2_1: - # ShardedTensor is deprecated in favor of DistributedTensor - return - if _IS_WINDOWS or not torch.distributed.is_available(): - rank_zero_debug("Could not register sharded tensor state dict hooks") - return - - from torch.distributed._shard.sharded_tensor import pre_load_state_dict_hook, state_dict_hook - - self._register_state_dict_hook(state_dict_hook) - self._register_load_state_dict_pre_hook(pre_load_state_dict_hook, True) - @contextmanager def _jit_is_scripting() -> Generator: diff --git a/src/lightning/pytorch/demos/transformer.py b/src/lightning/pytorch/demos/transformer.py index 833c15d91cbdd..3f5bcb696affe 100644 --- a/src/lightning/pytorch/demos/transformer.py +++ b/src/lightning/pytorch/demos/transformer.py @@ -85,8 +85,7 @@ def forward(self, x: Tensor) -> Tensor: if self.pe is None: # 1) can't use buffer, see https://github.com/pytorch/pytorch/issues/68407 # 2) can't use parameter becauses pe gets sliced and DDP requires all params to participate in forward - # 3) can't make it a `requires_grad=False` parameter because FSDP in PyTorch < 2.1 needs all params to - # require grad + # TODO: Could make this a `nn.Parameter` with `requires_grad=False` self.pe = self._init_pos_encoding(device=x.device) x + self.pe[: x.size(0), :] diff --git a/src/lightning/pytorch/loops/utilities.py b/src/lightning/pytorch/loops/utilities.py index 8ca54184b477a..99ea5c4254d62 100644 --- a/src/lightning/pytorch/loops/utilities.py +++ b/src/lightning/pytorch/loops/utilities.py @@ -21,7 +21,6 @@ import lightning.pytorch as pl from lightning.fabric.utilities.distributed import _distributed_is_initialized -from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0 from lightning.fabric.utilities.warnings import PossibleUserWarning from lightning.pytorch.accelerators.xla import XLAAccelerator from lightning.pytorch.callbacks.timer import Timer @@ -171,9 +170,6 @@ def _decorator(self: _Loop, *args: Any, **kwargs: Any) -> Any: elif isinstance(self.trainer.strategy, FSDPStrategy): # https://github.com/pytorch/pytorch/issues/95957 context_manager = torch.no_grad - elif _TORCH_EQUAL_2_0 and self.trainer.lightning_module._compiler_ctx is not None: - # avoid: `RuntimeError: Inference tensors do not track version counter` fixed in v2.1 - context_manager = torch.no_grad elif self.inference_mode: context_manager = torch.inference_mode else: diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py index 90f6c1febdccb..ab6e579c3071f 100644 --- a/src/lightning/pytorch/strategies/fsdp.py +++ b/src/lightning/pytorch/strategies/fsdp.py @@ -67,11 +67,8 @@ _sync_ddp_if_available, ) from lightning.fabric.utilities.distributed import group as _group -from lightning.fabric.utilities.imports import ( - _TORCH_GREATER_EQUAL_2_1, - _TORCH_GREATER_EQUAL_2_2, -) -from lightning.fabric.utilities.init import _EmptyInit, _has_meta_device_parameters_or_buffers +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2 +from lightning.fabric.utilities.init import _has_meta_device_parameters_or_buffers from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors from lightning.fabric.utilities.optimizer import _optimizers_to_device from lightning.fabric.utilities.seed import reset_seed @@ -368,8 +365,8 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: invalid_params_error = False try: - # In PyTorch < 2.0, or if `use_orig_params=False` the user needs to do access - # `self.trainer.model.parameters()` in 
diff --git a/src/lightning/pytorch/demos/transformer.py b/src/lightning/pytorch/demos/transformer.py
index 833c15d91cbdd..3f5bcb696affe 100644
--- a/src/lightning/pytorch/demos/transformer.py
+++ b/src/lightning/pytorch/demos/transformer.py
@@ -85,8 +85,7 @@ def forward(self, x: Tensor) -> Tensor:
         if self.pe is None:
             # 1) can't use buffer, see https://github.com/pytorch/pytorch/issues/68407
             # 2) can't use parameter becauses pe gets sliced and DDP requires all params to participate in forward
-            # 3) can't make it a `requires_grad=False` parameter because FSDP in PyTorch < 2.1 needs all params to
-            #    require grad
+            # TODO: Could make this a `nn.Parameter` with `requires_grad=False`
             self.pe = self._init_pos_encoding(device=x.device)

         x + self.pe[: x.size(0), :]

diff --git a/src/lightning/pytorch/loops/utilities.py b/src/lightning/pytorch/loops/utilities.py
index 8ca54184b477a..99ea5c4254d62 100644
--- a/src/lightning/pytorch/loops/utilities.py
+++ b/src/lightning/pytorch/loops/utilities.py
@@ -21,7 +21,6 @@
 import lightning.pytorch as pl
 from lightning.fabric.utilities.distributed import _distributed_is_initialized
-from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0
 from lightning.fabric.utilities.warnings import PossibleUserWarning
 from lightning.pytorch.accelerators.xla import XLAAccelerator
 from lightning.pytorch.callbacks.timer import Timer
@@ -171,9 +170,6 @@ def _decorator(self: _Loop, *args: Any, **kwargs: Any) -> Any:
         elif isinstance(self.trainer.strategy, FSDPStrategy):
             # https://github.com/pytorch/pytorch/issues/95957
             context_manager = torch.no_grad
-        elif _TORCH_EQUAL_2_0 and self.trainer.lightning_module._compiler_ctx is not None:
-            # avoid: `RuntimeError: Inference tensors do not track version counter` fixed in v2.1
-            context_manager = torch.no_grad
         elif self.inference_mode:
             context_manager = torch.inference_mode
         else:

diff --git a/src/lightning/pytorch/strategies/fsdp.py b/src/lightning/pytorch/strategies/fsdp.py
index 90f6c1febdccb..ab6e579c3071f 100644
--- a/src/lightning/pytorch/strategies/fsdp.py
+++ b/src/lightning/pytorch/strategies/fsdp.py
@@ -67,11 +67,8 @@
     _sync_ddp_if_available,
 )
 from lightning.fabric.utilities.distributed import group as _group
-from lightning.fabric.utilities.imports import (
-    _TORCH_GREATER_EQUAL_2_1,
-    _TORCH_GREATER_EQUAL_2_2,
-)
-from lightning.fabric.utilities.init import _EmptyInit, _has_meta_device_parameters_or_buffers
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2
+from lightning.fabric.utilities.init import _has_meta_device_parameters_or_buffers
 from lightning.fabric.utilities.load import _lazy_load, _materialize_tensors
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
@@ -368,8 +365,8 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:

         invalid_params_error = False
         try:
-            # In PyTorch < 2.0, or if `use_orig_params=False` the user needs to do access
-            # `self.trainer.model.parameters()` in configure_optimizers()
+            # If `use_orig_params=False` the user needs to access `self.trainer.model.parameters()` in
+            # `configure_optimizers()`
             super().setup_optimizers(trainer)
         except ValueError as ex:
             if "optimizer got an empty parameter list" not in str(ex):
@@ -377,7 +374,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
             invalid_params_error = True

         if invalid_params_error or any(not _optimizer_has_flat_params(optimizer) for optimizer in self.optimizers):
-            # We avoid this limitation in PyTorch >= 2.0 by setting `use_orig_params=True`
+            # We avoid this limitation by setting `use_orig_params=True`
             raise ValueError(
                 "The optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the"
                 " optimizer after setting up the model by referencing `self.trainer.model.parameters()` in the"
@@ -393,14 +390,10 @@ def model_to_device(self) -> None:
     @contextmanager
     @override
     def tensor_init_context(self, empty_init: Optional[bool] = None) -> Generator[None, None, None]:
-        empty_init_context: Union[torch.device, _EmptyInit, nullcontext]
-        if _TORCH_GREATER_EQUAL_2_1 and empty_init:
-            # Materialization happens in `setup`. When modules get wrapped by FSDP, the sequence of operations is:
-            # 1) materialize module 2) call `reset_parameters()` 3) shard the module.
-            # These operations are applied to each submodule 'bottom up' in the module hierarchy.
-            empty_init_context = torch.device("meta")
-        else:
-            empty_init_context = _EmptyInit(enabled=bool(empty_init))
+        # Materialization happens in `setup`. When modules get wrapped by FSDP, the sequence of operations is:
+        # 1) materialize module 2) call `reset_parameters()` 3) shard the module.
+        # These operations are applied to each submodule 'bottom up' in the module hierarchy.
+        empty_init_context = torch.device("meta") if empty_init else nullcontext()
         with empty_init_context, self.precision_plugin.tensor_init_context():
             yield

diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
index d7320c2c2e251..2f4ad406f4fa4 100644
--- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
+++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py
@@ -24,7 +24,6 @@
 from lightning.fabric.utilities import move_data_to_device
 from lightning.fabric.utilities.apply_func import convert_tensors_to_scalars
 from lightning.fabric.utilities.distributed import _distributed_is_initialized
-from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0
 from lightning.pytorch.utilities.data import extract_batch_size
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
 from lightning.pytorch.utilities.imports import _TORCHMETRICS_GREATER_EQUAL_1_0_0
@@ -112,7 +111,7 @@ class _Metadata:
     on_step: bool = False
     on_epoch: bool = True
     # https://github.com/pytorch/pytorch/issues/96197
-    reduce_fx: Callable = "mean" if _TORCH_EQUAL_2_0 else torch.mean  # type: ignore[assignment]
+    reduce_fx: Callable = torch.mean
     enable_graph: bool = False
     add_dataloader_idx: bool = True
     dataloader_idx: Optional[int] = None
@@ -362,7 +361,7 @@ def log(
         on_step: bool = False,
         on_epoch: bool = True,
         # https://github.com/pytorch/pytorch/issues/96197
-        reduce_fx: Callable = "mean" if _TORCH_EQUAL_2_0 else torch.mean,  # type: ignore[assignment]
+        reduce_fx: Callable = torch.mean,
         enable_graph: bool = False,
         sync_dist: bool = False,
         sync_dist_fn: Callable = _Sync.no_op,
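[Editor's illustration] The `reduce_fx` default above is what `self.log` falls back to when aggregating step values at epoch end; passing a callable explicitly works the same way. A short sketch (the metric name is a placeholder):

import torch
from lightning.pytorch.demos.boring_classes import BoringModel


class MyModel(BoringModel):
    def training_step(self, batch, batch_idx):
        out = super().training_step(batch, batch_idx)
        # Aggregate the per-step values with torch.mean at epoch end (the default)
        self.log("train_loss", out["loss"], on_step=False, on_epoch=True, reduce_fx=torch.mean)
        return out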
diff --git a/src/lightning/pytorch/utilities/compile.py b/src/lightning/pytorch/utilities/compile.py
index 7c5a8067740a4..cb2433e04bb1a 100644
--- a/src/lightning/pytorch/utilities/compile.py
+++ b/src/lightning/pytorch/utilities/compile.py
@@ -17,7 +17,6 @@
 from torch._dynamo import OptimizedModule

 import lightning.pytorch as pl
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1
 from lightning.pytorch.strategies import DDPStrategy, DeepSpeedStrategy, FSDPStrategy, SingleDeviceStrategy, Strategy
 from lightning.pytorch.utilities.model_helpers import _check_mixed_imports

@@ -56,11 +55,7 @@ def from_compiled(model: OptimizedModule) -> "pl.LightningModule":
     }

     orig_module.forward = model.dynamo_ctx(orig_module.forward)  # type: ignore[method-assign]
-    if not _TORCH_GREATER_EQUAL_2_1:  # https://github.com/pytorch/pytorch/issues/95630
-        orig_module.forward._torchdynamo_inline = orig_module.forward
     orig_module.training_step = model.dynamo_ctx(orig_module.training_step)  # type: ignore[method-assign]
-    if not _TORCH_GREATER_EQUAL_2_1:  # https://github.com/pytorch/pytorch/issues/95630
-        orig_module.training_step._torchdynamo_inline = orig_module.training_step
     orig_module.validation_step = model.dynamo_ctx(orig_module.validation_step)  # type: ignore[method-assign]
     orig_module.test_step = model.dynamo_ctx(orig_module.test_step)  # type: ignore[method-assign]
     orig_module.predict_step = model.dynamo_ctx(orig_module.predict_step)  # type: ignore[method-assign]

diff --git a/tests/tests_fabric/conftest.py b/tests/tests_fabric/conftest.py
index c92754833836e..8b0d83d7f2990 100644
--- a/tests/tests_fabric/conftest.py
+++ b/tests/tests_fabric/conftest.py
@@ -101,7 +101,10 @@ def thread_police_duuu_daaa_duuu_daaa():
             assert not thread.is_alive()
         elif isinstance(thread, _ChildProcessObserver):
             thread.join(timeout=10)
-        elif thread.name == "QueueFeederThread":  # tensorboardX
+        elif (
+            thread.name == "QueueFeederThread"  # tensorboardX
+            or thread.name == "QueueManagerThread"  # torch.compile
+        ):
             thread.join(timeout=20)
         elif (
             sys.version_info >= (3, 9)

diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py
index a88e7c2be7b3a..c45adef192e68 100644
--- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py
+++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py
@@ -148,7 +148,7 @@ def __init__(self):
     assert model.l.weight.dtype == expected


-@RunIf(min_cuda_gpus=1, min_torch="2.1")
+@RunIf(min_cuda_gpus=1)
 @pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable")
 @pytest.mark.parametrize(
     ("args", "expected"),
@@ -232,7 +232,7 @@ def __init__(self):
     assert model.l.weight.dtype == expected


-@RunIf(min_cuda_gpus=1, min_torch="2.1")
+@RunIf(min_cuda_gpus=1)
 @pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable")
 def test_load_quantized_checkpoint(tmp_path):
     """Test that a checkpoint saved from a quantized model can be loaded back into a quantized model."""

diff --git a/tests/tests_fabric/strategies/test_ddp_integration.py b/tests/tests_fabric/strategies/test_ddp_integration.py
index 281f0d47bae0c..a7ed09b00b09e 100644
--- a/tests/tests_fabric/strategies/test_ddp_integration.py
+++ b/tests/tests_fabric/strategies/test_ddp_integration.py
@@ -75,7 +75,7 @@ def assert_params_equal(params0, params1):
         assert_params_equal(params_before, wrapped_model.parameters())


-@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.1.0", dynamo=True)
+@RunIf(min_cuda_gpus=2, standalone=True, dynamo=True)
@mock.patch("lightning.fabric.wrappers.torch.compile", Mock(wraps=torch.compile))
@mock.patch.dict(os.environ, {})
def test_reapply_compile():
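[Editor's illustration] The `test_reapply_compile` tests above exercise Fabric's `_reapply_compile` behavior: a `torch.compile`d module is unwrapped, the original module is wrapped by DDP/FSDP, and compilation is reapplied on top with the captured settings. From user code this is simply (the model is a placeholder):

import torch
from lightning.fabric import Fabric

fabric = Fabric(strategy="ddp", devices=2)
fabric.launch()

model = torch.compile(torch.nn.Linear(32, 2))
# With _reapply_compile=True (the default), Fabric removes the OptimizedModule
# wrapper, applies DDP to the original module, then re-runs torch.compile.
model = fabric.setup(model, _reapply_compile=True)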
diff --git a/tests/tests_fabric/strategies/test_fsdp.py b/tests/tests_fabric/strategies/test_fsdp.py
index 1cf2a4d2f1f63..0c46e7ac1763c 100644
--- a/tests/tests_fabric/strategies/test_fsdp.py
+++ b/tests/tests_fabric/strategies/test_fsdp.py
@@ -16,7 +16,6 @@
 from unittest import mock
 from unittest.mock import ANY, MagicMock, Mock

-import lightning.fabric
 import pytest
 import torch
 import torch.nn as nn
@@ -28,8 +27,9 @@
     _get_full_state_dict_context,
     _is_sharded_checkpoint,
 )
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1, _TORCH_GREATER_EQUAL_2_2
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2
 from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, FullyShardedDataParallel, MixedPrecision
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.optim import Adam

@@ -147,13 +147,6 @@ def test_no_backward_sync():
         module.no_sync.assert_called_once()


-def test_activation_checkpointing_support(monkeypatch):
-    """Test that we error out if activation checkpointing requires a newer PyTorch version."""
-    monkeypatch.setattr(lightning.fabric.strategies.fsdp, "_TORCH_GREATER_EQUAL_2_1", False)
-    with pytest.raises(ValueError, match="activation_checkpointing_policy` requires torch >= 2.1.0"):
-        FSDPStrategy(activation_checkpointing_policy=Mock())
-
-
 def test_activation_checkpointing():
     """Test that the FSDP strategy can apply activation checkpointing to the given layers."""

@@ -170,28 +163,13 @@ def __init__(self):
             self.layer1 = Block2(2, 2)
             self.layer2 = nn.Linear(3, 3)

-    if _TORCH_GREATER_EQUAL_2_1:
-        from torch.distributed.fsdp.wrap import ModuleWrapPolicy
-
-        strategy = FSDPStrategy(activation_checkpointing_policy={Block1})
-        assert set(strategy._activation_checkpointing_kwargs) == {"auto_wrap_policy"}
-        assert isinstance(strategy._activation_checkpointing_kwargs["auto_wrap_policy"], ModuleWrapPolicy)
-
-        strategy = FSDPStrategy(activation_checkpointing_policy=ModuleWrapPolicy({Block1, Block2}))
-        assert set(strategy._activation_checkpointing_kwargs) == {"auto_wrap_policy"}
-        assert isinstance(strategy._activation_checkpointing_kwargs["auto_wrap_policy"], ModuleWrapPolicy)
-    else:
-        strategy = FSDPStrategy(activation_checkpointing=Block1)
-        assert set(strategy._activation_checkpointing_kwargs) == {"check_fn"}
-
-        strategy = FSDPStrategy(activation_checkpointing=[Block1, Block2])
-        assert set(strategy._activation_checkpointing_kwargs) == {"check_fn"}
-
-        strategy = FSDPStrategy(activation_checkpointing_policy={Block1})
-        assert set(strategy._activation_checkpointing_kwargs) == {"check_fn"}
-
-        strategy = FSDPStrategy(activation_checkpointing_policy={Block1, Block2})
-        assert set(strategy._activation_checkpointing_kwargs) == {"check_fn"}
+    strategy = FSDPStrategy(activation_checkpointing_policy={Block1})
+    assert set(strategy._activation_checkpointing_kwargs) == {"auto_wrap_policy"}
+    assert isinstance(strategy._activation_checkpointing_kwargs["auto_wrap_policy"], ModuleWrapPolicy)
+
+    strategy = FSDPStrategy(activation_checkpointing_policy=ModuleWrapPolicy({Block1, Block2}))
+    assert set(strategy._activation_checkpointing_kwargs) == {"auto_wrap_policy"}
+    assert isinstance(strategy._activation_checkpointing_kwargs["auto_wrap_policy"], ModuleWrapPolicy)

     strategy._parallel_devices = [torch.device("cuda", 0)]
     with mock.patch("torch.distributed.fsdp.FullyShardedDataParallel", new=MagicMock), mock.patch(
@@ -401,15 +379,13 @@ def test_set_timeout(init_process_group_mock):
     )


-@pytest.mark.parametrize("torch_ge_2_1", [True, False])
@mock.patch("torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.set_state_dict_type")
-def test_get_full_state_dict_context_offload(set_type_mock, monkeypatch, torch_ge_2_1):
-    """Test that the state dict context manager handles CPU offloading depending on the PyTorch version."""
-    monkeypatch.setattr("lightning.fabric.strategies.fsdp._TORCH_GREATER_EQUAL_2_1", torch_ge_2_1)
+def test_get_full_state_dict_context_offload(set_type_mock, monkeypatch):
+    """Test that the state dict context manager handles CPU offloading."""

     with _get_full_state_dict_context(module=Mock(spec=FullyShardedDataParallel), world_size=1):
-        assert set_type_mock.call_args_list[0][0][2].offload_to_cpu is torch_ge_2_1  # model config
-        assert set_type_mock.call_args_list[0][0][3].offload_to_cpu is torch_ge_2_1  # optim config
+        assert set_type_mock.call_args_list[0][0][2].offload_to_cpu  # model config
+        assert set_type_mock.call_args_list[0][0][3].offload_to_cpu  # optim config

     set_type_mock.reset_mock()

diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py
index 16e4910c7ec33..e324ab5056d47 100644
--- a/tests/tests_fabric/strategies/test_fsdp_integration.py
+++ b/tests/tests_fabric/strategies/test_fsdp_integration.py
@@ -23,7 +23,6 @@
 from lightning.fabric import Fabric
 from lightning.fabric.plugins import FSDPPrecision
 from lightning.fabric.strategies import FSDPStrategy
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1
 from lightning.fabric.utilities.load import _load_distributed_checkpoint
 from lightning.fabric.wrappers import _FabricOptimizer
 from torch._dynamo import OptimizedModule
@@ -400,7 +399,7 @@ def test_setup_with_orig_params_and_multiple_param_groups():
         assert not isinstance(layer.weight, FlatParameter)


-@RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.1.0", dynamo=True, skip_windows=True)
+@RunIf(min_cuda_gpus=2, standalone=True, dynamo=True, skip_windows=True)
@mock.patch("lightning.fabric.wrappers.torch.compile", Mock(wraps=torch.compile))
@mock.patch.dict(os.environ, {})
def test_reapply_compile():
@@ -466,12 +465,8 @@ def _run_setup_assertions(empty_init, expected_device):
     # Case 1: No empty init
     _run_setup_assertions(empty_init=False, expected_device=torch.device("cpu"))

-    if _TORCH_GREATER_EQUAL_2_1:
-        # Case 2: Empty-init with PyTorch >= 2.1 supports meta device
-        _run_setup_assertions(empty_init=True, expected_device=torch.device("meta"))
-    else:
-        # Case 2: Empty-init with PyTorch < 2.1 only supports `torch.empty()`-init
-        _run_setup_assertions(empty_init=True, expected_device=torch.device("cpu"))
+    # Case 2: Empty-init with meta device
+    _run_setup_assertions(empty_init=True, expected_device=torch.device("meta"))


@RunIf(min_cuda_gpus=2, standalone=True)
@@ -538,9 +533,6 @@ def test_rewrap_warnings():
     assert not isinstance(model._forward_module, FullyShardedDataParallel)
     assert isinstance(model._forward_module[2], FullyShardedDataParallel)

-    if not _TORCH_GREATER_EQUAL_2_1:
-        return
-
     with fabric.init_module(empty_init=True):
         model = torch.nn.Sequential(torch.nn.Linear(1, 1), torch.nn.ReLU(), wrap(torch.nn.Linear(1, 1)))
     assert model[0].weight.is_meta

diff --git a/tests/tests_fabric/test_fabric.py b/tests/tests_fabric/test_fabric.py
index f76a846e80a75..70d04d5431404 100644
--- a/tests/tests_fabric/test_fabric.py
+++ b/tests/tests_fabric/test_fabric.py
@@ -289,7 +289,7 @@ def test_setup_optimizers_not_supported(strategy_cls):
         fabric.setup_optimizers(optimizer)


-@RunIf(min_cuda_gpus=1, min_torch="2.1")
+@RunIf(min_cuda_gpus=1)
def test_setup_optimizer_on_meta_device():
    """Test that the setup-methods validate that the optimizer doesn't have references to meta-device parameters."""
    fabric = Fabric(strategy="fsdp", devices=1)
@@ -867,8 +867,6 @@ def test_init_module_context(monkeypatch):


 def test_init_tensor_context(monkeypatch):
-    """Test that `.init_tensor()` warns if using PyTorch < 2.0."""
-
     fabric = Fabric(accelerator="cpu")
     strategy = SingleDeviceStrategy(device=torch.device("cuda"))
     strategy.tensor_init_context = Mock(wraps=strategy.tensor_init_context)

diff --git a/tests/tests_fabric/test_wrappers.py b/tests/tests_fabric/test_wrappers.py
index 91f516d03a00c..b89a536aff84c 100644
--- a/tests/tests_fabric/test_wrappers.py
+++ b/tests/tests_fabric/test_wrappers.py
@@ -19,7 +19,6 @@
 from lightning.fabric.fabric import Fabric
 from lightning.fabric.plugins import Precision
 from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1
 from lightning.fabric.wrappers import (
     _FabricDataLoader,
     _FabricModule,
@@ -268,14 +267,13 @@ def __init__(self):
     assert torch.equal(fabric_module.layer.weight, weight)
     assert torch.equal(fabric_module.layer.bias, bias)

-    if _TORCH_GREATER_EQUAL_2_1:
-        # Can use additional `assign` argument in PyTorch >= 2.1
-        with torch.device("meta"):
-            original_module = OriginalModule()
-        fabric_module = _FabricModule(wrapped_module, Mock(), original_module=original_module)
-        assert fabric_module.layer.weight.is_meta
-        fabric_module.load_state_dict({"layer.weight": weight, "layer.bias": bias}, assign=True)
-        assert not fabric_module.layer.weight.is_meta
+    # Can use additional `assign` argument
+    with torch.device("meta"):
+        original_module = OriginalModule()
+    fabric_module = _FabricModule(wrapped_module, Mock(), original_module=original_module)
+    assert fabric_module.layer.weight.is_meta
+    fabric_module.load_state_dict({"layer.weight": weight, "layer.bias": bias}, assign=True)
+    assert not fabric_module.layer.weight.is_meta


@pytest.mark.parametrize(

diff --git a/tests/tests_fabric/utilities/test_init.py b/tests/tests_fabric/utilities/test_init.py
index bdbca90495561..dd08dec020669 100644
--- a/tests/tests_fabric/utilities/test_init.py
+++ b/tests/tests_fabric/utilities/test_init.py
@@ -58,7 +58,6 @@ def test_empty_init_speed():
     assert normal_init_time > 2 * empty_init_time


-@RunIf(min_torch="2.1")
def test_materialize_meta_tensors():
    class Submodule(torch.nn.Module):
        def __init__(self):

diff --git a/tests/tests_fabric/utilities/test_throughput.py b/tests/tests_fabric/utilities/test_throughput.py
index f2c3de30a325b..eefadb285af02 100644
--- a/tests/tests_fabric/utilities/test_throughput.py
+++ b/tests/tests_fabric/utilities/test_throughput.py
@@ -13,11 +13,9 @@
     measure_flops,
 )

-from tests_fabric.helpers.runif import RunIf
 from tests_fabric.test_fabric import BoringModel


-@RunIf(min_torch="2.1")
def test_measure_flops():
    with torch.device("meta"):
        model = BoringModel()

diff --git a/tests/tests_pytorch/callbacks/test_throughput_monitor.py b/tests/tests_pytorch/callbacks/test_throughput_monitor.py
index 9467e45e2fa80..a74efba75813b 100644
--- a/tests/tests_pytorch/callbacks/test_throughput_monitor.py
+++ b/tests/tests_pytorch/callbacks/test_throughput_monitor.py
@@ -8,10 +8,7 @@
 from lightning.pytorch.callbacks.throughput_monitor import ThroughputMonitor
 from lightning.pytorch.demos.boring_classes import BoringModel

-from tests_pytorch.helpers.runif import RunIf


-@RunIf(min_torch="2.1")
def test_measure_flops():
    with torch.device("meta"):
        model = BoringModel()

diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py
index c0319e873be08..97c17c4e4644e 100644
--- a/tests/tests_pytorch/conftest.py
+++ b/tests/tests_pytorch/conftest.py
@@ -153,7 +153,10 @@ def thread_police_duuu_daaa_duuu_daaa():
             assert not thread.is_alive()
         elif isinstance(thread, _ChildProcessObserver):
             thread.join(timeout=10)
-        elif thread.name == "QueueFeederThread":  # tensorboardX
+        elif (
+            thread.name == "QueueFeederThread"  # tensorboardX
+            or thread.name == "QueueManagerThread"  # torch.compile
+        ):
             thread.join(timeout=20)
         elif isinstance(thread, TMonitor):
             thread.exit()

diff --git a/tests/tests_pytorch/deprecated_api/test_no_removal_version.py b/tests/tests_pytorch/deprecated_api/test_no_removal_version.py
index d12254c6794fe..e6da72c777dbb 100644
--- a/tests/tests_pytorch/deprecated_api/test_no_removal_version.py
+++ b/tests/tests_pytorch/deprecated_api/test_no_removal_version.py
@@ -35,14 +35,10 @@ def test_ddp_is_distributed():
     _ = strategy.is_distributed


-def test_fsdp_activation_checkpointing(monkeypatch):
+def test_fsdp_activation_checkpointing():
     with pytest.raises(ValueError, match="cannot set both `activation_checkpointing"):
         FSDPStrategy(activation_checkpointing=torch.nn.Linear, activation_checkpointing_policy=lambda *_: True)

-    monkeypatch.setattr(lightning.fabric.strategies.fsdp, "_TORCH_GREATER_EQUAL_2_1", True)
-    with pytest.deprecated_call(match=r"use `FSDPStrategy\(activation_checkpointing_policy"):
-        FSDPStrategy(activation_checkpointing=torch.nn.Linear)
-

 def test_double_precision_wrapper():
     with pytest.deprecated_call(match=r"The `LightningDoublePrecisionModule` is deprecated and no longer needed"):

diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py
index 04eeabbbd7c49..fe36eb0a03553 100644
--- a/tests/tests_pytorch/strategies/test_fsdp.py
+++ b/tests/tests_pytorch/strategies/test_fsdp.py
@@ -14,7 +14,7 @@
 import torch.nn as nn
 from lightning.fabric.plugins.environments import LightningEnvironment
 from lightning.fabric.strategies.fsdp import _is_sharded_checkpoint
-from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_1, _TORCH_GREATER_EQUAL_2_2
+from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2
 from lightning.fabric.utilities.load import _load_distributed_checkpoint
 from lightning.pytorch import Trainer
 from lightning.pytorch.callbacks import ModelCheckpoint
@@ -334,10 +334,9 @@ def test_strategy_full_state_dict(tmp_path, wrap_min_params):
             TestFSDPModelAutoWrapped(),
             FSDPStrategy,
             {
-                "auto_wrap_policy": ModuleWrapPolicy({nn.Linear}) if _TORCH_GREATER_EQUAL_2_1 else None,
+                "auto_wrap_policy": ModuleWrapPolicy({nn.Linear}),
                 "use_orig_params": True,
             },
-            marks=RunIf(min_torch="2.1.0"),
             id="autowrap_use_orig_params",
         ),
     ],
@@ -380,19 +379,12 @@ def test_invalid_parameters_in_optimizer(use_orig_params):
         fast_dev_run=1,
     )

-    error_context = (
-        nullcontext()
-        if _TORCH_GREATER_EQUAL_2_1 or use_orig_params is not False
-        else pytest.raises(ValueError, match="The optimizer does not seem to reference any FSDP parameters")
-    )
-
     class EmptyParametersModel(BoringModel):
         def configure_optimizers(self):
             return torch.optim.Adam(self.parameters(), lr=1e-2)

    model =
EmptyParametersModel() - with error_context: - trainer.fit(model) + trainer.fit(model) class NoFlatParametersModel(BoringModel): def configure_optimizers(self): @@ -435,28 +427,13 @@ def __init__(self): self.layer1 = Block2(2, 2) self.layer2 = nn.Linear(3, 3) - if _TORCH_GREATER_EQUAL_2_1: - from torch.distributed.fsdp.wrap import ModuleWrapPolicy - - strategy = FSDPStrategy(activation_checkpointing_policy={Block1}) - assert set(strategy._activation_checkpointing_kwargs) == {"auto_wrap_policy"} - assert isinstance(strategy._activation_checkpointing_kwargs["auto_wrap_policy"], ModuleWrapPolicy) - - strategy = FSDPStrategy(activation_checkpointing_policy=ModuleWrapPolicy({Block1, Block2})) - assert set(strategy._activation_checkpointing_kwargs) == {"auto_wrap_policy"} - assert isinstance(strategy._activation_checkpointing_kwargs["auto_wrap_policy"], ModuleWrapPolicy) - else: - strategy = FSDPStrategy(activation_checkpointing=Block1) - assert set(strategy._activation_checkpointing_kwargs) == {"check_fn"} - - strategy = FSDPStrategy(activation_checkpointing=[Block1, Block2]) - assert set(strategy._activation_checkpointing_kwargs) == {"check_fn"} + strategy = FSDPStrategy(activation_checkpointing_policy={Block1}) + assert set(strategy._activation_checkpointing_kwargs) == {"auto_wrap_policy"} + assert isinstance(strategy._activation_checkpointing_kwargs["auto_wrap_policy"], ModuleWrapPolicy) - strategy = FSDPStrategy(activation_checkpointing_policy={Block1}) - assert set(strategy._activation_checkpointing_kwargs) == {"check_fn"} - - strategy = FSDPStrategy(activation_checkpointing_policy={Block1, Block2}) - assert set(strategy._activation_checkpointing_kwargs) == {"check_fn"} + strategy = FSDPStrategy(activation_checkpointing_policy=ModuleWrapPolicy({Block1, Block2})) + assert set(strategy._activation_checkpointing_kwargs) == {"auto_wrap_policy"} + assert isinstance(strategy._activation_checkpointing_kwargs["auto_wrap_policy"], ModuleWrapPolicy) model = Model() strategy._parallel_devices = [torch.device("cuda", 0)] @@ -608,7 +585,7 @@ def test_strategy_save_optimizer_states(tmp_path, wrap_min_params): if trainer.global_rank != 0: assert len(model_state_dict) == 0 - if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1: + if trainer.global_rank != 0: assert len(optimizer_state_dict) == 0 # restore model to ddp @@ -679,7 +656,7 @@ def test_strategy_load_optimizer_states(wrap_min_params, tmp_path): if trainer.global_rank != 0: assert len(restored_model_state_dict) == 0 - if trainer.global_rank != 0 and _TORCH_GREATER_EQUAL_2_1: + if trainer.global_rank != 0: assert len(restored_optimizer_state_dict) == 0 if trainer.global_rank == 0: @@ -936,12 +913,8 @@ def _run_setup_assertions(empty_init, expected_device): # Case 1: No empty init _run_setup_assertions(empty_init=False, expected_device=torch.device("cpu")) - if _TORCH_GREATER_EQUAL_2_1: - # Case 2: Empty-init with PyTorch >= 2.1 supports meta device - _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) - else: - # Case 2: Empty-init with PyTorch < 2.1 only supports `torch.empty()`-init - _run_setup_assertions(empty_init=True, expected_device=torch.device("cpu")) + # Case 2: Empty-init with meta device + _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) @RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.3.0") diff --git a/tests/tests_pytorch/strategies/test_model_parallel_integration.py b/tests/tests_pytorch/strategies/test_model_parallel_integration.py index bb8d7c719f821..3f09db0568170 
100644 --- a/tests/tests_pytorch/strategies/test_model_parallel_integration.py +++ b/tests/tests_pytorch/strategies/test_model_parallel_integration.py @@ -339,7 +339,7 @@ def _run_setup_assertions(empty_init, expected_device): # Case 1: No empty init _run_setup_assertions(empty_init=False, expected_device=torch.device("cpu")) - # Case 2: Empty-init with PyTorch >= 2.1 supports meta device + # Case 2: Empty-init with meta device _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) diff --git a/tests/tests_pytorch/trainer/flags/test_inference_mode.py b/tests/tests_pytorch/trainer/flags/test_inference_mode.py index c262f0ca33806..bae7b66dbbd55 100644 --- a/tests/tests_pytorch/trainer/flags/test_inference_mode.py +++ b/tests/tests_pytorch/trainer/flags/test_inference_mode.py @@ -16,7 +16,6 @@ import pytest import torch -from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0 from lightning.pytorch import Trainer from lightning.pytorch.demos.boring_classes import BoringModel from lightning.pytorch.loops import _Loop @@ -81,7 +80,5 @@ def run(self): ... f.run() no_grad_mock.assert_called_once_with() f.inference_mode = True - with mock.patch("torch.inference_mode") as inference_mode_mock: + with mock.patch("torch.inference_mode"): f.run() - if not _TORCH_EQUAL_2_0: - inference_mode_mock.assert_called_once_with() From 37e04d075a5532c69b8ac7457795b4345cca30cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 09:17:42 +0200 Subject: [PATCH 096/179] build(deps): bump Lightning-AI/utilities from 0.11.2 to 0.11.3 (#20031) Bumps [Lightning-AI/utilities](https://github.com/lightning-ai/utilities) from 0.11.2 to 0.11.3. - [Release notes](https://github.com/lightning-ai/utilities/releases) - [Changelog](https://github.com/Lightning-AI/utilities/blob/main/CHANGELOG.md) - [Commits](https://github.com/lightning-ai/utilities/compare/v0.11.2...v0.11.3) --- updated-dependencies: - dependency-name: Lightning-AI/utilities dependency-type: direct:production update-type: version-update:semver-patch ... 
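A minimal sketch of the `torch.inference_mode` behavior exercised by the `test_inference_mode` diff further above (illustrative only; the module and shapes are made up):

import torch
import torch.nn as nn

model = nn.Linear(4, 2)

# Like `torch.no_grad()`, inference mode disables gradient tracking, but it is
# stricter: tensors created inside can never participate in autograd later,
# which lets PyTorch skip version-counter bookkeeping for extra speed.
with torch.inference_mode():
    out = model(torch.randn(1, 4))

assert not out.requires_grad
assert out.is_inference()  # marked as an inference tensor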
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/call-clear-cache.yml | 4 ++-- .github/workflows/ci-check-md-links.yml | 2 +- .github/workflows/ci-schema.yml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/call-clear-cache.yml b/.github/workflows/call-clear-cache.yml index 72914b82e4a45..6dc9cec92d058 100644 --- a/.github/workflows/call-clear-cache.yml +++ b/.github/workflows/call-clear-cache.yml @@ -12,12 +12,12 @@ on: jobs: cron-clear: if: github.event_name == 'schedule' - uses: Lightning-AI/utilities/.github/workflows/clear-cache.yml@v0.11.2 + uses: Lightning-AI/utilities/.github/workflows/clear-cache.yml@v0.11.3 with: pattern: "latest|docs" direct-clear: if: github.event_name == 'workflow_dispatch' - uses: Lightning-AI/utilities/.github/workflows/clear-cache.yml@v0.11.2 + uses: Lightning-AI/utilities/.github/workflows/clear-cache.yml@v0.11.3 with: pattern: ${{ inputs.pattern }} diff --git a/.github/workflows/ci-check-md-links.yml b/.github/workflows/ci-check-md-links.yml index 65c336a18a273..02571ea7b2d57 100644 --- a/.github/workflows/ci-check-md-links.yml +++ b/.github/workflows/ci-check-md-links.yml @@ -14,7 +14,7 @@ on: jobs: check-md-links: - uses: Lightning-AI/utilities/.github/workflows/check-md-links.yml@v0.11.2 + uses: Lightning-AI/utilities/.github/workflows/check-md-links.yml@v0.11.3 with: config-file: ".github/markdown-links-config.json" base-branch: "master" diff --git a/.github/workflows/ci-schema.yml b/.github/workflows/ci-schema.yml index 5ab6006205084..5dd38297e3508 100644 --- a/.github/workflows/ci-schema.yml +++ b/.github/workflows/ci-schema.yml @@ -8,7 +8,7 @@ on: jobs: check: - uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.11.2 + uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@v0.11.3 with: # skip azure due to the wrong schema file by MSFT # https://github.com/Lightning-AI/lightning-flash/pull/1455#issuecomment-1244793607 From 693c21ac1b1f2ac6aafd00119c21e819cf597006 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 3 Jul 2024 00:01:03 +0200 Subject: [PATCH 097/179] Add testing for PyTorch 2.4 (Fabric) (#20028) --- .azure/gpu-tests-fabric.yml | 10 ++- .github/workflows/ci-tests-fabric.yml | 8 ++- requirements/fabric/base.txt | 2 +- requirements/fabric/examples.txt | 2 +- src/lightning/fabric/plugins/precision/amp.py | 5 +- src/lightning/fabric/strategies/fsdp.py | 4 ++ .../fabric/strategies/model_parallel.py | 18 ++--- src/lightning/fabric/utilities/imports.py | 2 +- .../fabric/utilities/testing/_runif.py | 5 +- tests/tests_fabric/conftest.py | 1 + .../plugins/precision/test_amp.py | 7 +- .../plugins/precision/test_amp_integration.py | 4 +- .../plugins/precision/test_bitsandbytes.py | 4 +- tests/tests_fabric/strategies/test_dp.py | 1 + .../strategies/test_fsdp_integration.py | 6 +- .../strategies/test_model_parallel.py | 66 +++++++------------ .../test_model_parallel_integration.py | 18 ++--- tests/tests_fabric/test_connector.py | 2 +- tests/tests_fabric/test_wrappers.py | 2 +- .../strategies/test_model_parallel.py | 21 ------ 20 files changed, 84 insertions(+), 104 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 1a854604606aa..576b9c3eb3d2b 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -62,6 +62,9 @@ jobs: "Lightning | latest": image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: 
"lightning" + "Lightning | future": + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.4-cuda12.1.0" + PACKAGE_NAME: "lightning" workspace: clean: all steps: @@ -72,9 +75,12 @@ jobs: echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))') echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope" + python_ver=$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')") + echo "##vso[task.setvariable variable=PYTHON_VERSION_MM]$python_ver" displayName: "set env. vars" - bash: | - echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}/torch_test.html" + echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}" + echo "##vso[task.setvariable variable=TORCHVISION_URL]https://download.pytorch.org/whl/test/cu124/torchvision-0.19.0%2Bcu124-cp${PYTHON_VERSION_MM}-cp${PYTHON_VERSION_MM}-linux_x86_64.whl" condition: endsWith(variables['Agent.JobName'], 'future') displayName: "set env. vars 4 future" @@ -103,7 +109,7 @@ jobs: - bash: | extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") - pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" + pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}" --find-links="${TORCHVISION_URL}" displayName: "Install package & dependencies" - bash: | diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 2c0d8d16b89ad..8d5ed3e9e7fa4 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -49,6 +49,10 @@ jobs: - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.3" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.4" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.4" } + # TODO: PyTorch 2.4 on Windows not yet working with `torch.distributed` (not compiled with libuv support) + # - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.4" } # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues - { os: "macOS-12", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.1" } @@ -79,7 +83,7 @@ jobs: FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} PYPI_CACHE_DIR: "_pip-wheels" TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html" - TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch_test.html" + TORCH_URL_TEST: "https://download.pytorch.org/whl/test/cpu/torch" # TODO: Remove this - Enable running MPS tests on this platform DISABLE_MPS: ${{ matrix.os == 'macOS-14' && '1' || '0' }} steps: @@ -118,7 +122,7 @@ jobs: - name: Env. 
variables run: | # Switch PyTorch URL - python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.3' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV + python -c "print('TORCH_URL=' + str('${{env.TORCH_URL_TEST}}' if '${{ matrix.pytorch-version }}' == '2.4' else '${{env.TORCH_URL_STABLE}}'))" >> $GITHUB_ENV # Switch coverage scope python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV # if you install mono-package set dependency only for this subpackage diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 7ca4556821613..a4ba3bc9c5757 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy >=1.21.0, <1.27.0 -torch >=2.1.0, <2.4.0 +torch >=2.1.0, <2.5.0 fsspec[http] >=2022.5.0, <2024.4.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt index 49ffde9d0f2f0..cb4135da2409a 100644 --- a/requirements/fabric/examples.txt +++ b/requirements/fabric/examples.txt @@ -1,6 +1,6 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torchvision >=0.16.0, <0.19.0 +torchvision >=0.16.0, <0.20.0 torchmetrics >=0.10.0, <1.3.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/src/lightning/fabric/plugins/precision/amp.py b/src/lightning/fabric/plugins/precision/amp.py index 75d7932ddb916..c3b7fb74c293f 100644 --- a/src/lightning/fabric/plugins/precision/amp.py +++ b/src/lightning/fabric/plugins/precision/amp.py @@ -22,6 +22,7 @@ from lightning.fabric.plugins.precision.precision import Precision from lightning.fabric.plugins.precision.utils import _convert_fp_tensor +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_4 from lightning.fabric.utilities.types import Optimizable @@ -39,7 +40,7 @@ def __init__( self, precision: Literal["16-mixed", "bf16-mixed"], device: str, - scaler: Optional[torch.cuda.amp.GradScaler] = None, + scaler: Optional["torch.cuda.amp.GradScaler"] = None, ) -> None: if precision not in ("16-mixed", "bf16-mixed"): raise ValueError( @@ -49,7 +50,7 @@ def __init__( self.precision = precision if scaler is None and self.precision == "16-mixed": - scaler = torch.cuda.amp.GradScaler() + scaler = torch.amp.GradScaler(device=device) if _TORCH_GREATER_EQUAL_2_4 else torch.cuda.amp.GradScaler() if scaler is not None and self.precision == "bf16-mixed": raise ValueError(f"`precision='bf16-mixed'` does not use a scaler, found {scaler}.") self.device = device diff --git a/src/lightning/fabric/strategies/fsdp.py b/src/lightning/fabric/strategies/fsdp.py index b8a3a268478dd..88a4be4549e27 100644 --- a/src/lightning/fabric/strategies/fsdp.py +++ b/src/lightning/fabric/strategies/fsdp.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
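# A minimal sketch of the version-dependent scaler selection made in the AMP
# change above (illustrative only; assumes a CUDA device is available):
import torch
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_4

# PyTorch 2.4 moves GradScaler to the device-agnostic torch.amp namespace and
# deprecates torch.cuda.amp.GradScaler, hence the conditional. Only the chosen
# branch of the conditional expression is evaluated, so this also runs on older
# PyTorch versions that lack torch.amp.GradScaler.
scaler = torch.amp.GradScaler(device="cuda") if _TORCH_GREATER_EQUAL_2_4 else torch.cuda.amp.GradScaler()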
import shutil +import warnings from contextlib import ExitStack, nullcontext from datetime import timedelta from functools import partial @@ -83,6 +84,9 @@ _FSDP_ALIASES = ("fsdp", "fsdp_cpu_offload") +# TODO: Switch to new state-dict APIs +warnings.filterwarnings("ignore", category=FutureWarning, message=".*FSDP.state_dict_type.*") # from torch >= 2.4 + class FSDPStrategy(ParallelStrategy, _Sharded): r"""Strategy for Fully Sharded Data Parallel provided by torch.distributed. diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py index 629113b291a2a..c7277008968af 100644 --- a/src/lightning/fabric/strategies/model_parallel.py +++ b/src/lightning/fabric/strategies/model_parallel.py @@ -70,7 +70,7 @@ class ModelParallelStrategy(ParallelStrategy): Currently supports up to 2D parallelism. Specifically, it supports the combination of Fully Sharded Data-Parallel 2 (FSDP2) with Tensor Parallelism (DTensor). These PyTorch APIs are currently still - experimental in PyTorch. Requires PyTorch 2.3 or newer. + experimental in PyTorch. Requires PyTorch 2.4 or newer. Arguments: parallelize_fn: A function that applies parallelisms to a module. The strategy will provide the @@ -95,8 +95,8 @@ def __init__( timeout: Optional[timedelta] = default_pg_timeout, ) -> None: super().__init__() - if not _TORCH_GREATER_EQUAL_2_3: - raise ImportError(f"{type(self).__name__} requires PyTorch 2.3 or higher.") + if not _TORCH_GREATER_EQUAL_2_4: + raise ImportError(f"{type(self).__name__} requires PyTorch 2.4 or higher.") self._parallelize_fn = parallelize_fn self._data_parallel_size = data_parallel_size self._tensor_parallel_size = tensor_parallel_size @@ -178,7 +178,7 @@ def setup_module(self, module: TModel) -> TModel: if any(isinstance(mod, FullyShardedDataParallel) for mod in module.modules()): raise TypeError( "Found modules that are wrapped with `torch.distributed.fsdp.FullyShardedDataParallel`." - f" The `{self.__class__.__name__}` only supports the new FSDP2 APIs in PyTorch >= 2.3." + f" The `{self.__class__.__name__}` only supports the new FSDP2 APIs in PyTorch >= 2.4." 
) module = self._parallelize_fn(module, self.device_mesh) @@ -329,10 +329,10 @@ def __init__(self, module: Module, enabled: bool) -> None: self._enabled = enabled def _set_requires_grad_sync(self, requires_grad_sync: bool) -> None: - from torch.distributed._composable.fsdp import FSDP + from torch.distributed._composable.fsdp import FSDPModule for mod in self._module.modules(): - if isinstance(mod, FSDP): + if isinstance(mod, FSDPModule): mod.set_requires_gradient_sync(requires_grad_sync, recurse=False) def __enter__(self) -> None: @@ -458,9 +458,6 @@ def _load_checkpoint( return metadata if _is_full_checkpoint(path): - if not _TORCH_GREATER_EQUAL_2_4: - raise ImportError("Loading a non-distributed checkpoint into a distributed model requires PyTorch >= 2.4.") - checkpoint = torch.load(path, mmap=True, map_location="cpu") _load_raw_module_state(checkpoint.pop(module_key), module, strict=strict) @@ -546,9 +543,6 @@ def _load_raw_module_state( from torch.distributed.fsdp import FullyShardedDataParallel as FSDP if _has_dtensor_modules(module): - if not _TORCH_GREATER_EQUAL_2_4: - raise ImportError("Loading a non-distributed checkpoint into a distributed model requires PyTorch >= 2.4.") - from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict state_dict_options = StateDictOptions( diff --git a/src/lightning/fabric/utilities/imports.py b/src/lightning/fabric/utilities/imports.py index fc40175ff53a4..c8fa1ddf1e083 100644 --- a/src/lightning/fabric/utilities/imports.py +++ b/src/lightning/fabric/utilities/imports.py @@ -28,7 +28,7 @@ _TORCH_GREATER_EQUAL_2_2 = compare_version("torch", operator.ge, "2.2.0") _TORCH_GREATER_EQUAL_2_3 = compare_version("torch", operator.ge, "2.3.0") -_TORCH_GREATER_EQUAL_2_4 = compare_version("torch", operator.ge, "2.4.0", use_base_version=True) +_TORCH_GREATER_EQUAL_2_4 = compare_version("torch", operator.ge, "2.4.0") _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) diff --git a/src/lightning/fabric/utilities/testing/_runif.py b/src/lightning/fabric/utilities/testing/_runif.py index f28265104878d..6f0513465cab5 100644 --- a/src/lightning/fabric/utilities/testing/_runif.py +++ b/src/lightning/fabric/utilities/testing/_runif.py @@ -24,6 +24,7 @@ from lightning.fabric.accelerators.cuda import num_cuda_devices from lightning.fabric.accelerators.mps import MPSAccelerator from lightning.fabric.strategies.deepspeed import _DEEPSPEED_AVAILABLE +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_4 def _runif_reasons( @@ -111,7 +112,9 @@ def _runif_reasons( reasons.append("Standalone execution") kwargs["standalone"] = True - if deepspeed and not (_DEEPSPEED_AVAILABLE and RequirementCache(module="deepspeed.utils")): + if deepspeed and not ( + _DEEPSPEED_AVAILABLE and not _TORCH_GREATER_EQUAL_2_4 and RequirementCache(module="deepspeed.utils") + ): reasons.append("Deepspeed") if dynamo: diff --git a/tests/tests_fabric/conftest.py b/tests/tests_fabric/conftest.py index 8b0d83d7f2990..446994167d0a1 100644 --- a/tests/tests_fabric/conftest.py +++ b/tests/tests_fabric/conftest.py @@ -104,6 +104,7 @@ def thread_police_duuu_daaa_duuu_daaa(): elif ( thread.name == "QueueFeederThread" # tensorboardX or thread.name == "QueueManagerThread" # torch.compile + or "(_read_thread)" in thread.name # torch.compile ): thread.join(timeout=20) elif ( diff --git a/tests/tests_fabric/plugins/precision/test_amp.py 
b/tests/tests_fabric/plugins/precision/test_amp.py index 34f14b8871ea3..93d53eb406f71 100644 --- a/tests/tests_fabric/plugins/precision/test_amp.py +++ b/tests/tests_fabric/plugins/precision/test_amp.py @@ -17,11 +17,13 @@ import pytest import torch from lightning.fabric.plugins.precision.amp import MixedPrecision +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_4 def test_amp_precision_default_scaler(): precision = MixedPrecision(precision="16-mixed", device=Mock()) - assert isinstance(precision.scaler, torch.cuda.amp.GradScaler) + scaler_cls = torch.amp.GradScaler if _TORCH_GREATER_EQUAL_2_4 else torch.cuda.amp.GradScaler + assert isinstance(precision.scaler, scaler_cls) def test_amp_precision_scaler_with_bf16(): @@ -36,7 +38,8 @@ def test_amp_precision_forward_context(): """Test to ensure that the context manager correctly is set to bfloat16 on CPU and CUDA.""" precision = MixedPrecision(precision="16-mixed", device="cuda") assert precision.device == "cuda" - assert isinstance(precision.scaler, torch.cuda.amp.GradScaler) + scaler_cls = torch.amp.GradScaler if _TORCH_GREATER_EQUAL_2_4 else torch.cuda.amp.GradScaler + assert isinstance(precision.scaler, scaler_cls) assert torch.get_default_dtype() == torch.float32 with precision.forward_context(): assert torch.get_autocast_gpu_dtype() == torch.float16 diff --git a/tests/tests_fabric/plugins/precision/test_amp_integration.py b/tests/tests_fabric/plugins/precision/test_amp_integration.py index 5d88a7d9babb9..aa6c6cfce4504 100644 --- a/tests/tests_fabric/plugins/precision/test_amp_integration.py +++ b/tests/tests_fabric/plugins/precision/test_amp_integration.py @@ -17,6 +17,7 @@ import torch import torch.nn as nn from lightning.fabric import Fabric, seed_everything +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_4 from tests_fabric.helpers.runif import RunIf @@ -82,7 +83,8 @@ def run(fused=False): optimizer = torch.optim.Adam(model.parameters(), lr=1.0, fused=fused) model, optimizer = fabric.setup(model, optimizer) - assert isinstance(fabric._precision.scaler, torch.cuda.amp.GradScaler) + scaler_cls = torch.amp.GradScaler if _TORCH_GREATER_EQUAL_2_4 else torch.cuda.amp.GradScaler + assert isinstance(fabric._precision.scaler, scaler_cls) data = torch.randn(10, 10, device="cuda") target = torch.randn(10, 10, device="cuda") diff --git a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py index c45adef192e68..31d655fc4c082 100644 --- a/tests/tests_fabric/plugins/precision/test_bitsandbytes.py +++ b/tests/tests_fabric/plugins/precision/test_bitsandbytes.py @@ -93,7 +93,7 @@ def __init__(self): precision.convert_module(model) -@RunIf(min_cuda_gpus=1) +@RunIf(min_cuda_gpus=1, max_torch="2.4") @pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable") @pytest.mark.parametrize( ("args", "expected"), @@ -232,7 +232,7 @@ def __init__(self): assert model.l.weight.dtype == expected -@RunIf(min_cuda_gpus=1) +@RunIf(min_cuda_gpus=1, max_torch="2.4") @pytest.mark.skipif(not _BITSANDBYTES_AVAILABLE, reason="bitsandbytes unavailable") def test_load_quantized_checkpoint(tmp_path): """Test that a checkpoint saved from a quantized model can be loaded back into a quantized model.""" diff --git a/tests/tests_fabric/strategies/test_dp.py b/tests/tests_fabric/strategies/test_dp.py index 572bbd20d357c..e50abb1882870 100644 --- a/tests/tests_fabric/strategies/test_dp.py +++ b/tests/tests_fabric/strategies/test_dp.py @@ -74,6 +74,7 @@ 
def __instancecheck__(self, instance): assert strategy.get_module_state_dict(wrapped_module).keys() == original_module.state_dict().keys() +@pytest.mark.filterwarnings("ignore::FutureWarning") @pytest.mark.parametrize( "precision", [ diff --git a/tests/tests_fabric/strategies/test_fsdp_integration.py b/tests/tests_fabric/strategies/test_fsdp_integration.py index e324ab5056d47..77b2f975d28fe 100644 --- a/tests/tests_fabric/strategies/test_fsdp_integration.py +++ b/tests/tests_fabric/strategies/test_fsdp_integration.py @@ -118,7 +118,7 @@ def get_model(self): return model -@RunIf(min_cuda_gpus=2, standalone=True) +@RunIf(min_cuda_gpus=2, standalone=True, max_torch="2.4") @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))]) @pytest.mark.parametrize("manual_wrapping", [True, False]) def test_train_save_load(tmp_path, manual_wrapping, precision): @@ -173,6 +173,7 @@ def test_train_save_load(tmp_path, manual_wrapping, precision): assert state["coconut"] == 11 +@pytest.mark.filterwarnings("ignore::FutureWarning") @RunIf(min_cuda_gpus=2, standalone=True) def test_save_full_state_dict(tmp_path): """Test that FSDP saves the full state into a single file with `state_dict_type="full"`.""" @@ -287,6 +288,7 @@ def test_save_full_state_dict(tmp_path): trainer.run() +@pytest.mark.filterwarnings("ignore::FutureWarning") @RunIf(min_cuda_gpus=2, standalone=True) def test_load_full_state_dict_into_sharded_model(tmp_path): """Test that the strategy can load a full-state checkpoint into a FSDP sharded model.""" @@ -469,6 +471,7 @@ def _run_setup_assertions(empty_init, expected_device): _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) +@pytest.mark.filterwarnings("ignore::FutureWarning") @RunIf(min_cuda_gpus=2, standalone=True) def test_save_filter(tmp_path): fabric = Fabric(accelerator="cuda", strategy=FSDPStrategy(state_dict_type="full"), devices=2) @@ -602,6 +605,7 @@ def test_clip_gradients(clip_type, precision): optimizer.zero_grad() +@pytest.mark.filterwarnings("ignore::FutureWarning") @RunIf(min_cuda_gpus=2, standalone=True, min_torch="2.3.0") def test_save_sharded_and_consolidate_and_load(tmp_path): """Test the consolidation of a FSDP-sharded checkpoint into a single file.""" diff --git a/tests/tests_fabric/strategies/test_model_parallel.py b/tests/tests_fabric/strategies/test_model_parallel.py index 03b9268b3158e..1f8b5b783b73e 100644 --- a/tests/tests_fabric/strategies/test_model_parallel.py +++ b/tests/tests_fabric/strategies/test_model_parallel.py @@ -28,20 +28,20 @@ from tests_fabric.helpers.runif import RunIf -@mock.patch("lightning.fabric.strategies.model_parallel._TORCH_GREATER_EQUAL_2_3", False) -def test_torch_greater_equal_2_3(): - with pytest.raises(ImportError, match="ModelParallelStrategy requires PyTorch 2.3 or higher"): +@mock.patch("lightning.fabric.strategies.model_parallel._TORCH_GREATER_EQUAL_2_4", False) +def test_torch_greater_equal_2_4(): + with pytest.raises(ImportError, match="ModelParallelStrategy requires PyTorch 2.4 or higher"): ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_device_mesh_access(): strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) with pytest.raises(RuntimeError, match="Accessing the device mesh .* not allowed"): _ = strategy.device_mesh -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") @pytest.mark.parametrize( ("num_nodes", "devices", "invalid_dp_size", "invalid_tp_size"), [ @@ -70,7 +70,7 @@ def 
test_validate_device_mesh_dimensions(num_nodes, devices, invalid_dp_size, in strategy.setup_environment() -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_checkpoint_io_unsupported(): """Test that the ModelParallel strategy does not support the `CheckpointIO` plugin.""" strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) @@ -81,18 +81,18 @@ def test_checkpoint_io_unsupported(): strategy.checkpoint_io = Mock() -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_fsdp_v1_modules_unsupported(): """Test that the strategy won't allow setting up a module wrapped with the legacy FSDP API.""" from torch.distributed.fsdp import FullyShardedDataParallel module = Mock(modules=Mock(return_value=[Mock(spec=FullyShardedDataParallel)])) strategy = ModelParallelStrategy(parallelize_fn=(lambda x, _: x)) - with pytest.raises(TypeError, match="only supports the new FSDP2 APIs in PyTorch >= 2.3"): + with pytest.raises(TypeError, match="only supports the new FSDP2 APIs in PyTorch >= 2.4"): strategy.setup_module(module) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_parallelize_fn_call(): model = nn.Linear(2, 2) optimizer = Adam(model.parameters()) @@ -116,15 +116,15 @@ def test_parallelize_fn_call(): strategy.setup_module_and_optimizers(model, [optimizer]) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_no_backward_sync(): """Test that the backward sync control disables gradient sync on modules that benefit from it.""" - from torch.distributed._composable.fsdp import FSDP + from torch.distributed._composable.fsdp import FSDPModule strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) assert isinstance(strategy._backward_sync_control, _ParallelBackwardSyncControl) - fsdp_layer = Mock(spec=FSDP) + fsdp_layer = Mock(spec=FSDPModule) other_layer = nn.Linear(2, 2) module = Mock() module.modules = Mock(return_value=[fsdp_layer, other_layer]) @@ -138,7 +138,7 @@ def test_no_backward_sync(): fsdp_layer.set_requires_gradient_sync.assert_called_with(False, recurse=False) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_save_checkpoint_storage_options(tmp_path): """Test that the strategy does not accept storage options for saving checkpoints.""" strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) @@ -148,7 +148,7 @@ def test_save_checkpoint_storage_options(tmp_path): strategy.save_checkpoint(path=tmp_path, state=Mock(), storage_options=Mock()) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") @mock.patch("lightning.fabric.strategies.model_parallel.ModelParallelStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True) @mock.patch("torch.distributed.checkpoint.state_dict.get_model_state_dict", return_value={}) @@ -205,7 +205,7 @@ def test_save_checkpoint_path_exists(shutil_mock, torch_save_mock, _, __, ___, t assert path.is_dir() -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_save_checkpoint_one_dist_module_required(tmp_path): """Test that the ModelParallelStrategy strategy can only save one distributed model per checkpoint.""" strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) @@ -226,29 +226,7 @@ def test_save_checkpoint_one_dist_module_required(tmp_path): strategy.save_checkpoint(path=tmp_path, state={"model1": model1, "model2": model2}) -@RunIf(min_torch="2.3") -@mock.patch("lightning.fabric.strategies.model_parallel.torch.load", Mock()) -@mock.patch("lightning.fabric.strategies.model_parallel._TORCH_GREATER_EQUAL_2_4", False) 
-def test_load_full_checkpoint_support(tmp_path): - """Test that loading non-distributed checkpoints into distributed models requires PyTorch >= 2.4.""" - strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) - model = Mock(spec=nn.Module) - model.parameters.return_value = [torch.zeros(2, 1)] - path = tmp_path / "full.ckpt" - path.touch() - - with pytest.raises(ImportError, match="Loading .* into a distributed model requires PyTorch >= 2.4"), mock.patch( - "lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True - ): - strategy.load_checkpoint(path=path, state={"model": model}) - - with pytest.raises(ImportError, match="Loading .* into a distributed model requires PyTorch >= 2.4"), mock.patch( - "lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True - ): - strategy.load_checkpoint(path=path, state=model) - - -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_load_checkpoint_no_state(tmp_path): """Test that the ModelParallelStrategy strategy can't load the full state without access to a model instance from the user.""" @@ -259,7 +237,7 @@ def test_load_checkpoint_no_state(tmp_path): strategy.load_checkpoint(path=tmp_path, state={}) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") @mock.patch("lightning.fabric.strategies.model_parallel.ModelParallelStrategy.broadcast", lambda _, x: x) @mock.patch("lightning.fabric.strategies.model_parallel.torch.load", Mock()) def test_load_checkpoint_one_dist_module_required(tmp_path): @@ -289,7 +267,7 @@ def test_load_checkpoint_one_dist_module_required(tmp_path): strategy.load_checkpoint(path=path, state=model) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") @mock.patch("lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True) def test_load_unknown_checkpoint_type(_, tmp_path): """Test that the strategy validates the contents at the checkpoint path.""" @@ -301,7 +279,7 @@ def test_load_unknown_checkpoint_type(_, tmp_path): strategy.load_checkpoint(path=path, state={"model": model}) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_load_raw_checkpoint_validate_single_file(tmp_path): """Test that we validate the given checkpoint is a single file when loading a raw PyTorch state-dict checkpoint.""" strategy = ModelParallelStrategy(parallelize_fn=(lambda m, _: m)) @@ -312,7 +290,7 @@ def test_load_raw_checkpoint_validate_single_file(tmp_path): strategy.load_checkpoint(path=path, state=model) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_load_raw_checkpoint_optimizer_unsupported(tmp_path): """Validate that the ModelParallelStrategy strategy does not yet support loading the raw PyTorch state-dict for an optimizer.""" @@ -324,7 +302,7 @@ def test_load_raw_checkpoint_optimizer_unsupported(tmp_path): strategy.load_checkpoint(path=tmp_path, state=optimizer) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") @mock.patch("lightning.fabric.strategies.model_parallel._setup_device_mesh") @mock.patch("torch.distributed.init_process_group") def test_set_timeout(init_process_group_mock, _): @@ -343,7 +321,7 @@ def test_set_timeout(init_process_group_mock, _): ) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") def test_meta_device_materialization(): """Test that the `setup_module()` method materializes meta-device tensors in the module.""" diff --git a/tests/tests_fabric/strategies/test_model_parallel_integration.py b/tests/tests_fabric/strategies/test_model_parallel_integration.py index 6db31d00f7397..e8a8e5b5a4554 100644 --- 
a/tests/tests_fabric/strategies/test_model_parallel_integration.py +++ b/tests/tests_fabric/strategies/test_model_parallel_integration.py @@ -80,7 +80,7 @@ def _parallelize_feed_forward_fsdp2_tp(model, device_mesh): return model -@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=4) +@RunIf(min_torch="2.4", standalone=True, min_cuda_gpus=4) def test_setup_device_mesh(): from torch.distributed.device_mesh import DeviceMesh @@ -116,7 +116,7 @@ def test_setup_device_mesh(): assert fabric.strategy.device_mesh.size(1) == 4 -@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=2) +@RunIf(min_torch="2.4", standalone=True, min_cuda_gpus=2) def test_tensor_parallel(): from torch.distributed._tensor import DTensor @@ -160,7 +160,7 @@ def test_tensor_parallel(): optimizer.zero_grad() -@RunIf(min_torch="2.3", standalone=True, min_cuda_gpus=4) +@RunIf(min_torch="2.4", standalone=True, min_cuda_gpus=4) def test_fsdp2_tensor_parallel(): from torch.distributed._tensor import DTensor @@ -237,7 +237,7 @@ def _train(fabric, model=None, optimizer=None): return model, optimizer -@RunIf(min_torch="2.3", min_cuda_gpus=4, standalone=True) +@RunIf(min_torch="2.4", min_cuda_gpus=4, standalone=True) @pytest.mark.parametrize( "precision", [ @@ -445,7 +445,7 @@ def test_load_full_state_dict_into_sharded_model(tmp_path): assert torch.equal(params_before, params_after) -@RunIf(min_torch="2.3", min_cuda_gpus=2, skip_windows=True, standalone=True) +@RunIf(min_torch="2.4", min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.parametrize("move_to_device", [True, False]) @mock.patch("lightning.fabric.wrappers._FabricModule") def test_setup_module_move_to_device(fabric_module_mock, move_to_device): @@ -471,7 +471,7 @@ def test_setup_module_move_to_device(fabric_module_mock, move_to_device): assert fabric.device == torch.device("cuda", fabric.local_rank) -@RunIf(min_torch="2.3", min_cuda_gpus=2, skip_windows=True, standalone=True) +@RunIf(min_torch="2.4", min_cuda_gpus=2, skip_windows=True, standalone=True) @pytest.mark.parametrize( ("precision", "expected_dtype"), [ @@ -502,7 +502,7 @@ def _run_setup_assertions(empty_init, expected_device): _run_setup_assertions(empty_init=True, expected_device=torch.device("meta")) -@RunIf(min_torch="2.3", min_cuda_gpus=2, standalone=True) +@RunIf(min_torch="2.4", min_cuda_gpus=2, standalone=True) def test_save_filter(tmp_path): strategy = ModelParallelStrategy( parallelize_fn=_parallelize_feed_forward_fsdp2, @@ -541,7 +541,7 @@ def _parallelize_single_linear_tp_fsdp2(model, device_mesh): return model -@RunIf(min_torch="2.3", min_cuda_gpus=2, standalone=True) +@RunIf(min_torch="2.4", min_cuda_gpus=2, standalone=True) @pytest.mark.parametrize( "precision", [ @@ -597,7 +597,7 @@ def test_clip_gradients(clip_type, precision): optimizer.zero_grad() -@RunIf(min_torch="2.3", min_cuda_gpus=4, standalone=True) +@RunIf(min_torch="2.4", min_cuda_gpus=4, standalone=True) def test_save_sharded_and_consolidate_and_load(tmp_path): """Test the consolidation of a distributed (DTensor) checkpoint into a single file.""" strategy = ModelParallelStrategy( diff --git a/tests/tests_fabric/test_connector.py b/tests/tests_fabric/test_connector.py index b6f6b03b37605..08d6dbb45ed91 100644 --- a/tests/tests_fabric/test_connector.py +++ b/tests/tests_fabric/test_connector.py @@ -868,7 +868,7 @@ def test_precision_selection_amp_ddp(strategy, devices, is_custom_plugin, plugin assert isinstance(connector.precision, plugin_cls) -@RunIf(min_torch="2.3") +@RunIf(min_torch="2.4") @pytest.mark.parametrize( 
("precision", "raises"), [("32-true", False), ("16-true", False), ("bf16-true", False), ("16-mixed", True), ("bf16-mixed", False)], diff --git a/tests/tests_fabric/test_wrappers.py b/tests/tests_fabric/test_wrappers.py index b89a536aff84c..26223b47f8c5e 100644 --- a/tests/tests_fabric/test_wrappers.py +++ b/tests/tests_fabric/test_wrappers.py @@ -685,7 +685,7 @@ def test_unwrap_compiled(): assert unwrapped is compiled._orig_mod assert compile_kwargs == {"fullgraph": True, "dynamic": True, "disable": False} - del compiled._compile_kwargs + compiled._compile_kwargs = None with pytest.raises(RuntimeError, match="Failed to determine the arguments that were used to compile the module"): _unwrap_compiled(compiled) diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py index e22593f391811..4f30ae8fef3d0 100644 --- a/tests/tests_pytorch/strategies/test_model_parallel.py +++ b/tests/tests_pytorch/strategies/test_model_parallel.py @@ -174,27 +174,6 @@ def test_save_checkpoint_path_exists(shutil_mock, torch_save_mock, tmp_path): assert path.is_dir() -@RunIf(min_torch="2.3") -@mock.patch("lightning.fabric.strategies.model_parallel._TORCH_GREATER_EQUAL_2_4", False) -def test_load_full_checkpoint_support(tmp_path): - """Test that loading non-distributed checkpoints into distributed models requires PyTorch >= 2.4.""" - strategy = ModelParallelStrategy() - strategy.model = Mock() - strategy._lightning_module = Mock(strict_loading=True) - path = tmp_path / "full.ckpt" - path.touch() - - with pytest.raises(ImportError, match="Loading .* into a distributed model requires PyTorch >= 2.4"), mock.patch( - "lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True - ): - strategy.load_checkpoint(checkpoint_path=path) - - with pytest.raises(ImportError, match="Loading .* into a distributed model requires PyTorch >= 2.4"), mock.patch( - "lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True - ): - strategy.load_checkpoint(checkpoint_path=path) - - @RunIf(min_torch="2.3") @mock.patch("lightning.fabric.strategies.model_parallel._has_dtensor_modules", return_value=True) def test_load_unknown_checkpoint_type(_, tmp_path): From b8a5236daa924aa6e57a1418f768657a9d7f0015 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 4 Jul 2024 09:39:26 +0200 Subject: [PATCH 098/179] Move checkpoint customization docs sections (#20006) move docs sections --- .../common/checkpointing_advanced.rst | 79 ------------------ .../common/checkpointing_intermediate.rst | 83 +++++++++++++++++++ 2 files changed, 83 insertions(+), 79 deletions(-) diff --git a/docs/source-pytorch/common/checkpointing_advanced.rst b/docs/source-pytorch/common/checkpointing_advanced.rst index dc99f184d61cb..b72dcb430cb29 100644 --- a/docs/source-pytorch/common/checkpointing_advanced.rst +++ b/docs/source-pytorch/common/checkpointing_advanced.rst @@ -39,82 +39,3 @@ To resume training from a cloud checkpoint use a cloud url. trainer.fit(model, ckpt_path="s3://my_bucket/ckpts/classifier.ckpt") PyTorch Lightning uses `fsspec `_ internally to handle all filesystem operations. - ----- - -*************************** -Modularize your checkpoints -*************************** -Checkpoints can also save the state of :doc:`datamodules <../extensions/datamodules_state>` and :doc:`callbacks <../extensions/callbacks_state>`. 
- ----- - -**************************** -Modify a checkpoint anywhere -**************************** -When you need to change the components of a checkpoint before saving or loading, use the :meth:`~lightning.pytorch.core.hooks.CheckpointHooks.on_save_checkpoint` and :meth:`~lightning.pytorch.core.hooks.CheckpointHooks.on_load_checkpoint` of your ``LightningModule``. - -.. code-block:: python - - class LitModel(L.LightningModule): - def on_save_checkpoint(self, checkpoint): - checkpoint["something_cool_i_want_to_save"] = my_cool_pickable_object - - def on_load_checkpoint(self, checkpoint): - my_cool_pickable_object = checkpoint["something_cool_i_want_to_save"] - -Use the above approach when you need to couple this behavior to your LightningModule for reproducibility reasons. Otherwise, Callbacks also have the :meth:`~lightning.pytorch.callbacks.callback.Callback.on_save_checkpoint` and :meth:`~lightning.pytorch.callbacks.callback.Callback.on_load_checkpoint` which you should use instead: - -.. code-block:: python - - import lightning as L - - - class LitCallback(L.Callback): - def on_save_checkpoint(self, checkpoint): - checkpoint["something_cool_i_want_to_save"] = my_cool_pickable_object - - def on_load_checkpoint(self, checkpoint): - my_cool_pickable_object = checkpoint["something_cool_i_want_to_save"] - - ----- - - -******************************** -Resume from a partial checkpoint -******************************** - -Loading a checkpoint is normally "strict", meaning parameter names in the checkpoint must match the parameter names in the model or otherwise PyTorch will raise an error. -In use cases where you want to load only a partial checkpoint, you can disable strict loading by setting ``self.strict_loading = False`` in the LightningModule to avoid errors. -A common use case is when you have a pretrained feature extractor or encoder that you don't update during training, and you don't want it included in the checkpoint: - -.. code-block:: python - - import lightning as L - - class LitModel(L.LightningModule): - def __init__(self): - super().__init__() - - # This model only trains the decoder, we don't save the encoder - self.encoder = from_pretrained(...).requires_grad_(False) - self.decoder = Decoder() - - # Set to False because we only care about the decoder - self.strict_loading = False - - def state_dict(self): - # Don't save the encoder, it is not being trained - return {k: v for k, v in super().state_dict().items() if "encoder" not in k} - - -Since ``strict_loading`` is set to ``False``, you won't get any key errors when resuming the checkpoint with the Trainer: - -.. code-block:: python - - trainer = Trainer() - model = LitModel() - - # Will load weights with `.load_state_dict(strict=model.strict_loading)` - trainer.fit(model, ckpt_path="path/to/checkpoint") diff --git a/docs/source-pytorch/common/checkpointing_intermediate.rst b/docs/source-pytorch/common/checkpointing_intermediate.rst index a682673459b32..ec124d4e6e27e 100644 --- a/docs/source-pytorch/common/checkpointing_intermediate.rst +++ b/docs/source-pytorch/common/checkpointing_intermediate.rst @@ -175,3 +175,86 @@ In distributed training cases where a model is running across many machines, Lig By using :meth:`~lightning.pytorch.trainer.trainer.Trainer.save_checkpoint` instead of ``torch.save``, you make your code agnostic to the distributed training strategy being used. 
It will ensure that checkpoints are saved correctly in a multi-process setting, avoiding race conditions, deadlocks and other common issues that normally require boilerplate code to handle properly. + + +---- + + +*************************** +Modularize your checkpoints +*************************** +Checkpoints can also save the state of :doc:`datamodules <../extensions/datamodules_state>` and :doc:`callbacks <../extensions/callbacks_state>`. + + +---- + + +**************************** +Modify a checkpoint anywhere +**************************** +When you need to change the components of a checkpoint before saving or loading, use the :meth:`~lightning.pytorch.core.hooks.CheckpointHooks.on_save_checkpoint` and :meth:`~lightning.pytorch.core.hooks.CheckpointHooks.on_load_checkpoint` of your ``LightningModule``. + +.. code-block:: python + + class LitModel(L.LightningModule): + def on_save_checkpoint(self, checkpoint): + checkpoint["something_cool_i_want_to_save"] = my_cool_pickable_object + + def on_load_checkpoint(self, checkpoint): + my_cool_pickable_object = checkpoint["something_cool_i_want_to_save"] + +Use the above approach when you need to couple this behavior to your LightningModule for reproducibility reasons. Otherwise, Callbacks also have the :meth:`~lightning.pytorch.callbacks.callback.Callback.on_save_checkpoint` and :meth:`~lightning.pytorch.callbacks.callback.Callback.on_load_checkpoint` which you should use instead: + +.. code-block:: python + + import lightning as L + + + class LitCallback(L.Callback): + def on_save_checkpoint(self, checkpoint): + checkpoint["something_cool_i_want_to_save"] = my_cool_pickable_object + + def on_load_checkpoint(self, checkpoint): + my_cool_pickable_object = checkpoint["something_cool_i_want_to_save"] + + +---- + + +******************************** +Resume from a partial checkpoint +******************************** + +Loading a checkpoint is normally "strict", meaning parameter names in the checkpoint must match the parameter names in the model or otherwise PyTorch will raise an error. +In use cases where you want to load only a partial checkpoint, you can disable strict loading by setting ``self.strict_loading = False`` in the LightningModule to avoid errors. +A common use case is when you have a pretrained feature extractor or encoder that you don't update during training, and you don't want it included in the checkpoint: + +.. code-block:: python + + import lightning as L + + class LitModel(L.LightningModule): + def __init__(self): + super().__init__() + + # This model only trains the decoder, we don't save the encoder + self.encoder = from_pretrained(...).requires_grad_(False) + self.decoder = Decoder() + + # Set to False because we only care about the decoder + self.strict_loading = False + + def state_dict(self): + # Don't save the encoder, it is not being trained + return {k: v for k, v in super().state_dict().items() if "encoder" not in k} + + +Since ``strict_loading`` is set to ``False``, you won't get any key errors when resuming the checkpoint with the Trainer: + +.. 
code-block:: python + + trainer = Trainer() + model = LitModel() + + # Will load weights with `.load_state_dict(strict=model.strict_loading)` + trainer.fit(model, ckpt_path="path/to/checkpoint") From c00ed5cb930af7d9056e9c4d6ee70a8b8b2a2b08 Mon Sep 17 00:00:00 2001 From: PL Ghost <75324987+pl-ghost@users.noreply.github.com> Date: Thu, 4 Jul 2024 11:38:23 +0200 Subject: [PATCH 099/179] Adding test for legacy checkpoint created with 2.3.2 (#20042) --- tests/legacy/back-compatible-versions.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/legacy/back-compatible-versions.txt b/tests/legacy/back-compatible-versions.txt index 06ebce5658d0f..a5679497e63fe 100644 --- a/tests/legacy/back-compatible-versions.txt +++ b/tests/legacy/back-compatible-versions.txt @@ -102,3 +102,4 @@ 2.2.5 2.3.0 2.3.1 +2.3.2 From 90f23b34e956ac19eda4f5b41b9d1339595fd207 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Thu, 4 Jul 2024 16:05:36 +0200 Subject: [PATCH 100/179] Add audio example to README (#20044) add audio example to readme --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8af11d2a54167..b39656c4e1729 100644 --- a/README.md +++ b/README.md @@ -133,11 +133,12 @@ ______________________________________________________________________ ### Examples Explore various types of training possible with PyTorch Lightning. Pretrain and finetune ANY kind of model to perform ANY task like classification, segmentation, summarization and more: -| Task | Description | Run | -|---|---|---| -| [Hello world](#hello-simple-model) | Pretrain - Hello world example | Open In Studio | -| [Image segmentation](https://lightning.ai/lightning-ai/studios/image-segmentation-with-pytorch-lightning) | Finetune - ResNet-50 model to segment images | Open In Studio | -| [Text classification](https://lightning.ai/lightning-ai/studios/text-classification-with-pytorch-lightning) | Finetune - text classifier (BERT model) | Open In Studio | +| Task | Description | Run | +|-------------------------------------------------------------------------------------------------------------|------------------------------------------------|---| +| [Hello world](#hello-simple-model) | Pretrain - Hello world example | Open In Studio | +| [Image segmentation](https://lightning.ai/lightning-ai/studios/image-segmentation-with-pytorch-lightning) | Finetune - ResNet-50 model to segment images | Open In Studio | +| [Text classification](https://lightning.ai/lightning-ai/studios/text-classification-with-pytorch-lightning) | Finetune - text classifier (BERT model) | Open In Studio | +| [Audio generation](https://lightning.ai/lightning-ai/studios/finetune-a-personal-ai-music-generator) | Finetune - audio generator (transformer model) | Open In Studio | ### Hello simple model From 330af381de88cff17515418a341cbc1f9f127f9a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Fri, 5 Jul 2024 13:56:29 +0200 Subject: [PATCH 101/179] Remove the lightning app code (#20039) * remove source, tests, docs, workflows * update checkgroup * update codeowners * update workflows * package setup * config files * update * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove entry point * docs * __main__ * remove store * leftover store removals --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .actions/assistant.py | 33 +- .azure/app-cloud-e2e.yml | 206 -- .github/CODEOWNERS | 10 - .github/CONTRIBUTING.md | 14 - 
.github/ISSUE_TEMPLATE/1_bug_report.yaml | 2 - .github/actions/pkg-install/action.yml | 2 +- .github/actions/prep-apps/action.yml | 38 - .github/checkgroup.yml | 121 - .github/label-change.yml | 20 - .github/workflows/README.md | 3 - .github/workflows/_build-packages.yml | 2 +- .github/workflows/_flagship-apps.yml | 122 - .github/workflows/ci-examples-app.yml | 136 -- .github/workflows/ci-flagship-apps.yml | 27 - .github/workflows/ci-pkg-install.yml | 32 +- .github/workflows/ci-tests-app.yml | 175 -- .github/workflows/ci-tests-store.yml | 96 - .github/workflows/code-checks.yml | 1 - .github/workflows/docs-build.yml | 6 +- .github/workflows/release-pkg.yml | 4 +- .gitignore | 7 - .lightningignore | 16 - .pre-commit-config.yaml | 1 - .readthedocs.yml | 3 +- Makefile | 10 - docs/crossroad.html | 1 - docs/rtfd-build.sh | 2 +- docs/source-app/Makefile | 19 - docs/source-app/_static/copybutton.js | 78 - docs/source-app/_static/images/icon.svg | 9 - docs/source-app/_static/images/logo-large.svg | 9 - docs/source-app/_static/images/logo-small.svg | 9 - docs/source-app/_static/images/logo.png | Bin 16014 -> 0 bytes docs/source-app/_static/images/logo.svg | 22 - docs/source-app/_static/main.css | 3 - docs/source-app/_templates/classtemplate.rst | 9 - .../_templates/classtemplate_no_index.rst | 12 - docs/source-app/_templates/layout.html | 16 - .../_templates/theme_variables.jinja | 18 - docs/source-app/api_reference/components.rst | 35 - docs/source-app/api_reference/core.rst | 26 - docs/source-app/api_reference/frontend.rst | 25 - docs/source-app/api_reference/runners.rst | 21 - docs/source-app/api_reference/storage.rst | 71 - docs/source-app/basics.rst | 259 --- docs/source-app/code_samples/basics/0.py | 19 - docs/source-app/code_samples/basics/1.py | 22 - .../code_samples/convert_pl_to_app/app.py | 17 - .../convert_pl_to_app/requirements.txt | 3 - .../code_samples/convert_pl_to_app/train.py | 46 - .../code_samples/quickstart/__init__.py | 0 .../code_samples/quickstart/app/__init__.py | 0 .../code_samples/quickstart/app/app_0.py | 21 - .../code_samples/quickstart/app/app_1.py | 92 - .../code_samples/quickstart/app_01.py | 27 - .../code_samples/quickstart/app_02.py | 32 - .../code_samples/quickstart/app_03.py | 31 - .../code_samples/quickstart/app_comp.py | 26 - .../quickstart/hello_world/app.py | 15 - .../quickstart/hello_world/app_ui.py | 57 - .../quickstart/hello_world/ui/index.html | 1 - docs/source-app/conf.py | 412 ---- docs/source-app/contribute_app.rst | 7 - docs/source-app/core_api/lightning_app/app.py | 27 - .../core_api/lightning_app/communication.rst | 15 - .../lightning_app/communication_content.rst | 160 -- .../lightning_app/compute_content.rst | 40 - .../core_api/lightning_app/dynamic_work.rst | 15 - .../lightning_app/dynamic_work_content.rst | 202 -- .../core_api/lightning_app/index.rst | 94 - .../core_api/lightning_app/lightning_app.rst | 12 - docs/source-app/core_api/lightning_flow.rst | 8 - .../core_api/lightning_work/compute.rst | 15 - .../lightning_work/compute_content.rst | 94 - .../lightning_work/handling_app_exception.rst | 13 - .../handling_app_exception_content.rst | 74 - .../core_api/lightning_work/index.rst | 112 - .../lightning_work/lightning_work.rst | 11 - .../core_api/lightning_work/payload.rst | 15 - .../lightning_work/payload_content.rst | 75 - .../core_api/lightning_work/status.rst | 13 - .../lightning_work/status_content.rst | 197 -- docs/source-app/core_api/overview.rst | 40 - docs/source-app/examples/dag/dag.rst | 81 - .../examples/dag/dag_from_scratch.rst | 53 
- docs/source-app/examples/data_explore_app.rst | 5 - docs/source-app/examples/etl_app.rst | 5 - docs/source-app/examples/file_server/app.py | 243 -- .../examples/file_server/file_server.rst | 13 - .../file_server/file_server_content.rst | 85 - .../file_server/file_server_step_1.rst | 49 - .../file_server/file_server_step_2.rst | 75 - .../file_server/file_server_step_3.rst | 54 - .../file_server/file_server_step_4.rst | 127 -- .../examples/github_repo_runner/app.py | 309 --- .../github_repo_runner/github_repo_runner.rst | 15 - .../github_repo_runner_content.rst | 97 - .../github_repo_runner_step_1.rst | 62 - .../github_repo_runner_step_2.rst | 68 - .../github_repo_runner_step_3.rst | 62 - .../github_repo_runner_step_4.rst | 86 - .../github_repo_runner_step_5.rst | 75 - docs/source-app/examples/hands_on_example.rst | 50 - docs/source-app/examples/index.rst | 36 - .../examples/model_server_app/app.py | 34 - .../model_server_app/load_testing.rst | 57 - .../model_server_app/locust_component.py | 43 - .../examples/model_server_app/locustfile.py | 41 - .../examples/model_server_app/model_server.py | 90 - .../model_server_app/model_server.rst | 48 - .../model_server_app/model_server_app.rst | 15 - .../model_server_app_content.rst | 84 - .../putting_everything_together.rst | 80 - .../examples/model_server_app/train.py | 41 - .../examples/model_server_app/train.rst | 49 - .../source-app/examples/research_demo_app.rst | 5 - .../get_started/add_an_interactive_demo.rst | 15 - docs/source-app/get_started/build_model.rst | 73 - .../get_started/go_beyond_training.rst | 14 - .../go_beyond_training_content.rst | 405 ---- .../jumpstart_from_app_gallery.rst | 123 - .../jumpstart_from_component_gallery.rst | 151 -- .../get_started/training_with_apps.rst | 125 - .../get_started/what_app_can_do.rst | 187 -- docs/source-app/glossary/app_tree.rst | 113 - .../glossary/build_config/build_config.rst | 43 - .../build_config/build_config_advanced.rst | 63 - .../build_config/build_config_basic.rst | 68 - .../build_config_intermediate.rst | 56 - .../glossary/command_lines/command_lines.rst | 69 - docs/source-app/glossary/dag.rst | 46 - docs/source-app/glossary/debug_app.rst | 3 - docs/source-app/glossary/distributed_fe.rst | 5 - .../glossary/distributed_hardware.rst | 5 - .../glossary/environment_variables.rst | 27 - docs/source-app/glossary/event_loop.rst | 11 - docs/source-app/glossary/fault_tolerance.rst | 7 - docs/source-app/glossary/index.rst | 155 -- docs/source-app/glossary/ios_and_android.rst | 26 - .../glossary/lightning_app_overview/index.rst | 11 - docs/source-app/glossary/mount.rst | 1 - .../glossary/restful_api/restful_api.rst | 53 - docs/source-app/glossary/scheduling.rst | 185 -- docs/source-app/glossary/secrets.rst | 74 - .../glossary/sharing_components.rst | 50 - .../glossary/storage/differences.rst | 78 - docs/source-app/glossary/storage/drive.rst | 13 - .../glossary/storage/drive_content.rst | 223 -- .../glossary/storage/drive_content_old.rst | 199 -- docs/source-app/glossary/storage/path.rst | 326 --- docs/source-app/glossary/storage/storage.rst | 77 - .../glossary/use_local_lightning.rst | 15 - docs/source-app/index.rst | 153 -- docs/source-app/install/install_beginner.rst | 117 - docs/source-app/install/installation.rst | 29 - docs/source-app/install/mac.bash | 5 - docs/source-app/install/pip.bash | 1 - docs/source-app/install/windows.bash | 4 - docs/source-app/intro.rst | 88 - docs/source-app/landing_app.py | 12 - docs/source-app/landing_app_run.bash | 5 - docs/source-app/levels/advanced/index.rst 
| 94 - docs/source-app/levels/advanced/level_16.rst | 10 - docs/source-app/levels/advanced/level_17.rst | 10 - docs/source-app/levels/advanced/level_18.rst | 10 - docs/source-app/levels/advanced/level_19.rst | 11 - docs/source-app/levels/advanced/level_20.rst | 11 - .../advanced/start_dynamic_components.rst | 38 - docs/source-app/levels/basic/build_a_dag.rst | 20 - .../basic/build_a_lightning_component.rst | 154 -- .../levels/basic/create_a_model_demo.rst | 20 - .../levels/basic/deploy_ai_model_api.rst | 20 - .../hello_components/code_run_cloud.bash | 1 - .../code_run_cloud_setup.bash | 1 - .../hello_components/code_run_local.bash | 1 - .../code_run_local_setup.bash | 1 - .../basic/hello_components/deploy_model.py | 31 - .../basic/hello_components/hello_world.py | 12 - .../basic/hello_components/hello_world_gpu.py | 12 - .../basic/hello_components/multi_node.py | 29 - .../basic/hello_components/pl_multinode.py | 20 - .../basic/hello_components/pt_multinode.py | 60 - .../basic/hello_components/run_ptl_script.py | 13 - .../basic/hello_components/streamlit_demo.py | 30 - .../basic/hello_components/train_ptl.py | 15 - .../basic/hello_components/train_pytorch.py | 28 - .../levels/basic/hello_components/xgboost.py | 21 - .../basic/hello_components/xgboost_gpu.py | 22 - .../levels/basic/hero_components.rst | 8 - docs/source-app/levels/basic/index.rst | 54 - .../levels/basic/key_features/accelerators.py | 21 - .../levels/basic/key_features/auto_timeout.py | 13 - .../basic/key_features/custom_container.py | 13 - .../levels/basic/key_features/idle_machine.py | 13 - .../basic/key_features/massive_dataset.py | 13 - .../levels/basic/key_features/mount_data.py | 13 - .../levels/basic/key_features/spot.py | 13 - ...al_lightning_component_implementations.rst | 75 - .../run_jupyter_notebook_on_the_cloud.rst | 20 - .../basic/save_money_on_cloud_costs.rst | 51 - .../basic/scripts/toy_app_1_component.py | 17 - .../basic/scripts/toy_app_1_component_pdb.py | 18 - .../basic/train_pytorch_on_the_cloud.rst | 20 - docs/source-app/levels/expert/index.rst | 82 - .../connect_lightning_components.rst | 116 - .../intermediate/debug_a_lightning_app.rst | 48 - .../debug_app_scripts/debug_app.py | 25 - .../intermediate/debug_app_scripts/toy_app.py | 24 - .../debug_app_scripts/toy_app_1_component.py | 17 - .../toy_app_1_component_pdb.py | 18 - .../embed_web_ui_into_lightningwork.rst | 40 - docs/source-app/levels/intermediate/index.rst | 87 - .../levels/intermediate/level_12.rst | 12 - .../level_2_scripts/code_run_cloud.bash | 1 - .../level_2_scripts/code_run_local.bash | 1 - .../intermediate/level_2_scripts/hello_app.py | 23 - .../level_2_scripts/hello_app_auto_scale.py | 30 - .../level_2_scripts/hello_app_cron.py | 27 - .../level_2_scripts/hello_app_scheduler.py | 27 - .../level_2_scripts/organized_app_python.py | 36 - .../levels/intermediate/level_9.rst | 16 - .../run_lightning_work_in_parallel.rst | 38 - .../levels/intermediate/scripts/.storage/a | Bin 22 -> 0 bytes .../intermediate/scripts/.storage/embeddings | Bin 36 -> 0 bytes docs/source-app/levels/intermediate/scripts/a | Bin 22 -> 0 bytes .../levels/intermediate/scripts/comms_1.py | 18 - .../levels/intermediate/scripts/debug_app.py | 25 - .../levels/intermediate/scripts/embeddings | Bin 36 -> 0 bytes .../levels/intermediate/scripts/toy_app.py | 24 - .../intermediate/scripts/toy_payload.py | 31 - .../intermediate/scripts/two_comms_non_ml.py | 36 - .../intermediate/scripts/two_work_comms.py | 35 - .../share_files_between_components.rst | 34 - 
...variables_between_lightning_components.rst | 162 -- .../start_from_lightning_app_templates.rst | 34 - docs/source-app/make.bat | 35 - docs/source-app/moving_to_the_cloud.rst | 122 - docs/source-app/quickstart.rst | 125 - docs/source-app/testing.rst | 155 -- docs/source-app/ui_and_frontends.rst | 23 - .../source-app/workflows/access_app_state.rst | 59 - docs/source-app/workflows/add_components.rst | 31 - .../workflows/add_server/any_server.rst | 187 -- .../workflows/add_server/flask_basic.rst | 155 -- .../source-app/workflows/add_server/index.rst | 8 - .../workflows/add_server/index_content.rst | 35 - docs/source-app/workflows/add_web_link.rst | 54 - .../add_web_ui/angular_js_intermediate.rst | 6 - .../workflows/add_web_ui/dash/basic.rst | 221 -- .../workflows/add_web_ui/dash/index.rst | 84 - .../add_web_ui/dash/intermediate.rst | 42 - .../add_web_ui/dash/intermediate_plot.py | 86 - .../add_web_ui/dash/intermediate_state.py | 39 - .../workflows/add_web_ui/example_app.rst | 7 - .../add_web_ui/glossary_front_end.rst | 9 - .../workflows/add_web_ui/glossary_ui.rst | 9 - .../workflows/add_web_ui/gradio/basic.rst | 217 -- .../workflows/add_web_ui/gradio/index.rst | 84 - .../add_web_ui/gradio/intermediate.rst | 21 - .../workflows/add_web_ui/html/basic.rst | 166 -- .../workflows/add_web_ui/html/index.rst | 87 - .../add_web_ui/html/intermediate.rst | 20 - .../source-app/workflows/add_web_ui/index.rst | 10 - .../workflows/add_web_ui/index_content.rst | 121 - .../integrate_any_javascript_framework.rst | 164 -- .../workflows/add_web_ui/jupyter_basic.rst | 70 - .../workflows/add_web_ui/justpy/index.rst | 92 - .../workflows/add_web_ui/panel/basic.rst | 369 --- .../workflows/add_web_ui/panel/index.rst | 85 - .../add_web_ui/panel/intermediate.rst | 210 -- ...ommunicate_between_react_and_lightning.rst | 58 - .../react/connect_react_and_lightning.rst | 107 - .../react/create_react_template.rst | 51 - .../workflows/add_web_ui/react/index.rst | 106 - .../react/react_development_workflow.rst | 27 - .../workflows/add_web_ui/streamlit/basic.rst | 186 -- .../workflows/add_web_ui/streamlit/index.rst | 84 - .../add_web_ui/streamlit/intermediate.rst | 105 - .../add_web_ui/vue_js_intermediate.rst | 6 - .../arrange_tabs/arrange_app_basic.rst | 69 - .../arrange_tabs/arrange_app_intermediate.rst | 21 - .../workflows/arrange_tabs/index.rst | 5 - .../workflows/arrange_tabs/index_content.rst | 34 - .../build_command_line_interface/app.py | 35 - .../build_command_line_interface/cli.rst | 144 -- .../cli_client.rst | 175 -- .../commands/__init__.py | 0 .../commands/notebook/__init__.py | 0 .../commands/notebook/run.py | 33 - .../example_command.py | 25 - .../build_command_line_interface/index.rst | 48 - .../index_content.rst | 51 - .../post_example.py | 26 - .../from_pytorch_lightning_script.rst | 109 - .../build_lightning_app/from_scratch.rst | 11 - .../from_scratch_content.rst | 60 - .../workflows/build_lightning_app/index.rst | 11 - .../build_lightning_app/index_content.rst | 32 - .../build_lightning_component/basic.rst | 9 - .../from_scratch_component_content.rst | 153 -- .../build_lightning_component/index.rst | 11 - .../index_content.rst | 122 - .../intermediate.rst | 71 - .../publish_a_component.rst | 59 - .../workflows/build_rest_api/add_api.rst | 104 - .../workflows/build_rest_api/index.rst | 34 - .../build_rest_api/index_content.rst | 50 - .../workflows/build_rest_api/models.py | 6 - .../workflows/build_rest_api/post_example.py | 25 - .../build_rest_api/post_example_pydantic.py | 32 - 
.../build_rest_api/request_validation.rst | 69 - docs/source-app/workflows/debug_locally.rst | 5 - .../workflows/enable_fault_tolerance.rst | 5 - docs/source-app/workflows/extend_app.rst | 59 - docs/source-app/workflows/index.rst | 186 -- .../workflows/mount_cloud_object_store.rst | 141 -- .../run_app_on_cloud/cloud_files.rst | 69 - .../workflows/run_app_on_cloud/index.rst | 5 - .../run_app_on_cloud/index_content.rst | 115 - .../run_app_on_cloud/lightning_cloud.rst | 67 - .../workflows/run_app_on_cloud/on_prem.rst | 6 - .../run_app_on_cloud/on_your_own_machine.rst | 26 - docs/source-app/workflows/run_app_snippet.rst | 33 - .../run_components_on_different_hardware.rst | 5 - .../workflows/run_on_private_cloud.rst | 26 - .../workflows/run_work_in_parallel.rst | 10 - .../run_work_in_parallel_content.rst | 41 - docs/source-app/workflows/run_work_once.rst | 13 - .../workflows/run_work_once_content.rst | 151 -- docs/source-app/workflows/schedule_apps.rst | 5 - .../workflows/scripts/parallel/toy_app.py | 27 - .../scripts/parallel/toy_parallel.py | 27 - .../scripts/parallel/toy_two_parallel.py | 27 - .../parallel/toy_two_parallel_not_started.py | 27 - docs/source-app/workflows/share_app.rst | 33 - .../share_files_between_components.rst | 120 - .../share_files_between_components/app.py | 48 - docs/source-app/workflows/test_an_app.rst | 5 - docs/source-pytorch/versioning.rst | 2 - examples/app/argparse/app.py | 28 - examples/app/boring/.gitignore | 10 - examples/app/boring/app.py | 61 - examples/app/boring/app_dynamic.py | 72 - examples/app/boring/scripts/__init__.py | 0 examples/app/boring/scripts/serve.py | 29 - .../app/commands_and_api/.lightningignore | 1 - examples/app/commands_and_api/app.py | 52 - examples/app/commands_and_api/command.py | 18 - examples/app/components/python/__init__.py | 0 examples/app/components/python/app.py | 25 - .../app/components/python/component_popen.py | 7 - .../app/components/python/component_tracer.py | 52 - examples/app/components/python/pl_script.py | 10 - examples/app/components/serve/gradio/app.py | 51 - .../app/components/serve/gradio/beyonce.jpg | Bin 132520 -> 0 bytes .../components/serve/gradio/requirements.txt | 1 - examples/app/dag/.gitignore | 6 - examples/app/dag/.lightningignore | 8 - examples/app/dag/app.py | 130 -- examples/app/dag/processing.py | 14 - examples/app/dag/requirements.txt | 2 - examples/app/display_name/.lightningignore | 1 - examples/app/display_name/app.py | 25 - examples/app/drive/.gitignore | 1 - examples/app/drive/app.py | 51 - examples/app/hpo/README.md | 64 - examples/app/hpo/app_wi_ui.py | 60 - examples/app/hpo/app_wo_ui.py | 57 - examples/app/hpo/download_data.py | 5 - examples/app/hpo/hyperplot.py | 34 - examples/app/hpo/objective.py | 62 - examples/app/hpo/pl_script.py | 43 - examples/app/hpo/requirements.txt | 3 - examples/app/hpo/utils.py | 55 - examples/app/installation_commands/app.py | 31 - examples/app/interruptible/app.py | 32 - examples/app/justpy/app.py | 42 - examples/app/justpy/requirements.txt | 1 - examples/app/layout/app.py | 101 - examples/app/layout/requirements.txt | 1 - examples/app/layout/ui1/index.html | 10 - examples/app/layout/ui2/index.html | 10 - examples/app/mount/app.py | 34 - examples/app/multi_node/README.md | 51 - examples/app/multi_node/pl_boring_script.py | 7 - examples/app/multi_node/requirements.txt | 1 - examples/app/multi_node/train_any.py | 22 - examples/app/multi_node/train_fabric.py | 40 - examples/app/multi_node/train_lt.py | 21 - examples/app/multi_node/train_lt_script.py | 11 - 
examples/app/multi_node/train_pytorch.py | 60 - .../app/multi_node/train_pytorch_spawn.py | 51 - examples/app/payload/app.py | 31 - examples/app/pickle_or_not/app.py | 54 - examples/app/pickle_or_not/requirements.txt | 0 examples/app/server/app.py | 39 - examples/app/server_with_auto_scaler/app.py | 93 - examples/app/template_streamlit_ui/app.py | 44 - .../template_streamlit_ui/requirements.txt | 1 - examples/app/v0/.gitignore | 2 - examples/app/v0/README.md | 18 - examples/app/v0/app.py | 49 - examples/app/v0/emulate_ui.py | 18 - examples/app/v0/requirements.txt | 1 - examples/app/v0/ui/a/index.html | 1 - examples/app/v0/ui/b/index.html | 1 - .../app/works_on_default_machine/app_v2.py | 52 - .../works_on_default_machine/requirements.txt | 1 - pyproject.toml | 103 +- requirements.txt | 1 - requirements/app/app.txt | 31 - requirements/app/cloud.txt | 4 - requirements/app/components.txt | 5 - requirements/app/docs.txt | 1 - requirements/app/test.txt | 18 - requirements/app/ui.txt | 0 setup.py | 6 +- src/app-ui-version.info | 1 - src/lightning/__init__.py | 17 - src/lightning/__main__.py | 4 - src/lightning/__setup__.py | 18 +- src/lightning/app/CHANGELOG.md | 608 ----- src/lightning/app/__init__.py | 51 - src/lightning/app/api/__init__.py | 8 - src/lightning/app/api/http_methods.py | 258 --- src/lightning/app/api/request_types.py | 56 - src/lightning/app/cli/__init__.py | 0 src/lightning/app/cli/app-template/.gitignore | 157 -- src/lightning/app/cli/app-template/LICENSE | 201 -- src/lightning/app/cli/app-template/README.md | 37 - src/lightning/app/cli/app-template/app.py | 16 - .../app-template/placeholdername/__init__.py | 4 - .../components/component_a/__init__.py | 3 - .../components/component_a/component_a.py | 6 - .../components/component_b/__init__.py | 3 - .../components/component_b/component_a.py | 6 - .../app/cli/app-template/requirements.txt | 0 src/lightning/app/cli/app-template/setup.py | 15 - .../app/cli/app-template/tests/README.md | 17 - .../app/cli/app-template/tests/__init__.py | 0 .../cli/app-template/tests/requirements.txt | 8 - .../tests/test_placeholdername_app.py | 44 - src/lightning/app/cli/cmd_apps.py | 146 -- src/lightning/app/cli/cmd_init.py | 167 -- src/lightning/app/cli/cmd_install.py | 657 ------ src/lightning/app/cli/cmd_pl_init.py | 187 -- src/lightning/app/cli/cmd_react_ui_init.py | 131 -- src/lightning/app/cli/commands/__init__.py | 0 .../app/cli/commands/app_commands.py | 135 -- src/lightning/app/cli/commands/cd.py | 117 - src/lightning/app/cli/commands/cp.py | 350 --- src/lightning/app/cli/commands/logs.py | 122 - src/lightning/app/cli/commands/ls.py | 268 --- src/lightning/app/cli/commands/pwd.py | 53 - src/lightning/app/cli/commands/rm.py | 101 - .../.github/workflows/ci-testing.yml | 79 - .../app/cli/component-template/.gitignore | 157 -- .../app/cli/component-template/LICENSE | 201 -- .../app/cli/component-template/README.md | 35 - .../app/cli/component-template/app.py | 15 - .../placeholdername/__init__.py | 3 - .../placeholdername/component.py | 12 - .../cli/component-template/requirements.txt | 0 .../app/cli/component-template/setup.py | 15 - .../cli/component-template/tests/README.md | 17 - .../cli/component-template/tests/__init__.py | 0 .../component-template/tests/requirements.txt | 8 - .../tests/test_placeholdername_component.py | 14 - src/lightning/app/cli/connect/__init__.py | 0 src/lightning/app/cli/connect/app.py | 387 ---- src/lightning/app/cli/connect/data.py | 109 - src/lightning/app/cli/core.py | 27 - 
src/lightning/app/cli/lightning_cli.py | 503 ---- src/lightning/app/cli/lightning_cli_delete.py | 124 - src/lightning/app/cli/lightning_cli_launch.py | 130 -- src/lightning/app/cli/lightning_cli_list.py | 32 - .../app/cli/pl-app-template/.gitignore | 1 - .../app/cli/pl-app-template/.lightningignore | 2 - src/lightning/app/cli/pl-app-template/app.py | 105 - .../app/cli/pl-app-template/core/__init__.py | 0 .../app/cli/pl-app-template/core/callbacks.py | 319 --- .../core/components/__init__.py | 2 - .../core/components/logger/__init__.py | 0 .../core/components/logger/tensorboard.py | 49 - .../components/logger/weights_and_biases.py | 33 - .../core/components/script_runner/__init__.py | 1 - .../components/script_runner/script_runner.py | 76 - .../app/cli/pl-app-template/core/state.py | 45 - .../app/cli/pl-app-template/setup.py | 34 - .../app/cli/pl-app-template/tests/__init__.py | 0 .../pl-app-template/tests/core/__init__.py | 0 .../tests/core/test_callbacks.py | 68 - .../app/cli/pl-app-template/tests/test_app.py | 14 - .../app/cli/pl-app-template/ui/.gitignore | 25 - .../cli/pl-app-template/ui/.prettierignore | 3 - .../app/cli/pl-app-template/ui/.prettierrc | 24 - .../cli/pl-app-template/ui/craco.config.js | 29 - .../app/cli/pl-app-template/ui/package.json | 95 - .../cli/pl-app-template/ui/public/favicon.svg | 9 - .../cli/pl-app-template/ui/public/index.html | 65 - .../pl-app-template/ui/public/manifest.json | 15 - .../cli/pl-app-template/ui/public/robots.txt | 3 - .../app/cli/pl-app-template/ui/src/App.tsx | 126 - .../components/EnvironmentConfigurator.tsx | 67 - .../ui/src/components/ErrorPanel.tsx | 24 - .../ui/src/components/ExecutionSummary.tsx | 79 - .../src/components/HyperparameterSummary.tsx | 95 - .../ui/src/components/Launcher.tsx | 172 -- .../ui/src/components/ProgressBar.tsx | 35 - .../ui/src/components/ProgressBarGroup.tsx | 49 - .../ui/src/components/Timer.tsx | 29 - .../ui/src/hooks/useLightningState.ts | 31 - .../app/cli/pl-app-template/ui/src/index.css | 19 - .../app/cli/pl-app-template/ui/src/index.tsx | 26 - .../ui/src/lightning-colors.ts | 2 - .../pl-app-template/ui/src/react-app-env.d.ts | 1 - .../pl-app-template/ui/src/reportWebVitals.ts | 15 - .../pl-app-template/ui/src/types/lightning.ts | 57 - .../app/cli/pl-app-template/ui/tsconfig.json | 22 - .../app/cli/react-ui-template/README.md | 103 - .../app/cli/react-ui-template/example_app.py | 33 - .../app/cli/react-ui-template/ui/index.html | 14 - .../app/cli/react-ui-template/ui/package.json | 31 - .../app/cli/react-ui-template/ui/src/App.css | 10 - .../app/cli/react-ui-template/ui/src/App.tsx | 69 - .../cli/react-ui-template/ui/src/favicon.svg | 15 - .../ui/src/hooks/useLightningState.ts | 31 - .../cli/react-ui-template/ui/src/index.css | 11 - .../app/cli/react-ui-template/ui/src/main.tsx | 11 - .../ui/src/types/lightning.ts | 57 - .../react-ui-template/ui/src/vite-env.d.ts | 1 - .../cli/react-ui-template/ui/tsconfig.json | 21 - .../react-ui-template/ui/tsconfig.node.json | 8 - .../cli/react-ui-template/ui/vite.config.ts | 9 - .../app/cli/react-ui-template/ui/yarn.lock | 1278 ----------- src/lightning/app/components/README.md | 1 - src/lightning/app/components/__init__.py | 41 - .../app/components/database/__init__.py | 4 - .../app/components/database/client.py | 93 - .../app/components/database/server.py | 243 -- .../app/components/database/utilities.py | 263 --- .../app/components/multi_node/__init__.py | 6 - .../app/components/multi_node/base.py | 107 - .../app/components/multi_node/fabric.py | 132 -- 
.../components/multi_node/pytorch_spawn.py | 119 - .../app/components/multi_node/trainer.py | 130 -- .../app/components/python/__init__.py | 4 - src/lightning/app/components/python/popen.py | 113 - src/lightning/app/components/python/tracer.py | 200 -- .../app/components/serve/__init__.py | 17 - .../app/components/serve/auto_scaler.py | 753 ------ .../app/components/serve/catimage.png | Bin 20105 -> 0 bytes .../app/components/serve/cold_start_proxy.py | 72 - .../app/components/serve/gradio_server.py | 200 -- .../app/components/serve/python_server.py | 328 --- src/lightning/app/components/serve/serve.py | 170 -- .../app/components/serve/streamlit.py | 174 -- .../app/components/serve/types/__init__.py | 4 - .../app/components/serve/types/image.py | 45 - .../app/components/serve/types/type.py | 32 - src/lightning/app/components/training.py | 203 -- src/lightning/app/core/__init__.py | 5 - src/lightning/app/core/api.py | 498 ---- src/lightning/app/core/app.py | 746 ------ src/lightning/app/core/constants.py | 159 -- src/lightning/app/core/flow.py | 866 ------- src/lightning/app/core/queues.py | 585 ----- src/lightning/app/core/work.py | 772 ------- src/lightning/app/frontend/__init__.py | 7 - src/lightning/app/frontend/frontend.py | 68 - .../app/frontend/just_py/__init__.py | 0 src/lightning/app/frontend/just_py/just_py.py | 114 - .../app/frontend/just_py/just_py_base.py | 66 - src/lightning/app/frontend/panel/__init__.py | 6 - .../app/frontend/panel/app_state_comm.py | 101 - .../app/frontend/panel/app_state_watcher.py | 123 - .../app/frontend/panel/panel_frontend.py | 193 -- .../frontend/panel/panel_serve_render_fn.py | 71 - src/lightning/app/frontend/stream_lit.py | 113 - src/lightning/app/frontend/streamlit_base.py | 50 - src/lightning/app/frontend/utils.py | 72 - src/lightning/app/frontend/web.py | 140 -- src/lightning/app/launcher/__init__.py | 0 src/lightning/app/launcher/launcher.py | 492 ---- .../app/launcher/lightning_backend.py | 570 ----- .../app/launcher/lightning_hybrid_backend.py | 160 -- src/lightning/app/launcher/utils.py | 97 - src/lightning/app/pdb/__init__.py | 6 - src/lightning/app/pdb/pdb.py | 50 - src/lightning/app/plugin/__init__.py | 3 - src/lightning/app/plugin/plugin.py | 237 -- src/lightning/app/runners/__init__.py | 14 - .../app/runners/backends/__init__.py | 24 - src/lightning/app/runners/backends/backend.py | 150 -- src/lightning/app/runners/backends/cloud.py | 52 - src/lightning/app/runners/backends/docker.py | 43 - .../app/runners/backends/mp_process.py | 141 -- src/lightning/app/runners/cloud.py | 1109 --------- src/lightning/app/runners/multiprocess.py | 161 -- src/lightning/app/runners/runtime.py | 182 -- src/lightning/app/runners/runtime_type.py | 33 - src/lightning/app/source_code/__init__.py | 7 - src/lightning/app/source_code/copytree.py | 183 -- src/lightning/app/source_code/hashing.py | 51 - src/lightning/app/source_code/local.py | 149 -- src/lightning/app/source_code/tar.py | 201 -- src/lightning/app/source_code/uploader.py | 111 - src/lightning/app/storage/__init__.py | 6 - src/lightning/app/storage/copier.py | 155 -- src/lightning/app/storage/drive.py | 341 --- src/lightning/app/storage/filesystem.py | 166 -- src/lightning/app/storage/mount.py | 75 - src/lightning/app/storage/orchestrator.py | 208 -- src/lightning/app/storage/path.py | 453 ---- src/lightning/app/storage/payload.py | 274 --- src/lightning/app/storage/requests.py | 57 - src/lightning/app/structures/__init__.py | 4 - src/lightning/app/structures/dict.py | 160 -- 
src/lightning/app/structures/list.py | 177 -- src/lightning/app/testing/__init__.py | 20 - src/lightning/app/testing/config.py | 28 - src/lightning/app/testing/helpers.py | 179 -- src/lightning/app/testing/testing.py | 535 ----- src/lightning/app/utilities/__init__.py | 0 src/lightning/app/utilities/app_commands.py | 127 -- src/lightning/app/utilities/app_helpers.py | 582 ----- src/lightning/app/utilities/app_logs.py | 137 -- src/lightning/app/utilities/app_status.py | 44 - src/lightning/app/utilities/auth.py | 61 - src/lightning/app/utilities/cli_helpers.py | 358 --- src/lightning/app/utilities/cloud.py | 63 - src/lightning/app/utilities/clusters.py | 52 - .../app/utilities/commands/__init__.py | 3 - src/lightning/app/utilities/commands/base.py | 308 --- src/lightning/app/utilities/component.py | 152 -- .../app/utilities/data_structures.py | 50 - .../app/utilities/dependency_caching.py | 27 - src/lightning/app/utilities/enum.py | 82 - src/lightning/app/utilities/exceptions.py | 101 - src/lightning/app/utilities/frontend.py | 89 - src/lightning/app/utilities/git.py | 86 - src/lightning/app/utilities/imports.py | 148 -- src/lightning/app/utilities/introspection.py | 400 ---- src/lightning/app/utilities/layout.py | 215 -- src/lightning/app/utilities/load_app.py | 304 --- src/lightning/app/utilities/log.py | 23 - src/lightning/app/utilities/log_helpers.py | 49 - src/lightning/app/utilities/login.py | 213 -- .../app/utilities/logs_socket_api.py | 94 - src/lightning/app/utilities/name_generator.py | 1359 ----------- src/lightning/app/utilities/network.py | 215 -- src/lightning/app/utilities/openapi.py | 75 - .../app/utilities/packaging/__init__.py | 0 .../app/utilities/packaging/app_config.py | 76 - .../app/utilities/packaging/build_config.py | 208 -- .../app/utilities/packaging/cloud_compute.py | 188 -- .../app/utilities/packaging/docker.py | 127 -- .../utilities/packaging/lightning_utils.py | 219 -- .../app/utilities/packaging/tarfile.py | 52 - src/lightning/app/utilities/port.py | 168 -- src/lightning/app/utilities/proxies.py | 766 ------- src/lightning/app/utilities/redis.py | 34 - src/lightning/app/utilities/safe_pickle.py | 107 - src/lightning/app/utilities/scheduler.py | 60 - src/lightning/app/utilities/secrets.py | 41 - src/lightning/app/utilities/state.py | 323 --- src/lightning/app/utilities/tracer.py | 193 -- src/lightning/app/utilities/tree.py | 88 - src/lightning/app/utilities/types.py | 28 - src/lightning/app/utilities/warnings.py | 17 - src/lightning/fabric/cli.py | 18 +- src/lightning/store/README.md | 41 - src/lightning/store/__init__.py | 3 - src/lightning/store/store.py | 92 - src/lightning/store/utils.py | 67 - src/lightning_app/MANIFEST.in | 11 - src/lightning_app/README.md | 146 -- src/lightning_app/__about__.py | 35 - src/lightning_app/__main__.py | 4 - src/lightning_app/__setup__.py | 122 - src/lightning_app/__version__.py | 9 - src/lightning_app/py.typed | 0 .../shell-folder_code-lives-lightning.info | 2 - tests/integrations_app/__init__.py | 3 - .../apps/collect_failures/__init__.py | 1 - .../apps/collect_failures/app.py | 46 - .../apps/collect_failures/requirements.txt | 1 - .../apps/core_features_app/__init__.py | 1 - .../apps/core_features_app/app.py | 17 - .../apps/custom_work_dependencies/__init__.py | 1 - .../apps/custom_work_dependencies/app.py | 53 - .../apps/idle_timeout/__init__.py | 1 - .../integrations_app/apps/idle_timeout/app.py | 70 - tests/integrations_app/conftest.py | 81 - tests/integrations_app/flagship/__init__.py | 5 - 
.../integrations_app/flagship/test_flashy.py | 73 - .../integrations_app/flagship/test_jupyter.py | 1 - tests/integrations_app/flagship/test_muse.py | 1 - tests/integrations_app/local/__init__.py | 3 - .../local/test_collect_failures.py | 40 - .../local/test_core_features_app.py | 26 - .../local/test_custom_work_dependencies.py | 23 - .../local/test_idle_timeout.py | 23 - tests/integrations_app/public/__init__.py | 3 - tests/integrations_app/public/test_app_dag.py | 20 - .../integrations_app/public/test_argparse.py | 68 - .../public/test_boring_app.py | 36 - .../public/test_commands_and_api.py | 37 - tests/integrations_app/public/test_drive.py | 23 - tests/integrations_app/public/test_gradio.py | 30 - .../public/test_installation_commands_app.py | 22 - tests/integrations_app/public/test_layout.py | 25 - .../public/test_multi_node.py | 47 - tests/integrations_app/public/test_payload.py | 18 - .../public/test_pickle_or_not.py | 26 - .../public/test_quick_start.py | 70 - tests/integrations_app/public/test_scripts.py | 39 - .../public/test_template_react_ui.py | 35 - .../public/test_template_streamlit_ui.py | 35 - tests/integrations_app/public/test_v0_app.py | 98 - tests/tests_app/__init__.py | 4 - tests/tests_app/cli/__init__.py | 0 tests/tests_app/cli/jsons/connect_1.json | 311 --- .../cli/launch_data/app_v0/__init__.py | 0 tests/tests_app/cli/launch_data/app_v0/app.py | 51 - .../cli/launch_data/app_v0/ui/a/index.html | 1 - .../cli/launch_data/app_v0/ui/b/index.html | 1 - tests/tests_app/cli/test_cd.py | 57 - tests/tests_app/cli/test_cli.py | 60 - tests/tests_app/cli/test_cloud_cli.py | 220 -- tests/tests_app/cli/test_cmd_apps.py | 157 -- tests/tests_app/cli/test_cmd_cli_delete.py | 28 - tests/tests_app/cli/test_cmd_init.py | 92 - tests/tests_app/cli/test_cmd_install.py | 384 ---- tests/tests_app/cli/test_cmd_launch.py | 330 --- tests/tests_app/cli/test_cmd_pl_init.py | 125 - tests/tests_app/cli/test_cmd_react_ui_init.py | 60 - tests/tests_app/cli/test_cmd_show_logs.py | 60 - tests/tests_app/cli/test_connect.py | 171 -- tests/tests_app/cli/test_connect_data.py | 58 - tests/tests_app/cli/test_cp.py | 242 -- tests/tests_app/cli/test_ls.py | 101 - tests/tests_app/cli/test_rm.py | 99 - tests/tests_app/cli/test_run_app.py | 224 -- tests/tests_app/components/__init__.py | 0 .../components/database/test_client_server.py | 202 -- .../components/multi_node/__init__.py | 0 .../components/multi_node/test_base.py | 30 - .../components/multi_node/test_fabric.py | 97 - .../components/multi_node/test_trainer.py | 94 - .../tests_app/components/python/scripts/a.py | 1 - .../tests_app/components/python/scripts/b.py | 3 - .../tests_app/components/python/scripts/c.py | 4 - .../components/python/test_python.py | 148 -- .../__init__.py | 17 - .../components/sample_package_repo/setup.py | 46 - .../components/serve/test_auto_scaler.py | 222 -- .../serve/test_model_inference_api.py | 81 - .../components/serve/test_python_server.py | 60 - .../components/serve/test_streamlit.py | 113 - tests/tests_app/conftest.py | 140 -- tests/tests_app/core/__init__.py | 0 .../tests_app/core/lightning_app/__init__.py | 0 .../lightning_app/test_configure_layout.py | 243 -- tests/tests_app/core/scripts/app_metadata.py | 61 - tests/tests_app/core/scripts/app_with_env.py | 14 - .../core/scripts/app_with_local_import.py | 4 - tests/tests_app/core/scripts/empty.py | 0 tests/tests_app/core/scripts/example_1.py | 1 - tests/tests_app/core/scripts/example_2.py | 1 - tests/tests_app/core/scripts/lightning_cli.py | 60 - 
.../core/scripts/lightning_overrides.py | 50 - .../core/scripts/lightning_trainer.py | 74 - tests/tests_app/core/scripts/registry.py | 102 - .../core/scripts/script_with_error.py | 13 - tests/tests_app/core/scripts/two_apps.py | 10 - tests/tests_app/core/test_constants.py | 29 - tests/tests_app/core/test_lightning_api.py | 595 ----- tests/tests_app/core/test_lightning_app.py | 1215 ---------- tests/tests_app/core/test_lightning_flow.py | 965 -------- tests/tests_app/core/test_lightning_work.py | 420 ---- tests/tests_app/core/test_queues.py | 264 --- tests/tests_app/frontend/__init__.py | 0 tests/tests_app/frontend/conftest.py | 74 - .../frontend/just_py/test_just_py.py | 46 - tests/tests_app/frontend/panel/__init__.py | 0 tests/tests_app/frontend/panel/app_panel.py | 4 - .../frontend/panel/test_app_state_comm.py | 40 - .../frontend/panel/test_app_state_watcher.py | 95 - .../frontend/panel/test_panel_frontend.py | 168 -- .../panel/test_panel_serve_render_fn.py | 86 - tests/tests_app/frontend/test_stream_lit.py | 101 - tests/tests_app/frontend/test_utils.py | 42 - tests/tests_app/frontend/test_web.py | 81 - .../tests_app/frontend/utilities/__init__.py | 0 tests/tests_app/helpers/__init__.py | 0 .../launcher/test_lightning_backend.py | 807 ------- .../launcher/test_lightning_hydrid.py | 14 - tests/tests_app/launcher/test_running_flow.py | 133 -- tests/tests_app/plugin/__init__.py | 0 tests/tests_app/plugin/test_plugin.py | 221 -- tests/tests_app/runners/__init__.py | 0 tests/tests_app/runners/backends/__init__.py | 0 .../runners/backends/test_mp_process.py | 28 - tests/tests_app/runners/test_cloud.py | 2025 ----------------- tests/tests_app/runners/test_multiprocess.py | 124 - tests/tests_app/runners/test_runtime.py | 43 - tests/tests_app/source_code/test_copytree.py | 107 - tests/tests_app/source_code/test_local.py | 376 --- tests/tests_app/source_code/test_tar.py | 120 - tests/tests_app/source_code/test_uploader.py | 48 - tests/tests_app/storage/__init__.py | 0 tests/tests_app/storage/test_copier.py | 148 -- tests/tests_app/storage/test_drive.py | 256 --- tests/tests_app/storage/test_filesystem.py | 75 - tests/tests_app/storage/test_mount.py | 41 - tests/tests_app/storage/test_orchestrator.py | 84 - tests/tests_app/storage/test_path.py | 725 ------ tests/tests_app/storage/test_payload.py | 154 -- tests/tests_app/structures/__init__.py | 0 tests/tests_app/structures/test_structures.py | 563 ----- tests/tests_app/test_imports.py | 68 - tests/tests_app/utilities/__init__.py | 0 .../tests_app/utilities/packaging/__init__.py | 0 .../packaging/projects/Dockerfile.cpu | 1 - .../packaging/projects/dock/__init__.py | 0 .../utilities/packaging/projects/dock/app.py | 12 - .../packaging/projects/dock/compo/__init__.py | 0 .../projects/dock/compo/a/__init__.py | 0 .../packaging/projects/dock/compo/a/a.py | 14 - .../projects/dock/compo/b/__init__.py | 0 .../packaging/projects/dock/compo/b/b.py | 10 - .../packaging/projects/dockerfile/__init__.py | 0 .../packaging/projects/dockerfile/app.py | 11 - .../dockerfile/comp_dockerfile/__init__.py | 0 .../dockerfile/comp_dockerfile/a/Dockerfile | 1 - .../dockerfile/comp_dockerfile/a/__init__.py | 0 .../dockerfile/comp_dockerfile/a/a.py | 6 - .../dockerfile/comp_dockerfile/b/__init__.py | 0 .../dockerfile/comp_dockerfile/b/b.py | 10 - .../packaging/projects/no_req/__init__.py | 0 .../packaging/projects/no_req/app.py | 12 - .../projects/no_req/comp/__init__.py | 0 .../projects/no_req/comp/a/__init__.py | 0 .../packaging/projects/no_req/comp/a/a.py | 8 - 
.../projects/no_req/comp/b/__init__.py | 0 .../packaging/projects/no_req/comp/b/b.py | 10 - .../packaging/projects/req/__init__.py | 0 .../utilities/packaging/projects/req/app.py | 12 - .../projects/req/comp_req/__init__.py | 0 .../projects/req/comp_req/a/__init__.py | 0 .../packaging/projects/req/comp_req/a/a.py | 8 - .../projects/req/comp_req/a/requirements.txt | 3 - .../projects/req/comp_req/b/__init__.py | 0 .../packaging/projects/req/comp_req/b/b.py | 10 - .../packaging/projects/requirements.txt | 1 - .../utilities/packaging/test_app_config.py | 36 - .../utilities/packaging/test_build_spec.py | 105 - .../utilities/packaging/test_cloud_compute.py | 81 - .../utilities/packaging/test_docker.py | 65 - .../packaging/test_lightning_utils.py | 64 - .../tests_app/utilities/test_app_commands.py | 88 - tests/tests_app/utilities/test_app_helpers.py | 205 -- tests/tests_app/utilities/test_app_logs.py | 13 - tests/tests_app/utilities/test_auth.py | 25 - tests/tests_app/utilities/test_cli_helpers.py | 196 -- tests/tests_app/utilities/test_cloud.py | 30 - tests/tests_app/utilities/test_commands.py | 165 -- tests/tests_app/utilities/test_component.py | 77 - .../utilities/test_dependency_caching.py | 15 - tests/tests_app/utilities/test_exceptions.py | 84 - tests/tests_app/utilities/test_git.py | 46 - tests/tests_app/utilities/test_imports.py | 49 - .../tests_app/utilities/test_introspection.py | 60 - tests/tests_app/utilities/test_layout.py | 142 -- tests/tests_app/utilities/test_load_app.py | 109 - tests/tests_app/utilities/test_log_helpers.py | 23 - tests/tests_app/utilities/test_login.py | 154 -- tests/tests_app/utilities/test_network.py | 87 - tests/tests_app/utilities/test_port.py | 108 - tests/tests_app/utilities/test_proxies.py | 790 ------- tests/tests_app/utilities/test_safe_pickle.py | 13 - tests/tests_app/utilities/test_secrets.py | 52 - tests/tests_app/utilities/test_state.py | 335 --- tests/tests_app/utilities/test_tracer.py | 27 - tests/tests_app/utilities/test_tree.py | 102 - .../app_commands/app_commands_to_ignore.txt | 4 - .../bang_not_at_start_of_line.txt | 2 - .../command_after_first_non_comment_line.txt | 4 - .../commands_with_mixed_comments_1.txt | 4 - .../commands_with_mixed_comments_2.txt | 5 - .../app_commands/multiple_commands.txt | 2 - ...ltiple_spaces_between_band_and_command.txt | 1 - .../testdata/app_commands/single_command.txt | 1 - .../space_between_bang_and_command.txt | 1 - .../utilities/testdata/safe_pickle_app.py | 63 - tests/tests_store/__init__.py | 0 tests/tests_store/test_store.py | 83 - 898 files changed, 25 insertions(+), 73832 deletions(-) delete mode 100644 .azure/app-cloud-e2e.yml delete mode 100644 .github/actions/prep-apps/action.yml delete mode 100644 .github/workflows/_flagship-apps.yml delete mode 100644 .github/workflows/ci-examples-app.yml delete mode 100644 .github/workflows/ci-flagship-apps.yml delete mode 100644 .github/workflows/ci-tests-app.yml delete mode 100644 .github/workflows/ci-tests-store.yml delete mode 100644 .lightningignore delete mode 100644 docs/source-app/Makefile delete mode 100644 docs/source-app/_static/copybutton.js delete mode 100644 docs/source-app/_static/images/icon.svg delete mode 100644 docs/source-app/_static/images/logo-large.svg delete mode 100644 docs/source-app/_static/images/logo-small.svg delete mode 100644 docs/source-app/_static/images/logo.png delete mode 100644 docs/source-app/_static/images/logo.svg delete mode 100644 docs/source-app/_static/main.css delete mode 100644 
docs/source-app/_templates/classtemplate.rst delete mode 100644 docs/source-app/_templates/classtemplate_no_index.rst delete mode 100644 docs/source-app/_templates/layout.html delete mode 100644 docs/source-app/_templates/theme_variables.jinja delete mode 100644 docs/source-app/api_reference/components.rst delete mode 100644 docs/source-app/api_reference/core.rst delete mode 100644 docs/source-app/api_reference/frontend.rst delete mode 100644 docs/source-app/api_reference/runners.rst delete mode 100644 docs/source-app/api_reference/storage.rst delete mode 100644 docs/source-app/basics.rst delete mode 100644 docs/source-app/code_samples/basics/0.py delete mode 100644 docs/source-app/code_samples/basics/1.py delete mode 100644 docs/source-app/code_samples/convert_pl_to_app/app.py delete mode 100644 docs/source-app/code_samples/convert_pl_to_app/requirements.txt delete mode 100644 docs/source-app/code_samples/convert_pl_to_app/train.py delete mode 100644 docs/source-app/code_samples/quickstart/__init__.py delete mode 100644 docs/source-app/code_samples/quickstart/app/__init__.py delete mode 100644 docs/source-app/code_samples/quickstart/app/app_0.py delete mode 100644 docs/source-app/code_samples/quickstart/app/app_1.py delete mode 100644 docs/source-app/code_samples/quickstart/app_01.py delete mode 100644 docs/source-app/code_samples/quickstart/app_02.py delete mode 100644 docs/source-app/code_samples/quickstart/app_03.py delete mode 100644 docs/source-app/code_samples/quickstart/app_comp.py delete mode 100644 docs/source-app/code_samples/quickstart/hello_world/app.py delete mode 100644 docs/source-app/code_samples/quickstart/hello_world/app_ui.py delete mode 100644 docs/source-app/code_samples/quickstart/hello_world/ui/index.html delete mode 100644 docs/source-app/conf.py delete mode 100644 docs/source-app/contribute_app.rst delete mode 100644 docs/source-app/core_api/lightning_app/app.py delete mode 100644 docs/source-app/core_api/lightning_app/communication.rst delete mode 100644 docs/source-app/core_api/lightning_app/communication_content.rst delete mode 100644 docs/source-app/core_api/lightning_app/compute_content.rst delete mode 100644 docs/source-app/core_api/lightning_app/dynamic_work.rst delete mode 100644 docs/source-app/core_api/lightning_app/dynamic_work_content.rst delete mode 100644 docs/source-app/core_api/lightning_app/index.rst delete mode 100644 docs/source-app/core_api/lightning_app/lightning_app.rst delete mode 100644 docs/source-app/core_api/lightning_flow.rst delete mode 100644 docs/source-app/core_api/lightning_work/compute.rst delete mode 100644 docs/source-app/core_api/lightning_work/compute_content.rst delete mode 100644 docs/source-app/core_api/lightning_work/handling_app_exception.rst delete mode 100644 docs/source-app/core_api/lightning_work/handling_app_exception_content.rst delete mode 100644 docs/source-app/core_api/lightning_work/index.rst delete mode 100644 docs/source-app/core_api/lightning_work/lightning_work.rst delete mode 100644 docs/source-app/core_api/lightning_work/payload.rst delete mode 100644 docs/source-app/core_api/lightning_work/payload_content.rst delete mode 100644 docs/source-app/core_api/lightning_work/status.rst delete mode 100644 docs/source-app/core_api/lightning_work/status_content.rst delete mode 100644 docs/source-app/core_api/overview.rst delete mode 100644 docs/source-app/examples/dag/dag.rst delete mode 100644 docs/source-app/examples/dag/dag_from_scratch.rst delete mode 100644 docs/source-app/examples/data_explore_app.rst delete 
mode 100644 docs/source-app/examples/etl_app.rst delete mode 100644 docs/source-app/examples/file_server/app.py delete mode 100644 docs/source-app/examples/file_server/file_server.rst delete mode 100644 docs/source-app/examples/file_server/file_server_content.rst delete mode 100644 docs/source-app/examples/file_server/file_server_step_1.rst delete mode 100644 docs/source-app/examples/file_server/file_server_step_2.rst delete mode 100644 docs/source-app/examples/file_server/file_server_step_3.rst delete mode 100644 docs/source-app/examples/file_server/file_server_step_4.rst delete mode 100644 docs/source-app/examples/github_repo_runner/app.py delete mode 100644 docs/source-app/examples/github_repo_runner/github_repo_runner.rst delete mode 100644 docs/source-app/examples/github_repo_runner/github_repo_runner_content.rst delete mode 100644 docs/source-app/examples/github_repo_runner/github_repo_runner_step_1.rst delete mode 100644 docs/source-app/examples/github_repo_runner/github_repo_runner_step_2.rst delete mode 100644 docs/source-app/examples/github_repo_runner/github_repo_runner_step_3.rst delete mode 100644 docs/source-app/examples/github_repo_runner/github_repo_runner_step_4.rst delete mode 100644 docs/source-app/examples/github_repo_runner/github_repo_runner_step_5.rst delete mode 100644 docs/source-app/examples/hands_on_example.rst delete mode 100644 docs/source-app/examples/index.rst delete mode 100644 docs/source-app/examples/model_server_app/app.py delete mode 100644 docs/source-app/examples/model_server_app/load_testing.rst delete mode 100644 docs/source-app/examples/model_server_app/locust_component.py delete mode 100644 docs/source-app/examples/model_server_app/locustfile.py delete mode 100644 docs/source-app/examples/model_server_app/model_server.py delete mode 100644 docs/source-app/examples/model_server_app/model_server.rst delete mode 100644 docs/source-app/examples/model_server_app/model_server_app.rst delete mode 100644 docs/source-app/examples/model_server_app/model_server_app_content.rst delete mode 100644 docs/source-app/examples/model_server_app/putting_everything_together.rst delete mode 100644 docs/source-app/examples/model_server_app/train.py delete mode 100644 docs/source-app/examples/model_server_app/train.rst delete mode 100644 docs/source-app/examples/research_demo_app.rst delete mode 100644 docs/source-app/get_started/add_an_interactive_demo.rst delete mode 100644 docs/source-app/get_started/build_model.rst delete mode 100644 docs/source-app/get_started/go_beyond_training.rst delete mode 100644 docs/source-app/get_started/go_beyond_training_content.rst delete mode 100644 docs/source-app/get_started/jumpstart_from_app_gallery.rst delete mode 100644 docs/source-app/get_started/jumpstart_from_component_gallery.rst delete mode 100644 docs/source-app/get_started/training_with_apps.rst delete mode 100644 docs/source-app/get_started/what_app_can_do.rst delete mode 100644 docs/source-app/glossary/app_tree.rst delete mode 100644 docs/source-app/glossary/build_config/build_config.rst delete mode 100644 docs/source-app/glossary/build_config/build_config_advanced.rst delete mode 100644 docs/source-app/glossary/build_config/build_config_basic.rst delete mode 100644 docs/source-app/glossary/build_config/build_config_intermediate.rst delete mode 100644 docs/source-app/glossary/command_lines/command_lines.rst delete mode 100644 docs/source-app/glossary/dag.rst delete mode 100644 docs/source-app/glossary/debug_app.rst delete mode 100644 
docs/source-app/glossary/distributed_fe.rst delete mode 100644 docs/source-app/glossary/distributed_hardware.rst delete mode 100644 docs/source-app/glossary/environment_variables.rst delete mode 100644 docs/source-app/glossary/event_loop.rst delete mode 100644 docs/source-app/glossary/fault_tolerance.rst delete mode 100644 docs/source-app/glossary/index.rst delete mode 100644 docs/source-app/glossary/ios_and_android.rst delete mode 100644 docs/source-app/glossary/lightning_app_overview/index.rst delete mode 100644 docs/source-app/glossary/mount.rst delete mode 100644 docs/source-app/glossary/restful_api/restful_api.rst delete mode 100644 docs/source-app/glossary/scheduling.rst delete mode 100644 docs/source-app/glossary/secrets.rst delete mode 100644 docs/source-app/glossary/sharing_components.rst delete mode 100644 docs/source-app/glossary/storage/differences.rst delete mode 100644 docs/source-app/glossary/storage/drive.rst delete mode 100644 docs/source-app/glossary/storage/drive_content.rst delete mode 100644 docs/source-app/glossary/storage/drive_content_old.rst delete mode 100644 docs/source-app/glossary/storage/path.rst delete mode 100644 docs/source-app/glossary/storage/storage.rst delete mode 100644 docs/source-app/glossary/use_local_lightning.rst delete mode 100644 docs/source-app/index.rst delete mode 100644 docs/source-app/install/install_beginner.rst delete mode 100644 docs/source-app/install/installation.rst delete mode 100644 docs/source-app/install/mac.bash delete mode 100644 docs/source-app/install/pip.bash delete mode 100644 docs/source-app/install/windows.bash delete mode 100644 docs/source-app/intro.rst delete mode 100644 docs/source-app/landing_app.py delete mode 100644 docs/source-app/landing_app_run.bash delete mode 100644 docs/source-app/levels/advanced/index.rst delete mode 100644 docs/source-app/levels/advanced/level_16.rst delete mode 100644 docs/source-app/levels/advanced/level_17.rst delete mode 100644 docs/source-app/levels/advanced/level_18.rst delete mode 100644 docs/source-app/levels/advanced/level_19.rst delete mode 100644 docs/source-app/levels/advanced/level_20.rst delete mode 100644 docs/source-app/levels/advanced/start_dynamic_components.rst delete mode 100644 docs/source-app/levels/basic/build_a_dag.rst delete mode 100644 docs/source-app/levels/basic/build_a_lightning_component.rst delete mode 100644 docs/source-app/levels/basic/create_a_model_demo.rst delete mode 100644 docs/source-app/levels/basic/deploy_ai_model_api.rst delete mode 100644 docs/source-app/levels/basic/hello_components/code_run_cloud.bash delete mode 100644 docs/source-app/levels/basic/hello_components/code_run_cloud_setup.bash delete mode 100644 docs/source-app/levels/basic/hello_components/code_run_local.bash delete mode 100644 docs/source-app/levels/basic/hello_components/code_run_local_setup.bash delete mode 100644 docs/source-app/levels/basic/hello_components/deploy_model.py delete mode 100644 docs/source-app/levels/basic/hello_components/hello_world.py delete mode 100644 docs/source-app/levels/basic/hello_components/hello_world_gpu.py delete mode 100644 docs/source-app/levels/basic/hello_components/multi_node.py delete mode 100644 docs/source-app/levels/basic/hello_components/pl_multinode.py delete mode 100644 docs/source-app/levels/basic/hello_components/pt_multinode.py delete mode 100644 docs/source-app/levels/basic/hello_components/run_ptl_script.py delete mode 100644 docs/source-app/levels/basic/hello_components/streamlit_demo.py delete mode 100644 
docs/source-app/levels/basic/hello_components/train_ptl.py delete mode 100644 docs/source-app/levels/basic/hello_components/train_pytorch.py delete mode 100644 docs/source-app/levels/basic/hello_components/xgboost.py delete mode 100644 docs/source-app/levels/basic/hello_components/xgboost_gpu.py delete mode 100644 docs/source-app/levels/basic/hero_components.rst delete mode 100644 docs/source-app/levels/basic/index.rst delete mode 100644 docs/source-app/levels/basic/key_features/accelerators.py delete mode 100644 docs/source-app/levels/basic/key_features/auto_timeout.py delete mode 100644 docs/source-app/levels/basic/key_features/custom_container.py delete mode 100644 docs/source-app/levels/basic/key_features/idle_machine.py delete mode 100644 docs/source-app/levels/basic/key_features/massive_dataset.py delete mode 100644 docs/source-app/levels/basic/key_features/mount_data.py delete mode 100644 docs/source-app/levels/basic/key_features/spot.py delete mode 100644 docs/source-app/levels/basic/real_lightning_component_implementations.rst delete mode 100644 docs/source-app/levels/basic/run_jupyter_notebook_on_the_cloud.rst delete mode 100644 docs/source-app/levels/basic/save_money_on_cloud_costs.rst delete mode 100644 docs/source-app/levels/basic/scripts/toy_app_1_component.py delete mode 100644 docs/source-app/levels/basic/scripts/toy_app_1_component_pdb.py delete mode 100644 docs/source-app/levels/basic/train_pytorch_on_the_cloud.rst delete mode 100644 docs/source-app/levels/expert/index.rst delete mode 100644 docs/source-app/levels/intermediate/connect_lightning_components.rst delete mode 100644 docs/source-app/levels/intermediate/debug_a_lightning_app.rst delete mode 100644 docs/source-app/levels/intermediate/debug_app_scripts/debug_app.py delete mode 100644 docs/source-app/levels/intermediate/debug_app_scripts/toy_app.py delete mode 100644 docs/source-app/levels/intermediate/debug_app_scripts/toy_app_1_component.py delete mode 100644 docs/source-app/levels/intermediate/debug_app_scripts/toy_app_1_component_pdb.py delete mode 100644 docs/source-app/levels/intermediate/embed_web_ui_into_lightningwork.rst delete mode 100644 docs/source-app/levels/intermediate/index.rst delete mode 100644 docs/source-app/levels/intermediate/level_12.rst delete mode 100644 docs/source-app/levels/intermediate/level_2_scripts/code_run_cloud.bash delete mode 100644 docs/source-app/levels/intermediate/level_2_scripts/code_run_local.bash delete mode 100644 docs/source-app/levels/intermediate/level_2_scripts/hello_app.py delete mode 100644 docs/source-app/levels/intermediate/level_2_scripts/hello_app_auto_scale.py delete mode 100644 docs/source-app/levels/intermediate/level_2_scripts/hello_app_cron.py delete mode 100644 docs/source-app/levels/intermediate/level_2_scripts/hello_app_scheduler.py delete mode 100644 docs/source-app/levels/intermediate/level_2_scripts/organized_app_python.py delete mode 100644 docs/source-app/levels/intermediate/level_9.rst delete mode 100644 docs/source-app/levels/intermediate/run_lightning_work_in_parallel.rst delete mode 100644 docs/source-app/levels/intermediate/scripts/.storage/a delete mode 100644 docs/source-app/levels/intermediate/scripts/.storage/embeddings delete mode 100644 docs/source-app/levels/intermediate/scripts/a delete mode 100644 docs/source-app/levels/intermediate/scripts/comms_1.py delete mode 100644 docs/source-app/levels/intermediate/scripts/debug_app.py delete mode 100644 docs/source-app/levels/intermediate/scripts/embeddings delete mode 100644 
docs/source-app/levels/intermediate/scripts/toy_app.py delete mode 100644 docs/source-app/levels/intermediate/scripts/toy_payload.py delete mode 100644 docs/source-app/levels/intermediate/scripts/two_comms_non_ml.py delete mode 100644 docs/source-app/levels/intermediate/scripts/two_work_comms.py delete mode 100644 docs/source-app/levels/intermediate/share_files_between_components.rst delete mode 100644 docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst delete mode 100644 docs/source-app/levels/intermediate/start_from_lightning_app_templates.rst delete mode 100644 docs/source-app/make.bat delete mode 100644 docs/source-app/moving_to_the_cloud.rst delete mode 100644 docs/source-app/quickstart.rst delete mode 100644 docs/source-app/testing.rst delete mode 100644 docs/source-app/ui_and_frontends.rst delete mode 100644 docs/source-app/workflows/access_app_state.rst delete mode 100644 docs/source-app/workflows/add_components.rst delete mode 100644 docs/source-app/workflows/add_server/any_server.rst delete mode 100644 docs/source-app/workflows/add_server/flask_basic.rst delete mode 100644 docs/source-app/workflows/add_server/index.rst delete mode 100644 docs/source-app/workflows/add_server/index_content.rst delete mode 100644 docs/source-app/workflows/add_web_link.rst delete mode 100644 docs/source-app/workflows/add_web_ui/angular_js_intermediate.rst delete mode 100644 docs/source-app/workflows/add_web_ui/dash/basic.rst delete mode 100644 docs/source-app/workflows/add_web_ui/dash/index.rst delete mode 100644 docs/source-app/workflows/add_web_ui/dash/intermediate.rst delete mode 100644 docs/source-app/workflows/add_web_ui/dash/intermediate_plot.py delete mode 100644 docs/source-app/workflows/add_web_ui/dash/intermediate_state.py delete mode 100644 docs/source-app/workflows/add_web_ui/example_app.rst delete mode 100644 docs/source-app/workflows/add_web_ui/glossary_front_end.rst delete mode 100644 docs/source-app/workflows/add_web_ui/glossary_ui.rst delete mode 100644 docs/source-app/workflows/add_web_ui/gradio/basic.rst delete mode 100644 docs/source-app/workflows/add_web_ui/gradio/index.rst delete mode 100644 docs/source-app/workflows/add_web_ui/gradio/intermediate.rst delete mode 100644 docs/source-app/workflows/add_web_ui/html/basic.rst delete mode 100644 docs/source-app/workflows/add_web_ui/html/index.rst delete mode 100644 docs/source-app/workflows/add_web_ui/html/intermediate.rst delete mode 100644 docs/source-app/workflows/add_web_ui/index.rst delete mode 100644 docs/source-app/workflows/add_web_ui/index_content.rst delete mode 100644 docs/source-app/workflows/add_web_ui/integrate_any_javascript_framework.rst delete mode 100644 docs/source-app/workflows/add_web_ui/jupyter_basic.rst delete mode 100644 docs/source-app/workflows/add_web_ui/justpy/index.rst delete mode 100644 docs/source-app/workflows/add_web_ui/panel/basic.rst delete mode 100644 docs/source-app/workflows/add_web_ui/panel/index.rst delete mode 100644 docs/source-app/workflows/add_web_ui/panel/intermediate.rst delete mode 100644 docs/source-app/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst delete mode 100644 docs/source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst delete mode 100644 docs/source-app/workflows/add_web_ui/react/create_react_template.rst delete mode 100644 docs/source-app/workflows/add_web_ui/react/index.rst delete mode 100644 docs/source-app/workflows/add_web_ui/react/react_development_workflow.rst delete mode 100644 
docs/source-app/workflows/add_web_ui/streamlit/basic.rst delete mode 100644 docs/source-app/workflows/add_web_ui/streamlit/index.rst delete mode 100644 docs/source-app/workflows/add_web_ui/streamlit/intermediate.rst delete mode 100644 docs/source-app/workflows/add_web_ui/vue_js_intermediate.rst delete mode 100644 docs/source-app/workflows/arrange_tabs/arrange_app_basic.rst delete mode 100644 docs/source-app/workflows/arrange_tabs/arrange_app_intermediate.rst delete mode 100644 docs/source-app/workflows/arrange_tabs/index.rst delete mode 100644 docs/source-app/workflows/arrange_tabs/index_content.rst delete mode 100644 docs/source-app/workflows/build_command_line_interface/app.py delete mode 100644 docs/source-app/workflows/build_command_line_interface/cli.rst delete mode 100644 docs/source-app/workflows/build_command_line_interface/cli_client.rst delete mode 100644 docs/source-app/workflows/build_command_line_interface/commands/__init__.py delete mode 100644 docs/source-app/workflows/build_command_line_interface/commands/notebook/__init__.py delete mode 100644 docs/source-app/workflows/build_command_line_interface/commands/notebook/run.py delete mode 100644 docs/source-app/workflows/build_command_line_interface/example_command.py delete mode 100644 docs/source-app/workflows/build_command_line_interface/index.rst delete mode 100644 docs/source-app/workflows/build_command_line_interface/index_content.rst delete mode 100644 docs/source-app/workflows/build_command_line_interface/post_example.py delete mode 100644 docs/source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst delete mode 100644 docs/source-app/workflows/build_lightning_app/from_scratch.rst delete mode 100644 docs/source-app/workflows/build_lightning_app/from_scratch_content.rst delete mode 100644 docs/source-app/workflows/build_lightning_app/index.rst delete mode 100644 docs/source-app/workflows/build_lightning_app/index_content.rst delete mode 100644 docs/source-app/workflows/build_lightning_component/basic.rst delete mode 100644 docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst delete mode 100644 docs/source-app/workflows/build_lightning_component/index.rst delete mode 100644 docs/source-app/workflows/build_lightning_component/index_content.rst delete mode 100644 docs/source-app/workflows/build_lightning_component/intermediate.rst delete mode 100644 docs/source-app/workflows/build_lightning_component/publish_a_component.rst delete mode 100644 docs/source-app/workflows/build_rest_api/add_api.rst delete mode 100644 docs/source-app/workflows/build_rest_api/index.rst delete mode 100644 docs/source-app/workflows/build_rest_api/index_content.rst delete mode 100644 docs/source-app/workflows/build_rest_api/models.py delete mode 100644 docs/source-app/workflows/build_rest_api/post_example.py delete mode 100644 docs/source-app/workflows/build_rest_api/post_example_pydantic.py delete mode 100644 docs/source-app/workflows/build_rest_api/request_validation.rst delete mode 100644 docs/source-app/workflows/debug_locally.rst delete mode 100644 docs/source-app/workflows/enable_fault_tolerance.rst delete mode 100644 docs/source-app/workflows/extend_app.rst delete mode 100644 docs/source-app/workflows/index.rst delete mode 100644 docs/source-app/workflows/mount_cloud_object_store.rst delete mode 100644 docs/source-app/workflows/run_app_on_cloud/cloud_files.rst delete mode 100644 docs/source-app/workflows/run_app_on_cloud/index.rst delete mode 100644 
docs/source-app/workflows/run_app_on_cloud/index_content.rst delete mode 100644 docs/source-app/workflows/run_app_on_cloud/lightning_cloud.rst delete mode 100644 docs/source-app/workflows/run_app_on_cloud/on_prem.rst delete mode 100644 docs/source-app/workflows/run_app_on_cloud/on_your_own_machine.rst delete mode 100644 docs/source-app/workflows/run_app_snippet.rst delete mode 100644 docs/source-app/workflows/run_components_on_different_hardware.rst delete mode 100644 docs/source-app/workflows/run_on_private_cloud.rst delete mode 100644 docs/source-app/workflows/run_work_in_parallel.rst delete mode 100644 docs/source-app/workflows/run_work_in_parallel_content.rst delete mode 100644 docs/source-app/workflows/run_work_once.rst delete mode 100644 docs/source-app/workflows/run_work_once_content.rst delete mode 100644 docs/source-app/workflows/schedule_apps.rst delete mode 100644 docs/source-app/workflows/scripts/parallel/toy_app.py delete mode 100644 docs/source-app/workflows/scripts/parallel/toy_parallel.py delete mode 100644 docs/source-app/workflows/scripts/parallel/toy_two_parallel.py delete mode 100644 docs/source-app/workflows/scripts/parallel/toy_two_parallel_not_started.py delete mode 100644 docs/source-app/workflows/share_app.rst delete mode 100644 docs/source-app/workflows/share_files_between_components.rst delete mode 100644 docs/source-app/workflows/share_files_between_components/app.py delete mode 100644 docs/source-app/workflows/test_an_app.rst delete mode 100644 examples/app/argparse/app.py delete mode 100644 examples/app/boring/.gitignore delete mode 100644 examples/app/boring/app.py delete mode 100644 examples/app/boring/app_dynamic.py delete mode 100644 examples/app/boring/scripts/__init__.py delete mode 100644 examples/app/boring/scripts/serve.py delete mode 100644 examples/app/commands_and_api/.lightningignore delete mode 100644 examples/app/commands_and_api/app.py delete mode 100644 examples/app/commands_and_api/command.py delete mode 100644 examples/app/components/python/__init__.py delete mode 100644 examples/app/components/python/app.py delete mode 100644 examples/app/components/python/component_popen.py delete mode 100644 examples/app/components/python/component_tracer.py delete mode 100644 examples/app/components/python/pl_script.py delete mode 100644 examples/app/components/serve/gradio/app.py delete mode 100644 examples/app/components/serve/gradio/beyonce.jpg delete mode 100644 examples/app/components/serve/gradio/requirements.txt delete mode 100644 examples/app/dag/.gitignore delete mode 100644 examples/app/dag/.lightningignore delete mode 100644 examples/app/dag/app.py delete mode 100644 examples/app/dag/processing.py delete mode 100644 examples/app/dag/requirements.txt delete mode 100644 examples/app/display_name/.lightningignore delete mode 100644 examples/app/display_name/app.py delete mode 100644 examples/app/drive/.gitignore delete mode 100644 examples/app/drive/app.py delete mode 100644 examples/app/hpo/README.md delete mode 100644 examples/app/hpo/app_wi_ui.py delete mode 100644 examples/app/hpo/app_wo_ui.py delete mode 100644 examples/app/hpo/download_data.py delete mode 100644 examples/app/hpo/hyperplot.py delete mode 100644 examples/app/hpo/objective.py delete mode 100644 examples/app/hpo/pl_script.py delete mode 100644 examples/app/hpo/requirements.txt delete mode 100644 examples/app/hpo/utils.py delete mode 100644 examples/app/installation_commands/app.py delete mode 100644 examples/app/interruptible/app.py delete mode 100644 examples/app/justpy/app.py 
delete mode 100644 examples/app/justpy/requirements.txt delete mode 100644 examples/app/layout/app.py delete mode 100644 examples/app/layout/requirements.txt delete mode 100644 examples/app/layout/ui1/index.html delete mode 100644 examples/app/layout/ui2/index.html delete mode 100644 examples/app/mount/app.py delete mode 100644 examples/app/multi_node/README.md delete mode 100644 examples/app/multi_node/pl_boring_script.py delete mode 100644 examples/app/multi_node/requirements.txt delete mode 100644 examples/app/multi_node/train_any.py delete mode 100644 examples/app/multi_node/train_fabric.py delete mode 100644 examples/app/multi_node/train_lt.py delete mode 100644 examples/app/multi_node/train_lt_script.py delete mode 100644 examples/app/multi_node/train_pytorch.py delete mode 100644 examples/app/multi_node/train_pytorch_spawn.py delete mode 100644 examples/app/payload/app.py delete mode 100644 examples/app/pickle_or_not/app.py delete mode 100644 examples/app/pickle_or_not/requirements.txt delete mode 100644 examples/app/server/app.py delete mode 100644 examples/app/server_with_auto_scaler/app.py delete mode 100644 examples/app/template_streamlit_ui/app.py delete mode 100644 examples/app/template_streamlit_ui/requirements.txt delete mode 100644 examples/app/v0/.gitignore delete mode 100644 examples/app/v0/README.md delete mode 100644 examples/app/v0/app.py delete mode 100644 examples/app/v0/emulate_ui.py delete mode 100644 examples/app/v0/requirements.txt delete mode 100644 examples/app/v0/ui/a/index.html delete mode 100644 examples/app/v0/ui/b/index.html delete mode 100644 examples/app/works_on_default_machine/app_v2.py delete mode 100644 examples/app/works_on_default_machine/requirements.txt delete mode 100644 requirements/app/app.txt delete mode 100644 requirements/app/cloud.txt delete mode 100644 requirements/app/components.txt delete mode 100644 requirements/app/docs.txt delete mode 100644 requirements/app/test.txt delete mode 100644 requirements/app/ui.txt delete mode 100644 src/app-ui-version.info delete mode 100644 src/lightning/__main__.py delete mode 100644 src/lightning/app/CHANGELOG.md delete mode 100644 src/lightning/app/__init__.py delete mode 100644 src/lightning/app/api/__init__.py delete mode 100644 src/lightning/app/api/http_methods.py delete mode 100644 src/lightning/app/api/request_types.py delete mode 100644 src/lightning/app/cli/__init__.py delete mode 100644 src/lightning/app/cli/app-template/.gitignore delete mode 100644 src/lightning/app/cli/app-template/LICENSE delete mode 100644 src/lightning/app/cli/app-template/README.md delete mode 100644 src/lightning/app/cli/app-template/app.py delete mode 100644 src/lightning/app/cli/app-template/placeholdername/__init__.py delete mode 100644 src/lightning/app/cli/app-template/placeholdername/components/component_a/__init__.py delete mode 100644 src/lightning/app/cli/app-template/placeholdername/components/component_a/component_a.py delete mode 100644 src/lightning/app/cli/app-template/placeholdername/components/component_b/__init__.py delete mode 100644 src/lightning/app/cli/app-template/placeholdername/components/component_b/component_a.py delete mode 100644 src/lightning/app/cli/app-template/requirements.txt delete mode 100644 src/lightning/app/cli/app-template/setup.py delete mode 100644 src/lightning/app/cli/app-template/tests/README.md delete mode 100644 src/lightning/app/cli/app-template/tests/__init__.py delete mode 100644 src/lightning/app/cli/app-template/tests/requirements.txt delete mode 100644 
src/lightning/app/cli/app-template/tests/test_placeholdername_app.py delete mode 100644 src/lightning/app/cli/cmd_apps.py delete mode 100644 src/lightning/app/cli/cmd_init.py delete mode 100644 src/lightning/app/cli/cmd_install.py delete mode 100644 src/lightning/app/cli/cmd_pl_init.py delete mode 100644 src/lightning/app/cli/cmd_react_ui_init.py delete mode 100644 src/lightning/app/cli/commands/__init__.py delete mode 100644 src/lightning/app/cli/commands/app_commands.py delete mode 100644 src/lightning/app/cli/commands/cd.py delete mode 100644 src/lightning/app/cli/commands/cp.py delete mode 100644 src/lightning/app/cli/commands/logs.py delete mode 100644 src/lightning/app/cli/commands/ls.py delete mode 100644 src/lightning/app/cli/commands/pwd.py delete mode 100644 src/lightning/app/cli/commands/rm.py delete mode 100644 src/lightning/app/cli/component-template/.github/workflows/ci-testing.yml delete mode 100644 src/lightning/app/cli/component-template/.gitignore delete mode 100644 src/lightning/app/cli/component-template/LICENSE delete mode 100644 src/lightning/app/cli/component-template/README.md delete mode 100644 src/lightning/app/cli/component-template/app.py delete mode 100644 src/lightning/app/cli/component-template/placeholdername/__init__.py delete mode 100644 src/lightning/app/cli/component-template/placeholdername/component.py delete mode 100644 src/lightning/app/cli/component-template/requirements.txt delete mode 100644 src/lightning/app/cli/component-template/setup.py delete mode 100644 src/lightning/app/cli/component-template/tests/README.md delete mode 100644 src/lightning/app/cli/component-template/tests/__init__.py delete mode 100644 src/lightning/app/cli/component-template/tests/requirements.txt delete mode 100644 src/lightning/app/cli/component-template/tests/test_placeholdername_component.py delete mode 100644 src/lightning/app/cli/connect/__init__.py delete mode 100644 src/lightning/app/cli/connect/app.py delete mode 100644 src/lightning/app/cli/connect/data.py delete mode 100644 src/lightning/app/cli/core.py delete mode 100644 src/lightning/app/cli/lightning_cli.py delete mode 100644 src/lightning/app/cli/lightning_cli_delete.py delete mode 100644 src/lightning/app/cli/lightning_cli_launch.py delete mode 100644 src/lightning/app/cli/lightning_cli_list.py delete mode 100644 src/lightning/app/cli/pl-app-template/.gitignore delete mode 100644 src/lightning/app/cli/pl-app-template/.lightningignore delete mode 100644 src/lightning/app/cli/pl-app-template/app.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/__init__.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/callbacks.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/components/__init__.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/components/logger/__init__.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/components/logger/tensorboard.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/components/logger/weights_and_biases.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/components/script_runner/__init__.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/components/script_runner/script_runner.py delete mode 100644 src/lightning/app/cli/pl-app-template/core/state.py delete mode 100644 src/lightning/app/cli/pl-app-template/setup.py delete mode 100644 src/lightning/app/cli/pl-app-template/tests/__init__.py delete mode 100644 src/lightning/app/cli/pl-app-template/tests/core/__init__.py delete mode 
100644 src/lightning/app/cli/pl-app-template/tests/core/test_callbacks.py delete mode 100644 src/lightning/app/cli/pl-app-template/tests/test_app.py delete mode 100644 src/lightning/app/cli/pl-app-template/ui/.gitignore delete mode 100644 src/lightning/app/cli/pl-app-template/ui/.prettierignore delete mode 100644 src/lightning/app/cli/pl-app-template/ui/.prettierrc delete mode 100644 src/lightning/app/cli/pl-app-template/ui/craco.config.js delete mode 100644 src/lightning/app/cli/pl-app-template/ui/package.json delete mode 100644 src/lightning/app/cli/pl-app-template/ui/public/favicon.svg delete mode 100644 src/lightning/app/cli/pl-app-template/ui/public/index.html delete mode 100644 src/lightning/app/cli/pl-app-template/ui/public/manifest.json delete mode 100644 src/lightning/app/cli/pl-app-template/ui/public/robots.txt delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/App.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/components/EnvironmentConfigurator.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/components/ErrorPanel.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/components/ExecutionSummary.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/components/HyperparameterSummary.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/components/Launcher.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/components/ProgressBar.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/components/ProgressBarGroup.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/components/Timer.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/hooks/useLightningState.ts delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/index.css delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/index.tsx delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/lightning-colors.ts delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/react-app-env.d.ts delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/reportWebVitals.ts delete mode 100644 src/lightning/app/cli/pl-app-template/ui/src/types/lightning.ts delete mode 100644 src/lightning/app/cli/pl-app-template/ui/tsconfig.json delete mode 100644 src/lightning/app/cli/react-ui-template/README.md delete mode 100644 src/lightning/app/cli/react-ui-template/example_app.py delete mode 100644 src/lightning/app/cli/react-ui-template/ui/index.html delete mode 100644 src/lightning/app/cli/react-ui-template/ui/package.json delete mode 100644 src/lightning/app/cli/react-ui-template/ui/src/App.css delete mode 100644 src/lightning/app/cli/react-ui-template/ui/src/App.tsx delete mode 100644 src/lightning/app/cli/react-ui-template/ui/src/favicon.svg delete mode 100644 src/lightning/app/cli/react-ui-template/ui/src/hooks/useLightningState.ts delete mode 100644 src/lightning/app/cli/react-ui-template/ui/src/index.css delete mode 100644 src/lightning/app/cli/react-ui-template/ui/src/main.tsx delete mode 100644 src/lightning/app/cli/react-ui-template/ui/src/types/lightning.ts delete mode 100644 src/lightning/app/cli/react-ui-template/ui/src/vite-env.d.ts delete mode 100644 src/lightning/app/cli/react-ui-template/ui/tsconfig.json delete mode 100644 src/lightning/app/cli/react-ui-template/ui/tsconfig.node.json delete mode 100644 src/lightning/app/cli/react-ui-template/ui/vite.config.ts delete mode 100644 src/lightning/app/cli/react-ui-template/ui/yarn.lock delete mode 100644 
src/lightning/app/components/README.md delete mode 100644 src/lightning/app/components/__init__.py delete mode 100644 src/lightning/app/components/database/__init__.py delete mode 100644 src/lightning/app/components/database/client.py delete mode 100644 src/lightning/app/components/database/server.py delete mode 100644 src/lightning/app/components/database/utilities.py delete mode 100644 src/lightning/app/components/multi_node/__init__.py delete mode 100644 src/lightning/app/components/multi_node/base.py delete mode 100644 src/lightning/app/components/multi_node/fabric.py delete mode 100644 src/lightning/app/components/multi_node/pytorch_spawn.py delete mode 100644 src/lightning/app/components/multi_node/trainer.py delete mode 100644 src/lightning/app/components/python/__init__.py delete mode 100644 src/lightning/app/components/python/popen.py delete mode 100644 src/lightning/app/components/python/tracer.py delete mode 100644 src/lightning/app/components/serve/__init__.py delete mode 100644 src/lightning/app/components/serve/auto_scaler.py delete mode 100644 src/lightning/app/components/serve/catimage.png delete mode 100644 src/lightning/app/components/serve/cold_start_proxy.py delete mode 100644 src/lightning/app/components/serve/gradio_server.py delete mode 100644 src/lightning/app/components/serve/python_server.py delete mode 100644 src/lightning/app/components/serve/serve.py delete mode 100644 src/lightning/app/components/serve/streamlit.py delete mode 100644 src/lightning/app/components/serve/types/__init__.py delete mode 100644 src/lightning/app/components/serve/types/image.py delete mode 100644 src/lightning/app/components/serve/types/type.py delete mode 100644 src/lightning/app/components/training.py delete mode 100644 src/lightning/app/core/__init__.py delete mode 100644 src/lightning/app/core/api.py delete mode 100644 src/lightning/app/core/app.py delete mode 100644 src/lightning/app/core/constants.py delete mode 100644 src/lightning/app/core/flow.py delete mode 100644 src/lightning/app/core/queues.py delete mode 100644 src/lightning/app/core/work.py delete mode 100644 src/lightning/app/frontend/__init__.py delete mode 100644 src/lightning/app/frontend/frontend.py delete mode 100644 src/lightning/app/frontend/just_py/__init__.py delete mode 100644 src/lightning/app/frontend/just_py/just_py.py delete mode 100644 src/lightning/app/frontend/just_py/just_py_base.py delete mode 100644 src/lightning/app/frontend/panel/__init__.py delete mode 100644 src/lightning/app/frontend/panel/app_state_comm.py delete mode 100644 src/lightning/app/frontend/panel/app_state_watcher.py delete mode 100644 src/lightning/app/frontend/panel/panel_frontend.py delete mode 100644 src/lightning/app/frontend/panel/panel_serve_render_fn.py delete mode 100644 src/lightning/app/frontend/stream_lit.py delete mode 100644 src/lightning/app/frontend/streamlit_base.py delete mode 100644 src/lightning/app/frontend/utils.py delete mode 100644 src/lightning/app/frontend/web.py delete mode 100644 src/lightning/app/launcher/__init__.py delete mode 100644 src/lightning/app/launcher/launcher.py delete mode 100644 src/lightning/app/launcher/lightning_backend.py delete mode 100644 src/lightning/app/launcher/lightning_hybrid_backend.py delete mode 100644 src/lightning/app/launcher/utils.py delete mode 100644 src/lightning/app/pdb/__init__.py delete mode 100644 src/lightning/app/pdb/pdb.py delete mode 100644 src/lightning/app/plugin/__init__.py delete mode 100644 src/lightning/app/plugin/plugin.py delete mode 100644 
src/lightning/app/runners/__init__.py delete mode 100644 src/lightning/app/runners/backends/__init__.py delete mode 100644 src/lightning/app/runners/backends/backend.py delete mode 100644 src/lightning/app/runners/backends/cloud.py delete mode 100644 src/lightning/app/runners/backends/docker.py delete mode 100644 src/lightning/app/runners/backends/mp_process.py delete mode 100644 src/lightning/app/runners/cloud.py delete mode 100644 src/lightning/app/runners/multiprocess.py delete mode 100644 src/lightning/app/runners/runtime.py delete mode 100644 src/lightning/app/runners/runtime_type.py delete mode 100644 src/lightning/app/source_code/__init__.py delete mode 100644 src/lightning/app/source_code/copytree.py delete mode 100644 src/lightning/app/source_code/hashing.py delete mode 100644 src/lightning/app/source_code/local.py delete mode 100644 src/lightning/app/source_code/tar.py delete mode 100644 src/lightning/app/source_code/uploader.py delete mode 100644 src/lightning/app/storage/__init__.py delete mode 100644 src/lightning/app/storage/copier.py delete mode 100644 src/lightning/app/storage/drive.py delete mode 100644 src/lightning/app/storage/filesystem.py delete mode 100644 src/lightning/app/storage/mount.py delete mode 100644 src/lightning/app/storage/orchestrator.py delete mode 100644 src/lightning/app/storage/path.py delete mode 100644 src/lightning/app/storage/payload.py delete mode 100644 src/lightning/app/storage/requests.py delete mode 100644 src/lightning/app/structures/__init__.py delete mode 100644 src/lightning/app/structures/dict.py delete mode 100644 src/lightning/app/structures/list.py delete mode 100644 src/lightning/app/testing/__init__.py delete mode 100644 src/lightning/app/testing/config.py delete mode 100644 src/lightning/app/testing/helpers.py delete mode 100644 src/lightning/app/testing/testing.py delete mode 100644 src/lightning/app/utilities/__init__.py delete mode 100644 src/lightning/app/utilities/app_commands.py delete mode 100644 src/lightning/app/utilities/app_helpers.py delete mode 100644 src/lightning/app/utilities/app_logs.py delete mode 100644 src/lightning/app/utilities/app_status.py delete mode 100644 src/lightning/app/utilities/auth.py delete mode 100644 src/lightning/app/utilities/cli_helpers.py delete mode 100644 src/lightning/app/utilities/cloud.py delete mode 100644 src/lightning/app/utilities/clusters.py delete mode 100644 src/lightning/app/utilities/commands/__init__.py delete mode 100644 src/lightning/app/utilities/commands/base.py delete mode 100644 src/lightning/app/utilities/component.py delete mode 100644 src/lightning/app/utilities/data_structures.py delete mode 100644 src/lightning/app/utilities/dependency_caching.py delete mode 100644 src/lightning/app/utilities/enum.py delete mode 100644 src/lightning/app/utilities/exceptions.py delete mode 100644 src/lightning/app/utilities/frontend.py delete mode 100644 src/lightning/app/utilities/git.py delete mode 100644 src/lightning/app/utilities/imports.py delete mode 100644 src/lightning/app/utilities/introspection.py delete mode 100644 src/lightning/app/utilities/layout.py delete mode 100644 src/lightning/app/utilities/load_app.py delete mode 100644 src/lightning/app/utilities/log.py delete mode 100644 src/lightning/app/utilities/log_helpers.py delete mode 100644 src/lightning/app/utilities/login.py delete mode 100644 src/lightning/app/utilities/logs_socket_api.py delete mode 100644 src/lightning/app/utilities/name_generator.py delete mode 100644 src/lightning/app/utilities/network.py delete 
mode 100644 src/lightning/app/utilities/openapi.py delete mode 100644 src/lightning/app/utilities/packaging/__init__.py delete mode 100644 src/lightning/app/utilities/packaging/app_config.py delete mode 100644 src/lightning/app/utilities/packaging/build_config.py delete mode 100644 src/lightning/app/utilities/packaging/cloud_compute.py delete mode 100644 src/lightning/app/utilities/packaging/docker.py delete mode 100644 src/lightning/app/utilities/packaging/lightning_utils.py delete mode 100644 src/lightning/app/utilities/packaging/tarfile.py delete mode 100644 src/lightning/app/utilities/port.py delete mode 100644 src/lightning/app/utilities/proxies.py delete mode 100644 src/lightning/app/utilities/redis.py delete mode 100644 src/lightning/app/utilities/safe_pickle.py delete mode 100644 src/lightning/app/utilities/scheduler.py delete mode 100644 src/lightning/app/utilities/secrets.py delete mode 100644 src/lightning/app/utilities/state.py delete mode 100644 src/lightning/app/utilities/tracer.py delete mode 100644 src/lightning/app/utilities/tree.py delete mode 100644 src/lightning/app/utilities/types.py delete mode 100644 src/lightning/app/utilities/warnings.py delete mode 100644 src/lightning/store/README.md delete mode 100644 src/lightning/store/__init__.py delete mode 100644 src/lightning/store/store.py delete mode 100644 src/lightning/store/utils.py delete mode 100644 src/lightning_app/MANIFEST.in delete mode 100644 src/lightning_app/README.md delete mode 100644 src/lightning_app/__about__.py delete mode 100644 src/lightning_app/__main__.py delete mode 100644 src/lightning_app/__setup__.py delete mode 100644 src/lightning_app/__version__.py delete mode 100644 src/lightning_app/py.typed delete mode 100644 src/lightning_app/shell-folder_code-lives-lightning.info delete mode 100644 tests/integrations_app/__init__.py delete mode 100644 tests/integrations_app/apps/collect_failures/__init__.py delete mode 100644 tests/integrations_app/apps/collect_failures/app.py delete mode 100644 tests/integrations_app/apps/collect_failures/requirements.txt delete mode 100644 tests/integrations_app/apps/core_features_app/__init__.py delete mode 100644 tests/integrations_app/apps/core_features_app/app.py delete mode 100644 tests/integrations_app/apps/custom_work_dependencies/__init__.py delete mode 100644 tests/integrations_app/apps/custom_work_dependencies/app.py delete mode 100644 tests/integrations_app/apps/idle_timeout/__init__.py delete mode 100644 tests/integrations_app/apps/idle_timeout/app.py delete mode 100644 tests/integrations_app/conftest.py delete mode 100644 tests/integrations_app/flagship/__init__.py delete mode 100644 tests/integrations_app/flagship/test_flashy.py delete mode 100644 tests/integrations_app/flagship/test_jupyter.py delete mode 100644 tests/integrations_app/flagship/test_muse.py delete mode 100644 tests/integrations_app/local/__init__.py delete mode 100644 tests/integrations_app/local/test_collect_failures.py delete mode 100644 tests/integrations_app/local/test_core_features_app.py delete mode 100644 tests/integrations_app/local/test_custom_work_dependencies.py delete mode 100644 tests/integrations_app/local/test_idle_timeout.py delete mode 100644 tests/integrations_app/public/__init__.py delete mode 100644 tests/integrations_app/public/test_app_dag.py delete mode 100644 tests/integrations_app/public/test_argparse.py delete mode 100644 tests/integrations_app/public/test_boring_app.py delete mode 100644 tests/integrations_app/public/test_commands_and_api.py delete mode 100644 
tests/integrations_app/public/test_drive.py delete mode 100644 tests/integrations_app/public/test_gradio.py delete mode 100644 tests/integrations_app/public/test_installation_commands_app.py delete mode 100644 tests/integrations_app/public/test_layout.py delete mode 100644 tests/integrations_app/public/test_multi_node.py delete mode 100644 tests/integrations_app/public/test_payload.py delete mode 100644 tests/integrations_app/public/test_pickle_or_not.py delete mode 100644 tests/integrations_app/public/test_quick_start.py delete mode 100644 tests/integrations_app/public/test_scripts.py delete mode 100644 tests/integrations_app/public/test_template_react_ui.py delete mode 100644 tests/integrations_app/public/test_template_streamlit_ui.py delete mode 100644 tests/integrations_app/public/test_v0_app.py delete mode 100644 tests/tests_app/__init__.py delete mode 100644 tests/tests_app/cli/__init__.py delete mode 100644 tests/tests_app/cli/jsons/connect_1.json delete mode 100644 tests/tests_app/cli/launch_data/app_v0/__init__.py delete mode 100644 tests/tests_app/cli/launch_data/app_v0/app.py delete mode 100644 tests/tests_app/cli/launch_data/app_v0/ui/a/index.html delete mode 100644 tests/tests_app/cli/launch_data/app_v0/ui/b/index.html delete mode 100644 tests/tests_app/cli/test_cd.py delete mode 100644 tests/tests_app/cli/test_cli.py delete mode 100644 tests/tests_app/cli/test_cloud_cli.py delete mode 100644 tests/tests_app/cli/test_cmd_apps.py delete mode 100644 tests/tests_app/cli/test_cmd_cli_delete.py delete mode 100644 tests/tests_app/cli/test_cmd_init.py delete mode 100644 tests/tests_app/cli/test_cmd_install.py delete mode 100644 tests/tests_app/cli/test_cmd_launch.py delete mode 100644 tests/tests_app/cli/test_cmd_pl_init.py delete mode 100644 tests/tests_app/cli/test_cmd_react_ui_init.py delete mode 100644 tests/tests_app/cli/test_cmd_show_logs.py delete mode 100644 tests/tests_app/cli/test_connect.py delete mode 100644 tests/tests_app/cli/test_connect_data.py delete mode 100644 tests/tests_app/cli/test_cp.py delete mode 100644 tests/tests_app/cli/test_ls.py delete mode 100644 tests/tests_app/cli/test_rm.py delete mode 100644 tests/tests_app/cli/test_run_app.py delete mode 100644 tests/tests_app/components/__init__.py delete mode 100644 tests/tests_app/components/database/test_client_server.py delete mode 100644 tests/tests_app/components/multi_node/__init__.py delete mode 100644 tests/tests_app/components/multi_node/test_base.py delete mode 100644 tests/tests_app/components/multi_node/test_fabric.py delete mode 100644 tests/tests_app/components/multi_node/test_trainer.py delete mode 100644 tests/tests_app/components/python/scripts/a.py delete mode 100644 tests/tests_app/components/python/scripts/b.py delete mode 100644 tests/tests_app/components/python/scripts/c.py delete mode 100644 tests/tests_app/components/python/test_python.py delete mode 100644 tests/tests_app/components/sample_package_repo/external_lightning_component_package/__init__.py delete mode 100644 tests/tests_app/components/sample_package_repo/setup.py delete mode 100644 tests/tests_app/components/serve/test_auto_scaler.py delete mode 100644 tests/tests_app/components/serve/test_model_inference_api.py delete mode 100644 tests/tests_app/components/serve/test_python_server.py delete mode 100644 tests/tests_app/components/serve/test_streamlit.py delete mode 100644 tests/tests_app/conftest.py delete mode 100644 tests/tests_app/core/__init__.py delete mode 100644 tests/tests_app/core/lightning_app/__init__.py delete mode 
100644 tests/tests_app/core/lightning_app/test_configure_layout.py delete mode 100644 tests/tests_app/core/scripts/app_metadata.py delete mode 100644 tests/tests_app/core/scripts/app_with_env.py delete mode 100644 tests/tests_app/core/scripts/app_with_local_import.py delete mode 100644 tests/tests_app/core/scripts/empty.py delete mode 100644 tests/tests_app/core/scripts/example_1.py delete mode 100644 tests/tests_app/core/scripts/example_2.py delete mode 100644 tests/tests_app/core/scripts/lightning_cli.py delete mode 100644 tests/tests_app/core/scripts/lightning_overrides.py delete mode 100644 tests/tests_app/core/scripts/lightning_trainer.py delete mode 100644 tests/tests_app/core/scripts/registry.py delete mode 100644 tests/tests_app/core/scripts/script_with_error.py delete mode 100644 tests/tests_app/core/scripts/two_apps.py delete mode 100644 tests/tests_app/core/test_constants.py delete mode 100644 tests/tests_app/core/test_lightning_api.py delete mode 100644 tests/tests_app/core/test_lightning_app.py delete mode 100644 tests/tests_app/core/test_lightning_flow.py delete mode 100644 tests/tests_app/core/test_lightning_work.py delete mode 100644 tests/tests_app/core/test_queues.py delete mode 100644 tests/tests_app/frontend/__init__.py delete mode 100644 tests/tests_app/frontend/conftest.py delete mode 100644 tests/tests_app/frontend/just_py/test_just_py.py delete mode 100644 tests/tests_app/frontend/panel/__init__.py delete mode 100644 tests/tests_app/frontend/panel/app_panel.py delete mode 100644 tests/tests_app/frontend/panel/test_app_state_comm.py delete mode 100644 tests/tests_app/frontend/panel/test_app_state_watcher.py delete mode 100644 tests/tests_app/frontend/panel/test_panel_frontend.py delete mode 100644 tests/tests_app/frontend/panel/test_panel_serve_render_fn.py delete mode 100644 tests/tests_app/frontend/test_stream_lit.py delete mode 100644 tests/tests_app/frontend/test_utils.py delete mode 100644 tests/tests_app/frontend/test_web.py delete mode 100644 tests/tests_app/frontend/utilities/__init__.py delete mode 100644 tests/tests_app/helpers/__init__.py delete mode 100644 tests/tests_app/launcher/test_lightning_backend.py delete mode 100644 tests/tests_app/launcher/test_lightning_hydrid.py delete mode 100644 tests/tests_app/launcher/test_running_flow.py delete mode 100644 tests/tests_app/plugin/__init__.py delete mode 100644 tests/tests_app/plugin/test_plugin.py delete mode 100644 tests/tests_app/runners/__init__.py delete mode 100644 tests/tests_app/runners/backends/__init__.py delete mode 100644 tests/tests_app/runners/backends/test_mp_process.py delete mode 100644 tests/tests_app/runners/test_cloud.py delete mode 100644 tests/tests_app/runners/test_multiprocess.py delete mode 100644 tests/tests_app/runners/test_runtime.py delete mode 100644 tests/tests_app/source_code/test_copytree.py delete mode 100644 tests/tests_app/source_code/test_local.py delete mode 100644 tests/tests_app/source_code/test_tar.py delete mode 100644 tests/tests_app/source_code/test_uploader.py delete mode 100644 tests/tests_app/storage/__init__.py delete mode 100644 tests/tests_app/storage/test_copier.py delete mode 100644 tests/tests_app/storage/test_drive.py delete mode 100644 tests/tests_app/storage/test_filesystem.py delete mode 100644 tests/tests_app/storage/test_mount.py delete mode 100644 tests/tests_app/storage/test_orchestrator.py delete mode 100644 tests/tests_app/storage/test_path.py delete mode 100644 tests/tests_app/storage/test_payload.py delete mode 100644 
tests/tests_app/structures/__init__.py delete mode 100644 tests/tests_app/structures/test_structures.py delete mode 100644 tests/tests_app/test_imports.py delete mode 100644 tests/tests_app/utilities/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/Dockerfile.cpu delete mode 100644 tests/tests_app/utilities/packaging/projects/dock/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dock/app.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dock/compo/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dock/compo/a/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dock/compo/a/a.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dock/compo/b/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dock/compo/b/b.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dockerfile/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dockerfile/app.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dockerfile/comp_dockerfile/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dockerfile/comp_dockerfile/a/Dockerfile delete mode 100644 tests/tests_app/utilities/packaging/projects/dockerfile/comp_dockerfile/a/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dockerfile/comp_dockerfile/a/a.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dockerfile/comp_dockerfile/b/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/dockerfile/comp_dockerfile/b/b.py delete mode 100644 tests/tests_app/utilities/packaging/projects/no_req/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/no_req/app.py delete mode 100644 tests/tests_app/utilities/packaging/projects/no_req/comp/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/no_req/comp/a/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/no_req/comp/a/a.py delete mode 100644 tests/tests_app/utilities/packaging/projects/no_req/comp/b/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/no_req/comp/b/b.py delete mode 100644 tests/tests_app/utilities/packaging/projects/req/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/req/app.py delete mode 100644 tests/tests_app/utilities/packaging/projects/req/comp_req/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/req/comp_req/a/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/req/comp_req/a/a.py delete mode 100644 tests/tests_app/utilities/packaging/projects/req/comp_req/a/requirements.txt delete mode 100644 tests/tests_app/utilities/packaging/projects/req/comp_req/b/__init__.py delete mode 100644 tests/tests_app/utilities/packaging/projects/req/comp_req/b/b.py delete mode 100644 tests/tests_app/utilities/packaging/projects/requirements.txt delete mode 100644 tests/tests_app/utilities/packaging/test_app_config.py delete mode 100644 tests/tests_app/utilities/packaging/test_build_spec.py delete mode 100644 tests/tests_app/utilities/packaging/test_cloud_compute.py delete mode 100644 tests/tests_app/utilities/packaging/test_docker.py delete mode 100644 tests/tests_app/utilities/packaging/test_lightning_utils.py delete mode 100644 tests/tests_app/utilities/test_app_commands.py delete mode 100644 
tests/tests_app/utilities/test_app_helpers.py delete mode 100644 tests/tests_app/utilities/test_app_logs.py delete mode 100644 tests/tests_app/utilities/test_auth.py delete mode 100644 tests/tests_app/utilities/test_cli_helpers.py delete mode 100644 tests/tests_app/utilities/test_cloud.py delete mode 100644 tests/tests_app/utilities/test_commands.py delete mode 100644 tests/tests_app/utilities/test_component.py delete mode 100644 tests/tests_app/utilities/test_dependency_caching.py delete mode 100644 tests/tests_app/utilities/test_exceptions.py delete mode 100644 tests/tests_app/utilities/test_git.py delete mode 100644 tests/tests_app/utilities/test_imports.py delete mode 100644 tests/tests_app/utilities/test_introspection.py delete mode 100644 tests/tests_app/utilities/test_layout.py delete mode 100644 tests/tests_app/utilities/test_load_app.py delete mode 100644 tests/tests_app/utilities/test_log_helpers.py delete mode 100644 tests/tests_app/utilities/test_login.py delete mode 100644 tests/tests_app/utilities/test_network.py delete mode 100644 tests/tests_app/utilities/test_port.py delete mode 100644 tests/tests_app/utilities/test_proxies.py delete mode 100644 tests/tests_app/utilities/test_safe_pickle.py delete mode 100644 tests/tests_app/utilities/test_secrets.py delete mode 100644 tests/tests_app/utilities/test_state.py delete mode 100644 tests/tests_app/utilities/test_tracer.py delete mode 100644 tests/tests_app/utilities/test_tree.py delete mode 100644 tests/tests_app/utilities/testdata/app_commands/app_commands_to_ignore.txt delete mode 100644 tests/tests_app/utilities/testdata/app_commands/bang_not_at_start_of_line.txt delete mode 100644 tests/tests_app/utilities/testdata/app_commands/command_after_first_non_comment_line.txt delete mode 100644 tests/tests_app/utilities/testdata/app_commands/commands_with_mixed_comments_1.txt delete mode 100644 tests/tests_app/utilities/testdata/app_commands/commands_with_mixed_comments_2.txt delete mode 100644 tests/tests_app/utilities/testdata/app_commands/multiple_commands.txt delete mode 100644 tests/tests_app/utilities/testdata/app_commands/multiple_spaces_between_band_and_command.txt delete mode 100644 tests/tests_app/utilities/testdata/app_commands/single_command.txt delete mode 100644 tests/tests_app/utilities/testdata/app_commands/space_between_bang_and_command.txt delete mode 100644 tests/tests_app/utilities/testdata/safe_pickle_app.py delete mode 100644 tests/tests_store/__init__.py delete mode 100644 tests/tests_store/test_store.py diff --git a/.actions/assistant.py b/.actions/assistant.py index 664f3e8a89e75..c1e2f08c340cb 100644 --- a/.actions/assistant.py +++ b/.actions/assistant.py @@ -14,10 +14,8 @@ import glob import logging import os -import pathlib import re import shutil -import tarfile import tempfile import urllib.request from distutils.version import LooseVersion @@ -35,11 +33,6 @@ "requirements/pytorch/strategies.txt", "requirements/pytorch/examples.txt", ), - "app": ( - "requirements/app/app.txt", - "requirements/app/cloud.txt", - "requirements/app/ui.txt", - ), "fabric": ( "requirements/fabric/base.txt", "requirements/fabric/strategies.txt", @@ -216,30 +209,6 @@ def distribute_version(src_folder: str, ver_file: str = "version.info") -> None: shutil.copy2(ver_template, fpath) -def _download_frontend(pkg_path: str, version: str = "v0.0.0"): - """Downloads an archive file for a specific release of the Lightning frontend and extracts it to the correct - directory.""" - - try: - frontend_dir = pathlib.Path(pkg_path, "ui") - 
download_dir = tempfile.mkdtemp() - - shutil.rmtree(frontend_dir, ignore_errors=True) - # TODO: remove this once lightning-ui package is ready as a dependency - frontend_release_url = f"https://lightning-packages.s3.amazonaws.com/ui/{version}.tar.gz" - response = urllib.request.urlopen(frontend_release_url) - - file = tarfile.open(fileobj=response, mode="r|gz") - file.extractall(path=download_dir) # noqa: S202 - - shutil.move(download_dir, frontend_dir) - print("The Lightning UI has successfully been downloaded!") - - # If installing from source without internet connection, we don't want to break the installation - except Exception: - print("The Lightning UI downloading has failed!") - - def _load_aggregate_requirements(req_dir: str = "requirements", freeze_requirements: bool = False) -> None: """Load all base requirements from all particular packages and prune duplicates. @@ -466,7 +435,7 @@ def pull_docs_files( raise RuntimeError(f"Requesting file '{zip_url}' does not exist or it is just unavailable.") with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(tmp) # noqa: S202 + zip_ref.extractall(tmp) zip_dirs = [d for d in glob.glob(os.path.join(tmp, "*")) if os.path.isdir(d)] # check that the extracted archive has only repo folder diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml deleted file mode 100644 index 53174c42aba02..0000000000000 --- a/.azure/app-cloud-e2e.yml +++ /dev/null @@ -1,206 +0,0 @@ -# Python package -# Create and test a Python package on multiple Python versions. -# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: -# https://docs.microsoft.com/azure/devops/pipelines/languages/python - -trigger: - tags: - include: - - "*" - branches: - include: - - "master" - - "release/*" - - "refs/tags/*" - -schedules: - - cron: "0 0 * * *" # At the end of every day - displayName: Daily midnight testing - branches: - include: - - "master" - -pr: - branches: - include: - - "master" - - "release/*" - paths: - include: - - ".actions/*" - - ".azure/app-cloud-e2e.yml" - - "src/lightning/__about__.py" - - "src/lightning/__init__.py" - - "src/lightning/__main__.py" - - "src/lightning/__setup__.py" - - "src/lightning/__version__.py" - - "src/lightning/app/**" - - "src/lightning_app/*" - - "examples/app/**" - - "requirements/app/**" - - "tests/integrations_app/**" - - "setup.py" - exclude: - - "!tests/integrations_app/flagship/**" - - "requirements/*/docs.txt" - - "*.md" - - "**/*.md" - -# variables are automatically exported as environment variables so this will override pip's default cache dir -variables: - - name: pip_cache_dir - value: $(Pipeline.Workspace)/.pip - - name: local_id - value: $(Build.BuildId) - - name: video_artifact_dir - value: ./videos - -jobs: - - job: test_e2e - pool: "azure-cpus" - container: - # see all available tags: https://mcr.microsoft.com/en-us/product/playwright/python/tags - image: mcr.microsoft.com/playwright/python:v1.38.0-focal - options: "--shm-size=4gb" - strategy: - matrix: - "App: v0_app": - name: "v0_app" - dir: "public" - "App: boring_app": - name: "boring_app" - dir: "public" - "App: template_streamlit_ui": - name: "template_streamlit_ui" - dir: "public" - "App: template_react_ui": - name: "template_react_ui" - dir: "public" - # 'App: template_jupyterlab': # TODO: clarify where these files lives - # name: "template_jupyterlab" - "App: installation_commands_app": - name: "installation_commands_app" - dir: "public" - "App: drive": - name: "drive" - dir: "public" 
- "App: payload": - name: "payload" - dir: "public" - "App: commands_and_api": - name: "commands_and_api" - dir: "public" - "App: quick_start": - name: "quick_start" - dir: "public" - "App: idle_timeout": - name: "idle_timeout" - dir: "local" - "App: collect_failures": - name: "collect_failures" - dir: "local" - "App: custom_work_dependencies": - name: "custom_work_dependencies" - dir: "local" - timeoutInMinutes: "15" - cancelTimeoutInMinutes: "1" - # values: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace - workspace: - clean: all - variables: - FREEZE_REQUIREMENTS: "1" - HEADLESS: "1" - PACKAGE_LIGHTNING: "1" - CLOUD: "1" - VIDEO_LOCATION: $(video_artifact_dir) - PR_NUMBER: $(local_id) - TEST_APP_NAME: $(name) - TEST_APP_FOLDER: $(dir) - HAR_LOCATION: "./artifacts/hars" - SLOW_MO: "50" - LIGHTNING_DEBUG: "1" - steps: - - script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)' - displayName: "Set id for this PR" - condition: eq(variables['Build.Reason'], 'PullRequest') - - - bash: | - whoami - mkdir -p "$(video_artifact_dir)/$(name)" - printf "local id: $(local_id)\n" - python --version - pip --version - echo "allow fail: ${{ in(variables['name'], 'quick_start', 'template_react_ui') }}" - displayName: "Info" - - # TODO: we are testing it as `lightning`, so add also version for `lightning_app` - - bash: | - pip install -e .[app-dev] \ - -f https://download.pytorch.org/whl/cpu/torch_stable.html - displayName: "Install Lightning & dependencies" - - - bash: python -m playwright install # --with-deps - displayName: "Install Playwright system dependencies" - - # The magic happens here it doesn't need to install the quick start dependencies. - # This test is very important to test the main user story of lightning app. - # It also e2e tests running on cloud without installing dependencies. 
- - bash: | - git clone https://github.com/Lightning-AI/lightning-quick-start examples/app/quick-start - # without succeeded(), this step could run even if the job has already failed - condition: and(succeeded(), eq(variables['name'], 'quick_start')) - displayName: "Clone Quick Start Repo" - - bash: | - git clone https://github.com/Lightning-AI/lightning-template-react examples/app/template_react_ui - # without succeeded(), this step could run even if the job has already failed - condition: and(succeeded(), eq(variables['name'], 'template_react_ui')) - displayName: "Clone Template React UI Repo" - - # Replace imports to use `lightning` instead of `lightning_app` since we install lightning only ATM - - bash: | - pip install -q -r .actions/requirements.txt - python .actions/assistant.py copy_replace_imports \ - --source_dir="./examples" --source_import="lightning_app" --target_import="lightning.app" - displayName: "Adjust examples" - - - bash: pip --version && pip list - displayName: "List pip dependencies" - - - bash: | - ls -l examples/app/$(TEST_APP_NAME) - echo ${TEST_FILE} - python -m pytest ${TEST_FILE}::test_$(TEST_APP_NAME)_example_cloud \ - --timeout=360 --capture=no -v --color=yes - env: - TEST_FILE: tests/integrations_app/$(TEST_APP_FOLDER)/test_$(TEST_APP_NAME).py - #LAI_USER: $(LAI_USER) # for STAGING - #LAI_PASS: $(LAI_PASS) # for STAGING - LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) - LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) - LIGHTNING_USERNAME: $(LIGHTNING_USERNAME_PROD) - LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) - # TODO: investigate why these apps are failing - continueOnError: ${{ in(variables['name'], 'quick_start', 'template_react_ui') }} - displayName: "Run the tests" - - - task: PublishPipelineArtifact@1 - condition: failed() - inputs: - path: "$(video_artifact_dir)/$(name)" - artifactName: $(name) - publishLocation: "pipeline" - displayName: "Publish videos" - - - bash: | - time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" - condition: always() - continueOnError: "true" - timeoutInMinutes: "3" - env: - #LAI_USER: $(LAI_USER) # for STAGING - #LAI_PASS: $(LAI_PASS) # for STAGING - LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD) - LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD) - LIGHTNING_USERNAME: $(LIGHTNING_USERNAME_PROD) - LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD) - displayName: "Clean Previous Apps" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 821543cea4438..cdc2b63b2379d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,7 +11,6 @@ /.actions/ @borda @ethanwharris @justusschock /.github/ @borda @ethanwharris @justusschock /.azure/ @borda @ethanwharris @justusschock -/.azure/app-cloud-e2e.yml @awaelchli @ethanwharris @lantiga /dockers/ @borda @ethanwharris @justusschock *.yml @borda @ethanwharris @justusschock @@ -25,7 +24,6 @@ /docs/source-pytorch/conf.py @borda @awaelchli /docs/source-pytorch/index.rst @williamfalcon @lantiga /docs/source-pytorch/levels @williamfalcon @lantiga -/docs/source-app/ @williamfalcon @lantiga @tchaton # PyTorch Lightning /src/lightning/pytorch @lantiga @borda @tchaton @awaelchli @justusschock @@ -36,18 +34,10 @@ # Lightning Fabric /src/lightning/fabric @lantiga @borda @tchaton @awaelchli @justusschock -# Lightning App -/src/lightning/app @tchaton @lantiga @awaelchli @ethanwharris -/src/lightning_app @tchaton @lantiga @awaelchli @ethanwharris -/tests/tests_app @tchaton @lantiga @awaelchli @ethanwharris -/tests/integrations_app @tchaton @lantiga @awaelchli @ethanwharris
-/examples/app_* @tchaton @lantiga @awaelchli @ethanwharris - /.github/CODEOWNERS @williamfalcon /SECURITY.md @williamfalcon @lantiga /README.md @williamfalcon @lantiga /setup.py @williamfalcon @borda /src/pytorch_lightning/__about__.py @williamfalcon @borda -/src/lightning_app/__about__.py @williamfalcon @lantiga @borda /src/lightning_fabric/__about__.py @williamfalcon @borda @awaelchli /src/*/__setup__.py @borda @justusschock diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index db6543b8cb40e..775dc5dee77dc 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -310,20 +310,6 @@ and the last true master commit is `ccc111` and your first commit is `mmm222`. git push -f ``` -#### How to run an app on the cloud with a local version of lightning - -The lightning cloud uses the latest release by default. However, you might want to run your app with some local changes you've made to the lightning framework. To use your local version of lightning on the cloud, set the following environment variable: - -```bash -git clone https://github.com/Lightning-AI/lightning.git -cd lightning -pip install -e . -export PACKAGE_LIGHTNING=1 # <- this is the magic to use your version (not mainstream PyPI)! -lightning run app app.py --cloud -``` - -By setting `PACKAGE_LIGHTNING=1`, lightning packages the lightning source code in your local directory in addition to your app source code and uploads them to the cloud. - ### Bonus Workflow Tip If you don't want to remember all the commands above every time you want to push some code / set up a Lightning Dev environment on a new VM, you can set up bash aliases for some common commands. You can add these to one of your `~/.bashrc`, `~/.zshrc`, or `~/.bash_aliases` files. diff --git a/.github/ISSUE_TEMPLATE/1_bug_report.yaml b/.github/ISSUE_TEMPLATE/1_bug_report.yaml index e6037c6d3bbb0..a3e2cfabe58f9 100644 --- a/.github/ISSUE_TEMPLATE/1_bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/1_bug_report.yaml @@ -91,9 +91,7 @@ body: Current environment ``` - #- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow): #- PyTorch Lightning Version (e.g., 1.5.0): - #- Lightning App Version (e.g., 0.5.2): #- PyTorch Version (e.g., 2.0): #- Python version (e.g., 3.9): #- OS (e.g., Linux): diff --git a/.github/actions/pkg-install/action.yml b/.github/actions/pkg-install/action.yml index 96aab24a66a4c..e379f37aef68a 100644 --- a/.github/actions/pkg-install/action.yml +++ b/.github/actions/pkg-install/action.yml @@ -25,7 +25,7 @@ runs: run: | import os, glob - lut = {'app': 'lightning_app', 'fabric': 'lightning_fabric', 'pytorch': 'pytorch_lightning'} + lut = {'fabric': 'lightning_fabric', 'pytorch': 'pytorch_lightning'} act_pkg = lut.get('${{inputs.pkg-name}}', 'lightning') pkg_sdist = glob.glob('*.tar.gz')[0] pkg_wheel = glob.glob('*.whl')[0] diff --git a/.github/actions/prep-apps/action.yml b/.github/actions/prep-apps/action.yml deleted file mode 100644 index 2cd1655e2a36d..0000000000000 --- a/.github/actions/prep-apps/action.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Adjust App environment -description: make adjustments specific to the selected App - -inputs: - name: - description: application name - required: true - -runs: - using: "composite" - steps: - - name: adjust env -> Flashy - if: inputs.name == 'flashy' - working-directory: tests/_flagship-app - run: | - ls -l . - pip install -r requirements-dev.txt -f $TORCH_URL - pip install -e .
-f $TORCH_URL - shell: bash - - - name: adjust env -> Muse - if: inputs.name == 'muse' - working-directory: tests/ - run: | - pip install -e _flagship-app -f $TORCH_URL - cp _flagship-app/tests/test_app.py \ - integrations_app/flagship/test_${{ inputs.name }}.py - shell: bash - - - name: adjust env -> Jupyter - if: inputs.name == 'jupyter' - working-directory: tests/ - run: | - pip install -e _flagship-app -f $TORCH_URL - # pip install -r _flagship-app/tests/requirements-dev.txt - cp _flagship-app/tests/test_jupyter_app.py \ - integrations_app/flagship/test_${{ inputs.name }}.py - shell: bash diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 79b65664d2eb8..3774a56e2f480 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -213,121 +213,6 @@ subprojects: checks: - "test-on-tpus (pytorch, pjrt, v4-8)" - # SECTION: lightning_app - - - id: "lightning_app: Tests workflow" - paths: - - ".actions/*" - - ".github/workflows/ci-tests-app.yml" - - "src/lightning/app/**" - - "src/lightning_app/*" - - "tests/tests_app/**" - - "requirements/app/**" - - "setup.py" - - "!requirements/*/docs.txt" - - "!*.md" - - "!**/*.md" - checks: - - "app-pytest (macOS-12, lightning, 3.8, latest)" - - "app-pytest (macOS-12, lightning, 3.8, oldest)" - - "app-pytest (macOS-12, app, 3.9, latest)" - - "app-pytest (macOS-12, app, 3.11, latest)" - - "app-pytest (ubuntu-20.04, lightning, 3.8, latest)" - - "app-pytest (ubuntu-20.04, lightning, 3.8, oldest)" - - "app-pytest (ubuntu-20.04, app, 3.9, latest)" - - "app-pytest (ubuntu-22.04, app, 3.11, latest)" - - "app-pytest (windows-2022, lightning, 3.8, latest)" - - "app-pytest (windows-2022, lightning, 3.8, oldest)" - - "app-pytest (windows-2022, app, 3.8, latest)" - - "app-pytest (windows-2022, app, 3.11, latest)" - - - id: "lightning_app: Examples" - paths: - - ".actions/*" - - ".github/workflows/ci-examples-app.yml" - - "src/lightning/app/**" - - "src/lightning_app/*" - - "tests/integrations_app/**" - - "!tests/integrations_app/flagship/**" - - "examples/app/**" - - "requirements/app/**" - - "setup.py" - - "!requirements/*/docs.txt" - - "!*.md" - - "!**/*.md" - checks: - - "app-examples (macOS-12, lightning, 3.9, latest)" - - "app-examples (macOS-12, lightning, 3.9, oldest)" - - "app-examples (macOS-12, app, 3.9, latest)" - - "app-examples (ubuntu-20.04, lightning, 3.9, latest)" - - "app-examples (ubuntu-20.04, lightning, 3.9, oldest)" - - "app-examples (ubuntu-20.04, app, 3.9, latest)" - - "app-examples (windows-2022, lightning, 3.9, latest)" - - "app-examples (windows-2022, lightning, 3.9, oldest)" - - "app-examples (windows-2022, app, 3.9, latest)" - - #- id: "lightning: Flagships" - # paths: - # - ".github/workflows/_flagship-apps.yml" - # - ".github/workflows/ci-flagship-apps.yml" - # - "github/actions/prep-apps/action.yml" - # - "tests/integrations_app/flagship/**" - # checks: - # - "test-flagships / run-flagships (flashy, Lightning-Universe/Flashy-app)" - - - id: "lightning: Store" - paths: - - ".github/workflows/ci-tests-store.yml" - - "src/lightning/__init__.py" - - "src/lightning/__setup__.py" - - "src/lightning/__version__.py" - - "src/lightning/store/**" - - "tests/tests_store/**" - checks: - - "store-cpu (macOS-14, lightning, 3.10, 2.0)" - - "store-cpu (ubuntu-20.04, lightning, 3.10, 2.0)" - - "store-cpu (windows-2022, lightning, 3.10, 2.0)" - - # FixMe: re-enable when BE stabilize - # - id: "lightning_app: Azure" - # paths: - # - ".actions/*" - # - ".azure/app-cloud-e2e.yml" - # - "src/lightning/__about__.py" - # - 
"src/lightning/__init__.py" - # - "src/lightning/__main__.py" - # - "src/lightning/__setup__.py" - # - "src/lightning/__version__.py" - # - "src/lightning/app/**" - # - "src/lightning_app/*" - # - "examples/app/**" - # - "requirements/app/**" - # - "tests/integrations_app/**" - # - "!tests/integrations_app/flagship/**" - # - "setup.py" - # - "!requirements/*/docs.txt" - # - "!*.md" - # - "!**/*.md" - # checks: - # - "App.cloud-e2e" - - - id: "lightning_app: Docs" - paths: - - ".actions/*" - - "src/lightning/app/**" - - "src/lightning_app/*" - - "docs/source-app/**" - - ".github/workflows/docs-build.yml" - - "requirements/docs.txt" - - "requirements/app/**" - - "setup.py" - - "pyproject.toml" # includes metadata used in the package creation - - "!*.md" - - "!**/*.md" - checks: - - "docs-make (app, doctest)" - - "docs-make (app, html)" - # SECTION: common - id: "mypy" @@ -359,8 +244,6 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "install-pkg (ubuntu-22.04, app, 3.8)" - - "install-pkg (ubuntu-22.04, app, 3.11)" - "install-pkg (ubuntu-22.04, fabric, 3.8)" - "install-pkg (ubuntu-22.04, fabric, 3.11)" - "install-pkg (ubuntu-22.04, pytorch, 3.8)" @@ -369,8 +252,6 @@ subprojects: - "install-pkg (ubuntu-22.04, lightning, 3.11)" - "install-pkg (ubuntu-22.04, notset, 3.8)" - "install-pkg (ubuntu-22.04, notset, 3.11)" - - "install-pkg (macOS-12, app, 3.8)" - - "install-pkg (macOS-12, app, 3.11)" - "install-pkg (macOS-12, fabric, 3.8)" - "install-pkg (macOS-12, fabric, 3.11)" - "install-pkg (macOS-12, pytorch, 3.8)" @@ -379,8 +260,6 @@ subprojects: - "install-pkg (macOS-12, lightning, 3.11)" - "install-pkg (macOS-12, notset, 3.8)" - "install-pkg (macOS-12, notset, 3.11)" - - "install-pkg (windows-2022, app, 3.8)" - - "install-pkg (windows-2022, app, 3.11)" - "install-pkg (windows-2022, fabric, 3.8)" - "install-pkg (windows-2022, fabric, 3.11)" - "install-pkg (windows-2022, pytorch, 3.8)" diff --git a/.github/label-change.yml b/.github/label-change.yml index 1f5b809a5fa60..8312c612b1207 100644 --- a/.github/label-change.yml +++ b/.github/label-change.yml @@ -1,28 +1,9 @@ -app: - - changed-files: - - any-glob-to-any-file: - - "src/lightning/app/**" - - "src/lightning_app/*" - - "tests/tests_app/**" - - "tests/integrations_app/**" - - "tests/integrations_app_examples/**" - - "examples/app/**" - - "docs/source-app/**" - - "requirements/app/**" - data: - changed-files: - any-glob-to-any-file: - "src/lightning/data/**" - "requirements/data/**" -store: - - changed-files: - - any-glob-to-any-file: - - "src/lightning/store/**" - - "tests/tests_store/**" - - "requirements/store/**" - pl: - changed-files: - any-glob-to-any-file: @@ -70,7 +51,6 @@ package: - "src/version.info" - "src/lightning/*/__setup__.py" - "src/lightning/*/__version__.py" - - "src/lightning_app/*" - "src/lightning_fabric/*" - "src/pytorch_lightning/*" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 0c2c5a69c4b4c..58f4afe529509 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -8,15 +8,12 @@ Brief description of all our automation tools used for boosting development perf | workflow file | action | accelerator | | -------------------------------------- | ----------------------------------------------------------------------------------------- | ----------- | -| .github/workflows/ci-tests-app.yml | Run all tests (may need internet connectivity). | CPU | | .github/workflows/ci-tests-fabric.yml | Run all tests except for accelerator-specific and standalone. 
| CPU | | .github/workflows/ci-tests-pytorch.yml | Run all tests except for accelerator-specific and standalone. | CPU | | .github/workflows/ci-tests-data.yml | Run unit and integration tests with data pipelining. | CPU | -| .github/workflows/ci-tests-store.yml | Run integration tests on uploading models to cloud. | CPU | | .azure-pipelines/gpu-tests-fabric.yml | Run only GPU-specific tests, standalone\*, and examples. | GPU | | .azure-pipelines/gpu-tests-pytorch.yml | Run only GPU-specific tests, standalone\*, and examples. | GPU | | .azure-pipelines/gpu-benchmarks.yml | Run speed/memory benchmarks for parity with vanila PyTorch. | GPU | -| .github/workflows/ci-examples-app.yml | Run integration tests with App examples. | CPU | | .github/workflows/ci-flagship-apps.yml | Run end-2-end tests with full applications, including deployment to the production cloud. | CPU | | .github/workflows/ci-tests-pytorch.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | | .github/workflows/tpu-tests.yml | Run only TPU-specific tests. Requires that the PR title contains '\[TPU\]' | TPU | diff --git a/.github/workflows/_build-packages.yml b/.github/workflows/_build-packages.yml index cb58031805803..e0262ac63b685 100644 --- a/.github/workflows/_build-packages.yml +++ b/.github/workflows/_build-packages.yml @@ -12,7 +12,7 @@ on: required: false type: string default: | - ["lightning", "app", "fabric", "pytorch"] + ["lightning", "fabric", "pytorch"] defaults: run: diff --git a/.github/workflows/_flagship-apps.yml b/.github/workflows/_flagship-apps.yml deleted file mode 100644 index 3282c58ababb5..0000000000000 --- a/.github/workflows/_flagship-apps.yml +++ /dev/null @@ -1,122 +0,0 @@ -name: Call integration of flagship Apps - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: - workflow_call: - inputs: - environment: - description: "Lightning environment" - required: false - default: "PROD" - type: string - workflow_dispatch: - inputs: - environment: - description: "Lightning environment" - required: true - default: "PROD" - type: choice - options: - - PROD - - STAGING - -defaults: - run: - shell: bash - -jobs: - run-flagships: - if: github.event.pull_request.draft == false - runs-on: ubuntu-latest - container: - image: mcr.microsoft.com/playwright/python:v1.38.0-focal - strategy: - fail-fast: false - matrix: - include: - - { app: "flashy", repo: "Lightning-Universe/Flashy-app" } - - { app: "muse", repo: "Lightning-Universe/stable-diffusion-deploy" } - - { app: "jupyter", repo: "Lightning-Universe/Jupyter-component" } - - # TODO: - # - Training Studio - # - Echo - # - StreamLit / Gradio - # - All homepage & docs apps - - env: - HEADLESS: "1" - PACKAGE_LIGHTNING: "1" - CLOUD: "1" - VIDEO_LOCATION: "./videos" - HAR_LOCATION: "./artifacts/hars" - SLOW_MO: "50" - LIGHTNING_DEBUG: "1" - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" - # Timeout: https://stackoverflow.com/a/59076067/4521646 - timeout-minutes: 20 - - steps: - - uses: actions/checkout@v4 - - - name: basic setup - timeout-minutes: 20 - run: | - mkdir -p tests/_flagships - mkdir -p $VIDEO_LOCATION - pip --version - pip list - # for some reason the python package playwright is missing - pip install -r requirements/app/test.txt - python -m playwright install # --with-deps - - - name: Clone the Repo/App - uses: actions/checkout@v4 - with: - repository: ${{ matrix.repo }} - path: tests/_flagship-app - - - name: Adjust env. 
for this App - uses: ./.github/actions/prep-apps - with: - name: ${{ matrix.app }} - - - name: Install Lightning package - timeout-minutes: 20 - run: pip install -e .[cloud,test] -f $TORCH_URL - - name: List pip dependency - run: pip --version && pip list - - - name: Run the tests - working-directory: tests/ - env: - LIGHTNING_USER_ID: ${{ secrets[format('LIGHTNING_USER_ID_{0}', inputs.environment)] }} - LIGHTNING_API_KEY: ${{ secrets[format('LIGHTNING_API_KEY_{0}', inputs.environment)] }} - LIGHTNING_USERNAME: ${{ secrets[format('LIGHTNING_USERNAME_{0}', inputs.environment)] }} - LIGHTNING_CLOUD_URL: ${{ secrets[format('LIGHTNING_CLOUD_URL_{0}', inputs.environment)] }} - LAI_USER: ${{ secrets.LAI_SSH_USER }} - LAI_PASS: ${{ secrets.LAI_SSH_PASS }} - run: | - ls -l _flagship-app - python -m pytest integrations_app/flagship/test_${{ matrix.app }}.py \ - --capture=no -v --color=yes - - - name: Upload recordings - uses: actions/upload-artifact@v3 - if: failure() - with: - name: flahship-app-${{ matrix.app }} - path: ${{ env.VIDEO_LOCATION }} - - - name: Clean Previous Apps - if: always() - timeout-minutes: 3 - env: - LIGHTNING_USER_ID: ${{ secrets[format('LIGHTNING_USER_ID_{0}', inputs.environment)] }} - LIGHTNING_API_KEY: ${{ secrets[format('LIGHTNING_API_KEY_{0}', inputs.environment)] }} - LIGHTNING_USERNAME: ${{ secrets[format('LIGHTNING_USERNAME_{0}', inputs.environment)] }} - LIGHTNING_CLOUD_URL: ${{ secrets[format('LIGHTNING_CLOUD_URL_{0}', inputs.environment)] }} - LAI_USER: ${{ secrets.LAI_SSH_USER }} - LAI_PASS: ${{ secrets.LAI_SSH_PASS }} - run: | - time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" diff --git a/.github/workflows/ci-examples-app.yml b/.github/workflows/ci-examples-app.yml deleted file mode 100644 index b6db69e67aead..0000000000000 --- a/.github/workflows/ci-examples-app.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Test App - examples - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped - paths: - - ".actions/*" - - ".github/workflows/ci-examples-app.yml" - - "src/lightning/app/**" - - "src/lightning_app/*" - - "tests/integrations_app/**" - - "!tests/integrations_app/flagship/**" - - "examples/app/**" - - "requirements/app/**" - - "setup.py" - - "!requirements/*/docs.txt" - - "!*.md" - - "!**/*.md" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ github.event_name == 'pull_request' }} - -defaults: - run: - shell: bash - -jobs: - app-examples: - if: github.event.pull_request.draft == false - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-20.04, macOS-12, windows-2022] - pkg-name: ["lightning"] - python-version: ["3.9"] - requires: ["oldest", "latest"] - include: - # "app" installs the standalone package - - { os: "macOS-12", pkg-name: "app", python-version: "3.9", requires: "latest" } - - { os: "ubuntu-20.04", pkg-name: "app", python-version: "3.9", requires: "latest" } - - { os: "windows-2022", pkg-name: "app", python-version: "3.9", requires: "latest" } - # Timeout: https://stackoverflow.com/a/59076067/4521646 - timeout-minutes: 15 - env: - PACKAGE_NAME: ${{ matrix.pkg-name }} - FREEZE_REQUIREMENTS: 1 - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" - PYPI_CACHE_DIR: 
"_pip-wheels" - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: basic setup - run: pip install -q -r .actions/requirements.txt - - - name: Set min. dependencies - if: ${{ matrix.requires == 'oldest' }} - run: python .actions/assistant.py replace_oldest_ver - - - name: pip wheels cache - uses: actions/cache/restore@v4 - with: - path: ${{ env.PYPI_CACHE_DIR }} - key: pypi_wheels - - - name: Install Lightning package & dependencies - timeout-minutes: 20 - run: | - extra=$(python -c "print({'lightning': 'app-'}.get('${{ matrix.pkg-name }}', ''))") - # do not use `-e` because it will make both packages available since it adds `src` to `sys.path` automatically - pip install ".[${extra}dev]" -U -f ${TORCH_URL} -f ${PYPI_CACHE_DIR} --prefer-binary - pip list - - name: Dump handy wheels - if: github.event_name == 'push' && github.ref == 'refs/heads/master' - continue-on-error: true - uses: ./.github/actions/pip-wheels - with: - wheel-dir: ${{ env.PYPI_CACHE_DIR }} - torch-url: ${{ env.TORCH_URL }} - cache-key: "pypi_wheels" - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: "16" - - - name: Install Yarn - timeout-minutes: 20 - run: npm install -g yarn - - - name: Adjust imports -> App - if: ${{ matrix.pkg-name != 'lightning' }} - run: | - python .actions/assistant.py copy_replace_imports --source_dir="./tests" \ - --source_import="lightning.app,lightning.fabric,lightning.pytorch" \ - --target_import="lightning_app,lightning_fabric,pytorch_lightning" \ - --lightning_by="lightning_app" - python .actions/assistant.py copy_replace_imports --source_dir="./examples" \ - --source_import="lightning.app,lightning.fabric,lightning.pytorch,lightning" \ - --target_import="lightning_app,lightning_fabric,pytorch_lightning,lightning_app" - - - name: Switch coverage scope - run: python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_app'))" >> $GITHUB_ENV - - - name: Tests - working-directory: ./tests - run: | - python -m coverage run --source ${{ env.COVERAGE_SCOPE }} \ - -m pytest -m "not cloud" integrations_app \ - --timeout=120 --durations=0 -vvvv - - - name: Statistics - if: success() - working-directory: ./tests - run: | - coverage xml -i - coverage report -i - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: tests/coverage.xml - flags: cpu,pytest,app,examples - env_vars: OS,PYTHON - name: codecov-umbrella - fail_ci_if_error: false diff --git a/.github/workflows/ci-flagship-apps.yml b/.github/workflows/ci-flagship-apps.yml deleted file mode 100644 index 6332299fc03d1..0000000000000 --- a/.github/workflows/ci-flagship-apps.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Test App - flagships - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: - push: - branches: ["release/*"] - pull_request: - branches: [master, "release/*"] - types: [opened, reopened, ready_for_review, synchronize] - paths: - - ".github/workflows/_flagship-apps.yml" - - ".github/workflows/ci-flagship-apps.yml" - - "github/actions/prep-apps/action.yml" - - "tests/integrations_app/flagship/**" - schedule: - # on Sundays - - cron: "0 0 * * 0" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ github.event_name == 'pull_request' }} - -jobs: - 
test-flagships: - if: github.event.pull_request.draft == false - uses: ./.github/workflows/_flagship-apps.yml - secrets: inherit diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 67a9b9f21b515..6e82167410ec3 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -43,14 +43,8 @@ jobs: fail-fast: false matrix: os: ["ubuntu-22.04", "macOS-12", "windows-2022"] - pkg-name: ["app", "fabric", "pytorch", "lightning", "notset"] + pkg-name: ["fabric", "pytorch", "lightning", "notset"] python-version: ["3.8", "3.11"] - # TODO: add also install from source - include: - - { os: "macOS-12", pkg-name: "lightning", python-version: "3.9", pkg-extra: "app" } - - { os: "macOS-12", pkg-name: "notset", python-version: "3.9", pkg-extra: "app" } - - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.9", pkg-extra: "app" } - - { os: "ubuntu-22.04", pkg-name: "notset", python-version: "3.9", pkg-extra: "app" } steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -70,16 +64,6 @@ jobs: with: pkg-folder: dist/${{ env.PKG_DIR }} pkg-name: ${{ matrix.pkg-name }} - pkg-extra: ${{ matrix.pkg-extra }} - - - name: Run CLI (via python) - if: ${{ (matrix.pkg-name == 'lightning' || matrix.pkg-name == 'notset') && matrix.pkg-extra == 'app' }} - run: python -m lightning --version - - name: Run CLI (direct bash) - if: | - ((matrix.pkg-name == 'lightning' || matrix.pkg-name == 'notset') && matrix.pkg-extra == 'app') || - matrix.pkg-name == 'app' - run: lightning_app --version - name: DocTests actions working-directory: .actions/ @@ -88,15 +72,15 @@ jobs: python -m pytest assistant.py - name: Adjust code for standalone - if: contains(fromJSON('["app", "fabric", "pytorch"]'), matrix.pkg-name) + if: contains(fromJSON('["fabric", "pytorch"]'), matrix.pkg-name) run: | python .actions/assistant.py copy_replace_imports --source_dir="./src" \ - --source_import="lightning.pytorch,lightning.fabric,lightning.app" \ - --target_import="pytorch_lightning,lightning_fabric,lightning_app" + --source_import="lightning.pytorch,lightning.fabric" \ + --target_import="pytorch_lightning,lightning_fabric" - name: Rename src folders working-directory: src/ run: | - python -c "n = '${{matrix.pkg-name}}' ; n = n if n in ('app', 'fabric', 'pytorch') else '' ; print('PKG_NAME=' + n)" >> $GITHUB_ENV + python -c "n = '${{matrix.pkg-name}}' ; n = n if n in ('fabric', 'pytorch') else '' ; print('PKG_NAME=' + n)" >> $GITHUB_ENV rm -f ./*/__*.py rm -f ./**/__*.py mv lightning lit # rename lightning folder to prevent accidental local imports @@ -104,12 +88,8 @@ jobs: if: ${{ matrix.pkg-name == 'lightning' || matrix.pkg-name == 'notset' }} working-directory: src/lit run: | - items=("data" "store" "app") + items=("data") for item in "${items[@]}"; do - if [[ "$item" == "${{ matrix.pkg-extra }}" ]]; then - echo "Skipping $item" - continue # Skip this iteration - fi echo "Removing $item" rm -rf $item done diff --git a/.github/workflows/ci-tests-app.yml b/.github/workflows/ci-tests-app.yml deleted file mode 100644 index ee643fa397f43..0000000000000 --- a/.github/workflows/ci-tests-app.yml +++ /dev/null @@ -1,175 +0,0 @@ -name: Test App - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped - paths: - - ".actions/*" - - 
".github/workflows/ci-tests-app.yml" - - "requirements/ci.txt" - - "src/lightning/app/**" - - "src/lightning_app/*" - - "tests/tests_app/**" - - "requirements/app/**" - - "setup.py" - - "!requirements/*/docs.txt" - - "!*.md" - - "!**/*.md" - schedule: - # At the end of every day - - cron: "0 0 * * *" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ github.event_name == 'pull_request' }} - -defaults: - run: - shell: bash - -jobs: - app-pytest: - if: github.event.pull_request.draft == false - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: ["ubuntu-20.04", "macOS-12", "windows-2022"] - pkg-name: ["lightning"] - python-version: ["3.8"] - requires: ["oldest", "latest"] - include: - # only run Python latest, use App scope to limit dependency issues - - { os: "macOS-12", pkg-name: "app", python-version: "3.11", requires: "latest" } - - { os: "ubuntu-22.04", pkg-name: "app", python-version: "3.11", requires: "latest" } - - { os: "windows-2022", pkg-name: "app", python-version: "3.11", requires: "latest" } - # "app" installs the standalone package - - { os: "macOS-12", pkg-name: "app", python-version: "3.9", requires: "latest" } - - { os: "ubuntu-20.04", pkg-name: "app", python-version: "3.9", requires: "latest" } - - { os: "windows-2022", pkg-name: "app", python-version: "3.8", requires: "latest" } - # Timeout: https://stackoverflow.com/a/59076067/4521646 - timeout-minutes: 55 - env: - PACKAGE_NAME: ${{ matrix.pkg-name }} - FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - PYPI_CACHE_DIR: "_pip-wheels" - TORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html" - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: basic setup - run: pip install -q -r .actions/requirements.txt - - - name: Set min. dependencies - if: ${{ matrix.requires == 'oldest' }} - run: python .actions/assistant.py replace_oldest_ver - - - name: pip wheels cache - uses: actions/cache/restore@v4 - with: - path: ${{ env.PYPI_CACHE_DIR }} - key: pypi_wheels - - name: List restored pkgs - run: | - mkdir -p $PYPI_CACHE_DIR - ls -lh $PYPI_CACHE_DIR - - - name: Env. 
variables - run: | - # Switch coverage scope - python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'pytorch_lightning'))" >> $GITHUB_ENV - # if you install mono-package set dependency only for this subpackage - python -c "print('EXTRA_PREFIX=' + str('' if '${{matrix.pkg-name}}' != 'lightning' else 'app-'))" >> $GITHUB_ENV - - - name: Install package & dependencies - timeout-minutes: 20 - run: | - pip install -e ".[${EXTRA_PREFIX}dev]" -U --prefer-binary \ - --find-links="${TORCH_URL}" --find-links="${PYPI_CACHE_DIR}" - pip list - - name: Dump handy wheels - if: github.event_name == 'push' && github.ref == 'refs/heads/master' - continue-on-error: true - uses: ./.github/actions/pip-wheels - with: - wheel-dir: ${{ env.PYPI_CACHE_DIR }} - torch-url: ${{ env.TORCH_URL }} - cache-key: "pypi_wheels" - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: "16" - - name: Install Yarn - timeout-minutes: 20 - run: npm install -g yarn - - - name: Adjust imports -> App - if: ${{ matrix.pkg-name != 'lightning' }} - run: | - python .actions/assistant.py copy_replace_imports --source_dir="./tests" \ - --source_import="lightning.app,lightning.fabric,lightning.pytorch" \ - --target_import="lightning_app,lightning_fabric,pytorch_lightning" \ - --lightning_by="lightning_app" - python .actions/assistant.py copy_replace_imports --source_dir="./examples" \ - --source_import="lightning.app,lightning.fabric,lightning.pytorch" \ - --target_import="lightning_app,lightning_fabric,pytorch_lightning" \ - --lightning_by="lightning_app" - - - name: Switch coverage scope - run: python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_app'))" >> $GITHUB_ENV - - - name: Set parallel for Unix - if: ${{ runner.os != 'windows' }} - # on Win, tests takes even loner then with normal single thread - run: echo "PYTEST_XDIST_ARGS=-n auto --dist=loadfile" >> $GITHUB_ENV - - name: Tests - working-directory: ./tests - run: | - set -e - python -m coverage run --source ${{ env.COVERAGE_SCOPE }} \ - -m pytest -m "not cloud" -vvvv tests_app \ - --ignore="tests_app/components/python/test_python.py" \ - --timeout=120 --durations=50 ${PYTEST_XDIST_ARGS} - pytest -m "not cloud" -v \ - tests_app/components/python/test_python.py \ - --timeout=120 --durations=50 - - - name: Statistics - if: success() - working-directory: ./tests - run: | - coverage xml -i - coverage report -i - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: tests/coverage.xml - flags: ${{ env.COVERAGE_SCOPE }},cpu,pytest - env_vars: OS,PYTHON - name: codecov-umbrella - fail_ci_if_error: false -# TODO: figure out why we clone and install quick-start -# - name: Clone Quick Start Example Repo -# uses: actions/checkout@v4 -# # TODO: this needs to be git submodule -# if: matrix.os == 'windows-2022' # because the install doesn't work on windows -# with: -# repository: Lightning-AI/lightning-quick-start -# ref: 'main' -# path: lightning-quick-start -# -# - name: Lightning Install quick-start -# if: matrix.os != 'windows-2022' # because the install doesn't work on windows -# run: | -# python -m lightning install app lightning/quick-start -y diff --git a/.github/workflows/ci-tests-store.yml b/.github/workflows/ci-tests-store.yml deleted file mode 100644 index 60614005c614d..0000000000000 --- a/.github/workflows/ci-tests-store.yml +++ /dev/null @@ -1,96 +0,0 @@ -name: Test Store - -# see: 
https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped - paths: - - ".actions/*" - - "requirements/ci.txt" - - "requirements/store/**" - - "src/lightning/__init__.py" - - "src/lightning/__setup__.py" - - "src/lightning/__version__.py" - - "src/lightning/store/**" - - "tests/tests_store/**" - - "pyproject.toml" # includes pytest config - - ".github/workflows/ci-tests-store.yml" - - "!requirements/*/docs.txt" - - "!*.md" - - "!**/*.md" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ github.event_name == 'pull_request' }} - -defaults: - run: - shell: bash - -jobs: - store-cpu: - runs-on: ${{ matrix.os }} - if: github.event.pull_request.draft == false - strategy: - fail-fast: false - matrix: - os: ["macOS-11", "ubuntu-20.04", "windows-2022"] - pkg-name: ["lightning"] - python-version: ["3.10"] - pytorch-version: ["2.0"] - timeout-minutes: 25 # because of building grpcio on Mac - env: - PACKAGE_NAME: ${{ matrix.pkg-name }} - FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - # PYPI_CACHE_DIR: "_pip-wheels" - TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu/torch_stable.html" - steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Adjust PyTorch versions in requirements files - if: ${{ matrix.requires != 'oldest' && matrix.release != 'pre' }} - run: | - pip install -q -r requirements/ci.txt - python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py - for fpath in `ls requirements/store/*.txt`; do \ - python ./adjust-torch-versions.py $fpath ${{ matrix.pytorch-version }}; \ - done - - - name: Install package & dependencies - timeout-minutes: 20 - run: | - pip install -e ".[store,store-test]" -U -f ${TORCH_URL} --prefer-binary - pip list - - - name: Testing Store - working-directory: tests/tests_store - # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 - run: | - python -m coverage run --source lightning \ - -m pytest -v --timeout=60 --durations=60 - - - name: Statistics - if: success() - working-directory: tests/tests_store - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: tests/tests_store/coverage.xml - flags: lightning,cpu,pytest,python${{ matrix.python-version }} - name: CPU-coverage - fail_ci_if_error: false diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 535274e403a58..55e2a8ec4a01a 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -44,7 +44,6 @@ jobs: FREEZE_REQUIREMENTS: 1 timeout-minutes: 20 run: | - # TODO: investigate hanging installation with app sub-package pip install -e '.[pytorch-all,fabric-all]' -r requirements/typing.txt pip list diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml index 7f7a09ce44007..4a9889eae02e6 100644 --- a/.github/workflows/docs-build.yml +++ 
b/.github/workflows/docs-build.yml @@ -15,8 +15,6 @@ on: - "docs/**" - "_notebooks" - "requirements/**" - - "src/lightning/app/**" - - "src/lightning_app/*" - "src/lightning/fabric/**" - "src/lightning_fabric/*" - "src/lightning/pytorch/**" @@ -58,7 +56,7 @@ jobs: strategy: fail-fast: false matrix: - pkg-name: ["app", "fabric", "pytorch"] + pkg-name: ["fabric", "pytorch"] target: ["html", "doctest", "linkcheck"] env: DOCS_COPY_NOTEBOOKS: 1 @@ -144,7 +142,7 @@ jobs: strategy: fail-fast: false matrix: - pkg-name: ["app", "fabric", "pytorch"] + pkg-name: ["fabric", "pytorch"] env: GCP_TARGET: "gs://lightning-docs-${{ matrix.pkg-name }}" # use input if dispatch or git tag diff --git a/.github/workflows/release-pkg.yml b/.github/workflows/release-pkg.yml index 696efd8b68291..e922a34446d97 100644 --- a/.github/workflows/release-pkg.yml +++ b/.github/workflows/release-pkg.yml @@ -165,7 +165,7 @@ jobs: strategy: fail-fast: false matrix: - name: ["APP", "FABRIC", "PYTORCH", "LIGHTNING"] + name: ["FABRIC", "PYTORCH", "LIGHTNING"] steps: - uses: actions/checkout@v4 # needed for local action below - uses: actions/download-artifact@v3 @@ -190,7 +190,7 @@ jobs: strategy: fail-fast: false matrix: - name: ["APP", "FABRIC", "PYTORCH", "LIGHTNING"] + name: ["FABRIC", "PYTORCH", "LIGHTNING"] steps: - uses: actions/checkout@v4 # needed for local action below - uses: actions/download-artifact@v3 diff --git a/.gitignore b/.gitignore index de1de44fec235..cf5face7db3d4 100644 --- a/.gitignore +++ b/.gitignore @@ -9,10 +9,6 @@ lightning_logs/ # Documentations docs/venv*/ docs/build*/ -docs/source-app/*/api -docs/source-app/generated -docs/source-app/*/generated -docs/source-app/_static/fetched-s3-assets docs/source-fabric/_static/fetched-s3-assets docs/source-pytorch/api docs/source-pytorch/*.md @@ -59,7 +55,6 @@ wheels/ .installed.cfg *.egg src/*/version.info -src/lightning_app/* src/lightning_fabric/* src/pytorch_lightning/* !src/*/__about__.py @@ -182,8 +177,6 @@ cifar-10-batches-py # ctags tags .tags -src/lightning_app/ui/* -src/lightning/app/ui/* *examples/template_react_ui* hars* artifacts/* diff --git a/.lightningignore b/.lightningignore deleted file mode 100644 index 4ce8d526e30e3..0000000000000 --- a/.lightningignore +++ /dev/null @@ -1,16 +0,0 @@ -_notebooks -.azure -.github -.ipynb_checkpoints -.pytest_cache -.shared -.storage -.venv -.vscode -.git -artifacts -Datasets -dist -docs -examples -tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 834a903bcac19..fbf4b2de6a999 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,7 +90,6 @@ repos: - mdformat_frontmatter exclude: | (?x)^( - src/lightning/app/CHANGELOG.md| src/lightning/fabric/CHANGELOG.md| src/lightning/pytorch/CHANGELOG.md| README.md diff --git a/.readthedocs.yml b/.readthedocs.yml index a42fbb7e88214..625c56a5fe61b 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -42,8 +42,7 @@ build: - pip install -U pip awscli py-tree --user - python -m awscli s3 sync --no-sign-request s3://sphinx-packages/ dist/ ; ls -lh dist/ - > - pip install -e ".[app]" -q -r _notebooks/.actions/requires.txt \ - -r requirements/app/docs.txt \ + pip install -e . 
-q -r _notebooks/.actions/requires.txt \ -r requirements/fabric/docs.txt \ -r requirements/pytorch/docs.txt \ -f 'https://download.pytorch.org/whl/cpu/torch_stable.html' -f dist/ ; diff --git a/Makefile b/Makefile index 2f7ca37222a10..426c18042994c 100644 --- a/Makefile +++ b/Makefile @@ -21,13 +21,10 @@ clean: rm -rf ./docs/source-pytorch/generated rm -rf ./docs/source-pytorch/*/generated rm -rf ./docs/source-pytorch/api - rm -rf ./docs/source-app/generated - rm -rf ./docs/source-app/*/generated rm -rf build rm -rf dist rm -rf *.egg-info rm -rf src/*.egg-info - rm -rf src/lightning_app/*/ rm -rf src/lightning_fabric/*/ rm -rf src/pytorch_lightning/*/ @@ -35,14 +32,11 @@ test: clean # Review the CONTRIBUTING documentation for other ways to test. pip install -e . \ -r requirements/pytorch/base.txt \ - -r requirements/app/app.txt \ -r requirements/fabric/base.txt \ -r requirements/pytorch/test.txt \ - -r requirements/app/test.txt # run tests with coverage python -m coverage run --source src/lightning/pytorch -m pytest src/lightning/pytorch tests/tests_pytorch -v - python -m coverage run --source src/lightning/app -m pytest tests/tests/app -v python -m coverage run --source src/lightning/fabric -m pytest src/lightning/fabric tests/tests_fabric -v python -m coverage report @@ -54,10 +48,6 @@ sphinx-theme: aws s3 sync --no-sign-request s3://sphinx-packages/ dist/ pip install lai-sphinx-theme -f dist/ -docs-app: clean sphinx-theme - pip install -e .[all] --quiet -r requirements/app/docs.txt - cd docs/source-app && $(MAKE) html --jobs $(nproc) - docs-fabric: clean sphinx-theme pip install -e .[all] --quiet -r requirements/fabric/docs.txt cd docs/source-fabric && $(MAKE) html --jobs $(nproc) diff --git a/docs/crossroad.html b/docs/crossroad.html index fc072aba60df5..0c25930c6343a 100644 --- a/docs/crossroad.html +++ b/docs/crossroad.html @@ -9,7 +9,6 @@ - diff --git a/docs/rtfd-build.sh b/docs/rtfd-build.sh index 2aa6928f10a19..1b1d5dcab319b 100644 --- a/docs/rtfd-build.sh +++ b/docs/rtfd-build.sh @@ -5,7 +5,7 @@ if ! [ $READTHEDOCS_VERSION == "latest" -o $READTHEDOCS_VERSION == "stable" ]; then export FAST_DOCS_DEV=1 ; root=$(pwd) ; - for pkg in 'app' 'fabric' 'pytorch' ; + for pkg in 'fabric' 'pytorch' ; do cd $root/docs/source-$pkg ; make html --jobs $(nproc) ; diff --git a/docs/source-app/Makefile b/docs/source-app/Makefile deleted file mode 100644 index 268e09561bb72..0000000000000 --- a/docs/source-app/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -T -W -SPHINXBUILD = sphinx-build -SOURCEDIR = . -BUILDDIR = ../build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source-app/_static/copybutton.js b/docs/source-app/_static/copybutton.js deleted file mode 100644 index aef241ab6ac40..0000000000000 --- a/docs/source-app/_static/copybutton.js +++ /dev/null @@ -1,78 +0,0 @@ -/* Copied from the official Python docs: https://docs.python.org/3/_static/copybutton.js */ -$(document).ready(function () { - /* Add a [>>>] button on the top-right corner of code samples to hide - * the >>> and ... 
prompts and the output and thus make the code - * copyable. */ - var div = $( - ".highlight-python .highlight," + - ".highlight-python3 .highlight," + - ".highlight-pycon .highlight," + - ".highlight-default .highlight", - ); - var pre = div.find("pre"); - - // get the styles from the current theme - pre.parent().parent().css("position", "relative"); - var hide_text = "Hide the prompts and output"; - var show_text = "Show the prompts and output"; - var border_width = pre.css("border-top-width"); - var border_style = pre.css("border-top-style"); - var border_color = pre.css("border-top-color"); - var button_styles = { - cursor: "pointer", - position: "absolute", - top: "0", - right: "0", - "border-color": border_color, - "border-style": border_style, - "border-width": border_width, - color: border_color, - "text-size": "75%", - "font-family": "monospace", - "padding-left": "0.2em", - "padding-right": "0.2em", - "border-radius": "0 3px 0 0", - }; - - // create and add the button to all the code blocks that contain >>> - div.each(function (index) { - var jthis = $(this); - if (jthis.find(".gp").length > 0) { - var button = $('>>>'); - button.css(button_styles); - button.attr("title", hide_text); - button.data("hidden", "false"); - jthis.prepend(button); - } - // tracebacks (.gt) contain bare text elements that need to be - // wrapped in a span to work with .nextUntil() (see later) - jthis - .find("pre:has(.gt)") - .contents() - .filter(function () { - return this.nodeType == 3 && this.data.trim().length > 0; - }) - .wrap(""); - }); - - // define the behavior of the button when it's clicked - $(".copybutton").click(function (e) { - e.preventDefault(); - var button = $(this); - if (button.data("hidden") === "false") { - // hide the code output - button.parent().find(".go, .gp, .gt").hide(); - button.next("pre").find(".gt").nextUntil(".gp, .go").css("visibility", "hidden"); - button.css("text-decoration", "line-through"); - button.attr("title", show_text); - button.data("hidden", "true"); - } else { - // show the code output - button.parent().find(".go, .gp, .gt").show(); - button.next("pre").find(".gt").nextUntil(".gp, .go").css("visibility", "visible"); - button.css("text-decoration", "none"); - button.attr("title", hide_text); - button.data("hidden", "false"); - } - }); -});
diff --git a/docs/source-app/_static/images/icon.svg b/docs/source-app/_static/images/icon.svg deleted file mode 100644 index e88fc19036178..0000000000000 --- a/docs/source-app/_static/images/icon.svg +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - -
diff --git a/docs/source-app/_static/images/logo-large.svg b/docs/source-app/_static/images/logo-large.svg deleted file mode 100644 index 39531f95e9dba..0000000000000 --- a/docs/source-app/_static/images/logo-large.svg +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - -
diff --git a/docs/source-app/_static/images/logo-small.svg b/docs/source-app/_static/images/logo-small.svg deleted file mode 100644 index 1f523a57c4a16..0000000000000 --- a/docs/source-app/_static/images/logo-small.svg +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - -
diff --git a/docs/source-app/_static/images/logo.png b/docs/source-app/_static/images/logo.png deleted file mode 100644 index 308a6ee419d4454a46323e65b398752e2b6ffb40..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16014 [base85-encoded image data omitted: 16014-byte PNG logo deleted by this patch]
diff --git a/docs/source-app/_static/images/logo.svg b/docs/source-app/_static/images/logo.svg deleted file mode 100644 index b73eaa8cedb50..0000000000000 --- a/docs/source-app/_static/images/logo.svg +++ /dev/null @@ -1,22 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - -
diff --git a/docs/source-app/_static/main.css b/docs/source-app/_static/main.css deleted file mode 100644 index c1bd8ad0305b7..0000000000000 --- a/docs/source-app/_static/main.css +++ /dev/null @@ -1,3 +0,0 @@ -col { - width: 50% !important; -}
diff --git a/docs/source-app/_templates/classtemplate.rst b/docs/source-app/_templates/classtemplate.rst deleted file mode 100644 index 5b7f465516787..0000000000000 --- a/docs/source-app/_templates/classtemplate.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. role:: hidden - :class: hidden-section -.. currentmodule:: {{ module }} - - -{{ name | underline }} - -..
autoclass:: {{ name }} - :members: diff --git a/docs/source-app/_templates/classtemplate_no_index.rst b/docs/source-app/_templates/classtemplate_no_index.rst deleted file mode 100644 index 858c37b51567a..0000000000000 --- a/docs/source-app/_templates/classtemplate_no_index.rst +++ /dev/null @@ -1,12 +0,0 @@ -:orphan: - -.. role:: hidden - :class: hidden-section -.. currentmodule:: {{ module }} - - -{{ name | underline }} - -.. autoclass:: {{ name }} - :members: - :noindex: diff --git a/docs/source-app/_templates/layout.html b/docs/source-app/_templates/layout.html deleted file mode 100644 index 90e6125f9ad28..0000000000000 --- a/docs/source-app/_templates/layout.html +++ /dev/null @@ -1,16 +0,0 @@ -{% extends "!layout.html" %} - - -{% block footer %} {{ super() }} - - -{% endblock %} diff --git a/docs/source-app/_templates/theme_variables.jinja b/docs/source-app/_templates/theme_variables.jinja deleted file mode 100644 index 914f8dcafc96b..0000000000000 --- a/docs/source-app/_templates/theme_variables.jinja +++ /dev/null @@ -1,18 +0,0 @@ -{%- set external_urls = { - 'github': 'https://github.com/Lightning-AI/lightning', - 'github_issues': 'https://github.com/Lightning-AI/lightning/issues', - 'contributing': 'https://github.com/Lightning-AI/lightning/blob/master/.github/CONTRIBUTING.md', - 'governance': 'https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/governance.rst', - 'docs': 'https://lightning.rtfd.io/en/latest', - 'twitter': 'https://twitter.com/PyTorchLightnin', - 'discuss': 'https://discord.gg/VptPCZkGNa', - 'tutorials': 'https://pt-lightning.readthedocs.io/en/latest/#tutorials', - 'previous_pytorch_versions': 'https://pt-lightning.rtfd.io/en/latest/', - 'home': 'https://lightning.ai/', - 'get_started': 'https://pt-lightning.readthedocs.io/en/latest/introduction_guide.html', - 'features': 'https://pt-lightning.rtfd.io/en/latest/', - 'blog': 'https://www.pytorchlightning.ai/blog', - 'resources': 'https://pt-lightning.readthedocs.io/en/latest/#community-examples', - 'support': 'https://pt-lightning.rtfd.io/en/latest/', -} --%} diff --git a/docs/source-app/api_reference/components.rst b/docs/source-app/api_reference/components.rst deleted file mode 100644 index 69d53b79e76ce..0000000000000 --- a/docs/source-app/api_reference/components.rst +++ /dev/null @@ -1,35 +0,0 @@ -######################## -lightning.app.components -######################## - -.. contents:: - :depth: 1 - :local: - :backlinks: top - -.. currentmodule:: lightning.app.components - - -Built-in Components -___________________ - -.. autosummary:: - :toctree: generated/ - :nosignatures: - :template: classtemplate.rst - - ~database.client.DatabaseClient - ~database.server.Database - ~python.popen.PopenPythonScript - ~python.tracer.TracerPythonScript - ~training.LightningTrainerScript - ~serve.gradio_server.ServeGradio - ~serve.serve.ModelInferenceAPI - ~serve.python_server.PythonServer - ~serve.streamlit.ServeStreamlit - ~multi_node.base.MultiNode - ~multi_node.fabric.FabricMultiNode - ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode - ~multi_node.trainer.LightningTrainerMultiNode - ~serve.auto_scaler.AutoScaler - ~serve.auto_scaler.ColdStartProxy diff --git a/docs/source-app/api_reference/core.rst b/docs/source-app/api_reference/core.rst deleted file mode 100644 index 324f3c448978f..0000000000000 --- a/docs/source-app/api_reference/core.rst +++ /dev/null @@ -1,26 +0,0 @@ -:orphan: - -################## -lightning.app.core -################## - -.. 
contents:: - :depth: 1 - :local: - :backlinks: top - -.. currentmodule:: lightning.app.core - -Core APIs -___________________ - -.. autosummary:: - :toctree: api/ - :nosignatures: - :template: classtemplate.rst - - LightningApp - LightningFlow - LightningWork - -Learn more about :ref:`Lightning Core `. diff --git a/docs/source-app/api_reference/frontend.rst b/docs/source-app/api_reference/frontend.rst deleted file mode 100644 index 514b2cf35bc75..0000000000000 --- a/docs/source-app/api_reference/frontend.rst +++ /dev/null @@ -1,25 +0,0 @@ -###################### -lightning.app.frontend -###################### - -.. contents:: - :depth: 1 - :local: - :backlinks: top - -.. currentmodule:: lightning.app.frontend - -Lightning FrontEnds -___________________ - -.. autosummary:: - :toctree: generated/ - :nosignatures: - :template: classtemplate.rst - - ~frontend.Frontend - ~web.StaticWebFrontend - ~stream_lit.StreamlitFrontend - ~panel.PanelFrontend - -Learn more about :ref:`Frontend's `. diff --git a/docs/source-app/api_reference/runners.rst b/docs/source-app/api_reference/runners.rst deleted file mode 100644 index f7e550b7c7733..0000000000000 --- a/docs/source-app/api_reference/runners.rst +++ /dev/null @@ -1,21 +0,0 @@ -##################### -lightning.app.runners -##################### - -.. contents:: - :depth: 1 - :local: - :backlinks: top - -.. currentmodule:: lightning.app.runners - -Lightning Core -______________ - -.. autosummary:: - :toctree: generated/ - :nosignatures: - :template: classtemplate.rst - - ~cloud.CloudRuntime - ~multiprocess.MultiProcessRuntime diff --git a/docs/source-app/api_reference/storage.rst b/docs/source-app/api_reference/storage.rst deleted file mode 100644 index 3173914427586..0000000000000 --- a/docs/source-app/api_reference/storage.rst +++ /dev/null @@ -1,71 +0,0 @@ -##################### -lightning.app.storage -##################### - -Lightning Core -______________ - -.. contents:: - :depth: 1 - :local: - :backlinks: top - -.. currentmodule:: lightning.app.storage - -.. autosummary:: - :toctree: generated/ - :nosignatures: - :template: classtemplate.rst - - ~path.Path - ~drive.Drive - ~payload.Payload - ~mount.Mount - ----- - -************************ -Learn more about Storage -************************ - -.. raw:: html - -
-
- -.. displayitem:: - :header: Learn about the differences between Drive vs Path. - :description: Learn about their differences. - :col_css: col-md-4 - :button_link: ../glossary/storage/differences.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: The Drive Object. - :description: Put, List and Get Files From a Shared Drive Disk. - :col_css: col-md-4 - :button_link: ../glossary/storage/drive.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: The Path Object. - :description: Transfer Files From One Component to Another by Reference. - :col_css: col-md-4 - :button_link: ../glossary/storage/path.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: The Mount Object. - :description: Mount an AWS S3 Bucket When Running on the Cloud. - :col_css: col-md-4 - :button_link: ../workflows/mount_aws_s3_bucket.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/basics.rst b/docs/source-app/basics.rst deleted file mode 100644 index 69b1bf751014b..0000000000000 --- a/docs/source-app/basics.rst +++ /dev/null @@ -1,259 +0,0 @@ -:orphan: - -.. _basics: - -###### -Basics -###### - -In this guide, we'll cover the basic terminology associated with the Lightning framework. - ----- - -************** -Lightning App -************** - -The :class:`~lightning.app.core.app.LightningApp` runs a tree of one or more components that interact to create end-to-end applications. There are two kinds of components: :class:`~lightning.app.core.flow.LightningFlow` and :class:`~lightning.app.core.work.LightningWork`. This modular design enables you to reuse components created by other users. - ----- - -Lightning Work -^^^^^^^^^^^^^^ - -The :class:`~lightning.app.core.work.LightningWork` component is a building block optimized for long-running jobs or integrating third-party services. LightningWork can be used for training large models, downloading a dataset, or any long-lasting operation. - ----- - -Lightning Flow -^^^^^^^^^^^^^^ - -The :class:`~lightning.app.core.flow.LightningFlow` component coordinates long-running tasks :class:`~lightning.app.core.work.LightningWork` and runs its children :class:`~lightning.app.core.flow.LightningFlow` components. - ----- - -Lightning App Tree -^^^^^^^^^^^^^^^^^^ - -Components can be nested to form component trees where the LightningFlows are its branches and LightningWorks are its leaves. - -Here's a basic application with four flows and two works: - -.. literalinclude:: code_samples/quickstart/app_comp.py - -And here's its associated tree structure: - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/tree.gif - :alt: Basic App Components - :width: 100 % - -A Lightning App runs all flows into a single process. Its flows coordinate the execution of the works each running in their own independent processes. - ----- - -Lightning Distributed Event Loop -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Drawing inspiration from modern web frameworks like `React.js `_, the Lightning app runs all flows in an **event loop** (forever), which is triggered every 0.1 seconds after collecting any works' state change. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/lightning_loop.gif - -When running an app in the cloud, the :class:`~lightning.app.core.work.LightningWork` run on different machines. Lightning communicates any :class:`~lightning.app.core.work.LightningWork` state changes to the **event loop** which re-executes the flow with the newly-collected works' state. - ----- - -Lightning App State -^^^^^^^^^^^^^^^^^^^ - -By design, each component is stateful and its state is composed of all its attributes. The **Lightning App State** is the collection of all its components state. - -With this mechanism, any component can **react** to any other component **state changes**, simply by relying on its attributes within the flow. - -For example, here we define two flow components, **RootFlow** and **ChildFlow**, where the child flow prints and increments a counter indefinitely and gets reflected in **RootFlow** state. - -You can easily check the state of your entire app: - -.. literalinclude:: code_samples/quickstart/app_01.py - -Here's the entire tree structure associated with your app: - -.. 
figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/parent_child.png
-   :alt: Parent Child Components
-   :width: 100 %
-
-And here's the output you get when running the above application using the **Lightning CLI**:
-
-.. code-block:: console
-
-    $ lightning_app run app docs/source-app/code_samples/quickstart/app_01.py
-    INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-    State: {'works': {'w_1': {'vars': {'counter': 1}}, 'w_2': {'vars': {'counter': 0}}}}
-
-    State: {'works': {'w_1': {'vars': {'counter': 3}}, 'w_2': {'vars': {'counter': 1}}}}
-
-    State: {'works': {'w_1': {'vars': {'counter': 4}}, 'w_2': {'vars': {'counter': 1}}}}
-
-    State: {'works': {'w_1': {'vars': {'counter': 5}}, 'w_2': {'vars': {'counter': 2}}}}
-
-    State: {'works': {'w_1': {'vars': {'counter': 6}}, 'w_2': {'vars': {'counter': 2}}}}
-
-    State: {'works': {'w_1': {'vars': {'counter': 7}}, 'w_2': {'vars': {'counter': 3}}}}
-    ...
-
-This app counts forever because the **Lightning event loop** calls the root flow's run method indefinitely.
-
-----
-
-*******************************
-Controlling the Execution Flow
-*******************************
-
-
-LightningWork: To Cache or Not to Cache Calls
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-With Lightning, you can control how your components run.
-
-By default, the :class:`~lightning.app.core.flow.LightningFlow` is executed infinitely by the **Lightning Infinite Loop**, and the :class:`~lightning.app.core.work.LightningWork` does not run in **parallel**,
-meaning the **Lightning Infinite Loop** (a.k.a. the flow) waits until that long-running work is completed before continuing.
-
-Similar to `React.js Components and Props `_, the :class:`~lightning.app.core.work.LightningWork`
-component accepts arbitrary inputs (the "props") to its **run** method and by default runs **once** for each unique input provided.
-
-Here's an example of this behavior:
-
-.. literalinclude:: code_samples/basics/0.py
-   :language: python
-   :emphasize-lines: 10, 19
-
-And you should see the following by running the code above:
-
-.. code-block:: console
-
-    $ python example.py
-    INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-    # After you have clicked `run` on the UI.
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 10}
-
-As you can see, the intermediate runs didn't execute, because the result for ``value=1`` was already cached.
-
-To disable this behavior, set ``cache_calls=False`` so the LightningWork runs again on every call, even for inputs it has already seen.
-
-.. literalinclude:: code_samples/basics/1.py
-   :diff: code_samples/basics/0.py
-
-.. code-block:: console
-
-    $ python example.py
-    INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-    # After you have clicked `run` on the UI.
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 10}
-
-
-.. note:: Passing a sequence of different props to the work's run method queues their execution. We recommend avoiding this behavior as it can be hard to debug. Instead, wait for the previous run to finish before calling run again.
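-
-For instance, here is a minimal sketch of the queuing behavior described in the note above (the values are hypothetical; each distinct input is queued and executed in order by the work's single process):
-
-.. code-block:: python
-
-    work = ExampleWork()
-    work.run(value=1)  # starts executing immediately
-    work.run(value=2)  # queued until the run with value=1 completes
-    work.run(value=3)  # queued behind value=2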
- ----- - -LightningWork: Parallel vs Non Parallel -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The LightningWork component is made for long-running jobs. - -As an example, let's create a long-running **LightningWork** component that will take 1 hour to do its "work". - -.. literalinclude:: code_samples/quickstart/app_02.py - :language: python - :emphasize-lines: 15 - -Here's the output you get when running the above application using **Lightning CLI**: - -.. code-block:: console - - $ lightning_app run app docs/source/code_samples/quickstart/app_02.py - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - # After you have clicked `run` on the UI. - 0.0 0.0 - ... - 0.0003 0.0003 - ... - 1.0 1.0 - ... - 1 hour later! - 1.0 1.0 - 1 hour later! - 1.0 1.0 - 1 hour later! - ... - -The child work runs only once, hence why the progress counter stops increasing once the work is completed. - -This is useful for monitoring the progress of a long-running operation, like training a big model. - -.. note :: - The Lightning Infinite Loop runs multiple cycles per second. - It is good practice to keep the loop running fast, so that your application stays responsive, - especially when it contains user-interface components. - ----- - -**************** -Multiple works -**************** - -In practical use cases, you might want to execute multiple long-running works in parallel. - -To enable this behavior, set ``parallel=True`` in the ``__init__`` method of -your :class:`~lightning.app.core.work.LightningWork`. - -Here's an example of the interaction between parallel and non-parallel behaviors: - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/blocking_non_blocking.gif - :alt: mnist GPU bar - :width: 100 % - -Below, we reuse the **HourLongWork** work defined in the previous example, but modify the **RootFlow** -to run two **HourLongWork** works in a parallel way. - -.. literalinclude:: code_samples/quickstart/app/app_0.py - :emphasize-lines: 21 - -Above, both ``child_work_1`` and ``child_work_2`` are long-running works that are executed -asynchronously in parallel. - -When running the above app, we see the following logs: - -.. code-block:: console - - $ lightning_app run app docs/source/code_samples/quickstart/app/app_0.py - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - # After you have clicked `run` on the UI. - 0.0, 0.0 - ... - 0.0003, 0.0003 - ... - 1.0, 1.0 - ... - 1 hour later `child_work_1` started! - 1 hour later `child_work_2` started! - 0.0, 0.0 - ... - 0.0003, 0.0003 - ... - 1.0, 1.0 - 1 hour later `child_work_1` started! - 1 hour later `child_work_2` started! - ... - ----- - -*********** -Next Steps -*********** - -To keep learning about Lightning, build a :ref:`ui_and_frontends`. diff --git a/docs/source-app/code_samples/basics/0.py b/docs/source-app/code_samples/basics/0.py deleted file mode 100644 index 7f4658e3977af..0000000000000 --- a/docs/source-app/code_samples/basics/0.py +++ /dev/null @@ -1,19 +0,0 @@ -from lightning.app import LightningWork - - -class ExampleWork(LightningWork): - def run(self, *args, **kwargs): - print(f"I received the following props: args: {args} kwargs: {kwargs}") - - -work = ExampleWork() -work.run(value=1) - -# Providing the same value. This won't run as already cached. -work.run(value=1) -work.run(value=1) -work.run(value=1) -work.run(value=1) - -# Changing the provided value. This isn't cached and will run again. 
-work.run(value=10) diff --git a/docs/source-app/code_samples/basics/1.py b/docs/source-app/code_samples/basics/1.py deleted file mode 100644 index 1a696b8e4c45c..0000000000000 --- a/docs/source-app/code_samples/basics/1.py +++ /dev/null @@ -1,22 +0,0 @@ -from lightning.app import LightningWork - - -class ExampleWork(LightningWork): - def __init__(self): - super().__init__(cache_calls=False) - - def run(self, *args, **kwargs): - print(f"I received the following props: args: {args} kwargs: {kwargs}") - - -work = ExampleWork() -work.run(value=1) - -# Providing the same value. This won't run as already cached. -work.run(value=1) -work.run(value=1) -work.run(value=1) -work.run(value=1) - -# Changing the provided value. This isn't cached and will run again. -work.run(value=10) diff --git a/docs/source-app/code_samples/convert_pl_to_app/app.py b/docs/source-app/code_samples/convert_pl_to_app/app.py deleted file mode 100644 index a590cbaab8ea0..0000000000000 --- a/docs/source-app/code_samples/convert_pl_to_app/app.py +++ /dev/null @@ -1,17 +0,0 @@ -from lightning.app import LightningFlow, LightningApp, CloudCompute -from lightning.app.components import TracerPythonScript - - -class RootFlow(LightningFlow): - def __init__(self): - super().__init__() - self.runner = TracerPythonScript( - "train.py", - cloud_compute=CloudCompute("gpu"), - ) - - def run(self): - self.runner.run() - - -app = LightningApp(RootFlow()) diff --git a/docs/source-app/code_samples/convert_pl_to_app/requirements.txt b/docs/source-app/code_samples/convert_pl_to_app/requirements.txt deleted file mode 100644 index e8fb43ef7dc83..0000000000000 --- a/docs/source-app/code_samples/convert_pl_to_app/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -torchvision -pytorch_lightning diff --git a/docs/source-app/code_samples/convert_pl_to_app/train.py b/docs/source-app/code_samples/convert_pl_to_app/train.py deleted file mode 100644 index d0b7919b75843..0000000000000 --- a/docs/source-app/code_samples/convert_pl_to_app/train.py +++ /dev/null @@ -1,46 +0,0 @@ -import os - -import torch -import torch.nn.functional as F -from torch import nn -from torch.utils.data import DataLoader, random_split -from torchvision import transforms as T -from torchvision.datasets import MNIST - -import lightning.pytorch as pl - - -class LitAutoEncoder(pl.LightningModule): - def __init__(self): - super().__init__() - self.encoder = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 3)) - self.decoder = nn.Sequential(nn.Linear(3, 128), nn.ReLU(), nn.Linear(128, 28 * 28)) - - def forward(self, x): - # in lightning, - # forward defines the prediction/inference actions - embedding = self.encoder(x) - return embedding - - def training_step(self, batch, batch_idx): - # training_step defines the train loop. 
- # It is independent of forward - x, _ = batch - x = x.view(x.size(0), -1) - z = self.encoder(x) - x_hat = self.decoder(z) - loss = F.mse_loss(x_hat, x) - self.log("train_loss", loss) - return loss - - def configure_optimizers(self): - optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) - return optimizer - - -dataset = MNIST(os.getcwd(), download=True, transform=T.ToTensor()) -train, val = random_split(dataset, [55000, 5000]) - -autoencoder = LitAutoEncoder() -trainer = pl.Trainer(accelerator="auto") -trainer.fit(autoencoder, DataLoader(train), DataLoader(val)) diff --git a/docs/source-app/code_samples/quickstart/__init__.py b/docs/source-app/code_samples/quickstart/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/docs/source-app/code_samples/quickstart/app/__init__.py b/docs/source-app/code_samples/quickstart/app/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/docs/source-app/code_samples/quickstart/app/app_0.py b/docs/source-app/code_samples/quickstart/app/app_0.py deleted file mode 100644 index 370f6818a778f..0000000000000 --- a/docs/source-app/code_samples/quickstart/app/app_0.py +++ /dev/null @@ -1,21 +0,0 @@ -from lightning.app import LightningWork, LightningFlow, LightningApp -from docs.quickstart.app_02 import HourLongWork - - -class RootFlow(LightningFlow): - def __init__(self, child_work_1: LightningWork, child_work_2: LightningWork): - super().__init__() - self.child_work_1 = child_work_1 - self.child_work_2 = child_work_2 - - def run(self): - print(round(self.child_work_1.progress, 4), round(self.child_work_2.progress, 4)) - self.child_work_1.run() - self.child_work_2.run() - if self.child_work_1.progress == 1.0: - print("1 hour later `child_work_1` started!") - if self.child_work_2.progress == 1.0: - print("1 hour later `child_work_2` started!") - - -app = LightningApp(RootFlow(HourLongWork(parallel=True), HourLongWork(parallel=True))) diff --git a/docs/source-app/code_samples/quickstart/app/app_1.py b/docs/source-app/code_samples/quickstart/app/app_1.py deleted file mode 100644 index e6b876e452f2b..0000000000000 --- a/docs/source-app/code_samples/quickstart/app/app_1.py +++ /dev/null @@ -1,92 +0,0 @@ -import flash -from flash.core.data.utils import download_data -from flash.image import ImageClassificationData, ImageClassifier - -from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute -from lightning.pytorch.callbacks import ModelCheckpoint - - -# Step 1: Create a training LightningWork component that gets a backbone as input -# and saves the best model and its score -class ImageClassifierTrainWork(LightningWork): - def __init__(self, max_epochs: int, backbone: str, cloud_compute: CloudCompute): - # parallel is set to True to run asynchronously - super().__init__(parallel=True, cloud_compute=cloud_compute) - # Number of epochs to run - self.max_epochs = max_epochs - # The model backbone to train on - self.backbone = backbone - self.best_model_path = None - self.best_model_score = None - - def run(self, train_folder): - # Create a datamodule from the given dataset - datamodule = ImageClassificationData.from_folders( - train_folder=train_folder, - batch_size=1, - val_split=0.5, - ) - # Create an image classfier task with the given backbone - model = ImageClassifier(datamodule.num_classes, backbone=self.backbone) - # Start a Lightning trainer, with 1 training batch and 4 validation batches - trainer = flash.Trainer( - max_epochs=self.max_epochs, - limit_train_batches=1, - 
limit_val_batches=4, - callbacks=[ModelCheckpoint(monitor="val_cross_entropy")], - ) - # Train the model - trainer.fit(model, datamodule=datamodule) - # Save the model path - self.best_model_path = trainer.checkpoint_callback.best_model_path - # Save the model score - self.best_model_score = trainer.checkpoint_callback.best_model_score.item() - - -# Step 2: Create a serving LightningWork component that gets a model input and serves it -class ImageClassifierServeWork(LightningWork): - def run(self, best_model_path: str): - # Load the model from the model path - model = ImageClassifier.load_from_checkpoint(best_model_path) - model.serve(output="labels") - - -# Step 3: Create a root LightningFlow component that gets number of epochs and a path to -# a dataset as inputs, initialize 2 training components and serves the best model -class RootFlow(LightningFlow): - def __init__(self, max_epochs: int, data_dir: str): - super().__init__() - self.data_dir = data_dir - # Init an image classifier with resnet18 backbone - self.train_work_1 = ImageClassifierTrainWork( - max_epochs, - "resnet18", - ) - # Init an image classifier with resnet26 backbone - self.train_work_2 = ImageClassifierTrainWork( - max_epochs, - "resnet26", - ) - # Init the serving component - self.server_work = ImageClassifierServeWork() - - def run(self): - # running both `train_work_1` and `train_work_2` in parallel and asynchronously. - self.train_work_1.run(self.data_dir) - self.train_work_2.run(self.data_dir) - - # run serve_work only when both `best_model_score` are available. - if self.train_work_1.best_model_score and self.train_work_2.best_model_score: - # serve only the best model between `train_work_1` and `train_work_2`. - self.server_work.run( - self.train_work_1.best_model_path - if self.train_work_1.best_model_score < self.train_work_2.best_model_score - else self.train_work_2.best_model_path - ) - - -# Step 4: download a dataset to your local directory under `/data` -download_data("https://pl-flash-data.s3.amazonaws.com/hymenoptera_data.zip", "./data") - -# Initialize your Lightning app with 5 epochs -app = LightningApp(RootFlow(5, "./data/hymenoptera_data")) diff --git a/docs/source-app/code_samples/quickstart/app_01.py b/docs/source-app/code_samples/quickstart/app_01.py deleted file mode 100644 index 42f716f99ffb2..0000000000000 --- a/docs/source-app/code_samples/quickstart/app_01.py +++ /dev/null @@ -1,27 +0,0 @@ -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app.utilities.app_helpers import pretty_state - - -class Work(LightningWork): - def __init__(self): - super().__init__(cache_calls=False) - # Attributes are registered automatically in the state. - self.counter = 0 - - def run(self): - # Incrementing an attribute gets reflected in the `Flow` state. 
-        self.counter += 1
-
-
-class Flow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.w = Work()
-
-    def run(self):
-        if self.w.has_started:
-            print(f"State: {pretty_state(self.state)} \n")
-        self.w.run()
-
-
-app = LightningApp(Flow())
diff --git a/docs/source-app/code_samples/quickstart/app_02.py b/docs/source-app/code_samples/quickstart/app_02.py
deleted file mode 100644
index c5c6445a0e32e..0000000000000
--- a/docs/source-app/code_samples/quickstart/app_02.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from time import sleep
-
-from lightning.app import LightningWork, LightningFlow, LightningApp
-
-
-# This work takes an hour to run
-class HourLongWork(LightningWork):
-    def __init__(self, parallel: bool = False):
-        super().__init__(parallel=parallel)
-        self.progress = 0.0
-
-    def run(self):
-        self.progress = 0.0
-        for _ in range(3600):
-            self.progress += 1.0 / 3600  # Reporting my progress to the Flow.
-            sleep(1)
-
-
-class RootFlow(LightningFlow):
-    def __init__(self, child_work: LightningWork):
-        super().__init__()
-        self.child_work = child_work
-
-    def run(self):
-        # prints the progress from the child work
-        print(round(self.child_work.progress, 4))
-        self.child_work.run()
-        if self.child_work.progress == 1.0:
-            print("1 hour later!")
-
-
-app = LightningApp(RootFlow(HourLongWork()))
diff --git a/docs/source-app/code_samples/quickstart/app_03.py b/docs/source-app/code_samples/quickstart/app_03.py
deleted file mode 100644
index 6fdbc3ac67083..0000000000000
--- a/docs/source-app/code_samples/quickstart/app_03.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from time import sleep
-
-from lightning.app import LightningWork, LightningFlow, LightningApp
-
-
-class HourLongWork(LightningWork):
-    def __init__(self):
-        super().__init__(cache_calls=False)
-        self.progress = 0.0
-
-    def run(self):
-        self.progress = 0.0
-        for _ in range(3600):
-            self.progress += 1.0 / 3600
-            sleep(1)
-
-
-class RootFlow(LightningFlow):
-    def __init__(self, child_work: LightningWork):
-        super().__init__()
-        self.child_work = child_work
-
-    def run(self):
-        # prints the progress from the child work
-        print(round(self.child_work.progress, 4))
-        self.child_work.run()
-        if self.child_work.progress == 1.0:
-            print("1 hour later!")
-
-
-app = LightningApp(RootFlow(HourLongWork()))
diff --git a/docs/source-app/code_samples/quickstart/app_comp.py b/docs/source-app/code_samples/quickstart/app_comp.py
deleted file mode 100644
index 9aee70009f478..0000000000000
--- a/docs/source-app/code_samples/quickstart/app_comp.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from lightning.app import LightningFlow, LightningApp
-from lightning.app.testing import EmptyFlow, EmptyWork
-
-
-class FlowB(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.flow_d = EmptyFlow()
-        self.work_b = EmptyWork()
-
-    def run(self):
-        ...
-
-
-class FlowA(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.flow_b = FlowB()
-        self.flow_c = EmptyFlow()
-        self.work_a = EmptyWork()
-
-    def run(self):
-        ...
-
-
-app = LightningApp(FlowA())
diff --git a/docs/source-app/code_samples/quickstart/hello_world/app.py b/docs/source-app/code_samples/quickstart/hello_world/app.py
deleted file mode 100644
index 18dd2d78a0c6f..0000000000000
--- a/docs/source-app/code_samples/quickstart/hello_world/app.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from lightning.app import LightningFlow, LightningApp
-
-
-# Step 1: Subclass LightningFlow component to define the app flow.
-class HelloWorld(LightningFlow):
-    # Step 2: Add the app logic to the LightningFlow run method to
-    # ``print("Hello World!")``.
-    # The LightningApp executes the run method of the main LightningFlow
-    # within an infinite loop.
-    def run(self):
-        print("Hello World!")
-
-
-# Step 3: Initialize a LightningApp with the LightningFlow you defined (in step 1)
-app = LightningApp(HelloWorld())
diff --git a/docs/source-app/code_samples/quickstart/hello_world/app_ui.py b/docs/source-app/code_samples/quickstart/hello_world/app_ui.py
deleted file mode 100644
index ad0e5065f5bb5..0000000000000
--- a/docs/source-app/code_samples/quickstart/hello_world/app_ui.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import os
-
-from lightning.app import LightningFlow, LightningApp
-from lightning.app.frontend import StaticWebFrontend, StreamlitFrontend
-from lightning.app.utilities.state import AppState
-
-
-# Step 1: Define your LightningFlow component with the app UI
-class UIStreamLit(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.should_print = False
-
-    # Step 2: Override `configure_layout` to define the layout of the UI
-    # In this example, we are using `StreamlitFrontend`
-    def configure_layout(self):
-        return StreamlitFrontend(render_fn=render_fn)
-
-
-# Step 3: Implement the StreamLit render method
-def render_fn(state: AppState):
-    import streamlit as st
-    from streamlit_autorefresh import st_autorefresh
-
-    st_autorefresh(interval=2000, limit=None, key="refresh")
-
-    state.should_print = st.select_slider(
-        "Should the Application print 'Hello World !' to the terminal:",
-        [False, True],
-    )
-
-
-# Step 4: Implement a Static Web Frontend. This could be React, Vue, etc.
-class UIStatic(LightningFlow):
-    # Step 5: Serve the static files in the local ``ui`` directory.
-    def configure_layout(self):
-        return StaticWebFrontend(os.path.join(os.path.dirname(__file__), "ui"))
-
-
-# Step 6: Implement the root flow.
-class HelloWorld(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.static_ui = UIStatic()
-        self.streamlit_ui = UIStreamLit()
-
-    def run(self):
-        print("Hello World!" if self.streamlit_ui.should_print else "")
-
-    def configure_layout(self):
-        return [
-            {"name": "StreamLit", "content": self.streamlit_ui},
-            {"name": "Static", "content": self.static_ui},
-        ]
-
-
-app = LightningApp(HelloWorld())
diff --git a/docs/source-app/code_samples/quickstart/hello_world/ui/index.html b/docs/source-app/code_samples/quickstart/hello_world/ui/index.html
deleted file mode 100644
index fe38c432f504c..0000000000000
--- a/docs/source-app/code_samples/quickstart/hello_world/ui/index.html
+++ /dev/null
@@ -1 +0,0 @@
-
Hello from component UIStatic
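-
-A quick way to try the two-tab example above locally is the same CLI pattern used earlier in these docs (a sketch; it assumes the file is saved at the path below):
-
-.. code-block:: console
-
-    $ lightning_app run app docs/source-app/code_samples/quickstart/hello_world/app_ui.py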
diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py deleted file mode 100644 index 5399d8205cd49..0000000000000 --- a/docs/source-app/conf.py +++ /dev/null @@ -1,412 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import glob -import inspect -import os -import shutil -import sys - -import lai_sphinx_theme -from lightning_utilities.docs import fetch_external_assets - -import lightning - -_PATH_HERE = os.path.abspath(os.path.dirname(__file__)) -_PATH_ROOT = os.path.realpath(os.path.join(_PATH_HERE, "..", "..")) -sys.path.insert(0, os.path.abspath(_PATH_ROOT)) - -_SPHINX_MOCK_REQUIREMENTS = int(os.environ.get("SPHINX_MOCK_REQUIREMENTS", True)) -_FAST_DOCS_DEV = int(os.environ.get("FAST_DOCS_DEV", True)) -_FETCH_S3_ASSETS = int(os.getenv("DOCS_FETCH_ASSETS", not _FAST_DOCS_DEV)) - -# -- Project information ----------------------------------------------------- - -# this name shall match the project name in Github as it is used for linking to code -project = "lightning" -copyright = lightning.__copyright__ -author = lightning.__author__ - -# The short X.Y version -version = lightning.__version__ -# The full version, including alpha/beta/rc tags -release = lightning.__version__ - -# -- Project documents ------------------------------------------------------- - -if _FETCH_S3_ASSETS: - fetch_external_assets( - docs_folder=_PATH_HERE, - assets_folder="_static/fetched-s3-assets", - retrieve_pattern=r"https?://[-a-zA-Z0-9_]+\.s3\.[-a-zA-Z0-9()_\\+.\\/=]+" - ) - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. - -needs_sphinx = "5.3" - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.doctest", - "sphinx.ext.intersphinx", - "sphinx_toolbox.collapse", - "sphinx.ext.todo", - "sphinx.ext.coverage", - # "sphinx.ext.linkcode", - "sphinx.ext.autosummary", - "sphinx.ext.napoleon", - # 'sphinxcontrib.mockautodoc', # raises error: directive 'automodule' is already registered ... - # 'sphinxcontrib.fulltoc', # breaks pytorch-theme with unexpected kw argument 'titles_only' - "sphinxcontrib.video", - "myst_parser", - "sphinx.ext.autosectionlabel", - "nbsphinx", - "sphinx_autodoc_typehints", - "sphinx_copybutton", - "sphinx_paramlinks", - "sphinx_togglebutton", - "sphinx.ext.githubpages", - "lai_sphinx_theme.extensions.lightning", - 'sphinx.ext.mathjax', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# myst-parser, forcing to parse all html pages with mathjax -# https://github.com/executablebooks/MyST-Parser/issues/394 -myst_update_mathjax = False -# https://myst-parser.readthedocs.io/en/latest/syntax/optional.html?highlight=anchor#auto-generated-header-anchors -myst_heading_anchors = 3 - -# https://berkeley-stat159-f17.github.io/stat159-f17/lectures/14-sphinx..html#conf.py-(cont.) 
-# https://stackoverflow.com/questions/38526888/embed-ipython-notebook-in-sphinx-document -# I execute the notebooks manually in advance. If notebooks test the code, -# they should be run at build time. -nbsphinx_execute = "never" -nbsphinx_allow_errors = True -nbsphinx_requirejs_path = "" - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -# source_suffix = ['.rst', '.md', '.ipynb'] -source_suffix = { - ".rst": "restructuredtext", - ".txt": "markdown", - ".md": "markdown", - ".ipynb": "nbsphinx", -} - -# The master toctree document. -master_doc = "index" - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = 'en' - -# List of patterns, relative to source-app directory, that match files and -# directories to ignore when looking for source-app files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [ - "PULL_REQUEST_TEMPLATE.md", - "**/README.md/*", - "readme.md", - "_templates", - "code_samples/convert_pl_to_app/requirements.txt", - "**/_static/*" -] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = None - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "lai_sphinx_theme" -html_theme_path = [os.environ.get('LIT_SPHINX_PATH', lai_sphinx_theme.get_html_theme_path())] - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. - -html_theme_options = { - "pytorch_project": lightning.__homepage__, - "analytics_id": "G-D3Q2ESCTZR", - "canonical_url": lightning.__homepage__, - "collapse_navigation": False, - "display_version": True, - "logo_only": False, -} - -html_favicon = "_static/images/icon.svg" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_templates", "_static"] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = project + "-doc" - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - # 'preamble': '', - # Latex figure (float) alignment - "figure_align": "htbp", -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source-app start file, target name, title, -# author, documentclass [howto, manual, or own class]). 
-latex_documents = [ - (master_doc, project + ".tex", project + " Documentation", author, "manual"), -] - -# MathJax configuration -mathjax3_config = { - 'tex': { - 'packages': {'[+]': ['ams', 'newcommand', 'configMacros']} - }, -} - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source-app start file, name, description, authors, manual section). -man_pages = [(master_doc, project, project + " Documentation", [author], 1)] - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source-app start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ( - master_doc, - project, - project + " Documentation", - author, - project, - lightning.__docs__, - "Miscellaneous", - ), -] - -# -- Options for Epub output ------------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = project - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. -# -# epub_uid = '' - -# A list of files that should not be packed into the epub file. -epub_exclude_files = ["search.html"] - -# -- Extension configuration ------------------------------------------------- - -# -- Options for intersphinx extension --------------------------------------- - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = { - "python": ("https://docs.python.org/3", None), - "torch": ("https://pytorch.org/docs/stable/", None), - "numpy": ("https://numpy.org/doc/stable/", None), -} - -nitpicky = True - - -nitpick_ignore = [ - ("py:class", "typing.Self"), - # missing in generated API - ("py:exc", "MisconfigurationException"), - # TODO: generated list of all existing ATM, need to be fixed - ('py:exc', 'ApiException'), - ('py:class', 'BaseModel'), - ('py:exc', 'LightningPlatformException'), - ('py:class', 'forwarded'), - ('py:class', 'lightning.app.api.http_methods.Delete'), - ('py:class', 'lightning.app.api.http_methods.Get'), - ('py:class', 'lightning.app.api.http_methods.HttpMethod'), - ('py:class', 'lightning.app.api.http_methods.Post'), - ('py:class', 'lightning.app.api.http_methods.Put'), - ('py:class', 'lightning.app.components.python.TracerPythonScript'), - ('py:func', 'lightning.app.pdb.set_trace'), - ('py:class', 'lightning.app.runners.runtime.Runtime'), - ('py:class', 'lightning.app.source_code.local.LocalSourceCodeDir'), - ('py:class', 'lightning.app.storage.payload._BasePayload'), - ('py:class', 'lightning.app.structures.Dict'), - ('py:class', 'lightning.app.structures.List'), - ('py:class', 'lightning.app.testing.testing.LightningTestApp'), - ('py:class', 'lightning.app.utilities.app_status.WorkStatus'), - ('py:class', 'lightning.app.utilities.frontend.AppInfo'), - ('py:class', 'lightning.app.utilities.packaging.app_config.AppConfig'), - ('py:class', 'lightning.app.utilities.packaging.build_config.BuildConfig'), - ('py:class', 'lightning.app.utilities.packaging.cloud_compute.CloudCompute'), - ('py:class', 'lightning.app.utilities.proxies.WorkRunExecutor'), - ('py:class', 'lightning.app.utilities.tracer.Tracer'), - ('py:class', 'lightning_cloud.openapi.models.cloudspace_id_runs_body.CloudspaceIdRunsBody'), - ('py:class', 'lightning_cloud.openapi.models.externalv1_lightningapp_instance.Externalv1LightningappInstance'), - 
('py:class', 'lightning_cloud.openapi.models.v1_cloud_space.V1CloudSpace'), - ('py:class', 'lightning_cloud.openapi.models.v1_env_var.V1EnvVar'), - ('py:class', 'lightning_cloud.openapi.models.v1_flowserver.V1Flowserver'), - ('py:class', 'lightning_cloud.openapi.models.v1_lightning_auth.V1LightningAuth'), - ('py:class', 'lightning_cloud.openapi.models.v1_lightning_run.V1LightningRun'), - ('py:class', 'lightning_cloud.openapi.models.v1_lightningwork_drives.V1LightningworkDrives'), - ('py:class', 'lightning_cloud.openapi.models.v1_membership.V1Membership'), - ('py:class', 'lightning_cloud.openapi.models.v1_network_config.V1NetworkConfig'), - ('py:class', 'lightning_cloud.openapi.models.v1_queue_server_type.V1QueueServerType'), - ('py:class', 'lightning_cloud.openapi.models.v1_work.V1Work'), - ('py:class', 'pydantic.main.BaseModel'), - ('py:meth', 'transfer'), -] - -# -- Options for todo extension ---------------------------------------------- - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True - - -def setup(app): - # this is for hiding doctest decoration, - # see: http://z4r.github.io/python/2011/12/02/hides-the-prompts-and-output/ - app.add_js_file("copybutton.js") - app.add_css_file("main.css") - -# copy all examples to local folder -path_examples = os.path.join(_PATH_HERE, "..", "examples") -if not os.path.isdir(path_examples): - os.mkdir(path_examples) -for path_app_example in glob.glob(os.path.join(_PATH_ROOT, "examples", "app_*")): - path_app_example2 = os.path.join(path_examples, os.path.basename(path_app_example)) - if not os.path.isdir(path_app_example2): - shutil.copytree(path_app_example, path_app_example2, dirs_exist_ok=True) - - -# Ignoring Third-party packages -# https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule -def _package_list_from_file(file): - list_pkgs = [] - with open(file) as fp: - lines = fp.readlines() - for ln in lines: - found = [ln.index(ch) for ch in list(",=<>#") if ch in ln] - pkg = ln[: min(found)] if found else ln - if pkg.rstrip(): - list_pkgs.append(pkg.rstrip()) - return list_pkgs - - -# define mapping from PyPI names to python imports -PACKAGE_MAPPING = { - "PyYAML": "yaml", -} -MOCK_PACKAGES = [] -if _SPHINX_MOCK_REQUIREMENTS: - # mock also base packages when we are on RTD since we don't install them there - MOCK_PACKAGES += _package_list_from_file(os.path.join(_PATH_ROOT, "requirements.txt")) -MOCK_PACKAGES = [PACKAGE_MAPPING.get(pkg, pkg) for pkg in MOCK_PACKAGES] - -autodoc_mock_imports = MOCK_PACKAGES - - -autosummary_generate = True - -autodoc_member_order = "groupwise" -autoclass_content = "both" -# the options are fixed and will be soon in release, -# see https://github.com/sphinx-doc/sphinx/issues/5459 -autodoc_default_options = { - "members": None, - "methods": None, - # 'attributes': None, - "special-members": "__call__", - "exclude-members": "_abc_impl", - "show-inheritance": True, - "private-members": True, - "noindex": True, -} - -# Sphinx will add “permalinks” for each heading and description environment as paragraph signs that -# become visible when the mouse hovers over them. -# This value determines the text for the permalink; it defaults to "¶". Set it to None or the empty -# string to disable permalinks. -# https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-html_permalinks -# html_add_permalinks = "¶" -# True to prefix each section label with the name of the document it is in, followed by a colon. 
-# For example, index:Introduction for a section called Introduction that appears in document index.rst. -# Useful for avoiding ambiguity when the same section heading appears in different documents. -# http://www.sphinx-doc.org/en/master/usage/extensions/autosectionlabel.html -autosectionlabel_prefix_document = True - -# only run doctests marked with a ".. doctest::" directive -doctest_test_doctest_blocks = "" -doctest_global_setup = """ -import importlib -import os - -from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute -from lightning.fabric.loggers.tensorboard import _TENSORBOARD_AVAILABLE, _TENSORBOARDX_AVAILABLE -""" -coverage_skip_undoc_in_source = True - -# skip false positive linkcheck errors from anchors -linkcheck_anchors = False - -# A timeout value, in seconds, for the linkcheck builder. -linkcheck_timeout = 60 - -# ignore all links in any CHANGELOG file -linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"] - - -# ignore the following relative links (false positive errors during linkcheck) -linkcheck_ignore = [ - "https://www.openai.com/index/clip/", -] diff --git a/docs/source-app/contribute_app.rst b/docs/source-app/contribute_app.rst deleted file mode 100644 index 2f690e8479062..0000000000000 --- a/docs/source-app/contribute_app.rst +++ /dev/null @@ -1,7 +0,0 @@ -:orphan: - -################# -Contribute an app -################# - -Show off your work! Contribute and example to be highlighted in our documentation and App gallery. diff --git a/docs/source-app/core_api/lightning_app/app.py b/docs/source-app/core_api/lightning_app/app.py deleted file mode 100644 index 42f716f99ffb2..0000000000000 --- a/docs/source-app/core_api/lightning_app/app.py +++ /dev/null @@ -1,27 +0,0 @@ -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app.utilities.app_helpers import pretty_state - - -class Work(LightningWork): - def __init__(self): - super().__init__(cache_calls=False) - # Attributes are registered automatically in the state. - self.counter = 0 - - def run(self): - # Incrementing an attribute gets reflected in the `Flow` state. - self.counter += 1 - - -class Flow(LightningFlow): - def __init__(self): - super().__init__() - self.w = Work() - - def run(self): - if self.w.has_started: - print(f"State: {pretty_state(self.state)} \n") - self.w.run() - - -app = LightningApp(Flow()) diff --git a/docs/source-app/core_api/lightning_app/communication.rst b/docs/source-app/core_api/lightning_app/communication.rst deleted file mode 100644 index d1aa1d35f54b1..0000000000000 --- a/docs/source-app/core_api/lightning_app/communication.rst +++ /dev/null @@ -1,15 +0,0 @@ -:orphan: - -########################################## -Communication between Lightning Components -########################################## - -**Audience:** Users that want to create interactive applications. - -**Level:** Intermediate - -**Prerequisite**: Read the :doc:`Communication in Lightning Apps article <../../../workflows/access_app_state>`. - ----- - -.. 
include:: ../../core_api/lightning_app/communication_content.rst
diff --git a/docs/source-app/core_api/lightning_app/communication_content.rst b/docs/source-app/core_api/lightning_app/communication_content.rst
deleted file mode 100644
index 4b373dbb07588..0000000000000
--- a/docs/source-app/core_api/lightning_app/communication_content.rst
+++ /dev/null
@@ -1,160 +0,0 @@
-
-********************************
-Communication Between Components
-********************************
-
-When creating interactive Lightning Apps (App) with multiple components, you may need your components to share information with each other and rely on that information to control their execution, share progress in the UI, trigger a sequence of operations, etc.
-
-To accomplish that, Lightning components can communicate using the App State. The App State is composed of all attributes defined within each component's **__init__** method, e.g., anything attached to the component with **self.x = y**.
-
-All attributes of all **LightningWork (Work)** components are accessible in the **LightningFlow (Flow)** components in real-time.
-
-By design, the Flows communicate with all **Works** within the application. However, Works can't communicate with each other directly; they must use Flows as a proxy to communicate.
-
-Once a Work is running, any updates to the Work's state are automatically communicated to the Flow, as a delta (using `DeepDiff `_). The state communication isn't bi-directional; it only goes from Work to Flow.
-
-Internally, the App alternates between collecting deltas sent from all the registered Works and/or the UI, and running the run method of the App's root Flow.
-
-----
-
-*************************************************
-Communication from LightningWork to LightningFlow
-*************************************************
-
-A LightningFlow (Flow) can access its children's LightningWork (Work) state.
-
-When a running Work attribute gets updated inside its method (in a separate process locally or on a remote machine), the app re-executes the Flow's run method once it receives the state update from the Work.
-
-Here's an example to better understand communication from Work to Flow.
-
-The ``WorkCounter`` increments a counter up to 10 million and the ``Flow`` prints the work counter.
-
-As the Work runs in its own process, its state changes are sent to the Flow, which contains the latest value of the counter.
-
-.. code-block:: python
-
-    import lightning as L
-
-
-    class WorkCounter(L.LightningWork):
-        def __init__(self):
-            super().__init__(parallel=True)
-            self.counter = 0
-
-        def run(self):
-            for _ in range(int(10e6)):
-                self.counter += 1
-
-
-    class Flow(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.w = WorkCounter()
-
-        def run(self):
-            self.w.run()
-            print(self.w.counter)
-
-
-    app = L.LightningApp(Flow())
-
-
-A delta sent from the Work to the Flow looks like this:
-
-.. code-block:: python
-
-    {"values_changed": {"root['works']['w']['vars']['counter']": {"new_value": 425}}}
-
-Here is the associated illustration:
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/deltas.gif
-   :alt: Mechanism showing how deltas are sent.
-   :width: 100 %
-
-Here's another example that is slightly different. Here we define a Flow and a Work, where the Work increments a counter indefinitely and the Flow prints its state, which contains the Work.
-
-You can easily check the state of your entire app as follows:
-
-.. literalinclude:: ../../core_api/lightning_app/app.py
-
-Run the app with:
-
-.. 
code-block:: bash - - lightning run app docs/source/core_api/lightning_app/app.py - -And here's the output you get when running the App using the **Lightning CLI**: - -.. code-block:: console - - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - State: {'works': {'w': {'vars': {'counter': 1}}}} - State: {'works': {'w': {'vars': {'counter': 2}}}} - State: {'works': {'w': {'vars': {'counter': 3}}}} - State: {'works': {'w': {'vars': {'counter': 3}}}} - State: {'works': {'w': {'vars': {'counter': 4}}}} - ... - ----- - -************************************************* -Communication from LightningFlow to LightningWork -************************************************* - -Communication from the LightningFlow (Flow) to the LightningWork (Work) while running **isn't supported yet**. If your application requires this feature, please open an issue on Github. - -Here's an example of what would happen if you try to have the Flow communicate with the Work: - -.. code-block:: python - - import lightning as L - from time import sleep - - - class WorkCounter(L.LightningWork): - def __init__(self): - super().__init__(parallel=True) - self.counter = 0 - - def run(self): - while True: - sleep(1) - print(f"Work {self.counter}") - - - class Flow(L.LightningFlow): - def __init__(self): - super().__init__() - self.w = WorkCounter() - - def run(self): - self.w.run() - sleep(1) - print(f"Flow {self.w.counter}") - self.w.counter += 1 - - - app = L.LightningApp(Flow()) - -As you can see, there is a divergence between the values within the Work and the Flow. - -.. code-block:: console - - Flow 0 - Flow 1 - Flow 2 - Flow 3 - Work 0 - Flow 4 - Work 0 - Flow 5 - Work 0 - Flow 6 - Work 0 - Flow 7 - Work 0 - Flow 8 - Work 0 - Flow 9 - Work 0 - Flow 10 diff --git a/docs/source-app/core_api/lightning_app/compute_content.rst b/docs/source-app/core_api/lightning_app/compute_content.rst deleted file mode 100644 index 8bb2e7039ad83..0000000000000 --- a/docs/source-app/core_api/lightning_app/compute_content.rst +++ /dev/null @@ -1,40 +0,0 @@ -:orphan: - -*************************** -Customize my Flow resources -*************************** - -In the cloud, you can simply configure which machine to run on by passing -a :class:`~lightning.app.utilities.packaging.cloud_compute.CloudCompute` to your work ``__init__`` method: - -.. code-block:: python - - import lightning as L - - # Run on a small, shared CPU machine. This is the default for every LightningFlow. - app = L.LightningApp(L.Flow(), flow_cloud_compute=L.CloudCompute()) - - -Here is the full list of supported machine names: - -.. list-table:: Hardware by Accelerator Type - :widths: 25 25 25 - :header-rows: 1 - - * - Name - - # of CPUs - - Memory - * - flow-lite - - 0.3 - - 4 GB - -The up-to-date prices for these instances can be found `here `_. - ----- - -************ -CloudCompute -************ - -.. autoclass:: lightning.app.utilities.packaging.cloud_compute.CloudCompute - :noindex: diff --git a/docs/source-app/core_api/lightning_app/dynamic_work.rst b/docs/source-app/core_api/lightning_app/dynamic_work.rst deleted file mode 100644 index bf202aa590a79..0000000000000 --- a/docs/source-app/core_api/lightning_app/dynamic_work.rst +++ /dev/null @@ -1,15 +0,0 @@ -:orphan: - -.. _dynamic_work: - -##################### -Dynamic LightningWork -##################### - -**Audience:** Users who want to create applications that adapt to user demands. - -**Level:** Advanced - ----- - -.. 
include:: dynamic_work_content.rst
diff --git a/docs/source-app/core_api/lightning_app/dynamic_work_content.rst b/docs/source-app/core_api/lightning_app/dynamic_work_content.rst
deleted file mode 100644
index 31616a42a1493..0000000000000
--- a/docs/source-app/core_api/lightning_app/dynamic_work_content.rst
+++ /dev/null
@@ -1,202 +0,0 @@
-***************************************
-What Dynamic LightningWork does for you
-***************************************
-
-Dynamic LightningWork (Work) changes the resources your application uses while the application is running (aka at runtime).
-
-For example, imagine you want to create a research notebook app for your team. You want every member to be able to create multiple `JupyterLab `_ sessions on their hardware of choice.
-
-To allow every notebook to choose hardware, it needs to be set up in its own :class:`~lightning.app.core.work.LightningWork`, but you can't know in advance how many notebooks users will need. In this case, you'll need to add ``LightningWorks`` dynamically at run time.
-
-----
-
-*****************
-Use Dynamic Works
-*****************
-
-Dynamic Works should be used anytime you want to change the resources your application is using while it is running (aka at runtime).
-
-You're usually going to add and stop Works together.
-
-----
-
-Add a Dynamic Work
-^^^^^^^^^^^^^^^^^^
-
-There are a couple of ways you can add a dynamic Work:
-
-- Option 1: Attach your components in the **run** method using the Python functions.
-- Option 2: Use the Lightning built-in classes :class:`~lightning.app.structures.Dict` or :class:`~lightning.app.structures.List`.
-
-.. note:: Using the Lightning built-in classes is usually easier to read.
-
-----
-
-**OPTION 1:** Attach your components in the run method of a flow using the Python functions **hasattr**, **setattr**, and **getattr**:
-
-.. code-block:: python
-
-    class RootFlow(L.LightningFlow):
-        def run(self):
-            if not hasattr(self, "work"):
-                # The `Work` component is created and attached here.
-                setattr(self, "work", Work())
-            # Run the `Work` component.
-            getattr(self, "work").run()
-
-**OPTION 2:** Use the built-in Lightning classes :class:`~lightning.app.structures.Dict` or :class:`~lightning.app.structures.List`:
-
-.. code-block:: python
-
-    from lightning.app.structures import Dict
-
-
-    class RootFlow(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.dict = Dict()
-
-        def run(self):
-            if "work" not in self.dict:
-                # The `Work` component is attached here.
-                self.dict["work"] = Work()
-            self.dict["work"].run()
-
-----
-
-Stop a Work
-^^^^^^^^^^^
-Stop a work when you are concerned about cost.
-
-To stop a work, use the work ``stop`` method:
-
-.. code-block:: python
-
-    class RootFlow(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.work = Work()
-
-        def run(self):
-            self.work.stop()
-
-----
-
-*********************
-Dynamic Work Examples
-*********************
-
-..
-    The entire application can be found `here `_.
-
-----
-
-Dynamic Work with Jupyter Notebooks
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-In this example, we are dynamically creating a ``JupyterLabWork`` every time a user clicks the **Create Jupyter Notebook** button.
-
-In order to do that, we are iterating over the list of ``jupyter_config_requests`` indefinitely.
-
-.. code-block:: python
-
-    import lightning as L
-
-
-    class JupyterLabManager(L.LightningFlow):
-        """This flow manages the users' notebooks running within works."""
-
-        def __init__(self):
-            super().__init__()
-            self.jupyter_works = L.structures.Dict()
-            self.jupyter_config_requests = []
-
-        def run(self):
-            for idx, jupyter_config in enumerate(self.jupyter_config_requests):
-
-                # The Jupyter config has this form:
-                # {"use_gpu": False/True, "token": None, "username": ..., "stop": False}
-
-                # Step 1: Check if a JupyterWork already exists for this username
-                username = jupyter_config["username"]
-                if username not in self.jupyter_works:
-                    jupyter_config["ready"] = False
-
-                    # Set the hardware selected by the user: GPU or CPU.
-                    cloud_compute = L.CloudCompute("gpu" if jupyter_config["use_gpu"] else "cpu-small")
-
-                    # Step 2: Create a new JupyterWork dynamically!
-                    self.jupyter_works[username] = JupyterLabWork(cloud_compute=cloud_compute)
-
-                # Step 3: Run the JupyterWork
-                self.jupyter_works[username].run()
-
-                # Step 4: Store the notebook token in the associated config.
-                # We are using this to know when the notebook is ready
-                # and display the stop button on the UI.
-                if self.jupyter_works[username].token:
-                    jupyter_config["token"] = self.jupyter_works[username].token
-
-                # Step 5: Stop the work if the user requested it.
-                if jupyter_config["stop"]:
-                    self.jupyter_works[username].stop()
-                    self.jupyter_config_requests.pop(idx)
-
-        def configure_layout(self):
-            return L.app.frontend.StreamlitFrontend(render_fn=render_fn)
-
-----
-
-Dynamic Works with StreamLit UI
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Continuing from the Jupyter Notebook example, the UI receives the **state** of the Jupyter Manager, and that state can be modified directly from the UI.
-
-.. code-block:: python
-
-    import streamlit as st
-
-
-    def render_fn(state):
-        # Step 1: Enable users to select their notebooks and create them
-        column_1, column_2, column_3 = st.columns(3)
-        with column_1:
-            create_jupyter = st.button("Create Jupyter Notebook")
-        with column_2:
-            username = st.text_input("Enter your username", "tchaton")
-            assert username
-        with column_3:
-            use_gpu = st.checkbox("Use GPU")
-
-        # Step 2: If a user clicked the button, add an element to the list of configs
-        # Note: state.jupyter_config_requests = ... will send the state update to the component.
-        if create_jupyter:
-            new_config = [{"use_gpu": use_gpu, "token": None, "username": username, "stop": False}]
-            state.jupyter_config_requests = state.jupyter_config_requests + new_config
-
-        # Step 3: List the running notebooks.
-        for idx, config in enumerate(state.jupyter_config_requests):
-            column_1, column_2, column_3 = st.columns(3)
-            with column_1:
-                if not idx:
-                    st.write("Idx")
-                st.write(f"{idx}")
-            with column_2:
-                if not idx:
-                    st.write("Use GPU")
-                st.write(config["use_gpu"])
-            with column_3:
-                if not idx:
-                    st.write("Stop")
-                if config["token"]:
-                    should_stop = st.button("Stop this notebook")
-
-                    # Step 4: Change stop if the user clicked the button
-                    if should_stop:
-                        config["stop"] = should_stop
-                        state.jupyter_config_requests = state.jupyter_config_requests
diff --git a/docs/source-app/core_api/lightning_app/index.rst b/docs/source-app/core_api/lightning_app/index.rst
deleted file mode 100644
index bf0430f3bee5b..0000000000000
--- a/docs/source-app/core_api/lightning_app/index.rst
+++ /dev/null
@@ -1,94 +0,0 @@
-#############
-Lightning App
-#############
-**Audience:** Users who want to know how an app works under the hood 🤯.
- -**Lightning App:** We call workflows composed of multiple LightningWorks a **Lightning App**. - ----- - -******************* -Peek under the hood -******************* - - -.. raw:: html - -
-
-
-.. displayitem::
-   :header: App Components Tree (Basic)
-   :description: Learn more about component composition and nesting.
-   :col_css: col-md-4
-   :button_link: ../../glossary/app_tree.html
-   :height: 180
-   :tag: Basic
-
-.. displayitem::
-   :header: The event loop (Basic)
-   :description: Learn more about the event loop.
-   :col_css: col-md-4
-   :button_link: ../../glossary/event_loop.html
-   :height: 180
-   :tag: Basic
-
-.. displayitem::
-   :header: Communication between Flow and Works
-   :description: Learn more about how components communicate.
-   :col_css: col-md-4
-   :button_link: communication.html
-   :height: 180
-   :tag: Intermediate
-
-.. displayitem::
-   :header: Customize Flow compute resources
-   :description: Learn more about Flow customizations.
-   :col_css: col-md-4
-   :button_link: compute_content.html
-   :height: 180
-   :tag: Intermediate
-
-.. displayitem::
-   :header: Dynamically create, execute and stop Work
-   :description: Learn more about component creation.
-   :col_css: col-md-4
-   :button_link: dynamic_work.html
-   :height: 180
-   :tag: Intermediate
-
-.. displayitem::
-   :header: Sharing My Components (Intermediate)
-   :description: Learn more about component composition and nesting.
-   :col_css: col-md-4
-   :button_link: ../../glossary/sharing_components.html
-   :height: 180
-   :tag: Intermediate
-
-.. raw:: html
-
-
- ----- - -***************** -Lightning App API -***************** - -.. raw:: html - -
-
- -.. displayitem:: - :header: LightningApp API - :description: Look into the Lightning API reference. - :col_css: col-md-4 - :button_link: lightning_app.html - :height: 180 - -.. raw:: html - -
-
diff --git a/docs/source-app/core_api/lightning_app/lightning_app.rst b/docs/source-app/core_api/lightning_app/lightning_app.rst deleted file mode 100644 index af9592628a6f2..0000000000000 --- a/docs/source-app/core_api/lightning_app/lightning_app.rst +++ /dev/null @@ -1,12 +0,0 @@ -:orphan: - -.. _lightning_app: - -############ -LightningApp -############ - - -.. autoclass:: lightning.app.core.app.LightningApp - :exclude-members: _run, connect, get_component_by_name, maybe_apply_changes, set_state - :noindex: diff --git a/docs/source-app/core_api/lightning_flow.rst b/docs/source-app/core_api/lightning_flow.rst deleted file mode 100644 index 642112ae02793..0000000000000 --- a/docs/source-app/core_api/lightning_flow.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _lightning_flow: - -############# -LightningFlow -############# - -.. autoclass:: lightning.app.core.flow.LightningFlow - :exclude-members: _attach_backend, _exit, _is_state_attribute, set_state diff --git a/docs/source-app/core_api/lightning_work/compute.rst b/docs/source-app/core_api/lightning_work/compute.rst deleted file mode 100644 index 89313c4878cec..0000000000000 --- a/docs/source-app/core_api/lightning_work/compute.rst +++ /dev/null @@ -1,15 +0,0 @@ -:orphan: - -.. _cloud_compute: - -############################ -Customize your Cloud Compute -############################ - -**Audience:** Users who want to select the hardware to run in the cloud. - -**Level:** Intermediate - ----- - -.. include:: compute_content.rst diff --git a/docs/source-app/core_api/lightning_work/compute_content.rst b/docs/source-app/core_api/lightning_work/compute_content.rst deleted file mode 100644 index 1ca64429dc342..0000000000000 --- a/docs/source-app/core_api/lightning_work/compute_content.rst +++ /dev/null @@ -1,94 +0,0 @@ - -*************************** -Customize my Work resources -*************************** - -In the cloud, you can simply configure which machine to run on by passing -a :class:`~lightning.app.utilities.packaging.cloud_compute.CloudCompute` to your work ``__init__`` method: - -.. code-block:: python - - import lightning as L - - # Run on a free, shared CPU machine. This is the default for every LightningWork. - MyCustomWork(cloud_compute=L.CloudCompute()) - - # Run on a dedicated, medium-size CPU machine (see specs below) - MyCustomWork(cloud_compute=L.CloudCompute("cpu-medium")) - - # Run on cheap GPU machine with a single GPU (see specs below) - MyCustomWork(cloud_compute=L.CloudCompute("gpu")) - - # Run on a fast multi-GPU machine (see specs below) - MyCustomWork(cloud_compute=L.CloudCompute("gpu-fast-multi")) - -.. warning:: - Custom base images are not supported with the default CPU cloud compute. For example: - - .. code-block:: py - - class MyWork(LightningWork): - def __init__(self): - super().__init__(cloud_build_config=BuildConfig(image="my-custom-image")) # no cloud compute, for example default work - - -Here is the full list of supported machine names: - -.. list-table:: Hardware by Accelerator Type - :widths: 25 25 25 25 - :header-rows: 1 - - * - Name - - # of CPUs - - GPUs - - Memory - * - default - - 1 - - 0 - - 4 GB - * - cpu-small - - 2 - - 0 - - 8 GB - * - cpu-medium - - 8 - - 0 - - 32 GB - * - gpu - - 4 - - 1 (T4, 16 GB) - - 16 GB - * - gpu-fast - - 8 - - 1 (V100, 16 GB) - - 61 GB - * - gpu-fast-multi - - 32 - - 4 (V100 16 GB) - - 244 GB - -The up-to-date prices for these instances can be found `here `_. 
-
----
-
-**********************
-Stop my work when idle
-**********************
-
-By providing **idle_timeout=X seconds**, the work is automatically stopped **X seconds** after doing nothing.
-
-.. code-block:: python
-
-    import lightning as L
-
-    # Run on a single GPU and turn down immediately when idle.
-    MyCustomWork(cloud_compute=L.CloudCompute("gpu", idle_timeout=0))
-
----
-
-************
-CloudCompute
-************
-
-.. autoclass:: lightning.app.utilities.packaging.cloud_compute.CloudCompute
-    :noindex:
diff --git a/docs/source-app/core_api/lightning_work/handling_app_exception.rst b/docs/source-app/core_api/lightning_work/handling_app_exception.rst
deleted file mode 100644
index 20c9b618d97aa..0000000000000
--- a/docs/source-app/core_api/lightning_work/handling_app_exception.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-:orphan:
-
-###############################
-Handle Lightning App exceptions
-###############################
-
-**Audience:** Users who want to make Lightning Apps more robust to potential issues.
-
-**Level:** Advanced
-
----
-
-.. include:: handling_app_exception_content.rst
diff --git a/docs/source-app/core_api/lightning_work/handling_app_exception_content.rst b/docs/source-app/core_api/lightning_work/handling_app_exception_content.rst
deleted file mode 100644
index 4840cf5fdf6f3..0000000000000
--- a/docs/source-app/core_api/lightning_work/handling_app_exception_content.rst
+++ /dev/null
@@ -1,74 +0,0 @@
-
-***************************************************
-What handling Lightning App exceptions does for you
-***************************************************
-
-Imagine you are creating a Lightning App (App) where your team can launch model training by providing their own Github Repo any time they want.
-
-As the App admin, you don't want the App to go down if their code has a bug and breaks.
-
-Instead, you would like the LightningWork (Work) to capture the exception and present the issue to users.
-
----
-
-****************************
-Configure exception handling
-****************************
-
-The LightningWork (Work) accepts an argument **raise_exception**, which is **True** by default. This aligns with Python's default behavior.
-
-However, for the use case stated in the previous section, we want to capture the Work exceptions. This is done by providing ``raise_exception=False`` to the work ``__init__`` method.
-
-.. code-block:: python
-
-    import lightning as L
-
-    MyCustomWork(raise_exception=False)  # <== HERE: The exception is captured.
-
-    # Default behavior
-    MyCustomWork(raise_exception=True)  # <== HERE: The exception is raised within the flow and terminates the app
-
-
-And you can customize this behavior by overriding the ``on_exception`` hook of the Work.
-
-.. code-block:: python
-
-    import lightning as L
-
-
-    class MyCustomWork(L.LightningWork):
-        def on_exception(self, exception: Exception):
-            ...  # do something when an exception is triggered.
-
----
-
-**************************
-Exception handling example
-**************************
-
-This is the pseudo-code for the application described above.
-
-.. code-block:: python
-
-    import lightning as L
-
-
-    class RootFlow(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.user_jobs = L.structures.Dict()
-            self.requested_jobs = []
-
-        def run(self):
-            for request in self.requested_jobs:
-                job_id = request["id"]
-                if job_id not in self.user_jobs:
-                    # Note: The `GithubRepoLauncher` doesn't exist yet.
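-                    # It stands in for any ``LightningWork`` subclass that would
-                    # clone and run the user's repo; treat it as pseudo-code.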
- self.user_jobs[job_id] = GithubRepoLauncher( - **request, - raise_exception=False, # <== HERE: The exception is captured. - ) - self.user_jobs[job_id].run() - - if self.user_jobs[job_id].status.stage == "failed" and "printed" not in request: - print(self.user_jobs[job_id].status) # <== HERE: Print the user exception. - request["printed"] = True diff --git a/docs/source-app/core_api/lightning_work/index.rst b/docs/source-app/core_api/lightning_work/index.rst deleted file mode 100644 index 0b660f2209aba..0000000000000 --- a/docs/source-app/core_api/lightning_work/index.rst +++ /dev/null @@ -1,112 +0,0 @@ -############## -Lightning Work -############## - -**Audience:** Users who want to know how Lightning Work works under the hood 🤯. - ----- - -******************* -Peek under the hood -******************* - - -.. raw:: html - -
-
- -.. displayitem:: - :header: To Cache or Not to Cache Calls - :description: Learn more about work execution and internal caching. - :col_css: col-md-4 - :button_link: ../../workflows/run_work_once.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Parallel Work - :description: Learn more about work parallelization. - :col_css: col-md-4 - :button_link: ../../workflows/run_work_in_parallel.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Sharing files between works - :description: Learn more about data transfer works. - :col_css: col-md-4 - :button_link: ../../glossary/storage/storage.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Sharing Python Objects between works - :description: Learn more about sharing objects. - :col_css: col-md-4 - :button_link: payload.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Checking Work Status - :description: Learn more about work status. - :col_css: col-md-4 - :button_link: status.html - :height: 180 - :tag: Advanced - -.. displayitem:: - :header: Handling App Exceptions - :description: Learn more about exception capturing. - :col_css: col-md-4 - :button_link: handling_app_exception.html - :height: 180 - :tag: Advanced - -.. raw:: html - -
-
- -.. raw:: html - -
-
- -.. displayitem:: - :header: Customize your Cloud Compute - :description: Learn more about changing hardware and requirements. - :col_css: col-md-4 - :button_link: compute.html - :height: 180 - :tag: Cloud - -.. raw:: html - -
-
- - ----- - -****************** -Lightning Work API -****************** - -.. raw:: html - -
-
- -.. displayitem:: - :header: LightningWork API - :description: Look into the Lightning API reference. - :col_css: col-md-4 - :button_link: lightning_work.html - :height: 180 - -.. raw:: html - -
-
diff --git a/docs/source-app/core_api/lightning_work/lightning_work.rst b/docs/source-app/core_api/lightning_work/lightning_work.rst deleted file mode 100644 index 54c7328411a3c..0000000000000 --- a/docs/source-app/core_api/lightning_work/lightning_work.rst +++ /dev/null @@ -1,11 +0,0 @@ -:orphan: - -.. _lightning_work: - -############# -LightningWork -############# - -.. autoclass:: lightning.app.core.work.LightningWork - :exclude-members: _aggregate_status_timeout, _is_state_attribute, _is_state_attribute, set_state - :noindex: diff --git a/docs/source-app/core_api/lightning_work/payload.rst b/docs/source-app/core_api/lightning_work/payload.rst deleted file mode 100644 index cde42af1bd6d2..0000000000000 --- a/docs/source-app/core_api/lightning_work/payload.rst +++ /dev/null @@ -1,15 +0,0 @@ -:orphan: - -###################################### -Sharing Objects between LightningWorks -###################################### - -**Audience:** Users who want to know how to transfer Python objects between their LightningWorks. - -**Level:** Advanced - -**Prerequisite**: Reach Level 16+, know about the `pandas DataFrames `_ and read and read the :doc:`Access app state guide <../../workflows/access_app_state>`. - ----- - -.. include:: payload_content.rst diff --git a/docs/source-app/core_api/lightning_work/payload_content.rst b/docs/source-app/core_api/lightning_work/payload_content.rst deleted file mode 100644 index 780f3985e30ea..0000000000000 --- a/docs/source-app/core_api/lightning_work/payload_content.rst +++ /dev/null @@ -1,75 +0,0 @@ - -************************************** -What transferring objects does for you -************************************** - -Imagine your application is processing some data using `pandas DaFrame `_ and you want to pass that data to another LightningWork (Work). This is what the **Payload API** is meant for. - ----- - -************************* -Use the Lightning Payload -************************* - -The Payload enables non JSON-serializable attribute objects to be part of your Work's state and to be communicated to other Works. - -Here is an example: - -.. code-block:: python - - import lightning as L - import pandas as pd - - - class SourceWork(L.LightningWork): - def __init__(self): - super().__init__() - self.df = None - - def run(self): - # do some processing - - df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) - - # The object you care about needs to be wrapped into a Payload object. - self.df = L.storage.Payload(df) - - # You can access the original object from the payload using its value property. - print("src", self.df.value) - # src col1 col2 - # 0 1 3 - # 1 2 4 - -Once the Payload object is attached to your Work's state, it can be passed to another work using the LightningFlow (Flow) as follows: - -.. code-block:: python - - import lightning as L - import pandas as pd - - - class DestinationWork(L.LightningWork): - def run(self, df: L.storage.Payload): - # You can access the original object from the payload using its value property. - print("dst", df.value) - # dst col1 col2 - # 0 1 3 - # 1 2 4 - - - class Flow(L.LightningFlow): - def __init__(self): - super().__init__() - self.src = SourceWork() - self.dst = DestinationWork() - - def run(self): - self.src.run() - # The pandas DataFrame created by the ``SourceWork`` - # is passed to the ``DestinationWork``. - # Internally, Lightning pickles and un-pickle the python object, - # so you receive a copy of the original object. 
- self.dst.run(df=self.src.df) - - - app = L.LightningApp(Flow()) diff --git a/docs/source-app/core_api/lightning_work/status.rst b/docs/source-app/core_api/lightning_work/status.rst deleted file mode 100644 index af3a27ac4047e..0000000000000 --- a/docs/source-app/core_api/lightning_work/status.rst +++ /dev/null @@ -1,13 +0,0 @@ -:orphan: - -#################### -LightningWork Status -#################### - -**Audience:** Users who want to understand ``LightningWork`` under the hood. - -**Level:** Advanced - ----- - -.. include:: status_content.rst diff --git a/docs/source-app/core_api/lightning_work/status_content.rst b/docs/source-app/core_api/lightning_work/status_content.rst deleted file mode 100644 index bb1f2f0273c84..0000000000000 --- a/docs/source-app/core_api/lightning_work/status_content.rst +++ /dev/null @@ -1,197 +0,0 @@ - -************************************* -Everything about LightningWork Status -************************************* - -Statuses indicate transition points in the life of a LightningWork (Work) and contain metadata. - -The different stages are: - -.. code-block:: python - - class WorkStageStatus: - NOT_STARTED = "not_started" - STOPPED = "stopped" - PENDING = "pending" - RUNNING = "running" - SUCCEEDED = "succeeded" - FAILED = "failed" - -And a single status is as follows: - -.. code-block:: python - - @dataclass - class WorkStatus: - stage: WorkStageStatus - timestamp: float - reason: Optional[str] = None - message: Optional[str] = None - count: int = 1 - - -On creation, the Work's status flags all evaluate to ``False`` (in particular ``has_started``) and when calling ``work.run`` in your Lightning Flow (Flow), -the Work transitions from ``is_pending`` to ``is_running`` and then to ``has_succeeded`` if everything went well or ``has_failed`` otherwise. - -.. code-block:: python - - from time import sleep - import lightning as L - - - class Work(L.LightningWork): - def run(self, value: int): - sleep(1) - if value == 0: - return - raise Exception(f"The provided value was {value}") - - - class Flow(L.LightningFlow): - def __init__(self): - super().__init__() - self.work = Work(raise_exception=False) - self.counter = 0 - - def run(self): - if not self.work.has_started: - print("NOT STARTED") - - elif self.work.is_pending: - print("PENDING") - - elif self.work.is_running: - print("RUNNING") - - elif self.work.has_succeeded: - print("SUCCESS") - - elif self.work.has_failed: - print("FAILED") - - elif self.work.has_stopped: - print("STOPPED") - self.stop() - - print(self.work.status) - self.work.run(self.counter) - self.counter += 1 - - - app = L.LightningApp(Flow()) - -Run this app as follows: - -.. code-block:: bash - - lightning run app test.py > app_log.txt - -And here is the expected output inside ``app_log.txt`` and as expected, -we are observing the following transition ``has_started``, ``is_pending``, ``is_running``, ``has_succeeded``, ``is_running`` and ``has_failed`` - -.. code-block:: console - - NOT STARTED - WorkStatus(stage='not_started', timestamp=1653498225.18468, reason=None, message=None, count=1) - PENDING - WorkStatus(stage='pending', timestamp=1653498225.217413, reason=None, message=None, count=1) - PENDING - WorkStatus(stage='pending', timestamp=1653498225.217413, reason=None, message=None, count=1) - PENDING - ... 
- PENDING - WorkStatus(stage='pending', timestamp=1653498225.217413, reason=None, message=None, count=1) - PENDING - WorkStatus(stage='pending', timestamp=1653498225.217413, reason=None, message=None, count=1) - RUNNING - WorkStatus(stage='running', timestamp=1653498228.825194, reason=None, message=None, count=1) - ... - SUCCESS - WorkStatus(stage='succeeded', timestamp=1653498229.831793, reason=None, message=None, count=1) - SUCCESS - WorkStatus(stage='succeeded', timestamp=1653498229.831793, reason=None, message=None, count=1) - SUCCESS - WorkStatus(stage='succeeded', timestamp=1653498229.831793, reason=None, message=None, count=1) - RUNNING - WorkStatus(stage='running', timestamp=1653498229.846451, reason=None, message=None, count=1) - RUNNING - ... - WorkStatus(stage='running', timestamp=1653498229.846451, reason=None, message=None, count=1) - RUNNING - WorkStatus(stage='running', timestamp=1653498229.846451, reason=None, message=None, count=1) - FAILED - WorkStatus(stage='failed', timestamp=1653498230.852565, reason='user_exception', message='The provided value was 1', count=1) - FAILED - WorkStatus(stage='failed', timestamp=1653498230.852565, reason='user_exception', message='The provided value was 1', count=1) - FAILED - WorkStatus(stage='failed', timestamp=1653498230.852565, reason='user_exception', message='The provided value was 1', count=1) - FAILED - WorkStatus(stage='failed', timestamp=1653498230.852565, reason='user_exception', message='The provided value was 1', count=1) - ... - -In order to access all statuses: - -.. code-block:: python - - from time import sleep - import lightning as L - - - class Work(L.LightningWork): - def run(self, value: int): - sleep(1) - if value == 0: - return - raise Exception(f"The provided value was {value}") - - - class Flow(L.LightningFlow): - def __init__(self): - super().__init__() - self.work = Work(raise_exception=False) - self.counter = 0 - - def run(self): - print(self.statuses) - self.work.run(self.counter) - self.counter += 1 - - - app = L.LightningApp(Flow()) - - -Run this app as follows: - -.. code-block:: bash - - lightning run app test.py > app_log.txt - -And here is the expected output inside ``app_log.txt``: - - -.. code-block:: console - - # First execution with value = 0 - - [] - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1)] - ... - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498626.185683, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498626.185683, reason=None, message=None, count=1)] - ... 
- [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498626.185683, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498626.185683, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498626.185683, reason=None, message=None, count=1), WorkStatus(stage='succeeded', timestamp=1653498627.191053, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498626.185683, reason=None, message=None, count=1), WorkStatus(stage='succeeded', timestamp=1653498627.191053, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498622.252016, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498626.185683, reason=None, message=None, count=1), WorkStatus(stage='succeeded', timestamp=1653498627.191053, reason=None, message=None, count=1)] - - # Second execution with value = 1 - - [WorkStatus(stage='pending', timestamp=1653498627.204636, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498627.205509, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498627.204636, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498627.205509, reason=None, message=None, count=1)] - ... - [WorkStatus(stage='pending', timestamp=1653498627.204636, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498627.205509, reason=None, message=None, count=1)] - [WorkStatus(stage='pending', timestamp=1653498627.204636, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498627.205509, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498627.205509, reason=None, message=None, count=1), WorkStatus(stage='failed', timestamp=1653498628.210164, reason='user_exception', message='The provided value was 1', count=1)] - [WorkStatus(stage='pending', timestamp=1653498627.204636, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498627.205509, reason=None, message=None, count=1), WorkStatus(stage='running', timestamp=1653498627.205509, reason=None, message=None, count=1), WorkStatus(stage='failed', timestamp=1653498628.210164, reason='user_exception', message='The provided value was 1', count=1)] diff --git a/docs/source-app/core_api/overview.rst b/docs/source-app/core_api/overview.rst deleted file mode 100644 index 594433acce2aa..0000000000000 --- a/docs/source-app/core_api/overview.rst +++ /dev/null @@ -1,40 +0,0 @@ -:orphan: - -.. _core_api: - -############################### -Learn more about Lightning Core -############################### - -.. raw:: html - -
-
-
-
-.. displayitem::
-   :header: Level-up with Lightning Apps
-   :description: From Basics to Advanced Skills
-   :col_css: col-md-6
-   :button_link: ../levels/basic/index.html
-   :height: 180
-
-.. displayitem::
-   :header: Understand Lightning App
-   :description: Detailed description
-   :col_css: col-md-6
-   :button_link: lightning_app/index.html
-   :height: 180
-
-.. displayitem::
-   :header: Understand Lightning Flow
-   :description: Detailed description
-   :col_css: col-md-6
-   :button_link: lightning_flow.html
-   :height: 180
-
-.. displayitem::
-   :header: Understand Lightning Work
-   :description: Detailed description
-   :col_css: col-md-6
-   :button_link: lightning_work/index.html
-   :height: 180
diff --git a/docs/source-app/examples/dag/dag.rst b/docs/source-app/examples/dag/dag.rst
deleted file mode 100644
index 0df028ebbb0ce..0000000000000
--- a/docs/source-app/examples/dag/dag.rst
+++ /dev/null
@@ -1,81 +0,0 @@
-:orphan:
-
-######################################
-Develop a Directed Acyclic Graph (DAG)
-######################################
-
-.. _dag_example:
-
-**Audience:** Users coming from MLOps to Lightning Apps, looking for more flexibility.
-
-A typical ML training workflow can be implemented with a simple DAG.
-
-Below is pseudo-code using the Lightning framework, where a LightningFlow orchestrates the serial workflow: process data, train a model, and serve the model.
-
-.. code-block:: python
-
-    import lightning as L
-
-
-    class DAGFlow(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.processor = DataProcessorWork(...)
-            self.train_work = TrainingWork(...)
-            self.serve_work = ServeWork(...)
-
-        def run(self):
-            self.processor.run(...)
-            self.train_work.run(...)
-            self.serve_work.run(...)
-
-Below is pseudo-code to run several works in parallel using a built-in :class:`~lightning.app.structures.Dict`.
-
-.. code-block:: python
-
-    import lightning as L
-
-
-    class DAGFlow(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            ...
-            self.train_works = L.structures.Dict(**{
-                "1": TrainingWork(..., parallel=True),
-                "2": TrainingWork(..., parallel=True),
-                "3": TrainingWork(..., parallel=True),
-                ...
-            })
-            ...
-
-        def run(self):
-            self.processor.run(...)
-
-            # The flow runs through them all, so we need to guard self.serve_work.run
-            for work in self.train_works.values():
-                work.run(...)
-
-            # Wait for all works to have finished without errors.
-            # Note: iterate over ``values()`` (iterating the Dict yields keys),
-            # and ``return`` so the next ``run`` call re-checks the works.
-            if not all(w.has_succeeded for w in self.train_works.values()):
-                return
-
-            self.serve_work.run(...)
-
----
-
-**********
-Next Steps
-**********
-
-.. raw:: html
-
-
- -.. displayitem:: - :header: Scheduled DAG with pandas and sklearn from scratch. - :description: DAG example in pure Lightning. - :col_css: col-md-4 - :button_link: dag_from_scratch.html - :height: 180 - :tag: intermediate diff --git a/docs/source-app/examples/dag/dag_from_scratch.rst b/docs/source-app/examples/dag/dag_from_scratch.rst deleted file mode 100644 index 6625317471938..0000000000000 --- a/docs/source-app/examples/dag/dag_from_scratch.rst +++ /dev/null @@ -1,53 +0,0 @@ -:orphan: - -################################################### -Scheduled DAG with pandas and sklearn from scratch. -################################################### - -**Audience:** Users coming from MLOps to Lightning Apps, looking for more flexibility. - -**Level:** Intermediate. - -In this example, you will learn how to create a simple DAG which: - -* Download and process some data -* Train several models and report their associated metrics - -and learn how to schedule this entire process. - -Find the complete example `here `_. - ----- - -************************** -Step 1: Implement your DAG -************************** - -Here is an illustration of the DAG to implement: - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/simple_dag.png - :alt: Simple DAG - :width: 100 % - -First, let's define the component we need: - -* DataCollector is responsible to download the data -* Processing is responsible to execute a ``processing.py`` script. -* A collection of model work to train all models in parallel. - -.. literalinclude:: ../../../../examples/app/dag/app.py - :lines: 52-74 - -And its run method executes the steps described above. - -.. literalinclude:: ../../../../examples/app/dag/app.py - :lines: 76-99 - ----- - -***************************** -Step 2: Define the scheduling -***************************** - -.. literalinclude:: ../../../../examples/app/dag/app.py - :lines: 102-130 diff --git a/docs/source-app/examples/data_explore_app.rst b/docs/source-app/examples/data_explore_app.rst deleted file mode 100644 index cd7011a10e93c..0000000000000 --- a/docs/source-app/examples/data_explore_app.rst +++ /dev/null @@ -1,5 +0,0 @@ -:orphan: - -########################## -Build a Data Exploring App -########################## diff --git a/docs/source-app/examples/etl_app.rst b/docs/source-app/examples/etl_app.rst deleted file mode 100644 index 5b494e943e445..0000000000000 --- a/docs/source-app/examples/etl_app.rst +++ /dev/null @@ -1,5 +0,0 @@ -:orphan: - -############### -Build a ETL App -############### diff --git a/docs/source-app/examples/file_server/app.py b/docs/source-app/examples/file_server/app.py deleted file mode 100644 index 6c54944fac10d..0000000000000 --- a/docs/source-app/examples/file_server/app.py +++ /dev/null @@ -1,243 +0,0 @@ -import json -import os -import tarfile -import uuid -import zipfile -from pathlib import Path - -from lightning.app import LightningWork, LightningApp -from lightning.app.storage import Drive - - -class FileServer(LightningWork): - def __init__( - self, - drive: Drive, - base_dir: str = "file_server", - chunk_size=10240, - **kwargs - ): - """This component uploads, downloads files to your application. - - Arguments: - drive: The drive can share data inside your application. - base_dir: The local directory where the data will be stored. - chunk_size: The quantity of bytes to download/upload at once. 
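-                (A larger chunk size uses more memory per read but needs fewer reads per file.)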
-
-        """
-        super().__init__(
-            # Note: ``BuildConfig`` must also be imported (e.g. ``from lightning import BuildConfig``).
-            cloud_build_config=BuildConfig(["flask", "flask-cors"]),
-            parallel=True,
-            **kwargs,
-        )
-        # 1: Attach the arguments to the state.
-        self.drive = drive
-        self.base_dir = base_dir
-        self.chunk_size = chunk_size
-
-        # 2: Create a folder to store the data.
-        os.makedirs(self.base_dir, exist_ok=True)
-
-        # 3: Keep a reference to the uploaded filenames.
-        self.uploaded_files = dict()
-
-    def get_filepath(self, path: str) -> str:
-        """Returns the file path stored on the file server."""
-        return os.path.join(self.base_dir, path)
-
-    def get_random_filename(self) -> str:
-        """Returns a random hash for the file name."""
-        return uuid.uuid4().hex
-
-    def upload_file(self, file):
-        """Upload a file while tracking its progress."""
-        # 1: Track metadata about the file
-        filename = file.filename
-        uploaded_file = self.get_random_filename()
-        meta_file = uploaded_file + ".meta"
-        self.uploaded_files[filename] = {
-            "progress": (0, None), "done": False
-        }
-
-        # 2: Create a stream and write the bytes of
-        # the file to the disk under the `uploaded_file` path.
-        with open(self.get_filepath(uploaded_file), "wb") as out_file:
-            content = file.read(self.chunk_size)
-            while content:
-                # 2.1: Write the file bytes
-                size = out_file.write(content)
-
-                # 2.2: Update the progress metadata
-                self.uploaded_files[filename]["progress"] = (
-                    self.uploaded_files[filename]["progress"][0] + size,
-                    None,
-                )
-                # 2.3: Read the next chunk of data
-                content = file.read(self.chunk_size)
-
-        # 3: Update the metadata to mark the file as uploaded.
-        full_size = self.uploaded_files[filename]["progress"][0]
-        self.drive.put(self.get_filepath(uploaded_file))
-        self.uploaded_files[filename] = {
-            "progress": (full_size, full_size),
-            "done": True,
-            "uploaded_file": uploaded_file,
-        }
-
-        # 4: Write down the metadata about the file to the disk
-        meta = {
-            "original_path": filename,
-            "display_name": os.path.splitext(filename)[0],
-            "size": full_size,
-            "drive_path": uploaded_file,
-        }
-        with open(self.get_filepath(meta_file), "w") as f:
-            json.dump(meta, f)
-
-        # 5: Put the file to the drive,
-        # so other components can get or list it.
-        self.drive.put(self.get_filepath(meta_file))
-        return meta
-
-    def list_files(self, file_path: str):
-        # 1: Get the local file path of the file server.
-        file_path = self.get_filepath(file_path)
-
-        # 2: If the file exists in the drive, transfer it locally.
-        if not os.path.exists(file_path):
-            self.drive.get(file_path)
-
-        if os.path.isdir(file_path):
-            result = set()
-            for _, _, f in os.walk(file_path):
-                for file in f:
-                    if not file.endswith(".meta"):
-                        for filename, meta in self.uploaded_files.items():
-                            if meta["uploaded_file"] == file:
-                                result.add(filename)
-            return {"asset_names": [v for v in result]}
-
-        # 3: If the filepath is a tar or zip file, list its contents.
-        if zipfile.is_zipfile(file_path):
-            with zipfile.ZipFile(file_path, "r") as zf:
-                result = zf.namelist()
-        elif tarfile.is_tarfile(file_path):
-            with tarfile.TarFile(file_path, "r") as tf:
-                result = tf.getnames()
-        else:
-            raise ValueError("Cannot open archive file!")
-
-        # 4: Return the matching files.
-        return {"asset_names": result}
-
-    def run(self):
-        # 1: Import the Flask requirements.
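-        # They are imported here, inside ``run``, so only this work's
-        # environment (built by ``cloud_build_config`` above) needs Flask.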
-        from flask import Flask, request
-        from flask_cors import CORS
-
-        # 2: Create a Flask app
-        flask_app = Flask(__name__)
-        CORS(flask_app)
-
-        # 3: Define the upload file endpoint
-        @flask_app.post("/upload_file/")
-        def upload_file():
-            """Upload a file directly as form data."""
-            f = request.files["file"]
-            return self.upload_file(f)
-
-        # 4: Define the list files endpoint
-        @flask_app.get("/")
-        def list_files():
-            return self.list_files(str(Path(self.base_dir).resolve()))
-
-        # 5: Start the Flask app while providing the `host` and `port`.
-        flask_app.run(host=self.host, port=self.port, load_dotenv=False)
-
-    def alive(self):
-        """Hack: Returns whether the server is alive."""
-        return self.url != ""
-
-
-import requests
-
-from lightning import LightningWork
-
-
-class TestFileServer(LightningWork):
-    def __init__(self, drive: Drive):
-        super().__init__(cache_calls=True)
-        self.drive = drive
-
-    def run(self, file_server_url: str, first=True):
-        if first:
-            with open("test.txt", "w") as f:
-                f.write("Some text.")
-
-            # Use a context manager so the file handle is closed after the upload.
-            with open("test.txt", "rb") as f:
-                response = requests.post(
-                    file_server_url + "/upload_file/",
-                    files={"file": f},
-                )
-            assert response.status_code == 200
-        else:
-            response = requests.get(file_server_url)
-            assert response.status_code == 200
-            assert response.json() == {"asset_names": ["test.txt"]}
-
-
-from lightning import LightningApp, LightningFlow
-
-
-class Flow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        # 1: Create a drive to share data between works
-        self.drive = Drive("lit://file_server")
-        # 2: Create the file server
-        self.file_server = FileServer(self.drive)
-        # 3: Create the file server tester
-        self.test_file_server = TestFileServer(self.drive)
-
-    def run(self):
-        # 1: Start the file server.
-        self.file_server.run()
-
-        # 2: Trigger the test file server work when ready.
-        if self.file_server.alive():
-            # 3: Execute the test file server work.
-            self.test_file_server.run(self.file_server.url)
-            self.test_file_server.run(self.file_server.url, first=False)
-
-        # 4: When both executions are successful, exit the app.
-        if self.test_file_server.num_successes == 2:
-            self.stop()
-
-    def configure_layout(self):
-        # Expose the file_server component
-        # in the UI using its `/` endpoint.
-        return {"name": "File Server", "content": self.file_server}
-
-
-from lightning.app.runners import MultiProcessRuntime
-
-
-def test_file_server():
-    app = LightningApp(Flow())
-    MultiProcessRuntime(app).dispatch()
-
-
-from lightning.app.testing import run_app_in_cloud
-
-
-def test_file_server_in_cloud():
-    # You need to provide the directory containing the app file.
-    app_dir = "docs/source-app/examples/file_server"
-    with run_app_in_cloud(app_dir) as (admin_page, view_page, get_logs_fn, name):
-        """# 1. `admin_page` and `view_page` are playwright Page Objects.
-
-        # Check out https://playwright.dev/python/ doc to learn more.
-        # You can click the UI and trigger actions.
-
-        # 2. By calling logs = get_logs_fn(),
-        # you get all the logs currently on the admin page.
-
-        """
diff --git a/docs/source-app/examples/file_server/file_server.rst b/docs/source-app/examples/file_server/file_server.rst
deleted file mode 100644
index f9f800a085bb8..0000000000000
--- a/docs/source-app/examples/file_server/file_server.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-:orphan:
-
-.. _fileserver_example:
-
-#####################
-Develop a File Server
-#####################
-
-**Prerequisite**: Reach :ref:`level 16+ ` and read the :ref:`Drive article `.
-
----
-
-.. include:: file_server_content.rst
diff --git a/docs/source-app/examples/file_server/file_server_content.rst b/docs/source-app/examples/file_server/file_server_content.rst
deleted file mode 100644
index e9e9017232927..0000000000000
--- a/docs/source-app/examples/file_server/file_server_content.rst
+++ /dev/null
@@ -1,85 +0,0 @@
-
-
-********
-Our Goal
-********
-
-Create a simple Lightning App (App) that allows users to upload files and list the uploaded files.
-
----
-
-*************
-Completed App
-*************
-
-Here is a recording of the final App built in this example, tested with pytest.
-
-.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/file_server.mp4
-   :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/file_server.png
-   :width: 600
-   :class: background-video
-   :autoplay:
-   :loop:
-   :muted:
-
----
-
-**********
-App Design
-**********
-
-In order to create this App, we need to develop two components and an App:
-
-* A **File Server Component** that gives you the ability to download or list files shared with your App. This is particularly useful when you want to trigger an ML job but your users need to provide their own data, or when the user wants to download the trained checkpoints.
-
-* A **Test File Server** Component to interact with the file server.
-
-* An App putting everything together, and the App's associated pytest tests.
-
----
-
-********
-Tutorial
-********
-
-.. raw:: html
-
-
- -.. displayitem:: - :header: Step 1: Implement the File Server general structure - :description: Put together the shape of the Component - :col_css: col-md-4 - :button_link: file_server_step_1.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Step 2: Implement the File Server upload and list files methods - :description: Add the core functionalities to the Component - :col_css: col-md-4 - :button_link: file_server_step_2.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Step 3: Implement a File Server Testing Component - :description: Create a Component to test the file server - :col_css: col-md-4 - :button_link: file_server_step_3.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Step 4: Implement tests for the File Server component with pytest - :description: Create an App to validate the upload and list files endpoints - :col_css: col-md-4 - :button_link: file_server_step_4.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/file_server/file_server_step_1.rst b/docs/source-app/examples/file_server/file_server_step_1.rst deleted file mode 100644 index 8703a1d443ef2..0000000000000 --- a/docs/source-app/examples/file_server/file_server_step_1.rst +++ /dev/null @@ -1,49 +0,0 @@ -:orphan: - -################################################## -Step 1: Implement the FileServer general structure -################################################## - -Let’s dive in on how to develop the component with the following code: - -.. literalinclude:: ./app.py - :lines: 1-41, 132-158 - :emphasize-lines: 16, 51- - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Step 2: Implement the File Server upload and list files methods - :description: Add the core functionalities to the Component - :col_css: col-md-4 - :button_link: file_server_step_2.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Step 3: Implement a File Server Testing Component - :description: Create a Component to test the file server - :col_css: col-md-4 - :button_link: file_server_step_3.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Step 4: Implement tests for the File Server component with pytest - :description: Create an App to validate the upload and list files endpoints - :col_css: col-md-4 - :button_link: file_server_step_4.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/file_server/file_server_step_2.rst b/docs/source-app/examples/file_server/file_server_step_2.rst deleted file mode 100644 index 86471604a124f..0000000000000 --- a/docs/source-app/examples/file_server/file_server_step_2.rst +++ /dev/null @@ -1,75 +0,0 @@ -:orphan: - -################################################################ -Step 2: Implement the File Server upload and list_files methods -################################################################ - -Let's dive in on how to implement these methods. - -*************************** -Implement the upload method -*************************** - -In this method, we are creating a stream between the uploaded file and the uploaded file stored on the file server disk. - -Once the file is uploaded, we are putting the file into the :class:`~lightning.app.storage.drive.Drive`, so it becomes persistent and accessible to all Components. - -.. literalinclude:: ./app.py - :lines: 12, 51-99 - :emphasize-lines: 49 - -******************************* -Implement the fist_files method -******************************* - -First, in this method, we get the file in the file server filesystem, if available in the Drive. Once done, we list the the files under the provided paths and return the results. - -.. literalinclude:: ./app.py - :lines: 12, 100-130 - :emphasize-lines: 9 - - -******************* -Implement utilities -******************* - -.. literalinclude:: ./app.py - :lines: 12, 43-49 - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Step 1: Implement the File Server general structure - :description: Put together the shape of the Component - :col_css: col-md-4 - :button_link: file_server_step_1.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Step 3: Implement a File Server Testing Component - :description: Create a Component to test the file server - :col_css: col-md-4 - :button_link: file_server_step_3.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Step 4: Implement tests for the File Server component with pytest - :description: Create an App to validate the upload and list files endpoints - :col_css: col-md-4 - :button_link: file_server_step_4.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/file_server/file_server_step_3.rst b/docs/source-app/examples/file_server/file_server_step_3.rst deleted file mode 100644 index 4703ef0750f1e..0000000000000 --- a/docs/source-app/examples/file_server/file_server_step_3.rst +++ /dev/null @@ -1,54 +0,0 @@ -:orphan: - -################################################# -Step 3: Implement a File Server Testing Component -################################################# - -Let's dive in on how to implement a testing component for a server. - -This component needs to test two things: - -* The **/upload_file/** endpoint by creating a file and sending its content to it. - -* The **/** endpoint listing files, by validating the that previously uploaded file is present in the response. - -.. literalinclude:: ./app.py - :lines: 165-182 - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Step 1: Implement the File Server general structure - :description: Put together the shape of the Component - :col_css: col-md-4 - :button_link: file_server_step_1.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Step 2: Implement the File Server upload and list files methods - :description: Add the core functionalities to the Component - :col_css: col-md-4 - :button_link: file_server_step_2.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Step 4: Implement tests for the File Server component with pytest - :description: Create an App to validate the upload and list files endpoints - :col_css: col-md-4 - :button_link: file_server_step_4.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/file_server/file_server_step_4.rst b/docs/source-app/examples/file_server/file_server_step_4.rst deleted file mode 100644 index 04517dfa7baf3..0000000000000 --- a/docs/source-app/examples/file_server/file_server_step_4.rst +++ /dev/null @@ -1,127 +0,0 @@ -:orphan: - -################################################################# -Step 4: Implement tests for the File Server component with pytest -################################################################# - -Let's create a simple App with our **File Server** and **File Server Test** components. - -Once the File Server is up and running, we'll execute the **test_file_server** LightningWork and when both calls are successful, we exit the App using ``self._exit``. - -.. literalinclude:: ./app.py - :lines: 187-218 - - -Simply create a ``test.py`` file with the following code and run ``pytest tests.py``: - -.. literalinclude:: ./app.py - :lines: 221-226 - -To test the App in the cloud, create a ``cloud_test.py`` file with the following code and run ``pytest cloud_test.py``. -Under the hood, we are using the end-to-end testing `playwright `_ library, so you can interact with the UI. - -.. literalinclude:: ./app.py - :lines: 229- - ----- - -******************** -Test the application -******************** - -Clone the Lightning repo and run the following command: - -.. code-block:: bash - - pytest docs/source/examples/file_server/app.py --capture=no -v - ----- - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Step 1: Implement the File Server general structure - :description: Put together the shape of the Component - :col_css: col-md-4 - :button_link: file_server_step_1.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Step 2: Implement the File Server upload and list files methods - :description: Add the core functionalities to the Component - :col_css: col-md-4 - :button_link: file_server_step_2.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: Step 3: Implement a File Server Testing Component - :description: Create a Component to test the file server - :col_css: col-md-4 - :button_link: file_server_step_3.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
- ----- - -****************** -Find more examples -****************** - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Develop a DAG - :description: Create a dag pipeline - :col_css: col-md-4 - :button_link: ../dag/dag.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: Develop a Github Repo Script Runner - :description: Run any script on github in the cloud - :col_css: col-md-4 - :button_link: ../github_repo_runner/github_repo_runner.html - :height: 150 - :tag: Intermediate - - -.. displayitem:: - :header: Develop a HPO Sweeper - :description: Train multiple models with different parameters - :col_css: col-md-4 - :button_link: ../hpo/hpo.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: Develop a Model Server - :description: Serve multiple models with different parameters - :col_css: col-md-4 - :button_link: ../model_server_app/model_server_app.html - :height: 150 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/github_repo_runner/app.py b/docs/source-app/examples/github_repo_runner/app.py deleted file mode 100644 index f57cfcd12dc24..0000000000000 --- a/docs/source-app/examples/github_repo_runner/app.py +++ /dev/null @@ -1,309 +0,0 @@ -import io -import os -import subprocess -import sys -from copy import deepcopy -from functools import partial -from subprocess import Popen -from typing import Dict, List, Optional - -from lightning import BuildConfig, CloudCompute, LightningApp, LightningFlow -from lightning.app import structures -from lightning.app.components import TracerPythonScript -from lightning.app.frontend import StreamlitFrontend -from lightning.app.storage.path import Path -from lightning.app.utilities.state import AppState - - -class GithubRepoRunner(TracerPythonScript): - def __init__( - self, - id: str, - github_repo: str, - script_path: str, - script_args: List[str], - requirements: List[str], - cloud_compute: Optional[CloudCompute] = None, - **kwargs, - ): - """The GithubRepoRunner Component clones a repo, runs a specific script with provided arguments and collect - logs. - - Arguments: - id: Identified of the component. - github_repo: The Github Repo URL to clone. - script_path: The path to the script to execute. - script_args: The arguments to be provided to the script. - requirements: The python requirements tp run the script. - cloud_compute: The object to select the cloud instance. - - """ - super().__init__( - script_path=script_path, - script_args=script_args, - cloud_compute=cloud_compute, - cloud_build_config=BuildConfig(requirements=requirements), - **kwargs, - ) - self.id = id - self.github_repo = github_repo - self.logs = [] - - def run(self, *args, **kwargs): - # 1. Hack: Patch stdout so we can capture the logs. - string_io = io.StringIO() - sys.stdout = string_io - - # 2: Use git command line to clone the repo. - repo_name = self.github_repo.split("/")[-1].replace(".git", "") - cwd = os.path.dirname(__file__) - subprocess.Popen( - f"git clone {self.github_repo}", cwd=cwd, shell=True).wait() - - # 3: Execute the parent run method of the TracerPythonScript class. - os.chdir(os.path.join(cwd, repo_name)) - super().run(*args, **kwargs) - - # 4: Get all the collected logs and add them to the state. - # This isn't optimal as heavy, but works for this demo purpose. - self.logs = string_io.getvalue() - string_io.close() - - def configure_layout(self): - return {"name": self.id, "content": self} - - -class PyTorchLightningGithubRepoRunner(GithubRepoRunner): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.best_model_path = None - self.best_model_score = None - - def configure_tracer(self): - from lightning.pytorch import Trainer - from lightning.pytorch.callbacks import Callback - - tracer = super().configure_tracer() - - class TensorboardServerLauncher(Callback): - def __init__(self, work): - # The provided `work` is the - # current ``PyTorchLightningScript`` work. - self.w = work - - def on_train_start(self, trainer, *_): - # Add `host` and `port` for tensorboard to work in the cloud. - cmd = f"tensorboard --logdir='{trainer.logger.log_dir}'" - server_args = f"--host {self.w.host} --port {self.w.port}" - Popen(cmd + " " + server_args, shell=True) - - def trainer_pre_fn(self, *args, work=None, **kwargs): - # Intercept Trainer __init__ call - # and inject a ``TensorboardServerLauncher`` component. - kwargs["callbacks"].append(TensorboardServerLauncher(work)) - return {}, args, kwargs - - # 5. 
Patch the ``__init__`` method of the Trainer
-        # to inject our callback with a reference to the work.
-        tracer.add_traced(
-            Trainer, "__init__", pre_fn=partial(trainer_pre_fn, work=self))
-        return tracer
-
-    def on_after_run(self, end_script_globals):
-        import torch
-
-        # 1. Once the script has finished executing,
-        # we can collect its globals and access any objects.
-        trainer = end_script_globals["cli"].trainer
-        checkpoint_callback = trainer.checkpoint_callback
-        lightning_module = trainer.lightning_module
-
-        # 2. From the checkpoint_callback,
-        # we access the best model weights.
-        checkpoint = torch.load(checkpoint_callback.best_model_path)
-
-        # 3. Load the best weights and torchscript the model.
-        lightning_module.load_state_dict(checkpoint["state_dict"])
-        lightning_module.to_torchscript(f"{self.name}.pt")
-
-        # 4. Use ``lightning.app.storage.Path`` to create a reference to the
-        # torch-scripted model. In the cloud with multiple machines,
-        # simply passing this reference to another work
-        # automatically triggers a file transfer.
-        self.best_model_path = Path(f"{self.name}.pt")
-
-        # 5. Keep track of the metrics.
-        self.best_model_score = float(checkpoint_callback.best_model_score)
-
-
-class KerasGithubRepoRunner(GithubRepoRunner):
-    """Left to the users to implement."""
-
-
-class TensorflowGithubRepoRunner(GithubRepoRunner):
-    """Left to the users to implement."""
-
-
-GITHUB_REPO_RUNNERS = {
-    "PyTorch Lightning": PyTorchLightningGithubRepoRunner,
-    "Keras": KerasGithubRepoRunner,
-    "Tensorflow": TensorflowGithubRepoRunner,
-}
-
-
-class Flow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        # 1: Keep track of the requests within the state
-        self.requests = []
-        # 2: Create a dictionary of components.
-        self.ws = structures.Dict()
-
-    def run(self):
-        # Iterate continuously over all requests
-        for request_id, request in enumerate(self.requests):
-            self._handle_request(request_id, deepcopy(request))
-
-    def _handle_request(self, request_id: int, request: Dict):
-        # 1: Create a name and find the selected framework
-        name = f"w_{request_id}"
-        ml_framework = request["train"].pop("ml_framework")
-
-        # 2: If the component hasn't been created yet, create it.
-        if name not in self.ws:
-            work_cls = GITHUB_REPO_RUNNERS[ml_framework]
-            work = work_cls(id=request["id"], **request["train"])
-            self.ws[name] = work
-
-        # 3: Run the component
-        self.ws[name].run()
-
-        # 4: Once the component has finished,
-        # add metadata to the original request for the UI.
-        if self.ws[name].best_model_path:
-            request = self.requests[request_id]
-            request["best_model_score"] = self.ws[name].best_model_score
-            request["best_model_path"] = self.ws[name].best_model_path
-
-    def configure_layout(self):
-        # Create a StreamLit UI for the user to run their Github Repo.
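-        # ``render_fn`` receives the shared app state, so the UI below can
-        # both read the requests and append new ones.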
-        return StreamlitFrontend(render_fn=render_fn)
-
-
-def page_1__create_new_run(state):
-    import streamlit as st
-
-    st.markdown("# Create a new Run 🎈")
-
-    # 1: Collect arguments from the users
-    id = st.text_input("Name your run", value="my_first_run")
-    github_repo = st.text_input(
-        "Enter a Github Repo URL", value="https://github.com/Lightning-AI/lightning-quick-start.git"
-    )
-
-    default_script_args = (
-        "--trainer.max_epochs=5"
-        " --trainer.limit_train_batches=4"
-        " --trainer.limit_val_batches=4"
-        " --trainer.callbacks=ModelCheckpoint"
-        " --trainer.callbacks.monitor=val_acc"
-    )
-    # Space-separated, to match the ``split(" ")`` below.
-    default_requirements = "torchvision pytorch_lightning jsonargparse[signatures]"
-
-    script_path = st.text_input("Enter your script to run", value="train_script.py")
-    script_args = st.text_input("Enter your base script arguments", value=default_script_args)
-    requirements = st.text_input("Enter your requirements", value=default_requirements)
-    ml_framework = st.radio(
-        "Select your ML Training Framework", options=["PyTorch Lightning", "Keras", "Tensorflow"]
-    )
-
-    # Note: this needs to be a tuple; ``not in ("PyTorch Lightning")`` would be a substring check.
-    if ml_framework not in ("PyTorch Lightning",):
-        st.write(f"{ml_framework} isn't supported yet.")
-        return
-
-    clicked = st.button("Submit")
-
-    # 2: If clicked, create a new request.
-    if clicked:
-        new_request = {
-            "id": id,
-            "train": {
-                "github_repo": github_repo,
-                "script_path": script_path,
-                "script_args": script_args.split(" "),
-                "requirements": requirements.split(" "),
-                "ml_framework": ml_framework,
-            },
-        }
-        # 3: IMPORTANT: Add a new request to the state in-place.
-        # The flow receives the UI request and dynamically creates
-        # and runs the associated work from the request information.
-        state.requests = state.requests + [new_request]
-
-
-def page_2__view_run_lists(state):
-    import streamlit as st
-
-    st.markdown("# Run Lists 🎈")
-    # 1: Iterate through all the requests in the state.
-    for i, r in enumerate(state.requests):
-        i = str(i)
-        # 2: Display information such as request, logs, work state, model score.
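-        # Works created dynamically inside ``structures.Dict`` are not exposed
-        # as attributes on ``state``, so we read them from the raw state dict.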
-        work = state._state["structures"]["ws"]["works"][f"w_{i}"]
-        with st.expander(f"Expand to view Run {i}", expanded=False):
-            # Give each checkbox a distinct key; duplicated keys raise an error in Streamlit.
-            if st.checkbox("Expand to view your configuration", key=f"config_{i}"):
-                st.json(r)
-            if st.checkbox("Expand to view logs", key=f"logs_{i}"):
-                st.code(body=work["vars"]["logs"])
-            if st.checkbox("Expand to view your work state", key=f"state_{i}"):
-                work["vars"].pop("logs")
-                st.json(work)
-            best_model_score = r.get("best_model_score", None)
-            if best_model_score:
-                if st.checkbox("Expand to view your run performance", key=f"perf_{i}"):
-                    st.json({"best_model_score": best_model_score, "best_model_path": r.get("best_model_path")})
-
-
-def page_3__view_app_state(state):
-    import streamlit as st
-
-    st.markdown("# App State 🎈")
-    st.write(state._state)
-
-
-def render_fn(state: AppState):
-    import streamlit as st
-
-    page_names_to_funcs = {
-        "Create a new Run": partial(page_1__create_new_run, state=state),
-        "View your Runs": partial(page_2__view_run_lists, state=state),
-        "View the App state": partial(page_3__view_app_state, state=state),
-    }
-    selected_page = st.sidebar.selectbox(
-        "Select a page", page_names_to_funcs.keys())
-    page_names_to_funcs[selected_page]()
-
-
-class RootFlow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        # Create the flow
-        self.flow = Flow()
-
-    def run(self):
-        # Run the flow
-        self.flow.run()
-
-    def configure_layout(self):
-        # 1: Add the main StreamLit UI
-        selection_tab = [{
-            "name": "Run your Github Repo",
-            "content": self.flow,
-        }]
-        # 2: Add a new tab whenever a new work is dynamically created
-        run_tabs = [e.configure_layout() for e in self.flow.ws.values()]
-        # 3: Return the list of tabs.
-        return selection_tab + run_tabs
-
-
-app = LightningApp(RootFlow())
diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner.rst
deleted file mode 100644
index 7e239b2dfd33d..0000000000000
--- a/docs/source-app/examples/github_repo_runner/github_repo_runner.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-:orphan:
-
-.. _github_repo_script_runner_example:
-
-###################################
-Develop a Github Repo Script Runner
-###################################
-
-**Audience:** Users who want to develop interactive applications that run Github Repos in the cloud at any scale for multiple users.
-
-**Prerequisite**: Reach :ref:`level 16+ ` and read the docstring of the :class:`~lightning.app.components.python.tracer.TracerPythonScript` component.
-
----
-
-.. include:: github_repo_runner_content.rst
diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_content.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_content.rst
deleted file mode 100644
index 2a0a3aa35c5ac..0000000000000
--- a/docs/source-app/examples/github_repo_runner/github_repo_runner_content.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-
-********
-Our Goal
-********
-
-Create a simple Lightning App (App) where users can enter information in a UI to run a given PyTorch Lightning script from a given Github Repo with some optional extra Python requirements and arguments.
-
-Users should be able to monitor their training progress in real-time, view the logs, and get the best monitored metric and associated checkpoint for their models.
-
----
-
-Completed App
-^^^^^^^^^^^^^
-
-Here is a recording of the final application built in this example. The example is around 200 lines in total and should give you a great foundation to build your own Lightning App.
-
-..
-   :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/github_app.png
-   :width: 600
-   :class: background-video
-   :autoplay:
-   :loop:
-   :muted:
-
-----
-
-**********
-App Design
-**********
-
-In order to develop the App, we need to build several components:
-
-* A GithubRepoRunner Component that clones a repo, runs a specific script with the provided arguments, and collects logs.
-
-* A PyTorch Lightning GithubRepoRunner Component that augments the GithubRepoRunner component to track the PyTorch Lightning Trainer.
-
-* A UI for users to provide the information needed to dynamically trigger a new run.
-
-* A Flow to dynamically create a GithubRepoRunner once a user submits information from the UI.
-
-----
-
-********
-Tutorial
-********
-
-.. raw:: html
-
-
- -.. displayitem:: - :header: Step 1: Implement the GithubRepoRunner Component - :description: Clone and execute script from a GitHub Repo. - :col_css: col-md-4 - :button_link: github_repo_runner_step_1.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Step 2: Implement the PyTorch Lightning GithubRepoRunner Component - :description: Automate PyTorch Lightning execution - :col_css: col-md-4 - :button_link: github_repo_runner_step_2.html - :height: 180 - :tag: Advanced - -.. displayitem:: - :header: Step 3: Implement the Flow to manage user requests - :description: Dynamically create GithubRepoRunner - :col_css: col-md-4 - :button_link: github_repo_runner_step_3.html - :height: 180 - :tag: Intermediate - - -.. displayitem:: - :header: Step 4: Implement the UI with StreamLit - :description: Several pages application - :col_css: col-md-4 - :button_link: github_repo_runner_step_4.html - :height: 180 - :tag: Intermediate - - -.. displayitem:: - :header: Step 5: Put it all together - :description: - :col_css: col-md-4 - :button_link: github_repo_runner_step_5.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_1.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_1.rst deleted file mode 100644 index e85ecc9da6b95..0000000000000 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_1.rst +++ /dev/null @@ -1,62 +0,0 @@ -:orphan: - -************************************************ -Step 1: Implement the GithubRepoRunner Component -************************************************ - -The GithubRepoRunner Component clones a repo, runs a specific script with provided arguments, and collect logs. - -Let's dive in on how to develop the component with the following code: - -.. literalinclude:: ./app.py - :lines: -72 - ----- - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Step 2: Implement the PyTorch Lightning GithubRepoRunner Component - :description: Automate PyTorch Lightning execution - :col_css: col-md-4 - :button_link: github_repo_runner_step_2.html - :height: 180 - :tag: Advanced - -.. displayitem:: - :header: Step 3: Implement the Flow to manage user requests - :description: Dynamically create GithubRepoRunner - :col_css: col-md-4 - :button_link: github_repo_runner_step_3.html - :height: 180 - :tag: Intermediate - - -.. displayitem:: - :header: Step 4: Implement the UI with StreamLit - :description: Several pages application - :col_css: col-md-4 - :button_link: github_repo_runner_step_4.html - :height: 180 - :tag: Intermediate - - -.. displayitem:: - :header: Step 5: Put it all together - :description: - :col_css: col-md-4 - :button_link: github_repo_runner_step_5.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_2.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_2.rst deleted file mode 100644 index deae8844c8fc2..0000000000000 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_2.rst +++ /dev/null @@ -1,68 +0,0 @@ -:orphan: - -****************************************************************** -Step 2: Implement the PyTorch Lightning GithubRepoRunner Component -****************************************************************** - -The PyTorch Lightning GithubRepoRunner Component subclasses the GithubRepoRunner but tailors the execution experience to PyTorch Lightning. - -As a matter of fact, this component adds two primary tailored features for PyTorch Lightning users: - -* It injects dynamically a custom callback ``TensorboardServerLauncher`` in the PyTorch Lightning Trainer to start a tensorboard server so it can be exposed in Lightning App UI. - -* Once the script has run, the ``on_after_run`` hook of the :class:`~lightning.app.components.python.tracer.TracerPythonScript` is invoked with the script globals, meaning we can collect anything we need. In particular, we are reloading the best model, torch scripting it, and storing its path in the state along side the best metric score. - -Let's dive in on how to develop the component with the following code: - -.. literalinclude:: ./app.py - :lines: 75-136 - ----- - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Step 1: Implement the GithubRepoRunner Component - :description: Clone and execute script from a GitHub Repo. - :col_css: col-md-4 - :button_link: github_repo_runner_step_1.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Step 3: Implement the Flow to manage user requests - :description: Dynamically create GithubRepoRunner - :col_css: col-md-4 - :button_link: github_repo_runner_step_3.html - :height: 180 - :tag: Intermediate - - -.. displayitem:: - :header: Step 4: Implement the UI with StreamLit - :description: Several pages application - :col_css: col-md-4 - :button_link: github_repo_runner_step_4.html - :height: 180 - :tag: Intermediate - - -.. displayitem:: - :header: Step 5: Put it all together - :description: - :col_css: col-md-4 - :button_link: github_repo_runner_step_5.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_3.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_3.rst deleted file mode 100644 index 44cf7dd5a6523..0000000000000 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_3.rst +++ /dev/null @@ -1,62 +0,0 @@ -:orphan: - -************************************************** -Step 3: Implement the Flow to manage user requests -************************************************** - -In step 1 and 2, we have implemented the ``GithubRepoRunner`` and ``PyTorchLightningGithubRepoRunner`` components. - -Now, we are going to develop a component to dynamically handle user requests. -Let's dive in on how to develop the component with the following code: - -.. literalinclude:: ./app.py - :lines: 142-190 - ----- - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Step 1: Implement the GithubRepoRunner Component - :description: Clone and execute script from a GitHub Repo. - :col_css: col-md-4 - :button_link: github_repo_runner_step_1.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Step 2: Implement the PyTorch Lightning GithubRepoRunner Component - :description: Automate PyTorch Lightning execution - :col_css: col-md-4 - :button_link: github_repo_runner_step_2.html - :height: 180 - :tag: Advanced - -.. displayitem:: - :header: Step 4: Implement the UI with StreamLit - :description: Several pages application - :col_css: col-md-4 - :button_link: github_repo_runner_step_4.html - :height: 180 - :tag: Intermediate - - -.. displayitem:: - :header: Step 5: Put it all together - :description: - :col_css: col-md-4 - :button_link: github_repo_runner_step_5.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_4.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_4.rst deleted file mode 100644 index 16893aafee183..0000000000000 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_4.rst +++ /dev/null @@ -1,86 +0,0 @@ -:orphan: - -*************************************** -Step 4: Implement the UI with StreamLit -*************************************** - -In step 3, we have implemented a Flow which dynamically creates a Work when a new request is added to the requests list. - -From the UI, we create 3 pages with `StreamLit `_: - -* **Page 1**: Create a form with add a new request to the Flow state **requests**. - -* **Page 2**: Iterate through all the requests and display the associated information. - -* **Page 3**: Display the entire App State. - - -Render All Pages -^^^^^^^^^^^^^^^^ - -.. literalinclude:: ./app.py - :lines: 274-284 - -**Page 1** - -.. literalinclude:: ./app.py - :lines: 193-241 - :emphasize-lines: 43 - -**Page 2** - -.. literalinclude:: ./app.py - :lines: 244-264 - -**Page 3** - -.. literalinclude:: ./app.py - :lines: 267-271 - ----- - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: 1. Implement the GithubRepoRunner Component - :description: Clone and execute script from a GitHub Repo. - :col_css: col-md-4 - :button_link: github_repo_runner_step_1.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: 2. Implement the PyTorch Lightning GithubRepoRunner Component - :description: Automate PyTorch Lightning execution - :col_css: col-md-4 - :button_link: github_repo_runner_step_2.html - :height: 180 - :tag: Advanced - -.. displayitem:: - :header: 3. Implement the Flow to manage user requests - :description: Dynamically create GithubRepoRunner - :col_css: col-md-4 - :button_link: github_repo_runner_step_3.html - :height: 180 - :tag: Intermediate - -.. displayitem:: - :header: Step 5: Put it all together - :description: - :col_css: col-md-4 - :button_link: github_repo_runner_step_5.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_5.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_5.rst deleted file mode 100644 index 9b5b469d5b793..0000000000000 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_5.rst +++ /dev/null @@ -1,75 +0,0 @@ -:orphan: - -*************************** -Step 5: Put it all together -*************************** - -Let's dive in on how to develop the component with the following code: - -.. literalinclude:: ./app.py - :lines: 287- - -Run the application -^^^^^^^^^^^^^^^^^^^ - -Clone the Lightning repo and run the following command: - -.. code-block:: bash - - lightning run app docs/source/examples/github_repo_runner/app.py - -Add ``--cloud`` to run this application in the cloud. - -.. code-block:: bash - - lightning run app docs/source/examples/github_repo_runner/app.py --cloud - ----- - -********************** -More hands-on examples -********************** - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Develop a DAG - :description: Create a dag pipeline - :col_css: col-md-4 - :button_link: ../dag/dag.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: Develop a File Server - :description: Train multiple models with different parameters - :col_css: col-md-4 - :button_link: ../file_server/file_server.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: Develop a HPO Sweeper - :description: Train multiple models with different parameters - :col_css: col-md-4 - :button_link: ../hpo/hpo.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: Develop a Model Server - :description: Serve multiple models with different parameters - :col_css: col-md-4 - :button_link: ../model_server/model_server.html - :height: 150 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/hands_on_example.rst b/docs/source-app/examples/hands_on_example.rst deleted file mode 100644 index 57fa1e5ff114a..0000000000000 --- a/docs/source-app/examples/hands_on_example.rst +++ /dev/null @@ -1,50 +0,0 @@ -:orphan: - -################# -Hands-on Examples -################# - -.. raw:: html - -
-
- -.. displayitem:: - :header: Build a DAG - :description: Learn how to orchestrate workflows - :col_css: col-md-6 - :button_link: dag/dag.html - :height: 180 - -.. displayitem:: - :header: Build a File Server - :description: Learn how to upload and download files - :col_css: col-md-6 - :button_link: file_server/file_server.html - :height: 180 - -.. displayitem:: - :header: Build a Github Repo Script Runner - :description: Learn how to configure dynamic execution from the UI - :col_css: col-md-6 - :button_link: github_repo_runner/github_repo_runner.html - :height: 180 - -.. displayitem:: - :header: Build a HPO Sweeper - :description: Learn how to scale your training - :col_css: col-md-6 - :button_link: hpo/hpo.html - :height: 180 - -.. displayitem:: - :header: Build a Model Server - :description: Learn how to server your models - :col_css: col-md-6 - :button_link: model_server_app_content.html - :height: 180 - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/index.rst b/docs/source-app/examples/index.rst deleted file mode 100644 index bb7e645da446e..0000000000000 --- a/docs/source-app/examples/index.rst +++ /dev/null @@ -1,36 +0,0 @@ -######## -Examples -######## - -.. raw:: html - -
-
- -.. displayitem:: - :header: Develop a DAG workflow - :description: Develop sequential, non-reactive workflows - :col_css: col-md-4 - :button_link: dag/dag.html - :height: 150 - -.. displayitem:: - :header: Develop a File Server - :description: Develop a file server - :col_css: col-md-4 - :button_link: file_server/file_server.html - :height: 150 - -.. displayitem:: - :header: Develop a Github Repo Script Runner - :description: Build an app to run a Github repo - :col_css: col-md-4 - :button_link: github_repo_runner/github_repo_runner.html - :height: 150 - -.. displayitem:: - :header: Deploy a model - :description: Learn how to deploy a model - :col_css: col-md-4 - :button_link: model_server_app/model_server_app.html - :height: 150 diff --git a/docs/source-app/examples/model_server_app/app.py b/docs/source-app/examples/model_server_app/app.py deleted file mode 100644 index 9985014b11912..0000000000000 --- a/docs/source-app/examples/model_server_app/app.py +++ /dev/null @@ -1,34 +0,0 @@ -from locust_component import Locust -from model_server import MLServer -from train import TrainModel - -from lightning import LightningApp, LightningFlow - - -class TrainAndServe(LightningFlow): - def __init__(self): - super().__init__() - self.train_model = TrainModel() - self.model_server = MLServer( - name="mnist-svm", - implementation="mlserver_sklearn.SKLearnModel", - workers=8, - ) - self.performance_tester = Locust(num_users=100) - - def run(self): - self.train_model.run() - self.model_server.run(self.train_model.best_model_path) - if self.model_server.alive(): - # The performance tester needs the model server to be up - # and running to be started, so the URL is added in the UI. - self.performance_tester.run(self.model_server.url) - - def configure_layout(self): - return [ - {"name": "Server", "content": self.model_server.url + "/docs"}, - {"name": "Server Testing", "content": self.performance_tester}, - ] - - -app = LightningApp(TrainAndServe()) diff --git a/docs/source-app/examples/model_server_app/load_testing.rst b/docs/source-app/examples/model_server_app/load_testing.rst deleted file mode 100644 index 9ea347bd0dd44..0000000000000 --- a/docs/source-app/examples/model_server_app/load_testing.rst +++ /dev/null @@ -1,57 +0,0 @@ -:orphan: - -*********************************** -3. Build the Load Testing Component -*********************************** - -Now, we are going to create a component to test the performance of your model server. - -We are going to use a python performance testing tool called `Locust `_. - -.. literalinclude:: ./locust_component.py - - -Finally, once the component is done, we need to create a ``locustfile.py`` file which defines the format of the request to send to your model server. - -The endpoint to hit has the following format: ``/v2/models/{MODEL_NAME}/versions/{VERSION}/infer``. - -.. literalinclude:: ./locustfile.py - - ----- - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1. Build a Train Component - :description: Train a model and store its checkpoints with SKlearn - :col_css: col-md-4 - :button_link: train.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 2. Build a Model Server Component - :description: Use MLServer to server your models - :col_css: col-md-4 - :button_link: model_server.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 4. Putting everything together. - :description: Ensemble the components together and run the app - :col_css: col-md-4 - :button_link: putting_everything_together.html - :height: 150 - :tag: basic - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/model_server_app/locust_component.py b/docs/source-app/examples/model_server_app/locust_component.py deleted file mode 100644 index 432336adf83b3..0000000000000 --- a/docs/source-app/examples/model_server_app/locust_component.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import subprocess - -from lightning import BuildConfig, LightningWork - - -class Locust(LightningWork): - def __init__(self, num_users: int = 100): - """This component checks the performance of a server. The server url is passed to its run method. - - Arguments: - num_users: Number of users emulated by Locust - - """ - # Note: Using the default port 8089 of Locust. - super().__init__( - port=8089, - parallel=True, - cloud_build_config=BuildConfig(requirements=["locust"]), - ) - self.num_users = num_users - - def run(self, load_tested_url: str): - # 1: Create the locust command line. - cmd = " ".join( - [ - "locust", - "--master-host", - str(self.host), - "--master-port", - str(self.port), - "--host", - str(load_tested_url), - "-u", - str(self.num_users), - ] - ) - # 2: Create another process with locust - process = subprocess.Popen(cmd, cwd=os.path.dirname(__file__), shell=True) - - # 3: Wait for the process to finish. As locust is a server, - # this waits infinitely or if killed. - process.wait() diff --git a/docs/source-app/examples/model_server_app/locustfile.py b/docs/source-app/examples/model_server_app/locustfile.py deleted file mode 100644 index 198d6de6cb553..0000000000000 --- a/docs/source-app/examples/model_server_app/locustfile.py +++ /dev/null @@ -1,41 +0,0 @@ -from locust import FastHttpUser, task -from sklearn import datasets -from sklearn.model_selection import train_test_split - - -class HelloWorldUser(FastHttpUser): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._prepare_inference_request() - - @task - def predict(self): - self.client.post( - "/v2/models/mnist-svm/versions/v0.0.1/infer", - json=self.inference_request, - ) - - def _prepare_inference_request(self): - # The digits dataset - digits = datasets.load_digits() - - # To apply a classifier on this data, - # we need to flatten the image, to - # turn the data in a (samples, feature) matrix: - n_samples = len(digits.images) - data = digits.images.reshape((n_samples, -1)) - - # Split data into train and test subsets - _, X_test, _, _ = train_test_split(data, digits.target, test_size=0.5, shuffle=False) - - x_0 = X_test[0:1] - self.inference_request = { - "inputs": [ - { - "name": "predict", - "shape": x_0.shape, - "datatype": "FP32", - "data": x_0.tolist(), - } - ] - } diff --git a/docs/source-app/examples/model_server_app/model_server.py b/docs/source-app/examples/model_server_app/model_server.py deleted file mode 100644 index 10ad770b012cc..0000000000000 --- a/docs/source-app/examples/model_server_app/model_server.py +++ /dev/null @@ -1,90 +0,0 @@ -import json -import subprocess - -from lightning import BuildConfig, LightningWork -from lightning.app.storage.path import Path - -# ML_SERVER_URL = https://github.com/SeldonIO/MLServer - - -class MLServer(LightningWork): - """This components uses SeldonIO MLServer library. - - The model endpoint: /v2/models/{MODEL_NAME}/versions/{VERSION}/infer. - - Arguments: - name: The name of the model for the endpoint. - implementation: The model loader class. - Example: "mlserver_sklearn.SKLearnModel". - Learn more here: $ML_SERVER_URL/tree/master/runtimes - workers: Number of server worker. 
-
-    """
-
-    def __init__(
-        self,
-        name: str,
-        implementation: str,
-        workers: int = 1,
-        **kwargs,
-    ):
-        super().__init__(
-            parallel=True,
-            cloud_build_config=BuildConfig(
-                requirements=["mlserver", "mlserver-sklearn"],
-            ),
-            **kwargs,
-        )
-        # 1: Collect the configs.
-        self.settings = {
-            "debug": True,
-            "parallel_workers": workers,
-        }
-        self.model_settings = {
-            "name": name,
-            "implementation": implementation,
-        }
-        # 2: Keep track of the latest version
-        self.version = 1
-
-    def run(self, model_path: Path):
-        """The model is downloaded when the run method is invoked.
-
-        Arguments:
-            model_path: The path to the trained model.
-
-        """
-        # 1: Use the host and port at runtime so it works in the cloud.
-        # $ML_SERVER_URL/blob/master/mlserver/settings.py#L50
-        if self.version == 1:
-            # TODO: Reload the next version of the model.
-
-            self.settings.update({"host": self.host, "http_port": self.port})
-
-            with open("settings.json", "w") as f:
-                json.dump(self.settings, f)
-
-            # 2. Store the model-settings
-            # $ML_SERVER_URL/blob/master/mlserver/settings.py#L120
-            self.model_settings["parameters"] = {
-                "version": f"v0.0.{self.version}",
-                "uri": str(model_path.absolute()),
-            }
-            with open("model-settings.json", "w") as f:
-                json.dump(self.model_settings, f)
-
-            # 3. Launch the Model Server
-            subprocess.Popen("mlserver start .", shell=True)
-
-            # 4. Increment the version for the next time run is called.
-            self.version += 1
-
-        else:
-            # TODO: Load the next model and unload the previous one.
-            pass
-
-    def alive(self):
-        # Current hack: when the URL is available,
-        # the server is up and running.
-        # This should eventually be cleaned up and automated.
-        return self.url != ""
diff --git a/docs/source-app/examples/model_server_app/model_server.rst b/docs/source-app/examples/model_server_app/model_server.rst
deleted file mode 100644
index 283dc97bc99e3..0000000000000
--- a/docs/source-app/examples/model_server_app/model_server.rst
+++ /dev/null
@@ -1,48 +0,0 @@
-:orphan:
-
-*************************************
-2. Develop the Model Server Component
-*************************************
-
-In the code below, we use `MLServer `_, which aims to provide an easy way to start serving your machine learning models through a REST and gRPC interface, fully compliant with KFServing's V2 Dataplane spec.
-
-.. literalinclude:: ./model_server.py
-
-----
-
-.. raw:: html
-
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1. Develop a Train Component - :description: Train a model and store its checkpoints with SKlearn - :col_css: col-md-4 - :button_link: train.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 3. Develop a Load Testing Component - :description: Use Locust to test your model servers - :col_css: col-md-4 - :button_link: load_testing.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 4. Putting everything together. - :description: Ensemble the Components together and run the App - :col_css: col-md-4 - :button_link: putting_everything_together.html - :height: 150 - :tag: basic - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/model_server_app/model_server_app.rst b/docs/source-app/examples/model_server_app/model_server_app.rst deleted file mode 100644 index 933c89d035b00..0000000000000 --- a/docs/source-app/examples/model_server_app/model_server_app.rst +++ /dev/null @@ -1,15 +0,0 @@ -:orphan: - -.. _model_server_example: - -###################### -Develop a Model Server -###################### - -**Audience:** Users who want to serve their trained models. - -**Prerequisite**: Reach :ref:`level 16+ `. - ----- - -.. include:: model_server_app_content.rst diff --git a/docs/source-app/examples/model_server_app/model_server_app_content.rst b/docs/source-app/examples/model_server_app/model_server_app_content.rst deleted file mode 100644 index 0a9280c53c2b1..0000000000000 --- a/docs/source-app/examples/model_server_app/model_server_app_content.rst +++ /dev/null @@ -1,84 +0,0 @@ - -********* -Objective -********* - -Create a simple application that trains and serves a `Sklearn `_ machine learning model with `MLServer from SeldonIO `_ - ----- - -***************** -Final Application -***************** - -Here is a gif of the final application built in this example. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/ml_server_2.gif - ----- - -************* -System Design -************* - -In order to create such application, we need to build several components: - -* A Model Train Component that trains a model and provides its trained weights - -* A Model Server Component that serves as an API endpoint for the model generated by the **Model Train Component**. - -* A Load Testing Component that tests the model server works as expected. This could be used to CI/CD the performance of newly generated models (left to the users). - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/model_server_app_2.png - -Let's dive into the tutorial. - ----- - -******** -Tutorial -******** - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1. Build a Train Component - :description: Train a model and store its checkpoints with SKlearn - :col_css: col-md-4 - :button_link: train.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 2. Build a Model Server Component - :description: Use MLServer to server your models - :col_css: col-md-4 - :button_link: model_server.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 3. Build a Load Testing Component - :description: Use Locust to test your model servers - :col_css: col-md-4 - :button_link: load_testing.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 4. Putting everything together. - :description: Ensemble the components together and run the app - :col_css: col-md-4 - :button_link: putting_everything_together.html - :height: 150 - :tag: basic - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/model_server_app/putting_everything_together.rst b/docs/source-app/examples/model_server_app/putting_everything_together.rst deleted file mode 100644 index 48162a911f1a0..0000000000000 --- a/docs/source-app/examples/model_server_app/putting_everything_together.rst +++ /dev/null @@ -1,80 +0,0 @@ -:orphan: - -****************************** -4. Putting everything together -****************************** - -In the code below, we put together the **TrainWork**, the **MLServer** and the **Locust** components in an ``app.py`` file. - -.. literalinclude:: ./app.py - - -*********** -Run the App -*********** - -To run the app, simply open a terminal and execute this command: - -.. code-block:: bash - - lightning run app docs/source/examples/model_deploy_app/app.py - -Here is a gif of the UI. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/ml_server_2.gif - -.. raw:: html - -
- -Congrats, you have finished the **Build a Model Server** example ! - ----- - -****************** -Find more examples -****************** - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Develop a DAG - :description: Develop a DAG pipeline - :col_css: col-md-4 - :button_link: ../dag/dag.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: Develop a File Server - :description: Train multiple models with different parameters - :col_css: col-md-4 - :button_link: ../file_server/file_server.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: Develop a Github Repo Script Runner - :description: Run code from the internet in the cloud - :col_css: col-md-4 - :button_link: ../github_repo_runner/github_repo_runner.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: Develop a HPO Sweeper - :description: Train multiple models with different parameters - :col_css: col-md-4 - :button_link: ../hpo/hpo.html - :height: 150 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/model_server_app/train.py b/docs/source-app/examples/model_server_app/train.py deleted file mode 100644 index 457cc03cda69d..0000000000000 --- a/docs/source-app/examples/model_server_app/train.py +++ /dev/null @@ -1,41 +0,0 @@ -import joblib -from sklearn import datasets, svm -from sklearn.model_selection import train_test_split - -from lightning import LightningWork -from lightning.app.storage.path import Path - - -class TrainModel(LightningWork): - """This component trains a Sklearn SVC model on digits dataset.""" - - def __init__(self): - super().__init__() - # 1: Add element to the state. - self.best_model_path = None - - def run(self): - # 2: Load the Digits - digits = datasets.load_digits() - - # 3: To apply a classifier on this data, - # we need to flatten the image, to - # turn the data in a (samples, feature) matrix: - n_samples = len(digits.images) - data = digits.images.reshape((n_samples, -1)) - - # 4: Create a classifier: a support vector classifier - classifier = svm.SVC(gamma=0.001) - - # 5: Split data into train and test subsets - X_train, _, y_train, _ = train_test_split(data, digits.target, test_size=0.5, shuffle=False) - - # 6: We learn the digits on the first half of the digits - classifier.fit(X_train, y_train) - - # 7: Save the Sklearn model with `joblib`. - model_file_name = "mnist-svm.joblib" - joblib.dump(classifier, model_file_name) - - # 8: Keep a reference the the generated model. - self.best_model_path = Path("mnist-svm.joblib") diff --git a/docs/source-app/examples/model_server_app/train.rst b/docs/source-app/examples/model_server_app/train.rst deleted file mode 100644 index fdb6f6a93a0a7..0000000000000 --- a/docs/source-app/examples/model_server_app/train.rst +++ /dev/null @@ -1,49 +0,0 @@ -:orphan: - -**************************** -1. Build the Train Component -**************************** - -In the code below, we create a work which trains a simple `SVC `_ model on the digits dataset (classification). - -Once the model is trained, it is saved and a reference :class:`~lightning.app.storage.path.Path` with ``best_model_path`` state attribute. - -.. literalinclude:: ./train.py - ----- - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 2. Build a Model Server Component - :description: Use MLServer to server your models - :col_css: col-md-4 - :button_link: model_server.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 3. Build a Load Testing Component - :description: Use Locust to test your model servers - :col_css: col-md-4 - :button_link: load_testing.html - :height: 150 - :tag: Intermediate - -.. displayitem:: - :header: 4. Putting everything together. - :description: Ensemble the components together and run the app - :col_css: col-md-4 - :button_link: putting_everything_together.html - :height: 150 - :tag: basic - -.. raw:: html - -
-
diff --git a/docs/source-app/examples/research_demo_app.rst b/docs/source-app/examples/research_demo_app.rst deleted file mode 100644 index 90276f9b95f97..0000000000000 --- a/docs/source-app/examples/research_demo_app.rst +++ /dev/null @@ -1,5 +0,0 @@ -:orphan: - -######################### -Build a Research Demo App -######################### diff --git a/docs/source-app/get_started/add_an_interactive_demo.rst b/docs/source-app/get_started/add_an_interactive_demo.rst deleted file mode 100644 index 0ad0e6b9c8771..0000000000000 --- a/docs/source-app/get_started/add_an_interactive_demo.rst +++ /dev/null @@ -1,15 +0,0 @@ -:orphan: - -####################### -Add an Interactive Demo -####################### - -.. _add_an_interactive_Demo: - -**Required background:** Basic Python familiarity and complete the install guide. - -**Goal:** We'll walk you through the 4 key steps to run a Lightning App that trains and demos a model. - ----- - -.. include:: go_beyond_training_content.rst diff --git a/docs/source-app/get_started/build_model.rst b/docs/source-app/get_started/build_model.rst deleted file mode 100644 index 300b220ee61ce..0000000000000 --- a/docs/source-app/get_started/build_model.rst +++ /dev/null @@ -1,73 +0,0 @@ -:orphan: - -.. _build_model: - -####################### -Build and Train a Model -####################### - -**Required background:** Basic Python familiarity and complete the guide. - -**Goal:** We'll walk you through the creation of a model using PyTorch Lightning. - ----- - -********************************* -A simple PyTorch Lightning script -********************************* - -Let's assume you already have a folder with those two files. - -.. code-block:: bash - - pl_project/ - train.py # your own script to train your models - requirements.txt # your python requirements. - -If you don't, simply create a ``pl_project`` folder with those two files and add the following `PyTorch Lightning `_ code in the ``train.py`` file. This code trains a simple ``AutoEncoder`` on `MNIST Dataset `_. - -.. literalinclude:: ../code_samples/convert_pl_to_app/train.py - -Add the following to the ``requirements.txt`` file. - -.. literalinclude:: ../code_samples/convert_pl_to_app/requirements.txt - -Simply run the following commands in your terminal to install the requirements and train the model. - -.. code-block:: bash - - pip install -r requirements.txt - python train.py - -Get through `PyTorch Lightning Introduction `_ to learn more. - ----- - -********** -Next Steps -********** - -.. raw:: html - -
-
-
- -.. displayitem:: - :header: Evolve a Model into an ML System - :description: Develop an App to train a model in the cloud - :col_css: col-md-6 - :button_link: training_with_apps.html - :height: 180 - -.. displayitem:: - :header: Start from a Template ML System - :description: Learn about Apps, from a template. - :col_css: col-md-6 - :button_link: go_beyond_training.html - :height: 180 - -.. raw:: html - -
-
diff --git a/docs/source-app/get_started/go_beyond_training.rst b/docs/source-app/get_started/go_beyond_training.rst deleted file mode 100644 index f45e7f9f0ab62..0000000000000 --- a/docs/source-app/get_started/go_beyond_training.rst +++ /dev/null @@ -1,14 +0,0 @@ -:orphan: - -################################ -Start from an ML system template -################################ - -.. _go_beyond_training: - -**Required background:** Basic Python familiarity and complete the install guide. - -**Goal:** We'll walk you through the 4 key steps to run a Lightning App that trains and demos a model. - - -.. include:: go_beyond_training_content.rst diff --git a/docs/source-app/get_started/go_beyond_training_content.rst b/docs/source-app/get_started/go_beyond_training_content.rst deleted file mode 100644 index 5bc632b08ddd0..0000000000000 --- a/docs/source-app/get_started/go_beyond_training_content.rst +++ /dev/null @@ -1,405 +0,0 @@ -************************************************ -The *Train & Demo PyTorch Lightning* Application -************************************************ - -Find the *Train & Demo PyTorch Lightning* application in the `Lightning.ai App Gallery `_. - -Here is a recording of this App running locally and in the cloud with the same behavior. - -.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/lightning_app_experience_cut.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/lightning_app_experience_cut.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - -In the steps below, we are going to show you how to build this application. - -Here are `the entire App's code `_ and `its commented components. `_ - ----- - -************************* -Step 1: Install Lightning -************************* - -If you are using a virtual env, don't forget to activate it before running commands. -You must do so in every new shell. - -.. tip:: We highly recommend using virtual environments. - -.. code:: bash - - pip install lightning - ----- - -**************************************** -Step 2: Install the *Train and Demo* App -**************************************** -The first Lightning App we'll explore is an App to train and demo a machine learning model. - -.. - [|qs_code|], [|qs_live_app|]. - - .. |qs_live_app| raw:: html - -
live app - - .. |qs_code| raw:: html - - code - - -Install this App by typing: - -.. code-block:: bash - - lightning_app install app lightning/quick-start - -Verify the App was successfully installed: - -.. code-block:: bash - - cd lightning-quick-start - ----- - -*************************** -Step 3: Run the App locally -*************************** - -Run the app locally with the ``run`` command 🤯 - -.. code:: bash - - lightning_app run app app.py - ----- - -******************************** -Step 4: Run the App in the cloud -******************************** - -Add the ``--cloud`` argument to run on the `Lightning.AI cloud `_. 🤯🤯🤯 - -.. code:: bash - - lightning_app run app app.py --cloud - -.. - Your app should look like this one (|qs_live_app|) - ----- - -******************* -Understand the code -******************* -The App that we just launched trained a PyTorch Lightning model (although any framework works), then added an interactive demo. - -This is the App's code: - -.. code:: python - - # lightning-quick-start/app.py - import os.path as ops - import lightning as L - from quick_start.components import PyTorchLightningScript, ImageServeGradio - - class TrainDeploy(L.LightningFlow): - def __init__(self): - super().__init__() - self.train_work = PyTorchLightningScript( - script_path=ops.join(ops.dirname(__file__), "./train_script.py"), - script_args=["--trainer.max_epochs=5"], - ) - - self.serve_work = ImageServeGradio(L.CloudCompute()) - - def run(self): - # 1. Run the python script that trains the model - self.train_work.run() - - # 2. when a checkpoint is available, deploy - if self.train_work.best_model_path: - self.serve_work.run(self.train_work.best_model_path) - - def configure_layout(self): - tab_1 = {"name": "Model training", "content": self.train_work} - tab_2 = {"name": "Interactive demo", "content": self.serve_work} - return [tab_1, tab_2] - - app = L.LightningApp(TrainDeploy()) - -Let's break down the code section by section to understand what it is doing. - ----- - -1: Define root component -^^^^^^^^^^^^^^^^^^^^^^^^ - -A Lightning App provides a cohesive product experience for a set of unrelated components. - -The top-level component (Root) must subclass ``L.LightningFlow`` - - -.. code:: python - :emphasize-lines: 6 - - # lightning-quick-start/app.py - import os.path as ops - import lightning as L - from quick_start.components import PyTorchLightningScript, ImageServeGradio - - class TrainDeploy(L.LightningFlow): - def __init__(self): - super().__init__() - self.train_work = PyTorchLightningScript( - script_path=ops.join(ops.dirname(__file__), "./train_script.py"), - script_args=["--trainer.max_epochs=5"], - ) - - self.serve_work = ImageServeGradio(L.CloudCompute("cpu-small")) - - def run(self): - # 1. Run the python script that trains the model - self.train_work.run() - - # 2. when a checkpoint is available, deploy - if self.train_work.best_model_path: - self.serve_work.run(self.train_work.best_model_path) - - def configure_layout(self): - tab_1 = {"name": "Model training", "content": self.train_work} - tab_2 = {"name": "Interactive demo", "content": self.serve_work} - return [tab_1, tab_2] - - app = L.LightningApp(TrainDeploy()) - ----- - -2: Define components -^^^^^^^^^^^^^^^^^^^^ -In the __init__ method, we define the components that make up the App. In this case, we have 2 components, -a component to execute any PyTorch Lightning script (model training) and a second component to -start a Gradio server for demo purposes. - -.. 
code:: python - :emphasize-lines: 9, 14 - - # lightning-quick-start/app.py - import os.path as ops - import lightning as L - from quick_start.components import PyTorchLightningScript, ImageServeGradio - - class TrainDeploy(L.LightningFlow): - def __init__(self): - super().__init__() - self.train_work = PyTorchLightningScript( - script_path=ops.join(ops.dirname(__file__), "./train_script.py"), - script_args=["--trainer.max_epochs=5"], - ) - - self.serve_work = ImageServeGradio(L.CloudCompute("cpu-small")) - - def run(self): - # 1. Run the python script that trains the model - self.train_work.run() - - # 2. when a checkpoint is available, deploy - if self.train_work.best_model_path: - self.serve_work.run(self.train_work.best_model_path) - - def configure_layout(self): - tab_1 = {"name": "Model training", "content": self.train_work} - tab_2 = {"name": "Interactive demo", "content": self.serve_work} - return [tab_1, tab_2] - - app = L.LightningApp(TrainDeploy()) - ----- - -3: Define how components Flow -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Every component has a ``run`` method. The run method defines the 🌊 Flow 🌊 of how components interact together. - -In this case, we train a model (until completion). When it's done AND there exists a checkpoint, we launch a -demo server: - -.. code:: python - :emphasize-lines: 18, 21, 22 - - # lightning-quick-start/app.py - import os.path as ops - import lightning as L - from quick_start.components import PyTorchLightningScript, ImageServeGradio - - class TrainDeploy(L.LightningFlow): - def __init__(self): - super().__init__() - self.train_work = PyTorchLightningScript( - script_path=ops.join(ops.dirname(__file__), "./train_script.py"), - script_args=["--trainer.max_epochs=5"], - ) - - self.serve_work = ImageServeGradio(L.CloudCompute("cpu-small")) - - def run(self): - # 1. Run the python script that trains the model - self.train_work.run() - - # 2. when a checkpoint is available, deploy - if self.train_work.best_model_path: - self.serve_work.run(self.train_work.best_model_path) - - def configure_layout(self): - tab_1 = {"name": "Model training", "content": self.train_work} - tab_2 = {"name": "Interactive demo", "content": self.serve_work} - return [tab_1, tab_2] - - app = L.LightningApp(TrainDeploy()) - -.. note:: If you've used other ML systems you'll be pleasantly surprised to not find decorators or YAML files. - ----- - -4: Connect web user interfaces -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -All our favorite tools normally have their own web user interfaces (UI). - -Implement the ``configure_layout`` method to connect them together: - -.. code:: python - :emphasize-lines: 24-27 - - # lightning-quick-start/app.py - import os.path as ops - import lightning as L - from quick_start.components import PyTorchLightningScript, ImageServeGradio - - class TrainDeploy(L.LightningFlow): - def __init__(self): - super().__init__() - self.train_work = PyTorchLightningScript( - script_path=ops.join(ops.dirname(__file__), "./train_script.py"), - script_args=["--trainer.max_epochs=5"], - ) - - self.serve_work = ImageServeGradio(L.CloudCompute("cpu-small")) - - def run(self): - # 1. Run the python script that trains the model - self.train_work.run() - - # 2. 
when a checkpoint is available, deploy - if self.train_work.best_model_path: - self.serve_work.run(self.train_work.best_model_path) - - def configure_layout(self): - tab_1 = {"name": "Model training", "content": self.train_work} - tab_2 = {"name": "Interactive demo", "content": self.serve_work} - return [tab_1, tab_2] - - app = L.LightningApp(TrainDeploy()) - ----- - -5: Init the ``app`` object -^^^^^^^^^^^^^^^^^^^^^^^^^^ -Initialize an ``app`` object with the ``TrainDeploy`` component (this won't run the App yet): - -.. code:: python - :emphasize-lines: 29 - - # lightning-quick-start/app.py - import os.path as ops - import lightning as L - from quick_start.components import PyTorchLightningScript, ImageServeGradio - - class TrainDeploy(L.LightningFlow): - def __init__(self): - super().__init__() - self.train_work = PyTorchLightningScript( - script_path=ops.join(ops.dirname(__file__), "./train_script.py"), - script_args=["--trainer.max_epochs=5"], - ) - - self.serve_work = ImageServeGradio(L.CloudCompute("cpu-small")) - - def run(self): - # 1. Run the python script that trains the model - self.train_work.run() - - # 2. when a checkpoint is available, deploy - if self.train_work.best_model_path: - self.serve_work.run(self.train_work.best_model_path) - - def configure_layout(self): - tab_1 = {"name": "Model training", "content": self.train_work} - tab_2 = {"name": "Interactive demo", "content": self.serve_work} - return [tab_1, tab_2] - - app = L.LightningApp(TrainDeploy()) - ----- - -****************************** -What components are supported? -****************************** -Any component can work with Lightning AI! - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/Lightning.gif - :alt: What is Lightning gif. - :width: 100 % - ----- - -********** -Next Steps -********** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Add components to your App - :description: Expand your App by adding components. - :col_css: col-md-4 - :button_link: ../workflows/extend_app.html - :height: 180 - -.. displayitem:: - :header: Build a component - :description: Learn to build your own component. - :col_css: col-md-4 - :button_link: ../workflows/build_lightning_component/index.html - :height: 180 - -.. displayitem:: - :header: Explore more Apps - :description: Explore more apps for inspiration. - :col_css: col-md-4 - :button_link: https://lightning.ai/apps - :height: 180 - -.. displayitem:: - :header: Under the hood - :description: Explore how it works under the hood. - :col_css: col-md-4 - :button_link: ../core_api/lightning_app/index.html - :height: 180 - -.. displayitem:: - :header: Run on your private cloud - :description: Run Lightning Apps on your private VPC or on-prem. - :button_link: ../workflows/run_on_private_cloud.html - :col_css: col-md-4 - :height: 180 - -.. raw:: html - -
-
diff --git a/docs/source-app/get_started/jumpstart_from_app_gallery.rst b/docs/source-app/get_started/jumpstart_from_app_gallery.rst deleted file mode 100644 index c6a49674c9aa9..0000000000000 --- a/docs/source-app/get_started/jumpstart_from_app_gallery.rst +++ /dev/null @@ -1,123 +0,0 @@ -:orphan: - -##################################### -Start from Ready-to-Run Template Apps -##################################### - -.. _jumpstart_from_app_gallery: - -Anyone can build Apps for their own use cases and promote them on the `App Gallery `_. - -In return, you can benefit from the work of others and get started faster by re-using a ready-to-run App close to your own use case. - - -************* -User Workflow -************* - -#. Visit the `App Gallery `_ and look for an App close to your own use case. - - .. raw:: html - -
- -#. If **Launch** is available, it means the App is live and ready to be used! Take it for a spin. - - .. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/launch_button.png - :alt: Launch Button on lightning.ai - :width: 100 % - -#. By clicking **Clone & Run**, a copy of the App is added to your account and an instance starts running. - - - .. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/clone_and_run.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/clone_and_run.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - -#. If you found an App that matches what you need, move to **step 5**! Otherwise, go back to **step 1**. - - .. raw:: html - -
- -#. Copy the installation command (optionally from the clipboard on the right). - - .. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/install_command.png - :alt: Install command on lightning.ai - :width: 100 % - -#. Copy the command to your local terminal. - - .. code-block:: bash - - lightning_app install app lightning/hackernews-app - -#. Go through the installation steps. - - .. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/install_an_app.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/install_an_app.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - -#. Run the App locally. - - .. code-block:: bash - - cd LAI-Hackernews-App - lightning_app run app app.py - - .. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/hackernews.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/hackernews.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - -#. Open the code with your favorite IDE, modify it, and run it back in the cloud. - - .. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/hackernews_modified.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/hackernews_modified.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - ----- - -********** -Next Steps -********** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Add Component made by others to your App - :description: Add more functionality to your projects - :col_css: col-md-6 - :button_link: jumpstart_from_component_gallery.html - :height: 180 - -.. displayitem:: - :header: Level-up your skills with Lightning Apps - :description: From Basic to Advanced Skills - :col_css: col-md-6 - :button_link: ../levels/basic/index.html - :height: 180 - -.. raw:: html - -
-
-
diff --git a/docs/source-app/get_started/jumpstart_from_component_gallery.rst b/docs/source-app/get_started/jumpstart_from_component_gallery.rst deleted file mode 100644 index 95f7d570d87d0..0000000000000 --- a/docs/source-app/get_started/jumpstart_from_component_gallery.rst +++ /dev/null @@ -1,151 +0,0 @@ -:orphan: - -######################################## -Add Component made by others to your App -######################################## - -.. _jumpstart_from_component_gallery: - -Anyone can build components for their own use case and promote them on the `Component Gallery `_. - -In return, you can benefit from the work of others and add new functionalities to your Apps with minimal effort. - - -************* -User Workflow -************* - -#. Visit the `Component Gallery `_ and look for a Component close to something you want to do. - - .. raw:: html - -
- -#. Check out the code for inspiration or simply install the component from PyPi and use it. - ----- - -************* -Success Story -************* - -The default `Train and Demo Application `_ trains a PyTorch Lightning -model and then starts a demo with `Gradio `_. - -.. code-block:: python - - import os.path as ops - import lightning as L - from quick_start.components import PyTorchLightningScript, ImageServeGradio - - - class TrainDeploy(L.LightningFlow): - def __init__(self): - super().__init__() - self.train_work = PyTorchLightningScript( - script_path=ops.join(ops.dirname(__file__), "./train_script.py"), - script_args=["--trainer.max_epochs=5"], - ) - - self.serve_work = ImageServeGradio(L.CloudCompute("cpu")) - - def run(self): - # 1. Run the python script that trains the model - self.train_work.run() - - # 2. when a checkpoint is available, deploy - if self.train_work.best_model_path: - self.serve_work.run(self.train_work.best_model_path) - - def configure_layout(self): - tab_1 = {"name": "Model training", "content": self.train_work} - tab_2 = {"name": "Interactive demo", "content": self.serve_work} - return [tab_1, tab_2] - - - app = L.LightningApp(TrainDeploy()) - -However, someone who wants to use this Aop (maybe you) found `Lightning HPO `_ -from browsing the `Component Gallery `_ and decided to give it a spin after checking the associated -`Github Repository `_. - -Once ``lightning_hpo`` installed, they improved the default App by easily adding HPO support to their project. - -Here is the resulting App. It is almost the same code, but it's way more powerful now! - -This is the power of `lightning.ai `_ ecosystem 🔥⚡🔥 - -.. code-block:: python - - import os.path as ops - import lightning as L - from quick_start.components import PyTorchLightningScript, ImageServeGradio - import optuna - from optuna.distributions import LogUniformDistribution - from lightning_hpo import Optimizer, BaseObjective - - - class HPOPyTorchLightningScript(PyTorchLightningScript, BaseObjective): - @staticmethod - def distributions(): - return {"model.lr": LogUniformDistribution(0.0001, 0.1)} - - - class TrainDeploy(L.LightningFlow): - def __init__(self): - super().__init__() - self.train_work = Optimizer( - script_path=ops.join(ops.dirname(__file__), "./train_script.py"), - script_args=["--trainer.max_epochs=5"], - objective_cls=HPOPyTorchLightningScript, - n_trials=4, - ) - - self.serve_work = ImageServeGradio(L.CloudCompute("cpu")) - - def run(self): - # 1. Run the python script that trains the model - self.train_work.run() - - # 2. when a checkpoint is available, deploy - if self.train_work.best_model_path: - self.serve_work.run(self.train_work.best_model_path) - - def configure_layout(self): - tab_1 = {"name": "Model training", "content": self.train_work.hi_plot} - tab_2 = {"name": "Interactive demo", "content": self.serve_work} - return [tab_1, tab_2] - - - app = L.LightningApp(TrainDeploy()) - ----- - -********** -Next Steps -********** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Start from Ready-to-Run Template Apps - :description: Jump-start your projects development - :col_css: col-md-6 - :button_link: jumpstart_from_app_gallery.html - :height: 180 - -.. displayitem:: - :header: Level-up your skills with Lightning Apps - :description: From Basic to Advanced Skills - :col_css: col-md-6 - :button_link: ../levels/basic/index.html - :height: 180 - -.. raw:: html - -
-
-
diff --git a/docs/source-app/get_started/training_with_apps.rst b/docs/source-app/get_started/training_with_apps.rst deleted file mode 100644 index 46cc851bdd290..0000000000000 --- a/docs/source-app/get_started/training_with_apps.rst +++ /dev/null @@ -1,125 +0,0 @@ -:orphan: - -################################ -Evolve a model into an ML system -################################ - -.. _convert_pl_to_app: - -**Required background:** Basic Python familiarity and complete the :ref:`build_model` guide. - -**Goal:** We'll walk you through the two key steps to build your first Lightning App from your existing PyTorch Lightning scripts. - - -******************* -Training and beyond -******************* - -With `PyTorch Lightning `__, we abstracted distributed training and hardware, by organizing PyTorch code. -With `Lightning Apps `__, we unified the local and cloud experience while abstracting infrastructure. - -By using `PyTorch Lightning `__ and `Lightning Apps `__ -together, a completely new world of possibilities emerges. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/pl_to_app_4.png - :alt: From PyTorch Lightning to Lightning App - :width: 100 % - ----- - -****************************************** -1. Write an App to run the train.py script -****************************************** - -This article continues where the :ref:`build_model` guide finished. - -Create an additional file ``app.py`` in the ``pl_project`` folder as follows: - -.. code-block:: bash - - pl_project/ - app.py - train.py - requirements.txt - -Inside the ``app.py`` file, add the following code. - -.. literalinclude:: ../code_samples/convert_pl_to_app/app.py - -This App runs the PyTorch Lightning script contained in the ``train.py`` file using the powerful :class:`~lightning.app.components.python.tracer.TracerPythonScript` component. This is really worth checking out! - ----- - -************************************************ -2. Run the train.py file locally or in the cloud -************************************************ - -First, go to the ``pl_folder`` folder from the local terminal and install the requirements. - -.. code-block:: bash - - cd pl_folder - pip install -r requirements.txt - -To run your app, copy the following command to your local terminal: - -.. code-block:: bash - - lightning_app run app app.py - -Simply add ``--cloud`` to run this application in the cloud with a GPU machine 🤯 - -.. code-block:: bash - - lightning_app run app app.py --cloud - - -Congratulations! Now, you know how to run a `PyTorch Lightning `_ script with Lightning Apps. - -Lightning Apps can make your ML system way more powerful, keep reading to learn how. - ----- - -********** -Next Steps -********** - -.. raw:: html - -
-
-
-.. displayitem::
-   :header: Level-up with Lightning Apps
-   :description: From Basics to Advanced Skills
-   :col_css: col-md-4
-   :button_link: ../levels/basic/index.html
-   :height: 180
-
-.. displayitem::
-   :header: Add an Interactive Demo
-   :description: Add a Gradio Demo once the training is finished
-   :col_css: col-md-4
-   :button_link: add_an_interactive_demo.html
-   :height: 180
-
-.. displayitem::
-   :header: Add Model Serving
-   :description: Serve and load testing with MLServer and Locust
-   :col_css: col-md-4
-   :button_link: ../examples/model_server_app/model_server_app.html
-   :height: 180
-
-.. displayitem::
-   :header: Add DAG Orchestration
-   :description: Organize your processing, training and metrics collection
-   :col_css: col-md-4
-   :button_link: ../examples/dag/dag.html
-   :height: 180
-
-.. displayitem::
-   :header: Add Team Collaboration
-   :description: Create an app to run any PyTorch Lightning Script from GitHub
-   :col_css: col-md-4
-   :button_link: ../examples/github_repo_runner/github_repo_runner.html
-   :height: 180
diff --git a/docs/source-app/get_started/what_app_can_do.rst b/docs/source-app/get_started/what_app_can_do.rst
deleted file mode 100644
index b4033dd40d594..0000000000000
--- a/docs/source-app/get_started/what_app_can_do.rst
+++ /dev/null
@@ -1,187 +0,0 @@
-:orphan:
-
-############################################
-Discover what Lightning Apps can do in 5 min
-############################################
-
-.. _what_app_can_do:
-
-Lightning Apps can be many things, and while a picture is worth a thousand words, videos showing you examples should be worth even more.
-
-
-*****************************
-Flashy - Auto ML App (Public)
-*****************************
-
-Train a model on any image or text dataset without writing any code. Flashy uses `React.js `_ for its frontend.
-
-Find `Flashy `_ on the App Gallery and the `Flashy codebase `_ on GitHub.
-
-.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/flashy.mp4
-   :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/flashy.png
-   :width: 600
-   :class: background-video
-   :autoplay:
-   :loop:
-   :muted:
-
-.. ----
-
-.. ***************************************
-.. NVIDIA Omniverse Sampling App (Private)
-.. ***************************************
-
-.. Use `Nvidia Sampling Omniverse `_ to generate synthetic samples from 3D meshes and train an object detector on that data.
-
-.. .. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/Omniverse-Sampling.mp4
-   :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/Omniverse-Sampling.png
-   :width: 600
-   :class: background-video
-   :autoplay:
-   :loop:
-   :muted:
-
-----
-
-*********************
-Research App (Public)
-*********************
-
-Share your paper bundled with the arXiv link, poster, a live Jupyter notebook, an interactive demo to try the model, and more!
-
-Find the `Research App `_ on the App Gallery and the `Research App codebase `_ on GitHub.
-
-.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/research_app.mp4
-   :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/research_app.png
-   :width: 600
-   :class: background-video
-   :autoplay:
-   :loop:
-   :muted:
-
-----
-
-************************************************
-ScratchPad - Notebook Manager for Teams (Public)
-************************************************
-
-Run multiple Jupyter Notebooks on cloud CPUs or machines with multiple GPUs.
- -Find the `ScratchPad App `_ on the App Gallery and the `ScratchPad App codebase `_ on GitHub. - -.. note:: ScratchPad is `tested end-to-end `_ on every Lightning App commit with `pytest `_. - -.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/notebook_apps.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/notebook_apps.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - ----- - -*********************** -InVideo Search (Public) -*********************** - -This App lets you find anything you're looking for inside a video. The engine is powered by `Open AI CLIP `_. - -Find the `InVideo Search App `_ on the App Gallery and the `InVideo Search App codebase. `_ in GitHub. - -.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/video_search_2.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/video_search_2.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - ----- - -****************************** -AI-powered HackerNews (Public) -****************************** - -Save yourself time, and get Hacker News story recommendations, chosen for you specifically. This Lightning App was designed to illustrate a full end-to-end MLOPs workflow aimed at enterprise recommendation systems. - -Find the `AI-powered HackerNews App `_ on the App Gallery and the `AI-powered HackerNews App codebase. `_ on GitHub. - -.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/hackernews_app.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/hackernews_app.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - ----- - -********************************************************************* -Lightning Apps can turn ML into scalable systems in days — not months -********************************************************************* - -Use the Lightning framework to develop any ML system: train and deploy a model, create an ETL pipeline, -or spin up a research demo — using the intuitive principles we pioneered with PyTorch Lightning. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/apps_logos_2.png - :alt: Apps with Logos - :width: 100 % - -Anyone who knows Python can build a Lightning App, even without machine learning experience. - -Lightning Apps are: - -- cloud agnostic -- fault-tolerant, distributed, cost optimized -- production ready -- local and cloud debuggable -- highly reactive & interactive -- connect multiple UIs together -- built for team collaboration -- framework agnostic, use your own stack -- and much more - -.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/lightning_app_experience_cut.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/lightning_app_experience_cut.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - -********** -Next Steps -********** - -.. raw:: html - -
-
-
- -.. displayitem:: - :header: Build & Train a Model - :description: Discover PyTorch Lightning and train your first Model. - :col_css: col-md-4 - :button_link: build_model.html - :height: 180 - -.. displayitem:: - :header: Evolve a Model into an ML System - :description: Develop an App to train a model in the cloud - :col_css: col-md-4 - :button_link: training_with_apps.html - :height: 180 - -.. displayitem:: - :header: Start from an ML system template - :description: Learn about Apps, from a template. - :col_css: col-md-4 - :button_link: go_beyond_training.html - :height: 180 - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/app_tree.rst b/docs/source-app/glossary/app_tree.rst
deleted file mode 100644
index 37413f30cfe0f..0000000000000
--- a/docs/source-app/glossary/app_tree.rst
+++ /dev/null
@@ -1,113 +0,0 @@
-:orphan:
-
-.. _app_component_tree:
-
-###################
-App Component Tree
-###################
-
-**Audience:** Users who want to know how components can be composed with each other.
-
-**Level:** Basic
-
-----
-
-**************************************
-What is an Application Component Tree?
-**************************************
-
-Components can be nested to form component trees, where the LightningFlows are the branches and the LightningWorks are the leaves.
-
-This design enables users to organize and maintain their code with more ease, and more importantly, it helps create an ecosystem with reusable components.
-
-Here's a basic application with four flows and two works (associated tree structure):
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/tree.gif
-   :alt: Basic App Components
-   :width: 100 %
-
-
-.. literalinclude:: ../code_samples/quickstart/app_comp.py
-
-A Lightning app runs all flows in a single process. Its flows coordinate the execution of the works, each running in its own independent process.
-
-----
-
-***********************************************
-How do I define my application component tree?
-***********************************************
-
-To define your application component tree, you need to create a tree of components and attach them to your root flow.
-
-You can attach your components in the **__init__** method of a flow.
-
-.. code-block:: python
-
-    import lightning as L
-
-
-    class RootFlow(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            # The `Work` component is attached here.
-            self.work = Work()
-
-            # The `NestedFlow` component is attached here.
-            self.nested_flow = NestedFlow()
-
-Once done, simply add the root flow to a Lightning app as follows:
-
-.. code-block:: python
-
-    app = L.LightningApp(RootFlow())
-
-----
-
-******************************************
-Is my application component tree static?
-******************************************
-
-No, Lightning supports dynamic flows and works.
-
-You can simply attach your components in the **run** method of a flow using the Python functions **hasattr**, **setattr**, and **getattr**.
-
-.. code-block:: python
-
-    class RootFlow(L.LightningFlow):
-        def run(self):
-
-            if not hasattr(self, "work"):
-                # The `Work` component is attached here.
-                setattr(self, "work", Work())
-            # Run the `Work` component.
-            getattr(self, "work").run()
-
-            if not hasattr(self, "nested_flow"):
-                # The `NestedFlow` component is attached here.
-                setattr(self, "nested_flow", NestedFlow())
-            # Run the `NestedFlow` component.
-            getattr(self, "nested_flow").run()
-
-
-But it is usually more readable to use Lightning's built-in :class:`~lightning.app.structures.Dict` or :class:`~lightning.app.structures.List` as follows:
-
-.. code-block:: python
-
-    from lightning.app.structures import Dict
-
-
-    class RootFlow(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.dict = Dict()
-
-        def run(self):
-            if "work" not in self.dict:
-                # The `Work` component is attached here.
-                self.dict["work"] = Work()
-            self.dict["work"].run()
-
-            if "nested_flow" not in self.dict:
-                # The `NestedFlow` component is attached here.
- self.dict["nested_flow"] = NestedFlow() - self.dict["nested_flow"].run() diff --git a/docs/source-app/glossary/build_config/build_config.rst b/docs/source-app/glossary/build_config/build_config.rst deleted file mode 100644 index 43ba0b02e2bcf..0000000000000 --- a/docs/source-app/glossary/build_config/build_config.rst +++ /dev/null @@ -1,43 +0,0 @@ -:orphan: - -.. _build_config: - -################### -Build Configuration -################### - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Basic - :description: Learn how to manage Python dependencies for an individual LightningWork - :col_css: col-md-6 - :button_link: build_config_basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Intermediate - :description: Learn how to run custom build commands for a LightningWork - :col_css: col-md-6 - :button_link: build_config_intermediate.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Advanced - :description: Learn how to use a custom Docker image for a LightningWork - :col_css: col-md-6 - :button_link: build_config_advanced.html - :height: 150 - :tag: advanced - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/build_config/build_config_advanced.rst b/docs/source-app/glossary/build_config/build_config_advanced.rst deleted file mode 100644 index bc6f5a2062d12..0000000000000 --- a/docs/source-app/glossary/build_config/build_config_advanced.rst +++ /dev/null @@ -1,63 +0,0 @@ -:orphan: - -############################## -Build Configuration (Advanced) -############################## - -**Audience:** Users who want full control over the docker image that is being installed in the cloud. - -**Level:** Advanced - -Advanced users who need full control over the environment a LightningWork runs in can specify a custom docker image that will be deployed in the cloud. - - ----- - -****************** -Use a docker image -****************** - -Create a :class:`~lightning.app.utilities.packaging.build_config.BuildConfig` and provide a **publicly accessible** link to where the image is hosted: - -.. code-block:: python - - from lightning.app import LightningWork, BuildConfig - - - class MyWork(LightningWork): - def __init__(self): - super().__init__() - - # Using a publicly hosted docker image: - self.cloud_build_config = BuildConfig( - # This is one of the base images Lightning uses by default - image="ghcr.io/gridai/base-images:v1.8-gpu" - ) - - # Can also be combined with extra requirements - self.cloud_build_config = BuildConfig(image="...", requirements=["torchmetrics"]) - - -.. warning:: - Many public hosters like DockerHub apply rate limits for public images. We recommend to pull images from your own registry. - For example, you can set up a - `docker registry on GitHub `_. - - -.. note:: - - The build config only applies when running in the cloud and gets ignored otherwise. A local build config is currently not supported. - - Images from private registries are currently not supported. - -.. note:: - Custom docker images must have python installed. We'll use `virtualenv` from this system python to create a virtual environment. - We'll also configure the `virtualenv` to use the packages installed under system's python so your packages are not lost - ----- - - -********************* -Provide a docker file -********************* - -.. note:: - Not yet supported. Coming soon. diff --git a/docs/source-app/glossary/build_config/build_config_basic.rst b/docs/source-app/glossary/build_config/build_config_basic.rst deleted file mode 100644 index 538cf868c5642..0000000000000 --- a/docs/source-app/glossary/build_config/build_config_basic.rst +++ /dev/null @@ -1,68 +0,0 @@ -:orphan: - -########################### -Build Configuration (Basic) -########################### - -**Audience:** Users who need to install Python packages for an individual LightningWork. - -**Level:** Basic - ----- - -*********************************** -List dependencies in separate files -*********************************** - -If you are building an app with multiple LightningWorks that have different or even conflicting requirements, split your dependencies into individual files -for more granular control. - -.. code-block:: bash - - ├── app.py - ├── requirements.txt # Global requirements for the entire app - └── works - ├── serve - │ ├── requirements.txt # Requirements specific to the 'serve' work - │ └── serve.py # Source file for the LightningWork - └── train - ├── requirements.txt # Requirements specific to the 'train' work - └── train.py # Source file for the LightningWork - -The requirements.txt file must be located in the same directory as the source file of the LightningWork. 
-When the LightningWork starts up, it will pick up the requirements file if present and install all listed packages.
-
-.. note::
-    This only applies when running in the cloud. The requirements.txt files get ignored when running locally.
-
-----
-
-***********************************
-Define the requirements in the code
-***********************************
-
-Instead of listing the requirements in a file, you can also pass them to the LightningWork at runtime using the
-:class:`~lightning.app.utilities.packaging.build_config.BuildConfig`:
-
-.. code-block:: python
-    :emphasize-lines: 7
-
-    from lightning.app import LightningWork, BuildConfig
-
-
-    class MyWork(LightningWork):
-        def __init__(self):
-            super().__init__()
-            self.cloud_build_config = BuildConfig(requirements=["torch>=1.8", "torchmetrics"])
-
-.. note::
-    The build config only applies when running in the cloud and gets ignored otherwise. A local build config is currently not supported.
-
-.. warning::
-    Custom base images are not supported with the default CPU cloud compute. For example:
-
-    .. code-block:: py
-
-        class MyWork(LightningWork):
-            def __init__(self):
-                super().__init__(cloud_build_config=BuildConfig(image="my-custom-image"))  # no cloud compute, for example default work
diff --git a/docs/source-app/glossary/build_config/build_config_intermediate.rst b/docs/source-app/glossary/build_config/build_config_intermediate.rst
deleted file mode 100644
index 600b8e85f36ef..0000000000000
--- a/docs/source-app/glossary/build_config/build_config_intermediate.rst
+++ /dev/null
@@ -1,56 +0,0 @@
-:orphan:
-
-##################################
-Build Configuration (Intermediate)
-##################################
-
-**Audience:** Users who need to execute commands to configure the machine before a LightningWork runs on it.
-
-**Level:** Intermediate
-
-When a LightningWork machine starts up in the cloud, it uses a lightweight operating system with essential packages pre-installed.
-If you need to install additional system packages or run other configuration steps before your code executes on that machine, it is possible to do so by creating a custom
-:class:`~lightning.app.utilities.packaging.build_config.BuildConfig`:
-
-1. Subclass :class:`~lightning.app.utilities.packaging.build_config.BuildConfig`:
-
-   .. code-block:: python
-
-       from dataclasses import dataclass
-
-       from lightning.app import BuildConfig
-
-
-       @dataclass
-       class CustomBuildConfig(BuildConfig):
-           def build_commands(self):
-               return ["sudo apt-get install libsparsehash-dev"]
-
-
-2. Set the build config on the LightningWork:
-
-   .. code-block:: python
-
-       from lightning.app import LightningWork
-
-
-       class MyWork(LightningWork):
-           def __init__(self):
-               super().__init__()
-
-               # Use the custom build config
-               self.cloud_build_config = CustomBuildConfig()
-
-               # Can also be combined with extra requirements
-               self.cloud_build_config = CustomBuildConfig(requirements=["torchmetrics"])
-
-.. note::
-    - When you need to execute commands or install tools that require more privileges than the current user has, you can use ``sudo`` without needing to provide a password, e.g., when installing system packages.
-    - The build config only applies when running in the cloud and gets ignored otherwise. A local build config is currently not supported.
-
-.. warning::
-    Custom base images are not supported with the default CPU cloud compute. For example:
-
-    .. code-block:: py
-
-        class MyWork(LightningWork):
-            def __init__(self):
-                super().__init__(cloud_build_config=BuildConfig(image="my-custom-image"))  # no cloud compute, for example default work
diff --git a/docs/source-app/glossary/command_lines/command_lines.rst b/docs/source-app/glossary/command_lines/command_lines.rst
deleted file mode 100644
index 15d77272c1692..0000000000000
--- a/docs/source-app/glossary/command_lines/command_lines.rst
+++ /dev/null
@@ -1,69 +0,0 @@
-:orphan:
-
-############################
-Command-line Interface (CLI)
-############################
-
-**Audience:** Users looking to create a command line interface (CLI) for their application.
-
-----
-
-**************
-What is a CLI?
-**************
-
-A command-line interface (CLI) is a user interface (UI) in a terminal used to interact with a specific program.
-
-.. note::
-
-    The Lightning guideline for building a CLI is `lightning_app ...` or ` ...`.
-
-As an example, Lightning provides a CLI to interact with your Lightning Apps and the `lightning.ai `_ platform as follows:
-
-.. code-block:: bash
-
-    main
-    ├── fork - Forks an App.
-    ├── init - Initializes a Lightning App and/or Component.
-    │   ├── app
-    │   ├── component
-    │   ├── pl-app - Creates an App from your PyTorch Lightning source files.
-    │   └── react-ui - Creates a React UI to give a Lightning Component a React.js web UI
-    ├── install - Installs a Lightning App and/or Component.
-    │   ├── app
-    │   └── component
-    ├── list - Lists Lightning AI self-managed resources (apps)
-    │   └── apps - Lists your Lightning AI Apps.
-    ├── login - Logs in to your lightning.ai account.
-    ├── logout - Logs out of your lightning.ai account.
-    ├── run - Runs a Lightning App locally or on the cloud.
-    │   └── app - Runs an App from a file.
-    ├── show - Shows given resource.
-    │   └── logs - Shows cloud application logs. By default prints logs for all currently available Components.
-    ├── stop - Stops your App.
-    └── tree - Shows the command tree of your CLI.
-
-Learn more about `command-line interfaces here `_.
-
-----
-
-**********
-Learn more
-**********
-
-.. raw:: html
-
-
- -.. displayitem:: - :header: Develop a Command Line Interface - :description: Learn how to develop a CLI for your App. - :col_css: col-md-6 - :button_link: ../../workflows/build_command_line_interface/index_content.html - :height: 150 - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/dag.rst b/docs/source-app/glossary/dag.rst deleted file mode 100644 index 9578b1e5260a8..0000000000000 --- a/docs/source-app/glossary/dag.rst +++ /dev/null @@ -1,46 +0,0 @@ -###################### -Directed Acyclic Graph -###################### -**Audience:** Users coming from MLOps to Lightning Apps, looking for more flexibility. - ----- - -***************************** -Is Lightning a DAG framework? -***************************** -No. - -A Lightning App enables developers to express complex, interactive applications that are impossible to create with DAGs. - ----- - -********************************* -Can I Build a DAG with Lightning? -********************************* -Yes! - -DAGs are one of the easiest Lightning Apps to build. For example, here's a :doc:`full app that defines a DAG <../examples/dag/dag>`. - ----- - -******** -Examples -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Build a DAG - :description: Learn how to create a DAG with Lightning - :col_css: col-md-4 - :button_link: ../examples/dag/dag.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/debug_app.rst b/docs/source-app/glossary/debug_app.rst deleted file mode 100644 index 2d5c0d19903b0..0000000000000 --- a/docs/source-app/glossary/debug_app.rst +++ /dev/null @@ -1,3 +0,0 @@ -:orphan: - -.. include:: ../workflows/debug_locally.rst diff --git a/docs/source-app/glossary/distributed_fe.rst b/docs/source-app/glossary/distributed_fe.rst deleted file mode 100644 index 36d64b01436b6..0000000000000 --- a/docs/source-app/glossary/distributed_fe.rst +++ /dev/null @@ -1,5 +0,0 @@ -:orphan: - -##################### -Distributed Front-End -##################### diff --git a/docs/source-app/glossary/distributed_hardware.rst b/docs/source-app/glossary/distributed_hardware.rst deleted file mode 100644 index 0a64f5f5c0720..0000000000000 --- a/docs/source-app/glossary/distributed_hardware.rst +++ /dev/null @@ -1,5 +0,0 @@ -:orphan: - -#################### -Distributed Hardware -#################### diff --git a/docs/source-app/glossary/environment_variables.rst b/docs/source-app/glossary/environment_variables.rst deleted file mode 100644 index dcbcd8c6f4ab7..0000000000000 --- a/docs/source-app/glossary/environment_variables.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. _environment_variables: - -********************* -Environment Variables -********************* - -If your App is using configuration values you don't want to commit with your App source code, you can use environment variables. - -Lightning allows you to set environment variables when running the App from the CLI with the `lightning_app run app` command. You can use environment variables to pass any values to the App, and avoiding sticking those values in the source code. - -Set one or multiple variables using the **--env** option: - -.. code:: bash - - lightning_app run app app.py --cloud --env FOO=BAR --env BAZ=FAZ - -Environment variables are available in all Flows and Works, and can be accessed as follows: - -.. code:: python - - import os - - print(os.environ["FOO"]) # BAR - print(os.environ["BAZ"]) # FAZ - -.. note:: - Environment variables are not encrypted. For sensitive values, we recommend using :ref:`Encrypted Secrets `. diff --git a/docs/source-app/glossary/event_loop.rst b/docs/source-app/glossary/event_loop.rst deleted file mode 100644 index 30d1bd3b3acfd..0000000000000 --- a/docs/source-app/glossary/event_loop.rst +++ /dev/null @@ -1,11 +0,0 @@ -########## -Event loop -########## - -Drawing inspiration from modern web frameworks like `React.js `_, the Lightning App runs all flows in an **event loop** (forever), which is triggered several times a second after collecting any works' state change. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/lightning_loop.gif - -When running a Lightning App in the cloud, the ``LightningWork`` run on different machines. LightningWork communicates any state changes to the **event loop** which re-executes the flow with the newly-collected works' state. - -.. _app_event_loop: diff --git a/docs/source-app/glossary/fault_tolerance.rst b/docs/source-app/glossary/fault_tolerance.rst deleted file mode 100644 index b0ee6dfd21102..0000000000000 --- a/docs/source-app/glossary/fault_tolerance.rst +++ /dev/null @@ -1,7 +0,0 @@ -:orphan: - -############### -Fault tolerance -############### - -.. 
note:: documentation under construction diff --git a/docs/source-app/glossary/index.rst b/docs/source-app/glossary/index.rst deleted file mode 100644 index 50e4c1be435d8..0000000000000 --- a/docs/source-app/glossary/index.rst +++ /dev/null @@ -1,155 +0,0 @@ -.. toctree:: - :maxdepth: 1 - :hidden: - - ios_and_android - app_tree - build_config/build_config - command_lines/command_lines - dag - event_loop - environment_variables - secrets - front ends <../workflows/add_web_ui/glossary_front_end> - Lightning app <../core_api/lightning_app/index> - sharing_components - scheduling - storage/storage - restful_api/restful_api - add web ui <../workflows/add_web_ui/glossary_ui> - use_local_lightning - -######## -Glossary -######## - -.. raw:: html - -
-
- -.. displayitem:: - :header: Android Lightning App - :description: Use Lightning with android apps. - :col_css: col-md-12 - :button_link: ios_and_android.html - :height: 100 - -.. displayitem:: - :header: App Components Tree - :description: Learn how components can be nested to form component trees where the LightningFlows are its branches and LightningWorks are its leaves. - :col_css: col-md-12 - :button_link: app_tree.html - :height: 100 - -.. displayitem:: - :header: Build Configuration - :description: Prepare your requirements, add custom build commands or use docker image - :col_css: col-md-12 - :button_link: build_config/build_config.html - :height: 100 - -.. displayitem:: - :header: Command Line Interface (CLI) - :description: Learn about the Lightning CLI - :col_css: col-md-12 - :button_link: command_lines/command_lines.html - :height: 100 - -.. displayitem:: - :header: DAG - :description: Learn about directed acyclic graph, their properties and usage - :col_css: col-md-12 - :button_link: dag.html - :height: 100 - -.. displayitem:: - :header: Event Loop - :description: Learn how the Infinite Event Loop enables high distributed reactivity by triggering after collecting state changes. - :col_css: col-md-12 - :button_link: event_loop.html - :height: 100 - -.. displayitem:: - :header: Environment Variables - :description: Add secrets such as API keys or access tokens - :col_css: col-md-12 - :button_link: environment_variables.html - :height: 100 - -.. displayitem:: - :header: Encrypted Secrets - :description: Learn how to add passwords to your Lightning apps - :col_css: col-md-12 - :button_link: secrets.html - :height: 100 - -.. displayitem:: - :header: Frontend - :description: Customize your App View with any framework you want - :col_css: col-md-12 - :button_link: ../workflows/add_web_ui/glossary_front_end.html - :height: 100 - -.. displayitem:: - :header: iOS Lightning App - :description: Use Lightning with iOS apps. - :col_css: col-md-12 - :button_link: ios_and_android.html - :height: 100 - -.. displayitem:: - :header: Lightning App - :description: A Lightning app is a collection of connected components that form a workflow - :col_css: col-md-12 - :button_link: ../core_api/lightning_app/index.html - :height: 100 - -.. displayitem:: - :header: Mounts - :description: Mount Cloud Data - :col_css: col-md-12 - :button_link: mount.html - :height: 100 - -.. displayitem:: - :header: Sharing Components - :description: Let's create an ecosystem altogether - :col_css: col-md-12 - :button_link: sharing_components.html - :height: 100 - -.. displayitem:: - :header: Scheduling - :description: Orchestrate execution at specific times - :col_css: col-md-12 - :button_link: scheduling.html - :height: 100 - -.. displayitem:: - :header: Storage - :description: Easily share files even across multiple machines - :col_css: col-md-12 - :button_link: storage/storage.html - :height: 100 - -.. displayitem:: - :header: REST API - :description: Learn how to set up a RESTful API endpoint - :col_css: col-md-12 - :button_link: restful_api/restful_api.html - :height: 100 - -.. displayitem:: - :header: UI - :description: Combine multiple frameworks to create your own UI - :col_css: col-md-12 - :button_link: ../workflows/add_web_ui/glossary_ui.html - :height: 100 - -.. 
displayitem::
-   :header: Using a development branch of Lightning on the Cloud
-   :description: Learn how to contribute to the Lightning framework in the cloud
-   :col_css: col-md-12
-   :button_link: use_local_lightning.html
-   :height: 100
diff --git a/docs/source-app/glossary/ios_and_android.rst b/docs/source-app/glossary/ios_and_android.rst
deleted file mode 100644
index 90aeecbc0b141..0000000000000
--- a/docs/source-app/glossary/ios_and_android.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-
-###############################################
-Apple and Android mobile devices with Lightning
-###############################################
-
-Audience: Users who want to develop Lightning Apps for Apple or Android mobile devices.
-
-----
-
-***********************************************************
-Develop a Lightning App for Apple or Android mobile devices
-***********************************************************
-
-There are a couple of ways you can go about building Lightning Apps that work on Apple or Android mobile devices.
-
-Option 1
-^^^^^^^^
-
-You can develop a Lightning App that interacts with an iOS or Android app.
-The ML and backend services live in the Lightning App, but the mobile code (Objective-C/Swift or Android) lives on the mobile devices.
-
-Option 2
-^^^^^^^^
-
-You can build a mobile-first React Lightning App that works on both Apple and Android mobile devices.
-The `InVideo app `_ is a good example of a Lightning App that does just that.
diff --git a/docs/source-app/glossary/lightning_app_overview/index.rst b/docs/source-app/glossary/lightning_app_overview/index.rst
deleted file mode 100644
index 09de273affc50..0000000000000
--- a/docs/source-app/glossary/lightning_app_overview/index.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-:orphan:
-
-###########################
-Lightning Apps Key concepts
-###########################
-
-**Audience:** Users who want to know how the 🤯 magic works under the hood.
-
-----
-
-.. note:: This page is under construction
diff --git a/docs/source-app/glossary/mount.rst b/docs/source-app/glossary/mount.rst
deleted file mode 100644
index a62d72b5b798d..0000000000000
--- a/docs/source-app/glossary/mount.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../workflows/mount_cloud_object_store.rst
diff --git a/docs/source-app/glossary/restful_api/restful_api.rst b/docs/source-app/glossary/restful_api/restful_api.rst
deleted file mode 100644
index 6e04f60c75f1c..0000000000000
--- a/docs/source-app/glossary/restful_api/restful_api.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-:orphan:
-
-###########
-RESTful API
-###########
-
-**Audience:** Users looking to create an API in their App to allow users to activate functionalities from external sources.
-
-----
-
-**********************
-What is a RESTful API?
-**********************
-
-A RESTful API is a set of external URL routes exposed by a server that enable clients to trigger some functionality, such as getting or putting data, uploading files, etc.
-
-This provides great flexibility for users, as they can easily discover the functionality made available by the App Builders.
-
-The Lightning App framework supports the four primary HTTP methods: `GET`, `POST`, `PUT`, `DELETE`.
-
-These methods are guidelines to organize your RESTful services and help users understand your functionality.
-
-* **`GET`:** Reads data from the server.
-* **`POST`:** Creates new resources.
-* **`PUT`:** Updates/replaces existing resources.
-* **`DELETE`:** Deletes resources.
-
-Learn more about `HTTP Methods for RESTful Services here `_.
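-
-To make this concrete, here is a minimal sketch of how such routes can be attached to a Flow. It assumes the ``configure_api`` hook and the HTTP method classes from ``lightning.app.api`` that the guide linked below covers in detail; the route and handler names are purely illustrative:
-
-.. code-block:: python
-
-    from lightning.app import LightningApp, LightningFlow
-    from lightning.app.api import Get, Post
-
-
-    class Flow(LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.counter = 0
-
-        def run(self):
-            pass
-
-        # `GET`: reads data from the server.
-        def read_counter(self) -> int:
-            return self.counter
-
-        # `POST`: creates/updates a resource.
-        def increment_counter(self) -> int:
-            self.counter += 1
-            return self.counter
-
-        def configure_api(self):
-            return [
-                Get(route="/counter", method=self.read_counter),
-                Post(route="/counter", method=self.increment_counter),
-            ]
-
-
-    app = LightningApp(Flow())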
- -The Lightning App framework uses the popular `FastAPI `_ and `Pydantic `_ frameworks under the hood. This means you can use all their features while building your App. - ----- - -********** -Learn more -********** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Develop a RESTful API - :description: Learn how to develop an API for your App. - :col_css: col-md-6 - :button_link: ../../workflows/build_rest_api/index_content.html - :height: 150 - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/scheduling.rst b/docs/source-app/glossary/scheduling.rst deleted file mode 100644 index 0e04dc35c897d..0000000000000 --- a/docs/source-app/glossary/scheduling.rst +++ /dev/null @@ -1,185 +0,0 @@ -:orphan: - -########## -Scheduling -########## - -The Lightning Scheduling system makes it easy to schedule your components execution with any arbitrary conditions. - - ----- - -************************ -Schedule your components -************************ - -The LightningFlow has a ``schedule`` method which can be used to schedule your components. - -.. code-block:: python - - from lightning.app import LightningWork, LightningFlow - from lightning.app.storage import Path - - - class MyFlow(LightningFlow): - - def run(self): - if self.schedule("hourly"): - # run some code once every hour. - - if self.schedule("daily"): - # run some code once day. - - if self.schedule("daily") and anything_else: - # run some code once day if the anything else is also True. - - if self.schedule("2 4 * * mon,fri"): - # defined with cron syntax, run some code at 04:02 on every Monday and Friday. - -Learn more about the cron syntax `here `_ - ----- - -************** -Best Practices -************** - -In the example above, the line ``self.schedule("hourly")`` will return ``True`` for a **single** flow execution every hour. Mathematically, this is known as a dirac. - -1. Instantiate your component under the schedule method and run outside as follows: - -.. code-block:: python - - from lightning.app import LightningFlow - from lightning.app.structures import List - - class ScheduledDAG(LightningFlow): - def __init__(self): - super().__init__() - self.list = List() - - def run(self): - if self.schedule("hourly"): - # dynamically instantiate - # don't forget to always attach - # your components to the flow !!! - self.list.append(MyDAGFlow(...)) - - # run all dags, but the completed ones - # are cached and don't re-execute. - for dag in self.list: - dag.run() - - -2. Run a single work under the schedule with different arguments to have it re-run. - -.. code-block:: python - - from lightning.app import LightningFlow - from time import time - - class ScheduledDAG(LightningFlow): - def __init__(self): - super().__init__() - self.data_processor = DataProcessorWork(...) - - def run(self): - ... - if self.schedule("hourly"): - self.data_processor.run(trigger_time=time()) - - -3. Capture the event in the state and execute your sequential works outside. - -.. code-block:: python - - from lightning.app import LightningFlow - from time import time - - class ScheduledDAG(LightningFlow): - def __init__(self): - super().__init__() - self.should_execute = False - self.data_processor = DataProcessorWork(...) - self.training_work = KerasTrainingWork(...) - - def run(self): - ... - if self.schedule("hourly"): - self.should_execute = True - - # Runs in 10 min - if self.should_execute: - # Runs in 5 min - self.data_processor.run(trigger_time=time()) - if self.data_processor.has_succeeded: - # Runs in 5 min - self.training_work.run(self.data_processor.data) - if self.training_work.has_succeeded: - self.should_execute = False - ----- - -*********** -Limitations -*********** - -As stated above, the schedule acts as a dirac and is **True** for a single flow execution. -Therefore, sequential works execution under the schedule won't work as they don't complete within a single flow execution. - -Here is an example of something which **WON'T** work: - -.. 
code-block:: python - - from lightning.app import LightningFlow - from time import time - - class ScheduledDAG(LightningFlow): - def __init__(self): - super().__init__() - self.data_processor = DataProcessorWork(...) - self.training_work = KerasTrainingWork(...) - - def run(self): - ... - if self.schedule("hourly"): - # This finishes 5 min later - self.data_processor.run(trigger_time=time()) - if self.data_processor.has_succeeded: - # This will never be reached as the - # data processor will keep processing forever... - self.training_work.run(self.data_processor.data) - ----- - -************************** -Frequently Asked Questions -************************** - -- **Q: Can I use multiple nested scheduler?** No, as they might cancel themselves out, but you can capture the event of one to trigger the next one. - -- **Q: Can I use any arbitrary logic to schedule?** Yes, this design enables absolute flexibility, but you need to be careful to avoid bad practices. - ----- - -******** -Examples -******** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Build a DAG - :description: Learn how to schedule a DAG execution - :col_css: col-md-4 - :button_link: ../examples/dag/dag.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/secrets.rst b/docs/source-app/glossary/secrets.rst deleted file mode 100644 index 95a0d564c648d..0000000000000 --- a/docs/source-app/glossary/secrets.rst +++ /dev/null @@ -1,74 +0,0 @@ -.. _secrets: - -################# -Encrypted Secrets -################# - -Encrypted Secrets allow you to pass private data to your apps, like API keys, access tokens, database passwords, or other credentials, in a secure way without exposing them in your code. -Secrets provide you with a secure way to store this data in a way that is accessible to Apps so that they can authenticate third-party services/solutions. - -.. tip:: - For non-sensitive configuration values, we recommend using :ref:`plain-text Environment Variables `. - -************ -Add a secret -************ - -Add the secret to your profile on lightning.ai. -Log in to your lightning.ai account > **Profile** > **Secrets** tab > click the **+New** button. -Provide a name and value to your secret, for example, name could be "github_api_token". - -.. note:: - Secret names must start with a letter and can only contain letters, numbers, dashes, and periods. The Secret names must comply with `RFC1123 naming conventions `_. The Secret value has no restrictions. - -.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning//encrypted_secrets_login.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning//encrypted_secrets_login.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - -************ -Use a secret -************ - -1. Add an environment variable to your app to read the secret. For example, add an "api_token" environment variable: - -.. code:: python - - import os - - component.connect(api_token=os.environ["api_token"]) - -2. Pass the secret to your app run with the following command: - -.. code:: bash - - lightning_app run app app.py --cloud --secret = - -In this example, the command would be: - -.. code:: bash - - lightning_app run app app.py --cloud --secret api_token=github_api_token - - -The ``--secret`` option can be used for multiple Secrets, and alongside the ``--env`` option. - -Here's an example: - -.. code:: bash - - lightning_app run app app.py --cloud --env FOO=bar --secret MY_APP_SECRET=my-secret --secret ANOTHER_SECRET=another-secret - - ----- - -****************** -How does this work -****************** - -When a Lightning App (App) **runs in the cloud**, a Secret can be exposed to the App using environment variables. -The value of the Secret is encrypted in the Lightning.ai database, and is only decrypted and accessible to -LightningFlow (Flow) or LightningWork (Work) processes in the cloud (when you use the ``--cloud`` option running your App). diff --git a/docs/source-app/glossary/sharing_components.rst b/docs/source-app/glossary/sharing_components.rst deleted file mode 100644 index 2426bb43d4469..0000000000000 --- a/docs/source-app/glossary/sharing_components.rst +++ /dev/null @@ -1,50 +0,0 @@ -##################### -Sharing my components -##################### - -**Audience:** Users who want to know how to share component. - -**Level:** Basic - ----- - -******************************************** -Why should I consider sharing my components? -******************************************** - -Lightning is community driven and its core objective is to make AI accessible to everyone. - -By creating components and sharing them with everyone else, the barrier to entry will go down. 
-
-----
-
-************************************
-How should I organize my components?
-************************************
-
-By design, Lightning components are nested to form component trees, where the ``LightningFlows`` are the branches and the ``LightningWorks`` are the leaves.
-
-This design has two primary advantages:
-
-* It helps users organize and maintain their code with more ease.
-* It helps create an ecosystem with **reusable** components.
-
-
-Now, imagine you have implemented a **KerasScriptRunner** component for training any `Keras `_ model with the `Tensorboard UI `_ integrated.
-
-Here are the best-practice steps to follow before sharing the component:
-
-* **Testing**: Ensure your component is well tested by following the :doc:`../testing` guide.
-* **Documented**: Ensure your component has a docstring and comes with some usage explanations.
-
-.. Note:: As a Lightning user, it helps to implement your components as if someone else were going to use them.
-
-----
-
-*****************************************
-How should I proceed to share components?
-*****************************************
-
-Once your component is ready, create a *PyPI* package with your own library, and then it can be reused by anyone else.
-
-Here is a `Component Template `_ from `William Falcon `_ to guide your component development.
diff --git a/docs/source-app/glossary/storage/differences.rst b/docs/source-app/glossary/storage/differences.rst
deleted file mode 100644
index ed45edd069632..0000000000000
--- a/docs/source-app/glossary/storage/differences.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-:orphan:
-
-##################################
-Differences between Drive and Path
-##################################
-
-**Audience:** Users who want to share files between components.
-
-
-The Lightning Storage system makes it easy to share files between LightningWorks so you can run your app both locally and in the cloud without changing the code.
-
-
-Lightning Storage provides two solutions, :class:`~lightning.app.storage.drive.Drive` and :class:`~lightning.app.storage.path.Path`, to deal with files locally and in the cloud alike.
-
-
-----
-
-*****************
-What is a Drive?
-*****************
-
-The Drive object provides a central place for your components to share data.
-
-The Drive acts as an isolated folder, and any component can access it by knowing its name.
-
-Your components can put, list, get, and delete files from and to the Drive (except LightningFlows).
-
-----
-
-****************
-What is a Path?
-****************
-
-The Path object is a reference to a specific file or directory from a LightningWork and can be used to transfer those files to another LightningWork (one way, from source to destination).
-
-A good mental representation of the Path object's usage is the `relay race `_.
-To make a transfer, the LightningWork Receiver asks (e.g., when the Path object is passed by the flow to the Receiver)
-for a copy of the files (baton) owned by their LightningWork Producer (e.g., the work that created the files).
-
-----
-
-********************************
-When should I use Drive vs Path?
-********************************
-
-The Drive should be used when you want to easily share data between components, but the Path enables you to create cleaner shareable
-components that expose some files to be transferred (like an HPO component sharing the best model weights) for anyone else to use.
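-
-To make the contrast concrete, here is a minimal sketch of both styles side by side. It uses the APIs described on the pages linked below; the component and file names are purely illustrative:
-
-.. code-block:: python
-
-    from lightning.app import LightningWork
-    from lightning.app.storage import Drive, Path
-
-
-    class DriveProducer(LightningWork):
-        def __init__(self):
-            super().__init__()
-            # Any component that knows the name "lit://shared" can attach this Drive.
-            self.drive = Drive("lit://shared")
-
-        def run(self):
-            with open("weights.ckpt", "w") as f:
-                f.write("...")
-            self.drive.put("weights.ckpt")  # Now visible to other components.
-
-
-    class PathProducer(LightningWork):
-        def __init__(self):
-            super().__init__()
-            # Only components that receive this reference (via the flow)
-            # can request a copy of the file.
-            self.best_model_path = Path("weights.ckpt")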
-
-The Drive is more intuitive and easier to get onboarded with, but in more advanced use cases, you might appreciate the Path object,
-which makes unidirectional file transfers simpler.
-
-----
-
-.. raw:: html
-
-
- -.. displayitem:: - :header: The Drive Object. - :description: Put, List and Get Files From a Shared Drive Disk. - :col_css: col-md-4 - :button_link: drive.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: The Path Object. - :description: Transfer Files From One Component to Another by Reference. - :col_css: col-md-4 - :button_link: path.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/storage/drive.rst b/docs/source-app/glossary/storage/drive.rst deleted file mode 100644 index dffdb979f305a..0000000000000 --- a/docs/source-app/glossary/storage/drive.rst +++ /dev/null @@ -1,13 +0,0 @@ -:orphan: - -.. _drive_storage: - -############# -Drive Storage -############# - -**Audience:** Users who want to put, list, and get files from a shared disk space. - ----- - -.. include:: ../../glossary/storage/drive_content_old.rst diff --git a/docs/source-app/glossary/storage/drive_content.rst b/docs/source-app/glossary/storage/drive_content.rst deleted file mode 100644 index ff1540e8ce6f6..0000000000000 --- a/docs/source-app/glossary/storage/drive_content.rst +++ /dev/null @@ -1,223 +0,0 @@ -:orphan: - -************************** -What are Lightning Drives? -************************** - -Lightning Drives are shared app storage that allow you to share files between :doc:`LightningWork (Work) <../../core_api/lightning_work/index>` components, so that you distributed components can share files when running on the cloud. Using drives, you can run your Lightning App both locally and in the cloud without changing the code. - -The Drive object provides a central place for your components to share data. - -The Drive acts as an isolated folder and any component can access it by knowing its name. - -We currently support two types of Drives: Lightning-managed (``lit://``) and S3 (``s3://``). - -+-----------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ -| Lightning-managed (``lit://``) | Allows read-write operations and are accessible through the Drive API from a Work. | -| | | -| | They allow your components to put, list, get, and delete files from and to the Drive (except LightningFlows). | -+-----------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ -| S3 (``s3://``) | S3 is AWS S3 storage mounted at a filesystem mount point. S3 is read-only (for now) and its primary purpose is | -| | to give you a permanent location to access your training data. | -| | | -| | They allow your components to list and get files located on the Drive. | -+-----------------------------------+-------------------------------------------------------------------------------------------------------------------------------+ - ----- - -********************** -What Drives do for you -********************** - -Think of every instance of the Drive object acting like a Google Drive or like Dropbox. - -By sharing the Drive between components through the LightningFlow, -several components can have a shared place to read (S3 Drives) or read and write (Lightning-managed Drives) files from. - -S3 Drive Limitations -^^^^^^^^^^^^^^^^^^^^ - -These limitations only apply to S3 Drives: - -* There is no top level “shareable” S3 drive object. Each S3 Drive is owned by a particular Work. However, it’s possible to create a Drive with the same location across multiple Works. - -* S3 buckets cannot be mounted as Drives once a Work has been instantiated. The `Drive` object must be initialized passed to a Work at creation time. - -* Whenever a Drive is mounted to a Work, an indexing process will be done again for the provided S3 bucket. This may lead to performance issues with particularly large S3 buckets. For context, 1M files with 2-3 levels of nesting takes less than 1 second to index. 
- ----- - -************** -Create a Drive -************** - -In order to create a Drive, you simply need to pass its name with the prefix ``lit://`` or ``s3://``. - -.. note:: We do not support mounting single objects for S3 buckets, so there must be a trailing `/` in the s3:// URL. For example: ``s3://foo/bar/``. - -.. code-block:: python - - from lightning.app.storage import Drive - - # The identifier of this Drive is ``drive_1`` - # Note: You need to add Lightning protocol ``lit://`` as a prefix. - - drive_1 = Drive("lit://drive_1") - - # The identifier of this Drive is ``drive_2`` - drive_2 = Drive("s3://drive_2/") - -Any component can create a drive object for ``lit://`` Drives. - -.. code-block:: python - - from lightning.app import LightningFlow, LightningWork - from lightning.app.storage import Drive - - - class Flow(LightningFlow): - def __init__(self): - super().__init__() - self.drive_1 = Drive("lit://drive_1") - - def run(self): - ... - - - class Work(LightningWork): - def __init__(self): - super().__init__() - self.drive_1 = Drive("lit://drive_1") - - def run(self): - ... - ----- - -***************************** -Supported actions with Drives -***************************** - -A Lightning-managed Drive supports put, list, get, and delete actions. - -An S3 Drive supports list and get actions (for now). - -.. code-block:: python - - from lightning.app.storage import Drive - - drive = Drive("lit://drive") - - drive.list(".") # Returns [] as empty - - # Created file. - with open("a.txt", "w") as f: - f.write("Hello World !") - - drive.put("a.txt") - - drive.list(".") # Returns ["a.txt"] as the file copied in the Drive during the put action. - - drive.get("a.txt") # Get the file into the current worker - - drive.delete("a.txt") - - drive.list(".") # Returns [] as empty - ----- - -********************************** -Component interactions with Drives -********************************** - -Here is an illustrated code example on how to create drives within Works. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/drive_2.png - -.. code-block:: python - - from lightning.app import LightningFlow, LightningWork, LightningApp - from lightning.app.storage import Drive - - - class Work_A(LightningWork): - def __init__(self): - super().__init__() - # The identifier of the Drive is ``drive_1`` - # Note: You need to add Lightning protocol ``lit://`` as a prefix. - self.drive_1 = Drive("lit://drive_1") - - def run(self): - # 1. Create a file. - with open("a.txt", "w") as f: - f.write("Hello World !") - - # 2. Put the file into the drive. - self.drive_1.put("a.txt") - - - class Work_B(LightningWork): - def __init__(self): - super().__init__() - - # Note: Work B has access 2 drives. - - # The identifier of this Drive is ``drive_1`` - self.drive_1 = Drive("lit://drive_1") - # The identifier of this Drive is ``drive_2`` - self.drive_2 = Drive("lit://drive_2") - - def run(self): - # 1. Create a file. - with open("b.txt", "w") as f: - f.write("Hello World !") - - # 2. Put the file into both drives. - self.drive_1.put("b.txt") - self.drive_2.put("b.txt") - - - class Work_C(LightningWork): - def __init__(self): - super().__init__() - self.drive_2 = Drive("lit://drive_2") - - def run(self): - # 1. Create a file. - with open("c.txt", "w") as f: - f.write("Hello World !") - - # 2. Put the file into the drive. 
- self.drive_2.put("c.txt") - ----- - -************************* -Transfer files with Drive -************************* - -In the example below, the Drive is created by the Flow and passed to its Works. - -The ``Work_1`` put a file **a.txt** in the **Drive("lit://this_drive_id")** and the ``Work_2`` can list and get the **a.txt** file from it. - -.. literalinclude:: ../../../../examples/app/drive/app.py - ----- - -.. raw:: html - -
-
- -.. displayitem:: - :header: Learn about the Path Object. - :description: Transfer Files From One Component to Another by Reference. - :col_css: col-md-4 - :button_link: path.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/storage/drive_content_old.rst b/docs/source-app/glossary/storage/drive_content_old.rst deleted file mode 100644 index 3c37f883fd31d..0000000000000 --- a/docs/source-app/glossary/storage/drive_content_old.rst +++ /dev/null @@ -1,199 +0,0 @@ -:orphan: - - -************ -About Drives -************ - -Lightning Drive storage makes it easy to share files between LightningWorks so you can run your Lightning App both locally and in the cloud without changing the code. - -The Drive object provides a central place for your components to share data. - -The Drive acts as an isolate folder and any component can access it by knowing its name. - -Your components can put, list, get, and delete files from and to the Drive (except LightningFlows). - ----- - -*********************** -What Drive does for you -*********************** - -Think of every instance of the Drive object acting like a Google Drive or like Dropbox. - -By sharing the Drive between components through the LightningFlow, -several components can have a shared place to read and write files from. - ----- - -************** -Create a Drive -************** - -In order to create a Drive, you simply need to pass its name with the prefix ``lit://`` as follows: - -.. code-block:: python - - from lightning.app.storage import Drive - - # The identifier of this Drive is ``drive_1`` - # Note: You need to add Lightning protocol ``lit://`` as a prefix. - - drive_1 = Drive("lit://drive_1") - - # The identifier of this Drive is ``drive_2`` - drive_2 = Drive("lit://drive_2") - -Any components can create a drive object. - -.. code-block:: python - - from lightning.app import LightningFlow, LightningWork - from lightning.app.storage import Drive - - - class Flow(LightningFlow): - def __init__(self): - super().__init__() - self.drive_1 = Drive("lit://drive_1") - - def run(self): - ... - - - class Work(LightningWork): - def __init__(self): - super().__init__() - self.drive_1 = Drive("lit://drive_1") - - def run(self): - ... - ----- - -***************************** -Supported actions with Drives -***************************** - -A Drive supports put, list, get, and delete actions. - -.. code-block:: python - - from lightning.app.storage import Drive - - drive = Drive("lit://drive") - - drive.list(".") # Returns [] as empty - - # Created file. - with open("a.txt", "w") as f: - f.write("Hello World !") - - drive.put("a.txt") - - drive.list(".") # Returns ["a.txt"] as the file copied in the Drive during the put action. - - drive.get("a.txt") # Get the file into the current worker - - drive.delete("a.txt") - - drive.list(".") # Returns [] as empty - ----- - -********************************** -Component interactions with Drives -********************************** - -Here is an illustrated code example on how to create drives within works. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/drive_2.png - -.. code-block:: python - - from lightning.app import LightningFlow, LightningWork, LightningApp - from lightning.app.storage import Drive - - - class Work_A(LightningWork): - def __init__(self): - super().__init__() - # The identifier of the Drive is ``drive_1`` - # Note: You need to add Lightning protocol ``lit://`` as a prefix. - self.drive_1 = Drive("lit://drive_1") - - def run(self): - # 1. Create a file. - with open("a.txt", "w") as f: - f.write("Hello World !") - - # 2. Put the file into the drive. 
- self.drive_1.put("a.txt") - - - class Work_B(LightningWork): - def __init__(self): - super().__init__() - - # Note: Work B has access 2 drives. - - # The identifier of this Drive is ``drive_1`` - self.drive_1 = Drive("lit://drive_1") - # The identifier of this Drive is ``drive_2`` - self.drive_2 = Drive("lit://drive_2") - - def run(self): - # 1. Create a file. - with open("b.txt", "w") as f: - f.write("Hello World !") - - # 2. Put the file into both drives. - self.drive_1.put("b.txt") - self.drive_2.put("b.txt") - - - class Work_C(LightningWork): - def __init__(self): - super().__init__() - self.drive_2 = Drive("lit://drive_2") - - def run(self): - # 1. Create a file. - with open("c.txt", "w") as f: - f.write("Hello World !") - - # 2. Put the file into the drive. - self.drive_2.put("c.txt") - ----- - -***************************** -Transfer files with Drive -***************************** - -In the example below, the Drive is created by the flow and passed to its LightningWork's. - -The ``Work_1`` put a file **a.txt** in the **Drive("lit://this_drive_id")** and the ``Work_2`` can list and get the **a.txt** file from it. - -.. literalinclude:: ../../../../examples/app/drive/app.py - - ----- - -.. raw:: html - -
-
- -.. displayitem:: - :header: Learn about the Path Object. - :description: Transfer Files From One Component to Another by Reference. - :col_css: col-md-4 - :button_link: path.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/storage/path.rst b/docs/source-app/glossary/storage/path.rst
deleted file mode 100644
index 4cf41c1453aa4..0000000000000
--- a/docs/source-app/glossary/storage/path.rst
+++ /dev/null
@@ -1,326 +0,0 @@
-:orphan:
-
-############
-Path Storage
-############
-
-**Audience:** Users who want to share files between components.
-
-
-The Lightning Storage system makes it easy to share files between LightningWorks so you can run your app both locally and in the cloud without changing the code.
-
-----
-
-**********************
-What is a Path Object?
-**********************
-
-The Path object is a reference to a specific file or directory from a LightningWork and can be used to transfer those files to another LightningWork (one way, from source to destination).
-
-A good mental model for the Path Object is the `relay race `_.
-To make a transfer, the receiver asks (e.g., when the Path object is passed by the flow to the receiver)
-for a copy of the files (the baton) owned by their producer (e.g., the LightningWork that created the files).
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/path2.png
-
-******************************************
-How does the Path Object work internally?
-******************************************
-
-To understand the Path Object internals, let's first answer this question: How do you locate a specific file or folder within a distributed system made of multiple machines?
-
-You need to know on which machine the file or folder is located (e.g., the LightningWork name uniquely identifies its own machine in the cloud), and
-then you need the local path of the file or folder on that machine.
-
-In simple words, the Lightning Path augments the :class:`pathlib.Path` object by tracking on which machine the file or folder is located.
-
-----
-
-*************************
-When to use Path storage?
-*************************
-
-In the cloud, every :class:`~lightning.app.core.work.LightningWork` runs on a separate machine with its own filesystem.
-This means files in one Work cannot be directly accessed in another, like they can be when running the app locally.
-But with Lightning Storage, this is easy: simply declare which files need to be shared and Lightning will take care of the rest.
-
-.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/path.mp4
-    :width: 600
-    :autoplay:
-    :loop:
-    :muted:
-
-
-----
-
-
-***********************************
-Tell Lightning where your files are
-***********************************
-
-Convert every filesystem path you want to share with other LightningWorks by adding ``lit://`` in front of it.
-
-.. code-block:: python
-
-    import os
-
-    from lightning.app import LightningWork
-    from lightning.app.storage import Path
-
-
-    class SourceWork(LightningWork):
-        def __init__(self):
-            super().__init__()
-            self.checkpoint_dir = None
-
-        def run(self):
-            # Normally you would do:
-            # self.checkpoint_dir = "outputs/checkpoints"
-            # os.makedirs("outputs/checkpoints")
-            # ...
-
-            # In Lightning, do:
-            self.checkpoint_dir = "lit://outputs/checkpoints"
-            os.makedirs(self.checkpoint_dir)
-            ...
-
-
-Under the hood, we convert this string to a :class:`~lightning.app.storage.path.Path` object, which is a drop-in replacement for :class:`pathlib.Path`, meaning it will work with :mod:`os`, :mod:`os.path` and :mod:`pathlib` filesystem operations out of the box!
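-
-For example, here is a minimal sketch of that drop-in behavior (the file names are illustrative, and this assumes the code runs inside a LightningWork's ``run`` method):
-
-.. code-block:: python
-
-    import os
-    import pathlib
-
-    # The string is converted to a Lightning Path behind the scenes
-    self.checkpoint_dir = "lit://outputs/checkpoints"
-
-    os.makedirs(self.checkpoint_dir, exist_ok=True)         # works with os
-    ckpt = os.path.join(self.checkpoint_dir, "best.ckpt")   # works with os.path
-    print(pathlib.Path(ckpt).name)                          # works with pathlib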
- - ----- - - -**************************** -Access files in another Work -**************************** - -Accessing files from another LightningWork is as easy as handing the path over by reference. -For example, share a directory by passing it as an input to the run method of the destination work: - -.. code-block:: python - :emphasize-lines: 12 - - from lightning.app import LightningFlow - - - class Flow(LightningFlow): - def __init__(self): - super().__init__() - self.source = SourceWork() - self.destination = DestinationWork() - - def run(self): - self.source.run() - # Pass the Path reference from one work to another - self.destination.run(self.source.checkpoint_dir) - - -When the destination Work starts, Lightning will automatically transfer the files to its filesystem (if they exist on the other end): - -.. code-block:: python - - class DestinationWork(LightningWork): - def run(self, checkpoint_dir): - # The directory is now accessible inside this Work - files = os.listdir(checkpoint_dir) - ... - - -The automatic transfer only happens if the referenced files already exist in the originating LightningWork and it will overwrite any files that already exist locally. -In all other cases, you can trigger the transfer manually. - - ----- - - -****************** -Get files manually -****************** - -If you need to access files at a specific time or transfer them multiple times, use ``.get()`` method: - -.. code-block:: python - - def run(self, checkpoint_dir): - ... - # Make the directory available - checkpoint_dir.get() - - # If the path already exists locally, you can force overwriting it - checkpoint_dir.get(overwrite=True) - - files = os.listdir(checkpoint_dir) - ... - - -Multiple calls to the ``.get()`` method will always result in file transfers, regardless of whether the files have changed or not. -If the path does not exist remotely, it will raise a ``FileNotFoundError``. -If you need to handle this case, the Path also offers a method to check if files exist remotely. - ----- - - -******************************** -Check if a file or folder exists -******************************** - -You can check if a path exists locally or remotely in the source Work using the ``.exists_local()`` and ``.exists_remote()`` methods: - -.. code-block:: python - - def run(self, checkpoint_dir): - if checkpoint_dir.exists_remote(): - # Get the file only if it exists in the source Work - checkpoint_dir.get() - - # OR - - if checkpoint_dir.exists_local(): - # Do something with the file if it exists locally - files = os.listdir(checkpoint_dir) - - ----- - - -************* -Persist files -************* - -If a LightningWork finishes or stops due to an interruption (e.g., due to insufficient credits), the filesystem and all files in it get deleted (unless running locally). -Lightning makes sure all Paths that are part of the state get stored and made accessible to the other Works that still need these files. - -.. code-block:: python - - from lightning.app.storage import Path - - - class Work(LightningWork): - def __init__(self): - super().__init__() - # The files in this path will be saved as an artifact when the Work finishes - self.checkpoint_dir = "lit://outputs/checkpoints" - - # The files in this path WON'T be saved because it is not declared as a Lightning Path - self.log_dir = "outputs/logs" - - ----- - - -********************************* -Example: Share a model checkpoint -********************************* - -A common workflow in ML is to use a checkpoint created by another component. 
-First, define a component that saves a checkpoint:
-
-.. code:: python
-    :emphasize-lines: 14-18
-
-    from lightning.app import LightningApp, LightningFlow, LightningWork
-    from lightning.app.storage import Path
-    import torch
-    import os
-
-
-    class ModelTraining(LightningWork):
-        def __init__(self, *args, **kwargs):
-            super().__init__(*args, **kwargs)
-            self.checkpoint_dir = "lit://outputs/checkpoints"
-
-        def run(self):
-            # create the checkpoint directory
-            os.makedirs(self.checkpoint_dir, exist_ok=True)
-            # make fake checkpoints
-            checkpoint_1 = torch.tensor([0, 1, 2, 3, 4])
-            checkpoint_2 = torch.tensor([0, 1, 2, 3, 4])
-            torch.save(checkpoint_1, os.path.join(self.checkpoint_dir, "checkpoint_1.ckpt"))
-            torch.save(checkpoint_2, os.path.join(self.checkpoint_dir, "checkpoint_2.ckpt"))
-
-
-Next, define a component that needs the checkpoints:
-
-.. code:: python
-    :emphasize-lines: 4, 7
-
-    class ModelDeploy(LightningWork):
-        def __init__(self, *args, **kwargs):
-            super().__init__(*args, **kwargs)
-
-        def run(self, checkpoint_dir):
-            ckpts = os.listdir(checkpoint_dir)
-            checkpoint_1 = torch.load(os.path.join(checkpoint_dir, ckpts[0]))
-            checkpoint_2 = torch.load(os.path.join(checkpoint_dir, ckpts[1]))
-
-Link both components via a parent component:
-
-.. code:: python
-    :emphasize-lines: 7
-
-    class Flow(LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.train = ModelTraining()
-
-            # the deploy component receives the checkpoint path in its run method
-            self.deploy = ModelDeploy()
-
-        def run(self):
-            self.train.run()
-            self.deploy.run(checkpoint_dir=self.train.checkpoint_dir)
-
-
-    app = LightningApp(Flow())
-
-
-----
-
-**************************
-Frequently Asked Questions
-**************************
-
-- **Q: Can files in a LightningWork be accessed inside the LightningFlow too?**
-
-  No. The LightningFlow is intentionally designed not to perform filesystem operations or computations; its sole job is to orchestrate Flows and Works.
-
-- **Q: Is it possible to reference any file using the Lightning lit:// path notation?**
-
-  Yes, but only files for which the app has write permissions can be copied from Work to Work (apps don't run with root privileges).
-
-- **Q: Can I access the Lightning Storage in my UI (Streamlit, Web, ...)?**
-
-  This is currently not supported but will be in the future.
-
-- **Q: Should I define my lit:// path in the __init__ or the run method?**
-
-  You can declare a Lightning path anywhere you'd like. However, the ``.get()`` and ``.exists_*()`` methods only work inside the run method of a LightningWork.
-
-- **Q: How often does Lightning synchronize the files between my Works?**
-
-  Lightning does not synchronize the files between Works. It only transfers the files once, when the Work's ``run`` method starts.
-  But you can call ``Path.get()`` as many times as you wish to transfer the latest file into the current Work.
-
-- **Q: Does Lightning provide me direct access to the shared cloud folder?**
-
-  No, and this is on purpose. This restriction forces developers to build modular components that can be shared and integrated
-  into apps easily. This would be much harder to achieve if file paths in these components referenced a global shared storage.
-
-----
-
-.. raw:: html
-
-
- -.. displayitem:: - :header: Learn about the Drive Object. - :description: Put, List and Get Files From a Shared Drive Disk. - :col_css: col-md-4 - :button_link: drive.html - :height: 180 - :tag: Basic - -.. raw:: html - -
-
diff --git a/docs/source-app/glossary/storage/storage.rst b/docs/source-app/glossary/storage/storage.rst deleted file mode 100644 index af115a813fc80..0000000000000 --- a/docs/source-app/glossary/storage/storage.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. _storage: - -####### -Storage -####### - -**Audience:** Users who want to share files between components. - - -The Lightning Storage system makes it easy to share files between LightningWork so you can run your app both locally and in the cloud without changing the code. - - -Lightning storage provides two solutions :class:`~lightning.app.storage.drive.Drive` and :class:`~lightning.app.storage.path.Path` to deal with files locally and in the cloud likewise. - - ----- - -.. raw:: html - -
-
- -.. displayitem:: - :header: Learn about the differences between Drive vs Path. - :description: Learn about their differences. - :col_css: col-md-4 - :button_link: differences.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: The Drive Object. - :description: Put, List and Get Files From a Shared Drive Disk. - :col_css: col-md-4 - :button_link: drive.html - :height: 180 - :tag: Basic - -.. displayitem:: - :header: The Path Object. - :description: Transfer Files From One Component to Another by Reference. - :col_css: col-md-4 - :button_link: path.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
- - ----- - -******** -Examples -******** - - - -.. raw:: html - -
-
- -.. displayitem:: - :header: Build a File Server - :description: Learn how to use Drive to upload / download files to your app. - :col_css: col-md-4 - :button_link: ../../examples/file_server/file_server.html - :height: 180 - :tag: Intermediate - -.. raw:: html - -
-
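-
-----
-
-In a nutshell, here is a minimal sketch contrasting the two solutions (names like ``my_drive`` and ``a.txt`` are illustrative):
-
-.. code-block:: python
-
-    from lightning.app.storage import Drive
-
-    # Drive: an explicit, shared folder that any Work can read and write
-    drive = Drive("lit://my_drive")
-    drive.put("a.txt")       # upload a local file into the Drive
-    print(drive.list("."))   # ["a.txt"]
-    drive.get("a.txt")       # download it into the current Work
-
-    # Path: a one-way reference passed from a source Work to a destination Work
-    # through the flow, e.g. self.checkpoint_dir = "lit://outputs/checkpoints"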
diff --git a/docs/source-app/glossary/use_local_lightning.rst b/docs/source-app/glossary/use_local_lightning.rst
deleted file mode 100644
index 1efc8e730e97a..0000000000000
--- a/docs/source-app/glossary/use_local_lightning.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-#################################################################
-How to run an app on the cloud with a local version of Lightning
-#################################################################
-
-The Lightning Cloud uses the latest Lightning release by default. However, you might want to run your app with some local changes you've made to the Lightning framework. To use your local version of Lightning on the cloud, set the ``PACKAGE_LIGHTNING`` environment variable:
-
-.. code-block:: bash
-
-    git clone https://github.com/Lightning-AI/lightning.git
-    cd lightning
-    pip install -e .
-    export PACKAGE_LIGHTNING=1 # <- this is the magic to use your version (not mainstream PyPI)!
-    lightning_app run app app.py --cloud
-
-By setting ``PACKAGE_LIGHTNING=1``, Lightning packages the Lightning source code in your local directory in addition to your app source code and uploads both to the cloud.
diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst
deleted file mode 100644
index fb4258ad28e3a..0000000000000
--- a/docs/source-app/index.rst
+++ /dev/null
@@ -1,153 +0,0 @@
-.. lightning documentation master file, created by
-   sphinx-quickstart on Sat Sep 19 16:37:02 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-#######################
-Welcome to ⚡ Lightning
-#######################
-Build models, ML components and full-stack AI apps ⚡ *Lightning fast*.
-
-**Featured examples of what you can do with Lightning:**
-
-|
-
-.. raw:: html
-
-
- -.. app_card:: - :title: Develop and Train - :description: Train a model (32 GPUs) - :width: 280 - :image: https://lightning-ai-docs.s3.amazonaws.com/develop_n_train_v1.jpg - :target: levels/basic/real_lightning_component_implementations.html#ex-pytorch-lightning-trainer - :preview: levels/basic/real_lightning_component_implementations.html#ex-pytorch-lightning-trainer - :tags: Training - -.. app_card:: - :title: Serve and deploy - :description: Develop a Model Server - :width: 280 - :image: https://lightning-ai-docs.s3.amazonaws.com/serve_n_deploy_v1.jpg - :target: examples/model_server_app/model_server_app.html - :preview: examples/model_server_app/model_server_app.html - :tags: Serving - -.. app_card:: - :title: Scale and build a product - :description: Production-ready generative AI app - :width: 280 - :app_id: HvUwbEG90E - :image: https://lightning-ai-docs.s3.amazonaws.com/scale_n_build_v1.jpg - :target: https://lightning.ai/app/HvUwbEG90E-Muse - :tags: AI App - -.. raw:: html - -
-
- ----- - -******************************** -Build self-contained, components -******************************** -Use Lightning, the hyper-minimalistic framework, to build machine learning components that can plug into existing ML workflows. -A Lightning component organizes arbitrary code to run on the cloud, manage its own infrastructure, cloud costs, networking, and more. -Focus on component logic and not engineering. - -Use components on their own, or compose them into full-stack AI apps with our next-generation Lightning orchestrator. - -.. raw:: html - -
- -
- -| - -| - -**Run an example component on the cloud**: - -.. include:: ./levels/basic/hero_components.rst - -| - -Components run the same on the cloud and locally on your choice of hardware. - -.. lit_tabs:: - :code_files: landing_app_run.bash - :highlights: 5 - :height: 150px - :code_only: True - -Explore pre-built community components in `our gallery `_. - -| - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Get started - :description: Learn to build Lightning components step-by-step. - :col_css: col-md-12 - :button_link: levels/basic/index.html - :height: 160 - :tag: 10 minutes - -.. raw:: html - -
-
- -.. raw:: html - -
- -.. toctree:: - :maxdepth: 1 - :caption: Home - - self - Install - -.. toctree:: - :maxdepth: 1 - :caption: Get started in steps - - Basic - Intermediate - Advanced - -.. toctree:: - :maxdepth: 1 - :caption: Core API Reference - - LightningApp - LightningFlow - LightningWork - -.. toctree:: - :maxdepth: 1 - :caption: Addons API Reference - - api_reference/components - api_reference/frontend - api_reference/runners - api_reference/storage - -.. toctree:: - :maxdepth: 1 - :caption: More - - Examples - Glossary - How-to diff --git a/docs/source-app/install/install_beginner.rst b/docs/source-app/install/install_beginner.rst deleted file mode 100644 index f690ef74e7d2f..0000000000000 --- a/docs/source-app/install/install_beginner.rst +++ /dev/null @@ -1,117 +0,0 @@ -:orphan: - -.. _install_beginner: - -############################# -What is a virtual environment -############################# -A virtual environment keeps the packages you install isolated from the rest of your system. -This allows you to work on multiple projects that have different and potentially conflicting requirements, and it -keeps your system Python installation clean. - -.. raw:: html - - - ----- - -We will describe two choices here, pick one: - - -1. :ref:`Python virtualenv `. -2. :ref:`Conda virtual environment `. - ----- - -.. _python-virtualenv: - -******************** -1. Python Virtualenv -******************** - -First, make sure that you have Python 3.8+ installed on your system. - -.. code-block:: bash - - python3 --version - -If you can't run the command above or it returns a version older than 3.8, -`install the latest version of Python `_. -After installing it, make sure you can run the above command without errors. - ----- - -Creating a Virtual Environment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When starting with a new Python project, you typically want to create a new Python virtual environment. -Navigate to the location of your project and run the following command: - -.. code-block:: bash - - python3 -m venv lightning - -The name of the environment here is *lightning* but you can choose any other name you like. -By running the above command, Python will create a new folder *lightning* in the current working directory. - ----- - -Activating the Virtual Environment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Before you can install packages into the environment, you need to activate it: - -.. code-block:: bash - - source lightning/bin/activate - -You need to do this step every time you want to work on your project / open the terminal. -With your virtual environment activated, you are now ready to -:doc:`install Lightning ` and get started with Apps! - ----- - -.. _conda: - -******** -2. Conda -******** - -To get started, you first need to download and install the `Miniconda package manager `_. -To check that the installation was successful, open an new terminal and run: - -.. code:: bash - - conda - -It should return a list of commands. - ----- - -Creating a Conda Environment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When starting with a new Python project, you typically want to create a new Conda virtual environment. -Navigate to the location of your project and run the following command: - -.. code-block:: bash - - conda create --yes --name lightning python=3.8 - -The name of the environment here is *lightning* but you can choose any other name you like. -Note how we can also specify the Python version here. 
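-
-If you want to double-check that the environment was created before activating it, you can list all Conda environments (the paths below are illustrative; output will vary by system):
-
-.. code-block:: bash
-
-    conda env list
-    # conda environments:
-    #
-    # base        /home/you/miniconda3
-    # lightning   /home/you/miniconda3/envs/lightning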
- ----- - -Activating the Conda Environment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Before you can install packages into the environment, you need to activate it: - -.. code-block:: bash - - conda activate lightning - -You need to do this step every time you want to work on your project / open the terminal. -With your virtual environment activated, you are now ready to -:doc:`install Lightning ` and get started with Apps! diff --git a/docs/source-app/install/installation.rst b/docs/source-app/install/installation.rst deleted file mode 100644 index 294e26853007d..0000000000000 --- a/docs/source-app/install/installation.rst +++ /dev/null @@ -1,29 +0,0 @@ - -.. _install: - - -############ -Installation -############ - -**Prerequisites**: Use Python 3.8.x or later (3.8.x, 3.9.x, 3.10.x). We also recommend you install in a virtual environment (learn how). - -.. lit_tabs:: - :descriptions: Pip; Macs, Apple Silicon (M1/M2/M3); Windows - :code_files: pip.bash; mac.bash; windows.bash - :tab_rows: 4 - :height: 180px - ----- - -************ -Troubleshoot -************ -If you encounter issues during installation join our community discord and share the output of the following command: - -.. code:: bash - - pip list | grep lightning - -.. join_slack:: - :align: left diff --git a/docs/source-app/install/mac.bash b/docs/source-app/install/mac.bash deleted file mode 100644 index 22825bb246530..0000000000000 --- a/docs/source-app/install/mac.bash +++ /dev/null @@ -1,5 +0,0 @@ -# needed for M1/M2/M3 -export GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 -export GRPC_PYTHON_BUILD_SYSTEM_ZLIB=1 - -pip install lightning diff --git a/docs/source-app/install/pip.bash b/docs/source-app/install/pip.bash deleted file mode 100644 index f6d38b76b8107..0000000000000 --- a/docs/source-app/install/pip.bash +++ /dev/null @@ -1 +0,0 @@ -pip install lightning diff --git a/docs/source-app/install/windows.bash b/docs/source-app/install/windows.bash deleted file mode 100644 index 150b04e5b4d5d..0000000000000 --- a/docs/source-app/install/windows.bash +++ /dev/null @@ -1,4 +0,0 @@ -# install pip -# install git -# setup an alias for Python: python=python3 -# Add the root folder of Lightning to the Environment Variables to PATH diff --git a/docs/source-app/intro.rst b/docs/source-app/intro.rst deleted file mode 100644 index c975ee7090bbe..0000000000000 --- a/docs/source-app/intro.rst +++ /dev/null @@ -1,88 +0,0 @@ -:orphan: - -.. _what: - -################### -What is Lightning? -################### - -Lightning is a free, modular, distributed, and open-source framework for building -AI applications where the components you want to use interact together. - -Lightning apps can be built for **any AI use case**, ranging from AI research to -production-ready pipelines (and everything in between!). - -By abstracting the engineering boilerplate, Lightning allows researchers, data scientists, and software engineers to -build highly-scalable, production-ready AI apps using the tools and technologies of their choice, -regardless of their level of engineering expertise. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/Lightning.gif - :alt: What is Lightning gif. - :width: 100 % - ----- - -.. _why: - -*************** -Why Lightning? -*************** - - -Easy to learn -^^^^^^^^^^^^^ - -Lightning was built for creating AI apps, not for dev-ops. It offers an intuitive, pythonic -and highly composable interface that allows you to focus on solving the problems that are important to you. 
-
-----
-
-Quick to deliver
-^^^^^^^^^^^^^^^^
-
-Lightning speeds up the development process by offering testable templates you can build from,
-accelerating the process of moving from idea to prototype and finally to market.
-
-----
-
-Easy to scale
-^^^^^^^^^^^^^
-
-Lightning provides a mirrored experience locally and in the cloud. The `lightning.ai `_
-cloud platform abstracts the infrastructure, so you can run your apps at any scale.
-
-----
-
-Easy to collaborate
-^^^^^^^^^^^^^^^^^^^
-
-Lightning was built for collaboration.
-By following the best MLOps practices provided through our documentation and example use cases,
-you can deploy state-of-the-art ML applications that are ready to be used by teams of all sizes.
-
-----
-
-****************************
-What's Novel With Lightning?
-****************************
-
-
-Cloud Infra Made Simple and Pythonic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Lightning is for building reactive, scalable, cost-effective, easy-to-maintain and reliable ML products in the cloud without worrying about infrastructure. Lightning provides several engineering novelties to enable this:
-
-#. **Reactivity**: Lightning allows you to run stateful components distributed across different machines, so you can design async, dynamic and reactive workflows in Python, without having to define DAGs.
-
-#. **Scalable & Cost-Effective**: Lightning provides a granular and simple way to run components preemptively or on-demand and on any desired resource such as CPU or GPU. It also enables you to easily transfer artifacts from one machine to another.
-
-#. **Reliability**:
-
-   #. **Checkpointing**: Lightning apps can be paused and resumed from generated state and artifact-based checkpoints.
-   #. **Resilience**: Lightning has a strong fault-tolerance foundation. Your application can be written and tested to be resilient to cloud hazards at the component level.
-   #. **Testing Tools**: Lightning provides you with tools and best practices you can use to develop and test your application. All of our built-in templates have unit, integration, and end-to-end tests.
-
-#. **Easy to maintain**:
-
-   #. **Easy Debugging**: Lightning apps can be debugged locally and in the cloud with **breakpoints** in any component.
-   #. **Non-Invasive**: Lightning is the glue that connects all parts of your workflow, but it does this in a non-invasive way by formalizing API contracts between components. In other words, your application can run someone else's code with few assumptions.
diff --git a/docs/source-app/landing_app.py b/docs/source-app/landing_app.py
deleted file mode 100644
index fa9429a1b53cd..0000000000000
--- a/docs/source-app/landing_app.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# app.py
-from lightning.app import LightningWork, LightningApp, CloudCompute
-
-
-class YourComponent(LightningWork):
-    def run(self):
-        print('RUN ANY PYTHON CODE HERE')
-
-
-# run on a cloud machine ("cpu", "gpu", ...)
-component = YourComponent(cloud_compute=CloudCompute("cpu")) -app = LightningApp(component) diff --git a/docs/source-app/landing_app_run.bash b/docs/source-app/landing_app_run.bash deleted file mode 100644 index e4bed41dadddd..0000000000000 --- a/docs/source-app/landing_app_run.bash +++ /dev/null @@ -1,5 +0,0 @@ -# install lightning -pip install lightning - -# run the app on the --cloud (--setup installs deps automatically) -lightning_app run app app.py --setup --cloud diff --git a/docs/source-app/levels/advanced/index.rst b/docs/source-app/levels/advanced/index.rst deleted file mode 100644 index 4ba7d09d68ba7..0000000000000 --- a/docs/source-app/levels/advanced/index.rst +++ /dev/null @@ -1,94 +0,0 @@ -.. _advanced_level: - -.. toctree:: - :maxdepth: 1 - :hidden: - - start_dynamic_components - level_16 - level_17 - level_18 - level_19 - level_20 - -############### -Advanced skills -############### -Learn to build nested components with advanced functionality. - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 9: Start dynamic components - :description: Learn to start works dynamically - :button_link: start_dynamic_components.html - :col_css: col-md-6 - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Level 10: Check component status - :description: Learn to use work status to coordinate complex apps. - :button_link: level_16.html - :col_css: col-md-6 - :height: 150 - :tag: advanced - -.. displayitem:: - :header: Level: Nest flows - :description: Learn to nest flows into other flows. - :button_link: level_14.html - :col_css: col-md-6 - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Level: Develop reactive apps. - :description: Learn to develop reactive Lightning Apps. Lightning shines with reactive workflows. - :button_link: level_14.html - :col_css: col-md-6 - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Level: Enable CLI commands for your app - :description: Speak to your app from a CLI over the network - :button_link: level_17.html - :col_css: col-md-6 - :height: 150 - :tag: advanced - -.. displayitem:: - :header: Level 11: Connect two components over the network - :description: Connect two LightningWorks over the network. - :button_link: level_14.html - :col_css: col-md-6 - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Level 13: Rerun components - :description: Learn to reuse components by passing different variables. - :button_link: level_17.html - :col_css: col-md-6 - :height: 150 - :tag: advanced - -.. displayitem:: - :header: Level 14: Handle Lightning App exceptions - :description: Learn to handle Lightning App exceptions. - :button_link: level_19.html - :col_css: col-md-6 - :height: 150 - :tag: advanced - -.. raw:: html - -
-
diff --git a/docs/source-app/levels/advanced/level_16.rst b/docs/source-app/levels/advanced/level_16.rst deleted file mode 100644 index 58c1e1fc7e0ad..0000000000000 --- a/docs/source-app/levels/advanced/level_16.rst +++ /dev/null @@ -1,10 +0,0 @@ -########################### -Level 16: Check Work status -########################### -**Audience:** Users who want to stop/start Lightning Work based on a status. - -**Prereqs:** Level 16+ - ----- - -.. include:: ../../core_api/lightning_work/status_content.rst diff --git a/docs/source-app/levels/advanced/level_17.rst b/docs/source-app/levels/advanced/level_17.rst deleted file mode 100644 index 6650860eaa177..0000000000000 --- a/docs/source-app/levels/advanced/level_17.rst +++ /dev/null @@ -1,10 +0,0 @@ -########################## -Level 17: Rerun components -########################## -**Audience:** Users who want Work.run() to activate multiple times in an app. - -**Prereqs:** Level 16+ and read the :doc:`Event Loop guide <../../glossary/event_loop>`. - ----- - -.. include:: ../../workflows/run_work_once_content.rst diff --git a/docs/source-app/levels/advanced/level_18.rst b/docs/source-app/levels/advanced/level_18.rst deleted file mode 100644 index 87fba3eb8bcc9..0000000000000 --- a/docs/source-app/levels/advanced/level_18.rst +++ /dev/null @@ -1,10 +0,0 @@ -############################################## -Level 18: Share objects between LightningWorks -############################################## -**Audience:** Users moving DataFrames or outputs, between Lightning Works (usually data engineers). - -**Prereqs:** Level 16+ and know about the Pandas library and read the :doc:`Access app state guide <../../workflows/access_app_state>`. - ----- - -.. include:: ../../core_api/lightning_work/payload_content.rst diff --git a/docs/source-app/levels/advanced/level_19.rst b/docs/source-app/levels/advanced/level_19.rst deleted file mode 100644 index 99a859e1ad2ca..0000000000000 --- a/docs/source-app/levels/advanced/level_19.rst +++ /dev/null @@ -1,11 +0,0 @@ -######################################### -Level 19: Handle Lightning App exceptions -######################################### - -**Audience:** Users who want to make Lightning Apps more robust to potential issues. - -**Prereqs:** Level 16+ - ----- - -.. include:: ../../core_api/lightning_work/handling_app_exception_content.rst diff --git a/docs/source-app/levels/advanced/level_20.rst b/docs/source-app/levels/advanced/level_20.rst deleted file mode 100644 index 1d045e85fc692..0000000000000 --- a/docs/source-app/levels/advanced/level_20.rst +++ /dev/null @@ -1,11 +0,0 @@ -####################################### -Level 20: Enable dynamic LightningWorks -####################################### - -**Audience:** Users who want to create/run/stop multiple LightningWorks not defined at app instantiation. - -**Prereqs:** Level 16+ - ----- - -.. include:: ../../core_api/lightning_app/dynamic_work_content.rst diff --git a/docs/source-app/levels/advanced/start_dynamic_components.rst b/docs/source-app/levels/advanced/start_dynamic_components.rst deleted file mode 100644 index 2e91bc2f632f0..0000000000000 --- a/docs/source-app/levels/advanced/start_dynamic_components.rst +++ /dev/null @@ -1,38 +0,0 @@ -############################### -Level: Start dynamic components -############################### -**Audience:** Users who want to run a Lightning Component in parallel (asynchronously). - -**Prereqs:** You must have finished the :doc:`Basic levels <../basic/index>`. - ----- - -.. 
include:: ../../workflows/run_work_in_parallel_content.rst - ----- - -********************************************** -Next steps: Share variables between components -********************************************** -Now that you know how to run components in parallel, we'll learn to share variables -across components to simplify complex workflows. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 6: Share variables between components - :description: Learn to connect components - :col_css: col-md-12 - :button_link: share_variables_between_lightning_components.html - :height: 150 - :tag: 10 minutes - -.. raw:: html - -
-
diff --git a/docs/source-app/levels/basic/build_a_dag.rst b/docs/source-app/levels/basic/build_a_dag.rst deleted file mode 100644 index e430306447797..0000000000000 --- a/docs/source-app/levels/basic/build_a_dag.rst +++ /dev/null @@ -1,20 +0,0 @@ -:orphan: - -########################### -Example: Deploy a model API -########################### - -**Prereqs:** You have an app already running locally. - ----- - -**************************** -What is the Lightning Cloud? -**************************** -The Lightning Cloud is the platform that we've created to interface with the cloud providers. Today -the Lightning Cloud supports AWS. - -.. note:: Support for GCP and Azure is coming soon! - -To use the Lightning Cloud, you buy credits that are used to pay the cloud providers. If you want to run -on your own AWS credentials, please contact us (support@lightning.ai) so we can get your clusters set up for you. diff --git a/docs/source-app/levels/basic/build_a_lightning_component.rst b/docs/source-app/levels/basic/build_a_lightning_component.rst deleted file mode 100644 index 00b777e150813..0000000000000 --- a/docs/source-app/levels/basic/build_a_lightning_component.rst +++ /dev/null @@ -1,154 +0,0 @@ -############################################## -Level 1: Package code in a lightning component -############################################## - -**Prereqs:** You know *basic* Python. - -**Goal:** In this guide you'll learn to develop `a Lightning component `_. - - -********************************* -Why you need Lightning components -********************************* -A Lightning component is a self-contained, modular machine-learning component -that you can plug into your existing ML workflows. A Lightning component organizes arbitrary code so it can run on the cloud, manages -its own infrastructure, cloud costs, networking and more. Connect components using your current workflow management tools or -our :doc:`next-generation reactive orchestrator <../intermediate/index>`. - -Components run on the cloud or your laptop without code changes 🤯🤯. - -.. raw:: html - -
- -
- -| - -Organizing your code into Lightning components offers these benefits: - -.. collapse:: Build systems not scripts - - | - - The Lightning structure forces best practices so you don't have to be an expert production engineer. - Although it feels like you're writing a script, you are actually building a production-ready system. - -.. collapse:: Cost control - - | - - The component run-time has been optimized for cost management to support the largest machine-learning workloads. - Lower your cloud bill with machines that shut down or spin up faster. - -.. collapse:: For beginners: Code like an expert - - | - - Lightning embeds the best practices of building production-ready full stack AI apps into your - coding experience. You can write code like you normally do, and the Lightning structure - ensures your code is implicitly production ready... even if you're just doing research. - -.. collapse:: For experts: Scale with full control - - | - - if you know what you are doing, Lightning gives you full control to manage your own - scaling logic, fault-tolerance and even pre-provisioning, all from Python. - -.. collapse:: Integrate into your current workflow tools - - | - - Lightning components are self-contained pieces of functionality. Add them to your current workflow - tools to quickly fill in gaps in your ML workflow such as monitoring drift, training LLMs and more. - You can (optionally) use the Lightning App to integrate components into a cohesive workflow. - -.. collapse:: Packaged code - - | - - Lightning apps bundles components into an app that runs in any environment. The same code will run on your laptop, - or any cloud or private clusters. You don't have to think about the cluster or know anything about the cloud. - -.. collapse:: Rapid iteration - - | - - Iterate through ideas in hours not months because you don't have to learn a million other concepts that the components - handle for you such as kubernetes, cost management, auto-scaling and more. - -.. collapse:: Modularity - - | - - Components are modular and inter-operable by design. Leverage our vibrant community of components so you don't - have to build each piece of the system yourself. - ----- - -***************** -Install Lightning -***************** -First, install Lightning. - -.. lit_tabs:: - :descriptions: Pip; Macs, Apple Silicon (M1/M2/M3); Windows - :code_files: /install/pip.bash; /install/mac.bash; /install/windows.bash - :tab_rows: 4 - :height: 180px - ----- - -************************** -Build your first component -************************** -A Lightning component organizes arbitrary code so it can run on the cloud, manages its own infrastructure, cloud costs, networking and more - -**Run one of these components!** - -.. include:: ./hero_components.rst - ----- - -************ -Key features -************ -You now know enough to build a self-contained component that runs any Python code on the cloud that can be connected to form a -powerful Lightning app. Here are a few key features available to super-charge your work: - -.. 
lit_tabs:: - :titles: 15+ accelerators; Auto-stop idle machines; Auto-timeout submitted work; Use spot machines (~70% discount); Work with massive datasets; Mount cloud storage; Use a custom container - :code_files: ./key_features/accelerators.py; ./key_features/idle_machine.py; ./key_features/auto_timeout.py; ./key_features/spot.py; ./key_features/massive_dataset.py; ./key_features/mount_data.py; ./key_features/custom_container.py; - :highlights: 11;11;11;11;11;2,7,10, 11; 11 - :enable_run: true - :tab_rows: 3 - :height: 430px - ----- - -******************************************** -Next: Explore real component implementations -******************************************** -In this section we introduced components. Let's explore -real component implementations in-depth. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 2: Explore real component implementations - :description: Go deep into real component implementations. - :col_css: col-md-12 - :button_link: real_lightning_component_implementations.html - :height: 150 - :tag: beginner - -.. raw:: html - -
-
diff --git a/docs/source-app/levels/basic/create_a_model_demo.rst b/docs/source-app/levels/basic/create_a_model_demo.rst deleted file mode 100644 index 72750717efde4..0000000000000 --- a/docs/source-app/levels/basic/create_a_model_demo.rst +++ /dev/null @@ -1,20 +0,0 @@ -:orphan: - -############################ -Example: Create a model demo -############################ - -**Prereqs:** You have an app already running locally. - ----- - -**************************** -What is the Lightning Cloud? -**************************** -The Lightning Cloud is the platform that we've created to interface with the cloud providers. Today -the Lightning Cloud supports AWS. - -.. note:: Support for GCP and Azure is coming soon! - -To use the Lightning Cloud, you buy credits that are used to pay the cloud providers. If you want to run -on your own AWS credentials, please contact us (support@lightning.ai) so we can get your clusters set up for you. diff --git a/docs/source-app/levels/basic/deploy_ai_model_api.rst b/docs/source-app/levels/basic/deploy_ai_model_api.rst deleted file mode 100644 index e430306447797..0000000000000 --- a/docs/source-app/levels/basic/deploy_ai_model_api.rst +++ /dev/null @@ -1,20 +0,0 @@ -:orphan: - -########################### -Example: Deploy a model API -########################### - -**Prereqs:** You have an app already running locally. - ----- - -**************************** -What is the Lightning Cloud? -**************************** -The Lightning Cloud is the platform that we've created to interface with the cloud providers. Today -the Lightning Cloud supports AWS. - -.. note:: Support for GCP and Azure is coming soon! - -To use the Lightning Cloud, you buy credits that are used to pay the cloud providers. If you want to run -on your own AWS credentials, please contact us (support@lightning.ai) so we can get your clusters set up for you. 
diff --git a/docs/source-app/levels/basic/hello_components/code_run_cloud.bash b/docs/source-app/levels/basic/hello_components/code_run_cloud.bash deleted file mode 100644 index f81431222724c..0000000000000 --- a/docs/source-app/levels/basic/hello_components/code_run_cloud.bash +++ /dev/null @@ -1 +0,0 @@ -lightning_app run app app.py --cloud diff --git a/docs/source-app/levels/basic/hello_components/code_run_cloud_setup.bash b/docs/source-app/levels/basic/hello_components/code_run_cloud_setup.bash deleted file mode 100644 index 09435ff5d7caa..0000000000000 --- a/docs/source-app/levels/basic/hello_components/code_run_cloud_setup.bash +++ /dev/null @@ -1 +0,0 @@ -lightning_app run app app.py --setup --cloud diff --git a/docs/source-app/levels/basic/hello_components/code_run_local.bash b/docs/source-app/levels/basic/hello_components/code_run_local.bash deleted file mode 100644 index 1f8c994a84b1d..0000000000000 --- a/docs/source-app/levels/basic/hello_components/code_run_local.bash +++ /dev/null @@ -1 +0,0 @@ -lightning_app run app app.py diff --git a/docs/source-app/levels/basic/hello_components/code_run_local_setup.bash b/docs/source-app/levels/basic/hello_components/code_run_local_setup.bash deleted file mode 100644 index 45b0529de3736..0000000000000 --- a/docs/source-app/levels/basic/hello_components/code_run_local_setup.bash +++ /dev/null @@ -1 +0,0 @@ -lightning_app run app app.py --setup diff --git a/docs/source-app/levels/basic/hello_components/deploy_model.py b/docs/source-app/levels/basic/hello_components/deploy_model.py deleted file mode 100644 index 7911f6a3158ac..0000000000000 --- a/docs/source-app/levels/basic/hello_components/deploy_model.py +++ /dev/null @@ -1,31 +0,0 @@ -# !pip install torchvision -from lightning.app import LightningApp, CloudCompute -from lightning.app.components.serve import PythonServer, Image, Number -import base64, io, torchvision, torch -from PIL import Image as PILImage - - -class PyTorchServer(PythonServer): - def setup(self): - self._model = torchvision.models.resnet18(pretrained=True) - self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self._model.to(self._device) - - def predict(self, request): - image = base64.b64decode(request.image.encode("utf-8")) - image = PILImage.open(io.BytesIO(image)) - transforms = torchvision.transforms.Compose([ - torchvision.transforms.Resize(224), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]) - image = transforms(image) - image = image.to(self._device) - prediction = self._model(image.unsqueeze(0)) - return {"prediction": prediction.argmax().item()} - - -component = PyTorchServer( - input_type=Image, output_type=Number, cloud_compute=CloudCompute('gpu') -) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/hello_components/hello_world.py b/docs/source-app/levels/basic/hello_components/hello_world.py deleted file mode 100644 index a716fecb2fb2c..0000000000000 --- a/docs/source-app/levels/basic/hello_components/hello_world.py +++ /dev/null @@ -1,12 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningApp - - -class YourComponent(LightningWork): - def run(self): - print('RUN ANY PYTHON CODE HERE') - - - -component = YourComponent() -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/hello_components/hello_world_gpu.py b/docs/source-app/levels/basic/hello_components/hello_world_gpu.py deleted file mode 100644 index 67e5d92fb3666..0000000000000 --- 
a/docs/source-app/levels/basic/hello_components/hello_world_gpu.py +++ /dev/null @@ -1,12 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningApp, CloudCompute - - -class YourComponent(LightningWork): - def run(self): - print('RUN ANY PYTHON CODE HERE') - -# run on a cloud machine ("cpu", "gpu", ...) -compute = CloudCompute("gpu") -component = YourComponent(cloud_compute=compute) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/hello_components/multi_node.py b/docs/source-app/levels/basic/hello_components/multi_node.py deleted file mode 100644 index 03ce0fbec2341..0000000000000 --- a/docs/source-app/levels/basic/hello_components/multi_node.py +++ /dev/null @@ -1,29 +0,0 @@ -# !pip install torch -from lightning.app import LightningWork, LightningApp, CloudCompute -from lightning.app.components import MultiNode - - -class MultiNodeComponent(LightningWork): - def run( - self, - main_address: str, - main_port: int, - node_rank: int, - world_size: int, - ): - print(f"ADD YOUR DISTRIBUTED CODE: {main_address=} {main_port=} {node_rank=} {world_size=}") - print("supports ANY ML library") - - - - - - - - - - -# gpu-multi-fast has 4 GPUs x 8 nodes = 32 GPUs -component = MultiNodeComponent(cloud_compute=CloudCompute("gpu-multi-fast")) -component = MultiNode(component, nodes=8) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/hello_components/pl_multinode.py b/docs/source-app/levels/basic/hello_components/pl_multinode.py deleted file mode 100644 index 09480da44eee6..0000000000000 --- a/docs/source-app/levels/basic/hello_components/pl_multinode.py +++ /dev/null @@ -1,20 +0,0 @@ -# app.py -from lightning import Trainer -from lightning.app import LightningWork, LightningApp, CloudCompute -from lightning.app.components import LightningTrainerMultiNode -from lightning.pytorch.demos.boring_classes import BoringModel - - -class LightningTrainerDistributed(LightningWork): - def run(self): - model = BoringModel() - trainer = Trainer(max_epochs=10, strategy="ddp") - trainer.fit(model) - -# 8 GPUs: (2 nodes of 4 x v100) -component = LightningTrainerMultiNode( - LightningTrainerDistributed, - num_nodes=4, - cloud_compute=CloudCompute("gpu-fast-multi"), # 4 x v100 -) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/hello_components/pt_multinode.py b/docs/source-app/levels/basic/hello_components/pt_multinode.py deleted file mode 100644 index 569a7c99b4d0c..0000000000000 --- a/docs/source-app/levels/basic/hello_components/pt_multinode.py +++ /dev/null @@ -1,60 +0,0 @@ -# app.py -# ! pip install torch -from lightning.app import LightningWork, LightningApp, CloudCompute -from lightning.app.components import MultiNode -import torch -from torch.nn.parallel.distributed import DistributedDataParallel - - -def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int): - # 1. SET UP DISTRIBUTED ENVIRONMENT - global_rank = local_rank + node_rank * nprocs - world_size = num_nodes * nprocs - - if torch.distributed.is_available() and not torch.distributed.is_initialized(): - torch.distributed.init_process_group( - "nccl" if torch.cuda.is_available() else "gloo", - rank=global_rank, - world_size=world_size, - init_method=f"tcp://{main_address}:{main_port}", - ) - - # 2. 
PREPARE DISTRIBUTED MODEL
-    model = torch.nn.Linear(32, 2)
-    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-    # move the model to the target device BEFORE wrapping it with DDP
-    model = model.to(device)
-    model = DistributedDataParallel(model, device_ids=[local_rank] if torch.cuda.is_available() else None)
-
-    # 3. SET UP LOSS AND OPTIMIZER
-    criterion = torch.nn.MSELoss()
-    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-
-    # 4. TRAIN THE MODEL FOR 50 STEPS
-    for step in range(50):
-        model.zero_grad()
-        x = torch.randn(64, 32).to(device)
-        output = model(x)
-        loss = criterion(output, torch.ones_like(output))
-        print(f"global_rank: {global_rank} step: {step} loss: {loss}")
-        loss.backward()
-        optimizer.step()
-
-    # 5. VERIFY ALL COPIES OF THE MODEL HAVE THE SAME WEIGHTS AT END OF TRAINING
-    weight = model.module.weight.clone()
-    torch.distributed.all_reduce(weight)
-    assert torch.equal(model.module.weight, weight / world_size)
-
-    print("Multi Node Distributed Training Done!")
-
-class PyTorchDistributed(LightningWork):
-    def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int):
-        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
-        torch.multiprocessing.spawn(
-            distributed_train,
-            args=(main_address, main_port, num_nodes, node_rank, nprocs),
-            nprocs=nprocs
-        )
-
-# 32 GPUs: (8 nodes x 4 V100)
-compute = CloudCompute("gpu-fast-multi")  # 4 x V100
-component = MultiNode(PyTorchDistributed, num_nodes=8, cloud_compute=compute)
-app = LightningApp(component)
diff --git a/docs/source-app/levels/basic/hello_components/run_ptl_script.py b/docs/source-app/levels/basic/hello_components/run_ptl_script.py
deleted file mode 100644
index 46501310e2aa7..0000000000000
--- a/docs/source-app/levels/basic/hello_components/run_ptl_script.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# app.py
-# !curl https://raw.githubusercontent.com/Lightning-AI/lightning/master/examples/app/multi_node/pl_boring_script.py -o pl_boring_script.py
-from lightning.app import LightningApp, CloudCompute
-from lightning.app.components.training import LightningTrainerScript

-# run a script that trains PyTorch with the Lightning Trainer
-model_script = 'pl_boring_script.py'
-component = LightningTrainerScript(
-    model_script,
-    num_nodes=1,
-    cloud_compute=CloudCompute("gpu")
-)
-app = LightningApp(component)
diff --git a/docs/source-app/levels/basic/hello_components/streamlit_demo.py b/docs/source-app/levels/basic/hello_components/streamlit_demo.py
deleted file mode 100644
index 41ad3988d908d..0000000000000
--- a/docs/source-app/levels/basic/hello_components/streamlit_demo.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# app.py
-# !pip install streamlit omegaconf scipy
-# !pip install torch
-from lightning.app import LightningApp
-from lightning.app.components import ServeStreamlit
-import torch
-from io import BytesIO
-from functools import partial
-from scipy.io.wavfile import write
-import streamlit as st
-
-
-class StreamlitApp(ServeStreamlit):
-    def build_model(self):
-        sample_rate = 48000
-        model, _ = torch.hub.load('snakers4/silero-models', model='silero_tts', speaker="v3_en")
-        return partial(model.apply_tts, sample_rate=sample_rate, speaker="en_0"), sample_rate
-
-    def render(self):
-        st.title("Text To Speech")
-        text = st.text_input("Text:", "Lightning Apps are the best!")
-
-        if text:
-            model, sample_rate = self.model
-            audio_numpy = model(text).numpy()
-            audio = BytesIO()
-            write(audio, sample_rate, audio_numpy)
-            audio.seek(0)
-            st.audio(audio)
-
-app = LightningApp(StreamlitApp())
diff --git a/docs/source-app/levels/basic/hello_components/train_ptl.py
b/docs/source-app/levels/basic/hello_components/train_ptl.py
deleted file mode 100644
index 3fa9684f9213b..0000000000000
--- a/docs/source-app/levels/basic/hello_components/train_ptl.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# A hello world component
-# app.py
-from lightning.app import LightningWork, LightningApp, CloudCompute
-
-
-class YourComponent(LightningWork):
-    def run(self):
-        print('RUN ANY PYTHON CODE HERE')
-
-
-# run on a cloud machine
-compute = CloudCompute("cpu")
-worker = YourComponent(cloud_compute=compute)
-app = LightningApp(worker)
diff --git a/docs/source-app/levels/basic/hello_components/train_pytorch.py b/docs/source-app/levels/basic/hello_components/train_pytorch.py
deleted file mode 100644
index fa54c577bd19b..0000000000000
--- a/docs/source-app/levels/basic/hello_components/train_pytorch.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# app.py
-# ! pip install torch
-from lightning.app import LightningWork, LightningApp, CloudCompute
-import torch
-
-class PyTorchComponent(LightningWork):
-    def run(self):
-        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-        model = torch.nn.Sequential(torch.nn.Linear(1, 1),
-                                    torch.nn.ReLU(),
-                                    torch.nn.Linear(1, 1))
-        model.to(device)
-        criterion = torch.nn.MSELoss()
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
-
-        for step in range(10000):
-            model.zero_grad()
-            x = torch.tensor([0.8]).to(device)
-            target = torch.tensor([1.0]).to(device)
-            output = model(x)
-            loss = criterion(output, target)
-            print(f'step: {step}. loss {loss}')
-            loss.backward()
-            optimizer.step()
-
-compute = CloudCompute('gpu')
-component = PyTorchComponent(cloud_compute=compute)
-app = LightningApp(component)
diff --git a/docs/source-app/levels/basic/hello_components/xgboost.py b/docs/source-app/levels/basic/hello_components/xgboost.py
deleted file mode 100644
index 68cc2c181e050..0000000000000
--- a/docs/source-app/levels/basic/hello_components/xgboost.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# app.py
-# !pip install scikit-learn xgboost
-from lightning.app import LightningWork, LightningApp
-from sklearn import datasets
-from sklearn.model_selection import train_test_split
-from xgboost import XGBClassifier
-
-class XGBoostComponent(LightningWork):
-    def run(self):
-        iris = datasets.load_iris()
-        X, y = iris.data, iris.target
-
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
-
-        bst = XGBClassifier(verbosity=3)
-        bst.fit(X_train, y_train)
-        preds = bst.predict(X_test)
-        print(f'preds: {preds}')
-
-
-app = LightningApp(XGBoostComponent())
diff --git a/docs/source-app/levels/basic/hello_components/xgboost_gpu.py b/docs/source-app/levels/basic/hello_components/xgboost_gpu.py
deleted file mode 100644
index f8058a3169e34..0000000000000
--- a/docs/source-app/levels/basic/hello_components/xgboost_gpu.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# app.py
-# !pip install scikit-learn xgboost
-# !conda install py-xgboost-gpu
-from lightning.app import LightningWork, LightningApp, CloudCompute
-from sklearn import datasets
-from sklearn.model_selection import train_test_split
-from xgboost import XGBClassifier
-
-class XGBoostComponent(LightningWork):
-    def run(self):
-        iris = datasets.load_iris()
-        X, y = iris.data, iris.target
-
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
-
-        bst = XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity=3)
-        bst.fit(X_train, y_train)
-        preds = bst.predict(X_test)
-        print(f'preds: {preds}')
-
-compute = CloudCompute('gpu')
-app = 
LightningApp(XGBoostComponent(cloud_compute=compute)) diff --git a/docs/source-app/levels/basic/hero_components.rst b/docs/source-app/levels/basic/hero_components.rst deleted file mode 100644 index da1d9d076a794..0000000000000 --- a/docs/source-app/levels/basic/hero_components.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. lit_tabs:: - :titles: Hello world; Hello GPU world; PyTorch & ⚡⚡⚡ Trainer (1+ cloud GPUs); Train PyTorch (cloud GPU); Train PyTorch (32 cloud GPUs); Deploy a model on cloud GPUs; Run a model script; XGBoost; Streamlit demo - :code_files: /levels/basic/hello_components/hello_world.py; /levels/basic/hello_components/hello_world_gpu.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/train_pytorch.py; /levels/basic/hello_components/pt_multinode.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/run_ptl_script.py; /levels/basic/hello_components/xgboost.py; /levels/basic/hello_components/streamlit_demo.py - :highlights: 7; 10, 11; 9-11, 16, 17; 4, 8, 12, 18-19, 26; 5, 10, 22, 27, 31, 41, 57-59; 3, 11-12, 25, 29; 7, 10; 15, 21; 9, 15, 24 - :works: [{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"default","preemptible":false,"shmSize":0},"networkConfig":[{"name":"dzodf","port":61304}]}}];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"qnlgd","port":61516}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu","preemptible":false,"shmSize":0}}}];[{"name":"root.ws.0","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"ajfrc","port":61553}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.1","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"ttyqc","port":61554}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.2","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"svyej","port":61555}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.3","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"parme","port":61556}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}}];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"cutdu","port":61584}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu","preemptible":false,"shmSize":0}}}];[{"name":"root.ws.0","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"whhby","port":61613}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"roo
t.ws.1","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"yhjtf","port":61614}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.2","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"rqwkt","port":61615}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.3","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"pjdsj","port":61616}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.4","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"efdor","port":61617}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.5","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"pxmso","port":61618}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.6","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"feevy","port":61619}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}},{"name":"root.ws.7","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"tbmse","port":61620}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu-fast-multi","preemptible":false,"shmSize":0}}}];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"umqqg","port":7777}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"gpu","preemptible":false,"shmSize":0}}}];[];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"tggba","port":61729}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"default","preemptible":false,"shmSize":0}}}];[{"name":"root.work","spec":{"buildSpec":{"commands":[],"pythonDependencies":{"packageManager":"PACKAGE_MANAGER_PIP","packages":""}},"drives":[],"networkConfig":[{"name":"hpyaz","port":61763}],"userRequestedComputeConfig":{"count":1,"diskSize":0,"name":"default","preemptible":false,"shmSize":0}}}] - :enable_run: true - :tab_rows: 3 - :height: 620px diff --git a/docs/source-app/levels/basic/index.rst b/docs/source-app/levels/basic/index.rst deleted file mode 100644 index 2912b69b6b7ff..0000000000000 --- a/docs/source-app/levels/basic/index.rst +++ /dev/null @@ -1,54 +0,0 @@ -.. _level_basic: - -.. 
toctree:: - :maxdepth: 1 - :hidden: - - build_a_lightning_component - real_lightning_component_implementations - save_money_on_cloud_costs - -############ -Basic skills -############ -Learn to package your code into Lightning components which can plug into your existing ML workflows. - -A Lightning component organizes arbitrary code so it can run on the cloud and manages -its own infrastructure, cloud costs, networking, and more. - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 1: Package code in a Lightning component - :description: Learn to package your code into Lightning components which can plug into your existing ML workflows. - :button_link: build_a_lightning_component.html - :col_css: col-md-6 - :height: 170 - :tag: 10 minutes - -.. displayitem:: - :header: Level 2: Explore real component implementations - :description: Go deep into real component implementations. - :button_link: real_lightning_component_implementations.html - :col_css: col-md-6 - :height: 170 - :tag: 10 minutes - -.. displayitem:: - :header: Level 3: Save money on cloud costs - :description: Explore key Lightning features that save you cloud costs and improve performance. - :button_link: save_money_on_cloud_costs.html - :col_css: col-md-6 - :height: 150 - :tag: 10 minutes - -.. raw:: html - -
-
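The component pattern these basic levels revolve around is compact. A minimal sketch, using only the ``lightning.app`` names that appear throughout the files in this patch:

.. code-block:: python

    # app.py -- minimal component sketch (same pattern as the hello_components files)
    from lightning.app import LightningWork, LightningApp

    class YourComponent(LightningWork):
        def run(self):
            print('RUN ANY PYTHON CODE HERE')

    app = LightningApp(YourComponent())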
diff --git a/docs/source-app/levels/basic/key_features/accelerators.py b/docs/source-app/levels/basic/key_features/accelerators.py deleted file mode 100644 index fd27325fda74e..0000000000000 --- a/docs/source-app/levels/basic/key_features/accelerators.py +++ /dev/null @@ -1,21 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningApp, CloudCompute - - -class YourComponent(LightningWork): - def run(self): - print('RUN ANY PYTHON CODE HERE') - - -# custom accelerators -compute = CloudCompute('gpu') -component = YourComponent(cloud_compute=compute) -app = LightningApp(component) - -# OTHER ACCELERATORS: -# compute = CloudCompute('default') # 1 CPU -# compute = CloudCompute('cpu-medium') # 8 CPUs -# compute = CloudCompute('gpu') # 1 T4 GPU -# compute = CloudCompute('gpu-fast-multi') # 4 V100 GPUs -# compute = CloudCompute('p4d.24xlarge') # AWS instance name (8 A100 GPUs) -# compute = ... diff --git a/docs/source-app/levels/basic/key_features/auto_timeout.py b/docs/source-app/levels/basic/key_features/auto_timeout.py deleted file mode 100644 index 73d6281b602d5..0000000000000 --- a/docs/source-app/levels/basic/key_features/auto_timeout.py +++ /dev/null @@ -1,13 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningApp, CloudCompute - - -class YourComponent(LightningWork): - def run(self): - print('RUN ANY PYTHON CODE HERE') - - -# if the machine hasn't started after 60 seconds, cancel the work -compute = CloudCompute('gpu', wait_timeout=60) -component = YourComponent(cloud_compute=compute) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/key_features/custom_container.py b/docs/source-app/levels/basic/key_features/custom_container.py deleted file mode 100644 index 5e574d43d90a8..0000000000000 --- a/docs/source-app/levels/basic/key_features/custom_container.py +++ /dev/null @@ -1,13 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningApp, BuildConfig - - -class YourComponent(LightningWork): - def run(self): - print('RUN ANY PYTHON CODE HERE') - - -# custom image (from any provider) -config = BuildConfig(image="gcr.io/google-samples/hello-app:1.0") -component = YourComponent(cloud_build_config=config) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/key_features/idle_machine.py b/docs/source-app/levels/basic/key_features/idle_machine.py deleted file mode 100644 index 89ab43355c8e6..0000000000000 --- a/docs/source-app/levels/basic/key_features/idle_machine.py +++ /dev/null @@ -1,13 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningApp, CloudCompute - - -class YourComponent(LightningWork): - def run(self): - print('RUN ANY PYTHON CODE HERE') - - -# stop the machine when idle for 10 seconds -compute = CloudCompute('gpu', idle_timeout=10) -component = YourComponent(cloud_compute=compute) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/key_features/massive_dataset.py b/docs/source-app/levels/basic/key_features/massive_dataset.py deleted file mode 100644 index 2c12b9cb4e8f7..0000000000000 --- a/docs/source-app/levels/basic/key_features/massive_dataset.py +++ /dev/null @@ -1,13 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningApp, CloudCompute - - -class YourComponent(LightningWork): - def run(self): - print('RUN ANY PYTHON CODE HERE') - - -# use 100 GB of space on that machine (max size: 64 TB) -compute = CloudCompute('gpu', disk_size=100) -component = YourComponent(cloud_compute=compute) -app = LightningApp(component) diff --git 
a/docs/source-app/levels/basic/key_features/mount_data.py b/docs/source-app/levels/basic/key_features/mount_data.py deleted file mode 100644 index e6096d7e925d2..0000000000000 --- a/docs/source-app/levels/basic/key_features/mount_data.py +++ /dev/null @@ -1,14 +0,0 @@ -from lightning.app import LightningWork, LightningApp, CloudCompute -from lightning.app.storage import Mount -import os - - -class YourComponent(LightningWork): - def run(self): - os.listdir('/foo') - -# mount the files on the s3 bucket under this path -mount = Mount(source="s3://lightning-example-public/", mount_path="/foo") -compute = CloudCompute(mounts=mount) -component = YourComponent(cloud_compute=compute) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/key_features/spot.py b/docs/source-app/levels/basic/key_features/spot.py deleted file mode 100644 index b1a0291eeacee..0000000000000 --- a/docs/source-app/levels/basic/key_features/spot.py +++ /dev/null @@ -1,13 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningApp, CloudCompute - - -class YourComponent(LightningWork): - def run(self): - print('RUN ANY PYTHON CODE HERE') - -# spot machines can be turned off without notice, use for non-critical, resumable work -# request a spot machine, after 60 seconds of waiting switch to full-price -compute = CloudCompute('gpu', wait_timeout=60, spot=True) -component = YourComponent(cloud_compute=compute) -app = LightningApp(component) diff --git a/docs/source-app/levels/basic/real_lightning_component_implementations.rst b/docs/source-app/levels/basic/real_lightning_component_implementations.rst deleted file mode 100644 index 391bda9ada605..0000000000000 --- a/docs/source-app/levels/basic/real_lightning_component_implementations.rst +++ /dev/null @@ -1,75 +0,0 @@ -############################################### -Level 2: Explore real component implementations -############################################### -**Audience:** Users who want to deeply understand what is possible with Lightning components. - -**Prereqs:** You must have finished :doc:`level 1 <../basic/build_a_lightning_component>`. - ----- - -*********************** -Real component examples -*********************** -Use this guide to understand what is happening in each type of component. -These are a few prototypical components. Since each component organizes -Python, you can build virtually infinite components for any use-case -you can think of. - ----- - -******************************* -Ex: PyTorch + Lightning Trainer -******************************* -This example shows how to train PyTorch with the Lightning trainer on your machine -or cloud GPUs without code changes. - -.. lit_tabs:: - :descriptions: import Lightning; We're using a demo LightningModule; Move your training code here (usually your main.py); Pass your component to the multi-node executor (it also works on CPUs or single GPUs); Select the number of machines (nodes). Here we choose 4.; Choose from 15+ machine types. This one has 4 V100 GPUs.; Initialize the App object that executes the component logic. 
- :code_files: /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; /levels/basic/hello_components/pl_multinode.py; - :highlights: 2; 4; 9-11; 14-17; 16; 17; 19 - :enable_run: true - :tab_rows: 5 - :height: 420px - ----- - -********************************* -Ex: Deploy a PyTorch API endpoint -********************************* -This example shows how to deploy PyTorch and create an API - -.. lit_tabs:: - :descriptions: Shortcut to list dependencies without a requirements.txt file.; Import one of our serving components (high-performance ones are available on the enterprise tiers); Define the setup function to load your favorite pretrained models and do any kind of pre-processing.; Define the predict function which is called when the endpoint is hit.; Initialize the server and define the type of cloud machine to use. - :code_files: /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; /levels/basic/hello_components/deploy_model.py; - :highlights: 1; 3; 10-12; 15-25; 28-30 - :enable_run: true - :tab_rows: 4 - :height: 620px - ----- - -************************* -Next: Save on cloud costs -************************* -Let's review key lightning features to help you run components more efficiently on the cloud -so you can save on cloud costs. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 3: Save money on cloud costs - :description: Explore key Lightning features that save you cloud costs and improve performance. - :button_link: save_money_on_cloud_costs.html - :col_css: col-md-12 - :height: 150 - :tag: 10 minutes - -.. raw:: html - -
-
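The ``pl_multinode.py`` file these tabs reference is not reproduced in this diff. A rough sketch of the pattern the descriptions walk through, assuming the ``LightningTrainerMultiNode`` executor from ``lightning.app.components`` (the class name is an assumption, not taken from this patch):

.. code-block:: python

    # sketch of the multi-node trainer example (assumed executor API, see note above)
    from lightning.app import LightningWork, LightningApp, CloudCompute
    from lightning.app.components import LightningTrainerMultiNode
    from lightning.pytorch import Trainer
    from lightning.pytorch.demos.boring_classes import BoringModel

    class TrainerComponent(LightningWork):
        def run(self):
            # move your training code here (usually your main.py)
            trainer = Trainer(max_epochs=1)
            trainer.fit(BoringModel())

    # 4 machines (nodes), each a 'gpu-fast-multi' instance (4 V100 GPUs)
    component = LightningTrainerMultiNode(
        TrainerComponent, num_nodes=4, cloud_compute=CloudCompute("gpu-fast-multi")
    )
    app = LightningApp(component)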
diff --git a/docs/source-app/levels/basic/run_jupyter_notebook_on_the_cloud.rst b/docs/source-app/levels/basic/run_jupyter_notebook_on_the_cloud.rst deleted file mode 100644 index 9bc5cdd5207ef..0000000000000 --- a/docs/source-app/levels/basic/run_jupyter_notebook_on_the_cloud.rst +++ /dev/null @@ -1,20 +0,0 @@ -:orphan: - -############################################# -Example: Develop a Jupyter Notebook component -############################################# - -**Prereqs:** You have an app already running locally. - ----- - -**************************** -What is the Lightning Cloud? -**************************** -The Lightning Cloud is the platform that we've created to interface with the cloud providers. Today -the Lightning Cloud supports AWS. - -.. note:: Support for GCP and Azure is coming soon! - -To use the Lightning Cloud, you buy credits that are used to pay the cloud providers. If you want to run -on your own AWS credentials, please contact us (support@lightning.ai) so we can get your clusters set up for you. diff --git a/docs/source-app/levels/basic/save_money_on_cloud_costs.rst b/docs/source-app/levels/basic/save_money_on_cloud_costs.rst deleted file mode 100644 index b2ff007ca8bba..0000000000000 --- a/docs/source-app/levels/basic/save_money_on_cloud_costs.rst +++ /dev/null @@ -1,51 +0,0 @@ -################################## -Level 3: Save money on cloud costs -################################## -**Audience:** Users who want to use the AWS cloud efficiently. - -**Prereqs:** You must have finished :doc:`level 1 <../basic/build_a_lightning_component>`. - ----- - -*********************************** -Save money with these optimizations -*********************************** -A Lightning component gives you fine-grained control over the cloud lifecycle of that component. - -Here are a few features that will enable you to save a lot on your cloud costs: - -.. lit_tabs:: - :titles: 15+ accelerators; Auto-stop idle machines; Auto-timeout submitted work; Use spot machines (~70% discount); Work with massive datasets; Mount cloud storage; Use a custom container - :code_files: ./key_features/accelerators.py; ./key_features/idle_machine.py; ./key_features/auto_timeout.py; ./key_features/spot.py; ./key_features/massive_dataset.py; ./key_features/mount_data.py; ./key_features/custom_container.py; - :highlights: 11; 11; 11; 11; 11; 1, 2, 8, 11, 12; 11 - :enable_run: true - :tab_rows: 3 - :height: 430px - ----- - -****************************** -Next: Coordinate 2+ components -****************************** -Now that you know how to organize arbitrary code inside a Lightning component, -learn to coordinate 2 or more components into workflows which we call Lightning apps. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Intermediate skills - :description: Learn to coordinate 2+ components into workflows which we call Lightning apps. - :button_link: ../intermediate/index.html - :col_css: col-md-12 - :height: 170 - :tag: 15 minutes - -.. raw:: html - -
-
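The key-features files deleted above each show one cost control in isolation; they should compose on a single ``CloudCompute`` (combining them is not demonstrated in this patch). A sketch:

.. code-block:: python

    # sketch: combine the cost controls from the key_features files above
    from lightning.app import LightningWork, LightningApp, CloudCompute

    class YourComponent(LightningWork):
        def run(self):
            print('RUN ANY PYTHON CODE HERE')

    compute = CloudCompute(
        'gpu',
        spot=True,        # ~70% discount; may be stopped without notice
        wait_timeout=60,  # after 60 seconds of waiting, switch to full-price
        idle_timeout=10,  # stop the machine when idle for 10 seconds
        disk_size=100,    # use 100 GB of disk on that machine
    )
    app = LightningApp(YourComponent(cloud_compute=compute))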
diff --git a/docs/source-app/levels/basic/scripts/toy_app_1_component.py b/docs/source-app/levels/basic/scripts/toy_app_1_component.py deleted file mode 100644 index e09df3ecb0d9c..0000000000000 --- a/docs/source-app/levels/basic/scripts/toy_app_1_component.py +++ /dev/null @@ -1,17 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp - -class Component(LightningWork): - def run(self, x): - print(x) - - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.component = Component() - - def run(self): - self.component.run('i love Lightning') - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/basic/scripts/toy_app_1_component_pdb.py b/docs/source-app/levels/basic/scripts/toy_app_1_component_pdb.py deleted file mode 100644 index 6348f9c4ed46e..0000000000000 --- a/docs/source-app/levels/basic/scripts/toy_app_1_component_pdb.py +++ /dev/null @@ -1,18 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app.pdb import set_trace - -class Component(LightningWork): - def run(self, x): - print(x) - set_trace() - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.component = Component() - - def run(self): - self.component.run('i love Lightning') - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/basic/train_pytorch_on_the_cloud.rst b/docs/source-app/levels/basic/train_pytorch_on_the_cloud.rst deleted file mode 100644 index 1860094624f15..0000000000000 --- a/docs/source-app/levels/basic/train_pytorch_on_the_cloud.rst +++ /dev/null @@ -1,20 +0,0 @@ -:orphan: - -################################### -Example: Train PyTorch on the cloud -################################### - -**Prereqs:** You have an app already running locally. - ----- - -**************************** -What is the Lightning Cloud? -**************************** -The Lightning Cloud is the platform that we've created to interface with the cloud providers. Today -the Lightning Cloud supports AWS. - -.. note:: Support for GCP and Azure is coming soon! - -To use the Lightning Cloud, you buy credits that are used to pay the cloud providers. If you want to run -on your own AWS credentials, please contact us (support@lightning.ai) so we can get your clusters set up for you. diff --git a/docs/source-app/levels/expert/index.rst b/docs/source-app/levels/expert/index.rst deleted file mode 100644 index c98199b9432e7..0000000000000 --- a/docs/source-app/levels/expert/index.rst +++ /dev/null @@ -1,82 +0,0 @@ -:orphan: - -.. _expert_level: - -.. toctree:: - :maxdepth: 1 - :hidden: - -############# -Expert skills -############# - - -.. raw:: html - -
-
- -.. displayitem:: - :header: Level: Use custom containers - :description: Learn to use a custom cloud container. - :button_link: build_a_machine_learning_workflow.html - :col_css: col-md-6 - :height: 150 - :tag: basic - - -.. raw:: html - -
-
- ---- - -*************** -Expert Examples -*************** -As you work through the expert levels, try these examples: - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example: Develop a Github Repo Script Runner - :description: Develop a workflow to execute Github Repos - :button_link: ../../examples/github_repo_runner/github_repo_runner.html - :col_css: col-md-6 - :height: 150 - :tag: basic - -.. displayitem:: - :header: Example: Develop a file server - :description: Create a simple Lightning App (App) that allows users to upload files and list the uploaded files. - :button_link: ../../examples/file_server/file_server.html - :col_css: col-md-6 - :height: 150 - :tag: basic - -.. displayitem:: - :header: Example: Develop a Jupyter Notebook component - :description: Develop a LightningWork that runs a notebook on the cloud. - :button_link: run_jupyter_notebook_on_the_cloud.html - :col_css: col-md-6 - :height: 150 - :tag: basic - -.. displayitem:: - :header: Example: Create a model demo - :description: Demo POCs and MVPs which can be shared with a public web user interface. - :button_link: create_a_model_demo.html - :col_css: col-md-6 - :height: 150 - :tag: basic - -.. raw:: html - -
-
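The custom-container level advertised above corresponds to the ``BuildConfig`` pattern from ``key_features/custom_container.py`` earlier in this patch; a minimal sketch:

.. code-block:: python

    # sketch: run a component inside a custom container image (any registry)
    from lightning.app import LightningWork, LightningApp, BuildConfig

    class YourComponent(LightningWork):
        def run(self):
            print('RUN ANY PYTHON CODE HERE')

    config = BuildConfig(image="gcr.io/google-samples/hello-app:1.0")
    app = LightningApp(YourComponent(cloud_build_config=config))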
diff --git a/docs/source-app/levels/intermediate/connect_lightning_components.rst b/docs/source-app/levels/intermediate/connect_lightning_components.rst deleted file mode 100644 index 31f6cc1516441..0000000000000 --- a/docs/source-app/levels/intermediate/connect_lightning_components.rst +++ /dev/null @@ -1,116 +0,0 @@ -#################################################### -Level 4: Connect components into a full stack AI app -#################################################### - -**Audience:** Users who want to build apps with multiple components. - -**Prereqs:** You know how to :doc:`build a component <../basic/build_a_lightning_component>`. - ----- - -**************************** -What is a full stack AI app? -**************************** -In the ML world, workflows coordinate multiple pieces of code working together. In Lightning, -when we coordinate 2 or more :doc:`Lightning components <../basic/build_a_lightning_component>` working together, -we instead call it a Lightning App. The difference will become more obvious when we introduce reactive -workflows in the advanced section. - -For the time being, we'll go over how to coordinate 2 components together in a traditional workflow setting -and explain how it works. - -.. note:: If you've used workflow tools for Python, this page describes conventional DAGs. - In :doc:`level 6 `, we introduce reactive workflows that generalize beyond DAGs - so you can build complex systems without much effort. - ----- - -*********** -The toy app -*********** - -In this app, we define two components that run across 2 separate machines. One to train a model on a GPU machine and one to analyze the model -on a separate CPU machine. We save money by stopping the GPU machine when the work is done. - -.. lit_tabs:: - :titles: Import Lightning; Define Component 1; Define Component 2; Orchestrator; Connect component 1; Connect component 2; Implement run; Train; Analyze; Define app placeholder - :descriptions: First, import Lightning; This component trains a model on a GPU machine; This component analyzes a model on a CPU machine; Define the LightningFlow that orchestrates components; Component 1 will run on a CPU machine; Component 2 will run on an accelerated GPU machine; Describe the workflow in the run method; Training runs first and completes; Analyze runs after training completes; This allows the app to be runnable - :code_files: ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py - :highlights: 2; 5-7; 9-11; 13; 16; 17; 19; 20; 21; 23 - :enable_run: true - :tab_rows: 4 - :height: 460px - -| - -Now run the app: - -.. lit_tabs:: - :titles: Run on Lightning cloud; Your own hardware - :descriptions: Run to see these 2 components execute on separate machines 🤯; Run it locally without code changes 🤯🤯; - :code_files: ./level_2_scripts/code_run_cloud.bash; ./level_2_scripts/code_run_local.bash - :tab_rows: 7 - :height: 195px - -| - -Now you can develop distributed cloud apps on your laptop 🤯🤯🤯🤯! - ----- - -************* -Now you know: -************* - -Without going out of your way, you're now doing the following: (Hint: Click **visualize** to see an animation describing the code). - -.. 
lit_tabs:: - :titles: Orchestration; Distributed cloud computing; Multi-machine communication; Multi-machine communication; Multi-cloud; - :descriptions: Define orchestration in Python with full control-flow; The two pieces of independent Python code ran on separate machines 🤯🤯; The text "CPU machine 1" was sent from the flow machine to the machine running the TrainComponent; The text "GPU machine 2" was sent from the flow machine to the machine running the AnalyzeComponent; The full Lightning app can move across clusters and clouds - :code_files: ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; ./level_2_scripts/hello_app.py; - :tab_rows: 4 - :highlights: 19-21; 16-17; 20; 21 - :images: | | | | - :height: 470px - ----- - -********************* -Maintain full control -********************* -Although we've abstracted the infrastructure, you still have full control when you need it: - -.. lit_tabs:: - :titles: Scheduler; Crontab syntax; Auto-scaling; Organized Python; Full Terraform control; - :descriptions: Although you can use Python timers, we have a scheduler shorthand; You can also use full cron syntax; Code your own auto-scaling syntax (Lightning plays well with Kubernetes); *Remember* components organize ANY Python code, which can even call external non-Python scripts such as optimized C++ model servers; Experts have the option to use Terraform to configure Lightning clusters - :code_files: ./level_2_scripts/hello_app_scheduler.py; ./level_2_scripts/hello_app_cron.py; ./level_2_scripts/hello_app_auto_scale.py; ./level_2_scripts/organized_app_python.py; - :tab_rows: 4 - :highlights: 24; 24; 21, 24, 27, 28; 9, 16, 17 - :height: 700px - ----- - -************************* -Next: Review how to debug -************************* -The next level does a 2-minute review to make sure you know how to debug a Lightning app. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 5: Debug a Lightning App - :description: Learn to debug a lightning app. - :button_link: debug_a_lightning_app.html - :col_css: col-md-12 - :height: 170 - :tag: 10 minutes - -.. raw:: html - -
-
diff --git a/docs/source-app/levels/intermediate/debug_a_lightning_app.rst b/docs/source-app/levels/intermediate/debug_a_lightning_app.rst deleted file mode 100644 index 8fdc7c67ae973..0000000000000 --- a/docs/source-app/levels/intermediate/debug_a_lightning_app.rst +++ /dev/null @@ -1,48 +0,0 @@ -############################## -Level 5: Debug a Lightning app -############################## -**Audience:** Users who want to debug a distributed app locally. - -**Prereqs:** You must have finished the :doc:`Basic levels <../basic/index>`. - ----- - -****************** -Enable breakpoints -****************** -To enable a breakpoint, use :func:`~lightning.app.pdb.set_trace` (note: direct Python pdb support is a work in progress and open to contributions). - -.. lit_tabs:: - :descriptions: Toy app; Add a breakpoint. When the program runs, it will stop at this line. - :code_files: ./debug_app_scripts/toy_app_1_component.py; ./debug_app_scripts/toy_app_1_component_pdb.py - :highlights: ; 8 - :enable_run: true - :tab_rows: 3 - :height: 350px - ----- - -********************************* -Next: Run a component in parallel -********************************* -Learn to run components in parallel to enable more powerful workflows. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 6: Run a Lightning component in parallel - :description: Learn when and how to run Components in parallel (asynchronous). - :button_link: run_lightning_work_in_parallel.html - :col_css: col-md-12 - :height: 150 - :tag: 15 minutes - -.. raw:: html - -
-
diff --git a/docs/source-app/levels/intermediate/debug_app_scripts/debug_app.py b/docs/source-app/levels/intermediate/debug_app_scripts/debug_app.py deleted file mode 100644 index cc2bbe16be35e..0000000000000 --- a/docs/source-app/levels/intermediate/debug_app_scripts/debug_app.py +++ /dev/null @@ -1,25 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app.runners import MultiProcessRuntime - - -class TrainComponent(LightningWork): - def run(self, x): - print(f'train a model on {x}') - -class AnalyzeComponent(LightningWork): - def run(self, x): - print(f'analyze model on {x}') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent() - self.analyze = AnalyzeComponent() - - def run(self): - self.train.run("GPU machine 1") - self.analyze.run("CPU machine 2") - -app = LightningApp(WorkflowOrchestrator()) -MultiProcessRuntime(app).dispatch() diff --git a/docs/source-app/levels/intermediate/debug_app_scripts/toy_app.py b/docs/source-app/levels/intermediate/debug_app_scripts/toy_app.py deleted file mode 100644 index 5f32ac07e2431..0000000000000 --- a/docs/source-app/levels/intermediate/debug_app_scripts/toy_app.py +++ /dev/null @@ -1,24 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp - - - -class TrainComponent(LightningWork): - def run(self, x): - print(f'train a model on {x}') - -class AnalyzeComponent(LightningWork): - def run(self, x): - print(f'analyze model on {x}') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent() - self.analyze = AnalyzeComponent() - - def run(self): - self.train.run("CPU machine 1") - self.analyze.run("CPU machine 2") - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/debug_app_scripts/toy_app_1_component.py b/docs/source-app/levels/intermediate/debug_app_scripts/toy_app_1_component.py deleted file mode 100644 index e09df3ecb0d9c..0000000000000 --- a/docs/source-app/levels/intermediate/debug_app_scripts/toy_app_1_component.py +++ /dev/null @@ -1,17 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp - -class Component(LightningWork): - def run(self, x): - print(x) - - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.component = Component() - - def run(self): - self.component.run('i love Lightning') - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/debug_app_scripts/toy_app_1_component_pdb.py b/docs/source-app/levels/intermediate/debug_app_scripts/toy_app_1_component_pdb.py deleted file mode 100644 index 6348f9c4ed46e..0000000000000 --- a/docs/source-app/levels/intermediate/debug_app_scripts/toy_app_1_component_pdb.py +++ /dev/null @@ -1,18 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app.pdb import set_trace - -class Component(LightningWork): - def run(self, x): - print(x) - set_trace() - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.component = Component() - - def run(self): - self.component.run('i love Lightning') - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/embed_web_ui_into_lightningwork.rst b/docs/source-app/levels/intermediate/embed_web_ui_into_lightningwork.rst deleted file mode 100644 
index 354a82023f3a3..0000000000000 --- a/docs/source-app/levels/intermediate/embed_web_ui_into_lightningwork.rst +++ /dev/null @@ -1,40 +0,0 @@ -###################################### -Level 9: Embed graphical UIs into work -###################################### -**Audience:** Users who need to embed a graphical UI in their Lightning Apps. - -**Prereqs:** You have finished :doc:`Level 8 <share_files_between_components>`. - ----- - -.. include:: ../../workflows/add_web_ui/index_content.rst - ----- - -******************************************* -Next steps: Practice adapting app templates -******************************************* -One of the most exciting powers of Lightning is the ability -to start an app from a template, replace or add components, -and build a powerful system. - ----- - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 10: Practice adapting app templates - :description: Practice starting apps from templates and evolving them by replacing or adding components. - :button_link: start_from_lightning_app_templates.html - :col_css: col-md-12 - :height: 150 - -.. raw:: html - -
-
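The included ``add_web_ui`` content is not part of this diff. The usual shape of the pattern, assuming the standard ``configure_layout`` hook on ``LightningFlow`` (an assumption; the hook is not shown in this patch):

.. code-block:: python

    # sketch: expose a work's web UI as a tab of the app (assumed configure_layout hook)
    from lightning.app import LightningFlow, LightningWork, LightningApp

    class UIWork(LightningWork):
        def run(self):
            ...  # e.g. start a streamlit/gradio server bound to self.host and self.port

    class Root(LightningFlow):
        def __init__(self):
            super().__init__()
            self.ui = UIWork(parallel=True)

        def run(self):
            self.ui.run()

        def configure_layout(self):
            # each dict becomes a tab that renders the work's exposed URL
            return [{"name": "Demo", "content": self.ui}]

    app = LightningApp(Root())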
diff --git a/docs/source-app/levels/intermediate/index.rst b/docs/source-app/levels/intermediate/index.rst deleted file mode 100644 index ed5c3912acb0c..0000000000000 --- a/docs/source-app/levels/intermediate/index.rst +++ /dev/null @@ -1,87 +0,0 @@ -.. _intermediate_level: - -.. toctree:: - :maxdepth: 1 - :hidden: - - connect_lightning_components - debug_a_lightning_app - run_lightning_work_in_parallel - share_variables_between_lightning_components - share_files_between_components - embed_web_ui_into_lightningwork - start_from_lightning_app_templates - -################### -Intermediate skills -################### -Learn to coordinate 2+ components into workflows which we call Lightning apps. - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 4: Coordinate 2+ components in a workflow - :description: Learn to coordinate 2_ components in a workflow which we call a Lightning app. - :button_link: connect_lightning_components.html - :col_css: col-md-6 - :height: 170 - :tag: 15 minutes - -.. displayitem:: - :header: Level 5: Debug a Lightning App - :description: Learn to debug a lightning app. - :button_link: debug_a_lightning_app.html - :col_css: col-md-6 - :height: 170 - :tag: 2 minutes - -.. displayitem:: - :header: Level 6: Run a Lightning component in parallel - :description: Learn when and how to run Components in parallel (asynchronous). - :button_link: run_lightning_work_in_parallel.html - :col_css: col-md-6 - :height: 150 - :tag: 10 minutes - -.. displayitem:: - :header: Level 7: Share variables between components - :description: Share variables between Lightning components. - :button_link: share_variables_between_lightning_components.html - :col_css: col-md-6 - :height: 150 - :tag: 15 minutes - -.. displayitem:: - :header: Level 8: Share files between components - :description: Learn how Drives share files between components - :button_link: share_files_between_components.html - :col_css: col-md-6 - :height: 150 - :tag: 20 minutes - -.. displayitem:: - :header: Level 9: Render a web UI with other components - :description: Learn how to embed graphical UIs like react, vue, streamlit and notebook UIs into a lightning workflow. - :button_link: embed_web_ui_into_lightningwork.html - :col_css: col-md-6 - :height: 150 - :tag: 15 minutes - -.. displayitem:: - :header: Level 10: Practice adapting app templates - :description: Practice starting apps from templates and evolving them by replacing or adding components. - :button_link: start_from_lightning_app_templates.html - :col_css: col-md-6 - :height: 150 - :tag: intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/levels/intermediate/level_12.rst b/docs/source-app/levels/intermediate/level_12.rst deleted file mode 100644 index 60c23909040a6..0000000000000 --- a/docs/source-app/levels/intermediate/level_12.rst +++ /dev/null @@ -1,12 +0,0 @@ -:orphan: - -###################### -Level 12: Flow vs Work -###################### -**Audience:** Users who need to do non trivial workloads in their apps. - -**Prereqs:** Level 8+ - ----- - -.. include:: ../../workflows/build_lightning_component/from_scratch_component_content.rst diff --git a/docs/source-app/levels/intermediate/level_2_scripts/code_run_cloud.bash b/docs/source-app/levels/intermediate/level_2_scripts/code_run_cloud.bash deleted file mode 100644 index 6594fe0ecac33..0000000000000 --- a/docs/source-app/levels/intermediate/level_2_scripts/code_run_cloud.bash +++ /dev/null @@ -1 +0,0 @@ -lightning run app app.py --cloud diff --git a/docs/source-app/levels/intermediate/level_2_scripts/code_run_local.bash b/docs/source-app/levels/intermediate/level_2_scripts/code_run_local.bash deleted file mode 100644 index 8a00b45e132ca..0000000000000 --- a/docs/source-app/levels/intermediate/level_2_scripts/code_run_local.bash +++ /dev/null @@ -1 +0,0 @@ -lightning run app app.py diff --git a/docs/source-app/levels/intermediate/level_2_scripts/hello_app.py b/docs/source-app/levels/intermediate/level_2_scripts/hello_app.py deleted file mode 100644 index 22edaa42add12..0000000000000 --- a/docs/source-app/levels/intermediate/level_2_scripts/hello_app.py +++ /dev/null @@ -1,23 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute - - -class TrainComponent(LightningWork): - def run(self, x): - print(f'train a model on {x}') - -class AnalyzeComponent(LightningWork): - def run(self, x): - print(f'analyze model on {x}') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent(cloud_compute=CloudCompute('cpu')) - self.analyze = AnalyzeComponent(cloud_compute=CloudCompute('gpu')) - - def run(self): - self.train.run("CPU machine 1") - self.analyze.run("GPU machine 2") - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/level_2_scripts/hello_app_auto_scale.py b/docs/source-app/levels/intermediate/level_2_scripts/hello_app_auto_scale.py deleted file mode 100644 index 5ea0903caa3c8..0000000000000 --- a/docs/source-app/levels/intermediate/level_2_scripts/hello_app_auto_scale.py +++ /dev/null @@ -1,30 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute - - -class TrainComponent(LightningWork): - def run(self, x): - print(f'train a model on {x}') - -class AnalyzeComponent(LightningWork): - def run(self, x): - print(f'analyze model on {x}') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent(cloud_compute=CloudCompute('cpu')) - self.analyze = AnalyzeComponent(cloud_compute=CloudCompute('gpu')) - - def run(self): - # run() starts the machine - self.train.run("GPU machine 1") - - # stop() stops the machine - self.train.stop() - - # run analysis ONLY when machine 1 stopped - if self.train.status.STOPPED: - self.analyze.run("CPU machine 2") - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/level_2_scripts/hello_app_cron.py b/docs/source-app/levels/intermediate/level_2_scripts/hello_app_cron.py deleted file mode 100644 index 
a5c1cdb8a7bee..0000000000000 --- a/docs/source-app/levels/intermediate/level_2_scripts/hello_app_cron.py +++ /dev/null @@ -1,27 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute - - -class TrainComponent(LightningWork): - def run(self, x): - print(f'train a model on {x}') - -class AnalyzeComponent(LightningWork): - def run(self, x): - print(f'analyze model on {x}') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent(cloud_compute=CloudCompute('cpu')) - self.analyze = AnalyzeComponent(cloud_compute=CloudCompute('gpu')) - - def run(self): - # run training once - self.train.run("CPU machine 1") - - # run analysis on a cron schedule (here: daily at 04:05) - if self.schedule("5 4 * * *"): - self.analyze.run("GPU machine 2") - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/level_2_scripts/hello_app_scheduler.py b/docs/source-app/levels/intermediate/level_2_scripts/hello_app_scheduler.py deleted file mode 100644 index a7ca0f415dfcf..0000000000000 --- a/docs/source-app/levels/intermediate/level_2_scripts/hello_app_scheduler.py +++ /dev/null @@ -1,27 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute - - -class TrainComponent(LightningWork): - def run(self, x): - print(f'train a model on {x}') - -class AnalyzeComponent(LightningWork): - def run(self, x): - print(f'analyze model on {x}') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent(cloud_compute=CloudCompute('cpu')) - self.analyze = AnalyzeComponent(cloud_compute=CloudCompute('gpu')) - - def run(self): - # run training once - self.train.run("CPU machine 1") - - # run analysis once, then every hour again... 
- if self.schedule("hourly"): - self.analyze.run("GPU machine 2") - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/level_2_scripts/organized_app_python.py b/docs/source-app/levels/intermediate/level_2_scripts/organized_app_python.py deleted file mode 100644 index e5e21d856f74d..0000000000000 --- a/docs/source-app/levels/intermediate/level_2_scripts/organized_app_python.py +++ /dev/null @@ -1,36 +0,0 @@ -# app.py -import subprocess -from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute - - -class ExternalModelServer(LightningWork): - def run(self): - # compile, then launch the external (non-Python) model server - process = subprocess.Popen(['g++', 'model_server.cpp', '-o', 'model_server']) - process.wait() - process = subprocess.Popen('./model_server') - process.wait() - -class LocustLoadTester(LightningWork): - def run(self): - cmd = f'locust --master-host {self.host} --master-port {self.port}' - process = subprocess.Popen(cmd.split()) - process.wait() - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.serve = ExternalModelServer( - cloud_compute=CloudCompute('cpu'), parallel=True - ) - self.load_test = LocustLoadTester(cloud_compute=CloudCompute('cpu')) - - def run(self): - # start the server (on CPU machine 1) - self.serve.run() - - # load test when the server is up (on a separate CPU machine 2) - if self.serve.status.RUNNING: - self.load_test.run() - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/level_9.rst b/docs/source-app/levels/intermediate/level_9.rst deleted file mode 100644 index 344c321bac79b..0000000000000 --- a/docs/source-app/levels/intermediate/level_9.rst +++ /dev/null @@ -1,16 +0,0 @@ -:orphan: - -################### -Level 9: Event loop -################### -**Audience:** Users who want to build reactive Lightning Apps and move beyond DAGs. - -**Prereqs:** Level 8+ - ----- - -Drawing inspiration from modern web frameworks like `React.js <https://reactjs.org/>`_, the Lightning App runs all flows in an **event loop** (forever), which is triggered several times a second after collecting any works' state changes. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/lightning_loop.gif - -When running a Lightning App in the cloud, the ``LightningWork`` components run on different machines. Each LightningWork communicates its state changes to the **event loop**, which re-executes the flow with the newly collected state. diff --git a/docs/source-app/levels/intermediate/run_lightning_work_in_parallel.rst b/docs/source-app/levels/intermediate/run_lightning_work_in_parallel.rst deleted file mode 100644 index 7a561893b2a35..0000000000000 --- a/docs/source-app/levels/intermediate/run_lightning_work_in_parallel.rst +++ /dev/null @@ -1,38 +0,0 @@ -############################################## -Level 6: Run a Lightning component in parallel -############################################## -**Audience:** Users who want to run a Lightning Component in parallel (asynchronously). - -**Prereqs:** You must have finished :doc:`Level 5 <debug_a_lightning_app>`. - ----- - -.. include:: ../../workflows/run_work_in_parallel_content.rst - ----- - -********************************************** -Next steps: Share variables between components -********************************************** -Now that you know how to run components in parallel, we'll learn to share variables -across components to simplify complex workflows. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 7: Share variables between components - :description: Learn to connect components - :col_css: col-md-12 - :button_link: share_variables_between_lightning_components.html - :height: 150 - :tag: 10 minutes - -.. raw:: html - -
-
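The mechanics behind this level are visible in the scripts deleted further down (``two_comms_non_ml.py``, ``two_work_comms.py``): passing ``parallel=True`` to a ``LightningWork`` makes its ``run`` non-blocking, so the flow loop continues immediately. A stripped-down sketch:

.. code-block:: python

    # sketch: a parallel (non-blocking) work, same pattern as two_work_comms.py below
    from lightning.app import LightningWork, LightningFlow, LightningApp

    class LongRunning(LightningWork):
        def run(self):
            ...  # long-running job, e.g. training

    class Root(LightningFlow):
        def __init__(self):
            super().__init__()
            self.worker = LongRunning(parallel=True)  # run() returns immediately

        def run(self):
            self.worker.run()
            # the flow keeps looping here while the work executes elsewhere

    app = LightningApp(Root())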
diff --git a/docs/source-app/levels/intermediate/scripts/.storage/a b/docs/source-app/levels/intermediate/scripts/.storage/a deleted file mode 100644 index 1c6c4ccab0cee0d58b978c1952d2b9bccf2213a3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22 YcmZo*naa%o0kKmwycxZjyqQz=04ZhzxBvhE diff --git a/docs/source-app/levels/intermediate/scripts/.storage/embeddings b/docs/source-app/levels/intermediate/scripts/.storage/embeddings deleted file mode 100644 index af3ee639fa570840013e8c0cb187c94491ba64c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36 hcmZo*nJUQu0kKmwfXJKCo5`Cw6~qG4-Ylu9dH{sD2dn@9 diff --git a/docs/source-app/levels/intermediate/scripts/a b/docs/source-app/levels/intermediate/scripts/a deleted file mode 100644 index 1c6c4ccab0cee0d58b978c1952d2b9bccf2213a3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22 YcmZo*naa%o0kKmwycxZjyqQz=04ZhzxBvhE diff --git a/docs/source-app/levels/intermediate/scripts/comms_1.py b/docs/source-app/levels/intermediate/scripts/comms_1.py deleted file mode 100644 index d62d047c545f6..0000000000000 --- a/docs/source-app/levels/intermediate/scripts/comms_1.py +++ /dev/null @@ -1,18 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp - -class Component(LightningWork): - def run(self, x): - print(f'MACHINE 1: this string came from machine 0: "{x}"') - print('MACHINE 1: this string is on machine 1') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.component = Component() - - def run(self): - x = 'hello from machine 0' - self.component.run(x) - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/scripts/debug_app.py b/docs/source-app/levels/intermediate/scripts/debug_app.py deleted file mode 100644 index cc2bbe16be35e..0000000000000 --- a/docs/source-app/levels/intermediate/scripts/debug_app.py +++ /dev/null @@ -1,25 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app.runners import MultiProcessRuntime - - -class TrainComponent(LightningWork): - def run(self, x): - print(f'train a model on {x}') - -class AnalyzeComponent(LightningWork): - def run(self, x): - print(f'analyze model on {x}') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent() - self.analyze = AnalyzeComponent() - - def run(self): - self.train.run("GPU machine 1") - self.analyze.run("CPU machine 2") - -app = LightningApp(WorkflowOrchestrator()) -MultiProcessRuntime(app).dispatch() diff --git a/docs/source-app/levels/intermediate/scripts/embeddings b/docs/source-app/levels/intermediate/scripts/embeddings deleted file mode 100644 index af3ee639fa570840013e8c0cb187c94491ba64c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36 hcmZo*nJUQu0kKmwfXJKCo5`Cw6~qG4-Ylu9dH{sD2dn@9 diff --git a/docs/source-app/levels/intermediate/scripts/toy_app.py b/docs/source-app/levels/intermediate/scripts/toy_app.py deleted file mode 100644 index 5f32ac07e2431..0000000000000 --- a/docs/source-app/levels/intermediate/scripts/toy_app.py +++ /dev/null @@ -1,24 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp - - - -class TrainComponent(LightningWork): - def run(self, x): - print(f'train a model on {x}') - -class AnalyzeComponent(LightningWork): - def run(self, x): 
- print(f'analyze model on {x}') - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent() - self.analyze = AnalyzeComponent() - - def run(self): - self.train.run("CPU machine 1") - self.analyze.run("CPU machine 2") - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/scripts/toy_payload.py b/docs/source-app/levels/intermediate/scripts/toy_payload.py deleted file mode 100644 index 473b450a5f56f..0000000000000 --- a/docs/source-app/levels/intermediate/scripts/toy_payload.py +++ /dev/null @@ -1,32 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app import storage - - -class EmbeddingProcessor(LightningWork): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.embeddings = None - - def run(self): - print('PROCESSOR: Generating embeddings...') - fake_embeddings = [[1, 2, 3], [2, 3, 4]] - self.embeddings = storage.Payload(fake_embeddings) - -class EmbeddingServer(LightningWork): - def run(self, payload): - print('SERVER: Using embeddings from processor', payload) - embeddings = payload.value - print('serving embeddings sent from EmbeddingProcessor: ', embeddings) - -class WorkflowOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.processor = EmbeddingProcessor() - self.server = EmbeddingServer() - - def run(self): - self.processor.run() - self.server.run(self.processor.embeddings) - -app = LightningApp(WorkflowOrchestrator()) diff --git a/docs/source-app/levels/intermediate/scripts/two_comms_non_ml.py b/docs/source-app/levels/intermediate/scripts/two_comms_non_ml.py deleted file mode 100644 index 1a9e32589c110..0000000000000 --- a/docs/source-app/levels/intermediate/scripts/two_comms_non_ml.py +++ /dev/null @@ -1,36 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp -import time - -class A(LightningWork): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.msg_changed = False - self.new_msg = '' - - def run(self): - # pretend to train and save a checkpoint every 10 steps - for step in range(1000): - time.sleep(1.0) - if step % 10 == 0: - self.msg_changed = True - self.new_msg = f'A is at step: {step}' - print(self.new_msg) - -class B(LightningWork): - def run(self, msg): - print(f'B: message from A: {msg}') - -class Example(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.a = A(parallel=True) - self.b = B(parallel=True) - - def run(self): - self.a.run() - if self.a.msg_changed: - self.a.msg_changed = False - self.b.run(self.a.new_msg) - -app = LightningApp(Example()) diff --git a/docs/source-app/levels/intermediate/scripts/two_work_comms.py b/docs/source-app/levels/intermediate/scripts/two_work_comms.py deleted file mode 100644 index df37a468253fe..0000000000000 --- a/docs/source-app/levels/intermediate/scripts/two_work_comms.py +++ /dev/null @@ -1,35 +0,0 @@ -# app.py -from lightning.app import LightningWork, LightningFlow, LightningApp -import time - -class TrainComponent(LightningWork): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.last_checkpoint_path = None - - def run(self): - # pretend to train and save a checkpoint every 10 steps - for step in range(1000): - time.sleep(1.0) - fake_loss = round(1/(step + 0.00001), 4) - print(f'{step=}: {fake_loss=} ') - if step % 10 == 0: - self.last_checkpoint_path = f'/some/path/{step=}_{fake_loss=}' - print(f'TRAIN COMPONENT: 
saved new checkpoint: {self.last_checkpoint_path}') - -class ModelDeploymentComponent(LightningWork): - def run(self, new_checkpoint): - print(f'DEPLOY COMPONENT: load new model from checkpoint: {new_checkpoint}') - -class ContinuousDeployment(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.train = TrainComponent(parallel=True) - self.model_deployment = ModelDeploymentComponent(parallel=True) - - def run(self): - self.train.run() - if self.train.last_checkpoint_path: - self.model_deployment.run(self.train.last_checkpoint_path) - -app = LightningApp(ContinuousDeployment()) diff --git a/docs/source-app/levels/intermediate/share_files_between_components.rst b/docs/source-app/levels/intermediate/share_files_between_components.rst deleted file mode 100644 index 89c170f350a37..0000000000000 --- a/docs/source-app/levels/intermediate/share_files_between_components.rst +++ /dev/null @@ -1,34 +0,0 @@ -####################################### -Level 8: Share files between components -####################################### -**Audience:** Users who are moving large files such as artifacts or datasets. - -**Prereqs:** Level 6+ - ----- - -************************************************* -Next steps: Render a web UI with other components -************************************************* -Now that we know the key ways of sharing files and variables, -we'll apply it to embed web UIs alongside components. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 9: Render a web UI with other components - :description: Learn how to embed graphical UIs like react, vue, streamlit and notebook UIs into a lightning workflow. - :button_link: embed_web_ui_into_lightningwork.html - :col_css: col-md-12 - :height: 150 - :tag: 15 minutes - -.. raw:: html - -
-
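The page above names Drives, but the included content is not in this diff. A sketch of the pattern, assuming the ``Drive`` API from ``lightning.app.storage`` (``put``/``get`` by relative path; an assumption, not shown in this patch):

.. code-block:: python

    # sketch: share a file between two works through a Drive (assumed API, see above)
    from lightning.app import LightningWork, LightningFlow, LightningApp
    from lightning.app.storage import Drive

    class Producer(LightningWork):
        def run(self, drive: Drive):
            with open("artifact.txt", "w") as f:
                f.write("some artifact")
            drive.put("artifact.txt")  # upload into the shared drive

    class Consumer(LightningWork):
        def run(self, drive: Drive):
            drive.get("artifact.txt")  # download from the shared drive
            print(open("artifact.txt").read())

    class Root(LightningFlow):
        def __init__(self):
            super().__init__()
            self.drive = Drive("lit://artifacts")
            self.producer = Producer()
            self.consumer = Consumer()

        def run(self):
            self.producer.run(self.drive)
            self.consumer.run(self.drive)

    app = LightningApp(Root())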
diff --git a/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst b/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst deleted file mode 100644 index 81bbfbf718638..0000000000000 --- a/docs/source-app/levels/intermediate/share_variables_between_lightning_components.rst +++ /dev/null @@ -1,162 +0,0 @@ -########################################### -Level 7: Share variables between components -########################################### -**Audience:** Users who want to share variables and files across Lightning components. - -**Prereqs:** You must have finished `intermediate level 5+ `_. - ----- - -**************************************** -Send a variable from Flow to a Component -**************************************** -When a variable is defined on the LightningFlow (orchestrator), and -then it's passed into functions for the work components, under the hood -Lightning sends the variables across the machines for you automatically. - -.. lit_tabs:: - :descriptions: Remember this component may live on its own machine; The flow may be on a separate machine as well; This variable is on the flow machine; When passed to the work component, it is actually sent across the network under the hood.; When it prints here, it prints on the work component machine (not the flow machine); The second string was directly created on machine 1 - :code_files: ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py; ./scripts/comms_1.py - :highlights: 4-7; 9-16; 15; 16; 6; 7; - :enable_run: true - :tab_rows: 3 - :height: 380px - -| - -.. collapse:: CLI output - - .. code-block:: - - $ lightning run app app.py --open-ui=false - - Your Lightning App is starting. This won't take long. - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - MACHINE 1: this string came from machine 0: "hello from machine 0" - MACHINE 1: this string is on machine 1 - -| - -In this example, we learned that we can send variables to components like in regular Python. -On a local machine, it will behave like Python. When the workflow is distributed on the cloud, -it makes network calls under the hood, but still functions like Python to you. - ----- - -************************************** -Send a variable between two components -************************************** -A majority of workflows (especially in ML), require components to respond to a change in a component -likely running on a separate machine or even cluster. - -Example Continuous deployment: Every time a model saves a checkpoint, we redeploy a model: - -.. lit_tabs:: - :descriptions: Define a component that simulates training; Define a component that simulates deployment; Training will happen in parallel over a long period; The deployment server also runs in parallel forever; Start training in parallel (could take months); Whenever the model has a checkpoint deploy; When the checkpoint is updated, model re-deploys - :code_files: ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py; ./scripts/two_work_comms.py - :highlights: 5-18; 20-22; 27; 28; 31; 32, 33; 33 - :enable_run: true - :tab_rows: 3 - :height: 690px - -| - -.. collapse:: CLI output: - - .. code:: - - $ lightning run app app.py --open-ui=false - - Your Lightning App is starting. This won't take long. - INFO: Your app has started. 
View it in your browser: http://127.0.0.1:7501/view - step=0: fake_loss=100000.0 - TRAIN COMPONENT: saved new checkpoint: /some/path/step=0_fake_loss=100000.0 - step=1: fake_loss=1.0 - DEPLOY COMPONENT: load new model from checkpoint: /some/path/step=0_fake_loss=100000.0 - step=2: fake_loss=0.5 - step=3: fake_loss=0.3333 - step=4: fake_loss=0.25 - step=5: fake_loss=0.2 - step=6: fake_loss=0.1667 - step=7: fake_loss=0.1429 - step=8: fake_loss=0.125 - step=9: fake_loss=0.1111 - step=10: fake_loss=0.1 - TRAIN COMPONENT: saved new checkpoint: /some/path/step=10_fake_loss=0.1 - DEPLOY COMPONENT: load new model from checkpoint: /some/path/step=10_fake_loss=0.1 - step=11: fake_loss=0.0909 - step=12: fake_loss=0.0833 - step=13: fake_loss=0.0769 - step=14: fake_loss=0.0714 - step=15: fake_loss=0.0667 - step=16: fake_loss=0.0625 - step=17: fake_loss=0.0588 - step=18: fake_loss=0.0556 - step=19: fake_loss=0.0526 - step=20: fake_loss=0.05 - TRAIN COMPONENT: saved new checkpoint: /some/path/step=20_fake_loss=0.05 - DEPLOY COMPONENT: load new model from checkpoint: /some/path/step=20_fake_loss=0.05 - step=21: fake_loss=0.0476 - ----- - -******************************************** -Send a large variable between two components -******************************************** -For large variables such as arrays, tensors, embeddings and so on, use Payload to enable -transferring them across components. - -.. lit_tabs:: - :descriptions: Let's define a component to simulate generating embeddings (from a DB, feature store, etc.); This component simulates a server that will use the embeddings.; Run the component to generate the embeddings; Simulate embeddings as an array. Here you would query a DB, load from a feature store or disk or even use a neural network to extract the embedding.; Allow the embeddings to be transferred efficiently by wrapping them in the Payload object.; Pass the variable to the EmbeddingServer (just the pointer).; The data gets transferred once you use the .value attribute in the other component. - :code_files: ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; ./scripts/toy_payload.py; - :highlights: 6-14; 16-20; 29; 13; 14; 30; 19 - :enable_run: true - :tab_rows: 3 - :height: 600px - -| - -.. collapse:: CLI output - - .. code:: - - $ lightning run app app.py --open-ui=false - - Your Lightning App is starting. This won't take long. - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - PROCESSOR: Generating embeddings... - SERVER: Using embeddings from processor - serving embeddings sent from EmbeddingProcessor: [[1, 2, 3], [2, 3, 4]] - -| - -The Payload object keeps the data on the machine and passes a pointer -to the data around the app until the data is needed by a component. - ----- - -****************************************** -Next steps: Share files between components -****************************************** -Now that you know how to share variables between components, we'll learn to share files -across components to simplify complex workflows. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Level 8: Share files between components - :description: Learn to share files between components. - :col_css: col-md-12 - :button_link: share_files_between_components.html - :height: 150 - :tag: 10 minutes - -.. raw:: html - -
-
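Since ``scripts/toy_payload.py`` itself is not reproduced in this diff, here is a minimal sketch of
what a Payload-based exchange between two works might look like. The class and attribute names are
assumptions made for illustration; only the printed strings match the CLI output shown above.

.. code:: python

    from lightning.app import LightningApp, LightningFlow, LightningWork
    from lightning.app.storage import Payload


    class EmbeddingProcessor(LightningWork):
        def __init__(self):
            super().__init__()
            self.embeddings = None

        def run(self):
            print("PROCESSOR: Generating embeddings...")
            # Wrap the large object so only a reference is passed through the flow.
            self.embeddings = Payload([[1, 2, 3], [2, 3, 4]])


    class EmbeddingServer(LightningWork):
        def run(self, embeddings: Payload):
            print("SERVER: Using embeddings from processor")
            # The actual data is only transferred when `.value` is accessed here.
            print(f"serving embeddings sent from EmbeddingProcessor: {embeddings.value}")


    class Root(LightningFlow):
        def __init__(self):
            super().__init__()
            self.processor = EmbeddingProcessor()
            self.server = EmbeddingServer()

        def run(self):
            self.processor.run()
            if self.processor.embeddings is not None:
                self.server.run(self.processor.embeddings)


    app = LightningApp(Root())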
diff --git a/docs/source-app/levels/intermediate/start_from_lightning_app_templates.rst b/docs/source-app/levels/intermediate/start_from_lightning_app_templates.rst deleted file mode 100644 index 2b5ae06e48022..0000000000000 --- a/docs/source-app/levels/intermediate/start_from_lightning_app_templates.rst +++ /dev/null @@ -1,34 +0,0 @@ -############################################ -Level 10: Start from lightning app templates -############################################ -**Audience:** All users who want to move quickly with Lightning - -**Prereqs:** You have finished :doc:`Level 9 `. - ----- - -**************************************************** -Next step: Learn to build powerful nested components -**************************************************** -Now that you can build powerful apps, learn to build nested components -that can do things like start dynamic works and connect to each other -via networking or CLI commands. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Advanced skills - :description: Learn to build nested components with advanced functionality. - :button_link: ../advanced/index.html - :col_css: col-md-12 - :height: 170 - -.. raw:: html - -
-
diff --git a/docs/source-app/make.bat b/docs/source-app/make.bat deleted file mode 100644 index 9b565142aecbf..0000000000000 --- a/docs/source-app/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=../build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd diff --git a/docs/source-app/moving_to_the_cloud.rst b/docs/source-app/moving_to_the_cloud.rst deleted file mode 100644 index a4d57e9fc026c..0000000000000 --- a/docs/source-app/moving_to_the_cloud.rst +++ /dev/null @@ -1,122 +0,0 @@ -:orphan: - -.. _moving_to_the_cloud: - -#################### -Moving to the Cloud -#################### - -.. warning:: This is in progress and not yet fully supported. - -In the :ref:`quick_start` guide, you learned how to implement a simple app -that trains an image classifier and serve it once trained. - -In this tutorial, you'll learn how to extend that application so that it works seamlessly -both locally and in the cloud. - ----- - -******************************** -Step 1: Distributed Application -******************************** - - -Distributed Storage -^^^^^^^^^^^^^^^^^^^ - -When running your application in a fully-distributed setting, the data available on one machine won't necessarily be available on another. - -To solve this problem, Lightning introduces the :class:`~lightning.app.storage.path.Path` object. -This ensures that your code can run both locally and in the cloud. - -The :class:`~lightning.app.storage.path.Path` object keeps track of the work which creates -the path. This enables Lightning to transfer the files correctly in a distributed setting. - -Instead of passing a string representing a file or directory, Lightning simply wraps -them into a :class:`~lightning.app.storage.path.Path` object and makes them an attribute of your LightningWork. - -Without doing this conscientiously for every single path, your application will fail in the cloud. - -In the example below, a file written by **SourceFileWork** is being transferred by the flow -to the **DestinationFileAndServeWork** work. The Path object is the reference to the file. - -.. literalinclude:: ../../examples/app/boring/app.py - :emphasize-lines: 5, 22, 28, 48 - -In the ``scripts/serve.py`` file, we are creating a **FastApi Service** running on port ``1111`` -that returns the content of the file received from **SourceFileWork** when -a post request is sent to ``/file``. - -.. literalinclude:: ../../examples/app/boring/scripts/serve.py - :emphasize-lines: 21, 23-26 - ----- - -Distributed Frontend -^^^^^^^^^^^^^^^^^^^^ - -In the above example, the **FastAPI Service** was running on one machine, -and the frontend UI in another. - -In order to assemble them, you need to do two things: - -* Provide **port** argument to your work's ``__init__`` method to expose a single service. - -Here's how to expose the port: - -.. 
literalinclude:: ../../examples/app/boring/app.py - :emphasize-lines: 8 - :lines: 33-44 - - -And here's how to expose your services within the ``configure_layout`` flow hook: - -.. literalinclude:: ../../examples/app/boring/app.py - :emphasize-lines: 5 - :lines: 53-57 - -In this example, we're appending ``/file`` to our **FastApi Service** url. -This means that our ``Boring Tab`` triggers the ``get_file_content`` from the **FastAPI Service** -and embeds its content as an `IFrame `_. - -.. literalinclude:: ../../examples/app/boring/scripts/serve.py - :lines: 23-26 - - -Here's a visualization of the application described above: - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/storage_ui.gif - :alt: Storage API Animation - :width: 100 % - ----- - -***************************** -Step 2: Scalable Application -***************************** - -The benefit of defining long-running code inside a -:class:`~lightning.app.core.work.LightningWork` -component is that you can run it on different hardware -by providing :class:`~lightning.app.utilities.packaging.cloud_compute.CloudCompute` to -the ``__init__`` method of your :class:`~lightning.app.core.work.LightningWork`. - -By adapting the :ref:`quick_start` example as follows, you can easily run your component on multiple GPUs: - - -Without doing much, you’re now running a script on its own cluster of machines! 🤯 - ----- - -***************************** -Step 3: Resilient Application -***************************** - -We designed Lightning with a strong emphasis on supporting failure cases. -The framework shines when the developer embraces our fault-tolerance best practices, -enabling them to create ML applications with a high degree of complexity as well as a strong support -for unhappy cases. - -An entire section would be dedicated to this concept. - -TODO diff --git a/docs/source-app/quickstart.rst b/docs/source-app/quickstart.rst deleted file mode 100644 index 99872c5f8ae46..0000000000000 --- a/docs/source-app/quickstart.rst +++ /dev/null @@ -1,125 +0,0 @@ -:orphan: - -.. _quick_start: - -############ -Quick Start -############ - -In this guide, we'll run an application that trains -an image classification model with the `MNIST Dataset `_, -and uses `Gradio `_ to serve it. - ----- - -********************** -Step 1 - Installation -********************** - -First, you'll need to install Lightning. You can find the complete guide here. - -Then, you'll need to install the `Lightning Quick Start package `_. - -.. code-block:: bash - - lightning install app lightning/quick-start - -And download the training script used by the App: - - ----- - -********************** -Step 2 - Run the app -********************** - -To run your app, copy the following command to your local terminal: - -.. code-block:: bash - - lightning run app app.py - -And that's it! - -.. admonition:: You should see the app logs in your terminal. - :class: dropdown - - .. code-block:: console - - Your Lightning App is starting. This won't take long. - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - - Global seed set to 42 - - GPU available: True (mps), used: False - TPU available: False, using: 0 TPU cores - - | Name | Type | Params | In sizes | Out sizes - ------------------------------------------------------------------ - 0 | model | Net | 1.2 M | [1, 1, 28, 28] | [1, 10] - 1 | val_acc | Accuracy | 0 | ? | ? 
-        ------------------------------------------------------------------
-        1.2 M     Trainable params
-        0         Non-trainable params
-        1.2 M     Total params
-        Epoch 4: 100%|█████████████████████████| 16/16 [00:00<00:00, 32.31it/s, loss=0.0826, v_num=0]
-        `Trainer.fit` stopped: `max_epochs=5` reached.
-
-        Running on local URL:  http://127.0.0.1:62782/
-        ...
-
-
-The app will open your browser and show an interactive demo:
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/qiuck-start-tensorboard-tab.png
-   :alt: Quick Start UI - Model Training Tab
-   :width: 100 %
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/quick-start-gradio-tab.png
-   :alt: Quick Start UI - Interactive Demo Tab
-   :width: 100 %
-
-----
-
-This app behind the scenes
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This application has one flow component which coordinates two works, each executing its own Python script.
-Once the training is finished, the trained model weights are passed to the serve component.
-
-
-Here is how the components of a Lightning app are structured:
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/quick_start_components.gif
-   :alt: Quick Start Application
-   :width: 100 %
-
-Here is the application timeline:
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/timeline.gif
-   :alt: Quick Start Timeline Application
-   :width: 100 %
-
-----
-
-**************************************
-Step 3 - Build your app in the cloud
-**************************************
-
-Simply add ``--cloud`` to run this application in the cloud 🤯
-
-.. code-block:: bash
-
-   lightning run app app.py --cloud
-
-Congratulations! You've now run your first application with Lightning.
-
-----
-
-***********
-Next Steps
-***********
-
-To learn how to build and modify apps, go to the :ref:`basics`.
-
-To learn how to create UIs for your apps, read :ref:`ui_and_frontends`.
diff --git a/docs/source-app/testing.rst b/docs/source-app/testing.rst
deleted file mode 100644
index 752291541000e..0000000000000
--- a/docs/source-app/testing.rst
+++ /dev/null
@@ -1,155 +0,0 @@
-:orphan:
-
-.. _testing:
-
-#######################
-Productionize your Apps
-#######################
-
-.. TODO: Cleanup
-
-At the core of our system is an integration testing framework that will allow for a first-class experience creating integration tests for Lightning Apps. This document will explain how we can create a lightning app test, how we can execute it, and where to find more information.
-
-----
-
-***********
-Philosophy
-***********
-
-Testing a Lightning app is unique. It is a superset of an application that converges machine learning, API development, and UI development. With that in mind, there are several philosophies (or "best practices") that you should adhere to:
-
-
-#. **Control your app state** - With integration and end to end tests, you have the capability to control your app's state through dependency injection. Use it!
-#. **Integration focuses on the work, End to End focuses on the app** - When writing tests, think of the depth and breadth of what you are writing. Write many integration tests since they are relatively cheap, while keeping the end to end tests for holistic app testing.
-#. **Don't overthink it** - What needs to be tested? What is the order of risk? These are the questions you should build with before writing your first line of code. Writing tests for the sake of writing tests is an exercise in futility. Write meaningful, impactful tests.
-#.
**Test Isolation** - Write your tests in an isolated manner. No two tests should ever depend on each other.
-#. **Use your framework** - Testing apps should be framework agnostic.
-#. **Have fun!** - At the heart of testing is experimentation. Like any experiment, tests begin with a hypothesis of workability, but you can extend that to be more inclusive. Ask the question, write the test to answer your question, and make sure you have fun while doing it.
-
-----
-
-****************************************
-Anatomy of a Lightning integration test
-****************************************
-
-The following is a PyTest example of an integration test using the ``lightning.app.testing`` module.
-
-.. code-block:: python
-
-    import os
-
-    from lightning.app import _PROJECT_ROOT
-    from lightning.app.testing import application_testing, LightningTestApp
-    from lightning.app.utilities.enum import AppStage
-
-
-    class TestLightningAppInt(LightningTestApp):
-        def run_once(self) -> bool:
-            if self.root.counter > 1:
-                print("V0 App End")
-                self.stage = AppStage.STOPPING
-                return True
-            return super().run_once()
-
-
-    def test_v0_app_example():
-        command_line = [
-            os.path.join(_PROJECT_ROOT, "examples/app_v0/app.py"),
-            "--blocking",
-            "False",
-            "--multiprocess",
-            "--open-ui",
-            "False",
-        ]
-        result = application_testing(TestLightningAppInt, command_line)
-        assert "V0 App End" in str(result.stdout_bytes)
-        assert result.exit_code == 0
-
-----
-
-Setting up the app
-^^^^^^^^^^^^^^^^^^
-
-Lightning apps are unique in that they represent a full stack model for your machine learning application. To be clear, the integration tests are *NOT* going to touch the UI flow. Instead we inject your application with helper methods that, when executed, can assist in validating your application.
-
-To get started, you simply need to import the following:
-
-.. code-block:: python
-
-    from lightning.app.testing import application_testing, LightningTestApp
-
-We will discuss ``application_testing`` in a bit, but first let's review the structure of ``LightningTestApp``.
-
-----
-
-LightningTestApp
-^^^^^^^^^^^^^^^^^
-
-The :class:`lightning.app.testing.testing.LightningTestApp` class is available to use for provisioning and setting up your testing needs. Note that you do not need this class to move forward with testing. Any application that inherits ``LightningApp`` should suffice as long as you override the correct methods. Reviewing the ``TestLightningAppInt`` example, we see some overrides that are already there. Please review the class for more information.
-
-.. code-block:: python
-
-    class TestLightningAppInt(LightningTestApp):
-        def run_once(self) -> bool:
-            if self.root.counter > 1:
-                print("V0 App End")
-                self.stage = AppStage.STOPPING
-                return True
-            return super().run_once()
-
-We create a test class overriding the ``run_once`` function. This function helps control the flow of your application and is run first. In this example we are calling ``self.root.counter`` and checking if the job has executed once. If so, we want to print out ``V0 App End`` and set the ``self.stage`` to ``AppStage.STOPPING``. This is how we control the flow through state. Your situation might be different, so experiment and see what you can do!
-
-Besides ``run_once`` there are a few other overrides available:
-
-
-* ``on_before_run_once`` - This runs before your ``run_once`` function kicks off. You can set up your application pre-conditions here.
-* ``on_after_run_once`` - Similar to ``on_before_run_once`` but after the ``run_once`` method is called (see the sketch below).
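A hypothetical subclass using both hooks might look like the following. This is a sketch only: it
assumes the hooks take no arguments, mirroring the ``run_once`` override above, and that deferring
to ``super()`` is safe.

.. code-block:: python

    from lightning.app.testing import LightningTestApp


    class MyTestApp(LightningTestApp):
        def on_before_run_once(self):
            # Set up pre-conditions before the loop iteration runs.
            print("about to run once")
            return super().on_before_run_once()

        def on_after_run_once(self):
            # Inspect or assert on state right after the loop iteration.
            print("finished one iteration")
            return super().on_after_run_once()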
-
-These methods can skew your tests if overused, so use them only when needed.
-
-----
-
-The Test
-^^^^^^^^
-
-We provide ``application_testing`` as a helper function to get your application up and running for testing. It uses ``click``\ 's invocation tooling underneath.
-
-.. code-block:: python
-
-    command_line = [
-        os.path.join(_PROJECT_ROOT, "examples/app_v0/app.py"),
-        "--blocking",
-        "False",
-        "--open-ui",
-        "False",
-    ]
-
-First in the list for ``command_line`` is the location of your script. It is an external file. In this example we have ``_PROJECT_ROOT`` but this is *not* a helper constant for you to utilize. You will need to provide the location yourself.
-
-Next there are a couple of options you can leverage:
-
-* ``blocking`` - Blocking is an app status that says "Do not run until I click run in the UI". For our integration test, since we are not using the UI, we are setting this to "False".
-* ``open-ui`` - We set this to false since this is the routine that opens a browser for your local execution.
-
-Once you have your command line ready, you will then be able to kick off the test and gather results:
-
-.. code-block:: python
-
-    result = application_testing(TestLightningAppInt, command_line)
-
-As mentioned earlier, ``application_testing`` is a helper method that allows you to inject your ``LightningTestApp`` subclass (with overrides) and the command-line flags. Once the process is done it returns the results back for parsing.
-
-.. code-block:: python
-
-    assert "V0 App End" in str(result.stdout_bytes)
-    assert result.exit_code == 0
-
-Since we injected "V0 App End" at the end of our test flow, the state was changed to ``AppStage.STOPPING``, which means the process is done. Finally, we check the result's exit code to make sure that we did not throw an error during execution.
-
-----
-
-************
-End to End
-************
-
-TODO
diff --git a/docs/source-app/ui_and_frontends.rst b/docs/source-app/ui_and_frontends.rst
deleted file mode 100644
index 3ffa5f983e7d8..0000000000000
--- a/docs/source-app/ui_and_frontends.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-:orphan:
-
-.. _ui_and_frontends:
-
-################
-UI and Frontends
-################
-
-
-The Lightning framework allows you to create customized, interactive UIs with the framework of your choice.
-
-You can easily embed other tools and services (like a GitHub repo, a `FastAPI Service `_, an `arXiv `_ paper or a `Dask Cluster `_ Admin page), or create a complete UI from scratch.
-
-
-To get started, you can use built-in templates for the following frameworks:
-
-* `React.js `_
-* `Streamlit `_
-
-
-
-To keep learning about Lightning, check out :ref:`moving_to_the_cloud`.
-This section covers best practices to seamlessly make your Lightning code work both locally and in the cloud.
diff --git a/docs/source-app/workflows/access_app_state.rst b/docs/source-app/workflows/access_app_state.rst
deleted file mode 100644
index 8f99534bd239b..0000000000000
--- a/docs/source-app/workflows/access_app_state.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-.. _access_app_state:
-
-################
-Access App State
-################
-
-**Audience:** Users who want to know how the App State can be accessed.
-
-**Level:** Basic
-
-**********************
-What is the App State?
-**********************
-
-In Lightning, each component is stateful, and its state is composed of all attributes defined within its **__init__** method.
-
-The **App State** is the collection of all the components' states forming the App.
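The ``app_01.py`` file referenced later on this page is not reproduced in this diff; a minimal
sketch consistent with the CLI output shown there (the attribute names ``w`` and ``counter`` are
taken from that output) could look like this:

.. code-block:: python

    from lightning.app import LightningApp, LightningFlow, LightningWork


    class Work(LightningWork):
        def __init__(self):
            super().__init__(parallel=True)
            self.counter = 0

        def run(self):
            while True:
                self.counter += 1


    class Flow(LightningFlow):
        def __init__(self):
            super().__init__()
            self.w = Work()

        def run(self):
            self.w.run()
            # The flow's state contains the work's attributes under `works/w/vars`.
            print(f"State: {self.state}")


    app = LightningApp(Flow())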
-
-************************************
-What is special about the App State?
-************************************
-
-The **App State** is always up-to-date, even running an App in the cloud on multiple machines.
-This means that every time an attribute is modified in a Work, that information is automatically
-broadcast to the Flow. With this mechanism, any Component can **react** to any other
-Component's **state changes** through the Flow, and complex systems can be easily implemented.
-Lightning requires a state-driven mindset when implementing the Flow.
-
-***************************************
-When do I need to access the App State?
-***************************************
-
-As a user, you are interacting with your component attributes, so most likely,
-you won't need to access the Component's state directly, but it can be helpful to
-understand how the state works under the hood.
-
-For example, here we define a **Flow** component and **Work** component, where the Work increments a counter indefinitely and the Flow prints its state, which contains the Work.
-
-You can easily check the state of your entire App as follows:
-
-.. literalinclude:: ../code_samples/quickstart/app_01.py
-
-Run the App with:
-
-.. code-block:: bash
-
-   lightning run app docs/quickstart/app_01.py
-
-And here's the output you get when running the App using **Lightning CLI**:
-
-.. code-block:: console
-
-   INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-   State: {'works': {'w': {'vars': {'counter': 1}}}}
-   State: {'works': {'w': {'vars': {'counter': 2}}}}
-   State: {'works': {'w': {'vars': {'counter': 3}}}}
-   State: {'works': {'w': {'vars': {'counter': 3}}}}
-   State: {'works': {'w': {'vars': {'counter': 4}}}}
-   ...
diff --git a/docs/source-app/workflows/add_components.rst b/docs/source-app/workflows/add_components.rst
deleted file mode 100644
index 3edff4c1c0c7d..0000000000000
--- a/docs/source-app/workflows/add_components.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-:orphan:
-
-###########################
-Add a component to your app
-###########################
-**Audience:** Users looking to expand the functionality of their Lightning apps.
-
-----
-
-*******************
-Install a component
-*******************
-
-Any Lightning component can be installed with:
-
-.. code:: bash
-
-   lightning install component org/the-component-name
-
-`Browse all community-built components here `_.
-
-.. note:: Components are being populated daily
-
-----
-
-**********************
-Contribute a component
-**********************
-One of the first principles of the Lightning community is to code something *once* for the benefit of everyone!
-
-To contribute a component, :doc:`follow this guide `.
diff --git a/docs/source-app/workflows/add_server/any_server.rst b/docs/source-app/workflows/add_server/any_server.rst
deleted file mode 100644
index 948ae22a2abc0..0000000000000
--- a/docs/source-app/workflows/add_server/any_server.rst
+++ /dev/null
@@ -1,187 +0,0 @@
-#########################
-Enable any server (basic)
-#########################
-**Audience:** Users who want to enable an arbitrary server/UI.
-
-**Prereqs:** Basic python knowledge.
-
-----
-
-*****************
-What is a server?
-*****************
-A server is a program that enables other programs or users to connect to it. As long as your server can listen on a port,
-you can enable it with a Lightning App.
- ----- - -*************************** -Add a server to a component -*************************** -Any server that listens on a port, can be enabled via a work. For example, here's a plain python server: - -.. code:: python - :emphasize-lines: 11-12 - - import socketserver - from http import HTTPStatus, server - - - class PlainServer(server.SimpleHTTPRequestHandler): - def do_GET(self): - self.send_response(HTTPStatus.OK) - self.end_headers() - # Data must be passed as bytes to the `self.wfile.write` call - html = b"

<h1>Hello lit world</h1>

" - self.wfile.write(html) - - - httpd = socketserver.TCPServer(("localhost", "3000"), PlainServer) - httpd.serve_forever() - -To enable the server inside the component, start the server in the run method and use the ``self.host`` and ``self.port`` properties: - -.. code:: python - :emphasize-lines: 14-15 - - import lightning as L - import socketserver - from http import HTTPStatus, server - - - class PlainServer(server.SimpleHTTPRequestHandler): - def do_GET(self): - self.send_response(HTTPStatus.OK) - self.end_headers() - # Data must be passed as bytes to the `self.wfile.write` call - html = b"

<h1>Hello lit world</h1>

" - self.wfile.write(html) - - - class LitServer(L.LightningWork): - def run(self): - httpd = socketserver.TCPServer((self.host, self.port), PlainServer) - httpd.serve_forever() - ----- - -************************************** -Route the server in the root component -************************************** -The final step, is to tell the Root component in which tab to render this component's output: -In this case, we render the ``LitServer`` output in the ``home`` tab of the application. - -.. code:: python - :emphasize-lines: 20, 23, 28 - - import lightning as L - import socketserver - from http import HTTPStatus, server - - - class PlainServer(server.SimpleHTTPRequestHandler): - def do_GET(self): - self.send_response(HTTPStatus.OK) - self.end_headers() - # Data must be passed as bytes to the `self.wfile.write` call - html = b"

<h1>Hello lit world</h1>

" - self.wfile.write(html) - - - class LitServer(L.LightningWork): - def run(self): - httpd = socketserver.TCPServer((self.host, self.port), PlainServer) - httpd.serve_forever() - - - class Root(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_server = LitServer(parallel=True) - - def run(self): - self.lit_server.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_server} - return tab1 - - - app = L.LightningApp(Root()) - -We use the ``parallel=True`` argument of ``LightningWork`` to run the server in parallel -while the rest of the Lightning App runs everything else. - ----- - -*********** -Run the app -*********** -Start the app to see your new UI! - -.. code:: bash - - lightning_app run app app.py - -To run the app on the cloud, use the ``--cloud`` argument. - -.. code:: bash - - lightning_app run app app.py --cloud - ----- - -***************************************** -Interact with a component from the server -***************************************** - -.. TODO:: how do we do this? - - ----- - -***************************************** -Interact with the server from a component -***************************************** - -.. TODO:: how do we do this? - ----- - -******** -Examples -******** -Here are a few example apps that expose a server via a component: - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example: Tensorboard - :description: TODO - :col_css: col-md-4 - :button_link: example_app.html - :height: 150 - -.. displayitem:: - :header: Example: Streamlit - :description: TODO - :col_css: col-md-4 - :button_link: example_app.html - :height: 150 - -.. displayitem:: - :header: Example: React - :description: TODO - :col_css: col-md-4 - :button_link: example_app.html - :height: 150 - -.. raw:: html - -
-
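The two "Interact with ..." sections above are still TODO on this page. One plausible pattern (an
assumption, not a documented Lightning API) is to query the running server from another work with a
plain HTTP request built from the exposed host and port:

.. code:: python

    import urllib.request

    import lightning as L


    class Client(L.LightningWork):
        def run(self, server_url: str):
            # `server_url` would be built by the flow from the server work's
            # attributes, e.g. f"http://{server.host}:{server.port}".
            with urllib.request.urlopen(server_url) as response:
                print(response.read().decode())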
diff --git a/docs/source-app/workflows/add_server/flask_basic.rst b/docs/source-app/workflows/add_server/flask_basic.rst deleted file mode 100644 index 38ca282346248..0000000000000 --- a/docs/source-app/workflows/add_server/flask_basic.rst +++ /dev/null @@ -1,155 +0,0 @@ -############################### -Add a web UI with Flask (basic) -############################### -**Audience:** Users who want to enable a flask app within a component. - -**Prereqs:** Basic python knowledge. - ----- - -************** -What is Flask? -************** -Flask is a web framework, that lets you develop web applications in Python easily. - ----- - -************************ -Add Flask to a component -************************ -First, define your flask app as you normally would without Lightning: - -.. code:: python - :emphasize-lines: 9 - - from flask import Flask - - flask_app = Flask(__name__) - - - @flask_app.route("/") - def hello(): - return "Hello, World!" - - - flask_app.run(host="0.0.0.0", port=80) - -To enable the server inside the component, start the Flask server in the run method and use the ``self.host`` and ``self.port`` properties: - -.. code:: python - :emphasize-lines: 12 - - import lightning as L - from flask import Flask - - - class LitFlask(L.LightningWork): - def run(self): - flask_app = Flask(__name__) - - @flask_app.route("/") - def hello(): - return "Hello, World!" - - flask_app.run(host=self.host, port=self.port) - ----- - -************************************** -Route the server in the root component -************************************** -The final step, is to tell the Root component in which tab to render this component's output: -In this case, we render the ``LitFlask`` output in the ``home`` tab of the application. - -.. code:: python - :emphasize-lines: 17, 23 - - import lightning as L - from flask import Flask - - - class LitFlask(L.LightningWork): - def run(self): - flask_app = Flask(__name__) - - @flask_app.route("/") - def hello(): - return "Hello, World!" - - flask_app.run(host=self.host, port=self.port) - - - class Root(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_flask = LitFlask(parallel=True) - - def run(self): - self.lit_flask.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_flask} - return tab1 - - - app = L.LightningApp(Root()) - -We use the ``parallel=True`` argument of ``LightningWork`` to run the server in the background -while the rest of the Lightning App runs everything else. - ----- - -*********** -Run the app -*********** -Start the app to see your new UI! - -.. code:: bash - - lightning run app app.py - -To run the app on the cloud, use the ``--cloud`` argument. - -.. code:: bash - - lightning run app app.py --cloud - ----- - -******** -Examples -******** -Here are a few example apps that expose a Flask server via a component: - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example 1 - :description: TODO - :col_css: col-md-4 - :button_link: example_app.html - :height: 150 - -.. displayitem:: - :header: Example 2 - :description: TODO - :col_css: col-md-4 - :button_link: example_app.html - :height: 150 - -.. displayitem:: - :header: Example 3 - :description: TODO - :col_css: col-md-4 - :button_link: example_app.html - :height: 150 - -.. raw:: html - -
-
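As a quick way to exercise the route from the example above without starting a server, Flask's
built-in test client can be used (a sketch; the test function name is arbitrary):

.. code:: python

    from flask import Flask

    flask_app = Flask(__name__)


    @flask_app.route("/")
    def hello():
        return "Hello, World!"


    def test_hello_route():
        client = flask_app.test_client()
        response = client.get("/")
        assert response.data == b"Hello, World!"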
diff --git a/docs/source-app/workflows/add_server/index.rst b/docs/source-app/workflows/add_server/index.rst deleted file mode 100644 index 1429b08679b2a..0000000000000 --- a/docs/source-app/workflows/add_server/index.rst +++ /dev/null @@ -1,8 +0,0 @@ -################################### -Run a server within a Lightning App -################################### -Any type of server can run inside a Lightning App. - ----- - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/add_server/index_content.rst b/docs/source-app/workflows/add_server/index_content.rst deleted file mode 100644 index d3623698cfb19..0000000000000 --- a/docs/source-app/workflows/add_server/index_content.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. toctree:: - :maxdepth: 1 - :hidden: - - any_server - flask_basic - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Any server - :description: Learn how to enable any server inside a Lightning App. - :col_css: col-md-6 - :button_link: any_server.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Flask - :description: Learn how to add a Flask server inside a Lightning App. - :col_css: col-md-6 - :button_link: flask_basic.html - :height: 150 - :tag: basic - - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_link.rst b/docs/source-app/workflows/add_web_link.rst deleted file mode 100644 index 01ffdf62ef3e3..0000000000000 --- a/docs/source-app/workflows/add_web_link.rst +++ /dev/null @@ -1,54 +0,0 @@ -############## -Add a web link -############## -**Audience:** Users who want to link to other pages from their app. - ----- - -************** -Add a url link -************** -In this example we'll replicate |urls_link|. - -To add a url link to an app, simply specify it in the ``configure_layout`` method -and connect the UIs. Create a file named **app.py** with this code: - -.. |urls_link| raw:: html - - the app running here - -.. code:: python - :emphasize-lines: 7,11 - - import lightning as L - - class LitApp(L.LightningFlow): - def configure_layout(self): - tab_1 = { - "name": "Logger", - "content": "https://bit.ly/tb-aasae" - } - tab_2 = { - "name": "Paper", - "content": "https://arxiv.org/pdf/2107.12329.pdf" - } - return tab_1, tab_2 - - app = L.LightningApp(LitApp()) - ----- - -*********** -Run the app -*********** -Run the app locally to see it! - -.. code:: python - - lightning run app app.py - -Now run it on the cloud as well: - -.. code:: python - - lightning run app app.py --cloud diff --git a/docs/source-app/workflows/add_web_ui/angular_js_intermediate.rst b/docs/source-app/workflows/add_web_ui/angular_js_intermediate.rst deleted file mode 100644 index 095dee362bf48..0000000000000 --- a/docs/source-app/workflows/add_web_ui/angular_js_intermediate.rst +++ /dev/null @@ -1,6 +0,0 @@ -:orphan: - -########################################### -Add a web UI with Angular.js (intermediate) -########################################### -coming... diff --git a/docs/source-app/workflows/add_web_ui/dash/basic.rst b/docs/source-app/workflows/add_web_ui/dash/basic.rst deleted file mode 100644 index 4316fc13a1992..0000000000000 --- a/docs/source-app/workflows/add_web_ui/dash/basic.rst +++ /dev/null @@ -1,221 +0,0 @@ -############################## -Add a web UI with Dash (basic) -############################## -**Audience:** Users who want to add a web UI with Dash by Plotly. - -**Prereqs:** Basic python knowledge. - ----- - -************* -What is Dash? -************* -`Dash `_ is the original low-code framework for rapidly building data apps in Python, R, Julia, and F# (experimental). - -Install Dash with: - -.. code:: bash - - pip install dash - ----- - -************************ -Create the dash demo app -************************ - -To explain how to use Dash with Lightning, let's build a simple app with Dash. - - -.. - To explain how to use Dash with Lightning, let's replicate the |dash_link|. - - .. |dash_link| raw:: html - - example running here - -In the next few sections we'll build an app step-by-step. -First **create a file named app.py** with the app content: - -.. 
code:: python

-    import lightning as L
-    import dash
-    import plotly.express as px
-
-    class LitDash(L.LightningWork):
-        def run(self):
-            dash_app = dash.Dash(__name__)
-            X = [1, 2, 3, 4, 5, 6]
-            Y = [2, 4, 8, 16, 32, 64]
-            fig = px.line(x=X, y=Y)
-
-            dash_app.layout = dash.html.Div(children=[
-                dash.html.H1(children='⚡ Hello Dash + Lightning⚡'),
-                dash.html.Div(children='The Dash framework running inside a ⚡ Lightning App'),
-                dash.dcc.Graph(id='example-graph', figure=fig)
-            ])
-
-            dash_app.run_server(host=self.host, port=self.port)
-
-    class LitApp(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.lit_dash = LitDash(parallel=True)
-
-        def run(self):
-            self.lit_dash.run()
-
-        def configure_layout(self):
-            tab1 = {"name": "home", "content": self.lit_dash}
-            return tab1
-
-    app = L.LightningApp(LitApp())
-
-
-Add 'dash' to a requirements.txt file:
-
-.. code:: bash
-
-    echo "dash" >> requirements.txt
-
-This is a best practice to make apps reproducible.
-
-----
-
-***********
-Run the app
-***********
-Run the app locally to see it!
-
-.. code:: bash
-
-    lightning run app app.py
-
-Now run it on the cloud as well:
-
-.. code:: bash
-
-    lightning run app app.py --cloud
-
-----
-
-************************
-Step-by-step walkthrough
-************************
-In this section, we explain each part of this code in detail.
-
-----
-
-0. Define a Dash app
-^^^^^^^^^^^^^^^^^^^^
-First, find the dash app you want to integrate. In this example, that app looks like:
-
-.. code:: python
-
-    import dash
-    import plotly.express as px
-
-    dash_app = dash.Dash(__name__)
-    X = [1, 2, 3, 4, 5, 6]
-    Y = [2, 4, 8, 16, 32, 64]
-    fig = px.line(x=X, y=Y)
-
-    dash_app.layout = dash.html.Div(children=[
-        dash.html.H1(children='⚡ Hello Dash + Lightning⚡'),
-        dash.html.Div(children='The Dash framework running inside a ⚡ Lightning App'),
-        dash.dcc.Graph(id='example-graph', figure=fig)
-    ])
-
-    dash_app.run_server(host='0.0.0.0', port=80)
-
-This dash app plots a simple line curve along with some HTML.
-`Visit the Dash documentation for the full API `_.
-
-----
-
-1. Add Dash to a component
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-Add the dash app to the run method of a ``LightningWork`` component and run the server on that component's **host** and **port**:
-
-.. code:: python
-    :emphasize-lines: 6, 18
-
-    import lightning as L
-    import dash
-    import plotly.express as px
-
-    class LitDash(L.LightningWork):
-        def run(self):
-            dash_app = dash.Dash(__name__)
-            X = [1, 2, 3, 4, 5, 6]
-            Y = [2, 4, 8, 16, 32, 64]
-            fig = px.line(x=X, y=Y)
-
-            dash_app.layout = dash.html.Div(children=[
-                dash.html.H1(children='⚡ Hello Dash + Lightning⚡'),
-                dash.html.Div(children='The Dash framework running inside a ⚡ Lightning App'),
-                dash.dcc.Graph(id='example-graph', figure=fig)
-            ])
-
-            dash_app.run_server(host=self.host, port=self.port)
-
-    class LitApp(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.lit_dash = LitDash(parallel=True)
-
-        def run(self):
-            self.lit_dash.run()
-
-        def configure_layout(self):
-            tab1 = {"name": "home", "content": self.lit_dash}
-            return tab1
-
-    app = L.LightningApp(LitApp())
-
-----
-
-2. Route the UI in the root component
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-The final step is to tell the Root component in which tab to render this component's UI.
-In this case, we render the ``LitDash`` UI in the ``home`` tab of the application.
-
-..
code:: python - :emphasize-lines: 23, 29 - - import lightning as L - import dash - import plotly.express as px - - class LitDash(L.LightningWork): - def run(self): - dash_app = dash.Dash(__name__) - X = [1, 2, 3, 4, 5, 6] - Y = [2, 4, 8, 16, 32, 64] - fig = px.line(x=X, y=Y) - - dash_app.layout = dash.html.Div(children=[ - dash.html.H1(children='⚡ Hello Dash + Lightning⚡'), - dash.html.Div(children='The Dash framework running inside a ⚡ Lightning App'), - dash.dcc.Graph(id='example-graph', figure=fig) - ]) - - dash_app.run_server(host=self.host, port=self.port) - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_dash = LitDash(parallel=True) - - def run(self): - self.lit_dash.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_dash} - return tab1 - - app = L.LightningApp(LitApp()) - -We use the ``parallel=True`` argument of ``LightningWork`` to run the server in the background -while the rest of the Lightning App runs everything else. diff --git a/docs/source-app/workflows/add_web_ui/dash/index.rst b/docs/source-app/workflows/add_web_ui/dash/index.rst deleted file mode 100644 index 5abb444c8e9a9..0000000000000 --- a/docs/source-app/workflows/add_web_ui/dash/index.rst +++ /dev/null @@ -1,84 +0,0 @@ -:orphan: - -.. toctree:: - :maxdepth: 1 - :hidden: - - basic - intermediate - -###################### -Add a web UI with Dash -###################### - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1: Connect Dash - :description: Learn how to connect a Dash app. - :col_css: col-md-6 - :button_link: basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: 2: Enable two-way communication - :description: Enable two-way communication between the dash app and a Lightning App. - :col_css: col-md-6 - :button_link: intermediate.html - :height: 150 - :tag: [docs coming soon] - -.. raw:: html - -
-
- ----- - -******** -Examples -******** -Here are a few example apps that use a Dash web UI. - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example 1 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 2 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 3 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/dash/intermediate.rst b/docs/source-app/workflows/add_web_ui/dash/intermediate.rst deleted file mode 100644 index 5c5da1a19403e..0000000000000 --- a/docs/source-app/workflows/add_web_ui/dash/intermediate.rst +++ /dev/null @@ -1,42 +0,0 @@ -##################################### -Add a web UI with Dash (intermediate) -##################################### -**Audience:** Users who want to communicate between the Lightning App and Dash. - -**Prereqs:** Must have read the :doc:`dash basic ` guide. - ----- - -******************************* -Interact with the App from Dash -******************************* - -In the example below, every time you change the select year on the dashboard, this is directly communicated to the flow -and another work process the associated data frame with the provided year. - -.. literalinclude:: intermediate_plot.py - -Here is how the app looks like once running: - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/dash_plot.gif - ----- - -*********************************** -Interact with Dash from a component -*********************************** - -In the example below, when you click the toggle, the state of the work appears. - -Install the following libraries if you want to run the app. - -```bash -pip install dash_daq dash_renderjson -``` - -.. literalinclude:: intermediate_state.py - - -Here is how the app looks like once running: - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/dash_state.gif diff --git a/docs/source-app/workflows/add_web_ui/dash/intermediate_plot.py b/docs/source-app/workflows/add_web_ui/dash/intermediate_plot.py deleted file mode 100644 index 95a3edb8f99d4..0000000000000 --- a/docs/source-app/workflows/add_web_ui/dash/intermediate_plot.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import Optional - -import pandas as pd -import plotly.express as px -from dash import Dash, dcc, html, Input, Output - -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app.storage import Payload - - -class LitDash(LightningWork): - def __init__(self): - super().__init__(parallel=True) - self.df = None - self.selected_year = None - - def run(self): - df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv") - self.df = Payload(df) - - dash_app = Dash(__name__) - - dash_app.layout = html.Div( - [ - dcc.Graph(id="graph-with-slider"), - dcc.Slider( - df["year"].min(), - df["year"].max(), - step=None, - value=df["year"].min(), - marks={str(year): str(year) for year in df["year"].unique()}, - id="year-slider", - ), - ] - ) - - @dash_app.callback(Output("graph-with-slider", "figure"), Input("year-slider", "value")) - def update_figure(selected_year): - self.selected_year = selected_year - filtered_df = df[df.year == selected_year] - - fig = px.scatter( - filtered_df, - x="gdpPercap", - y="lifeExp", - size="pop", - color="continent", - hover_name="country", - log_x=True, - size_max=55, - ) - - fig.update_layout(transition_duration=500) - - return fig - - dash_app.run_server(host=self.host, port=self.port) - - -class Processor(LightningWork): - def run(self, df: Payload, selected_year: Optional[str]): - if selected_year: - df = df.value - filtered_df = df[df.year == selected_year] - print(f"[PROCESSOR|selected_year={selected_year}]") - print(filtered_df) - - -class LitApp(LightningFlow): - def __init__(self): - super().__init__() - self.lit_dash = LitDash() - self.processor = Processor(parallel=True) - 
- def run(self): - self.lit_dash.run() - - # Launch some processing based on the Dash Dashboard. - self.processor.run(self.lit_dash.df, self.lit_dash.selected_year) - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_dash} - return tab1 - - -app = LightningApp(LitApp()) diff --git a/docs/source-app/workflows/add_web_ui/dash/intermediate_state.py b/docs/source-app/workflows/add_web_ui/dash/intermediate_state.py deleted file mode 100644 index d30bdc8c02c25..0000000000000 --- a/docs/source-app/workflows/add_web_ui/dash/intermediate_state.py +++ /dev/null @@ -1,39 +0,0 @@ -import dash -import dash_daq as daq -import dash_renderjson -from dash import html, Input, Output - -from lightning.app import LightningWork, LightningFlow, LightningApp -from lightning.app.utilities.state import AppState - - -class LitDash(LightningWork): - def run(self): - dash_app = dash.Dash(__name__) - - dash_app.layout = html.Div([daq.ToggleSwitch(id="my-toggle-switch", value=False), html.Div(id="output")]) - - @dash_app.callback(Output("output", "children"), [Input("my-toggle-switch", "value")]) - def display_output(value): - if value: - state = AppState() - state._request_state() - return dash_renderjson.DashRenderjson(id="input", data=state._state, max_depth=-1, invert_theme=True) - - dash_app.run_server(host=self.host, port=self.port) - - -class LitApp(LightningFlow): - def __init__(self): - super().__init__() - self.lit_dash = LitDash(parallel=True) - - def run(self): - self.lit_dash.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_dash} - return tab1 - - -app = LightningApp(LitApp()) diff --git a/docs/source-app/workflows/add_web_ui/example_app.rst b/docs/source-app/workflows/add_web_ui/example_app.rst deleted file mode 100644 index e5d2cbbd3f8d4..0000000000000 --- a/docs/source-app/workflows/add_web_ui/example_app.rst +++ /dev/null @@ -1,7 +0,0 @@ -:orphan: - -########### -Example App -########### - -This is an example app that needs to be built for this part of the docs. diff --git a/docs/source-app/workflows/add_web_ui/glossary_front_end.rst b/docs/source-app/workflows/add_web_ui/glossary_front_end.rst deleted file mode 100644 index ce51ef12aa77b..0000000000000 --- a/docs/source-app/workflows/add_web_ui/glossary_front_end.rst +++ /dev/null @@ -1,9 +0,0 @@ -######## -Frontend -######## -Web pages visible to users are also known as **front-ends**. Lightning Apps can have multiple -types of Frontends. - ----- - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/add_web_ui/glossary_ui.rst b/docs/source-app/workflows/add_web_ui/glossary_ui.rst deleted file mode 100644 index bc9e4f529e3b6..0000000000000 --- a/docs/source-app/workflows/add_web_ui/glossary_ui.rst +++ /dev/null @@ -1,9 +0,0 @@ -################### -UI (User Interface) -################### -We use (UI) as short for a **web page** with interactions. Lightning Apps can have multiple -types of UIs. - ----- - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/add_web_ui/gradio/basic.rst b/docs/source-app/workflows/add_web_ui/gradio/basic.rst deleted file mode 100644 index 4f5ab87b8b8d4..0000000000000 --- a/docs/source-app/workflows/add_web_ui/gradio/basic.rst +++ /dev/null @@ -1,217 +0,0 @@ -################################ -Add a web UI with Gradio (basic) -################################ -**Audience:** Users who want to add a web UI written with Python. - -**Prereqs:** Basic python knowledge. - ----- - -*************** -What is Gradio? 
-*************** -Gradio is a Python library that automatically generates a web interface to demo a machine learning model. - ----- - -***************** -Install gradio -***************** -First, install gradio. - -.. code:: bash - - pip install gradio - ----- - -************************** -Create the gradio demo app -************************** -To explain how to use Gradio with Lightning, let's replicate the |gradio_link|. - -.. |gradio_link| raw:: html - - example running here - -In the next few sections we'll build an app step-by-step. -First **create a file named app.py** with the app content: - -.. code:: python - - import lightning as L - from lightning.app.components import ServeGradio - import gradio as gr - - class LitGradio(ServeGradio): - - inputs = gr.inputs.Textbox(default='lightning', label='name input') - outputs = gr.outputs.Textbox(label='output') - examples = [["hello lightning"]] - - def predict(self, input_text): - return self.model(input_text) - - def build_model(self): - fake_model = lambda x: f"hello {x}" - return fake_model - - class RootFlow(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_gradio = LitGradio() - - def run(self): - self.lit_gradio.run() - - def configure_layout(self): - return [{"name": "home", "content": self.lit_gradio}] - - app = L.LightningApp(RootFlow()) - -add "gradio" to a requirements.txt file: - -.. code:: bash - - echo 'gradio' >> requirements.txt - -this is a best practice to make apps reproducible. - ----- - -*********** -Run the app -*********** -Run the app locally to see it! - -.. code:: python - - lightning run app app.py - -Now run it on the cloud as well: - -.. code:: python - - lightning run app app.py --cloud - ----- - -************************ -Step-by-step walkthrough -************************ -In this section, we explain each part of this code in detail. - ----- - -Create a Gradio component -^^^^^^^^^^^^^^^^^^^^^^^^^ -To create a Gradio component, simply take any Gradio app and subclass it from ``ServeGradio``. -If you haven't created a Gradio demo, you have to implement the following elements: - -1. Input which is text. -2. Output which is text. -3. A build_model function. -4. A predict function. - -| - -Here's an example: - -.. code:: python - :emphasize-lines: 4 - - from lightning.app.components import ServeGradio - import gradio as gr - - class LitGradio(ServeGradio): - - inputs = gr.inputs.Textbox(default='lightning', label='name input') - outputs = gr.outputs.Textbox(label='output') - - def predict(self, input_text): - return self.model(input_text) - - def build_model(self): - fake_model = lambda x: f"hello {x}" - return fake_model - -This fake model simply concatenates 2 strings. - ----- - -Route the UI in the root component -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Now, tell the Root component in which tab to render this component's UI. -In this case, we render the ``LitGradio`` UI in the ``home`` tab of the application. - -.. 
code:: python - :emphasize-lines: 21, 27 - - import lightning as L - from lightning.app.components import ServeGradio - import gradio as gr - - class LitGradio(ServeGradio): - - inputs = gr.inputs.Textbox(default='lightning', label='name input') - outputs = gr.outputs.Textbox(label='output') - examples = [["hello lightning"]] - - def predict(self, input_text): - return self.model(input_text) - - def build_model(self): - fake_model = lambda x: f"hello {x}" - return fake_model - - class RootFlow(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_gradio = LitGradio() - - def run(self): - self.lit_gradio.run() - - def configure_layout(self): - return [{"name": "home", "content": self.lit_gradio}] - - app = L.LightningApp(RootFlow()) - ----- - -Call run -^^^^^^^^ -Finally, don't forget to call run inside the Root Flow to serve the Gradio app. - -.. code:: python - :emphasize-lines: 24 - - import lightning as L - from lightning.app.components import ServeGradio - import gradio as gr - - class LitGradio(ServeGradio): - - inputs = gr.inputs.Textbox(default='lightning', label='name input') - outputs = gr.outputs.Textbox(label='output') - examples = [["hello lightning"]] - - def predict(self, input_text): - return self.model(input_text) - - def build_model(self): - fake_model = lambda x: f"hello {x}" - return fake_model - - class RootFlow(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_gradio = LitGradio() - - def run(self): - self.lit_gradio.run() - - def configure_layout(self): - return [{"name": "home", "content": self.lit_gradio}] - - app = L.LightningApp(RootFlow()) diff --git a/docs/source-app/workflows/add_web_ui/gradio/index.rst b/docs/source-app/workflows/add_web_ui/gradio/index.rst deleted file mode 100644 index 740ae93aae0c4..0000000000000 --- a/docs/source-app/workflows/add_web_ui/gradio/index.rst +++ /dev/null @@ -1,84 +0,0 @@ -:orphan: - -.. toctree:: - :maxdepth: 1 - :hidden: - - basic - intermediate - -######################## -Add a web UI with Gradio -######################## - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1: Connect Gradio - :description: Learn how to connect Gradio to a Lightning Component. - :col_css: col-md-6 - :button_link: basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: 2: Enable two-way communication - :description: Enable two-way communication between Gradio and a Lightning App. - :col_css: col-md-6 - :button_link: intermediate.html - :height: 150 - :tag: [documentation coming soon] - -.. raw:: html - -
-
- ----- - -******** -Examples -******** -Here are a few example apps that use a Gradio web UI. - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example 1 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 2 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 3 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. raw:: html - -
-
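To make the relationship between ``build_model`` and ``predict`` in the Gradio example above
concrete, here is the same logic stripped of the web UI (a sketch; ``ServeGradio`` itself wires
these two methods together and renders the inputs and outputs):

.. code:: python

    class FakeServer:
        def __init__(self):
            # ServeGradio calls build_model once and stores the result as `self.model`.
            self.model = self.build_model()

        def build_model(self):
            return lambda x: f"hello {x}"

        def predict(self, input_text):
            return self.model(input_text)


    server = FakeServer()
    assert server.predict("lightning") == "hello lightning"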
diff --git a/docs/source-app/workflows/add_web_ui/gradio/intermediate.rst b/docs/source-app/workflows/add_web_ui/gradio/intermediate.rst
deleted file mode 100644
index bb20d566243b1..0000000000000
--- a/docs/source-app/workflows/add_web_ui/gradio/intermediate.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-#######################################
-Add a web UI with Gradio (intermediate)
-#######################################
-
-.. note:: documentation coming soon.
-
-
-*************************************
-Interact with a component from the UI
-*************************************
-
-.. warning:: is there such a thing for this with gradio?
-
-
-----
-
-*************************************
-Interact with the UI from a component
-*************************************
-
-.. warning:: is there such a thing for this with gradio?
diff --git a/docs/source-app/workflows/add_web_ui/html/basic.rst b/docs/source-app/workflows/add_web_ui/html/basic.rst
deleted file mode 100644
index cb9bb5293f7c9..0000000000000
--- a/docs/source-app/workflows/add_web_ui/html/basic.rst
+++ /dev/null
@@ -1,166 +0,0 @@
-##############################
-Add a web UI with HTML (basic)
-##############################
-**Audience:** Users who want to add a web UI written in HTML.
-
-**Prereqs:** Basic HTML knowledge.
-
-----
-
-*************
-What is HTML?
-*************
-HyperText Markup Language (HTML) is the language used to create web pages. Use HTML for simple
-web user interfaces that tend to be more static.
-
-For reactive web applications, we recommend React.js, Angular.js or Vue.js.
-
-----
-
-*******************
-Create an HTML page
-*******************
-The first step is to create an HTML file named **index.html**:
-
-.. code:: html
-
-
-
-
-

<h1>Hello World</h1>

- - - ----- - -************************ -Create the HTML demo app -************************ - -.. - To explain how to use html with Lightning, let's replicate the |html_app_link|. - - .. |html_app_link| raw:: html - - example running here - -In the next few sections we'll build an app step-by-step. -First **create a file named app.py** with the app content (in the same folder as index.html): - -.. code:: bash - - # app.py - import lightning as L - import lightning.app.frontend as frontend - - - class HelloComponent(L.LightningFlow): - def configure_layout(self): - return frontend.StaticWebFrontend(serve_dir='.') - - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.hello_component = HelloComponent() - - def run(self): - self.hello_component.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.hello_component} - return tab1 - - - app = L.LightningApp(LitApp()) - ----- - -*********** -Run the app -*********** -Run the app locally to see it! - -.. code:: python - - lightning run app app.py - -Now run it on the cloud as well: - -.. code:: python - - lightning run app app.py --cloud - ----- - -************************ -Step-by-step walkthrough -************************ -In this section, we explain each part of this code in detail. - ----- - -Enable an HTML UI for the component -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Give the component an HTML UI, by returning a ``StaticWebFrontend`` object from the ``configure_layout`` method: - -.. code:: bash - :emphasize-lines: 6,7 - - # app.py - import lightning as L - import lightning.app.frontend as frontend - - class HelloComponent(L.LightningFlow): - def configure_layout(self): - return frontend.StaticWebFrontend(serve_dir='.') - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.hello_component = HelloComponent() - - def run(self): - self.hello_component.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.hello_component} - return tab1 - - app = L.LightningApp(LitApp()) - -The folder path given in ``StaticWebFrontend(serve_dir=)`` must point to a folder with an ``index.html`` page. - ----- - -Route the UI in the root component -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The final step, is to tell the Root component in which tab to render this component's UI. -In this case, we render the ``HelloComponent`` UI in the ``home`` tab of the application. - -.. code:: python - :emphasize-lines: 18, 19 - - # app.py - import lightning as L - import lightning.app.frontend as frontend - - class HelloComponent(L.LightningFlow): - def configure_layout(self): - return frontend.StaticWebFrontend(serve_dir='.') - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.hello_component = HelloComponent() - - def run(self): - self.hello_component.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.hello_component} - return tab1 - - app = L.LightningApp(LitApp()) diff --git a/docs/source-app/workflows/add_web_ui/html/index.rst b/docs/source-app/workflows/add_web_ui/html/index.rst deleted file mode 100644 index 0eae9309877a7..0000000000000 --- a/docs/source-app/workflows/add_web_ui/html/index.rst +++ /dev/null @@ -1,87 +0,0 @@ -:orphan: - -.. toctree:: - :maxdepth: 1 - :hidden: - - basic - intermediate - -###################### -Add a web UI with HTML -###################### -**Audience:** Users who want to add a web UI using plain html. - -**Prereqs:** Basic html knowledge. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1: Connect HTML - :description: Learn how to connect an HTML app. - :col_css: col-md-6 - :button_link: basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: 2: Enable two-way communication - :description: Enable two-way communication between HTML and a Lightning App. - :col_css: col-md-6 - :button_link: intermediate.html - :height: 150 - :tag: [docs coming soon] - -.. raw:: html - -
-
- ----- - -******** -Examples -******** -Here are a few example apps that use an HTML web UI. - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example 1 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 2 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 3 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/html/intermediate.rst b/docs/source-app/workflows/add_web_ui/html/intermediate.rst deleted file mode 100644 index 17b6b585c0173..0000000000000 --- a/docs/source-app/workflows/add_web_ui/html/intermediate.rst +++ /dev/null @@ -1,20 +0,0 @@ -##################################### -Add a web UI with HTML (intermediate) -##################################### -**Audience:** Users who want to add a web UI using plain html. - -**Prereqs:** Must have read the :doc:`html basic ` guide. - ----- - -******************************* -Interact with the App from HTML -******************************* -.. note:: documentation in progress - ----- - -*********************************** -Interact with HTML from a component -*********************************** -.. note:: documentation in progress diff --git a/docs/source-app/workflows/add_web_ui/index.rst b/docs/source-app/workflows/add_web_ui/index.rst deleted file mode 100644 index 79c0f16d66409..0000000000000 --- a/docs/source-app/workflows/add_web_ui/index.rst +++ /dev/null @@ -1,10 +0,0 @@ - -############################# -Add a web user interface (UI) -############################# - -**Audience:** Users who want to add a UI to their Lightning Apps - ----- - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/add_web_ui/index_content.rst b/docs/source-app/workflows/add_web_ui/index_content.rst deleted file mode 100644 index f3d516c5af546..0000000000000 --- a/docs/source-app/workflows/add_web_ui/index_content.rst +++ /dev/null @@ -1,121 +0,0 @@ -************************************* -Web UIs for non Javascript Developers -************************************* - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Dash - :description: Learn how to add a web UI built in Python with Dash. - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/dash/index.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Gradio - :description: Learn how to add a web UI built in Python with Gradio. - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/gradio/index.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Panel - :description: Learn how to add a web UI built in Python with Panel. - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/panel/index.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Jupyter Notebook - :description: Learn how to enable a web UI that is a Jupyter Notebook. - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/jupyter_basic.html - :height: 150 - :tag: [docs coming soon] - -.. displayitem:: - :header: Streamlit - :description: Learn how to add a web UI built in Python with Streamlit. - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/streamlit/index.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: JustPy - :description: Learn how to add a web UI built in Python with JustPy. - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/justpy/index.html - :height: 150 - :tag: basic - -.. raw:: html - -
-
- ----- - -********************************* -Web UIs for Javascript Developers -********************************* - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Any javascript framework - :description: Learn how to link up any javascript framework to a Lightning app. - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/integrate_any_javascript_framework.html - :height: 150 - :tag: advanced - -.. displayitem:: - :header: Angular.js - :description: Learn how to add a web UI built in Javascript with Angular.js - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/angular_js_intermediate.html - :height: 150 - :tag: [Docs coming soon] - -.. displayitem:: - :header: HTML - :description: Learn how to add a web UI built with html. - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/html/index.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: React.js - :description: Learn how to add a web UI built in Javascript with React.js - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/react/index.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Vue.js - :description: Learn how to add a web UI built in Javascript with Vue.js - :col_css: col-md-4 - :button_link: ../../workflows/add_web_ui/vue_js_intermediate.html - :height: 150 - :tag: [Docs coming soon] - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/integrate_any_javascript_framework.rst b/docs/source-app/workflows/add_web_ui/integrate_any_javascript_framework.rst deleted file mode 100644 index 66625289a6e49..0000000000000 --- a/docs/source-app/workflows/add_web_ui/integrate_any_javascript_framework.rst +++ /dev/null @@ -1,164 +0,0 @@ -:orphan: - -################################## -Integrate any javascript framework -################################## -**Audience:** Advanced web developers with complex apps that may not have been covered by the other tutorials - -**Pre-requisites:** Intermediate knowledge of html and javascript - ----- - -************************ -Import LightningState.js -************************ -To connect any javascript framework, import the `LightningState.js `_ library. -LightningState.js enables two-way communication between a javascript framework and a Lightning app. - -To import this library, add this to your html: - -.. code:: html - - - -Once it's imported, use it inside your app, this example uses it inside a React App: - -.. code-block:: - :emphasize-lines: 1, 5 - - import { useLightningState } from "./hooks/useLightningState"; - import cloneDeep from "lodash/cloneDeep"; - - function App() { - const { lightningState, updateLightningState } = useLightningState(); - - const modify_and_send_back_the_state = async (event: ChangeEvent) => { - if (lightningState) { - const newLightningState = cloneDeep(lightningState); - // Update the state and send it back. - newLightningState.flows.counter += 1 - - updateLightningState(newLightningState); - } - }; - - return ( -
-
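{/* Render your UI elements here and wire their events to
    modify_and_send_back_the_state, so interactions reach the
    Lightning app through updateLightningState. */}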
- ); - } - - export default App; - ----- - -************************ -Update the Lightning app -************************ -Use `updateLightningState` to update the lightning app. Here we update a variable called counter. - -.. code-block:: - :emphasize-lines: 11 - - import { useLightningState } from "./hooks/useLightningState"; - import cloneDeep from "lodash/cloneDeep"; - - function App() { - const { lightningState, updateLightningState } = useLightningState(); - - const modify_and_send_back_the_state = async (event: ChangeEvent) => { - if (lightningState) { - const newLightningState = cloneDeep(lightningState); - // Update the state and send it back. - newLightningState.flows.counter += 1 - - updateLightningState(newLightningState); - } - }; - - return ( -
-
- ); - } - - export default App; - ----- - -************************************** -Receive updates from the Lightning app -************************************** -Whenever a variable in the Lightning app changes, the javascript app will receive those values via `lightningState`. - -Extract any variable from the state and update the javascript app: - -.. code-block:: - :emphasize-lines: 5 - - import { useLightningState } from "./hooks/useLightningState"; - import cloneDeep from "lodash/cloneDeep"; - - function App() { - const { lightningState, updateLightningState } = useLightningState(); - - const modify_and_send_back_the_state = async (event: ChangeEvent) => { - if (lightningState) { - const newLightningState = cloneDeep(lightningState); - // Update the state and send it back. - newLightningState.flows.counter += 1 - - updateLightningState(newLightningState); - } - }; - - return ( -
-
- ); - } - - export default App; - ----- - -******** -Examples -******** - -See this in action in these examples: - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: React.js - :description: Explore how React.js uses lightningState.js - :col_css: col-md-4 - :button_link: react/communicate_between_react_and_lightning.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Example 2 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 3 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :height: 150 - :tag: Waiting for contributed example - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/jupyter_basic.rst b/docs/source-app/workflows/add_web_ui/jupyter_basic.rst deleted file mode 100644 index 61f58ab40670d..0000000000000 --- a/docs/source-app/workflows/add_web_ui/jupyter_basic.rst +++ /dev/null @@ -1,70 +0,0 @@ -:orphan: - -##################################### -Add a Jupyter Notebook web UI (basic) -##################################### -**Audience:** Users who want to enable a Jupyter notebook UI. - -**Prereqs:** Basic python knowledge. - -TODO - ----- - -*************************** -What is a Jupyter Notebook? -*************************** - -TODO - ----- - -******************* -Install Jupyter Lab -******************* - -First, install Jupyter Lab. - -.. code:: bash - - pip install jupyterlab - ----- - -******** -Examples -******** -Here are a few example apps that use Jupyter Lab. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example 1 - :description: TODO - :col_css: col-md-4 - :button_link: angular_js_intermediate.html - :height: 150 - -.. displayitem:: - :header: Example 2 - :description: TODO - :col_css: col-md-4 - :button_link: angular_js_intermediate.html - :height: 150 - -.. displayitem:: - :header: Example 3 - :description: TODO - :col_css: col-md-4 - :button_link: angular_js_intermediate.html - :height: 150 - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/justpy/index.rst b/docs/source-app/workflows/add_web_ui/justpy/index.rst deleted file mode 100644 index 1626930c13d82..0000000000000 --- a/docs/source-app/workflows/add_web_ui/justpy/index.rst +++ /dev/null @@ -1,92 +0,0 @@ -:orphan: - -######################## -Add a web UI with JustPy -######################## - - -****** -JustPy -****** - -The `JustPy `_ framework is an object oriented high-level Python Web Framework that requires no JavaScript programming, while at the same time providing the full flexibility of a frontend framework. - -Additionally, it provides a higher level API called `Quasar `_ with stylized components. - - -You can install ``justpy`` from PyPi. - -.. code-block:: - - pip install justpy - -******* -Example -******* - - -In the following example, we are creating a simple UI with 2 buttons. -When clicking the first button, the flow state ``counter`` is incremented and re-rendered on the UI. - - -First of all, you would need to import the ``JustPyFrontend`` and return it from the ``configure_layout`` hook of the flow. - -.. code-block:: - - from typing import Callable - - from lightning import LightningApp, LightningFlow - from lightning.app.frontend import JustPyFrontend - - - class Flow(LightningFlow): - def __init__(self): - super().__init__() - self.counter = 0 - - def run(self): - print(self.counter) - - def configure_layout(self): - return JustPyFrontend(render_fn=render_fn) - -Secondly, you would need to implement a ``render_fn`` that takes as input a ``get_state`` function and return a function. - - -.. code-block:: - - def render_fn(get_state: Callable) -> Callable: - import justpy as jp - - def webpage(): - wp = jp.QuasarPage(dark=True) - # the `a=wp` argument adds the div to the web page - d = jp.Div(classes="q-pa-md q-gutter-sm", a=wp) - container = jp.QBtn(color="primary", text="Counter: 0") - - async def click(*_): - state = get_state() - state.counter += 1 - container.text = f"Counter: {state.counter}" - - button = jp.QBtn(color="primary", text="Click Me!", click=click) - - d.add(button) - d.add(container) - - return wp - - return webpage - - -Finally, you can wrap your flow in a LightningAp. - -.. code-block:: - - app = LightningApp(Flow()) - -Now, you can run the Lightning App with: - -.. code-block:: - - lightning_app run app app.py diff --git a/docs/source-app/workflows/add_web_ui/panel/basic.rst b/docs/source-app/workflows/add_web_ui/panel/basic.rst deleted file mode 100644 index b033312f6f0d1..0000000000000 --- a/docs/source-app/workflows/add_web_ui/panel/basic.rst +++ /dev/null @@ -1,369 +0,0 @@ -:orphan: - -############################### -Add a web UI with Panel (basic) -############################### - -**Audience:** Users who want to add a web UI written with Python and Panel. - -**Prereqs:** Basic Python knowledge. - ----- - -************** -What is Panel? -************** - -`Panel`_ and the `HoloViz`_ ecosystem provide unique and powerful -features such as big data visualization using `DataShader`_, easy cross filtering -using `HoloViews`_, streaming and much more. - -* Panel is highly flexible and ties into the PyData and Jupyter ecosystems as you can develop in notebooks and use ipywidgets. You can also develop in .py files. - -* Panel is one of the most popular data app frameworks in Python with `more than 400.000 downloads a month `_. It's especially popular in the scientific community. 
- -* Panel is used, for example, by Rapids to power `CuxFilter`_, a CuDF based big data visualization framework. - -* Panel can be deployed on your favorite server or cloud including `Lightning`_. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/panel-intro.gif - :alt: Example Panel App - - Example Panel App - -Panel is **particularly well suited for Lightning Apps** that need to display live progress. This is because the Panel server can react -to state changes and asynchronously push messages from the server to the client using web socket communication. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/panel-streaming-intro.gif - :alt: Example Panel Streaming App - - Example Panel Streaming App - -Install Panel with: - -.. code:: bash - - pip install panel - ----- - -********************* -Run a basic Panel App -********************* - -In the next few sections, we'll build an App step-by-step. - -First, create a file named ``app_panel.py`` with the App content: - -.. code:: python - - # app_panel.py - - import panel as pn - - pn.panel("Hello **Panel ⚡** World").servable() - -Then, create a file named ``app.py`` with the following App content: - -.. code:: python - - # app.py - - import lightning as L - from lightning.app.frontend import PanelFrontend - - - class LitPanel(L.LightningFlow): - - def configure_layout(self): - return PanelFrontend("app_panel.py") - - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_panel = LitPanel() - - def run(self): - self.lit_panel.run() - - def configure_layout(self): - return {"name": "home", "content": self.lit_panel} - - - app = L.LightningApp(LitApp()) - -Finally, add ``panel`` to your ``requirements.txt`` file: - -.. code:: bash - - echo 'panel' >> requirements.txt - -.. note:: This is a best practice to make Apps reproducible. - ----- - -*********** -Run the App -*********** - -Run the App locally: - -.. code:: bash - - lightning_app run app app.py - -The App should look like this: - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/panel-lightning-basic.png - :alt: Basic Panel Lightning App - - Basic Panel Lightning App - -Now, run it on the cloud: - -.. code:: bash - - lightning_app run app app.py --cloud - ----- - -************************* -Step-by-step walk-through -************************* - -In this section, we explain each part of the code in detail. - ----- - -0. Define a Panel app -^^^^^^^^^^^^^^^^^^^^^ - -First, find the Panel app you want to integrate. In this example, that app looks like: - -.. code:: python - - import panel as pn - - pn.panel("Hello **Panel ⚡** World").servable() - -Refer to the `Panel documentation `_ and `awesome-panel `_ for more complex examples. - ----- - -1. Add Panel to a Component -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Link this app to the Lightning App by using the ``PanelFrontend`` class which needs to be returned from -the ``configure_layout`` method of the Lightning Component you want to connect to Panel. - -.. 
code:: python
-    :emphasize-lines: 7-10
-
-    import lightning as L
-    from lightning.app.frontend import PanelFrontend
-
-
-    class LitPanel(L.LightningFlow):
-
-        def configure_layout(self):
-            return PanelFrontend("app_panel.py")
-
-
-    class LitApp(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.lit_panel = LitPanel()
-
-        def run(self):
-            self.lit_panel.run()
-
-        def configure_layout(self):
-            return {"name": "home", "content": self.lit_panel}
-
-
-    app = L.LightningApp(LitApp())
-
-The argument of the ``PanelFrontend`` class points to the script, notebook, or function that
-runs your Panel app.
-
-----
-
-2. Route the UI in the root component
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The second step is to tell the Root component in which tab to render this component's UI.
-In this case, we render the ``LitPanel`` UI in the ``home`` tab of the app.
-
-.. code:: python
-    :emphasize-lines: 19-20
-
-    import lightning as L
-    from lightning.app.frontend import PanelFrontend
-
-
-    class LitPanel(L.LightningFlow):
-
-        def configure_layout(self):
-            return PanelFrontend("app_panel.py")
-
-
-    class LitApp(L.LightningFlow):
-        def __init__(self):
-            super().__init__()
-            self.lit_panel = LitPanel()
-
-        def run(self):
-            self.lit_panel.run()
-
-        def configure_layout(self):
-            return {"name": "home", "content": self.lit_panel}
-
-    app = L.LightningApp(LitApp())
-
-----
-
-*************
-Tips & Tricks
-*************
-
-0. Use autoreload while developing
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-To speed up your development workflow, you can run your Lightning App with Panel **autoreload** by
-setting the environment variable ``PANEL_AUTORELOAD`` to ``yes``.
-
-Try running the following:
-
-.. code-block::
-
-    PANEL_AUTORELOAD=yes lightning run app app.py
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/panel-lightning-autoreload.gif
-    :alt: Basic Panel Lightning App with autoreload
-
-    Basic Panel Lightning App with autoreload
-
-1. Theme your App
-^^^^^^^^^^^^^^^^^
-
-To theme your App, you can use the Lightning accent color ``#792EE5`` with the `FastListTemplate`_.
-
-Try replacing the contents of ``app_panel.py`` with the following:
-
-.. code:: python
-
-    # app_panel.py
-
-    import panel as pn
-    import plotly.express as px
-
-    ACCENT = "#792EE5"
-
-    pn.extension("plotly", sizing_mode="stretch_width", template="fast")
-    pn.state.template.param.update(
-        title="⚡ Hello Panel + Lightning ⚡", accent_base_color=ACCENT, header_background=ACCENT
-    )
-
-    # Keep the app from overflowing the Lightning tab it is embedded in.
-    pn.config.raw_css.append(
-        """
-        .bk-root:first-of-type {
-            height: calc( 100vh - 200px ) !important;
-        }
-        """
-    )
-
-
-    def get_panel_theme():
-        """Returns 'default' or 'dark'"""
-        return pn.state.session_args.get("theme", [b"default"])[0].decode()
-
-
-    def get_plotly_template():
-        # Match the Plotly figure theme to the active Panel (light/dark) theme.
-        if get_panel_theme() == "dark":
-            return "plotly_dark"
-        return "plotly_white"
-
-
-    def get_plot(length=5):
-        xseries = [index for index in range(length + 1)]
-        yseries = [x**2 for x in xseries]
-        fig = px.line(
-            x=xseries,
-            y=yseries,
-            template=get_plotly_template(),
-            color_discrete_sequence=[ACCENT],
-            range_x=(0, 10),
-            markers=True,
-        )
-        fig.layout.autosize = True
-        return fig
-
-
-    # Re-render the plot whenever the slider value changes.
-    length = pn.widgets.IntSlider(value=5, start=1, end=10, name="Length")
-    dynamic_plot = pn.panel(
-        pn.bind(get_plot, length=length), sizing_mode="stretch_both", config={"responsive": True}
-    )
-    pn.Column(length, dynamic_plot).servable()
-
-
-Install some additional libraries and remember to add the dependencies to the ``requirements.txt`` file:
-
-.. code:: bash
-
-    echo 'plotly' >> requirements.txt
-    echo 'pandas' >> requirements.txt
-
-Finally, run the App:
-
-.. code:: bash
-
-    lightning_app run app app.py
-
-.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/panel-lightning-theme.gif
-    :alt: Basic Panel Plotly Lightning App with theming
-
-    Basic Panel Plotly Lightning App with theming
-
-.. _Panel: https://panel.holoviz.org/
-.. _FastListTemplate: https://panel.holoviz.org/reference/templates/FastListTemplate.html#templates-gallery-fastlisttemplate
-.. _HoloViz: https://holoviz.org/
-.. _DataShader: https://datashader.org/
-.. _HoloViews: https://holoviews.org/
-.. _Lightning: https://lightning.ai/
-.. _CuxFilter: https://github.com/rapidsai/cuxfilter
-.. _AwesomePanel: https://github.com/awesome-panel/awesome-panel
-
-
-----
-
-**********
-Next Steps
-**********
-
-.. raw:: html
-
-
- -.. displayitem:: - :header: 2: Enable two-way communication - :description: Enable two-way communication between Panel and a Lightning App. - :col_css: col-md-6 - :button_link: intermediate.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Add a web user interface (UI) - :description: Users who want to add a UI to their Lightning Apps - :col_css: col-md-6 - :button_link: ../index.html - :height: 150 - :tag: intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/panel/index.rst b/docs/source-app/workflows/add_web_ui/panel/index.rst deleted file mode 100644 index 0d48a1dc9f7ea..0000000000000 --- a/docs/source-app/workflows/add_web_ui/panel/index.rst +++ /dev/null @@ -1,85 +0,0 @@ -:orphan: - -.. toctree:: - :maxdepth: 1 - :hidden: - - basic - intermediate - -####################### -Add a web UI with Panel -####################### - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1: Connect Panel - :description: Learn how to connect Panel to a Lightning Component. - :col_css: col-md-6 - :button_link: basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: 2: Enable two-way communication - :description: Enable two-way communication between Panel and a Lightning App. - :col_css: col-md-6 - :button_link: intermediate.html - :height: 150 - :tag: intermediate - -.. raw:: html - -
-
- ----- - -******** -Examples -******** - -Here are a few example apps that use a Panel web UI. - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example 1 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 2 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 3 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/panel/intermediate.rst b/docs/source-app/workflows/add_web_ui/panel/intermediate.rst deleted file mode 100644 index 20cf141248739..0000000000000 --- a/docs/source-app/workflows/add_web_ui/panel/intermediate.rst +++ /dev/null @@ -1,210 +0,0 @@ -:orphan: - -###################################### -Add a web UI with Panel (intermediate) -###################################### - -**Audience:** Users who want to communicate between the Lightning App and Panel. - -**Prereqs:** Must have read the :doc:`Panel basic ` guide. - ----- - -************************************** -Interact with the Component from Panel -************************************** - -The ``PanelFrontend`` enables user interactions with the Lightning App using widgets. -You can modify the state variables of a Lightning Component using the ``AppStateWatcher``. - -For example, here we increase the ``count`` variable of the Lightning Component every time a user -presses a button: - -.. code:: python - - # app_panel.py - - import panel as pn - from lightning.app.frontend import AppStateWatcher - - pn.extension(sizing_mode="stretch_width") - - app = AppStateWatcher() - - submit_button = pn.widgets.Button(name="submit") - - @pn.depends(submit_button, watch=True) - def submit(_): - app.state.count += 1 - - @pn.depends(app.param.state) - def current_count(_): - return f"current count: {app.state.count}" - - pn.Column( - submit_button, - current_count, - ).servable() - - - -.. code:: python - - # app.py - - import lightning as L - from lightning.app.frontend import PanelFrontend - - class LitPanel(L.LightningFlow): - def __init__(self): - super().__init__() - self.count = 0 - self.last_count = 0 - - def run(self): - if self.count != self.last_count: - self.last_count = self.count - print("Count changed to: ", self.count) - - def configure_layout(self): - return PanelFrontend("app_panel.py") - - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_panel = LitPanel() - - def run(self): - self.lit_panel.run() - - def configure_layout(self): - return {"name": "home", "content": self.lit_panel} - - - app = L.LightningApp(LitApp()) - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/panel-lightning-counter-from-frontend.gif - :alt: Panel Lightning App updating a counter from the frontend - - Panel Lightning App updating a counter from the frontend - ----- - -************************************ -Interact with Panel from a Component -************************************ - -To update the `PanelFrontend` from any Lightning Component, update the property in the Component. -Make sure to call the ``run`` method from the parent component. - -In this example, we update the ``count`` value of the Component: - -.. code:: python - - # app_panel.py - - import panel as pn - from lightning.app.frontend import AppStateWatcher - - app = AppStateWatcher() - - pn.extension(sizing_mode="stretch_width") - - def counter(state): - return f"Counter: {state.count}" - - last_update = pn.bind(counter, app.param.state) - - pn.panel(last_update).servable() - -.. 
code:: python - - # app.py - - from datetime import datetime as dt - from lightning.app.frontend import PanelFrontend - - import lightning as L - - - class LitPanel(L.LightningFlow): - def __init__(self): - super().__init__() - self.count = 0 - self._last_update = dt.now() - - def run(self): - now = dt.now() - if (now - self._last_update).microseconds >= 250: - self.count += 1 - self._last_update = now - print("Counter changed to: ", self.count) - - def configure_layout(self): - return PanelFrontend("app_panel.py") - - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_panel = LitPanel() - - def run(self): - self.lit_panel.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_panel} - return tab1 - - app = L.LightningApp(LitApp()) - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/panel-lightning-counter-from-component.gif - :alt: Panel Lightning App updating a counter from the component - - Panel Lightning App updating a counter from the Component - ----- - -************* -Tips & Tricks -************* - -* Caching: Panel provides the easy to use ``pn.state.cache`` memory based, ``dict`` caching. If you are looking for something persistent try `DiskCache `_ its really powerful and simple to use. You can use it to communicate large amounts of data between the components and frontend(s). - -* Notifications: Panel provides easy to use `notifications `_. You can for example use them to provide notifications about runs starting or ending. - -* Tabulator Table: Panel provides the `Tabulator table `_ which features expandable rows. The table is useful to provide for example an overview of you runs. But you can dig into the details by clicking and expanding the row. - -* Task Scheduling: Panel provides easy to use `task scheduling `_. You can use this to for example read and display files created by your components on a scheduled basis. - -* Terminal: Panel provides the `Xterm.js terminal `_ which can be used to display live logs from your components and allow you to provide a terminal interface to your component. - -.. figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/panel-lightning-github-runner.gif - :alt: Panel Lightning App running models on github - - Panel Lightning App running models on GitHub - ----- - -********** -Next Steps -********** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Add a web user interface (UI) - :description: Users who want to add a UI to their Lightning Apps - :col_css: col-md-6 - :button_link: ../index.html - :height: 150 - :tag: intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst b/docs/source-app/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst deleted file mode 100644 index ef836e7e4b0d1..0000000000000 --- a/docs/source-app/workflows/add_web_ui/react/communicate_between_react_and_lightning.rst +++ /dev/null @@ -1,58 +0,0 @@ -####################################### -Communicate Between React and Lightning -####################################### -**Audience:** Anyone who wants to add a web user interface (UI) written in react to their app. - -**pre-requisites:** Make sure you've already connected the React and Lightning app. - -**Difficulty level:** intermediate. - ----- - -************ -Example code -************ -To illustrate how to communicate between a React app and a lightning App, we'll be using the `example_app.py` file -which :doc:`lightning init react-ui ` created: - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/example_app.py - -and the App.tsx file also created by :doc:`lightning init react-ui `: - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/ui/src/App.tsx - ----- - -****************************** -Update React --> Lightning app -****************************** -To change the Lightning app from the React app, use `updateLightningState`. - -In this example, when you press **Start printing** in the React UI, it toggles -the `react_ui.vars.should_print`: - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/ui/src/App.tsx - :emphasize-lines: 20, 21, 23 - -By changing that variable in the Lightning app state, it sets **react_ui.should_print** to True, which enables the -Lightning app to print: - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/example_app.py - :emphasize-lines: 10, 22 - ----- - -****************************** -Update React <-- Lightning app -****************************** -To change the React app from the Lightning app, use the values from the `lightningState`. - -In this example, when the ``react_ui.counter`` increaes in the Lightning app: - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/example_app.py - :emphasize-lines: 18, 24 - -The React UI updates the text on the screen to reflect the count - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/ui/src/App.tsx - :emphasize-lines: 15 diff --git a/docs/source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst b/docs/source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst deleted file mode 100644 index bacdd3d299b05..0000000000000 --- a/docs/source-app/workflows/add_web_ui/react/connect_react_and_lightning.rst +++ /dev/null @@ -1,107 +0,0 @@ -################################ -Connect React to a Lightning app -################################ -**Audience:** Users who already have a react app and want to connect it to a Lightning app. - -**pre-requisites:** Make sure you already have a react app you want to connect. - -**Difficulty level:** intermediate. - ----- - -************ -Example code -************ -To illustrate how to connect a React app and a lightning App, we'll be using the `example_app.py` file -which :doc:`lightning_app init react-ui ` created: - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/example_app.py - -and the App.tsx file also created by :doc:`lightning_app init react-ui `: - -.. 
literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/ui/src/App.tsx - ----- - -************************************* -Connect the component to the react UI -************************************* -The first step is to connect the dist folder of the react app using `StaticWebFrontend`: - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/example_app.py - :emphasize-lines: 13 - -the dist folder must contain an index.html file which is generated by the compilating command `yarn build` which -we'll explore later. - ----- - -********************************** -Connect component to the root flow -********************************** -Next, connect your component to the root flow. Display the react app on the tab of your choice -using `configure_layout`: - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/example_app.py - :emphasize-lines: 19, 27 - ----- - -********************************* -Connect React and Lightning state -********************************* -At this point, the React app will render in the Lightning app. Test it out! - -.. code:: bash - - lightning_app run app example_app.py - -However, to make powerful React+Lightning apps, you must also connect the Lightning App state to the react app. -These lines enable two-way communication between the react app and the Lightning app. - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/ui/src/App.tsx - :emphasize-lines: 10, 13 - ----- - -**************** -Component vs App -**************** -Notice that in this guide, we connected a single react app to a single component. - -.. literalinclude:: ../../../../../src/lightning/app/cli/react-ui-template/example_app.py - :emphasize-lines: 6-13 - -You can use this single react app for the FULL Lightning app, or you can specify a React app for EACH component. - -.. code:: python - :emphasize-lines: 5, 9, 18-20 - - import lightning as L - - - class ComponentA(L.LightningFlow): - def configure_layout(self): - return L.app.frontend.StaticWebFrontend(Path(__file__).parent / "react_app_1/dist") - - - class ComponentB(L.LightningFlow): - def configure_layout(self): - return L.app.frontend.StaticWebFrontend(Path(__file__).parent / "react_app_2/dist") - - - class HelloLitReact(L.LightningFlow): - def __init__(self): - super().__init__() - self.react_app_1 = ComponentA() - self.react_app_2 = ComponentB() - - def configure_layout(self): - tab_1 = {"name": "App 1", "content": self.react_app_1} - tab_2 = {"name": "App 2", "content": self.react_app_2} - return tab_1, tab_2 - - - app = L.LightningApp(HelloLitReact()) - -This is a powerful idea that allows each Lightning component to have a self-contained web UI. diff --git a/docs/source-app/workflows/add_web_ui/react/create_react_template.rst b/docs/source-app/workflows/add_web_ui/react/create_react_template.rst deleted file mode 100644 index a5626bc32dcc1..0000000000000 --- a/docs/source-app/workflows/add_web_ui/react/create_react_template.rst +++ /dev/null @@ -1,51 +0,0 @@ -###################################### -Create a React Template (intermediate) -###################################### -**Audience:** Anyone who wants to add a web user interface (UI) written in react to their app. - ----- - -************** -What is react? -************** -`React.js `_ is a JavaScript library for building user interfaces. -A huge number of websites are written in React.js (like Facebook). 
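The guides here pull ``example_app.py`` in with ``literalinclude``, so its body never appears inline. For orientation only, a flow of roughly that shape could look like the sketch below; the class names, the ``ui/dist`` path, and the ``should_print``/``counter`` wiring are reconstructed from the highlighted lines in the guides, not copied verbatim from the template.

.. code:: python

    from pathlib import Path

    import lightning as L
    from lightning.app.frontend import StaticWebFrontend


    class ReactUI(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.should_print = False

        def configure_layout(self):
            # Serve the compiled bundle produced by `yarn build`.
            return StaticWebFrontend(Path(__file__).parent / "ui/dist")


    class HelloLitReact(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.counter = 0
            self.react_ui = ReactUI()

        def run(self):
            # `should_print` is flipped from the React side through updateLightningState.
            if self.react_ui.should_print:
                print(f"Hello World! Counter: {self.counter}")
                self.counter += 1

        def configure_layout(self):
            return {"name": "React UI", "content": self.react_ui}


    app = L.LightningApp(HelloLitReact())

The generated template is the authoritative version; treat this sketch as a reading aid for the two-way state guides that follow.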
- ----- - -************************ -Bring your own React app -************************ -If you already have a React.js app, then you don't need the section below. However, it might be helpful -to see our React template so you can understand how to connect it to a Lightning app. - ----- - -**************************** -Create the react-ui template -**************************** -Lightning can generate a react-ui template out of the box (generated with `Vite `_). - -Run this command to set up a react-ui template for a component: - -.. code:: bash - - lightning init react-ui - -If everything was successful, run the example_app.py listed in the output of the command: - -.. code:: bash - - INFO: Checking pre-requisites for react - INFO: - found npm version: 8.5.5 - found node version: 16.15.0 - found yarn version: 1.22.10 - - ... - ... - - ⚡ run the example_app.py to see it live! - lightning run app react-ui/example_app.py - -If the command didn't work, make sure to install `npm+nodejs `_, and `yarn `_. diff --git a/docs/source-app/workflows/add_web_ui/react/index.rst b/docs/source-app/workflows/add_web_ui/react/index.rst deleted file mode 100644 index ba0f8d97d67d1..0000000000000 --- a/docs/source-app/workflows/add_web_ui/react/index.rst +++ /dev/null @@ -1,106 +0,0 @@ -:orphan: - -.. toctree:: - :maxdepth: 1 - :hidden: - - create_react_template - connect_react_and_lightning - communicate_between_react_and_lightning - react_development_workflow - -########################## -Add a web UI with React.js -########################## -**Audience:** Anyone who wants to add a web user interface (UI) written in react to their app. - -**Prereqs:** Basic html knowledge. - ----- - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1: Create a React project template - :description: Use our React template to start a react app or bring your own. - :col_css: col-md-6 - :button_link: create_react_template.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: 2: Connect a React app and a Lightning app - :description: Learn how to connect a React app to a Lightning app. - :col_css: col-md-6 - :button_link: connect_react_and_lightning.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: 3: Communicate between React and Lightning - :description: Learn how to communicate between a React app and a Lightning app. - :col_css: col-md-6 - :button_link: communicate_between_react_and_lightning.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: 4: Develop like a React pro - :description: Learn the development workflow of a React developer. - :col_css: col-md-6 - :button_link: react_development_workflow.html - :height: 150 - :tag: intermediate - -.. raw:: html - -
-
- ----- - -******** -Examples -******** -Here are a few example apps that use a React web UI. - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example 1 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 2 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 3 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/react/react_development_workflow.rst b/docs/source-app/workflows/add_web_ui/react/react_development_workflow.rst deleted file mode 100644 index 02d855f383d69..0000000000000 --- a/docs/source-app/workflows/add_web_ui/react/react_development_workflow.rst +++ /dev/null @@ -1,27 +0,0 @@ -######################################### -Add a web UI with React.js (intermediate) -######################################### -**Audience:** Anyone who wants to add a web user interface (UI) written in react to their app. - -**pre-requisites:** You already have a React app connected with a Lightning app. - ----- - -********************** -Develop your react app -********************** -Every time you make a change to your React.js app, you must call `yarn build` to apply the changes (this is a React.js thing): - -.. code:: bash - - # if you're lost, the right folder has a package.json in it - cd folder-with-ui-folder/ui - yarn build - -This can get very repetitive, there is a "hot reload" command that you can enable with: - -.. code:: bash - - # TODO - -There are many other tricks that React.js developers use to improve their development speed. diff --git a/docs/source-app/workflows/add_web_ui/streamlit/basic.rst b/docs/source-app/workflows/add_web_ui/streamlit/basic.rst deleted file mode 100644 index ced0314af54be..0000000000000 --- a/docs/source-app/workflows/add_web_ui/streamlit/basic.rst +++ /dev/null @@ -1,186 +0,0 @@ -################################### -Add a web UI with Streamlit (basic) -################################### -**Audience:** Users who want to add a web UI written with Python. - -**Prereqs:** Basic python knowledge. - ----- - -****************** -What is Streamlit? -****************** -Streamlit is a web user interface builder for Python developers. Streamlit builds beautiful web pages -directly from Python. - -Install Streamlit with: - -.. code:: bash - - pip install streamlit - ----- - -************************* -Run a basic streamlit app -************************* - -.. - To explain how to use Streamlit with Lightning, let's replicate the |st_link|. - - .. |st_link| raw:: html - - example running here - -In the next few sections we'll build an app step-by-step. -First **create a file named app.py** with the app content: - -.. code:: python - - # app.py - import lightning as L - import lightning.app.frontend as frontend - import streamlit as st - - def your_streamlit_app(lightning_app_state): - st.write('hello world') - - class LitStreamlit(L.LightningFlow): - def configure_layout(self): - return frontend.StreamlitFrontend(render_fn=your_streamlit_app) - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_streamlit = LitStreamlit() - - def run(self): - self.lit_streamlit.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_streamlit} - return tab1 - - app = L.LightningApp(LitApp()) - -add "streamlit" to a requirements.txt file: - -.. code:: bash - - echo 'streamlit' >> requirements.txt - -this is a best practice to make apps reproducible. - ----- - -*********** -Run the app -*********** -Run the app locally to see it! - -.. code:: python - - lightning run app app.py - -Now run it on the cloud as well: - -.. code:: python - - lightning run app app.py --cloud - ----- - -************************ -Step-by-step walkthrough -************************ -In this section, we explain each part of this code in detail. - ----- - -0. 
Define a streamlit app -^^^^^^^^^^^^^^^^^^^^^^^^^ -First, find the streamlit app you want to integrate. In this example, that app looks like: - -.. code:: python - - import streamlit as st - - def your_streamlit_app(): - st.write('hello world') - -Refer to the `Streamlit documentation `_ for more complex examples. - ----- - -1. Add Streamlit to a component -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Link this function to the Lightning App by using the ``StreamlitFrontend`` class which needs to be returned from -the ``configure_layout`` method of the Lightning component you want to connect to Streamlit. - -.. code:: python - :emphasize-lines: 9-11 - - # app.py - import lightning as L - import lightning.app.frontend as frontend - import streamlit as st - - def your_streamlit_app(lightning_app_state): - st.write('hello world') - - class LitStreamlit(L.LightningFlow): - def configure_layout(self): - return frontend.StreamlitFrontend(render_fn=your_streamlit_app) - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_streamlit = LitStreamlit() - - def run(self): - self.lit_streamlit.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_streamlit} - return tab1 - - app = L.LightningApp(LitApp()) - -The ``render_fn`` argument of the ``StreamlitFrontend`` class, points to a function that runs your Streamlit app. -The first argument to the function is the lightning app state. Any changes to the app state update the app. - ----- - -2. Route the UI in the root component -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The second step, is to tell the Root component in which tab to render this component's UI. -In this case, we render the ``LitStreamlit`` UI in the ``home`` tab of the application. - -.. code:: python - :emphasize-lines: 22 - - # app.py - import lightning as L - import lightning.app.frontend as frontend - import streamlit as st - - def your_streamlit_app(lightning_app_state): - st.write('hello world') - - class LitStreamlit(L.LightningFlow): - def configure_layout(self): - return frontend.StreamlitFrontend(render_fn=your_streamlit_app) - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_streamlit = LitStreamlit() - - def run(self): - self.lit_streamlit.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_streamlit} - return tab1 - - app = L.LightningApp(LitApp()) diff --git a/docs/source-app/workflows/add_web_ui/streamlit/index.rst b/docs/source-app/workflows/add_web_ui/streamlit/index.rst deleted file mode 100644 index 2496729d45660..0000000000000 --- a/docs/source-app/workflows/add_web_ui/streamlit/index.rst +++ /dev/null @@ -1,84 +0,0 @@ -:orphan: - -.. toctree:: - :maxdepth: 1 - :hidden: - - basic - intermediate - -########################### -Add a web UI with Streamlit -########################### - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: 1: Connect Streamlit - :description: Learn how to connect Streamlit to a Lightning Component. - :col_css: col-md-6 - :button_link: basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: 2: Enable two-way communication - :description: Enable two-way communication between Streamlit and a Lightning App. - :col_css: col-md-6 - :button_link: intermediate.html - :height: 150 - :tag: intermediate - -.. raw:: html - -
-
- ----- - -******** -Examples -******** -Here are a few example apps that use a Streamlit web UI. - - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Example 1 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 2 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. displayitem:: - :header: Example 3 - :description: Show off your work! Contribute an example. - :col_css: col-md-4 - :button_link: ../../../contribute_app.html - :height: 150 - :tag: Waiting for contributed example - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/add_web_ui/streamlit/intermediate.rst b/docs/source-app/workflows/add_web_ui/streamlit/intermediate.rst deleted file mode 100644 index a89b6e6a5ce55..0000000000000 --- a/docs/source-app/workflows/add_web_ui/streamlit/intermediate.rst +++ /dev/null @@ -1,105 +0,0 @@ -########################################## -Add a web UI with Streamlit (intermediate) -########################################## -**Audience:** Users who want to communicate between the Lightning App and Streamlit. - -**Prereqs:** Must have read the :doc:`streamlit basic ` guide. - ----- - -************************************ -Interact with the App from Streamlit -************************************ -The streamlit UI enables user interactions with the Lightning App via UI elements like buttons. -To modify the variables of a Lightning component, access the ``lightning_app_state`` variable in . - -For example, here we increase the count variable of the Lightning Component every time a user presses a button: - -.. code:: python - :emphasize-lines: 8, 14 - - # app.py - import lightning as L - import lightning.app.frontend as frontend - import streamlit as st - - - def your_streamlit_app(lightning_app_state): - if st.button("press to increase count"): - lightning_app_state.count += 1 - st.write(f"current count: {lightning_app_state.count}") - - - class LitStreamlit(L.LightningFlow): - def __init__(self): - super().__init__() - self.count = 0 - - def configure_layout(self): - return frontend.StreamlitFrontend(render_fn=your_streamlit_app) - - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_streamlit = LitStreamlit() - - def run(self): - self.lit_streamlit.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_streamlit} - return tab1 - - - app = L.LightningApp(LitApp()) - ----- - -**************************************** -Interact with Streamlit from a component -**************************************** -To update the streamlit UI from any Lightning component, update the property in the component and make sure to call ``run`` from the -parent component. - -In this example we update the value of the counter from the component: - -.. code:: python - :emphasize-lines: 7, 15 - - # app.py - import lightning as L - import lightning.app.frontend as frontend - import streamlit as st - - - def your_streamlit_app(lightning_app_state): - st.write(f"current count: {lightning_app_state.count}") - - - class LitStreamlit(L.LightningFlow): - def __init__(self): - super().__init__() - self.count = 0 - - def run(self): - self.count += 1 - - def configure_layout(self): - return frontend.StreamlitFrontend(render_fn=your_streamlit_app) - - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_streamlit = LitStreamlit() - - def run(self): - self.lit_streamlit.run() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_streamlit} - return tab1 - - - app = L.LightningApp(LitApp()) diff --git a/docs/source-app/workflows/add_web_ui/vue_js_intermediate.rst b/docs/source-app/workflows/add_web_ui/vue_js_intermediate.rst deleted file mode 100644 index e8d9f3e843155..0000000000000 --- a/docs/source-app/workflows/add_web_ui/vue_js_intermediate.rst +++ /dev/null @@ -1,6 +0,0 @@ -:orphan: - -####################################### -Add a web UI with Vue.js (intermediate) -####################################### -coming... 
diff --git a/docs/source-app/workflows/arrange_tabs/arrange_app_basic.rst b/docs/source-app/workflows/arrange_tabs/arrange_app_basic.rst deleted file mode 100644 index 91c0e53854760..0000000000000 --- a/docs/source-app/workflows/arrange_tabs/arrange_app_basic.rst +++ /dev/null @@ -1,69 +0,0 @@ -######################## -Arrange app tabs (basic) -######################## -**Audience:** Users who want to control the layout of their app user interface. - ----- - -***************************** -Enable a full-page single tab -***************************** - -To enable a single tab on the app UI, return a single dictionary from the ``configure_layout`` method: - -.. code:: python - :emphasize-lines: 9 - - import lightning as L - - - class DemoComponent(L.demo.dumb_component): - def configure_layout(self): - tab1 = {"name": "THE TAB NAME", "content": self.component_a} - return tab1 - - - app = L.LightningApp(DemoComponent()) - - -The "name" key defines the visible name of the tab on the UI. It also shows up in the URL. -The **"content"** key defines the target component to render in that tab. -When returning a single tab element like shown above, the UI will display it in full-page mode. - - ----- - -******************** -Enable multiple tabs -******************** - -.. code:: python - :emphasize-lines: 7 - - import lightning as L - - - class DemoComponent(L.demo.dumb_component): - def configure_layout(self): - tab1 = {"name": "Tab A", "content": self.component_a} - tab2 = {"name": "Tab B", "content": self.component_b} - return tab1, tab2 - - - app = L.LightningApp(DemoComponent()) - -The order matters! Try any of the following configurations: - -.. code:: python - :emphasize-lines: 4, 9 - - def configure_layout(self): - tab1 = {"name": "Tab A", "content": self.component_a} - tab2 = {"name": "Tab B", "content": self.component_b} - return tab1, tab2 - - - def configure_layout(self): - tab1 = {"name": "Tab A", "content": self.component_a} - tab2 = {"name": "Tab B", "content": self.component_b} - return tab2, tab1 diff --git a/docs/source-app/workflows/arrange_tabs/arrange_app_intermediate.rst b/docs/source-app/workflows/arrange_tabs/arrange_app_intermediate.rst deleted file mode 100644 index 1bb638b1d2c0c..0000000000000 --- a/docs/source-app/workflows/arrange_tabs/arrange_app_intermediate.rst +++ /dev/null @@ -1,21 +0,0 @@ -############################### -Arrange app tabs (intermediate) -############################### - -.. TODO:: fill-in - ----- - -*********************************** -Render components with a defined UI -*********************************** - -component directly - ----- - -************* -Render a link -************* - -tensorboard link diff --git a/docs/source-app/workflows/arrange_tabs/index.rst b/docs/source-app/workflows/arrange_tabs/index.rst deleted file mode 100644 index f639c0238e1bb..0000000000000 --- a/docs/source-app/workflows/arrange_tabs/index.rst +++ /dev/null @@ -1,5 +0,0 @@ -################ -Arrange App Tabs -################ - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/arrange_tabs/index_content.rst b/docs/source-app/workflows/arrange_tabs/index_content.rst deleted file mode 100644 index 66ac9328e67a7..0000000000000 --- a/docs/source-app/workflows/arrange_tabs/index_content.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. toctree:: - :maxdepth: 1 - :hidden: - - arrange_app_basic - arrange_app_intermediate - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Basic - :description: Learn how to enable and layout your app UI - :col_css: col-md-6 - :button_link: arrange_app_basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Intermediate - :description: Learn about all the possible ways of rendering a component. - :col_css: col-md-6 - :button_link: arrange_app_intermediate.html - :height: 150 - :tag: intermediate - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/build_command_line_interface/app.py b/docs/source-app/workflows/build_command_line_interface/app.py deleted file mode 100644 index 373aa2bfd137e..0000000000000 --- a/docs/source-app/workflows/build_command_line_interface/app.py +++ /dev/null @@ -1,35 +0,0 @@ -from commands.notebook.run import RunNotebook, RunNotebookConfig -from lit_jupyter import JupyterLab - -from lightning.app import LightningFlow, LightningApp, CloudCompute -from lightning.app.structures import Dict - - -class Flow(LightningFlow): - def __init__(self): - super().__init__() - self.notebooks = Dict() - - # 1. Annotates the handler input with the Notebook config. - def run_notebook(self, config: RunNotebookConfig): - if config.name in self.notebooks: - return f"The Notebook {config.name} already exists." - else: - # 2. Dynamically creates the Notebook if it doesn't exist and runs it. - self.notebooks[config.name] = JupyterLab( - cloud_compute=CloudCompute(config.cloud_compute) - ) - self.notebooks[config.name].run() - return f"The Notebook {config.name} was created." - - def configure_commands(self): - # 3. Returns a list of dictionaries with the format: - # {"command_name": CustomClientCommand(method=self.custom_server_handler)} - return [{"run notebook": RunNotebook(method=self.run_notebook)}] - - def configure_layout(self): - # 4. Dynamically displays the Notebooks in the Lightning App View. - return [{"name": n, "content": w} for n, w in self.notebooks.items()] - - -app = LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_command_line_interface/cli.rst b/docs/source-app/workflows/build_command_line_interface/cli.rst deleted file mode 100644 index 176e416b31630..0000000000000 --- a/docs/source-app/workflows/build_command_line_interface/cli.rst +++ /dev/null @@ -1,144 +0,0 @@ -:orphan: - -########################################### -1. Develop a CLI with server side code only -########################################### - -We are going to learn how to create a simple command-line interface. - -Lightning provides a flexible way to create complex CLI without much effort. - ----- - -************************* -1. Implement a simple CLI -************************* - -To create your first CLI, you need to override the :class:`~lightning.app.core.flow.LightningFlow.configure_commands` hook and return a list of dictionaries where the keys are the commands and the values are the server side handlers. - -First, create a file ``app.py`` and copy-paste the following code in to the file: - -.. literalinclude:: example_command.py - ----- - -************** -2. Run the App -************** - -Execute the following command in a terminal: - -.. code-block:: - - lightning_app run app app.py - -The following appears the terminal: - -.. code-block:: - - Your Lightning App is starting. This won't take long. - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - [] - ----- - -*************************** -3. Connect to a running App -*************************** - -In another terminal, connect to the running App. -When you connect to an App, the Lightning CLI is replaced by the App CLI. To exit the App CLI, you need to run ``lightning_app disconnect``. - -.. code-block:: - - lightning_app connect localhost - -To see a list of available commands: - -.. code-block:: - - lightning_app --help - You are connected to the cloud Lightning App: localhost. - Usage: lightning_app [OPTIONS] COMMAND [ARGS]... - - --help Show this message and exit. 
-
-    Lightning App Commands
-      add  Add a name.
-
-To find the arguments of the commands:
-
-.. code-block::
-
-    lightning_app add --help
-    You are connected to the cloud Lightning App: localhost.
-    Usage: lightning_app add [ARGS]...
-
-    Options
-      name: Add description
-
-----
-
-********************
-4. Execute a command
-********************
-
-Trigger the command line exposed by your App:
-
-.. code-block::
-
-    lightning_app add --name=my_name
-    WARNING: Lightning Command Line Interface is an experimental feature and unannounced changes are likely.
-
-In your first terminal, **Received name: my_name** and **["my_name"]** are printed.
-
-.. code-block::
-
-    Your Lightning App is starting. This won't take long.
-    INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-    []
-    Received name: my_name
-    ["my_name"]
-
-----
-
-**************************
-5. Disconnect from the App
-**************************
-
-To exit the App CLI, run ``lightning_app disconnect``:
-
-.. code-block::
-
-    lightning_app disconnect
-    You are disconnected from the local Lightning App.
-
-----
-
-**********
-Learn more
-**********
-
-.. raw:: html
-
-    <div class="display-card-container">
-        <div class="row">
-
-.. displayitem::
-   :header: 2. Implement a CLI with client side code execution
-   :description: Learn how to develop a complex CLI for your application
-   :col_css: col-md-6
-   :button_link: cli_client.html
-   :height: 150
-
-.. displayitem::
-   :header: Develop a RESTful API
-   :description: Learn how to develop an API for your application.
-   :col_css: col-md-6
-   :button_link: ../build_rest_api/index.html
-   :height: 150
-
-.. raw:: html
-
-        </div>
-    </div>
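The ``configure_commands`` hook above maps command names to server-side handlers. Here is a minimal runnable sketch of that pattern, using only the ``LightningFlow``/``LightningApp`` API already shown in these docs; the ``delete`` command and its ``delete_name`` handler are hypothetical additions for illustration:

.. code-block:: python

    from lightning.app import LightningApp, LightningFlow


    class Flow(LightningFlow):
        def __init__(self):
            super().__init__()
            self.names = []

        def run(self):
            print(self.names)

        def add_name(self, name: str):
            """Add a name."""
            self.names.append(name)

        def delete_name(self, name: str):
            """Delete a name."""  # hypothetical companion handler
            if name in self.names:
                self.names.remove(name)

        def configure_commands(self):
            # Each dictionary maps one command name to its server-side handler.
            return [{"add": self.add_name}, {"delete": self.delete_name}]


    app = LightningApp(Flow())

With the App running and the CLI connected, ``lightning_app add --name=my_name`` and ``lightning_app delete --name=my_name`` would each invoke the matching handler on the server.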
diff --git a/docs/source-app/workflows/build_command_line_interface/cli_client.rst b/docs/source-app/workflows/build_command_line_interface/cli_client.rst deleted file mode 100644 index 589db930bf5fa..0000000000000 --- a/docs/source-app/workflows/build_command_line_interface/cli_client.rst +++ /dev/null @@ -1,175 +0,0 @@ -:orphan: - -###################################################### -2. Develop a CLI with server and client code execution -###################################################### - -We've learned how to create a simple command-line interface. But in real-world use-cases, an App Builder wants to provide more complex functionalities where trusted code is executed on the client side. - -Lightning provides a flexible way to create complex CLI without much effort. - -In this example, we’ll create a CLI to dynamically run Notebooks: - - ----- - -************************** -1. Implement a complex CLI -************************** - -First of all, lets' create the following file structure: - -.. code-block:: python - - app_folder/ - commands/ - notebook/ - run.py - app.py - -We'll use the `Jupyter-Component `_. Follow the installation steps on the repo to install the Component. - -Add the following code to ``commands/notebook/run.py``: - -.. literalinclude:: commands/notebook/run.py - -Add the following code to ``app.py``: - -.. literalinclude:: app.py - ----- - -********************************************** -2. Run the App and check the API documentation -********************************************** - -In a terminal, run the following command and open ``http://127.0.0.1:7501/docs`` in a browser. - -.. code-block:: python - - lightning_app run app app.py - Your Lightning App is starting. This won't take long. - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - ----- - -*************************** -3. Connect to a running App -*************************** - -In another terminal, connect to the running App. -When you connect to an App, the Lightning CLI is replaced by the App CLI. To exit the App CLI, you need to run ``lightning_app disconnect``. - -.. code-block:: - - lightning_app connect localhost - - Storing `run_notebook` under /Users/thomas/.lightning/lightning_connection/commands/run_notebook.py - You can review all the downloaded commands under /Users/thomas/.lightning/lightning_connection/commands folder. - You are connected to the local Lightning App. - -To see a list of available commands: - -.. code-block:: - - lightning_app --help - - You are connected to the cloud Lightning App: localhost. - Usage: lightning_app [OPTIONS] COMMAND [ARGS]... - - --help Show this message and exit. - - Lightning App Commands - run notebook Run a Notebook. - - -To find the arguments of the commands: - -.. code-block:: - - lightning_app run notebook --help - - You are connected to the cloud Lightning App: localhost. - usage: notebook [-h] [--name NAME] [--cloud_compute CLOUD_COMPUTE] - - Run Notebook Parser - - optional arguments: - -h, --help show this help message and exit - --name NAME - --cloud_compute CLOUD_COMPUTE - ----- - -******************** -4. Execute a command -******************** - -And then you can trigger the command-line exposed by your App. - -Run the first Notebook with the following command: - -.. code-block:: python - - lightning_app run notebook --name="my_notebook" - WARNING: Lightning Command Line Interface is an experimental feature and unannounced changes are likely. - The notebook my_notebook was created. 
-
-And run a second Notebook:
-
-.. code-block:: bash
-
-    lightning_app run notebook --name="my_notebook_2"
-    WARNING: Lightning Command Line Interface is an experimental feature and unannounced changes are likely.
-    The notebook my_notebook_2 was created.
-
-Here is a recording of the Lightning App:
-
-.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/commands_1.mp4
-    :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/commands_1.png
-    :width: 600
-    :class: background-video
-    :autoplay:
-    :loop:
-    :muted:
-
-----
-
-**************************
-5. Disconnect from the App
-**************************
-
-To exit the App CLI, run ``lightning_app disconnect``:
-
-.. code-block::
-
-    lightning_app disconnect
-    You are disconnected from the local Lightning App.
-
-----
-
-**********
-Learn more
-**********
-
-.. raw:: html
-
-    <div class="display-card-container">
-        <div class="row">
-
-.. displayitem::
-   :header: 1. Develop a CLI with server side code only
-   :description: Learn how to develop a simple CLI for your App.
-   :col_css: col-md-6
-   :button_link: cli.html
-   :height: 150
-
-.. displayitem::
-   :header: Develop a RESTful API
-   :description: Learn how to develop an API for your App.
-   :col_css: col-md-6
-   :button_link: ../build_rest_api/index.html
-   :height: 150
-
-.. raw:: html
-
-        </div>
-    </div>
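The ``RunNotebook`` command above pairs a client-side argument parser with a server-side handler. A minimal sketch of a second command built the same way, assuming only the ``ClientCommand.invoke_handler`` API shown in ``commands/notebook/run.py`` below; the ``stop notebook`` command and its ``StopNotebookConfig`` model are hypothetical:

.. code-block:: python

    from argparse import ArgumentParser

    from pydantic import BaseModel

    from lightning.app.utilities.commands import ClientCommand


    class StopNotebookConfig(BaseModel):
        name: str


    class StopNotebook(ClientCommand):
        description = "Stop a Notebook."

        def run(self):
            # Parsing happens on the client machine.
            parser = ArgumentParser(description="Stop Notebook Parser")
            parser.add_argument("--name", type=str, required=True)
            hparams = parser.parse_args()

            # The validated payload is sent to the server-side handler that
            # was registered in ``configure_commands``.
            response = self.invoke_handler(config=StopNotebookConfig(name=hparams.name))
            print(response)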
diff --git a/docs/source-app/workflows/build_command_line_interface/commands/__init__.py b/docs/source-app/workflows/build_command_line_interface/commands/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/docs/source-app/workflows/build_command_line_interface/commands/notebook/__init__.py b/docs/source-app/workflows/build_command_line_interface/commands/notebook/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/docs/source-app/workflows/build_command_line_interface/commands/notebook/run.py b/docs/source-app/workflows/build_command_line_interface/commands/notebook/run.py deleted file mode 100644 index e0a6463c2d6a3..0000000000000 --- a/docs/source-app/workflows/build_command_line_interface/commands/notebook/run.py +++ /dev/null @@ -1,33 +0,0 @@ -from argparse import ArgumentParser -from uuid import uuid4 - -from pydantic import BaseModel - -from lightning.app.utilities.commands import ClientCommand - - -class RunNotebookConfig(BaseModel): - name: str - cloud_compute: str - - -class RunNotebook(ClientCommand): - description = "Run a Notebook." - - def run(self): - # 1. Define your own argument parser. You can use argparse, click, etc... - parser = ArgumentParser(description='Run Notebook Parser') - parser.add_argument("--name", type=str, default=None) - parser.add_argument("--cloud_compute", type=str, default="cpu") - hparams = parser.parse_args() - - # 2. Invoke the server side handler by sending a payload. - response = self.invoke_handler( - config=RunNotebookConfig( - name=hparams.name or str(uuid4()), - cloud_compute=hparams.cloud_compute, - ), - ) - - # 3. Print the server response. - print(response) diff --git a/docs/source-app/workflows/build_command_line_interface/example_command.py b/docs/source-app/workflows/build_command_line_interface/example_command.py deleted file mode 100644 index 4d837fc007171..0000000000000 --- a/docs/source-app/workflows/build_command_line_interface/example_command.py +++ /dev/null @@ -1,25 +0,0 @@ -from lightning import LightningApp, LightningFlow - - -class Flow(LightningFlow): - def __init__(self): - super().__init__() - self.names = [] - - def run(self): - print(self.names) - - def add_name(self, name: str): - """Add a name.""" - print(f"Received name: {name}") - self.names.append(name) - - def configure_commands(self): - # This can be invoked with `lightning add --name=my_name` - commands = [ - {"add": self.add_name}, - ] - return commands - - -app = LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_command_line_interface/index.rst b/docs/source-app/workflows/build_command_line_interface/index.rst deleted file mode 100644 index 9a3e24f784910..0000000000000 --- a/docs/source-app/workflows/build_command_line_interface/index.rst +++ /dev/null @@ -1,48 +0,0 @@ -############################ -Command-line Interface (CLI) -############################ - -**Audience:** Users looking to create a command line interface (CLI) for their application. - ----- - -************** -What is a CLI? -************** - -A Command-line Interface (CLI) is an user interface (UI) in a terminal to interact with a specific program. - -.. note:: - - The Lightning guideline to build CLI is `lightning_app ...` or ` ...`. - -As an example, Lightning provides a CLI to interact with your Lightning Apps and the `lightning.ai `_ platform as follows: - -.. code-block:: bash - - main - ├── fork - Forks an App. - ├── init - Initializes a Lightning App and/or Component. 
-    │   ├── app
-    │   ├── component
-    │   ├── pl-app       - Creates an App from your PyTorch Lightning source files.
-    │   └── react-ui     - Creates a React UI to give a Lightning Component a React.js web UI
-    ├── install          - Installs a Lightning App and/or Component.
-    │   ├── app
-    │   └── component
-    ├── list             - Lists Lightning AI self-managed resources (apps)
-    │   └── apps         - Lists your Lightning AI Apps.
-    ├── login            - Logs in to your lightning.ai account.
-    ├── logout           - Logs out of your lightning.ai account.
-    ├── run              - Runs a Lightning App locally or on the cloud.
-    │   └── app          - Runs an App from a file.
-    ├── show             - Shows given resource.
-    │   └── logs         - Shows cloud application logs. By default prints logs for all currently available Components.
-    ├── stop             - Stops your App.
-    └── tree             - Shows the command tree of your CLI.
-
-Learn more about `Command-line interfaces here `_.
-
-----
-
-.. include:: index_content.rst
diff --git a/docs/source-app/workflows/build_command_line_interface/index_content.rst b/docs/source-app/workflows/build_command_line_interface/index_content.rst
deleted file mode 100644
index ced369dbfd815..0000000000000
--- a/docs/source-app/workflows/build_command_line_interface/index_content.rst
+++ /dev/null
@@ -1,51 +0,0 @@
-**************************************
-Develop a command line interface (CLI)
-**************************************
-
-.. raw:: html
-
-    <div class="display-card-container">
-        <div class="row">
- -.. displayitem:: - :header: 1. Develop a CLI with server side code only - :description: Learn how to develop a simple CLI for your application - :col_css: col-md-6 - :button_link: cli.html - :height: 150 - -.. displayitem:: - :header: 2. Develop a CLI with server and client code execution - :description: Learn how to develop a complex CLI for your application - :col_css: col-md-6 - :button_link: cli_client.html - :height: 150 - -.. raw:: html - -
-
- - ----- - -********** -Learn more -********** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Develop a RESTful API - :description: Learn how to develop an API for your application. - :col_css: col-md-6 - :button_link: ../build_rest_api/index.html - :height: 150 - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/build_command_line_interface/post_example.py b/docs/source-app/workflows/build_command_line_interface/post_example.py deleted file mode 100644 index c57a2d9426539..0000000000000 --- a/docs/source-app/workflows/build_command_line_interface/post_example.py +++ /dev/null @@ -1,26 +0,0 @@ -from lightning.app import LightningFlow, LightningApp -from lightning.app.api import Post - - -class Flow(LightningFlow): - # 1. Define the state - def __init__(self): - super().__init__() - self.names = [] - - # 2. Optional, but used to validate names - def run(self): - print(self.names) - - # 3. Method executed when a request is received. - def handle_post(self, name: str): - self.names.append(name) - return f'The name {name} was registered' - - # 4. Defines this Component's Restful API. You can have several routes. - def configure_api(self): - # Your own defined route and handler - return [Post(route="/name", method=self.handle_post)] - - -app = LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst b/docs/source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst deleted file mode 100644 index f489edaedc13f..0000000000000 --- a/docs/source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst +++ /dev/null @@ -1,109 +0,0 @@ -####################################################### -Develop a Lightning App from a PyTorch Lightning script -####################################################### - -**Audience:** Users who want to develop a Lightning App (App) from their PyTorch Lightning (PL) scripts. - ----- - -************************************************************* -What developing a Lightning App from a PL script does for you -************************************************************* - -Developing an App from a PL script allows you to immediately run on the cloud and share the progress with friends. -Once you're happy with your model, you can immediately expand beyond just model development to things like -making your own inference APIs, research demos, or even speeding up your data pipeline. - -The PyTorch Lightning App is your entry point to the full end-to-end ML licefycle. - ----- - -****************** -Develop a template -****************** - -To develop a template from a PyTorch Lightning script, use this command: - -.. code:: bash - - lightning_app init pl-app path/to/the/pl_script.py - - -If your script is not at the root of the project folder, and you'd like to include all source files within that folder, you can specify the root path as the first argument: - -.. code:: bash - - lightning_app init pl-app path/to/project/root path/to/the/pl_script.py - - -The default trainer App lets you train a model with a beautiful UI locally and on the cloud with zero effort! - ----- - -*********** -Run the App -*********** - -.. note:: This section is under construction. - -Run the App locally: - -.. code:: bash - - lightning_app run app pl-app/app.py - -Or run the App on the cloud so you can share with collaborators and even use all the cloud GPUs you want. - -.. code:: bash - - lightning_app run app pl-app/app.py --cloud - - -.. figure:: https://storage.googleapis.com/grid-packages/pytorch-lightning-app/docs-thumbnail.png - :alt: Screenshot of the PyTorch Lightning app running in the cloud - - ----- - -******************* -Modify the template -******************* - -The command above generates an App file like this: - -.. 
TODO:: list the file and show how to extend it - -.. code:: python - - from your_app_name import ComponentA, ComponentB - - import lightning as L - - - class LitApp(L.LightningFlow): - def __init__(self) -> None: - super().__init__() - self.component_a = ComponentA() - self.component_b = ComponentB() - - def run(self): - self.component_a.run() - self.component_b.run() - - - app = L.LightningApp(LitApp()) - -Now you can add your own components as you wish! - ----- - -************ -Known issues -************ - -- The UI takes a couple seconds to load when opening the App, so please be patient. -- The timer resets when refreshing the page. -- The UI for adding new environment variables does not provide an option to delete an entry. -- A bug exists that leaves the script hanging at the start of training when using the DDP strategy. -- DDP-spawn is not supported due to pickling issues. -- It is currently not possible to submit a new run once the script has finished or failed. diff --git a/docs/source-app/workflows/build_lightning_app/from_scratch.rst b/docs/source-app/workflows/build_lightning_app/from_scratch.rst deleted file mode 100644 index 9042f105711cd..0000000000000 --- a/docs/source-app/workflows/build_lightning_app/from_scratch.rst +++ /dev/null @@ -1,11 +0,0 @@ -#################################### -Develop a Lightning App from Scratch -#################################### - -**Audience:** Users who want to develop a Lightning App from scratch. - -**Prereqs:** You must have finished the `Basic levels `_. - ----- - -.. include:: from_scratch_content.rst diff --git a/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst b/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst deleted file mode 100644 index a528fa6088736..0000000000000 --- a/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst +++ /dev/null @@ -1,60 +0,0 @@ - -************** -WAIT! -************** -Before you build a Lightning App from scratch, see if you can find an app that is similar to what you need -in the `Lightning App Gallery `_. - -Once you find the Lightning App you want, press "Clone & Run" to see it running on the cloud, then download the code -and change what you want! - ----- - -****************** -Build from scratch -****************** -If you didn't find a Lightning App similar to the one you need, simply create a file named **app.py** with these contents: - -.. code:: python - - import lightning as L - - - class WordComponent(L.LightningWork): - def __init__(self, word): - super().__init__() - self.word = word - - def run(self): - print(self.word) - - - class LitApp(L.LightningFlow): - def __init__(self) -> None: - super().__init__() - self.hello = WordComponent("hello") - self.world = WordComponent("world") - - def run(self): - print("This is a simple Lightning app, make a better app!") - self.hello.run() - self.world.run() - - - app = L.LightningApp(LitApp()) - ----- - -Run the Lightning App -^^^^^^^^^^^^^^^^^^^^^ -Run the Lightning App locally: - -.. code:: bash - - lightning_app run app app.py - -Run the Lightning App on the cloud: - -.. 
code:: bash - - lightning_app run app app.py --cloud diff --git a/docs/source-app/workflows/build_lightning_app/index.rst b/docs/source-app/workflows/build_lightning_app/index.rst deleted file mode 100644 index e60f0355afb8e..0000000000000 --- a/docs/source-app/workflows/build_lightning_app/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -:orphan: - -####################### -Develop a Lightning App -####################### - -A Lightning App (App) is a collection of components interacting together. Learn how to develop a basic App template. - ----- - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/build_lightning_app/index_content.rst b/docs/source-app/workflows/build_lightning_app/index_content.rst deleted file mode 100644 index 45264d85a4adc..0000000000000 --- a/docs/source-app/workflows/build_lightning_app/index_content.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. toctree:: - :maxdepth: 1 - :hidden: - - from_scratch - from_pytorch_lightning_script - -.. raw:: html - -
-
- -.. displayitem:: - :header: Develop a Lightning App from scratch - :description: Learn how to Develop a Lightning App from scratch - :col_css: col-md-6 - :button_link: from_scratch.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Develop an App from a PyTorch Lightning script - :description: Share your PyTorch Lightning training on the cloud, run on cloud GPUs, or extend your App - :col_css: col-md-6 - :button_link: from_pytorch_lightning_script.html - :height: 150 - :tag: basic - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/build_lightning_component/basic.rst b/docs/source-app/workflows/build_lightning_component/basic.rst deleted file mode 100644 index 07fac58cf21da..0000000000000 --- a/docs/source-app/workflows/build_lightning_component/basic.rst +++ /dev/null @@ -1,9 +0,0 @@ -############################# -Develop a Lightning Component -############################# - -**Audience:** Users who want to develop a Lightning Component. - ----- - -.. include:: from_scratch_component_content.rst diff --git a/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst b/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst deleted file mode 100644 index b29c19db40129..0000000000000 --- a/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst +++ /dev/null @@ -1,153 +0,0 @@ -******************************* -LightningFlow vs. LightningWork -******************************* - -.. _flow_vs_work: - -.. raw:: html - - Choosing between LightningFlow and LightningWork - -There are two types of components in Lightning, **LightningFlow** and **LightningWork**. - -Use a **LightningFlow** component for any programming logic that runs in less than 1 second. - -.. code:: python - - for i in range(10): - print(f"{i}: this kind of code belongs in a LightningFlow") - -Use a **LightningWork** component for any programming logic that takes more than 1 second or requires its own hardware. - -.. code:: python - - from time import sleep - - for i in range(100000): - sleep(2.0) - print(f"{i} LightningWork: work that is long running or may never end (like a server)") - ----- - -************************************************** -What developing a Lightning Component does for you -************************************************** -Lightning Components break up complex systems into modular components. The first obvious benefit is that components -can be reused across other apps. This means you can build once, test it and forget it. - -As a researcher it also means that your code can be taken to production without needing a team of engineers to help -productionize it. - -As a machine learning engineer, it means that your cloud system is: - -- fault tolerant -- cloud agnostic -- testable (unlike YAML/CI/CD code) -- version controlled -- enables cross-functional collaboration - ----- - -************** -WAIT! -************** -Before you build a Lightning component from scratch, see if you can find a component that is similar to what you need -in the `Lightning component Gallery `_. - -Once you find the component you want, download the code and change what you want! - ----- - -***************************************** -Build a Lighitning component from scratch -***************************************** -If you didn't find a Lightning component similar to the one you need, you can build one from scratch. - ----- - -Build a LightningFlow -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To implement a LightningFlow, simply subclass ``LightningFlow`` and define the run method: - -.. code:: python - :emphasize-lines: 5 - - # app.py - import lightning as L - - - class LitFlow(L.LightningFlow): - def run(self): - for i in range(10): - print(f"{i}: this kind of code belongs in a LightningFlow") - - - app = L.LightningApp(LitFlow()) - -run the app - -.. 
code:: bash - - lightning_app run app app.py - ----- - -Build a LightningWork -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Only implement a LightningWork if this particular piece of code: - -- takes more than 1 second to execute -- requires its own set of cloud resources -- or both - -To implement a LightningWork, simply subclass ``LightningWork`` and define the run method: - -.. code:: python - :emphasize-lines: 6 - - # app.py - from time import sleep - import lightning as L - - - class LitWork(L.LightningWork): - def run(self): - for i in range(100000): - sleep(2.0) - print(f"{i} LightningWork: work that is long running or may never end (like a server)") - -A LightningWork must always be attached to a LightningFlow and explicitly asked to ``run()``: - -.. code:: python - :emphasize-lines: 13, 16 - - from time import sleep - import lightning as L - - - class LitWork(L.LightningWork): - def run(self): - for i in range(100000): - sleep(2.0) - print(f"{i} LightningWork: work that is long running or may never end (like a server)") - - - class LitFlow(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_work = LitWork() - - def run(self): - self.lit_work.run() - - - app = L.LightningApp(LitFlow()) - -run the app - -.. code:: bash - - lightning_app run app app.py diff --git a/docs/source-app/workflows/build_lightning_component/index.rst b/docs/source-app/workflows/build_lightning_component/index.rst deleted file mode 100644 index 8620a9b9fd8d5..0000000000000 --- a/docs/source-app/workflows/build_lightning_component/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -:orphan: - -############################# -Develop a Lightning Component -############################# - -A Lightning App (App) is a collection of components interacting together. Learn how to build a Lightning Component (Component) in this section. - ----- - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/build_lightning_component/index_content.rst b/docs/source-app/workflows/build_lightning_component/index_content.rst deleted file mode 100644 index 9a940bc8b89b6..0000000000000 --- a/docs/source-app/workflows/build_lightning_component/index_content.rst +++ /dev/null @@ -1,122 +0,0 @@ -.. toctree:: - :maxdepth: 1 - :hidden: - - basic - ../add_components - -.. toctree:: - :maxdepth: 1 - :hidden: - - intermediate - ../run_work_in_parallel - ../run_work_once - -.. toctree:: - :maxdepth: 1 - :hidden: - - ../enable_fault_tolerance - -****** -Basics -****** -.. raw:: html - -
-
- -.. displayitem:: - :header: Develop a Lightning Component - :description: Learn the basics of developing a Lightning Component - :col_css: col-md-4 - :button_link: basic.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Explore community Lightning Components - :description: Discover community-built Lightning Components - :col_css: col-md-4 - :button_link: https://lightning.ai/components - :height: 150 - :tag: basic - -.. raw:: html - -
-
- ----- - -************ -Intermediate -************ - -.. raw:: html - -
-
- -.. displayitem:: - :header: Add a UI to a component - :description: Learn about all the possible ways of rendering a component. - :col_css: col-md-4 - :button_link: intermediate.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Run LightningWork in parallel - :description: Learn about running LightningWork in parallel. - :col_css: col-md-4 - :button_link: ../run_work_in_parallel.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Run LightningWork once - :description: Learn about running LightningWork multiple times or once. - :col_css: col-md-4 - :button_link: ../run_work_once.html - :height: 150 - :tag: intermediate - -.. displayitem:: - :header: Publish a Lightning component - :description: Learn the basics of publishing a Lightning component. - :col_css: col-md-4 - :button_link: publish_a_component.html - :height: 150 - :tag: intermediate - -.. raw:: html - -
-
- - ----- - -******** -Advanced -******** - -.. raw:: html - -
-
-
-.. displayitem::
-   :header: Enable fault tolerance
-   :description: Learn how to make a component fault tolerant.
-   :col_css: col-md-4
-   :button_link: ../enable_fault_tolerance.html
-   :height: 150
-   :tag: advanced
-
-.. raw:: html
-
-        </div>
-    </div>
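The intermediate cards above point at running a ``LightningWork`` in parallel without showing code. A minimal sketch, assuming the ``parallel=True`` constructor argument of ``LightningWork``; the component names are illustrative:

.. code-block:: python

    from time import sleep

    import lightning as L


    class LongRunningWork(L.LightningWork):
        def __init__(self):
            # Assumption: ``parallel=True`` makes ``run()`` non-blocking
            # from the flow's point of view.
            super().__init__(parallel=True)

        def run(self):
            sleep(60.0)
            print("done with the long-running job")


    class Flow(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.long_work = LongRunningWork()

        def run(self):
            # Returns immediately because the work is parallel; the flow
            # event loop keeps scheduling while the work runs elsewhere.
            self.long_work.run()


    app = L.LightningApp(Flow())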
diff --git a/docs/source-app/workflows/build_lightning_component/intermediate.rst b/docs/source-app/workflows/build_lightning_component/intermediate.rst deleted file mode 100644 index b95ba36ca4c2d..0000000000000 --- a/docs/source-app/workflows/build_lightning_component/intermediate.rst +++ /dev/null @@ -1,71 +0,0 @@ -############################################ -Develop a Lightning Component (intermediate) -############################################ - -**Audience:** Users who want to connect a UI to a Lightning Component (Component). - ----- - -***************************** -Add a web user interface (UI) -***************************** -Every lightning component can have its own user interface (UI). Lightning components support any kind -of UI interface such as dash, gradio, panel, react.js, streamlit, vue.js, web urls, -etc...(:doc:`full list here <../add_web_ui/index>`). - -Let's say that we have a user interface defined in html: - -.. code:: html - - - - - - -

-    <html>
-        <body>
-            Hello World
-        </body>
-    </html>

- - - -To *connect* this user interface to the Component, define the configure_layout method: - -.. code:: python - :emphasize-lines: 5, 6 - - import lightning as L - - - class LitHTMLComponent(L.LightningFlow): - def configure_layout(self): - return L.app.frontend.StaticWebFrontend(serve_dir="path/to/folder/with/index.html/inside") - -Finally, route the Component's UI through the root Component's **configure_layout** method: - -.. code:: python - :emphasize-lines: 14 - - # app.py - import lightning as L - - - class LitHTMLComponent(L.LightningFlow): - def configure_layout(self): - return L.app.frontend.StaticWebFrontend(serve_dir="path/to/folder/with/index.html/inside") - - - class LitApp(L.LightningFlow): - def __init__(self): - super().__init__() - self.lit_html_component = LitHTMLComponent() - - def configure_layout(self): - tab1 = {"name": "home", "content": self.lit_html_component} - return tab1 - - - app = L.LightningApp(LitApp()) - -Run your App and you'll see the UI on the Lightning App view: - -.. code:: bash - - lightning run app app.py diff --git a/docs/source-app/workflows/build_lightning_component/publish_a_component.rst b/docs/source-app/workflows/build_lightning_component/publish_a_component.rst deleted file mode 100644 index bb5ec755ae190..0000000000000 --- a/docs/source-app/workflows/build_lightning_component/publish_a_component.rst +++ /dev/null @@ -1,59 +0,0 @@ -############################# -Publish a Lightning Component -############################# - -**Audience:** Users who want to build a Ligthtning Component (Component) to publish to the Lightning Gallery - ----- - -*********************************** -Develop a Component from a template -*********************************** - -The fastest way to build a Component that is ready to be published to the component Gallery is to use -the default template. - -Generate your Component template with this command: - -.. code:: python - - lightning init component your-component-name - ----- - -***************** -Run the Component -***************** - -To test that your Component works, first install all dependencies: - -.. code:: bash - - cd your-component - pip install -r requirements.txt - pip install -e . - -Now import your Component and use it in a Lightning App: - -.. code:: python - - # app.py - from your_component import TemplateComponent - import lightning as L - - class LitApp(L.LightningFlow): - def __init__(self) -> None: - super().__init__() - self.your_component = TemplateComponent() - - def run(self): - print('this is a simple Lightning app to verify your component is working as expected') - self.your_component.run() - - app = L.LightningApp(LitApp()) - -and run the app: - -.. code:: bash - - lightning run app app.py diff --git a/docs/source-app/workflows/build_rest_api/add_api.rst b/docs/source-app/workflows/build_rest_api/add_api.rst deleted file mode 100644 index 00fd16d18715a..0000000000000 --- a/docs/source-app/workflows/build_rest_api/add_api.rst +++ /dev/null @@ -1,104 +0,0 @@ -:orphan: - -############################ -Add an API Route to your App -############################ - -In order to add a new route, you need to override the :class:`~lightning.app.core.flow.LightningFlow.configure_api` hook and return a list of :class:`~lightning.app.api.http_methods.HttpMethod` such as :class:`~lightning.app.api.http_methods.Get`, :class:`~lightning.app.api.http_methods.Post`, :class:`~lightning.app.api.http_methods.Put`, :class:`~lightning.app.api.http_methods.Delete`. - ----- - -********************** -1. 
Create a simple App -********************** - -We're going to create a single route ``/name`` that takes a string input ``name`` and stores the value within the ``names`` attribute of the flow state. - -Create a file called ``app.py`` and copy-paste the following code in to the file: - -.. literalinclude:: post_example.py - ----- - -************** -2. Run the App -************** - -Execute the following command in a terminal: - -.. code-block:: - - lightning_app run app app.py - -The following appears: - -.. code-block:: - - Your Lightning App is starting. This won't take long. - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - ----- - -**************** -3. Check the API -**************** - -The Lightning App framework automatically generates API documentation from your App using `Swagger UI `_. - -You can access it by accessing the following URL: ``http://127.0.0.1:7501/docs`` in your browser and validate your API with the route ``/name`` directly from the documentation page as shown below. - -.. video:: https://pl-public-data.s3.amazonaws.com/assets_lightning/rest_post.mp4 - :poster: https://pl-public-data.s3.amazonaws.com/assets_lightning/rest_png.png - :width: 600 - :class: background-video - :autoplay: - :loop: - :muted: - -Alternatively, you can invoke the route directly from a second terminal using `curl `_. - -.. code-block:: - - curl -X 'POST' \ - 'http://127.0.0.1:7501/name?name=my_name' \ - -H 'accept: application/json' \ - -d '' - - "The name my_name was registered" - -And you can see the following in your first terminal running your App. - -.. code-block:: - - Your Lightning App is starting. This won't take long. - INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view - [] - ["my_name"] - -************************************** -Develop a command line interface (CLI) -************************************** - -.. raw:: html - -
-
-
-.. displayitem::
-   :header: Add Requests Validation
-   :description: Learn how to use pydantic with your API.
-   :col_css: col-md-6
-   :button_link: request_validation.html
-   :height: 150
-
-.. displayitem::
-   :header: Develop a Command Line Interface (CLI)
-   :description: Learn how to develop a CLI for your App.
-   :col_css: col-md-6
-   :button_link: ../build_command_line_interface/index.html
-   :height: 150
-
-.. raw:: html
-
-        </div>
-    </div>
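``configure_api`` accepts any of the ``HttpMethod`` subclasses named at the top of this page. A minimal sketch extending the ``POST /name`` example with a read-only route, assuming ``Get`` is importable next to ``Post``; the ``/names`` route and ``handle_get`` handler are hypothetical:

.. code-block:: python

    from lightning.app import LightningApp, LightningFlow
    from lightning.app.api import Get, Post


    class Flow(LightningFlow):
        def __init__(self):
            super().__init__()
            self.names = []

        def run(self):
            pass

        def handle_post(self, name: str):
            self.names.append(name)
            return f"The name {name} was registered"

        def handle_get(self):
            # Read-only companion route returning the registered names.
            return self.names

        def configure_api(self):
            # One entry per exposed route; several routes can coexist.
            return [
                Post(route="/name", method=self.handle_post),
                Get(route="/names", method=self.handle_get),
            ]


    app = LightningApp(Flow())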
diff --git a/docs/source-app/workflows/build_rest_api/index.rst b/docs/source-app/workflows/build_rest_api/index.rst deleted file mode 100644 index 590f3d03d538d..0000000000000 --- a/docs/source-app/workflows/build_rest_api/index.rst +++ /dev/null @@ -1,34 +0,0 @@ -:orphan: - -########### -RESTful API -########### - -**Audience:** Users looking to create an API in their App to allow users to activate functionalities from external sources. - ----- - -********************** -What is a RESTful API? -********************** - -A RESTful API is a set of external URL routes exposed by a server that enables clients to trigger some functionalities, such as getting or putting some data, uploading files, etc.. - -This provides great flexibility for users as they can easily discover functionalities made available by the App Builders. - -The Lightning App framework supports the four primary HTTP methods: `GET`, `POST`, `PUT`, `DELETE`. - -These methods are guidelines to organize your RESTful Services and help users understand your functionalities. - -* **`GET`:** Reads data from the server. -* **`POST`:** Creates new resources. -* **`PUT`:** Updates/replaces existing resources. -* **`DELETE`:** Deletes resources. - -Learn more about `HTTP Methods for RESTful Services here `_. - -The Lightning App framework uses the popular `FastAPI `_ and `Pydantic `_ frameworks under the hood. This means you can use all their features while building your App. - ----- - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/build_rest_api/index_content.rst b/docs/source-app/workflows/build_rest_api/index_content.rst deleted file mode 100644 index 9f77225f24f59..0000000000000 --- a/docs/source-app/workflows/build_rest_api/index_content.rst +++ /dev/null @@ -1,50 +0,0 @@ -************** -Develop an API -************** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Add an API Route to your App - :description: Learn how to develop a simple API for your App. - :col_css: col-md-6 - :button_link: add_api.html - :height: 150 - -.. displayitem:: - :header: Add Requests Validation - :description: Learn how to use pydantic with your API. - :col_css: col-md-6 - :button_link: cli_client.html - :height: 150 - -.. raw:: html - -
-
- ----- - -********** -Learn more -********** - -.. raw:: html - -
-
- -.. displayitem:: - :header: Develop a Command-line Interface - :description: Learn how to develop an CLI for your App. - :col_css: col-md-6 - :button_link: ../build_command_line_interface/index.html - :height: 150 - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/build_rest_api/models.py b/docs/source-app/workflows/build_rest_api/models.py deleted file mode 100644 index 7ebb3ac8c8c17..0000000000000 --- a/docs/source-app/workflows/build_rest_api/models.py +++ /dev/null @@ -1,6 +0,0 @@ -from pydantic import BaseModel - - -# 1. Subclass the BaseModel and defines your payload format. -class NamePostConfig(BaseModel): - name: str diff --git a/docs/source-app/workflows/build_rest_api/post_example.py b/docs/source-app/workflows/build_rest_api/post_example.py deleted file mode 100644 index 0e56117a44fcf..0000000000000 --- a/docs/source-app/workflows/build_rest_api/post_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from lightning.app import LightningFlow, LightningApp -from lightning.app.api import Post - - -class Flow(LightningFlow): - # 1. Define the state - def __init__(self): - super().__init__() - self.names = [] - - # 2. Optional, but used to validate names - def run(self): - print(self.names) - - # 3. Method executed when a request is received. - def handle_post(self, name: str): - self.names.append(name) - return f'The name {name} was registered' - - # 4. Defines this Component's Restful API. You can have several routes. - def configure_api(self): - return [Post(route="/name", method=self.handle_post)] - - -app = LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_rest_api/post_example_pydantic.py b/docs/source-app/workflows/build_rest_api/post_example_pydantic.py deleted file mode 100644 index 35ae78186f538..0000000000000 --- a/docs/source-app/workflows/build_rest_api/post_example_pydantic.py +++ /dev/null @@ -1,32 +0,0 @@ -from models import NamePostConfig # 2. Import your custom model. - -from lightning.app import LightningFlow, LightningApp -from lightning.app.api import Post - - -class Flow(LightningFlow): - # 1. Define the state - def __init__(self): - super().__init__() - self.names = [] - - # 2. Optional, but used to validate names - def run(self): - print(self.names) - - # 3. Annotate your input with your custom pydantic model. - def handle_post(self, config: NamePostConfig): - self.names.append(config.name) - return f'The name {config} was registered' - - # 4. Defines this Component's Restful API. You can have several routes. - def configure_api(self): - return [ - Post( - route="/name", - method=self.handle_post, - ) - ] - - -app = LightningApp(Flow()) diff --git a/docs/source-app/workflows/build_rest_api/request_validation.rst b/docs/source-app/workflows/build_rest_api/request_validation.rst deleted file mode 100644 index 6caaccdc239d2..0000000000000 --- a/docs/source-app/workflows/build_rest_api/request_validation.rst +++ /dev/null @@ -1,69 +0,0 @@ -:orphan: - -*********************** -Add Requests Validation -*********************** - -The Lightning App framework uses the popular `FastAPI `_ and `Pydantic `_ frameworks under the hood. This means you can use all their features while building your App. - -pydantic enables fast data validation and settings management using Python type annotations and FastAPI is a modern, fast (high-performance), web framework for building APIs. - -You can easily use pydantic by defining your own payload format. - -.. literalinclude:: models.py - -Then, type your handler input with your custom model. - -.. literalinclude:: post_example_pydantic.py - -After running the updated App, the App documentation ``/name`` has changed and takes JSON with ``{"name": ...}`` as input. - -.. 
figure:: https://pl-public-data.s3.amazonaws.com/assets_lightning/rest_post_pydantic.png - :alt: Rest API with pydantic - :width: 100 % - -You can invoke the RESTful API route ``/name`` with the following command: - -.. code-block:: bash - - curl -X 'POST' \ - 'http://127.0.0.1:7501/name' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "name": "my_name" - }' - -.. note:: - - Using curl, you can pass a JSON payload using the ``-d`` argument. - ----- - -********** -Learn more -********** - -.. raw:: html - -
-
-
-.. displayitem::
-   :header: Add an API Route to your App
-   :description: Learn how to develop a simple API for your App.
-   :col_css: col-md-6
-   :button_link: add_api.html
-   :height: 150
-
-.. displayitem::
-   :header: Develop a Command Line Interface (CLI)
-   :description: Learn how to develop a CLI for your App.
-   :col_css: col-md-6
-   :button_link: ../build_command_line_interface/index.html
-   :height: 150
-
-.. raw:: html
-
-        </div>
-    </div>
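Because validation is delegated to pydantic, the model itself is the natural place to tighten the contract. A minimal sketch, assuming standard pydantic ``Field`` constraints rather than any Lightning-specific API; the length bounds are illustrative:

.. code-block:: python

    from pydantic import BaseModel, Field


    # Hypothetical tightening of the NamePostConfig model above: empty or
    # overly long names are rejected before the handler runs, and FastAPI
    # answers with a 422 validation error instead.
    class NamePostConfig(BaseModel):
        name: str = Field(..., min_length=1, max_length=64)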
diff --git a/docs/source-app/workflows/debug_locally.rst b/docs/source-app/workflows/debug_locally.rst deleted file mode 100644 index cd5a5a80fde7c..0000000000000 --- a/docs/source-app/workflows/debug_locally.rst +++ /dev/null @@ -1,5 +0,0 @@ -:orphan: - -##################################### -Debug a Distributed Cloud App Locally -##################################### diff --git a/docs/source-app/workflows/enable_fault_tolerance.rst b/docs/source-app/workflows/enable_fault_tolerance.rst deleted file mode 100644 index b1630d4d396ac..0000000000000 --- a/docs/source-app/workflows/enable_fault_tolerance.rst +++ /dev/null @@ -1,5 +0,0 @@ -:orphan: - -###################### -Enable Fault Tolerance -###################### diff --git a/docs/source-app/workflows/extend_app.rst b/docs/source-app/workflows/extend_app.rst deleted file mode 100644 index bc7ffd7d2f87a..0000000000000 --- a/docs/source-app/workflows/extend_app.rst +++ /dev/null @@ -1,59 +0,0 @@ -###################### -Extend an Existing App -###################### -You can extend a Lightning App by using community components or building your own. - ----- - -.. raw:: html - -
-
- -.. Add callout items below this line - -.. displayitem:: - :header: Add more Components - :description: Extend an App by adding a prebuilt component. - :col_css: col-md-4 - :button_link: add_components.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Add a web user interface (UI) - :description: Extend an App by adding a web user interface (UI) - :col_css: col-md-4 - :button_link: add_web_ui/index.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Add a URL link - :description: Extend an App by adding a web URL link - :col_css: col-md-4 - :button_link: add_web_link.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Build a Component - :description: Extend an App by building a Component - :col_css: col-md-4 - :button_link: build_lightning_component/index.html - :height: 150 - :tag: basic - -.. displayitem:: - :header: Add a server - :description: Extend an App by adding a server to a Component. - :col_css: col-md-4 - :button_link: add_server/index.html - :height: 150 - :tag: Intermediate - - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/index.rst b/docs/source-app/workflows/index.rst deleted file mode 100644 index 9993971873286..0000000000000 --- a/docs/source-app/workflows/index.rst +++ /dev/null @@ -1,186 +0,0 @@ -.. toctree:: - :maxdepth: 1 - :hidden: - - access_app_state - add_web_ui/index - add_web_link - secrets <../glossary/secrets> - arrange_tabs/index - connect components <../levels/intermediate/connect_lightning_components> - build components <../levels/basic/build_a_lightning_component> - run_work_once - cloud compute <../core_api/lightning_work/compute> - build_command_line_interface/index - rest API <../glossary/restful_api/restful_api> - extend_app - build_lightning_component/publish_a_component - add_server/index - run_app_on_cloud/index - run_work_in_parallel - drive <../glossary/storage/drive> - share_app - share_files_between_components - -####### -How to: -####### - -.. raw:: html - -
-
- -.. displayitem:: - :header: Access the App State - :description: Learn to work with the app state - :col_css: col-md-4 - :button_link: access_app_state.html - :height: 180 - -.. displayitem:: - :header: Add a web user interface - :description: Learn how to add React, StreamLit, Dash to your App. - :col_css: col-md-4 - :button_link: add_web_ui/index.html - :height: 180 - -.. displayitem:: - :header: Add a web link - :description: Learn how to embed external websites - :col_css: col-md-4 - :button_link: add_web_link.html - :height: 180 - -.. displayitem:: - :header: Add encrypted secrets - :description: Learn how to organize your UI - :col_css: col-md-4 - :button_link: ../glossary/secrets.html - :height: 180 - -.. displayitem:: - :header: Arrange App tabs - :description: Learn how to organize your UI - :col_css: col-md-4 - :button_link: arrange_tabs/index.html - :height: 180 - -.. displayitem:: - :header: Build a Lightning App - :description: Simple App to get started - :col_css: col-md-4 - :button_link: ../levels/basic/connect_lightning_components.html - :height: 180 - -.. displayitem:: - :header: Build a Lightning Component - :description: Understand how to separated the glue from the actual work - :col_css: col-md-4 - :button_link: ../levels/basic/build_a_lightning_component.html - :height: 180 - -.. displayitem:: - :header: Cache Work run calls - :description: Understand how to trigger a work run method - :col_css: col-md-4 - :button_link: run_work_once.html - :height: 180 - -.. displayitem:: - :header: Customize your cloud compute - :description: Select machines to run on - :col_css: col-md-4 - :button_link: ../core_api/lightning_work/compute.html - :height: 180 - -.. displayitem:: - :header: Develop a Command Line Interface (CLI) - :description: Learn to develop a CLI - :col_css: col-md-4 - :button_link: build_command_line_interface/index.html - :height: 180 - -.. displayitem:: - :header: Develop a Lightning App - :description: Learn to connect components together into a Lightning App - :col_css: col-md-4 - :button_link: ../levels/basic/connect_lightning_components.html - :height: 180 - -.. displayitem:: - :header: Develop a REST API - :description: Learn to deploy a model behind a REST API - :col_css: col-md-4 - :button_link: ../glossary/restful_api/restful_api.html - :height: 180 - -.. displayitem:: - :header: Extend an existing App - :description: Learn where to go next with an App - :col_css: col-md-4 - :button_link: extend_app.html - :height: 180 - -.. displayitem:: - :header: Publish a Lightning Component - :description: Share your components with others - :col_css: col-md-4 - :button_link: build_lightning_component/publish_a_component.html - :height: 180 - -.. displayitem:: - :header: Run a server within a Lightning App - :description: Lightning Work can be infinite jobs - :col_css: col-md-4 - :button_link: add_server/index.html - :height: 180 - -.. displayitem:: - :header: Run an App on the cloud - :description: Learn how to get things done in the cloud with ease - :col_css: col-md-4 - :button_link: run_app_on_cloud/index.html - :height: 180 - -.. displayitem:: - :header: Run Works in parallel - :description: Learn how to make your Work non blocking - :col_css: col-md-4 - :button_link: run_work_in_parallel.html - :height: 180 - -.. displayitem:: - :header: Save files - :description: Learn how to save files in a work by using Drive - :col_css: col-md-4 - :button_link: ../glossary/storage/drive.html - :height: 180 - -.. 
displayitem:: - :header: Share an App - :description: Learn how to share your work with others - :col_css: col-md-4 - :button_link: share_app.html - :height: 180 - -.. displayitem:: - :header: Share files between components - :description: Learn how Lightning Storage emulates a single filesystem in a distributed setting - :col_css: col-md-4 - :button_link: share_files_between_components.html - :height: 180 - -.. displayitem:: - :header: Mount Cloud Data - :description: Learn how Lightning Mounts are used to make the contents of an cloud object store bucket available on disk when running in the cloud. - :col_css: col-md-4 - :button_link: mount_cloud_object_store.html - :height: 180 - - - -.. raw:: html - -
-
diff --git a/docs/source-app/workflows/mount_cloud_object_store.rst b/docs/source-app/workflows/mount_cloud_object_store.rst deleted file mode 100644 index 31ac2a44aa3c0..0000000000000 --- a/docs/source-app/workflows/mount_cloud_object_store.rst +++ /dev/null @@ -1,141 +0,0 @@ -:orphan: - -############## -Add Cloud Data -############## - -**Audience:** Users who want to read files stored in a Cloud Object Bucket in an app. - -****************************** -Mounting Public AWS S3 Buckets -****************************** - -=================== -Add Mount to a Work -=================== - -To mount data from a cloud bucket to your app compute, initialize a :class:`~lightning.app.storage.mount.Mount` -object with the source path of the s3 bucket and the absolute directory path where it should be mounted and -pass the :class:`~lightning.app.storage.mount.Mount` to the :class:`~lightning.app.utilities.packaging.cloud_compute.CloudCompute` -of the :class:`~lightning.app.core.work.LightningWork` it should be mounted on. - -In this example, we will mount an S3 bucket: ``s3://ryft-public-sample-data/esRedditJson/`` to ``/content/esRedditJson/``. - -.. code-block:: python - - from lightning.app import CloudCompute - from lightning.app.storage import Mount - - self.my_work = MyWorkClass( - cloud_compute=CloudCompute( - mounts=Mount( - source="s3://ryft-public-sample-data/esRedditJson/", - mount_path="/content/esRedditJson/", - ), - ) - ) - -You can also pass multiple mounts to a single work by passing a ``List[Mount(...), ...]`` to the -``CloudCompute(mounts=...)`` argument. - -.. note:: - - * Mounts supported up to 1 Million files, 5GB per file. Need larger mounts? Contact support@lightning.ai - * When adding multiple mounts, each one should have a unique ``mount_path``. - * A maximum of 10 :class:`~lightning.app.storage.mount.Mount`\s can be added to a :class:`~lightning.app.core.work.LightningWork`. - -======================= -Read Files From a Mount -======================= - -Once a :class:`~lightning.app.storage.mount.Mount` object is passed to :class:`~lightning.app.utilities.packaging.cloud_compute.CloudCompute`, -you can access, list, or read any file from the mount under the specified ``mount_path``, just like you would if it -was on your local machine. - -Assuming your ``mount_path`` is ``"/content/esRedditJson/"`` you can do the following: - ----------- -Read Files ----------- - -.. code-block:: python - - with open("/content/esRedditJson/esRedditJson1", "r") as f: - some_data = f.read() - - # do something with "some_data"... - ----------- -List Files ----------- - -.. code-block:: python - - files = os.listdir("/content/esRedditJson/") - --------------------- -See the Full Example --------------------- - -.. code-block:: python - :emphasize-lines: 10,15 - - import os - - import lightning as L - from lightning.app import CloudCompute - from lightning.app.storage import Mount - - class ReadMount(L.LightningWork): - def run(self): - # Print a list of files stored in the mounted S3 Bucket. - files = os.listdir("/content/esRedditJson/") - for file in files: - print(file) - - # Read the contents of a particular file in the bucket "esRedditJson1" - with open("/content/esRedditJson/esRedditJson1", "r") as f: - some_data = f.read() - # do something with "some_data"... 
- - class Flow(L.LightningFlow): - def __init__(self): - super().__init__() - self.my_work = ReadMount( - cloud_compute=CloudCompute( - mounts=Mount( - source="s3://ryft-public-sample-data/esRedditJson/", - mount_path="/content/esRedditJson/", - ), - ) - ) - - def run(self): - self.my_work.run() - -.. note:: - - When running a Lightning App on your local machine, any :class:`~lightning.app.utilities.packaging.cloud_compute.CloudCompute` - configuration (including a :class:`~lightning.app.storage.mount.Mount`) is ignored at runtime. If you need access to - these files on your local disk, you should download a copy of them to your machine. - -.. note:: - - Mounted files from an S3 bucket are ``read-only``. Any modifications, additions, or deletions - to files in the mounted directory will not be reflected in the cloud object store. - ----- - -********************************************** -Mounting Private AWS S3 Buckets - Coming Soon! -********************************************** - -We'll Let you know when this feature is ready! - ----- - -************************************************ -Mounting Google Cloud GCS Buckets - Coming Soon! -************************************************ - -We'll Let you know when this feature is ready! diff --git a/docs/source-app/workflows/run_app_on_cloud/cloud_files.rst b/docs/source-app/workflows/run_app_on_cloud/cloud_files.rst deleted file mode 100644 index 8e8f01a00f3d7..0000000000000 --- a/docs/source-app/workflows/run_app_on_cloud/cloud_files.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. _ignore: - -################################## -Configure Your Lightning Cloud App -################################## - -**Audience:** Users who want to control Lightning App files on the cloud. - ----- - -************************************** -Ignore file uploads to Lightning cloud -************************************** -Running Lightning Apps on the cloud will upload the source code of your app to the cloud. You can use ``.lightningignore`` file(s) to ignore files or directories while uploading. The `.lightningignore` file follows the same format as a `.gitignore` -file. - -For example, the source code directory below with the ``.lightningignore`` file will ignore the file named -``model.pt`` and directory named ``data_dir``. - -.. code:: bash - - . - ├── README.md - ├── app.py - ├── data_dir - │ ├── image1.png - │ ├── image2.png - │ └── ... - ├── .lightningignore - ├── requirements.txt - └── model.pt - -.. code:: bash - - ~/project/home ❯ cat .lightningignore - model.pt - data_dir - -A sample ``.lightningignore`` file can be found `here `_. - -If you are a component author and your components creates local files that you want to ignore, you can do: - -.. code-block:: python - - class MyComponent(L.LightningWork): # or L.LightningFlow - def __init__(self): - super().__init__() - self.lightningignore = ("model.pt", "data_dir") - - -This has the benefit that the files will be ignored automatically for all the component users, making an easier -transition between running locally vs in the cloud. - ----- - -******************* -Structure app files -******************* - -We recommend your app contain the following files: - -.. code:: bash - - . 
- ├── .lightning (auto-generated- contains Lightning configuration) - ├── .lightningignore (contains files not to upload to the cloud) - ├── app.py - ├── README.md (optional- a markdown description of your app) - └── requirements.txt (optional- contains all your app dependencies) diff --git a/docs/source-app/workflows/run_app_on_cloud/index.rst b/docs/source-app/workflows/run_app_on_cloud/index.rst deleted file mode 100644 index 55bc3b6807809..0000000000000 --- a/docs/source-app/workflows/run_app_on_cloud/index.rst +++ /dev/null @@ -1,5 +0,0 @@ -##################### -Run apps on the cloud -##################### - -.. include:: index_content.rst diff --git a/docs/source-app/workflows/run_app_on_cloud/index_content.rst b/docs/source-app/workflows/run_app_on_cloud/index_content.rst deleted file mode 100644 index 737b4365df4e2..0000000000000 --- a/docs/source-app/workflows/run_app_on_cloud/index_content.rst +++ /dev/null @@ -1,115 +0,0 @@ -.. _run_app_in_cloud: - -.. toctree:: - :maxdepth: 1 - :hidden: - - cloud_files - lightning_cloud - on_prem - on_your_own_machine - -**Audience:** Users who want to share or scale Lightning Apps. - ----- - -***************************** -Run on Lightning Public Cloud -***************************** - -You can run Lightning Apps for free on the Public Lightning cloud with a single flag! - -.. raw:: html - -
-   <div class="display-card-container">
-      <div class="row">
-
-.. Add callout items below this line
-
-.. displayitem::
-   :header: Run on Lightning Cloud
-   :description: Learn how to run on the Lightning public cloud
-   :col_css: col-md-4
-   :button_link: lightning_cloud.html
-   :height: 150
-   :tag: basic
-
-.. displayitem::
-   :header: Choose Hardware
-   :description: Configure your app's cloud resources
-   :col_css: col-md-4
-   :button_link: ../../core_api/lightning_work/compute.html
-   :height: 150
-   :tag: basic
-
-.. displayitem::
-   :header: Set Environment Variables
-   :description: Manage your environment variables in the cloud
-   :col_css: col-md-4
-   :button_link: ../../glossary/environment_variables.html
-   :height: 150
-   :tag: basic
-
-.. displayitem::
-   :header: Configure Your Lightning Cloud App
-   :description: Customize your cloud app's files
-   :col_css: col-md-4
-   :button_link: cloud_files.html
-   :height: 150
-   :tag: intermediate
-
-.. displayitem::
-   :header: Manage App Dependencies
-   :description: Configure your Python requirements or use a custom Docker image
-   :col_css: col-md-4
-   :button_link: ../../glossary/build_config/build_config.html
-   :height: 150
-   :tag: intermediate
-
-.. displayitem::
-   :header: Share Files Between Works
-   :description: Learn more about transferring data between works
-   :col_css: col-md-4
-   :button_link: ../../glossary/storage/storage.html
-   :height: 150
-   :tag: intermediate
-
-.. raw:: html
-
-      </div>
-   </div>
-
-----
-
-************
-Other Clouds
-************
-
-.. raw:: html
-
-   <div class="display-card-container">
-      <div class="row">
-
-.. Add callout items below this line
-
-.. displayitem::
-   :header: Run On Your Own Machine
-   :description: Run Lightning Apps on any machine
-   :col_css: col-md-4
-   :button_link: on_your_own_machine.html
-   :height: 150
-   :tag: basic
-
-.. displayitem::
-   :header: Run On Your Private Cloud
-   :description: Run Lightning Apps on your own cloud
-   :col_css: col-md-4
-   :button_link: on_prem.html
-   :height: 150
-   :tag: basic
-
-
-.. raw:: html
-
-      </div>
diff --git a/docs/source-app/workflows/run_app_on_cloud/lightning_cloud.rst b/docs/source-app/workflows/run_app_on_cloud/lightning_cloud.rst deleted file mode 100644 index 494a523852bd9..0000000000000 --- a/docs/source-app/workflows/run_app_on_cloud/lightning_cloud.rst +++ /dev/null @@ -1,67 +0,0 @@ -####################### -Run an App on the Cloud -####################### - -**Audience:** Users who want to share their apps or run on specialized hardware (like GPUs). - ----- - -********************************* -Run on the public Lightning cloud -********************************* -To run any app on the public lightning cloud use the ``--cloud`` argument: - -.. code:: bash - - lightning_app run app app.py --cloud - - -.. note:: - By default, running your apps on the public Lightning cloud is free of charge using default CPUs, and any app uploaded to the Lightning cloud will be shared with the community (source code and app view will be public). If you would like to make your apps private please `contact us `_. - -If your app contains ``LightningWork`` components that require more compute resources, such as larger CPUs or **GPUs**, you'll need to add credits to your Lightning AI account. - - ----- - -************************** -Add dependencies to my app -************************** - - -Add all dependencies required to run your app to a `requirements.txt` file in your app's directory. Read :ref:`build_config` for more details. - - - ----- - - -******** -Name app -******** - -Simply use the ``--name`` flag when running your app, for example: - -.. code:: bash - - lightning_app run app app.py --cloud --name my-awesome-app - -Alternatively, you can change the name of the app in the ``.lightning`` file: - -.. code:: bash - - ~/project/home ❯ cat .lightning - name: my-awesome-app - -The ``.lightning`` file is a general configuration file. -To learn more about optional configuration file parameters, see :class:`~lightning.app.utilities.packaging.app_config.AppConfig`. - ------- - -******************** -Choose Cloud Compute -******************** - -You can configure the hardware your app is running on by setting a :class:`~lightning.app.utilities.packaging.cloud_compute.CloudCompute` object onto the ``cloud_compute`` property of your work's. - -Learn more with the :ref:`cloud_compute` guide diff --git a/docs/source-app/workflows/run_app_on_cloud/on_prem.rst b/docs/source-app/workflows/run_app_on_cloud/on_prem.rst deleted file mode 100644 index be0a954f29b16..0000000000000 --- a/docs/source-app/workflows/run_app_on_cloud/on_prem.rst +++ /dev/null @@ -1,6 +0,0 @@ -########################### -Run an App on Private Cloud -########################### - - -To run Lightning apps on a private or on-prem cluster, `contact us `_. diff --git a/docs/source-app/workflows/run_app_on_cloud/on_your_own_machine.rst b/docs/source-app/workflows/run_app_on_cloud/on_your_own_machine.rst deleted file mode 100644 index 8226a1a00469b..0000000000000 --- a/docs/source-app/workflows/run_app_on_cloud/on_your_own_machine.rst +++ /dev/null @@ -1,26 +0,0 @@ -####################### -Run on your own machine -####################### - -**Audience:** Users who want to run Lightning App on a remote machine. - ----- - -*********** -Run via ssh -*********** -To run a Lightning App on any machine, simply ssh to the machine and run the app directly - -.. 
diff --git a/docs/source-app/workflows/run_app_on_cloud/on_prem.rst b/docs/source-app/workflows/run_app_on_cloud/on_prem.rst
deleted file mode 100644
index be0a954f29b16..0000000000000
--- a/docs/source-app/workflows/run_app_on_cloud/on_prem.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-###########################
-Run an App on Private Cloud
-###########################
-
-
-To run Lightning apps on a private or on-prem cluster, `contact us `_.
diff --git a/docs/source-app/workflows/run_app_on_cloud/on_your_own_machine.rst b/docs/source-app/workflows/run_app_on_cloud/on_your_own_machine.rst
deleted file mode 100644
index 8226a1a00469b..0000000000000
--- a/docs/source-app/workflows/run_app_on_cloud/on_your_own_machine.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-#######################
-Run on your own machine
-#######################
-
-**Audience:** Users who want to run a Lightning App on a remote machine.
-
-----
-
-***********
-Run via SSH
-***********
-To run a Lightning App on any machine, SSH into the machine and run the app directly:
-
-.. code:: bash
-
-   # Copy over credentials from your local machine to your cloud machine
-   scp ~/.lightning/credentials.json your_name@your_cloud_machine:~/.lightning
-
-   # Log into your cloud machine
-   ssh your_name@your_cloud_machine
-
-   # Get your code on the machine and install deps
-   ...
-
-   # Start the app
-   lightning run app app.py
diff --git a/docs/source-app/workflows/run_app_snippet.rst b/docs/source-app/workflows/run_app_snippet.rst
deleted file mode 100644
index 6283cb88104eb..0000000000000
--- a/docs/source-app/workflows/run_app_snippet.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-:orphan:
-
-***********
-Run the app
-***********
-
-.. raw:: html
-
-
-
-Run the app with the ``run`` command:
-
-.. code:: bash
-
-   lightning_app run app app.py
-
-.. raw:: html
-
-
-
-
-Add the ``--cloud`` argument to run on the `lightning cloud `_.
-
-.. code:: bash
-
-   lightning_app run app app.py --cloud
-
-.. raw:: html
-
-
diff --git a/docs/source-app/workflows/run_components_on_different_hardware.rst b/docs/source-app/workflows/run_components_on_different_hardware.rst deleted file mode 100644 index 9685c3461e511..0000000000000 --- a/docs/source-app/workflows/run_components_on_different_hardware.rst +++ /dev/null @@ -1,5 +0,0 @@ -:orphan: - -#################################### -Run components on different hardware -#################################### diff --git a/docs/source-app/workflows/run_on_private_cloud.rst b/docs/source-app/workflows/run_on_private_cloud.rst deleted file mode 100644 index 84d64e9060bde..0000000000000 --- a/docs/source-app/workflows/run_on_private_cloud.rst +++ /dev/null @@ -1,26 +0,0 @@ -:orphan: - -###################### -Run on a private cloud -###################### -**Audience:** Users looking to run Lightning apps on their private cloud accounts. - ----- - -****************************** -Run on a private cloud account -****************************** -For enterprise, startups and University use-cases, Lightning AI can run on your own AWS account (with your own credentials), with all the infrastructure fully managed by us. -To enable this, contact our support team to get started: - -onprem@lightning.ai - ----- - - -*********** -Run on-prem -*********** -For enterprise-level security with full control of the Lightning AI system on your own on-prem cluster, contact our support team to get started: - -onprem@lightning.ai diff --git a/docs/source-app/workflows/run_work_in_parallel.rst b/docs/source-app/workflows/run_work_in_parallel.rst deleted file mode 100644 index b87e653afc920..0000000000000 --- a/docs/source-app/workflows/run_work_in_parallel.rst +++ /dev/null @@ -1,10 +0,0 @@ -############################# -Run LightningWork in parallel -############################# -**Audience:** Users who want to run a LightningWork in parallel (asynchronously). - -**Prereqs:** You must have finished the :doc:`Basic levels <../levels/basic/index>`. - ----- - -.. include:: run_work_in_parallel_content.rst diff --git a/docs/source-app/workflows/run_work_in_parallel_content.rst b/docs/source-app/workflows/run_work_in_parallel_content.rst deleted file mode 100644 index 1c8d5b374dbb2..0000000000000 --- a/docs/source-app/workflows/run_work_in_parallel_content.rst +++ /dev/null @@ -1,41 +0,0 @@ - - - -************************************ -When to run a Components in parallel -************************************ -Run LightningWork in parallel when you want to execute work in the background or at the same time as another work. -An example of when this comes up in machine learning is when data streams-in while a model trains. - ----- - -************ -Toy example -************ -By default, a Component must complete before the next one runs. We can enable one -component to start in parallel which allows the code to proceed without having -to wait for the first one to finish. - -.. lit_tabs:: - :descriptions: No parallel components; Allow the train component to run in parallel; When the component runs, it will run in parallel; The next component is unblocked and can now immediately run. 
-.. lit_tabs::
-   :descriptions: No parallel components; Allow the train component to run in parallel; When the component runs, it will run in parallel; The next component is unblocked and can now immediately run.
-   :code_files: /workflows/scripts/parallel/toy_app.py; /workflows/scripts/parallel/toy_parallel.py; /workflows/scripts/parallel/toy_parallel.py; /workflows/scripts/parallel/toy_parallel.py
-   :highlights: ; 18; 23; 24
-   :enable_run: true
-   :tab_rows: 3
-   :height: 540px
-
-----
-
-*******************************
-Multiple components in parallel
-*******************************
-In this example, we start all 3 components at once. The first two start in parallel, which
-allows the third component to run without waiting for the others to finish.
-
-.. lit_tabs::
-   :descriptions: No parallel components; Enable 2 components to run in parallel; Start both components together in parallel; Last component is not blocked and can start immediately.
-   :code_files: /workflows/scripts/parallel/toy_two_parallel_not_started.py; /workflows/scripts/parallel/toy_two_parallel.py; /workflows/scripts/parallel/toy_two_parallel.py; /workflows/scripts/parallel/toy_two_parallel.py
-   :highlights: ; 18, 19; 23, 24; 25
-   :enable_run: true
-   :tab_rows: 3
-   :height: 540px
diff --git a/docs/source-app/workflows/run_work_once.rst b/docs/source-app/workflows/run_work_once.rst
deleted file mode 100644
index 8bdd576a2bc76..0000000000000
--- a/docs/source-app/workflows/run_work_once.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-########################
-Cache LightningWork Runs
-########################
-
-**Audience:** Users who want to know how ``LightningWork`` works.
-
-**Level:** Advanced
-
-**Prereqs:** Level 16+ and read the :doc:`Event Loop guide <../glossary/event_loop>`.
-
-----
-
-.. include:: run_work_once_content.rst
diff --git a/docs/source-app/workflows/run_work_once_content.rst b/docs/source-app/workflows/run_work_once_content.rst
deleted file mode 100644
index 355c2ed2bac43..0000000000000
--- a/docs/source-app/workflows/run_work_once_content.rst
+++ /dev/null
@@ -1,151 +0,0 @@
-
-********************************************************
-What caching the calls of Work's run method does for you
-********************************************************
-
-By default, the run method in a LightningWork (Work) "remembers" (caches) the input arguments it has been called with and does not execute again when called with the same arguments.
-In other words, the run method only executes when the input arguments have never been seen before.
-
-You can turn caching on or off:
-
-.. code-block:: python
-
-    # Run only when the input arguments change (default)
-    work = MyWork(cache_calls=True)
-
-    # Run every time regardless of whether input arguments change or not
-    work = MyWork(cache_calls=False)
-
-To better understand this, imagine that every day you want to sequentially download and process some data and then train a model on that data.
-As explained in the :doc:`Event Loop guide <../../glossary/event_loop>`, the Lightning App runs within an infinite while loop, so the pseudo-code of your application might look like this:
-
-.. code-block:: python
-
-    from datetime import datetime
-
-    # Lightning code
-    while True:  # This is the Lightning Event Loop
-
-        # Your code
-        today = datetime.now().strftime("%D")  # '05/25/22'
-        data_processor.run(today)
-        train_model.run(data_processor.data)
-
-In this scenario, you want your components to run ``once`` a day, and no more than that! But your code is running within an infinite loop, so how can this even work?
-This is where the Work's internal caching mechanism comes in.
-By default, Lightning caches a hash of the input provided to its run method and won't re-execute the method if the same input is provided again.
-In the example above, the **data_processor** component's run method receives the string **"05/25/22"**. It runs once, and any further execution during that day is skipped until the next day is reached and the run method receives **"05/26/22"**. This logic applies every day.
-This caching mechanism is inspired by how `React.js Components and Props `_ render websites. Only changes to the inputs re-trigger execution.
-
-***************
-Caching Example
-***************
-
-Here's an example of this behavior with LightningWork:
-
-.. code:: python
-    :emphasize-lines: 11, 17
-
-    import lightning as L
-
-
-    class ExampleWork(L.LightningWork):
-        def run(self, *args, **kwargs):
-            print(f"I received the following props: args: {args} kwargs: {kwargs}")
-
-
-    work = ExampleWork()
-    work.run(value=1)
-
-    # Providing the same value. This won't run as it is already cached.
-    work.run(value=1)
-    work.run(value=1)
-    work.run(value=1)
-    work.run(value=1)
-
-    # Changing the provided value. This isn't cached and will run again.
-    work.run(value=10)
-
-And you should see the following by running the code above:
-
-.. code-block:: console
-
-    $ python example.py
-    INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-    # After you have clicked `run` on the UI.
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 10}
-
-As you can see, the intermediate runs didn't execute, as we would expect when ``cache_calls=True``.
-
-***********************************
-Implications of turning caching off
-***********************************
-
-By setting ``cache_calls=False``, Lightning won't cache the calls and will re-execute the run method on every call.
-
-.. code:: python
-    :emphasize-lines: 7
-
-    from lightning.app import LightningWork
-
-
-    class ExampleWork(LightningWork):
-        def run(self, *args, **kwargs):
-            print(f"I received the following props: args: {args} kwargs: {kwargs}")
-
-
-    work = ExampleWork(cache_calls=False)
-    work.run(value=1)
-
-    # Providing the same value. With caching off, this runs every time.
-    work.run(value=1)
-    work.run(value=1)
-    work.run(value=1)
-    work.run(value=1)
-
-    # Changing the provided value. This will also run again.
-    work.run(value=10)
-
-.. code-block:: console
-
-    $ python example.py
-    INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-    # After you have clicked `run` on the UI.
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 1}
-    I received the following props: args: () kwargs: {'value': 10}
-
-Be aware that when setting both ``cache_calls=False`` and ``parallel=False`` on a work, the code after ``self.work.run()`` is unreachable,
-as the work continuously executes in a blocking way.
-
-.. code-block:: python
-    :emphasize-lines: 13, 16-18
-
-    from lightning.app import LightningApp, LightningFlow, LightningWork
-
-
-    class Work(LightningWork):
-        def run(self):
-            pass
-
-
-    class Flow(LightningFlow):
-        def __init__(self):
-            super().__init__()
-
-            self.work = Work(cache_calls=False, parallel=False)
-
-        def run(self):
-            print("HERE BEFORE")
-            self.work.run()
-            print("HERE AFTER")
-
-
-    app = LightningApp(Flow())
-
-.. code-block:: console
-
-    $ lightning run app app.py
-    INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-    HERE BEFORE
-    HERE BEFORE
-    HERE BEFORE
-    ...
diff --git a/docs/source-app/workflows/schedule_apps.rst b/docs/source-app/workflows/schedule_apps.rst
deleted file mode 100644
index 7b596cd08b179..0000000000000
--- a/docs/source-app/workflows/schedule_apps.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-:orphan:
-
-#################
-Schedule App Runs
-#################
diff --git a/docs/source-app/workflows/scripts/parallel/toy_app.py b/docs/source-app/workflows/scripts/parallel/toy_app.py
deleted file mode 100644
index 8cfcd61c5bd31..0000000000000
--- a/docs/source-app/workflows/scripts/parallel/toy_app.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# app.py
-from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute
-
-
-class TrainComponent(LightningWork):
-    def run(self, message):
-        for i in range(100000000000):
-            print(message, i)
-
-class AnalyzeComponent(LightningWork):
-    def run(self, message):
-        for i in range(100000000000):
-            print(message, i)
-
-class LitWorkflow(LightningFlow):
-    def __init__(self) -> None:
-        super().__init__()
-        self.train = TrainComponent(cloud_compute=CloudCompute('cpu'))
-        self.analyze = AnalyzeComponent(cloud_compute=CloudCompute('cpu'))
-
-
-    def run(self):
-        self.train.run("machine A counting")
-        self.analyze.run("machine B counting")
-
-
-app = LightningApp(LitWorkflow())
diff --git a/docs/source-app/workflows/scripts/parallel/toy_parallel.py b/docs/source-app/workflows/scripts/parallel/toy_parallel.py
deleted file mode 100644
index 6b4059caa4209..0000000000000
--- a/docs/source-app/workflows/scripts/parallel/toy_parallel.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# app.py
-from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute
-
-
-class TrainComponent(LightningWork):
-    def run(self, message):
-        for i in range(100000000000):
-            print(message, i)
-
-class AnalyzeComponent(LightningWork):
-    def run(self, message):
-        for i in range(100000000000):
-            print(message, i)
-
-class LitWorkflow(LightningFlow):
-    def __init__(self) -> None:
-        super().__init__()
-        self.train = TrainComponent(cloud_compute=CloudCompute('cpu'), parallel=True)
-        self.analyze = AnalyzeComponent(cloud_compute=CloudCompute('cpu'))
-
-
-    def run(self):
-        self.train.run("machine A counting")
-        self.analyze.run("machine B counting")
-
-
-app = LightningApp(LitWorkflow())
diff --git a/docs/source-app/workflows/scripts/parallel/toy_two_parallel.py b/docs/source-app/workflows/scripts/parallel/toy_two_parallel.py
deleted file mode 100644
index 57967137f0f25..0000000000000
--- a/docs/source-app/workflows/scripts/parallel/toy_two_parallel.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# app.py
-from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute
-
-
-class TrainComponent(LightningWork):
-    def run(self, message):
-        for i in range(100000000000):
-            print(message, i)
-
-class AnalyzeComponent(LightningWork):
-    def run(self, message):
-        for i in range(100000000000):
-            print(message, i)
-
-class LitWorkflow(LightningFlow):
-    def __init__(self) -> None:
-        super().__init__()
-        self.train = TrainComponent(cloud_compute=CloudCompute('cpu'), parallel=True)
-        self.baseline_1 = TrainComponent(cloud_compute=CloudCompute('cpu'), parallel=True)
-        self.analyze = AnalyzeComponent(cloud_compute=CloudCompute('cpu'))
-
-    def run(self):
-        self.train.run("machine A counting")
-        self.baseline_1.run("machine C counting")
-        self.analyze.run("machine B counting")
-
-app = LightningApp(LitWorkflow())
diff --git a/docs/source-app/workflows/scripts/parallel/toy_two_parallel_not_started.py b/docs/source-app/workflows/scripts/parallel/toy_two_parallel_not_started.py
deleted file mode 100644
index 7ded9d8a93935..0000000000000
--- a/docs/source-app/workflows/scripts/parallel/toy_two_parallel_not_started.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# app.py
-from lightning.app import LightningWork, LightningFlow, LightningApp, CloudCompute
-
-
-class TrainComponent(LightningWork):
-    def run(self, message):
-        for i in range(100000000000):
-            print(message, i)
-
-class AnalyzeComponent(LightningWork):
-    def run(self, message):
-        for i in range(100000000000):
-            print(message, i)
-
-class LitWorkflow(LightningFlow):
-    def __init__(self) -> None:
-        super().__init__()
-        self.train = TrainComponent(cloud_compute=CloudCompute('cpu'))
-        self.baseline_1 = TrainComponent(cloud_compute=CloudCompute('cpu'))
-        self.analyze = AnalyzeComponent(cloud_compute=CloudCompute('cpu'))
-
-    def run(self):
-        self.train.run("machine A counting")
-        self.baseline_1.run("machine C counting")
-        self.analyze.run("machine B counting")
-
-app = LightningApp(LitWorkflow())
diff --git a/docs/source-app/workflows/share_app.rst b/docs/source-app/workflows/share_app.rst
deleted file mode 100644
index 8d7482e21d2c3..0000000000000
--- a/docs/source-app/workflows/share_app.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-############
-Share an App
-############
-**Audience:** Users who want to show off their work.
-
-----
-
-***********************************
-Option 1: Run on the cloud to share
-***********************************
-To share an app, simply run your app on the cloud:
-
-.. code:: bash
-
-   lightning run app app.py --cloud
-
-Then share the link that's generated.
-
-----
-
-**********************************
-Option 2: Expose a tunnel to share
-**********************************
-If you'd like to share the app yourself, run it locally
-and expose the URL of the app.
-
-Run it locally:
-
-.. code:: bash
-
-   lightning run app app.py
-
-Then use one of the many guides to `expose a tunnel `_.
diff --git a/docs/source-app/workflows/share_files_between_components.rst b/docs/source-app/workflows/share_files_between_components.rst
deleted file mode 100644
index 5ccf2b9e2a441..0000000000000
--- a/docs/source-app/workflows/share_files_between_components.rst
+++ /dev/null
@@ -1,120 +0,0 @@
-:orphan:
-
-##############################
-Share Files Between Components
-##############################
-
-.. note:: The contents of this page are still in progress!
-
-**Audience:** Users who want to share files between components.
-
-----
-
-**********************************
-Why do I need distributed storage?
-**********************************
-In a Lightning App, some components can be executed on their own hardware. Distributed storage
-enables a file saved by a component on one machine to be used by components on other machines (transparently).
-
-If you've asked the question "how do I use the checkpoint from this model to deploy this other thing",
-you've needed distributed storage.
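-
-A minimal sketch of the idea (the two works here are hypothetical):
-
-.. code:: python
-
-    from lightning.app import LightningWork
-    from lightning.app.storage import Path
-
-
-    class TrainWork(LightningWork):
-        def __init__(self):
-            super().__init__()
-            # A Path stored on the work's state can be transferred to other works.
-            self.ckpt_path = Path("model.ckpt")
-
-        def run(self):
-            with open(self.ckpt_path, "w") as f:
-                f.write("fake weights")
-
-
-    class DeployWork(LightningWork):
-        def run(self, ckpt_path: Path):
-            # If this work runs on another machine, the file is transferred first.
-            with open(ckpt_path) as f:
-                print(f.read())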
-
-----
-
-************
-Write a file
-************
-To write a file, first create a reference to the file with the :class:`~lightning.app.storage.path.Path` class, then write to it:
-
-.. code:: python
-
-    from lightning.app.storage import Path
-
-    # file reference
-    boring_file_reference = Path("boring_file.txt")
-
-    # write to that file
-    with open(boring_file_reference, "w") as f:
-        f.write("yolo")
-
-
-----
-
-**********
-Use a file
-**********
-To use a file, pass the reference to the file:
-
-.. code:: python
-
-    with open(boring_file_reference) as f:
-        print(f.read())
-
-----
-
-..
-    ********************************
-    Create a directory - coming soon
-    ********************************
-
-
-    ----
-
-    ******************************
-    Use a directory - coming soon
-    ******************************
-    TODO
-
-    ----
-
-*********************************
-Example: Share a model checkpoint
-*********************************
-A common workflow in ML is to use a checkpoint created by another component.
-First, define a component that saves a checkpoint:
-
-.. literalinclude:: ./share_files_between_components/app.py
-    :lines: -19
-
-Next, define a component that needs the checkpoints:
-
-.. literalinclude:: ./share_files_between_components/app.py
-    :lines: 20-31
-
-Link both components via a parent component:
-
-.. literalinclude:: ./share_files_between_components/app.py
-    :lines: 32-
-
-
-Run the app above with the following command:
-
-.. code-block:: bash
-
-   lightning run app docs/source/workflows/share_files_between_components/app.py
-
-.. code-block:: console
-
-   Your Lightning App is starting. This won't take long.
-   INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
-   Loaded checkpoint_1: tensor([0, 1, 2, 3, 4])
-   Loaded checkpoint_2: tensor([0, 1, 2, 3, 4])
-
-
-For example, here we save a file in one component and use it in another component:
-
-.. code:: python
-
-    from lightning.app import LightningWork
-    from lightning.app.storage import Path
-
-    FILE_CONTENT = "example content"  # placeholder for whatever you want to write
-
-
-    class ComponentA(LightningWork):
-        def __init__(self):
-            super().__init__()
-            self.boring_path = None
-
-        def run(self):
-            # This should be used as a REFERENCE to the file.
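-            # Assigning the Path to the work's state is what makes the file
-            # retrievable by other components.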
-            self.boring_path = Path("boring_file.txt")
-            with open(self.boring_path, "w") as f:
-                f.write(FILE_CONTENT)
diff --git a/docs/source-app/workflows/share_files_between_components/app.py b/docs/source-app/workflows/share_files_between_components/app.py
deleted file mode 100644
index 7bf0686f65954..0000000000000
--- a/docs/source-app/workflows/share_files_between_components/app.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-
-import torch
-
-from lightning.app import LightningWork, LightningFlow, LightningApp
-from lightning.app.storage.path import Path
-
-
-class ModelTraining(LightningWork):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.checkpoints_path = Path("./checkpoints")
-
-    def run(self):
-        # make fake checkpoints
-        checkpoint_1 = torch.tensor([0, 1, 2, 3, 4])
-        checkpoint_2 = torch.tensor([0, 1, 2, 3, 4])
-        os.makedirs(self.checkpoints_path, exist_ok=True)
-        checkpoint_path = str(self.checkpoints_path / "checkpoint_{}.ckpt")
-        torch.save(checkpoint_1, checkpoint_path.format("1"))
-        torch.save(checkpoint_2, checkpoint_path.format("2"))
-
-
-class ModelDeploy(LightningWork):
-    def __init__(self, ckpt_path, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.ckpt_path = ckpt_path
-
-    def run(self):
-        ckpts = os.listdir(self.ckpt_path)
-        checkpoint_1 = torch.load(os.path.join(self.ckpt_path, ckpts[0]))
-        checkpoint_2 = torch.load(os.path.join(self.ckpt_path, ckpts[1]))
-        print(f"Loaded checkpoint_1: {checkpoint_1}")
-        print(f"Loaded checkpoint_2: {checkpoint_2}")
-
-
-class LitApp(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.train = ModelTraining()
-        self.deploy = ModelDeploy(ckpt_path=self.train.checkpoints_path)
-
-    def run(self):
-        self.train.run()
-        self.deploy.run()
-
-
-app = LightningApp(LitApp())
diff --git a/docs/source-app/workflows/test_an_app.rst b/docs/source-app/workflows/test_an_app.rst
deleted file mode 100644
index c51ae3aa8f652..0000000000000
--- a/docs/source-app/workflows/test_an_app.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-:orphan:
-
-###########
-Test an App
-###########
diff --git a/docs/source-pytorch/versioning.rst b/docs/source-pytorch/versioning.rst
index 96038e63cf807..ebae1f920a5a6 100644
--- a/docs/source-pytorch/versioning.rst
+++ b/docs/source-pytorch/versioning.rst
@@ -16,8 +16,6 @@ A Lightning release number is in the format of ``MAJOR.MINOR.PATCH``.
 
 With every release, we publish a changelog where we list additions, removals, deprecations, changed functionality and fixes.
 
-The ``lightning.app`` package is an exception to this rule, as it may contain any change with or without deprecations in any of the releases.
-
 API Stability
 *************
diff --git a/examples/app/argparse/app.py b/examples/app/argparse/app.py
deleted file mode 100644
index 5fa8039908eb3..0000000000000
--- a/examples/app/argparse/app.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import argparse
-
-from lightning.app import CloudCompute, LightningApp, LightningFlow, LightningWork
-
-
-class Work(LightningWork):
-    def __init__(self, cloud_compute):
-        super().__init__(cloud_compute=cloud_compute)
-
-    def run(self):
-        pass
-
-
-class Flow(LightningFlow):
-    def __init__(self, cloud_compute):
-        super().__init__()
-        self.work = Work(cloud_compute)
-
-    def run(self):
-        assert self.work.cloud_compute.name == "gpu", self.work.cloud_compute.name
-        self.stop()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--use_gpu", action="store_true", default=False, help="Whether to use GPU in the cloud")
-    hparams = parser.parse_args()
-    app = LightningApp(Flow(CloudCompute("gpu" if hparams.use_gpu else "cpu")))
diff --git a/examples/app/boring/.gitignore b/examples/app/boring/.gitignore
deleted file mode 100644
index 94018704d9f90..0000000000000
--- a/examples/app/boring/.gitignore
+++ /dev/null
@@ -1,10 +0,0 @@
-lightning_logs
-*.pt
-.storage/
-.shared/
-data
-*.ckpt
-redis-stable
-node_modules
-*.rdb
-boring_file.txt
diff --git a/examples/app/boring/app.py b/examples/app/boring/app.py
deleted file mode 100644
index 0dfaedfae0107..0000000000000
--- a/examples/app/boring/app.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import os
-
-from lightning.app import CloudCompute, LightningApp, LightningFlow, LightningWork
-from lightning.app.components import TracerPythonScript
-from lightning.app.storage.path import Path
-
-FILE_CONTENT = """
-Hello there!
-This tab is currently an IFrame of the FastAPI Server running in `DestinationFileAndServeWork`.
-Also, the content of this file was created in `SourceFileWork` and then transferred to `DestinationFileAndServeWork`.
-Are you already 🤯 ? Stick with us, this is only the beginning. Lightning is 🚀.
-"""
-
-
-class SourceFileWork(LightningWork):
-    def __init__(self, cloud_compute: CloudCompute = CloudCompute(), **kwargs):
-        super().__init__(parallel=True, **kwargs, cloud_compute=cloud_compute)
-        self.boring_path = None
-
-    def run(self):
-        # This should be used as a REFERENCE to the file.
-        self.boring_path = "lit://boring_file.txt"
-        with open(self.boring_path, "w", encoding="utf-8") as f:
-            f.write(FILE_CONTENT)
-
-
-class DestinationFileAndServeWork(TracerPythonScript):
-    def run(self, path: Path):
-        assert path.exists()
-        self.script_args += [f"--filepath={path}", f"--host={self.host}", f"--port={self.port}"]
-        super().run()
-
-
-class BoringApp(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.source_work = SourceFileWork()
-        self.dest_work = DestinationFileAndServeWork(
-            script_path=os.path.join(os.path.dirname(__file__), "scripts/serve.py"),
-            port=1111,
-            parallel=False,  # runs until killed.
-            cloud_compute=CloudCompute(),
-            raise_exception=True,
-        )
-
-    @property
-    def ready(self) -> bool:
-        return self.dest_work.is_running
-
-    def run(self):
-        self.source_work.run()
-        if self.source_work.has_succeeded:
-            # the flow passes the file from one work to another.
-            self.dest_work.run(self.source_work.boring_path)
-        self.stop("Boring App End")
-
-    def configure_layout(self):
-        return {"name": "Boring Tab", "content": self.dest_work.url + "/file"}
-
-
-app = LightningApp(BoringApp())
diff --git a/examples/app/boring/app_dynamic.py b/examples/app/boring/app_dynamic.py
deleted file mode 100644
index b08b8cf5ce10d..0000000000000
--- a/examples/app/boring/app_dynamic.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-
-from lightning.app import CloudCompute, LightningApp, LightningFlow, LightningWork
-from lightning.app.components import TracerPythonScript
-from lightning.app.storage.path import Path
-from lightning.app.structures import Dict
-
-FILE_CONTENT = """
-Hello there!
-This tab is currently an IFrame of the FastAPI Server running in `DestinationFileAndServeWork`.
-Also, the content of this file was created in `SourceFileWork` and then transferred to `DestinationFileAndServeWork`.
-Are you already 🤯 ? Stick with us, this is only the beginning. Lightning is 🚀.
-"""
-
-
-class SourceFileWork(LightningWork):
-    def __init__(self, cloud_compute: CloudCompute = CloudCompute(), **kwargs):
-        super().__init__(parallel=True, **kwargs, cloud_compute=cloud_compute)
-        self.boring_path = None
-
-    def run(self):
-        # This should be used as a REFERENCE to the file.
-        self.boring_path = "lit://boring_file.txt"
-        with open(self.boring_path, "w") as f:
-            f.write(FILE_CONTENT)
-
-
-class DestinationFileAndServeWork(TracerPythonScript):
-    def run(self, path: Path):
-        assert path.exists()
-        self.script_args += [f"--filepath={path}", f"--host={self.host}", f"--port={self.port}"]
-        super().run()
-
-
-class BoringApp(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.dict = Dict()
-
-    @property
-    def ready(self) -> bool:
-        if "dst_w" in self.dict:
-            return self.dict["dst_w"].url != ""
-        return False
-
-    def run(self):
-        # create the source work dynamically at runtime
-        if "src_w" not in self.dict:
-            self.dict["src_w"] = SourceFileWork()
-
-        self.dict["src_w"].run()
-
-        if self.dict["src_w"].has_succeeded:
-            # create the dst_w work dynamically at runtime
-            if "dst_w" not in self.dict:
-                self.dict["dst_w"] = DestinationFileAndServeWork(
-                    script_path=os.path.join(os.path.dirname(__file__), "scripts/serve.py"),
-                    port=1111,
-                    parallel=False,  # runs until killed.
-                    cloud_compute=CloudCompute(),
-                    raise_exception=True,
-                )
-
-            # the flow passes the file from one work to another.
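-            # dst_w receives the file produced by src_w and serves its contents.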
-            self.dict["dst_w"].run(self.dict["src_w"].boring_path)
-            self.stop("Boring App End")
-
-    def configure_layout(self):
-        return {"name": "Boring Tab", "content": self.dict["dst_w"].url + "/file" if "dst_w" in self.dict else ""}
-
-
-app = LightningApp(BoringApp(), log_level="debug")
diff --git a/examples/app/boring/scripts/__init__.py b/examples/app/boring/scripts/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/examples/app/boring/scripts/serve.py b/examples/app/boring/scripts/serve.py
deleted file mode 100644
index dedd6013985ca..0000000000000
--- a/examples/app/boring/scripts/serve.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import argparse
-import os
-
-import uvicorn
-from fastapi import FastAPI
-from fastapi.requests import Request
-from fastapi.responses import HTMLResponse
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser("Server Parser")
-    parser.add_argument("--filepath", type=str, help="Where to find the `filepath`")
-    parser.add_argument("--host", type=str, default="0.0.0.0", help="Server host")
-    parser.add_argument("--port", type=int, default=8888, help="Server port")
-    hparams = parser.parse_args()
-
-    fastapi_service = FastAPI()
-
-    if not os.path.exists(str(hparams.filepath)):
-        content = ["The file wasn't transferred"]
-    else:
-        with open(hparams.filepath) as fo:
-            content = fo.readlines()  # read the file received from SourceFileWork.
-
-    @fastapi_service.get("/file")
-    async def get_file_content(request: Request, response_class=HTMLResponse):
-        lines = "\n".join(["<p>" + line + "</p>" for line in content])
-        return HTMLResponse(f"<div>{lines}</div>")
-
-    uvicorn.run(app=fastapi_service, host=hparams.host, port=hparams.port)
diff --git a/examples/app/commands_and_api/.lightningignore b/examples/app/commands_and_api/.lightningignore
deleted file mode 100644
index f7275bbbd035b..0000000000000
--- a/examples/app/commands_and_api/.lightningignore
+++ /dev/null
@@ -1 +0,0 @@
-venv/
diff --git a/examples/app/commands_and_api/app.py b/examples/app/commands_and_api/app.py
deleted file mode 100644
index 3f59c117c4180..0000000000000
--- a/examples/app/commands_and_api/app.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from command import CustomCommand, CustomConfig
-from lightning import LightningFlow
-from lightning.app.api import Get, Post
-from lightning.app.core.app import LightningApp
-
-
-async def handler():
-    print("Has been called")
-    return "Hello World!"
-
-
-class ChildFlow(LightningFlow):
-    def nested_command(self, name: str):
-        """A nested command."""
-        print(f"Hello {name}")
-
-    def configure_commands(self):
-        return [{"nested_command": self.nested_command}]
-
-
-class FlowCommands(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.names = []
-        self.child_flow = ChildFlow()
-
-    def run(self):
-        if self.names:
-            print(self.names)
-
-    def command_without_client(self, name: str):
-        """A command without a client."""
-        self.names.append(name)
-
-    def command_with_client(self, config: CustomConfig):
-        self.names.append(config.name)
-
-    def configure_commands(self):
-        commands = [
-            {"command_without_client": self.command_without_client},
-            {"command_with_client": CustomCommand(self.command_with_client)},
-        ]
-        return commands + self.child_flow.configure_commands()
-
-    def configure_api(self):
-        return [
-            Post("/user/command_without_client", self.command_without_client),
-            Get("/pure_function", handler),
-        ]
-
-
-app = LightningApp(FlowCommands(), log_level="debug")
diff --git a/examples/app/commands_and_api/command.py b/examples/app/commands_and_api/command.py
deleted file mode 100644
index e2dd26f684b03..0000000000000
--- a/examples/app/commands_and_api/command.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from argparse import ArgumentParser
-
-from lightning.app.utilities.commands import ClientCommand
-from pydantic import BaseModel
-
-
-class CustomConfig(BaseModel):
-    name: str
-
-
-class CustomCommand(ClientCommand):
-    description = "A command with a client."
-
-    def run(self):
-        parser = ArgumentParser()
-        parser.add_argument("--name", type=str)
-        args = parser.parse_args()
-        self.invoke_handler(config=CustomConfig(name=args.name))
diff --git a/examples/app/components/python/__init__.py b/examples/app/components/python/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/examples/app/components/python/app.py b/examples/app/components/python/app.py
deleted file mode 100644
index 944cb7de2995d..0000000000000
--- a/examples/app/components/python/app.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import os
-from pathlib import Path
-
-from lightning.app import LightningApp, LightningFlow
-
-from examples.components.python.component_tracer import PLTracerPythonScript
-
-
-class RootFlow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        script_path = Path(__file__).parent / "pl_script.py"
-        self.tracer_python_script = PLTracerPythonScript(script_path)
-
-    def run(self):
-        assert os.getenv("GLOBAL_RANK", "0") == "0"
-        if not self.tracer_python_script.has_started:
-            self.tracer_python_script.run()
-        if self.tracer_python_script.has_succeeded:
-            self.stop("tracer script succeeded")
-        if self.tracer_python_script.has_failed:
-            self.stop("tracer script failed")
-
-
-app = LightningApp(RootFlow())
diff --git a/examples/app/components/python/component_popen.py b/examples/app/components/python/component_popen.py
deleted file mode 100644
index bc70b9f47b16d..0000000000000
--- a/examples/app/components/python/component_popen.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from pathlib import Path
-
-from lightning.app.components import PopenPythonScript
-
-if __name__ == "__main__":
-    comp = PopenPythonScript(Path(__file__).parent / "pl_script.py")
-    comp.run()
diff --git a/examples/app/components/python/component_tracer.py b/examples/app/components/python/component_tracer.py
deleted file mode 100644
index 3e2e96f38a7f3..0000000000000
--- a/examples/app/components/python/component_tracer.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from lightning.app.components import TracerPythonScript
-from lightning.app.storage.path import Path
-from lightning.app.utilities.tracer import Tracer
-from lightning.pytorch import Trainer
-
-
-class PLTracerPythonScript(TracerPythonScript):
-    """This component can be used for ANY PyTorch Lightning script to track its progress and extract its best model
-    path."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # Define the component state.
-        self.global_step = None
-        self.best_model_path = None
-
-    def configure_tracer(self) -> Tracer:
-        from lightning.pytorch.callbacks import Callback
-
-        class MyInjectedCallback(Callback):
-            def __init__(self, lightning_work):
-                self.lightning_work = lightning_work
-
-            def on_train_start(self, trainer, pl_module) -> None:
-                print("This code doesn't belong to the script but was injected.")
-                print("Even the Lightning Work is available and state transfer works!")
-                print(self.lightning_work)
-
-            def on_train_batch_end(self, trainer, *_) -> None:
-                # On every batch end, collect some information.
-                # This is communicated automatically to the rest of the app,
-                # so you can track your training in real time in the Lightning App UI.
-                self.lightning_work.global_step = trainer.global_step
-                best_model_path = trainer.checkpoint_callback.best_model_path
-                if best_model_path:
-                    self.lightning_work.best_model_path = Path(best_model_path)
-
-        # This hook would be called every time
-        # before a Trainer `__init__` method is called.
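-        # The pre_fn appends the injected callback to the Trainer's ``callbacks``
-        # kwarg before ``__init__`` executes, leaving all other arguments untouched.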
-        def trainer_pre_fn(trainer, *args, **kwargs):
-            kwargs["callbacks"] = kwargs.get("callbacks", []) + [MyInjectedCallback(self)]
-            return {}, args, kwargs
-
-        tracer = super().configure_tracer()
-        tracer.add_traced(Trainer, "__init__", pre_fn=trainer_pre_fn)
-        return tracer
-
-
-if __name__ == "__main__":
-    comp = PLTracerPythonScript(Path(__file__).parent / "pl_script.py")
-    res = comp.run()
diff --git a/examples/app/components/python/pl_script.py b/examples/app/components/python/pl_script.py
deleted file mode 100644
index 75538daf4bed2..0000000000000
--- a/examples/app/components/python/pl_script.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from lightning.pytorch import Trainer
-from lightning.pytorch.demos.boring_classes import BoringModel
-
-if __name__ == "__main__":
-    model = BoringModel()
-    trainer = Trainer(max_epochs=1, accelerator="cpu", devices=2, strategy="ddp")
-    trainer.fit(model)
-    trainer.validate(model)
-    trainer.test(model)
-    trainer.predict(model)
diff --git a/examples/app/components/serve/gradio/app.py b/examples/app/components/serve/gradio/app.py
deleted file mode 100644
index ec07e4ba99c06..0000000000000
--- a/examples/app/components/serve/gradio/app.py
+++ /dev/null
@@ -1,51 +0,0 @@
-from functools import partial
-
-import gradio as gr
-import requests
-import torch
-from lightning.app import LightningApp, LightningFlow
-from lightning.app.components import ServeGradio
-from PIL import Image
-
-
-# Credit to @akhaliq for his inspiring work.
-# Find his original code here: https://huggingface.co/spaces/akhaliq/AnimeGANv2/blob/main/app.py
-class AnimeGANv2UI(ServeGradio):
-    inputs = gr.inputs.Image(type="pil")
-    outputs = gr.outputs.Image(type="pil")
-    elon = "https://upload.wikimedia.org/wikipedia/commons/thumb/3/34/Elon_Musk_Royal_Society_%28crop2%29.jpg/330px-Elon_Musk_Royal_Society_%28crop2%29.jpg"
-    img = Image.open(requests.get(elon, stream=True).raw)
-    img.save("elon.jpg")
-    examples = [["elon.jpg"]]
-
-    def __init__(self):
-        super().__init__()
-        self.ready = False
-
-    def predict(self, img):
-        return self.model(img=img)
-
-    def build_model(self):
-        repo = "AK391/animegan2-pytorch:main"
-        model = torch.hub.load(repo, "generator", device="cpu")
-        face2paint = torch.hub.load(repo, "face2paint", size=512, device="cpu")
-        self.ready = True
-        return partial(face2paint, model=model)
-
-
-class RootFlow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.demo = AnimeGANv2UI()
-
-    def run(self):
-        self.demo.run()
-
-    def configure_layout(self):
-        tabs = []
-        if self.demo.ready:
-            tabs.append({"name": "Home", "content": self.demo})
-        return tabs
-
-
-app = LightningApp(RootFlow())
diff --git a/examples/app/components/serve/gradio/beyonce.jpg b/examples/app/components/serve/gradio/beyonce.jpg
deleted file mode 100644
index 68b6084475b019bd37db953b87c37ec905b79b86..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 132520
zn6mE36*6(h;lbm!eJkUi*~j6WQb(u31P^UtY9o#(KtP@%%AR|<<04RZ08S9(631x$ zSMaBVVACz2R@}<61qi?^va0QFIDBM`9-NMISzxf$YSeX_=G8r$cK2UT*7i5dtHTc; zV-(wpNxiPxS6*t~oAtfVNzloLIF@7%R4#$IgYVdK{EiHGV(?t6IN6BL5nr`b@?Q1<+S8r`QE{Aoc9NR8I1j-j`@JR=e zpI@dp>C(E}OE6gPktJCfm`ItBnK&nLOY zbB{ttKDF6e+p?<3BEp+6<-uZbUvXZ9obCc%CL5WrZL zaV)4&Mh4@xnYlcL9exk>2~ZaGyUber~Q5>k!qGwz?y^n z`=mx#T?uKfB50UMw&l55l6iRx!v)AWuMO8U{gKDC?2w6Lob9#idyAR z@JwW$E8cuXENJS%rZU>020@=K+6Dxeg6CtG42OUnz${fsBgH&KzHA}o8)l45u|*~e z&eAbvGGKrtjS>_JrIZ(Z9+~*;(h-VIDOyc^{eJ%d>uo-hE}W;kwzgk~r~D3wi1>{g zWztJ;ut@1JRvbqqwYx>;BY)Fr0F|;&RXzFeEqdK%i_N-e7Ui7;$cwvts!HYZMm8Ya z!;B7u*U?@h^JkC(6U>;%jTk3rRgy46WF4wV3n5+DM-{T%)VwRzlhm$Qo&e4{DoGwAsjE`cU6!|6 zTKo3CyC<{Q^%K&|MP%*rO=$fW{5FTmJ}|JE;=W~(e5E%4v_bOF#QP(aUj;yB4ivD% z6*$d&-Qo=et^u0e9LmH;x>FHj8*FE0^8R4*1GQU{g>ps+ud(%=0_$X!lC!|Dnj zc?HOBc7idx^T#>)_P?cef!q#shJ4d)HZ)OuRhtB0^ByJ&cHf_i)kC!}4YL~Id zHmr!QSxM-Dj($_uAd{Y$C(vTQQOYZ*RHr#FjuE?hTJ2r$*H0t!tj3QdLRjD&1AO_x`-Rj5oVpoSpQk1-t7co|@ZB+j~B} z5A8$#2;cDbEoS64!y1!s^*3NhiLwRNaE8t*T9LDdd-iCF%gLtv}$C8m*6m ze`b44b{S`u9U2$BE=WZn>M%`Ugs|S;YDQH`jQLj+Dz?x`@UPnQQqy%SKa6*_XJpZ8 zx^x8*6-ISPWW0h9(m8dK@t`8vlt|G@0gwQ!SbsX_l_CA71sMC^T9ozG#W|#}WVP4t zKS#^dr3#U4Do!cEFYv7vn|5C-v+*P2mYJ(uU-$^5v%lJ6(~O1DUJ0(Q8Yh+Y>S^W(Gp8>Zzu#?M!6?Rffh-rt$><=(xh=~~0wMKi-JSMyB+#$zzzJD;^m ztjn`#VzEyFiI+0NH!2d`kFq>1s#*B>>C!c7Mw_9OFPk;g2uLq-j)@fE zO35v&2_&D*M+f2^Pf*kE4b{b&ac^&UVhJ=*$pU$XNg(qYEz=~7mgtR@#$CY#o_>b- z7kIGieiXB}GA-5itm$oXvnoYzr>3E0_NmJtk>+_VZe@Wjzy;3bESOS$*D}FIt{PSC z6*xh(`@3I6zrvoLn_T&920hZMsq)E1&z0NlIc)Xlt*6fVT>iFy;FuPd7QPJl->F;M z&E^a1KMUz_6_3sqH`f7PLp0i*#hmd>{{TFWWGp~n6mR@ZzL&t>HT{w9^!Pr_sM>sT z__g4x%|3Ms!$j6RPo~}pCIpsyX|$We<(K)+p0TaJ?!eFEG}~G99=|8bl(JMI-IvDE#2IW zaeXW?GQs9s#L=ben9ajl+Q?QVW=rC)G!a{`z5d!Cw50wi*R@}VUKG0{N7F1PeQNO) zUcx)5W>uC8sZLd1IP$PSSyo$;gcgXEugI^6T9RooTC2l)$qXjyFSG@dQ@zzBk08rs zGQ`6DVPSNsb(rmKr-`04oX(^14msk<;$=L)0WaL&mZey`w(g?VmhAR&c53?Qev89d z928@T#lxCV<&;xfYMt7WyVa*}&7U*;W%z}!TwFEYqQgq?u7vW-r+8#b6cEWa9NW*X zO&N|>)E+5(?UOT4a~npn+FeQzC&5tbFMIu|@h4fBNp4bTv}?7un$-s$srv&4{D#Wc!o^b3td!t&n8()o7N zT*OxD@6Cp5nJ=y_vv+wa+r5^@$Zj(M>m9iA6Uz&c)P?fzM*rg+3E9I zjW)t7d33lhwLHR?u-F+Ti|mp}Y+|}im!c+$R9SyYe`Jq=e-8c@>pI2V>sWZNR!fPs zFA?hTJn~!I>vt&~-mz+LZ2rvmR@z(+v0le)!rnjhOD{GU9^j9fz7|{fm&dn{q-nP? z>HZRm<_p&0!LN8H!uo}z_c~sajeJ|n=LxeMcJ_K=7)f`sf@_H+xKEPjQIpKLx{nct?g}`+Z{+ncXbn9$EV30GEaGbcJh`;t*mEuyl9n8<FRvTty23}X>RY5?no`QTbo-uHJPCi z#Qq)*GIox~njnK>lSE&a46-q|Bi78oLRMuur8W|zi3G?P&AWLjjJgp;MF z?Ly-EJ6mb2P_LKv_PM%+NP_&Zw$pU@aDL4_+|w9jVzWi_=AGh=d*SE9tBq$=SS;@@ zu5RMCw0Wnx)^0B~IG`b?k^HjSjrGRctxIfc9kIBU+6eIm^Yed=zAuz6X7%R0T+U(x9T7Sdo`JZs;emK{>cjK$QR_bG8p?E`ExNSR6)C#7TsB3cD z8#|p!`pz#gn?=wp?AAR6AIuk*w({KCDx#A9OYw)ra(qJggQ0vo@b03jWbpQ(FN$@D zWKn70&xRUHrlGCsx6;SDJ3ULrmoa}~T1PPxq!UdWNTf&?!YqCuUVJh5X>DrJoomJ# z#Hpk%o|g|CiLGidIlPA|R$IF(WD%sZ5)~0z&IW;IW;gr`VxJGb8F-IT)HQe1d{yxh z<4{d@TexD9=I+N#i_g1L1x{c-3wot*TSw5SfTiV2_HKl_@ zAkr<^K^cGxJpv_=i5^)2oWAcgw%)JyZ1BK3l3l5{R+>L4C7w?r8(S;8pCROO*(zhx z;N2XD0gTbc4)R5LuC$iA#QJuVZlxY;*{>&ZO0+WGPK;z=hY&(7t)AA{&E;*3uxQk{ zX6^hP;olqlL``eLp9$LXM%1&UYS)@6lhgM`;a%GCu}|S=GCFFHpXcs*?9{q7{D7CW94GPhlGFNv3?$z?sUul00GYh z#igv%!FzKx%2_?cNhGt|8DtF6-3a1PrK2#hW!%FJk2CmL{{RHR(JwUp6IIscpHkFe zluGY4 zXGr9U;+>}ZB(U6<5x^3C@3(%{T#Efg@J6MhYS7!AKT^|g-ue}kycZDMLXl)BDKu>8 zOM|!oM)gXHU7Xn@n3>7UNhtovb=(ZMPQp zwwJKqTj~LB7UDY@QKen6B&H~BEdrGXN~08rUsHomHGp^WsQVu)XGwOCOlBVnlu}Aie7bF=toC0&&c+mQmGKkn zYpbg?qO@At>G$ma00ZuQOHhR({{Tp4npK^eFESJcHD+X4NRgtAqc|l%c5S$2T-T24 zIM3pAz3nw$iuF6m8~8 z%H$RSm&%SZ&6Z;_s>+<=B^uI@^O?7&{%f77oT(<*sn@w8UE4OCWseH?t&Z!fNi3))thG|%RKR31>*VEcUam_EHby08Hbw2Tq*}qoa{g)e4Hw`Q<~B7Z;PVRA-Q{*)Fh7? 
zWdIWrJ7v|1<&H21C*}-LcP{t89nOWVh|@aKTEojyEEY{OT&k|>0+6@Ai45Mrewe=O|ASaiVz z{GvQ9X{y{@+-bU0kzU=}=|5%jB9`LTD`;KzI*Hwf+N6b9o#$z$jz2b3oJ(~Ir@QzU z_F%v9C8fRavl@JI41;wSnH$00P#Q$zqiVbNtI!O24#&xpRlLH+&7j& zKHb8s?yl!i>nPh!S8FY8)!Wed3FkTfK156&3>9aIQ}oth>+K{{U6J)8w@Aj+>`kUd!jp z_NyoiayreiUP-jN6QBUBhl8G{u^yQGdCB*#FUCI$ z{7LZY`EIR@liR}~-0IRJFv$!V26!QI*>V8`rv|n(9bI)hBxwdN0~B<@1~ND&2N^tf z>({k+VsJF5#yGqcNlJ@L-Cp#dpDWcSovhZshUYB|eR{W{jEtRCD|0y7??q*6U+&v? zdl-6@BG|iy`9g9!gPamik+AJ;d$Xr}QaShC6Tli$&8zT%^ z0M0T)b)mK?KvK-(?%*E7IPK0e-;UMi8hzZ>n!UZmVnFt~zLL?R=a6pIT<{5Rm6=XK z1RUej*UF>ynr@u+Qj70vzFlnkdUQE6Ch+v&WBWD#0D^q{ zS+dc8Wt|^SwQ|#2=u>Dq{k*cq*DY}+#qOnLaKK8)B!a@_ZJ}iVs@ld|4#W;4mQUM;)UbT1cp zXZuNB+3wR)V;PKnjyUbStD9M-jY>=v)uUGdSq$3{t^glXd{@*TUigdQdqs3h4-06T zubd$Pm_DS|a%u{o0wcK)%``-|GD?BUsUOeWMSxg-BgH3sbKcgyou1on(sx@U^jxN@ zub1JfL8p3JrF-ogsdc~KcV*t+Gkjytli*n4hDoz=;~@kfVGLHb*B1uvUFVbr-QwLF zWr{+;6Tz>`{{W44cN!mtZR{V)N7nTdszV7`6U~*b8s=nBTg@@H%Y(5-#~T8S$cleW zz9attXlpt&8a=^OFa4pe&37Wf&3$Q@rn7lr{PD`$p+^TGiKw zb?aE48YC9?uv+;p=^dMxAh+HD$=w~ab1v}5bLYK%zXw&PM-zsG+k&ZE-q%O&t6KKc zTYS9F1Bl}(;i}FrSkro>-L*?pm#6##^TyWND~YbeOk*v0tnn)Vr2V2ch!VUoRcR5J zgR~N)hUUJW_zQ1m2Z^+si}&2Jnwi4)EuV&tGK_3gd@@iv{M zjdm#s^DMkN@~E9+Nrlum){w807+@o2xVn4;D_mj@PS$llv*O`X*vUpJP?Q=;KC*(h zyt`}a^*=blP7b^|z1gWK>elYc_I9@ZzMCK1*ZdO)!e8)_d^+*Yof1oMJX!F0bxZiJ zq5|G&?RBkKD#Z|H+c|HaDNKL&YW{@S*J#&SvX6j*EQD3+Sa#M-(&O`w>lG! zmW)z%?P&Jj{Paiauk6+13rl?^BGCk*4SPbg)U=C9+DAw=uMfm8rN(!>b80#>$smYl z_o&MIM$kV|v|U9nAoG?si*#{0PI>bpSx8U-7+u&Su1~FgIe1%CweZJ`mcvDWSZLaY zgQj@G&USw^>2Xizo9i`&A1J~NM(*5*#@G=9zW8D>H|qDp4MHtX!MY{PZJ8F%IA(b{ zc^W*b$W9eVmOYFJQMe4?8vTcat0_>chm@}2Wfb16Cc1C6n*RVhpPT0tr%t`0WeINe z_>xz>{{YLc-RY`90f%r8KEIbg{c2Kk%`pm)0+G|VZr@Myp7inc&3&2W1y{KQ#sKH% zAIYl4M<4}oeSe{;pThoMnl2w4f5eGEBpd=q-sAm{SmW0U2_w)G&>Hy1_QCkGd*F`; zCx`qrE(eMHO08pQI=;yC>twTDQ1M=~91E#yQD04IYvnWB>Dp2v{j%ao5qBcnCBonaQ#_Yekx4A_3?H{VV`*=A z&}lZ)`2$WdX_{)EK6kvD@r2hhG_u^iq_ddN+!TDuvmY|xgetZ0C+(-HEv1izd?{yp zWu_#a48HMyjCF*%^DVUB4UJ0r-$=EA(m;}GH_+N&-P|+0jRIR~vc?{B3y;AkgLUf3 zHms#8j!D_M%XQaH7e#wtWA!`^A_=JLykiv>?aGv6ce>p-_&%rOKg16fYg6lYn*HGy zTE?W>l-?S-xD&MT%{ABAKHDTLcIcWmn`ojtRE8J3)J^y9@*|TSw}*TYs9j%0;jLFw zy45^QsLP~TPo>RhWwlv#J4CwEbPL;wFCtq@{Z{2)Ue~~qBAR`jzLO%%B>NuaXW@Q_Br{{}a^8`sc*(`dXS$0|yRw%~x4fDsQqVJ6 z#~4JFyJ1?M8l3rQ#YtObt@QQRMEZWGRccPGuX$%IU3Z_H&5d8DY|7wo@P3qeQrS7!+xGeY`Q+T1PB;mFV(9LaXu(o}H+8hRap7 zxV?bO{i$_h7m1>tX|8oWUjB1;_G@KQ687Ut^P2sqxQW%Kc*Q^B_r;r!6I^PV zwvII^taUrfoiD>0j;4&aHyU-6_c7`^>%(yq#%!n5qMqW*2<+=+YQAGE=YOjBE?Z3} z!MY99mKOSOjU>F*W51bXxl1)IJ?mJsQJBN6tX5FUH2(lC<#xJAru$U0l=+;-w45b1 z{7KDxPgQobYTbIT_;c66VVoM0{{WYEx?OhN>X%2|^wRb$yfNXghuSWkjUlx?eVRKM zG-+95p6Bfn6E>%X*`8`KHrF@cUgwc_pcM*7E5PJK=U$&>j`nbiaka4`=ba-OqQaPj7#v zcz)(^(@17%b7MZwoD02X`^?s-a}jv$UrC9fm6@S#dP?-2I*!vxruVaby6vT}zkZ!H zT%}r5PggBccGk&muk+ikn;r}CwkPqQiARTDQFy!=;1`-d5Pg-6ocCTNm<81p@5cBb znlTjiZ*a1**lbAxDUTOFJn81n_f?<9+Kfwd__fpgC807h*xPBl3Z${eaKcy}ZPFth z^nuJaOXtq(G^?Lt)vU~seU9zq8m6Hw#-XiQq=@!0vzYEdh9`3zmlDe)>L!`9c;rYF z2|yk-<9%~W@SF-Em(B4l^|QCwZxSD}Xc{_|iEgKbz=Qi<^hmQq0Qn-eNTVoKTRfiD zPEeysxwyS%*0O5XzMESA04wg!l^au1no4U|vq{~2y6fHf7d$oL-Al$EI9Vr_eNN)- zyxXWPa~tTkW@eV-Pt%|w659U&QHPuB+l6+~w1E=kZ*MZbm+^zg+D?IBz7(;F+%pxC z#iUDczhShrkZ9UuB4ztcmXD{YeM)%)3!PU|dr4y#3C`~ZBF_LBDHCn9qN~}O!pVjslgs?jk${2NQem=DFYcGevDuEzSh~=%GKoE z8bAOHR7P7KV-TleAs}r~00RI4PK8OvNh@g9yuEbU{Qm$x=cQ7VB@~)^*>%x0wX)Z? 
zm+PVT_KD*y4@(f*!)YD5HpYpz?=KpF5*@haAcJdznF?fQpsvPW9ch=)G_cLb%-D^* zsNOsR=s|QU=L5+*zTo36fnS=|-Z`4uOwPqv2L0i4Qamf#^`2{9rt&YGod7f)i`!^!&4Mh$%a#k6*LRb@4RH)A zEh9)nGQ}s^nE7*w6(qQu3;Wp+h|&qRvz)Y;8Lt`9{tRgz8oiqHLA4?&5>W(e(@DIJeW}n)3#Ja_nm7#d7LvbATHuq6#HZsUnT6Bc}0K~z*Xrga2SjoAx zlVo0e7Scx>D=mJC=zc4f@*!|0xLBl+2|m#yv`E0=kTw!QA|1eN+`leBAcJ3vz6gq|d$Jm4OQwnS!`rGZbngUzLr1p8PhtD{ZGh?j~72&$zs< z=?Zy!i4(5qHtSv-@piB+^vLH}Eo4ham&GAWJ^KX7?vlX{U1q)LQE7nvbwkwZtmSRZo&3HzVGUj`(aYX-Zq zXO?(kxwYP;nI)42rbR$nL_`dM2*Qv7Y;?Ee6!X&NP}Ug?sLB-@trNtq@XSrM)-7^p`Pm{>%rfHu{^ zBNgmLKe2_qou#}`q;PJEMo_Kg#-xbXf;N1tQ~{jtZuzfhzW9gWn>`X^;q5oXzAUkd zN!D#gM9`KP4i$)vo-VE@=WATcVctGO)I}L$qw&UkUtC{iGnXyqYUt2eKfR zTSqhahe{Jbmh3}F@msry7?2dkSTduo6oRF=ULP^XOPi8&l2+BVyrT3?FE99W(panp z4xcBpjAEVD?Yg%5*(a{OzYkm1JXH&se8*H(K=F{O!^|D9#1cs71adnJ3i;#5{w1`# zM{^@#QPrdhcf6dKcSruNAdifse|Qo|I5_#M#Qy-aQryWhOQ+oF;7;iranBXwMIemf z{K)?LA^Azc0kQ$l1IsnfiXUi(JCwOHGxGbK#u#Ik*^<7WdyM{d(}&^wR430?Gmfd+ z&s%GB?a?c}?wQF~h;UU>rB9O9wwm8fd0R`jO*Gu+zAi1TiD0@Y+qKYBIPCmh#Dg7(6K~PI$&a9Fy)d&3y~vZxcbN$s>7%#EHoc3XJE0 zjt_5tKm~aArqQ`BSR9<~;191%DlIj5rDbhi%YAKo`X4Eb z_H`?|uxiU#tFGzaMwZ<@4DDXy21Zf1G$$MqI%AH&_v@a0>x7L9-iHJQF9fe4K3>6X zaE&7tA9TlzyAD$ToMQ*Ex^-yTa7fv~Jw|=~x%cUc+W24LiF{Gx-5X9wx7xsNFLhZ2 zVp!qPt%E(pw<_y_Nl=?T|-|M3N=Sw1E)Fsk0f0fc!UzYr`RX$X zwN{fr@TS-c#@AL_MxM9Y*|bK^<(B4IH(Z>;-el3U6AK0LKOSnDbUJmn*)3;To5PlR z&Gp2>Ip!kq3|dB$8nby}7ACs8m6zru!eeYgNb3IpX+2f7j~(dZSRq|2!;?v=O)^Nq zB}lEUudiW}H8=+P`eO*SX(X=J_qLktzKYD^uQgscxGP@vv*{fh zUXJ&^x9Qyc&heLpZ|$|4dnxYP6;!#@Y7#@+2*H{jslGX_~-{bh3s_9Yz)9{Hx7MVLi-ofMP0#ARc|wT=3d!Ha;-4jQP5y zmF~N#*v2vEvRSpuS;K6s1feg2H1q8TafZx{V-4b72k@=c*Ni+LrcZMfwaJr8yfVDP z87*$_CQHe!uOcu?;iE-Zm^e`sz$>GVyZc64i;aA_Dw9g2X{Md;W})uucYQ6cx~{q( znBs7ArH@rwii(}pm6NmG`B`s%nsi6@hx-bAQ;SWD_AAjXFG?E^1Nh59Q$57-3zW2n zP_?#bWQ}~e;=Hx9mU~j!SnF7;`W>2y;Jr!SZ2CN{`u(5Z+C5;%|vs6|83IHT!FeFl=OY-!8M@y;1E{J8g~w z_PZM^i=fZ`x;92AUys4~k ze@>60KSllv>;5kBpToZnc**rGCT&jJ!xonJmQ8N4-zc)uEn(Cx4aWVK5F<-GGshO^ zGc+DM4+7iA??v?!VLC9#w z-CWo6?f(D-&hcHuy3>3*@eZpU_l7)2rfWyS(%f%Xx4OO3qLy1L_L6BHa7CTWx^==Z zl1H7lWF$HLaQ%!tVQJxwPT#|^m@N|bQPngpS_`#U@4Q1iGThv0o-5PDtdiX%&1-R^ z+{qF=Z8n>N;ISj_xOW`rV=2~hm$IB`#-r^fps#J-x;J}meG|XPXEo1A&x-DZlH7R z$2HH{#>8&9iA2zZQerG4k~99M;YsRbO{IXy2f608!w^Q|P6j{2u0K9M4wd^Ans!#Q zjAf?lnI@ChYi+jvr^c75ulG+Iy+?EH-~RyBnq+bZV!6p320xd7^XzJtv#HzYx>lj8 zUAo5&m8{nc&w!*xyNdJNfItJ1*d6L7(uA9KNnO6lYTwCfX+|=YT5*bUl&2WQC3fW} zWRmXNNap-^tw-Vi01E5c#Lu!TI|-!o8McHu0yCaDSvD~C2RwEBlh@7Ph-|e>i$g2k z_>*0>u(gb2wZM3xMVHJ|=5PBUxH$P$B9X#jJ67@ikB4KCq#hT(x3;;5WMl0*{r#M`5-bdi z(HOMY@2y~F3<8*$aG(nO>x$<(`K|*EQatpzw1<5XS6jU_T{OA$8A3}5iLRHw7^l9S z5?UqO^j+7?pR~V+bngS%_;bTrQ^s{IFTu9g6WWDyb7^;Ze($L3^O$5?qUoFv=buMFJ{Rw{2I*aSyw0G?TsYx!Q6{mnc7Pglpu-iS%>0unU@I`T_Tn+1Q zCAOgVmj&dtVK}#BTaBcO{#iU<3GuJSzlV>1GoK82Q$@S+tlE@NBHJAyt}gt!BbH*R zb0(o2Z5SbvSgx%MLODE=&GC5Z3XUqhprHspRc5S`igs&GuI<|T+p+bTJfh+4XC|j* zB^R!mJv@_Z%ke(R{gyr+L85#H)I2w@>1n2&N5pS;;tAT-OXZV*dxYjDt8!j|&eiQYZx5ud}v!ZG$~J1M<-+Sar0_x_rF1{)BkCG8cXeg5lh zwfgDTOYVHnt$aIePvQr|T>+Nx+uHbZPPNnpqbAsNomWu1)^!(YV-h41L2U}z#84qJ z$sCsOBrz-v?XU2{@5J6M@YjSOhD|;VYgW_r%eApqcx*f?Y6RD|vQLu2c`Q7;T~_Rt zx%1_?3PYE6_gz21+I_EyaSTzNt)mUQLL0LRL> zPPQ#Y-NGEsUn^ThrS11`zNgVbai>Yf4*u57K8dSob>(Y4m!e4Vj}_?=_*X;JVS?uJ z2|PimSlCwJUctca0QP6a$ zo)Iq8^m!WMA?GT5KEaAS&0&Hlj2o-ItuA}0c?FB66 zjv+bH^avi!CNg08R`%%*vvVb~f#!(qSYe4Yu};bIb#HC-QA>O5`}+1rJZ99RccYYV zez$2qJ-^@|3w%%2EIuCS{v_1(TV)TWY2yA_AIp+!-G9XKLvi7&2a-~rXOQw4Jhs?K zw>+!4MEt4oC&Xu(>s8cs3x>Va@7wJYNh`dT@LNLC8{3r+m1$hd6jo4$ib&*GhUpZP zPr3g9ZS8W_^TS$vV^v7MwRPQl#hRL_D;!o(rMLE64j7kq)1$b(5pN&4^5bZ5N!$3x zb*|iMH*;QxI(Cs08Sk#=luIS0lo7YtZR1SrM*BKEtk1cgRt=rNSE9 
zs`6G^ySJvc3~oA|IHJq4=X_nGQ9HaM!U1D2{iC9Bv1ZmrNS5-^@02XQKEz8~maSorQBxIOP#iF8W z8us7_F+Is!Z9jmQ3l#GQ0I6?~EqqaFd2)5;hD%E;dl+>801s)&Dzxz0Pq}TNjy??c zceeK5YQMQb`?Xt{U9)otKXd-X-T>6TCj3aT^DWKMTcl=zV{fypt1y~XjN=|$0fcIc zwl!43j1nv8^Sm^0l$SL%8$~5$Yr96>uJ675HzO^stIqIT(2BgB_SdW3w)V4ny-(QR zfciiDB7PBDYlszZXO2tfx*%riYp{UY;`@z09YapO*Q1L0Y&ADWDGj~7!ImTBG>jTuncicJ41h=N zSw>lssK$;N#TYKU3B;9Yjozrpcz?O>Sklf;3&tEwF(j4Ir9Vl1SwFSy3V?Ghd;8 z9q}K4{{Ufs+F#aQXgHEz*>&Y%6wy}y& zk{c@zG(i!_3j{+r1cBXNSu?^{uTl}c4rsbOsq^x!mD9Vr)hBPBhnWhwwMz6MN~KwQ zXBMQS+V`yX+huEO_v(I=d=UQtf`oX7O4O|^JaMWY5m?CHQ{AZl0Ev}~0<2Od`D4|U z!8?{%Vv#uv7Z^Xa-wQr7=)ODAEwz6SYS!0w_vs$N7!4~TBT3{w@ zgOEQrzh%GpByN}S+u+xXJb!Dh>i!<_ABMGAq0%mG#8YXS*htakONe9<%%a;8N9Wrz zqmh<50$BxHflAOHs3dS<-m;E%#~*8Clzc!Jd1GTm51EN^Q(cko=nD*2J! zTHf1FG*ZDV1w6BrWMb;WdXPT1(7ZM@Xk&;#Hf{NodYo*r;= z)ux+XN=YScwbJjlmp-@qL+c5}D@T>xCw7zF>1XB9YP%lRnn#`C`-xHnku7Z_FqAwK z8=00R9$PF@nbtECEOJPvc#TU)r9U0NX||I^yVU2JQxqj+eW>r|JWu94OA7`LQCW!c zx1mBIB`2r@HUG<@ggjiGjyHyQaBKuURtM3Lk&Fl7e63_ovMhSQFv zbcR=rZQ3+=RFH^>L~7XR%NZkq`@%fj?I#9phlHVoabELD%Ii%co42p7omH4*)0)K8 zcWp(tZm#V+b-L)a^k?(y`#^Y_>r(L!p&hc5aPF|%Fp#5Lsb=|Im&`6)?sN?y+5t>* z8m#;)@Wqw9a9-TU9J91>Pdq^uJ((EX<%75HBq!z{D)P?9w-@3IIW?~lYBCpTqD4#=4fH4aK*BblpxpL&JUm(#&MT1d#`h-(B%aOC%;+jau1kH-6H3c8Brz<9@Ysb~ODj#TvwZ zB=FaR=ZG1_QvIuM zJ|kJC_;cc`tK0o5Zwp>uTdA_LyRp=7Ef(WWiWneR zCAXDW!4MyXKL)-C>i#Y9HN1M3oSsl&=Do+3v4~zUOUM|Mg%MSZa!mVLBN;d+74-1N zWY{Wor5HkW=(MFv-R708wu@WtwY~JVr^(lbb%}81oc^y`E=o#Gr5DSjpMUfD9&p|# z*6!_FS&9uvS%;oCkhzhkoDd?kid9|6h*>3>q>KU^0JjZUmR=(9M6D&|%a{>KQys?R z0aYYAmym^zXc_CgoB)Hj>c4>h0B6_Nd@JLm@cw}Mo|AFnKPN-DTUj=|)>f9bS68;o zAt=_u8jk~Bw> z7};SFeWF6-temK=X=a!#U%OWhpzEl{o74Ah@k!aG+RxV4*_=|q8B69#5#u1C z=f9^Tx99ZjUu}NIz6g)R+NJHRcFjJeuj*QUj|{-$63ABH2D0%;aDwS zQ=4W{H9codnkyI)&f;XYws)54)Maxb51Ea8=4XwD7b?ZpjGr{)XtdYbva-`%top8- zpI1VUx5Uw<_Envgn$fE#_v`-v41SGx_ri~%=pO^VAKOC72DPl|aCyv&8qa@ut3zRI zje+LKlil4~#Nz>$AOf&IydHt$ABWH<#2*-0!xE;I;SDoN(qUvoWZk9cBI@jb6s#`# zh19BBY4bA7+nhPCLhRFW{$2I{aQ-ac zSn8hBk+S+Ye+$bB^zuIJJ*;ylz)-Vv2Lfxjl;hT%? 
zbHhF$yb-0Ip>)yM#dM-iwq8kdrx~m*FiSPjAoEDdV8CP^p zu*a$E%(6!zk=-WL8cjt2Qg-?GMk^+EJ4_mMF1ZI4^p2&cUTV+qmeH0tZ)KlY)zeQQ zqx%)4R$pm}m`n34D-3TevLF)fl*ychBkilyg(`R)I=huRQBr#)Hm=f1rmmLS{-=#X zeD4)0EnV-|MP=)KKU?EFitk{xO%&4Rs#!!i4G9XtC#yGFiuZ0>i zYCb9P4w*5M*?c$Pi@VF)hl!Y4YTho3%C(Mtzc#@O7;Z2_r17&b+U#rLuZX@1O(sn; zG!e~xrR!cI@b%M1#$~yh%6mJYuBuB%y{_&73y@Wq9;UvND91`N#8T#JF{cKu^s|go z*QfNI?sz!-KefArZ27c$^|rfR->sT^ZhdFrFA?q0e`NWsE#b6|%i=GL?R87Y=TUPk z>E+!|87K=z)7{Q*E#OB`Rwc^1m#^u6!c!~!Q1KR-Ey0^j*Ze2p?-2c+SUuEt6XQ=YcLFgTY@9-Uvi*;ctr?&xkD01Ytb#_+M3y?8=P0 z)+=eW`)Om7J5?w0!j{{g=ug33ExDKCmw~kwNR5hoBJh@)i;S8Z>0*I&tdxf))=wJEiDa@qH3CwF(z zrPp1V?|-o_tlkOnyjMyiypO?_Lh=YutSdd9y)?ca(U^IN(xt?DV?4Jnw9UHR;wZV2 zMg45><^8gFOW-X%{oUtj*WG#>Ni?d^h6ggD{4&zxRy1DD3-l4Q>lpNjq?-8Zaika*G0A-tP)~O5qS{M| zK>}D*G~h8Vf{K;rN7=@#=N%jzZ24WT?Coan+syH^8WE{Ewed<$R(*c!cGpg~@_*H5 zgM3Y=YuYWAscUx?#8X*YOw&T?EDnY=-7GUiOme^_EWTpyP|OucU8cPOknU`7GH_2B z0OKTk<0p#zl>LIfIa>Iq!S;R>|jc%P+2+>jq@*2Y`6)J>^L(0R?l|^*udt{J;9%6Z=D1HSdLVC=oa3 z#oBgP1n~NPsTn66;CY$v)4gk%#-y+qsM373CrQE&dpWfm{mq@Z9N|wl%j#81RN;w) zz8ZW2qqS5mc)QM$L4H%1m2B)NoV=VN8%hgB<|)Sr)fdDng( z_?tvF4Ox~*UUvBj7H|kJjlrcbOshUkc0U9`vht$rhG^Ejo?V0 zNd~Rpx^0rmr2vu1?jREiZhTrHTV8NQomjPGl$IbINj$6y*?-e^->Gr&3K> z-YUfn4{{Y(y!?EdJA^2VK zQrbIvbk#KNXW^fViW3FJ&Yh~qHT)4=t}#3gT`sKLnPoFtq;N8Bln=qz@U7RutIyh> z#yZ3?zL#y{{ZmZ-(}@=AQorzqn{2n+?IkhW-AfOMFR$T(D=T$IifuivmSr4C^BhYL zV_KyyTl3-^Uz(GRq?)_aYgYT@udD-6ah@>Ckk`;SAC_YfDI0Tq{gTj^z88&1>iWw{bIE+%V`mlsmA{IVG& zRNSQ3*E;rrWvg0vB6quj+DmKACi7H{cU)Reb>XXdZLDOtbG7bFu_%>{q@+nYs~HhC z*U$d|8ay>DJ|5Gx+o@)2h&0yI-|X$>%(pYmHo}`>L|CH|q>{qR8+p*Q>Q40s+|IJP ziugV&Xy>-Hzwme3lG5;nj(uJ`OG{RUNrLXg%M8u9f*6T57AkAtuyt=&654id*SfX& zR-5koU-3SYYNCxBHlm}iEAPu|@3q>0im}F8J+<$}uML>sH!|4W==#JHBQ3p@b6(r) zSCY$orwb$>YSYrx7X@8oiZQuLisODH+FjjvKUBQ?CA@c5eiitKuC$tb0L3NcUTV;eyhg7vAP`46>!n0dV zc+nTg#5V_mNEIKCZ|*L~m^pYE?Txn%7d>#J#P_gQ(rJ34A^q-}X5({1YSb>#am>1q7ae%hMTTijdN zHRL8+?L+$}INot1MWtEX0evY+03@t)sFQZpmF86$^4A}k8okfjMW2SO;wxpT?3VTX zpEltwRm`QDCiz1(uvmi@7!s8O4Yt0_{j@dJe~UVmys?80pQEkpirAlYI>DCuSpbt@ zXx@8M2_z;Aize-nxnIw1f5qcR@W!#?Q{=&WcK3Gn4IEBMN8M>>92uA84YlR~00Kl9 zfUKLpPvK0tPY;IWgH6d@B<$Ul`)r=xU3NS?^AQSaa&0@i-%TB!sd}q+ck?UT_=+3f z9k!Kjs9=I=CSh$S?kz3tkpwH|9C>#WZAH0-LV(dO;@(<;{Yn1-!65z>TTg9e;_JAx zBAJ;1T!n@}dYlosU=BGM0N|Vt7}Pa*GY)X!bTf*?*JaHkamE>H~mEIv-d4k-y-f-?WG9_oR&Uw{V1=HI} zY9m-6hfbc%U+PiZO(MnwiEpS{NWyO_XptR|oiD_CJi2zfr^9(`I+d0=B?}yp zZw{vkmM40tkhU5{1syh%Ulx2${gC`M;%!|ujV)%=#0xUEn5h)D1!l3-EO27-P^nbE zc_JQe-1THQb^2_-IK<`hRcK1nXB6$tR5R12~xx>#_N{XQMY%~ zM|anwz4bpSKWa||{15%F{C%(dc7x&F-@@yE7fk~ENr%H8F7Q^rr}&##xM*#c#ad36 zsm-Bkx@%que7UEV)$OmP((Wa>mfP&>B%hS}-|WQx1o3XErFd&n(lp&PUEM`tsb1=~ z`o6J$rNbagyK!})o9j5WZzpxd+;(1K2X8LcSrnNU_u||38}T)a^64;HTHRSIUD`)I z`G`j~mEd=e?J>qCc_+0lvBv5Tmk0?HF$_d{H~bUx#n*a*M|F8)Z7rlJ1%%ptrNfw^ zgt-3zN|xkXqe$y88DVjgY)CP0Qrlwl7l>+Zhb-Kb+ovftDK?vOjd>Z-05={{Rc=mcQ9+ZQ(Bl zL8INaWL?)+`p&hf-e2mPjnfq0B13QH31*N50V6+We$U?rzBc?%(_&3q#J>{!tEQUR z%W?ga46|Fz;u&sdmRl0~bTC^)*1*d1UEDRyWO-2nCH*A$N&f%@{qQG+qtkUS6)D$s zTWMpKV=4!ZO@Qu#IK*+rw#>|>t<*;xf&dsCjQ#!L&jomY!?ylLm1HcV@}pJ=ishOt zh}2=JF&HDzF>Ffq6S%AZ|V{5|z=Ro>@+Z{{YN z@gu6FkU5o$d1nh5!l37I3QiOeoZ}-T*Wy?0!lPBT@d5;LBZ6az1IF?>QpvXg(=UdM zoDkz`$Z@n+?9_UcNv6%afU#joI3%8-R1#e693c6e@S`LH`0M*>v{7if{-O+V;u!?7 z?F}RdK}9hN3XTA_+-8?}zs;u%C` zHaG@0-6)N69Wocs+q5&B5s~p`uRb-0yt+X54X7(gEQjWf2a|HDs&TqR3n$7?2MvS4 z`orOVrvpJ|n?aCW#_b|uqVnLAZjT>Xf?4PVVCjdah1`WA($Exb})P2os}k1gxlwYv#s zn#4@SXSrY`5Uepq%XaG|Vpoy%`~{+TL*X9DJTKypi2ew=w3gmxeFw*0Cen3CZY`y2 zi`gyj?JrUbiQ0J~wbLP%+Iwiw29`M@MrHc*;m?WXzS3>1Z{fGP+ouR6a?7=wW<-#y zs{$ni2{s*}hEcJCBk`}pj|zC60wlAE7{?yv5r$NhZVo|L$;J;Ps_}Xb+ 
zX}F~rK4$Ee$tNeX>7svm=SwA_Uh1Vd+FtX~IdpsU`6aEk>-pjPQ`~s__Na`|_6u84u!Kz|#2 zP4Mx49xctG*Y!v=+ow9~@Q}t?cM?H6nAK6@7O@VdDjd?3lXFW)s%^a;w){Ko zolYE#se39CjBb*QHc2+Rb?x^pY<^vMxAvv@nSbH?e+FsVlztfSo{|#E4J*U;7TO)V zSgQG~V>)KWnqcwEF*3Za#g!S^+b80G6yHs)Y6jj*={1R^xBET3&n?9BTDwYbBa#Ux zX&N+`Nw(@RR2h|+f=RE{{crvW_u?CwCtXWcn7gqv$7cwMSqV4+T2RQj=wr(P*QV27 zLj8(#?+eJnOIMcX{cJPcAeFv5vic=f&A{VHaoZV!g|&N2d3-$z6ZJ zJZbQbF0J|1qZuWmX-imHtvUeLKq$XyW!GJIJ{j;2!+W{(n|ZZ(_gB>%9kHy{uT^iWSvnq>{91zR_JhxnuFitb;U!<+@u<4MLHtN#GH#k%X zT&_d9r#k|Hg+=X>1$@u(Z%9~HYj=-g0k%AY$Rq)u_P$^UJwF4F4jJRdV_vDoPTZ?i zcGlO?>GR#a4_c;093B?Yr%fm&EAM$ItsC$6^zM97;C(8?Q}Nc12Bu`T(`_!IySsJ! z%@*RR8WtlaNlF|H<6vxjynjI-u;;>kU&CJobg!`Sec$X`)%}SxUaQ9+4BeK0%EZ^5%1bGPA}(P{l=U z#CU}qCKm{sy=NX|ue79_Z?awg0M5smfTbAKl=&1|eJ^db?`z-ux?dl`ULBKO{kuF7 zYpA*|i{SqN2i@LZ#pR^FRkoUJqFYO#ToqfAZQj@)-v0gk#M(1lm&aw7!v6rX{{W8l zKyPdlUe>kG7V2}r<_CjFm&HCdxi+^J@hfebT_(>>-*qHx1V%(GL$E!U?6G|piatC3 z(H=gM`dih}JWJvoA~tzeYkO-OeKSk9Xoxsy78{Kg_E21ajOEleIr;Pc3g_`Y%i)j1 zoqxevb>OqP@y?B*X)SKbvIt^|J!<~|TDf5w?%m~Cq*il6%F6OgUKdsyyetL{*2hu9 zQchIkQi^FOqUMW|Pp+5swYrmwtL&jxZc|?F$=^*pyp{Cnx%}dM9KOHtPP_3N#`h8# zZv$SPHrH#kYPXknk8QOuL5U5Ts=6Y)viYn~GdQo-zZU#2z0p1@cw$Xy%yxbc_-ChH z>i13t&o*xe*~8)qo->7mcp8jgF!Kzs4ZyB3@`wBrQ{hxU5$rrms_Pb0n;V}Ncz?qh zYzu2VaNl{;3#pN1a$_>u*}KUqBbP&P(nt`<0sUhAwjzB4;n&B_ZYy}=yuZ|Z7vR4Q z-WXLSj&B(}_LABp;O}&@hfrTIXZ>7coE)0;v+B~GZS^V%s*uD+5mD1c2uIS|Klm(I z9$8>&)tqeFc9Tx>T59j#UY*a%pMlejLHu>&Nn&`@$2#@Y^V^VEE}5z8-|&w3PShQ_ zA7@#$3Gsk|L^uFOG4pqc^z9c(xzjZZ&@LH#OXEAqZPgY_$@ROthgj{TCoKL^v|DzX zP1(b;k>+n-sD1;zl-?4BMa(34m*dBSz8>h-S@RNIC&2di*H`zZB34sx_PfNAN#Pqr zw`$WcklP0NTlTS$z8`!;e+ffw3u(6(H)|qDNMovBUqKD*G2?tUjFUty9!$uw2YI93 zBrED=^ea@y&WAHnoh55sE?A{@?b^=QR$4pma87jSuYX-TUd~qX()QER`mO8f+J&W; zg{(CDE4xWe_rzTf%wLzxith2Hwwzmp{{Vs} zw1!LTdw&Dy0_P7Co9k4wo?|SZ^3ET#YKdqfl0wofZ*R1QQU~+f;Fzvb*WtNll53Xn z&8+tdKLI>l8S&qOKGLswrrh{$9|h`3BfCjw zai=}y(%epnHmsMoP;Y40a52jqF3CCw#vDOORQjzbHq|v#o!;%*bmHI5w|4c@uS&~m zX^NWBSy?5c=25$AM|HA4On+tH8(T}_Pl>m1NpoxD>00DYmc=I1?5=f~^*h*XV{|b_ z+Lgt`_Oa?PmR4!!zF2n@D1PmBeyWyYaR*k{GJ!m^~Tg5E3XFpLAAZG z)l*FIMU;rwdT)k&U27x~!*+=+h4GRL*cKa;e#7zW_IB0pJU68LU$=rLv$9`_9xT1_ zO!}1HxX~fguVI%-Ru=(oCu^jD!jRh|#wEkSxAmwMH#wsw~j&tq$<+)8ciG}nSTMy00668bojGT+q4 z?Ah__MEJMhFAaFI9Y)M*v)O5B;yWvAly!pY@+I;wItez&?6F8?UH)XsBy#Z+UxB|K zJ_N(y-xO+|GWanSo~dE+Kf~~9x=cES`f5{O_-0*X!L8|!=l=l5j;W$u*~ND9*-LdL z-jr<;DB^oISvl{49~AW;fPWG6-|WV?i{Zj}Zad_;xsOs);y(szx`dI(YRNR7RmQR5 zy=-a_OKouv+pOMT#y=wF$e&jy&1>-PCYBcwdEwn&Rd0Hoy!Xo~MlT%{$?5i&MuR5BxXcXl0J!HEk*h-dMPeqzST8BO~Q3z&HVkPI3V?oAECE zX{+FAUl@NL=&_X}JA>P5OhF!-RFXQ73Xhn8e_xGAVras2+LS8B5>0IRoPRBQ-r652 zhf}4Af_hF4SGP%WOHapb&W}>l;?;ExQqE^aw_OhA(&h(1<;k)24JWdd5ll* z){h)!^HaGae(Sdm08{dadX37&?KmW7IR^xG{(OIKI|+Yh{{Y(Z$Kt-51g&Y~4;uJO z#MXXOG)Q&1Rt+Osi3vILB+|6wF^w>?NpTpF(Q&)uIJTUqO9PLMCpCtzPA%%M&AV^p zysv%rJuHq~)iHQ!E@z0NHy-zji%BJAbndUaT~EG0YTt$$kBz)|Yb8uAvNaBl&KYdV!OCS&+vI5r6@&=TG*R(zQPkTxfp|G`&_hUta#$(%)Rv zuBRz3p`&PUS?QM7Q-x_J^Y5aIbIt<5>m`hYpD?fWTjSWFy^3utTE{k-<6S;D=Cizr zn6D)v1 z{C%d`O|9Gr+CZxEYPNda$A`XR>8`f<>#delCr(rRBYGF zd-u0v=s0Sth90bHd#*E_WAbZFUv%ER`y=x2#GVnK;r_qzn?~^!sMB@Z?N<9z(5>Q< zHoS+$OX2&4@g@8cTKSE0VGWg^*e&fNSl~AZmkTU{7m0tSui0M5Rq-F}{{Zll#d`Li zbENpkUif!yb>ld0<%(PDnRJ~x^TTOxc^i4MrS_+#TUqP306d@RF4TB^QR83P zpI-4t?NRW1;^p0*k!|4L5PT%I*8EH_t<92;6xc(o=$em*Q%{;`oa!d=F7;hTLXB?) zjD~4HqOGx62_{(Z?rn3O(dhr zX|7#mARV%Nu78Lh zGu=lfrD@?iza-0Zrs@{@nn@p)?KD>K-p3$lS#3@8qgcFF=%Q0?_PF8RAX|}>mdrFqd{{T?8*y#?XtN3jrl1sfd09H$Jt7#g1vfSFp2b*-8*go);`Thd$ z+d@vs%{WV|@=o7-UX1$(7U@B_@};Gl(WKLFUzY2yt3H(QmZN!n@sq;(nk10f_^U#? 
z)2`uY3zfQyN7p<#GD9kXCAHPRh%`GJ&=(Ut5=Ak}?#}1Mt9y8~ySEn(kYD(FN}4-s zQ0pu=);B99%DP4iGYRa+_Jl-jd@*bgbA{nQ5kqnDL%~{}k8S5$Uh7Y<&8Em?f@=vi zSuLO!c1i#eEtSpp+vHGMHnzFDl&DxXk5BPmhP3N_dio32x{}`38~sAk(n+u;hD*i0 zy@E-U{|nOKy%gYrT`dZN*AaRbw3+dfNIlt#zl8 z+vm{!I)7~4K`wl8scLts&`GO!XTmy7?bXCZ8cQo(I$1S&qfod;7Q2O|zyZvz9D+_O z@}J^|iY+yJjVnulwYHOKZ5*o+4d!tYB8fv|Ge>V6N=psKNhNdhHY@e}{t8K;P2lev zUTJqQ%>!vSdd8V5mJ$n_YguBCO}LScRb!gPt->(ED4ZtDqcQv*)^Fv$zn$e`#Lp`! zVhe<0(ZJ}~01=!92a+rIE(NFUa_CfTzh!3^ZS;BEj{22ur!FV2K%w9N~4_MZfs0KqT(Z4S5a zN5g66`Hg919kT)%f`lu#x#gD`C$FV^hl)^M-wBB2<%)7iHG8`*x~tz+*M9asr^DGP zP_0SJ-*AqLSN3)3b+y+;({uM%!k0$oCEi9rABhAGdk}qiQ%JIhRNn{NkImy5yJv})DXMij7 zys~kZE&dm)ce7Wu{F?s&fPGG5g-O0x`^#A*mb*LO&-!PeC5%o{*)kW9s>B?Tj18xu z91(-h9`$N1KGd@-MB!w}CEeR3@(U@(N{oeW!;Dic*uelK60cd&2DQXmW^q?-9CPu>^(I7 zIX0)+;~y?V1VH`3JAe(4K{?%yK*w5GwDh=a0!Z8xbS;C%Ly%l@bB)J7*r--?-WNF^ zAtOBHi39_a>0KqQw1Ht_pascn1NTWdCnp^br(aCsyJ^#m?2@*rH|Obnp2sCxvW25} z?<*~Pb@J@I?O?Lg%yE;22pyDeIbsONDh^L4AP-Lc>g~!$6}bqZPSAG{GrOMNnB)BM zp_GasLn&Mhryibw_r`OMKb>G|H)`nMu;6a_a0vr}oSuiTW5?h+YN*n~$)#qVua{Ka zuCKGx?s}~@=7l**e9>>wB>OML_tSpI1+CsPIYl4<8+HydPCDTFdwTj;n%-SZ(}gUg z7aZ_N;Ba~8zte(iXT&1;p>q%{(zL^Jkq!clqbSVBV8L^M4Sv^Zmkkz# z@x>{MX(5y`A+{L8u{*FsV+WEBamYMozX?BW*(a9Xd9E5JmP2@@Kme<#Xx4Dc$n#*} z;N%mP+Ir41c35R?Tbr@MMR-0|T?6lwfkHXDwRhh5m z5y;N=lm6vYNjU+G%A}all$&@_N6PHLvngOPUq^f!@fyb{ir_mj`y7^$d66m^WZo{( zVlTZC6n%*nGIr+#SIXMvrv<*Hc@%q8Zz^Juj#0eDjBb=NyQW>K<4i5&t zh>8;;oziW0l&c2$zk3)dCm0_oaK(Ct45f3Pen$8WB-dI-ohz)9EGn^~M@x2&GZ2l$ zy9p9Bn+DvHp_uSNvFppoDt`1kZIDVn;nVuVj+h65XyHvnyjGXO{93gA$Md#7x2CB=RPXSa2FL&CJ|K zxQPp>GKO!u3Is*kVcv7e#^AUaZhb{Ws_(n9()aA1+I0K>025kp{p2jVt9QPS?zZpG z&EMx;W5M=U-em0y4g+CAatH9c1Ox%j2RQXPuM_b{hV1PL5!)m~Y0l6w!knNvBy|TF zCm;b|TI;h!VR;k;ft!&3;LIaq%#cOeb=^o+kA{Vd;}x_O>%c%erNH+6K}TDMy!t={`;`<|Qs00i&wdi+NHsJ*78~^vVEFt1_cUv)2w7+=7o6}2EVFbk6tFz?fgIBy$4^AMyujo zBT$T4fgFcV(lxZ6-pg_gkXqY__sJG;Wl%WVivDc>0N{}tzMr9dSnY&2Sm|1J ziKksK#Fl#2v8u;$6~(D#DfY>3q<6aud2YU;r7%;}856pW@D?;;-5`_*7j+A=5k?c`f~a$fw^V7j61M9PxIaTYxsv)_@Vn@{AAQ{vo(gLCx`7&d06WL zc=K&K+{!sKtc8eB7QTaqXu_spR=uML(2S<_(b?H6-ED0(-p7rbNvc+hvwA&x$;Bn< zs_A_%>-(Gk00fipK~DmFCAQO$tHa|>e_qsMF5Xq`%zxO{)=2^o$|biDNdadB%_Yk* z${+{nU-&GAyx$IgXW=!KtSNiq?}`&@&Mr1a@Xz4C4chr%XI3D2(P}sLaUucpN)R#2 z16-f{5;DR$Y&-(DFO54*@df^#iw^aY8`y4im=@wqhvkOq;jSh}kQEIOCn`DekN7yB z$CASz?DydtOPh(TZ#6%RUL=sb!T!|*+Qx%#WjxWw0*0Bi{1zjHKx5hgC(fl!Vt9_F zUUw$zRg#x=6&IrN{8^;xI$3=vC1om6QnvfH-qzR3%g)E8{{V@#IE?T|7*$kDOK>CDs(9s?0YiBLIB{S2(=pQg7ak3DVbBtdmb$Z5!!pce~rk z#YwtwQGFoWee_pX?R|gD_@m*y^{w6K_K^8(w7qh|TO|zJBDA!IM1|d0e(=o`kj?-b z!#+MzEAL;~KgLhujdp8zj8Weo8eHly+{ktV7;8x`A&-2B%(E;LT#1emB)KgfR2|=) zeh<@cyf5JRzSA$2J6TCA#q!I)>^EWvoVznIL`fvdkQI(`*!v&sPvU_!>)lU0avNPk zQ;kc$-?U6^B83;vk`2o{)jpfl2%0_KVO1ia$q7lUJKr;n&0nw0%cUxYaDRIITQRWtAVw zneX(-5>{Kd&>LIb8gX>dz2mq`#q)1%-qv3(yDJrjZ$d|aa^1bcy zZuaBE)M}*l_gmWUw0bW)y&KT}?7koTT+(%IE8)%Fo~XYLykq4{8H!T`{{Rd%dyAbq z#^XxU?g~V<`c1~7iF%S*Nh{deJW$?Qz_238R#J`V7md>F#ZWab8;7#)YJ{P(K%c;GZA048|J|f!;3h z0^EEz_=z0w=oh-~zo)8L_?F;8wigoIO?zi#f$o+&29n!Qj^fjNF>2N(*-#(p_rQ8Q zA0K`W`1|1pj!v0(;tfCHmXT$9XL;mZ*=w3TfYhw4bx*RRmxo`|uC1buOZK=Elu2z` z7D?ikPcs#Qm1=b`^?!8X``UG1#N{7@chz3k-rYw+a-}*|f91?ujYy=lQnJ z0~rrrr5_#$kHWe(poH(!JW;0VP`D~cXf3bRTOo+si5LPfGIBZqe~|wG*^9;L`!asf zejJ;_PkG_%9cRTYeJaImcX_6NYU%J#rudWL#gwb&!C_~vwWDhax`d4#w9;wq75j-T z?XN$tTbsQv<1fOUFILcQR^rE8(`@y9Hg6%I`DC+en|P&x)ntZ29KmFXfOK-`-bMFc$~ZHQ+&1X({WN#eDx#G7tLp)(KTnH*2jsCr3x76;-jT0&3n7r@#Svowe;O> zbJ#TNVFWTNocXgftVrnW!y`S$%#P#MzY4$LwVEQ?_#46gGZDOV=-Q0-@ECl*v|c>6 ziE|ybv6vW=SCY^$kP#w;VVq*WcwqkkQ@0>6CxgMp0XR78>5Ba0{{VuX{4k&5U+lZ% 
zD+?Gz`pt;4)im{$QBo+h`=xuf^PDo<;>v5*d7J%WDUE($1$vxSMs>10Z!i41QLhxQ zd(&%3+g9E5w!fz)O;$8>96aT>GmF;N=3QS!weMxr%lLcaOaB0jAF@}&ABesl)Lk{( z4~Lc>A+^*aGG2dcYBp*#=0K50F`97DG8A9~{h~!D5Q_X%{iVJEOYsBXzKijz!}zuF zPKo3FdHi8-XL$lm<|el+x(AEzWw70G9Nrncw6w9;EG_3*9>(r9zA#(5iv5E9k*w`> zzu6rIyT!shPHniaJMhkX5^HwXt`Zo>87deZu_4f>Yf+S z{AU)QsOpw_LS5<5>Ke|krrXJLC6<=YrP=9TGt%u#Pq8%ni@Upa)7D0L#FvLHG-|Ap z`0K8|AH_Pze7cM_QGA;>1$=vU3ZAN)_I z>z@z2VWZAtvR1UcwA1b*Mi+LvJdo%w92Y7#s;m-S$boj8kq*!qWP1EtG{&r6GPN~y z)#IzXUh>z=)%hQ(Wmst_LUND0E?3u8qq=t0CErf{Pnhg=jaSFt2K-s#>uo~M!}`aE zukQ744aFgmZshQ)-{0wa^}GrUvn<+YiL5Md<1-?L(}V0%jE!D(@axCv;$3rC(XA~b zv$?(S;kv(u7Exz!;ynuCCe*CqNZLnRxvgQ=;((+r2?_RdZyfO3z1j`m6TKJsR`imx*-S!oTs z>4Gb(msiA*BI*qU@JNxLu`7+6V&m$sh#H(4t82p2+s)Q&G|gVZ*45>U{UUqmwM%%V zf#2mfHupMI5IjzHqFh=&OEQZ1YUOmhuM7N0@P3ydo^KD9@gIe*ky=SD5=k4xNo=Av z+d9Ezdo{xq%OtQ#7{E-|+?rw1d_k?vty`ES7urSbwTy2n2p3h>bPXR}(lEJMKhiF( zeAq(*cV5VbM>1i0B%>t#qE78Mdn?H+UsU=lSmcxUxs%mw{_j<1WBGai0D{hVa%~Q8 z4|tACXysps9u?6wYg^)pwiSZTD|^*fyYF(uW!bLsZi zA}y^Lia20PD9&4JP07w{_{8xQjFz`|JL54tcMc?01#rdN?t)1mFVKHQEoo-ak91S_E&b5mDbvC`TqbIs*1fix6K(SKAyMtbX!|jyXyDd@cyQ4 zqcR{*Ha8n`$L_JjfNjHIgZ?~mU)AUQ5uf5R-}ov zE>1}$v4QQ8kz76{V3s2*no&-rDN0ajYZRiddpmXO=i2Aca1JUou{1h5k#Li}l(x0D zv%Q!6IsK4$Gf^XJhxs-J>TOL{>uA?CGloB{jhmtTk=dt4hudg-jT1C~{h_KiJO6_6JPeOYPj02JMufgzhRHB-C zsWr-WXE86-itrOFwox8o0?*6~id_rfF%wmQHBrL-_8f@C^$7MU4!k7!op!Hvkp# zGR=dKM`44L#(LMO=`&lEj$3yu+%aRF!?^)|UrglkM;zDG;Bg+&rOcL^>$a;~-E6*A zdUifr8HSB5*Tr|)B)M#^x9jJlNc9_QWSN#hIOtyZM&y zZ+5^=6o7G(dwvJ;>DSXZuP*T(u`D+4<**)L0SIHe9D+#5Ja*)F`d0p8H-WofO8tEWbZE)+PB z4Z$)w0Q0vPIU@%->CJq`4m9iHaP;RS<2kN((^*Ao?`@lRzst9+R#K;juk7rl1oSGFoN?8bX2N_VSo~MlDX1@TxZ$As0Ju6q47k8G9 z*6oggK~^5yazW&A$7=qP{wH|*MH)ic*j-y;qKZiZy6#BgjQNrAz!Pp#bHTo&5Ds-oU)5po*NRIs7$Y~60$qSI zEUb$F5xE6JZDLdom?N=0H^gmwOq;_ohg)%&P)nCzFNE70Nt1*!ZdODjGZVs`5+@Ca zoa2UswNtg%Mw?CgZggTOsY<PYp2bAS7m+uH9uAV0N|Y-KheG^{5r6n8;IXj z(@|{oD=8SQ%Xv}lS!Rk{{{S1);fL)qd@VKLkHg+Hw{Xx!AOh0*CX;^ElzC_*X#f~pfOu{*U)Nrr zcVlxMz15U!wp(cJ?V^rA(Y!z`Z6tvJXCwxZN}L&0Pbol_kP=-6EUK2SWgb4Anf<&rNX8;=Uu`qPwCbey7_uZ@Y zv%ar&Z_C%iH7O}yRV_Yte~$VqU2osa_1_jGmQpAnNhv$yeiAku?vUpUfOH(3^skZp zedBe9?TDr(MPsvY+$4@r$kVSvg}&$?^2{Qr>zeex7wUgyfr{k;BPa=D=9hMn9CAcq z2tYU-<2cQHQSk>@-jj|?AD=t(l1Af^)8$e6@m^OesMC72qITPDUG=*3>7!TI+KQ)6 zDx+TYxt6zGG`E(z>23WyOX8o3NBTiB$PUn~%r-j$7$6Q0JF*5(9CbX`Qn|a2{UgA?6C#IeI=R#}8}BSei=W;YqPLbg7aZHh zCnG8=^27E=zHK|k9M&Biih*jJ@}iQ=6~O%=UZumnXLIU|_2Ma050b876xpTTv{@%pSDYmFBQo2b(DX&D?a3I<#7$ zj9RG7lKqWhiB&<8Ey0kC8vF>gmgmI(03JRf_>%Gd)h4BNuU%Zs_+?9p^>`v=!3C98 z-Yb}MC(g4xhdCAd58&?-Uifd}*TkUjG0L&)H$|wPgvmz1YgvjPDXa^j|9JW|D62wYSmV zv(x$q{{Vtgc#0h-N77@G2+UTVDc0so%*bN7Tf2)EXg*)L6^LZ=Wf{xKBXxYA&d>NP z=ZK~Fk^4^m&pr-{<2q!ktJ$2)5n`~^{vyj_@&%jaju@>iqzt&)shJ4J754Y+8$HdA znW8qI8^H{IC40+7Ayj>qSk~e~%%Ci-GC~IB0B{Zfb`iF>hIgEG*ZrLV?WA6z9-V2Ye$Q9&-*~?BUAJiMbsbm4-X+#`Yn6ntgG05u`wxZUNdf)L zQR$X?gt2bCv7we}W79vHR^G(Ie)sh>Z()R%NlcZ!*us z*E)HP*8p2yUq@>fndqj@cYWO>^E|P^HN|`lz8)U%_lWef=B4${y?Fykv?@T6YWr8p zJZ+Kg>>*4J3`cHpnv+4kZB1m*ujQHMkO|^x7!qY`J1HY}&Ns42`@O?IFeexQ)+Eqh zT1jbOhC!&b&v;S=E9c+EsNYC9qeVDBX}lAEVVei>uIg# z5XeEAdstwVJmIi3@qMH^tAoLKBjN>+@du2qZYGi|-FfcT7KUA}tzgFP-qKYh2am~u z9IGi{gs{OG4e`?xXQu0Z9JzT8p=EN~hMgf*XzqT}T1gPHd2>N5GOJuev7} z-e{g8mMhH~+(irvWv<3zk4)7gxVex;ZF8`MyWDLWNzJ3A5}CO=ksgZpM{z8~?o zf;=1X9{T#;_d|2w{Z~}+C6vvoUHDG+;?@gQlKCQvd7ny{!>IY1ygGufUSgqumS&M9k?au|Tk#x@r8KQNSBSz2c zAHsi$-Xfnu@W00$S5~#N(eHIze+PKt>5ayaE&M?>vA=^@)Ql-}ac1%|xEC>qEiM*2 z#Dx!&9D|!RjO3`|qjecRT9;n!(n)`HCwnfBMY-39Ue(JC+Fsf`u}{j{UfZo(?5>wv ze?Z^1UWIG$U-qxoFD^st5qS4ZzwtkSbUi)gN50g2I~}IIdkQQ7*lJqCY4GZh*a+?5 z)NEzbm0^ 
zT_v*AUiu`K2qUzW9ZC}~n34V)d~&<-U+n|?D*Ooejp2((bw7wd0lZ-L&#bqU+SZ+} zcwWu*8who2RUT`dFI4cI)~c5>!Z%$^Ep88)9n`BI-yiTvkA>&qFYOQUhfUJ8#klZC z#6JQ{@mIk5a==wCbT1U@eh|MiLokh2NHkqe{{U05h!Q15)Rq~+cyc-NTE4?M!Oog- zr;Bb4E80y(H!JI;t?B0UJp64X^=j1S)1DqmF_W@VO4_Bp?DTr+eOdhtGwpqu!(@EN zt}&bdGH?cZkH^#I&)UmRw%5K3_|L=gYBAakPfBR6E-}6B>|l}`%jRX;cB|UOBthAS z`qvSwkVSn1BKc7?vpadHb~z-2xDBE;IRHE4g*YQ{=ZuQ@zxKcJ+ITbI_l_@K{>MtX z)Vw`)q8(mYql)GPpD32r&psh(H_R}nGS2TRq>zMN-+!J-Ln+3}cIEaGYqQ%&du!yi z-%T!j<_?tV;%G*i)zyvLPnO$i*W=k(`Aha$_?w}4PxgA#^vnH1F$_^&>N<_>qg$iQ zTU8gb&k$Ia+}sFVdDI=6Pp|G-YeogKp^U-dLa0wQVm)_(S2{3dc)$^;4$Y+iChO zlQ4!oSH&Ob_V;?ZMn_PVc3{{ew%@YF8aeZjjDI`lJWGkIf}ue;Q>6*IaZznb$)uZ4 z-Aem((e^)3$}%bt#Z;8wr0TV*)oP-h)9b%&-$Z=%@t@*cviPUOJ{Qv^I)s#?|uV$AQ^6DDCu@8n@Syl#XzZ2`% zpqooM6MTyuoq)Z%y}9YKJ6M^SfyXiXM#Xg=pP^XTxU$j*#5a)5pElyxMbf+>tl4Se zMUfs|@n3~DvBrWHw+Qi!@m~*?My4YdQnguL@@cv4B$~5J&qkl!*57qkl8zpoDrv#R za_y$cUB5jaTWZh9&xiW|0QTp{V{DCa4}x^r^-T{}zLi;Ro-55JSsi7C!h*lr5G~!( zMA9-_Tf-JSyL=C*JYnJ8b6T>#gY6A1_l0geK@Wx6VJtTjXm;rOu(pKZL0|0ew99K6 z6)zRMD69#ui}jm}&kgD}`Zk9kvcHqWvt8f5@+LudV{|PPKJ)jWy0?~ek7t){H{Cq4(c1q0-zz(L(a2_7l8Y*$ zSLk#vS<;6h_E221<=)QL-P+H$Z8{znB5T|t*L2u5`|r+*HM29-aeyhK3}vwt+Tqwr9u0~ z&gRobx0?DDcwbAi&^0j}i7e=U;UClFvs?R^SId>-vb3~MJVFYza>)2n$&buW*+;}$ zH-_Z+`S8AJH5heY2Kej3o;8NkFc;ITwJWQi4(jn2iQ~)*qrJWBNB~xb9pE8cAJtnJ zX4dryB)GDlPtq>5O+&*IVd44hZc|m$H5q0KrS^&J;_s}zo3)zPuJ6mX+OJdR587+N*1r-zXD{1F!IOD+*BXC?JR$Ly zLs%{4k?rP(L%W*hx{oJb4CBO`h~72u^!954S!*}?eXx&kFr~fC(6>ymQG`JhPk>hg zzXram!~Al>;^&2gq@z`;jX&==#_4Tk-}-(R5n(>HLDWe!qqVN|+25jfUHbGsGSP4* zTreuGrMD0P1D{?`_be;p(B+)!$p=$Cw7PQoDrrsNA8yt~?oPF)1 z(*)xc`i^szQ_L_}sJ5jlm}39_c&1KU3r~Y~Fc(BMm`ET6mgrlWnWVnklR8 zrvCsd_CL}0;nuSrv7~8QlyWP+=D0OaHldG+MykU9V>zR@Ho(HnG-vnueX z3norjl1b;#4n1+(*2C0F*1g`kKTR}R>XP#5yErjWjG-8~-syC^z3f^osGUeh-DGD-@NNn?;m&p7Td zM|_XYjZWK1*Da^h^;>vutnRH^-dH4DqwXUsg#!-5AROdm^zXJd5X4Y$PAY$8$*x|T z`ZoUntDKRXs8xiec|lv9Lw(k-`nu@4pUz+HPyYY~N7Q}-+(T!gcuw=hSIc&gHI=l6 z+{*(qE>du0Rh5GLpSzHfK-vK$eFytAe&0Gr#vczRzdwY$b>KZlNF$bO4;5)r+r;v+ zlFebL+pF4Lz+0opV36kl4l`eRYM-$uz>EI?5RVi80KyTa%c|Sl8>peXwzt#~CCENv zJh70c2ZaYKlafdVy<1QCJK?K6CjS6Lhe6U|w7Q7KVTRHu)JL;wZB;~2=rOeVf={6N zzp2%R8dMb|@e!XiD^ruzK38Yt^jd9yGu!&s5^++rl9iK>yM5KJo~bt9U6Wfg^Rvf4 zwML&U<;*Xy7{JU-7E*#C9B5d}yao=uw$KVN0B`2MH$Q9d+MmK2SA*_cYu3@;Lj;dK z?Zm<>OK@S{sMlC!P~$2-R&2Hcp5LY)w8!inr})CgS-dHzOQmX$7)yJo$VXEtVq{p= zN*0B{`955#<%v^@{1W}7zu=Ky6E3Ba_rzX4)qF=|3{N~7T(YpWVibgxB9Cli85`A+ znTX(yb2us#KD7_+iq$Dni%xP%Qs%cUqOGq@U+2?Wn1=Aujs~AB+Dl}$UTV*`NB8Q# z4n8n`)mk>CuV+R3X0rm{$#FFC$>c{fnS-OG%vj7KP$NcB8tep-mKEtbf5$s79(Yl% zqyGSjef^9`C9|IC43WmEk2*MHV6tawjmL2umD}IdrT+keW@|9(KW5Zo4R0$H07Sqc zjN|1UdUGy%k}<|>+B_})00h(U!|{HRR(7?3Xr;^7X$>?Cq*n*!ZvZ zYlp;sAo%^^U1vs)@9cU`tsmMi;!W|(B$ox+F)l!mEMRUU0P@G>ukKIa{;P3y;LR3W zf@GWQwwsw>Ybh{C0c9){s$35~Wf%;&4nW60ioXMYWcUc+_y!Pp~IizR#3Seq}<0D8Tm`%O0}tX86_v0Nl-F}3aVKq zA)5hk6c7pd7d(@S_dQF*c5+)u_IZ^T$y}0iBebo)S3{gG2LO@6@IC%f_=j;T{i<_^ zj}r)BQ;5%+tfAQC0IW_KNIZhw8x`YMsHYnw^z&QkuGjoecMW*|0Nb*Qx23*+@ILe9 z4~Tc>IOZ(s;n~jRI9vg|{J`M1Jc08L;0|lxKZ?F0wzfC^TNPN8f;VBZ-*^JK!74*2 z{^{qM`v>Aas4bw8E><$p8M2T8*e3~pdDdV`Am!}z_fBx!v25}Z0dL2w8p zWn7GBJ4Vri(0(;I96zA18cP z@vQr9VG}EaiK9>?$zn-j**$(;qoCw^V4sP#yVkmzM}L%LyAVO%ILaP}89l!-UY+Cp zarFzyS~e{Yn9GdttO4L2P6!#~_32(CsLQaHB(~Pb!Sy_Y&N$jJj(xpl<*HoVUZoB!tQTcXhi=A9V+ij?}(i!ry_>jHp z#td%)Tk6sfp^jCzqk={UdCwY{yrnp)}XdN>KHnQUi!`_gj0jdHY;{57{@OUM2y5q{6UC4eLSq5d2EON!X6 zi-w=X8il5s2zb;2ZLO}ABRJ)i+bRh~`J?uAyVI<0w8$@|ksnj=U7e-ugc9i`o{^}> zX=CtKWh2y%-0fSC^bybi@b$4m3-$M5a=ysAUEukU4VG{_; zvX#fLmGe-_>Qj?;ttU7oZ{<-}R?^x%HCtK$_(9?6t%yUGivBrdX2Bpt*_|B~*!=!l<%1GGOFw32MSNlL 
zBgFch?y=*45H;ktt*X7W7Zaf^CB!q|U(IewEUN3|xjtJ24;nGaAOrQ@uYG-G@PFbq zr)}qp-9p<=@XdoDiCQVejpnNA2hEg-O}xYcN{tfjBai^|Kn%ZrD)H@%&EaIYX%vg? zBjNKml$grT2Cr)`n2Uw;E$?DjWMaF&(gDdQHTF3q6-QaQMn2MZj9atXGUoJd+F59= zW5LAE$tM{kqaB|_m#5>~%=1X)T`O7e&aEB7TwBdF*088-d9QJ070?p{wUow!X%sdv zsezsW^e+{-x6~%_HLvd_n?lnh5UV;|Tisn)+Gx6q#l2%zTU)Ut(JWz^L%YcMRyF1N zEClGc(A-ASO{d*kS-P=1p=5!rV!wo(l@i-XKq_&Gquf65KA)&YWVh1gx4V19xzg+} zj^+`Id7@dg{d&@LD&A`?#kH;ZRM^uA=L%G@HT0DivwX3-X)DRv+bvW1rR1!-9#vMk zWh-5~D?N9$p4;u!+Wq$N8MOPhywi1U90=aiR=T^?Vl2z{n;5NNj_Tqq%)m<40pywz z51Ml#VUuNS-)pj3O>bGCvgfa^7)u zqFfy&`r0)Qe>R@1kEiO7ZHVKNXMv=!xrE3R&ukgx49cT=6(NYE=+@*@QECoWyKmvU zY4z6E(`B*6QjF-R)Tb$1l~igv>15=*wZEo^apCwh%e(tsN*F%Rb09W)j-7LIvDoVx zb?)1HyE#H6a^>SbbTY?qNSZVrP&=J}K!0NQ(Cz$r@fYBSitbX|K=JQ{Z6eSvyh2)N zEPP9CEH_$o`g;iElTn6gr&APHTLcyw#riJu39rik00wxN-95FX>&<6>49s981d-)$ zm6F+Ahno{*ZiTko!s_M$F=C7Q3E*ql?{B;V;u~uZwD_CCH?a$=i+kOd_kJ9e$@>_N zNYySCBa-x3Tp(F2?hLl`EPi@OoH*`!G^d8dYff-{vW@K-SvYFd@1^g3{Et@%)kxFC zDJ5Ap?I*3Qt<1Hv*4JC>*#4crVLysD5cm@B;YW-feLB|bN`DI7YBz8{+E2s}hLF#5 zrO$qf(#rGeUk)`rTSS&fWm|i@tp?U9adeLw{{TQ=@J@XTPuFz+0EhlMwl+~~9v1jf z<1dC@DzdXPNS2zH{3c!z)pYp8vIPRw>@1q{$#%%E1@7o&*sQhup!_`XOy3QBUt)AW z5*zz36-{ID3fEPh#!G8?d|lybThH^LHxgZ~zPY7H{{Y)E!hy@&-rUV>vO-z^0HmMz zBKO3oyi2EiTJXe@v^RQor!J!|mn$TfR&mRD3W#nbS7cwYi`d|{5wx*2tH$xV#$pZO zaQ-ja=Ga;@_nj)qQfpszCb!XOx7}%Vwa1xZ`Ea#sJ6fbw?%Hoy>-Vp_U60s*9{9sc z(4f84?=Ov=wv(ovTJmd`M7Gncl(T)lVym`m*^n6REY9uh(UnGO{#*Y5Ywy~>Q26)a znV-aOs%gIpJRk86YpokZvATj=e-WmaXCzvE{i3E-P8ig%*i9_7tx|$f1p{k zE%*Nb{1hYhg4DFVYe4ux;VozFI=-yBzNz91*!2{(f@|06$zKv-D z$LCyXDRAO9lHTdZ=jXytgm#*xzk+-;x;~Y!YFc)tZ>{x7WSZwh)I3{zuUuMbcDEN% zKxWhpxQhE!)D|$-k%;US4hs;tvp=htV&RnF>Pbvz89m!#*!yo)j@6l8) zCgaM9m;m!$SVM0ibevoMUHn+Nmt5E5dkAB&mr~QF@b8CgRJYn=voggM?cxEE96DvS z)Clu}EzQg_n3{csm+?&M6RTRD9-N$18s@h5y1xBdf1Zr{{G~?Is@BhSw$W){(b=`F z)sK=s6X}-EaR#e!&2t8Yrs{qwitafCi3QfKvuO5LG9zqRuP!bwwHL8F@d%1*mwd?x zuLAv`ymFRWa%+r)Po!&lwb~`pLm|J_^&524JUs)$lLFe=>5yIO#Yj>ZqC!Dg3;u&^ zei60!ec-PUY5Gh)N801kEyK>S4=(0gdyP9l(k;~dn{70^YyEM_`9t2xaW*7^8vHi> zqWoids}{1-?iy<(p7pI|b%@J(b!j|S7HmTu^2qkqki%gD{_T<(B)WK=kD1elr5bKA zQPwId>i%7{cl;Nf&syqGl;x{QwAHO`7vHsyki1=X$HclD+QV@*_0wvPG?OW~l6fMM z)3dY!dfk@qxqg0_;28CS6z&$LoTDQ_>WBSU91wnU$icl1UGDu z0dDCeWMsX5hn7;5@bRk{yGARWzP9DFzgDkp6Vbkm`5bL!8u3@RHj3{1+g-l>F5SAH zP5cuHq<+>v8m_gAXea*wgx)ySCzwnal(ObhDUP>;tsx9@r@#K`Bea)TZk}H)Ert-2QaC!4pDNSDdt|>QtG*L?F z-v0pb;f86->8G>4n_vD4x$T-rx6pnh_+G|Y)>zZSek1VuuGCAIpz#H*>~|@87hxF| zu$VluSnQk}62ZTVKlmyag04I@@y}4zptWSuJVkSPb73MWMUjouB;dCRN6`vC;7zx z0D`N0F4g=;;D3icG_=#+TbZNrD&5SH99!F1>l2uoD}@C}YesnGa&qfvj1SC7;o;0< z!sqlN?a4P9a=W&VneNkGeHHx9+*KIQ1zq(~f11%Huif*qKbemhX+CAs+`zk;Hi^{- z-&_Oc1Jtg5QrXUX3iISXOzP{95!J^~z^eiV4;aWi_U66s#dapv?%rnHu*(mVBusMo zA38Dr0C2O1{e7Ktkb1Bz&honazJ={u#R(hM#Wox$-26IUTCIj zjdIB4+(V6mO+vL<*M;9nD9KrGuGi~lbbmo`jz9ZDCc?^Y?ljVVT&XKxz1I8rGt{)H zrn!zN)FLw`;gSP!li%o1=UttR)bYGK?jARdf~eiW$s7O zeuvQ2hbqzZ@4C~kpKthO?z#@C4bsC4DUxOX09h^uepcvPgMpmm(B$)8v0*K_McBh5 zjDQDG&u)PA&TvOe*U!2Y*|#X;8EgZ_aJU_Nbb=?ouRVX$TBC*mxxO#{-eu z7{?V9m!)yFiI1aWh66d=4n9+ya(#QAoY1%)TAjpj21hwRO!Ymgw7P%_G6Cmk3f#9| z598O<*R^z1ttQ~3F29;dXuB=H4L9OcV{LAqN2c0oZ6EdMb&ab2>__K?jaZo11ZOR)jTp|iJgeAyrYgIwODbA4>Db!VgMS5iSX$2Q{%`VbmiIq3 zJagb3GwnAQM$#O_S%%a|nH*p_0Dk~RE>2I~=R9M`^)C-aV=_l>%C^B{l{RvvcKp~< zK5g00JQ7c}eL3UXCvUe~h`@}OxZf4bQsZ!NV*yDcdSz8Qf^a=+<}GVfj@4QLg+YkX zu*$$P$Tjo+ui}-rca}L7HnQwloCaYk`C@g+9I^xijsxU-9UC7Pr`cmt$m-|>ip~fw zI+6k8t8$~W0rfqwg_knAZ6~u*dUe-N@&$;E`E0MYyRDaN>3-j#^$cDniP4c~$&ql| zn~ zCQ?3h1D6bV>GFZ=_+y&!SB@@w3F@ z=TN({65*HSETj>?wG}XQ5Ej8NKy3*FSr=LUQ 
z^Zaz<2N=66SvPCz%iC6!t$!}Zg=-fF?KcIVa56U_k_qJZ&p7&b`d6Q7k||foIVZ1W z^e3@Do_+bQr%;qDy00se%ui;?7|wsgp8VGve8uG3o(bqagE;PcbT#_MT#}VHvQbf5 zCfZ!K)!NotUM^=BxytlQPQ`d;r|_wBa&A7uW@I+(cd_ld18Bq-V@y`*X{I;kcL zO+qQH9|1WL{iQ9OoPVs5Ks!rk^n*}D{?(2Rt9cdr+RJfw8?cR`i^;N%>=wj~(#{Zt z)w_I#mOh{{Sm)9#j{1Gbmr)Zj2C{RN~`9t9W}%*7ciJXd}CS4eK_~D+reI1EcF2akx;; zmU~%kZSW%t8W}%&gvfVupS3OA8ZDo~x$UL1xqX`bt<119eVz-OxuBNKXL3TxJn1t* zG5-Kmt1QUhHMs-cG>a+q{{V&2%V!i9GG2H+Cyv^5SykvRe62}pw|d#G?^N%iJlekgE_Cm1MAzTSmDkSJ{CBt3 z-vqo-8Q1&)BsW2l;AofnbF&9^w7r7v^71%?Df1$=)-G=CreVM;LuUgXS^IeDlihq} z@s0P{VT@dOGvQUz!x?SO7LYYnx@9Njf3n0Bvh+kG9>%<{;75+GrM~cOvoz0p;jK~+ z52&?{BX4u0-`$&wS=7p_G*U#e%^b17r0;Gs#(t*#xU|OAuB^OcW*}K~?GHiHE+HOi zO-9jWdnt{?MlkPgqUJS~WMpynlE`wwtK}$8t{*)~S}MFH7W=8R(&o2Sw@*Ep>{O)d z;971rqwrey-8)~SZrAL7Z+OR7mrBtz-vwPS{Dr2}d{rgYq)#AconU3PxG}K;L!;i8 zic$oMeBqK20{B;oG(~L;+BMX|D{E~!$VRNPO%;`_@LXxPAOV@klIWzWPnZDS!*R*= ze;t2k>Pw{Snrt@_#irf(hfaI9WoyeD{T}Mt*3vD_D?@Rm-P_#x(8(re2H2wjfRUeR zp-rz{O$@e%-rrF0_1*obXORM2SXoOl*+sYpeYbWu2=1W9&`Ak|nH5ca-6=dPrzmO5 zXD8$-3-W}WLKftt=5a-ne6pDSpM1J&39GOi75Vc)MiO+uOvw}fZu73dpo;E zXHBya zhwbdO3pu5Eyvyx2&S@;~+E9!{(Y?E(%Nf8f(nlawLA%&KGJj!delfhe(X^Xw7sI|G zOIEvsPKb>*_eb#+wXU~i0~rInO#_*CTSU#}TtKs&i5M~aE!dmmaV6!nQD4}dO2wxs zE3D$?)u7cZriLLh2Glh7k=EfMWrfis^Zeil?tU_9P+E9TP4N6vLlypw@efe&#r4(G zG}jiEmKst^r0a3O@yKF}QPOX$Y=BaN=1o6`wqnx{7ZHe;)+)jlO-q(CyWY{aOW#$q zy7jr&3J_7VS4|}JU3z+c3-b9R_L2BMquS`+4)~?ueHb9r?e!Z^66tnAXeE79Nj5TD zc#_6gdVg){_pytsBE@a*>~2gW%8?ndJ_h_;wbHypquOdSUfbJ8KbaJd8cjZ&vT*AF zxx_NVGc2n#5;{EIPRVzPBgg9xfP4qx4~W0BkHmdTP=exH{{RTVqj<8~_Tpx?Z9Bv9 z+*)ax-Kem)NzhFMdgYbOiyhqlY*x)WOv`SV{M+&G?8W1M3hEEyZv_i+;v zg{`O5^qYAZrF})LCO4zk?aX|vAw%eCoCw1@!G%^NqJJf>fl z%B3uyKf@}~=BFCdoNK*#qN&NWo{H_-*WOn0JxoRtd6p8E8j_2tR&b=#vgPNT6K`9! z6?XK#i>hbOJ~-30--!PJ6Et6ljd-8i9zFigeILWqd6Qh}Iu!D5(jInYXyUw{>r&K(9L~yOEdli}rT#K85hQ>hHr=aJ9aV@ViE`@b#tT@+_Cy?}{#3S+2Dyv*u4E zjVoQ;Nqv~yNRp}`ge`fOz+Z;m8M*PspYaP_hDFo#MYh-cKcZk0s4SAEiQ)KLe2W}b z_O`lh=B+X_6}oF{6@z4P;r*A!9y8NCGvL9h$8~D=y2h{K3x&U!ED}u%zM*ApG}9mT zzh#CysO-r&2rX1(v5@%GDpA4Z)$@wY%M(>L;@Wbv)yX%$uYRd7o|u`-6N$rRv>J5i zN;iylR#94{md@(h{%0HUCtK0(63a-jyE0sOitY735?|a*cbx{(;(LodGy+v&hUZcJ z)QJvQ+Cq~{wMwb^OT$*0ZP$(UPZju*&{^6pnJgCeh7l#VhjgiB@}sndONCz|?^lW^ zznWvZM1nY@D4uJo`Hh!~HD4O|`%Ccsz2x6()nuDdxk#^7(?!skXM;|;xSS&;nlQGI z#M6X~KbJJ6qYAQmkHilUX#N1xd_g;{rgYsl^H!hj*N!caTR~^N^+3hRwdZVe9I1=# z5hi56MUV6O;vtTo*4$+X`)kXa`C`|-r{kvo0FT*G!%q)No*tc~++#U!r!}sZqE@`S z*`E!5*1r?~0I}n|(@o5_7h1>mrlY25k+M&6w-+{^OX?D=jON}QH%!~3T7)P4BT_H7 zU`DUSZC>VUJwpECNsaB@m88-co-md=XfAFHT$nu=+RcN4sEyR-Qqu)Ph&x!iS ziM(^-i}~71{Z7oAR$0rZ*)-p^d82iZVz4{vmlDfxjN2m9?W|E=ReJLQJPOL+4Qq*mx6p61;OEJ`iRtjz3#>9ULkTy0)x zNB2pmQhg%^{wry8mdR@L>7kB484L|RS0v>nc9Yq(yKikhbl0Kge-!*>d*Y<a zo5q*(Yp_joERa3Ff&Lu9{{RTymA`t3;d|M%d%J5GA^=GZl7W;^?Z*DkKeOfbtUNX2 zxl&2AJqtwFZS?!An1s?=Sz2mS&#(B1t1_cZnxxjcCA&ZwCb?y?X(G36MtgqW;^)Lo zKgFw{{5$YIofL3U5|hG zOAfQ*c)lupL4|JZwT(~08b^k-*d%FZx{?nPqHSa!FplNyWr}GMAnt}?gS-|`*Druy z*&YM%mxgo??O5Jz7eSj&v5x+4GD)==bepKWxOOm9jB4^+h+>JEzQSP+izJd`;cta{ zJ@u8o0u)5naYe?-N9s+Qq32b$!?)!c}$ZOi{P_-Pb}pH zZ*{eH?$Xmvo9UwL{(nV+qTMLen^AH}q_l5sG<`I;=ilp#?Mbv<3&p-Qy?9k6@O1ma zWy}}1kgcpTBD$~<4LeM-TSQh4BfBlNLla+{U$&;548ODAhP0-LU9{c`)o(u211k}B zZqrbORwis9DK*qC%5vu+muOu5OYuX;b}R6o#CpY~k0yeAM%o{SRzhv%)inPA38Xe{ zAxFWHZam9LAuJorfx-giO?;jENE^G4g*r`>%pPq&!mnzHj3YLn1;kOHKQLI?QX=GS z7%B@hSIa`BSmUV9%UQ-zdpjn&-?LAx&t5c=r8}hVnrZdXb?NfI%>GKedt#$iTVt7I z5^6EKDty}nak1J6RRs$cT%clyvB(}V;>{ssN4<>!T+9{5aU$BVjY#C@a>$L009PdP zFn*r+rDT?#D)C*_v2h)=cX!adk?jdR%&!W>Q;>xO@ABuB5re=O@Vl4=*M{bH1U{b~ ztiT_z@Hs_9eW&xG`-ZV>@Ei45tc)+1B@mG)SFlGa@%`1=6a!HmF?s 
z6WclJ3Ho*w+3DpTAP>qx`Mu71@<;SI!St`naOtGuC9{8pto=1VOPZ8rE8RQ2-j-`e zt@`cV^*ReF6v)c17YqQyU}ua0f`1>!y?1(T(!m=Sa6+zf%yYOLaoeVO^v9)nlsjXQ zq#%{t4nrOYBNzjyJ#)|LnvYlUWLhqtCEU@MiWO&(6^`~C;|HJw@N>w=Kx?`@%a>gh z`uQs_^7DNTYP7kbC8o>Xy6I)dz}y1*@(&({t!q9h z@b`wc{{U$}nI^-S6aw3aIDNeVBRTt^bs*p$#xEQH0KrXu8hCHS_dW=Z#9B6)soKwR z45se%TQp`Ia~YXbhB*ay;PK7?uaf>D{{X>Iz8>0LPj7qU-5&Pgol+FH)@>$~d5i** zAa*UBoT$zS9r&+j0hZUsH_J|>qOAFpTTM2$^7CD~bmygng+~V}i9xj6y_J(&cX#=v z>VHBPUm5LE;p4Z0#yfeSWLad{B$nz~NS-iTZ^p_^G}u@86tBSxCE*g*;hLNKY;!R{kwF(iFa>n;YQSaPkE+! z52;#cmKS%j+Cy?cifL99-y~&-Oro~dAYfNz;t$%Z#CqdB9xDB!{AI8Bf@tT}NGQV7*ghfnpl$Xrey-ud0P3Ob*jJXi6OBA0EbUc?~NmRBNKm|Pqjsmc{ED;`^LDiq}O=i>hW zA8Nm}CB~`$00_9#V6xP%BD$5M^J9g)i&Gh%;L2b=WczSqW-B2XP+un>A$(f3_{HJf zK7C(VxoIMCi@5FQmg4U?`S8A40550S6=c?Nu^}=?|biSJFA~u`1AIhhQm>gC!Kt`9LOVUjEjBEm3c{Bp^C6lqH8P4e=)-tIPD ze|DZ{??3z{H#evegG&^MrV}0kSlDGl72_G;{bya^jmSdb&VB_X8b&wUPh>Ok!w$iY1O^&g1i@%<~F zyx6&uC*{fGwnstf&Uy4W&2F@1)Z~B(-S7Ofk3pK`_3M?kw|%5Geplpz52sJ?{LOvd z8T(1orv&-$9hH|v_Ez7o-{teQs(U(9lXgjK{{REK?Qiok{5RrTuMBwS!~P+)VzOyk z#g+OU#Yg%)s^k(5LgRungOl5n`)%;A#;|I-PNGsoX!UH~9H#QnUfcOo!dw<~dE}BC zpWWfq0s#40N8n>gx>l2O9OMJ(oE|=hzg*<{SKr^V*Ts9Qd)q5pyQtJyUs}U)8?a(* zE+a^V*2V_{+TaF~LWj#ol)8ZYPZ?x#%yPPQ>BcIq5W`tb@G2?SX*4hWqorQx&#I$4P!-`TWgyT7-fW)GR1Rj>R8-ZUx>zIMQZr0 zG(Nk-kc=(u@KtBe3#Tn^?<;&OcW%$8U!tz%M>d^FMY=eJ1qoYCY~8n4>G$6IA5dvF z_uc{cvu&YWA-A5xTo6dpJBvWpxAMM|dW#ycD{zk-E`*aMx7po6T#@=g@pdoxO}rA4 zC|k>W&xbmMwHSdP@x3V2R!#*Xq+u`pJ*~fD}m@QuJ-g%hJw#Mbfp0#gxnLc?f zr;Y>>5D=`Ao!kQcyYc@3!PdH7wXNt@E_|bDY2tk%;!lznv(#SkWsWO^0QsiZ7ILa> z22m#XhGTd6*WtliYaM67GesPc$M)T0NfH;gxce;dE6ZkMe3@C`iYthjTVqdcI_GmS zA8&)JLmig8N0ziXIa_yaS}p5)?RV1cT^~1zUdI^itvO!WX?-tsXSLUEosL)H8(Q1= zzgV`ifnn18L8Mw9zZv2 zyL9_ABIFl1Pcd3Y3V~xP%v)}1c$(#HCetmaK5X_dmea1ckYbJIw{vT9WFPXzC%Cj^ zY@2z5Oa>OOd_S{es_NQipi997myogEOT}w*A)3nW-2VU+B1eHzMGk^DknK1uJ8Dyn zJT*C~Udq=_?Z0YZ;6q~aQlS-sI?#HhDMSTb24fV7dTeSL>_ko*Q*O?Ypux}GZrCn+^ z38uZcHz8s~u(gpSe=%AK5px^39v3~>xzYSQ(OI+V`X%xpjaB#EX_n*%h9^?LX^MDD z8Sa=moUwINjCMX5zP<36k6w`^ZA(X%OO%CFM6&5~*=i0EB98)T;c280Lh8a;U}KG{ z0P^U{6&zci?DPS}p2H0!Jrmu;dx6Awv-eC;Uoks%uL&(!Tn8 z&2{K@VCJe(j<;@Z3$nGe*KV(M)5!Yb<3#WmhxN}6__s^5IxiW36l24}*@QYpGh8=lgA~pNYILY5xEdE108VrVJg+cL32us{i;sZye`vTSGD`u zuIKbs`zZV)wbuRDShaZqhu@ zaBgp{j(j(=BwWgUA@wf%tp7;6^K6})%f4I_Dyc8Ul4Mc%1)mn;hbyzURb08CUrne!`S z`TF?lW#YdXd`j_c#Fn?SLvelP$En2>k=WX9y<5FXS*B@-j9cl-kwj!r2$Cm41jtXy zapZ9LER!I?;a@F$MjbptUlWy=n$fH7?yS0p3qn|o-!jEkPubF{)K-lr&3!j}e79Hc z%={?vvguLylg5*~TETs&eTPZ1TQo`K`#QmGrbBWBpK6>X!&=-bw30<5TmgtgOrt0E zn%6G$&k1`!xXQ^6h7Sn4GOLe8B8oI+QQfgXc7nb^dmlA_)-gL3N zk~l|+vVI>Z!&Su8qlQU)IdaBOdQI%*b*}bJ+3$Xc`>btvQKvfJ%L^{MSuJ(jPxv=z z{(kSiBlw5Kej&DnU+sFm)9U^vlE%@lt~C!3- zgRNcHuBug&ap$^HeLijL^F24?N5{LFv^y__9}4Y%vOF!T-06BYrEpa=+s$IuIOqE= znLXTB9$mMGt>)B0kHgw*i29_(CceCA{@C{&Pwb!Yc3&BMO}z2_=BG30eiyg9)AY@L zOAK1es6nT=lUTBgY&Nz!Tr*_ou-vRu+z2$MGeta8$~nISd@UX)*R?+jX?A*KzBt#S zZ5HcQf8Kw>x$sTBrM#MyrwrTFr5%*aP;dSd1v(yqoQI0fw{U zI5kM8){UYpwi9@n)mu&Q>=SI3+4S!ZTT5ph&5eXIvKE0Zwp`BsSDtv2$DR`Sa=OQj zucA7KhCUlL)vkeUJC=vVpB5U~wF%^wRFIam@XgD^f2_xAD+@P{+}{Pfy0*8k%`vXM zY3$`EFL|pw>a_XaO&4Cr)KP?aV%xN1Wv7zsZT8pCd!BFmNqDkL4~c#Q_#>j<8<+4u zhV;E(#1LDQo-2+lZOR`HjQ_1_(MvP+wd zUt94l{{V<*lI+MYnj7s-cZST#BL`Q%ogjh*D55CrC&Czz?albb<7xb1uik2~e(OxL zz1MGGmnh$Cv+(VdT6DU>l(>F&V+EU}X^g^6?j06g#qu_PR074)Z#h@Zp%01&)TBSrgm zp?zl?NKPb=aWOO@v4|ZbY?Ilc`L~G)`J=*XpATwwl5APEEjI1)%6>+UsKk&C}Nn|HSMRi+-?wF}jf^Ff+v zETWchByB!H6Qd^f4Ye@wqK}l6i~;kIf19g|cWo-_Ey1_W$U^<(y0V)Rwht<=7gN9_ zu1`XJ*Y-U4>8bny_}Sra7{PNadQ0k7_8uX&E>=68GU8a{m59$WT0Sn3dEey)0}WoM 
z6lBrPa!UAEByi%~8vbH%t3q<4S_(BO&Q#^y+E%^q+TG9S=Q=fM#HirjxqHu5w)n>PL>9i8;#LCPRy!R&M~<`Ju}G0 zIURFVEH7F141&XML35mB`Gl~+2OS6F-n-2v;T76o-cUdY0I+t+;G^NWUYWrll5x#f zwD*)+jG~&g{$0PW=Ot++P1B9srmULzZkOuTi15v0_A&Sa6^rV6m%{%53ElXzNnYi( zY3%K;r@0qOe8F=h(LB@0oM5vuu1-5w&U&}(CHph#R}ZSU!#@M1?Yx1~LE-x#>2c<( zmMa_|GDa1+AcEUBSLqg|tH*n%E?uUc6$+A>iDe)huF^hkdgS1N&wBCe$BtH<+=PNc zu81S~70|cL!=@x<&ei~fj=PAjrOR__bl0%2I*^2vZ7On<8h1%u!KTuBTK4MNyJop# zod+n+Fp|>BNlB|&==arL>1ceF2kgJ$?+8NoehAPs-4b>Y+smlv+Ei9D0_dg#bv2|B z2+@HBP@SNF3i|oJi~BzK*TP>FZN4v9+{tm`-8W0U)2(hV%j+6?=~og$*EXMJx>@71 zn&?9su(ygoH)@wBIj_?n2zZ)HOJ~#XH#diFbx3YCSz@-1S4dHe9(%C_$@W;C-Y7v4 znKm<#gf;rZ;m?5IvuBTdDBdH`t|Pww(t;b4cV(-^ZE_^HNW<8rnn)tHkTe?zc-m0e zUBhv&zRK`b@zlMh7abVd^0d;EoKjoLdbE}LyI-!hGu|u9GOBUI=9wGC(yrs|r-!7h zm%5)k;^NiYPRZWM-skg~<6jwkc%JOoPM#RDiL{fbY?r~7)4Vp3qW=J;4dll!&mj%= zhyik-6~GNd+K0uv&jx8e9+C9B+qkVIh->iamKu88fbwP1+9-qqI90r`4qN3o8Og8I zFNj|byblM9HLncm_p1YHzA@5ts22AA3FI1b+G#fi#(8C5I?q$Od(BS5$3SZyVGAJabU2#F`QI|Og>XmjOQ51QP;yK^>z&Q*z1W9qa%&DhKdm!2ECfEPOq+(4z33 zv#4EN8#{ReXl%8iKCW8SrRNOq=PCk@ps1m z0NSTr@f@$A_H6oe|8mirbr(#P8&TKwgz^Z_}msA zb@3FbsPkH0^qsG|dUV##{(Ii&F*vyWSK3MP)QlWs(WtbY+wXVM&qLtPihd-X{{U3G zA<0IOlz?{tyN(7v?f~O~pVGVz?rCI{mL;-5Z1wifJ+goOcs0^^k5hZAndDhAjLcE7 zk+*jRi9LS33HIZV^GdGgZQjSP;&6IngTNlV*XsBzFR;VYii?jlV&we)0O7T_<+1tZ zBD0-oRK3&pcW&!V@9X$=wVR$%5*^$g+2fB-PtW=q@{beWwAT_v8v?t4DgY-r{PWZ6 z$JV=TYg39iCBt#I3IQ1obI_cB2qPR4Pu9Gn_qTU=ZVavRlY)1aTyyV(-?vT(udK=N zypol9tvOYj**zMy_36^u>%QMTn`5G(>>&O5ty1lEquJ}%r_Xxn8!#xqh9_!+#z_E< z{Qf@m+UZw1&x7vty=v8^w`-&fU0Knkgt!t!Kp+s|aKvDd{9P$@O$qfKW+)YRZoY;T zRs@FI6VD@zXMf6s05gy?z^|u%Df}`1&iGB?oo#I77Z6I~>OuFSwq{Wqfz@}AKvFOd zy?Pkj#}7{vhQUjgal}PhF3n2sX0~_fe_qFtg~4K7DwwKnQI#3{uX}CyC9Rc{etMr= z{?322&xL$Vq1yO2#@B79>kwRv8!NdbEpG5#8Jah}S2^Q1+s7PMYYjB~P_j6kA%SYw@N?ApFrb?Q; zGs|xzFCvDC*%~{gjLC22Ab4|3@W+Yu6obXlTU|$> z&cv_wpV_5XhesY^Xqxg!8_bFcHzn=5-P;F^(Hg!lIN_?fWm=g0x=B*SS>mv%-Nsgp z=9Y;jo4fnZrpt51il5zU2-ZfEeDc7|#t;|DJ_q>9_riWL@mGX3hG^~lMl4`U zY2-zYTeOwqwGp(e`FpEPaj^X`wWjHan433Dwh z^wC*qb=yyt+8%+dXzAnKDsK%eGt1)t01MbyUU)lJhy{ILOn4==*StA%AMch^e=VJ@ z&63WNOKGVzYi;|BiOKlWQqrw0E=APSw}*9@72RI7c4d@;G35te5y}JqiJcfzK=z1*3U&-46DX9t#=5f^!r}x*RSAyYWTOo z+MVRKw~eN~%wBxs&(l}?M$TxZ^Fe01wvEz2QbzsaN~S=-$+(P+?H5YaEp=@*k_&{p zg(6!=kz`l6v9^#bHl)02uP_nM=EET@tGDJQRE~-CZA3?_*;wD+Xp$Sdc_*=HWGP_J zGDh~y+hI`9-G@jcERNR27zY{0qUbPdrozr^;I}PhFtNOb=6QUZmNs(8MZD6doR!r^n4AtOc>AOXW{y5C>v zmvY8#t(@<=(O=Z!Yp+erYIWW`X4E;bffF`faf^=}FIvR?*Rq(e%$n@Z5G zEuUSc?K(sog6X05J^eg5m&wtl*P%vu29KpwwVmEaHfCy&R?Pf5~mwD>+I>pu@P zT?r+gK$cK=zsCB0v3|Dm-DRhO$4&69jI$3fb}uDHEFw}rKed!tziF=&O>;HBkzuIm zS2A5`@BWPmRaT!JmUZ7?NkQweh~G;$3Ph z@{?-9$4N~^92QdZD|;ZA(}S(OvOy>2j<_5EiEDV)A16^NRDl?>mS*W zZjoab4)EDoNYJA0j6`O;kR*k~WJ*vRXpCX{c-Vz&f*KN_$M{Dg_h;~PM@=1h%YspxOXft>E0yx zgRMr3Bt}5+&e~Pcmd1Hm8dXTvX&-k#xL@!{zk^Mse$n0#_@W?PE`J+*4e{=|;tc{f z^HWaLJWugU!@eQV;(4Tw35GOi8cSPXl1FcCD(`exfh+Mp#rg)jh(_ZYil&6QYj>3{WAXm!9Vm1 z&-ia&9%(vWk8`E`J@|R>k6G}RhFpp4E-&spP2ouFlG^eA0DnBTdQONp?koyj#~~qF z0bfOd=axGqr%ozyb)hLsrz~vlmq&GeuJ5MjojQ@HhQ_xhlcy-9clMy~x6i%y^FCJn zgMVxdf5-m-veEcMCDK`shTrg%=|*{Bia8|GHM^*_zYRxWYRoQmDAZoh71Z&*9_dyl z+JJ-cZ^z#qYWDV;j+dos7f|Y2#r?wG+*{qq*U4t{O!n~0v5ZmP->i^7?wU`q>CX%y znqwlTkAJ~DS5kr>jX&EdetcQs5A6>NTY@Hz!V6tK>RF(_wYF7=S;l6Dc;QG@b{GVP zNpQ@6w}&U)@Wz7(j|uLR5lP6EpE3BHIAO;epF?aIN0T@^TXpmi}ha_ zSZO{4wVP78GTv)5>N6$m(cE9(3p9^b)0%uJ`#sgOO=%p@aAS*SF@-`$&_5WwT-qnX zEf>RYJeNLuU1~Qr(Uegw#Mi?5_j|@941BR}@jzD+nErC4;CwUTd*2m!#pj;S!}DtT z%yTuRwwVpR?ev96D(sMayxZX>u|OS%B^g3{{WYDIbxgF z-HJ)1?CkY#ZC|0LZ&ognqXd^dlfPLcud`OuU%1}!G&(1Rd@-eHI&>3i*ZNJNZBFek 
[GIT binary patch payload omitted: several hundred lines of base85-encoded data; nothing human-readable survives.]
z7D)Fzx3_jM%M;tC=~P7=sCVr(HkbP-d_$K0_9xbS7DTrb&V{utb^Mv5fJlz2d^iCZ zP^^UQ1nvNk1$wuKKV^@I(CRi3Nv-%LEiCgi=?&Jj^4&`|$jq!|&{ag63dOkR5uhXH zX0A-acxv?hm$bB1u9RkzR@FauqITcUQ?mm~yed9;YbjkWXg*ftmY(zKo}QZ6`-9*% zr{OfX(R^{H+-jCAl1T@Mbf(iG)O2yBP7SR05Zy&5nzy=KGevzQ^TjK@u$3hl&3?lE z%bq0h2f)9KO`=7q->1Wm8`;8I=R~+m`x|{m?Zl93nnlgIk{cwswGiBxr2#*5@UBd% z55&LlPv6-L_>;ufehk(%%X>`<_fNQv?$*-o{{T`?vsZA?Ta^-{ z6>aJQ*Y3xIe`Aa9hJOa9)2;kXre6;V>i5<{>PP8g`v;5?ZX9jOh%}Ni1r@IRJUYw_m$s8aI^A++)1>8>0(9 zhk8}EwH2K5TxrhoPp8HrB=1=4@U0Xi7;Wz6XKmT@$Or^uHO+i+@kWi~4~&{$!_7L< z@(X_t{37xH0E*(0?h9*ld85=c9}U2)*7rAWB(||lBf`xR0ws2AJZKNoE yE-RW%MXz})#p;#XzMJZn$!p|bqgIx4hfP0a?c*86N-5deEok<;)unqkWB=I(= self.total_trials: - self.stop() - - has_told_study = [] - - for trial_idx in range(self.num_trials): - work_name = f"objective_work_{trial_idx}" - if work_name not in self.ws: - objective_work = ObjectiveWork( - script_path=self.script_path, - data_dir=self.data_dir, - cloud_compute=CloudCompute("cpu"), - ) - self.ws[work_name] = objective_work - if not self.ws[work_name].has_started: - trial = self._study.ask(ObjectiveWork.distributions()) - self.ws[work_name].run(trial_id=trial._trial_id, **trial.params) - - if self.ws[work_name].metric and not self.ws[work_name].has_told_study: - self.hi_plot.data.append({"x": -1 * self.ws[work_name].metric, **self.ws[work_name].params}) - self._study.tell(self.ws[work_name].trial_id, self.ws[work_name].metric) - self.ws[work_name].has_told_study = True - - has_told_study.append(self.ws[work_name].has_told_study) - - if all(has_told_study): - self.num_trials += self.simultaneous_trials - - -if __name__ == "__main__": - app = LightningApp( - RootHPOFlow( - script_path=str(Path(__file__).parent / "pl_script.py"), - data_dir="data/hymenoptera_data_version_0", - total_trials=6, - simultaneous_trials=2, - ) - ) diff --git a/examples/app/hpo/app_wo_ui.py b/examples/app/hpo/app_wo_ui.py deleted file mode 100644 index e20318347a1f7..0000000000000 --- a/examples/app/hpo/app_wo_ui.py +++ /dev/null @@ -1,57 +0,0 @@ -from pathlib import Path - -import optuna -from lightning.app import CloudCompute, LightningApp, LightningFlow -from lightning.app.structures import Dict -from objective import ObjectiveWork - - -class RootHPOFlow(LightningFlow): - def __init__(self, script_path, data_dir, total_trials, simultaneous_trials): - super().__init__() - self.script_path = script_path - self.data_dir = data_dir - self.total_trials = total_trials - self.simultaneous_trials = simultaneous_trials - self.num_trials = simultaneous_trials - self._study = optuna.create_study() - self.ws = Dict() - - def run(self): - if self.num_trials >= self.total_trials: - self.stop() - - has_told_study = [] - - for trial_idx in range(self.num_trials): - work_name = f"objective_work_{trial_idx}" - if work_name not in self.ws: - objective_work = ObjectiveWork( - script_path=self.script_path, - data_dir=self.data_dir, - cloud_compute=CloudCompute("cpu"), - ) - self.ws[work_name] = objective_work - if not self.ws[work_name].has_started: - trial = self._study.ask(ObjectiveWork.distributions()) - self.ws[work_name].run(trial_id=trial._trial_id, **trial.params) - - if self.ws[work_name].metric and not self.ws[work_name].has_told_study: - self._study.tell(self.ws[work_name].trial_id, self.ws[work_name].metric) - self.ws[work_name].has_told_study = True - - has_told_study.append(self.ws[work_name].has_told_study) - - if all(has_told_study): - self.num_trials += self.simultaneous_trials - - -if __name__ == "__main__": - app = LightningApp( - RootHPOFlow( - 
-            script_path=str(Path(__file__).parent / "pl_script.py"),
-            data_dir="data/hymenoptera_data_version_0",
-            total_trials=6,
-            simultaneous_trials=2,
-        )
-    )
diff --git a/examples/app/hpo/download_data.py b/examples/app/hpo/download_data.py
deleted file mode 100644
index d82b86a9dee95..0000000000000
--- a/examples/app/hpo/download_data.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from utils import download_data
-
-data_dir = "hymenoptera_data_version_0"
-download_url = f"https://pl-flash-data.s3.amazonaws.com/{data_dir}.zip"
-download_data(download_url, "./data")
diff --git a/examples/app/hpo/hyperplot.py b/examples/app/hpo/hyperplot.py
deleted file mode 100644
index 8ff238ce38985..0000000000000
--- a/examples/app/hpo/hyperplot.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from lightning.app import LightningFlow
-from lightning.app.frontend import StreamlitFrontend
-from lightning.app.utilities.state import AppState
-
-
-class HiPlotFlow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.data = []
-
-    def run(self):
-        pass
-
-    def configure_layout(self):
-        return StreamlitFrontend(render_fn=render_fn)
-
-
-def render_fn(state: AppState):
-    import json
-
-    import hiplot as hip
-    import streamlit as st
-    from streamlit_autorefresh import st_autorefresh
-
-    st.set_page_config(layout="wide")
-    st_autorefresh(interval=1000, limit=None, key="refresh")
-
-    if not state.data:
-        st.write("No data available yet ! Stay tuned")
-        return
-
-    xp = hip.Experiment.from_iterable(state.data)
-    ret_val = xp.to_streamlit(ret="selected_uids", key="hip").display()
-    st.markdown("hiplot returned " + json.dumps(ret_val))
diff --git a/examples/app/hpo/objective.py b/examples/app/hpo/objective.py
deleted file mode 100644
index e320b66217db1..0000000000000
--- a/examples/app/hpo/objective.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import os
-import tempfile
-from datetime import datetime
-from typing import Optional
-
-import pandas as pd
-import torch
-from lightning.app import CloudCompute
-from lightning.app.components import TracerPythonScript
-from optuna.distributions import CategoricalDistribution, LogUniformDistribution
-from torchmetrics import Accuracy
-
-
-class ObjectiveWork(TracerPythonScript):
-    def __init__(self, script_path: str, data_dir: str, cloud_compute: Optional[CloudCompute]):
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        tmpdir = tempfile.TemporaryDirectory().name
-        submission_path = os.path.join(tmpdir, f"{timestamp}.csv")
-        best_model_path = os.path.join(tmpdir, f"{timestamp}.model.pt")
-        super().__init__(
-            script_path,
-            script_args=[
-                f"--train_data_path={data_dir}/train",
-                f"--test_data_path={data_dir}/test",
-                f"--submission_path={submission_path}",
-                f"--best_model_path={best_model_path}",
-            ],
-            cloud_compute=cloud_compute,
-        )
-        self.data_dir = data_dir
-        self.best_model_path = best_model_path
-        self.submission_path = submission_path
-        self.metric = None
-        self.trial_id = None
-        self.metric = None
-        self.params = None
-        self.has_told_study = False
-
-    def run(self, trial_id: int, **params):
-        self.trial_id = trial_id
-        self.params = params
-        self.script_args.extend([f"--{k}={v}" for k, v in params.items()])
-        super().run()
-        self.compute_metric()
-
-    def _to_labels(self, path: str):
-        return torch.from_numpy(pd.read_csv(path).label.values)
-
-    def compute_metric(self):
-        self.metric = -1 * float(
-            Accuracy(task="binary")(
-                self._to_labels(self.submission_path),
-                self._to_labels(f"{self.data_dir}/ground_truth.csv"),
-            )
-        )
-
-    @staticmethod
-    def distributions():
-        return {
-            "backbone": CategoricalDistribution(["resnet18", "resnet34"]),
-            "learning_rate": LogUniformDistribution(0.0001, 0.1),
-        }
diff --git a/examples/app/hpo/pl_script.py b/examples/app/hpo/pl_script.py
deleted file mode 100644
index bbc453798431a..0000000000000
--- a/examples/app/hpo/pl_script.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import argparse
-import os
-
-import pandas as pd
-import torch
-from flash import Trainer
-from flash.image import ImageClassificationData, ImageClassifier
-
-# Parse arguments provided by the Work.
-parser = argparse.ArgumentParser()
-parser.add_argument("--train_data_path", type=str, required=True)
-parser.add_argument("--submission_path", type=str, required=True)
-parser.add_argument("--test_data_path", type=str, required=True)
-parser.add_argument("--best_model_path", type=str, required=True)
-# Optional
-parser.add_argument("--backbone", type=str, default="resnet18")
-parser.add_argument("--learning_rate", type=float, default=0.01)
-args = parser.parse_args()
-
-
-datamodule = ImageClassificationData.from_folders(
-    train_folder=args.train_data_path,
-    batch_size=8,
-)
-
-model = ImageClassifier(datamodule.num_classes, backbone=args.backbone)
-trainer = Trainer(fast_dev_run=True)
-trainer.fit(model, datamodule=datamodule)
-trainer.save_checkpoint(args.best_model_path)
-
-datamodule = ImageClassificationData.from_folders(
-    predict_folder=args.test_data_path,
-    batch_size=8,
-)
-
-predictions = Trainer().predict(model, datamodule=datamodule)
-submission_data = [
-    {"filename": os.path.basename(p["metadata"]["filepath"]), "label": torch.argmax(p["preds"]).item()}
-    for batch in predictions
-    for p in batch
-]
-df = pd.DataFrame(submission_data)
-df.to_csv(args.submission_path, index=False)
diff --git a/examples/app/hpo/requirements.txt b/examples/app/hpo/requirements.txt
deleted file mode 100644
index bd85880da2237..0000000000000
--- a/examples/app/hpo/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-optuna
-lightning-flash[image,serve] == 0.7.0
-hiplot
diff --git a/examples/app/hpo/utils.py b/examples/app/hpo/utils.py
deleted file mode 100644
index a07ae73f8fd3e..0000000000000
--- a/examples/app/hpo/utils.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-import os.path
-import tarfile
-import zipfile
-
-import requests
-
-
-def download_data(url: str, path: str = "data/", verbose: bool = False) -> None:
-    """Download file with progressbar.
-
-    # Code taken from: https://gist.github.com/ruxi/5d6803c116ec1130d484a4ab8c00c603
-    # __author__ = "github.com/ruxi"
-    # __license__ = "MIT"
-
-    Usage:
-        download_file('http://web4host.net/5MB.zip')
-
-    """
-    if url == "NEED_TO_BE_CREATED":
-        raise NotImplementedError
-
-    if not os.path.exists(path):
-        os.makedirs(path)
-    local_filename = os.path.join(path, url.split("/")[-1])
-    r = requests.get(url, stream=True, verify=False)
-    file_size = int(r.headers["Content-Length"]) if "Content-Length" in r.headers else 0
-    chunk_size = 1024
-    num_bars = int(file_size / chunk_size)
-    if verbose:
-        print({"file_size": file_size})
-        print({"num_bars": num_bars})
-
-    if not os.path.exists(local_filename):
-        with open(local_filename, "wb") as fp:
-            for chunk in r.iter_content(chunk_size=chunk_size):
-                fp.write(chunk)  # type: ignore
-
-    def extract_tarfile(file_path: str, extract_path: str, mode: str):
-        if os.path.exists(file_path):
-            with tarfile.open(file_path, mode=mode) as tar_ref:
-                for member in tar_ref.getmembers():
-                    try:
-                        tar_ref.extract(member, path=extract_path, set_attrs=False)
-                    except PermissionError:
-                        raise PermissionError(f"Could not extract tar file {file_path}")
-
-    if ".zip" in local_filename:
-        if os.path.exists(local_filename):
-            with zipfile.ZipFile(local_filename, "r") as zip_ref:
-                zip_ref.extractall(path)  # noqa: S202
-    elif local_filename.endswith(".tar.gz") or local_filename.endswith(".tgz"):
-        extract_tarfile(local_filename, path, "r:gz")
-    elif local_filename.endswith(".tar.bz2") or local_filename.endswith(".tbz"):
-        extract_tarfile(local_filename, path, "r:bz2")
diff --git a/examples/app/installation_commands/app.py b/examples/app/installation_commands/app.py
deleted file mode 100644
index 526fcfef64413..0000000000000
--- a/examples/app/installation_commands/app.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# EXAMPLE COMPONENT: RUN A SCRIPT
-# app.py
-# !echo "I am installing a dependency not declared in a requirements file"
-# !pip install lmdb
-import lmdb
-from lightning.app import CloudCompute, LightningApp, LightningFlow, LightningWork
-
-
-class YourComponent(LightningWork):
-    def run(self):
-        print(lmdb.version())
-        print("lmdb successfully installed")
-        print("Accessing a module in a Work or Flow body works!")
-
-
-class RootFlow(LightningFlow):
-    def __init__(self, work):
-        super().__init__()
-        self.work = work
-
-    def run(self):
-        self.work.run()
-
-
-print(f"Accessing an object in main code body works!: version = {lmdb.version()}")
-
-
-# run on a cloud machine
-compute = CloudCompute("cpu")
-worker = YourComponent(cloud_compute=compute)
-app = LightningApp(RootFlow(worker))
diff --git a/examples/app/interruptible/app.py b/examples/app/interruptible/app.py
deleted file mode 100644
index a44fcf4dca3ed..0000000000000
--- a/examples/app/interruptible/app.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from time import sleep
-
-from lightning.app import CloudCompute, LightningApp, LightningFlow, LightningWork
-
-
-class Work(LightningWork):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.counter = 0
-
-    def run(self):
-        while True:
-            print(self.counter)
-            self.counter += 1
-            sleep(1)
-
-
-class Flow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.w = Work(
-            cloud_compute=CloudCompute("gpu", interruptible=True),
-            start_with_flow=False,
-            parallel=True,
-        )
-
-    def run(self):
-        self.w.run()
-        print(self.w.counter)
-
-
-app = LightningApp(Flow())
diff --git a/examples/app/justpy/app.py b/examples/app/justpy/app.py
deleted file mode 100644
index a4c9abc4cda1d..0000000000000
--- a/examples/app/justpy/app.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from typing import Callable
-
-from lightning import LightningApp, LightningFlow
-from lightning.app.frontend import JustPyFrontend
-
-
-class Flow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.counter = 0
-
-    def run(self):
-        print(self.counter)
-
-    def configure_layout(self):
-        return JustPyFrontend(render_fn=render_fn)
-
-
-def render_fn(get_state: Callable) -> Callable:
-    import justpy as jp
-
-    def webpage():
-        wp = jp.QuasarPage(dark=True)
-        d = jp.Div(classes="q-pa-md q-gutter-sm", a=wp)
-        container = jp.QBtn(color="primary", text="Counter: 0")
-
-        async def click(*_):
-            state = get_state()
-            state.counter += 1
-            container.text = f"Counter: {state.counter}"
-
-        button = jp.QBtn(color="primary", text="Click Me!", click=click)
-
-        d.add(button)
-        d.add(container)
-
-        return wp
-
-    return webpage
-
-
-app = LightningApp(Flow())
diff --git a/examples/app/justpy/requirements.txt b/examples/app/justpy/requirements.txt
deleted file mode 100644
index 5f69409a4e4bb..0000000000000
--- a/examples/app/justpy/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-justpy
diff --git a/examples/app/layout/app.py b/examples/app/layout/app.py
deleted file mode 100644
index c57ab2eff78e7..0000000000000
--- a/examples/app/layout/app.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""An example showcasing how `configure_layout` can be used to nest user interfaces of different flows.
-
-Run the app:
-
-lightning run app examples/layout/demo.py
-
-This starts one server for each flow that returns a UI. Access the UI at the link printed in the terminal.
-
-"""
-
-import os
-from time import sleep
-
-from lightning.app import LightningApp, LightningFlow
-from lightning.app.frontend import StaticWebFrontend, StreamlitFrontend
-
-
-class C11(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.message = "Hello Streamlit!"
-
-    def run(self):
-        pass
-
-    def configure_layout(self):
-        return StreamlitFrontend(render_fn=render_c11)
-
-
-def render_c11(state):
-    import streamlit as st
-
-    st.write(state.message)
-
-
-class C21(LightningFlow):
-    def __init__(self):
-        super().__init__()
-
-    def run(self):
-        pass
-
-    def configure_layout(self):
-        return StaticWebFrontend(os.path.join(os.path.dirname(__file__), "ui1"))
-
-
-class C22(LightningFlow):
-    def __init__(self):
-        super().__init__()
-
-    def run(self):
-        pass
-
-    def configure_layout(self):
-        return StaticWebFrontend(os.path.join(os.path.dirname(__file__), "ui2"))
-
-
-class C1(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.c11 = C11()
-
-    def run(self):
-        pass
-
-
-class C2(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.c21 = C21()
-        self.c22 = C22()
-
-    def run(self):
-        pass
-
-    def configure_layout(self):
-        return [
-            {"name": "one", "content": self.c21},
-            {"name": "two", "content": self.c22},
-        ]
-
-
-class Root(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.c1 = C1()
-        self.c2 = C2()
-
-    def run(self):
-        sleep(10)
-        self.stop("Layout End")
-
-    def configure_layout(self):
-        return [
-            {"name": "one", "content": self.c1.c11},
-            {"name": "two", "content": self.c2},
-            {"name": "three", "content": "https://lightning.ai"},
-        ]
-
-
-app = LightningApp(Root())
diff --git a/examples/app/layout/requirements.txt b/examples/app/layout/requirements.txt
deleted file mode 100644
index 12a4706528df6..0000000000000
--- a/examples/app/layout/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-streamlit
diff --git a/examples/app/layout/ui1/index.html b/examples/app/layout/ui1/index.html
deleted file mode 100644
index 11668dee1b911..0000000000000
--- a/examples/app/layout/ui1/index.html
+++ /dev/null
@@ -1,10 +0,0 @@
[10 deleted lines of minimal HTML; the markup was stripped during extraction, leaving only the page title "One" and body text "One"]
diff --git a/examples/app/layout/ui2/index.html b/examples/app/layout/ui2/index.html
deleted file mode 100644
index 7398be1f7630d..0000000000000
--- a/examples/app/layout/ui2/index.html
+++ /dev/null
@@ -1,10 +0,0 @@
[10 deleted lines of minimal HTML; the markup was stripped during extraction, leaving only the page title "Two" and body text "Two"]
diff --git a/examples/app/mount/app.py b/examples/app/mount/app.py
deleted file mode 100644
index b7e7c4df4746e..0000000000000
--- a/examples/app/mount/app.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import os
-
-from lightning.app import CloudCompute, LightningApp, LightningFlow, LightningWork
-from lightning.app.storage import Mount
-
-
-class Work(LightningWork):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def run(self):
-        files = os.listdir("/content/esRedditJson/")
-        for file in files:
-            print(file)
-        assert "esRedditJson1" in files
-
-
-class Flow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.work_1 = Work(
-            cloud_compute=CloudCompute(
-                mounts=Mount(
-                    source="s3://ryft-public-sample-data/esRedditJson/",
-                    mount_path="/content/esRedditJson/",
-                ),
-            )
-        )
-
-    def run(self):
-        self.work_1.run()
-
-
-app = LightningApp(Flow())
diff --git a/examples/app/multi_node/README.md b/examples/app/multi_node/README.md
deleted file mode 100644
index aef152444f4a4..0000000000000
--- a/examples/app/multi_node/README.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# Lightning & Multi Node Training
-
-Lightning supports makes multi-node training simple by providing a simple interface to orchestrate compute and data.
-
-## Multi Node with raw PyTorch
-
-You can run the multi-node raw PyTorch by running the following commands.
-
-Here is an example where you spawn your processes yourself.
-
-```bash
-lightning run app train_pytorch.py
-```
-
-or you can use the built-in component for it.
-
-```bash
-lightning run app train_pytorch_spawn.py
-```
-
-## Multi Node with raw PyTorch + Fabric
-
-You can run the multi-node raw PyTorch and Fabric by running the following commands.
-
-```bash
-lightning run app train_fabric.py
-```
-
-Using Fabric, you retain control over your loops while accessing in a minimal way all Lightning distributed strategies.
-
-## Multi Node with Lightning Trainer
-
-Lightning supports running Lightning Trainer from a script or within a Lightning Work.
-
-You can either run a script directly
-
-```bash
-lightning run app train_pl_script.py
-```
-
-or run your code within as a work.
-
-```bash
-lightning run app train_pl.py
-```
-
-## Multi Node with any frameworks
-
-```bash
-lightning run app train_any.py
-```
diff --git a/examples/app/multi_node/pl_boring_script.py b/examples/app/multi_node/pl_boring_script.py
deleted file mode 100644
index f14809354f405..0000000000000
--- a/examples/app/multi_node/pl_boring_script.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from lightning.pytorch import Trainer
-from lightning.pytorch.demos.boring_classes import BoringModel
-
-if __name__ == "__main__":
-    model = BoringModel()
-    trainer = Trainer(max_epochs=1)
-    trainer.fit(model)
diff --git a/examples/app/multi_node/requirements.txt b/examples/app/multi_node/requirements.txt
deleted file mode 100644
index 12c6d5d5eac2a..0000000000000
--- a/examples/app/multi_node/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-torch
diff --git a/examples/app/multi_node/train_any.py b/examples/app/multi_node/train_any.py
deleted file mode 100644
index b3c89ad534f43..0000000000000
--- a/examples/app/multi_node/train_any.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from lightning.app import CloudCompute, LightningApp, LightningWork
-from lightning.app.components import MultiNode
-
-
-class AnyDistributedComponent(LightningWork):
-    def run(
-        self,
-        main_address: str,
-        main_port: int,
-        num_nodes: int,
-        node_rank: int,
-    ):
-        print(f"ADD YOUR DISTRIBUTED CODE: {main_address} {main_port} {num_nodes} {node_rank}.")
-
-
-app = LightningApp(
-    MultiNode(
-        AnyDistributedComponent,
-        num_nodes=2,
-        cloud_compute=CloudCompute("gpu"),
-    )
-)
diff --git a/examples/app/multi_node/train_fabric.py b/examples/app/multi_node/train_fabric.py
deleted file mode 100644
index 2379c491f89aa..0000000000000
--- a/examples/app/multi_node/train_fabric.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import torch
-from lightning.app import CloudCompute, LightningApp, LightningWork
-from lightning.app.components import FabricMultiNode
-from lightning.fabric import Fabric
-
-
-class FabricPyTorchDistributed(LightningWork):
-    def run(self):
-        # 1. Prepare the model
-        model = torch.nn.Sequential(
-            torch.nn.Linear(1, 1),
-            torch.nn.ReLU(),
-            torch.nn.Linear(1, 1),
-        )
-
-        # 2. Create Fabric.
-        fabric = Fabric(strategy="ddp", precision="16-mixed")
-        model, optimizer = fabric.setup(model, torch.optim.SGD(model.parameters(), lr=0.01))
-        criterion = torch.nn.MSELoss()
-
-        # 3. Train the model for 1000 steps.
-        for step in range(1000):
-            model.zero_grad()
-            x = torch.tensor([0.8]).to(fabric.device)
-            target = torch.tensor([1.0]).to(fabric.device)
-            output = model(x)
-            loss = criterion(output, target)
-            print(f"global_rank: {fabric.global_rank} step: {step} loss: {loss}")
-            fabric.backward(loss)
-            optimizer.step()
-
-
-# 8 GPUs: (2 nodes of 4 x v100)
-app = LightningApp(
-    FabricMultiNode(
-        FabricPyTorchDistributed,
-        cloud_compute=CloudCompute("gpu-fast-multi"),  # 4 x V100
-        num_nodes=2,
-    )
-)
diff --git a/examples/app/multi_node/train_lt.py b/examples/app/multi_node/train_lt.py
deleted file mode 100644
index 23a2863e757c7..0000000000000
--- a/examples/app/multi_node/train_lt.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# app.py
-from lightning.app import CloudCompute, LightningApp, LightningWork
-from lightning.app.components import LightningTrainerMultiNode
-from lightning.pytorch import Trainer
-from lightning.pytorch.demos.boring_classes import BoringModel
-
-
-class LightningTrainerDistributed(LightningWork):
-    def run(self):
-        model = BoringModel()
-        trainer = Trainer(max_epochs=10, strategy="ddp")
-        trainer.fit(model)
-
-
-# 8 GPUs: (2 nodes of 4 x v100)
-component = LightningTrainerMultiNode(
-    LightningTrainerDistributed,
-    num_nodes=2,
-    cloud_compute=CloudCompute("gpu-fast-multi"),  # 4 x v100
-)
-app = LightningApp(component)
diff --git a/examples/app/multi_node/train_lt_script.py b/examples/app/multi_node/train_lt_script.py
deleted file mode 100644
index 7f89bc95e9b17..0000000000000
--- a/examples/app/multi_node/train_lt_script.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from lightning.app import CloudCompute, LightningApp
-from lightning.app.components import LightningTrainerScript
-
-# 8 GPUs: (2 nodes of 4 x v100)
-app = LightningApp(
-    LightningTrainerScript(
-        "pl_boring_script.py",
-        num_nodes=2,
-        cloud_compute=CloudCompute("gpu-fast-multi"),  # 4 x v100
-    ),
-)
diff --git a/examples/app/multi_node/train_pytorch.py b/examples/app/multi_node/train_pytorch.py
deleted file mode 100644
index a1c7fb8eac207..0000000000000
--- a/examples/app/multi_node/train_pytorch.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# app.py
-# ! pip install torch
-import torch
-from lightning.app import CloudCompute, LightningApp, LightningWork
-from lightning.app.components import MultiNode
-from torch.nn.parallel.distributed import DistributedDataParallel
-
-
-def distributed_train(local_rank: int, main_address: str, main_port: int, num_nodes: int, node_rank: int, nprocs: int):
-    # 1. SET UP DISTRIBUTED ENVIRONMENT
-    global_rank = local_rank + node_rank * nprocs
-    world_size = num_nodes * nprocs
-
-    if torch.distributed.is_available() and not torch.distributed.is_initialized():
-        torch.distributed.init_process_group(
-            "nccl" if torch.cuda.is_available() else "gloo",
-            rank=global_rank,
-            world_size=world_size,
-            init_method=f"tcp://{main_address}:{main_port}",
-        )
-
-    # 2. PREPARE DISTRIBUTED MODEL
-    model = torch.nn.Linear(32, 2)
-    device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-    model = DistributedDataParallel(model, device_ids=[local_rank] if torch.cuda.is_available() else None).to(device)
-
-    # 3. SETUP LOSS AND OPTIMIZER
-    criterion = torch.nn.MSELoss()
-    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-
-    # 4.TRAIN THE MODEL FOR 50 STEPS
-    for step in range(50):
-        model.zero_grad()
-        x = torch.randn(64, 32).to(device)
-        output = model(x)
-        loss = criterion(output, torch.ones_like(output))
-        print(f"global_rank: {global_rank} step: {step} loss: {loss}")
-        loss.backward()
-        optimizer.step()
-
-    # 5. VERIFY ALL COPIES OF THE MODEL HAVE THE SAME WEIGTHS AT END OF TRAINING
-    weight = model.module.weight.clone()
-    torch.distributed.all_reduce(weight)
-    assert torch.equal(model.module.weight, weight / world_size)
-
-    print("Multi Node Distributed Training Done!")
-
-
-class PyTorchDistributed(LightningWork):
-    def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int):
-        nprocs = torch.cuda.device_count() if torch.cuda.is_available() else 1
-        torch.multiprocessing.spawn(
-            distributed_train, args=(main_address, main_port, num_nodes, node_rank, nprocs), nprocs=nprocs
-        )
-
-
-# 8 GPUs: (2 nodes x 4 v 100)
-compute = CloudCompute("gpu-fast-multi")  # 4 x v100
-component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute)
-app = LightningApp(component)
diff --git a/examples/app/multi_node/train_pytorch_spawn.py b/examples/app/multi_node/train_pytorch_spawn.py
deleted file mode 100644
index 8febfe5dcf696..0000000000000
--- a/examples/app/multi_node/train_pytorch_spawn.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import torch
-from lightning.app import CloudCompute, LightningApp, LightningWork
-from lightning.app.components import PyTorchSpawnMultiNode
-from torch.nn.parallel.distributed import DistributedDataParallel
-
-
-class PyTorchDistributed(LightningWork):
-    def run(
-        self,
-        world_size: int,
-        node_rank: int,
-        global_rank: str,
-        local_rank: int,
-    ):
-        # 1. Prepare the model
-        model = torch.nn.Sequential(
-            torch.nn.Linear(1, 1),
-            torch.nn.ReLU(),
-            torch.nn.Linear(1, 1),
-        )
-
-        # 2. Setup distributed training
-        device = torch.device(f"cuda:{local_rank}") if torch.cuda.is_available() else torch.device("cpu")
-        model = DistributedDataParallel(
-            model.to(device), device_ids=[local_rank] if torch.cuda.is_available() else None
-        )
-
-        # 3. Prepare loss and optimizer
-        criterion = torch.nn.MSELoss()
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-
-        # 4. Train the model for 1000 steps.
-        for step in range(1000):
-            model.zero_grad()
-            x = torch.tensor([0.8]).to(device)
-            target = torch.tensor([1.0]).to(device)
-            output = model(x)
-            loss = criterion(output, target)
-            print(f"global_rank: {global_rank} step: {step} loss: {loss}")
-            loss.backward()
-            optimizer.step()
-
-
-# 8 GPUs: (2 nodes of 4 x v100)
-app = LightningApp(
-    PyTorchSpawnMultiNode(
-        PyTorchDistributed,
-        num_nodes=2,
-        cloud_compute=CloudCompute("gpu-fast-multi"),  # 4 x v100
-    )
-)
diff --git a/examples/app/payload/app.py b/examples/app/payload/app.py
deleted file mode 100644
index c92f589b088cd..0000000000000
--- a/examples/app/payload/app.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from lightning.app import LightningApp, LightningFlow, LightningWork
-from lightning.app.storage import Payload
-
-
-class SourceFileWriterWork(LightningWork):
-    def __init__(self):
-        super().__init__()
-        self.value = None
-
-    def run(self):
-        self.value = Payload(42)
-
-
-class DestinationWork(LightningWork):
-    def run(self, payload):
-        assert payload.value == 42
-
-
-class RootFlow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.src = SourceFileWriterWork()
-        self.dst = DestinationWork()
-
-    def run(self):
-        self.src.run()
-        self.dst.run(self.src.value)
-        self.stop("Application End!")
-
-
-app = LightningApp(RootFlow())
diff --git a/examples/app/pickle_or_not/app.py b/examples/app/pickle_or_not/app.py
deleted file mode 100644
index aa7c3b01323da..0000000000000
--- a/examples/app/pickle_or_not/app.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import logging
-
-from lightning.app import LightningApp, LightningFlow, LightningWork
-
-logger = logging.getLogger(__name__)
-
-
-class PickleChecker(LightningWork):
-    def run(self, pickle_image: bytes):
-        parsed = self.parse_image(pickle_image)
-        if parsed == b"it is a pickle":
-            return True
-        if parsed == b"it is not a pickle":
-            return False
-        raise Exception("Couldn't parse the image")
-
-    @staticmethod
-    def parse_image(image_str: bytes):
-        return image_str
-
-
-class Slack(LightningFlow):
-    def __init__(self):
-        super().__init__()
-
-    @staticmethod
-    def send_message(message):
-        logger.info(f"Sending message: {message}")
-
-    def run(self):
-        pass
-
-
-class RootComponent(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.pickle_checker = PickleChecker()
-        self.slack = Slack()
-        self.counter = 3
-
-    def run(self):
-        if self.counter > 0:
-            logger.info(f"Running the app {self.counter}")
-            image_str = b"it is not a pickle"
-            if self.pickle_checker.run(image_str):
-                self.slack.send_message("It's a pickle!")
-            else:
-                self.slack.send_message("It's not a pickle!")
-            self.counter -= 1
-        else:
-            self.stop("Pickle or Not End")
-
-
-app = LightningApp(RootComponent())
diff --git a/examples/app/pickle_or_not/requirements.txt b/examples/app/pickle_or_not/requirements.txt
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/examples/app/server/app.py b/examples/app/server/app.py
deleted file mode 100644
index 97030179ffa78..0000000000000
--- a/examples/app/server/app.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# !pip install torchvision pydantic
-import base64
-import io
-
-import torch
-import torchvision
-from lightning.app import CloudCompute, LightningApp
-from lightning.app.components.serve import Image as InputImage
-from lightning.app.components.serve import PythonServer
-from PIL import Image
-from pydantic import BaseModel
-
-
-class PyTorchServer(PythonServer):
-    def setup(self):
-        self._model = torchvision.models.resnet18(pretrained=True)
torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self._model.to(self._device) - - def predict(self, request): - image = base64.b64decode(request.image.encode("utf-8")) - image = Image.open(io.BytesIO(image)) - transforms = torchvision.transforms.Compose([ - torchvision.transforms.Resize(224), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), - ]) - image = transforms(image) - image = image.to(self._device) - prediction = self._model(image.unsqueeze(0)) - return {"prediction": prediction.argmax().item()} - - -class OutputData(BaseModel): - prediction: int - - -component = PyTorchServer(input_type=InputImage, output_type=OutputData, cloud_compute=CloudCompute("gpu")) -app = LightningApp(component) diff --git a/examples/app/server_with_auto_scaler/app.py b/examples/app/server_with_auto_scaler/app.py deleted file mode 100644 index 1320da6745fa6..0000000000000 --- a/examples/app/server_with_auto_scaler/app.py +++ /dev/null @@ -1,93 +0,0 @@ -# ! pip install torch torchvision -from typing import List - -import torch -import torchvision -from lightning.app import CloudCompute, LightningApp -from pydantic import BaseModel - - -class BatchRequestModel(BaseModel): - inputs: List[app.components.Image] - - -class BatchResponse(BaseModel): - outputs: List[app.components.Number] - - -class PyTorchServer(app.components.PythonServer): - def __init__(self, *args, **kwargs): - super().__init__( - input_type=BatchRequestModel, - output_type=BatchResponse, - *args, - **kwargs, - ) - - def setup(self): - if torch.cuda.is_available(): - self._device = torch.device("cuda:0") - elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): - self._device = torch.device("mps") - else: - self._device = torch.device("cpu") - self._model = torchvision.models.resnet18(pretrained=True).to(self._device) - - def predict(self, requests: BatchRequestModel): - transforms = torchvision.transforms.Compose([ - torchvision.transforms.Resize(224), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), - ]) - images = [] - for request in requests.inputs: - image = app.components.serve.types.image.Image.deserialize(request.image) - image = transforms(image).unsqueeze(0) - images.append(image) - images = torch.cat(images) - images = images.to(self._device) - predictions = self._model(images) - results = predictions.argmax(1).cpu().numpy().tolist() - return BatchResponse(outputs=[{"prediction": pred} for pred in results]) - - -class MyAutoScaler(app.components.AutoScaler): - def scale(self, replicas: int, metrics: dict) -> int: - pending_requests = metrics["pending_requests"] - active_or_pending_works = replicas + metrics["pending_works"] - - if active_or_pending_works == 0: - return 1 if pending_requests > 0 else 0 - - pending_requests_per_running_or_pending_work = pending_requests / active_or_pending_works - - # scale out if the number of pending requests exceeds max batch size. 
-        max_requests_per_work = self.max_batch_size
-        if pending_requests_per_running_or_pending_work >= max_requests_per_work:
-            return replicas + 1
-
-        # scale in if the number of pending requests is below 25% of max_requests_per_work
-        min_requests_per_work = max_requests_per_work * 0.25
-        if pending_requests_per_running_or_pending_work < min_requests_per_work:
-            return replicas - 1
-
-        return replicas
-
-
-app = LightningApp(
-    MyAutoScaler(
-        # work class and args
-        PyTorchServer,
-        cloud_compute=CloudCompute("gpu"),
-        # autoscaler specific args
-        min_replicas=1,
-        max_replicas=4,
-        scale_out_interval=10,
-        scale_in_interval=10,
-        endpoint="predict",
-        input_type=Image,
-        output_type=Number,
-        timeout_batching=1,
-        max_batch_size=8,
-    )
-)
diff --git a/examples/app/template_streamlit_ui/app.py b/examples/app/template_streamlit_ui/app.py
deleted file mode 100644
index 21a13036aa782..0000000000000
--- a/examples/app/template_streamlit_ui/app.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from lightning.app import LightningApp, LightningFlow
-from lightning.app.frontend import StreamlitFrontend
-from lightning.app.utilities.state import AppState
-
-
-class StreamlitUI(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.message_to_print = "Hello World!"
-        self.should_print = False
-
-    def configure_layout(self):
-        return StreamlitFrontend(render_fn=render_fn)
-
-
-def render_fn(state: AppState):
-    import streamlit as st
-
-    should_print = st.button("Should print to the terminal?")
-
-    if should_print:
-        state.should_print = not state.should_print
-
-    st.write("Currently printing." if state.should_print else "Currently waiting to print.")
-
-
-class HelloWorld(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        self.counter = 0
-        self.streamlit_ui = StreamlitUI()
-
-    def run(self):
-        self.streamlit_ui.run()
-        if self.streamlit_ui.should_print:
-            print(f"{self.counter}: {self.streamlit_ui.message_to_print}")
-            self.counter += 1
-            self.streamlit_ui.should_print = False
-
-    def configure_layout(self):
-        return [{"name": "StreamLitUI", "content": self.streamlit_ui}]
-
-
-app = LightningApp(HelloWorld())
diff --git a/examples/app/template_streamlit_ui/requirements.txt b/examples/app/template_streamlit_ui/requirements.txt
deleted file mode 100644
index 12a4706528df6..0000000000000
--- a/examples/app/template_streamlit_ui/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-streamlit
diff --git a/examples/app/v0/.gitignore b/examples/app/v0/.gitignore
deleted file mode 100644
index 186149fa056fe..0000000000000
--- a/examples/app/v0/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-.storage
-.lightning
diff --git a/examples/app/v0/README.md b/examples/app/v0/README.md
deleted file mode 100644
index 516283ae9cedd..0000000000000
--- a/examples/app/v0/README.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# v0 app
-
-This is a minimal, flow-only app.
-It is meant to demonstrate the basic functionality of the Lightning framework.
- -## Starting it - -Local - -```bash -lightning run app app.py -``` - -Cloud - -```bash -lightning run app app.py --cloud -``` diff --git a/examples/app/v0/app.py b/examples/app/v0/app.py deleted file mode 100644 index d1cbb41c6dc10..0000000000000 --- a/examples/app/v0/app.py +++ /dev/null @@ -1,49 +0,0 @@ -# v0_app.py -import os -from datetime import datetime -from time import sleep - -from lightning.app import LightningApp, LightningFlow -from lightning.app.frontend import StaticWebFrontend - - -class Word(LightningFlow): - def __init__(self, letter): - super().__init__() - self.letter = letter - self.repeats = letter - - def run(self): - self.repeats += self.letter - - def configure_layout(self): - return StaticWebFrontend(os.path.join(os.path.dirname(__file__), f"ui/{self.letter}")) - - -class V0App(LightningFlow): - def __init__(self): - super().__init__() - self.aas = Word("a") - self.bbs = Word("b") - self.counter = 0 - - def run(self): - now = datetime.now() - now = now.strftime("%H:%M:%S") - log = {"time": now, "a": self.aas.repeats, "b": self.bbs.repeats} - print(log) - self.aas.run() - self.bbs.run() - - sleep(2.0) - self.counter += 1 - - def configure_layout(self): - tab1 = {"name": "Tab_1", "content": self.aas} - tab2 = {"name": "Tab_2", "content": self.bbs} - tab3 = {"name": "Tab_3", "content": "https://tensorboard.dev/experiment/8m1aX0gcQ7aEmH0J7kbBtg/#scalars"} - - return [tab1, tab2, tab3] - - -app = LightningApp(V0App(), log_level="debug") diff --git a/examples/app/v0/emulate_ui.py b/examples/app/v0/emulate_ui.py deleted file mode 100644 index 1d42c1cdf4c52..0000000000000 --- a/examples/app/v0/emulate_ui.py +++ /dev/null @@ -1,18 +0,0 @@ -from time import sleep - -import requests -from lightning.app.utilities.state import headers_for - -headers = headers_for({}) -headers["X-Lightning-Type"] = "DEFAULT" - -res = requests.get("http://127.0.0.1:7501/state", headers=headers) - - -res = requests.post("http://127.0.0.1:7501/state", json={"stage": "running"}, headers=headers) -print(res) - -sleep(10) - -res = requests.post("http://127.0.0.1:7501/state", json={"stage": "stopping"}, headers=headers) -print(res) diff --git a/examples/app/v0/requirements.txt b/examples/app/v0/requirements.txt deleted file mode 100644 index edfce786a4d18..0000000000000 --- a/examples/app/v0/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -py diff --git a/examples/app/v0/ui/a/index.html b/examples/app/v0/ui/a/index.html deleted file mode 100644 index 6ddb9a5a1323c..0000000000000 --- a/examples/app/v0/ui/a/index.html +++ /dev/null @@ -1 +0,0 @@ -
Hello from component A
diff --git a/examples/app/v0/ui/b/index.html b/examples/app/v0/ui/b/index.html deleted file mode 100644 index 3bfd9e24cb7f7..0000000000000 --- a/examples/app/v0/ui/b/index.html +++ /dev/null @@ -1 +0,0 @@ -
Hello from component B
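(For readers skimming these removals: the two one-line pages above were the entire UI of the deleted `v0` example, mounted as tabs by the `Word` flows in `examples/app/v0/app.py` via `StaticWebFrontend`. The sketch below is a minimal, self-contained condensation of that pattern, not a file from this patch; it assumes the `ui/a` and `ui/b` folders sit next to the script, exactly as in the deleted example.)

```python
# Minimal sketch of the deleted v0 pattern: one flow per folder of static
# files, each surfaced as a tab in the app UI.
import os

from lightning.app import LightningApp, LightningFlow
from lightning.app.frontend import StaticWebFrontend


class StaticPage(LightningFlow):
    """Serves one folder of static files (here: a single index.html)."""

    def __init__(self, folder: str):
        super().__init__()
        self.folder = folder

    def run(self):
        pass

    def configure_layout(self):
        # StaticWebFrontend serves everything under the given directory.
        return StaticWebFrontend(os.path.join(os.path.dirname(__file__), self.folder))


class Root(LightningFlow):
    def __init__(self):
        super().__init__()
        self.page_a = StaticPage("ui/a")
        self.page_b = StaticPage("ui/b")

    def run(self):
        pass

    def configure_layout(self):
        # Each dict becomes one tab in the app UI.
        return [
            {"name": "Tab_A", "content": self.page_a},
            {"name": "Tab_B", "content": self.page_b},
        ]


app = LightningApp(Root())
```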
diff --git a/examples/app/works_on_default_machine/app_v2.py b/examples/app/works_on_default_machine/app_v2.py
deleted file mode 100644
index 191070041b866..0000000000000
--- a/examples/app/works_on_default_machine/app_v2.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from fastapi import FastAPI
-from fastapi.middleware.cors import CORSMiddleware
-from lightning import CloudCompute, LightningApp, LightningFlow, LightningWork
-from uvicorn import run
-
-
-class Work(LightningWork):
-    def __init__(self, **kwargs):
-        super().__init__(parallel=True, **kwargs)
-
-    def run(self):
-        fastapi_service = FastAPI()
-
-        fastapi_service.add_middleware(
-            CORSMiddleware,
-            allow_origins=["*"],
-            allow_credentials=True,
-            allow_methods=["*"],
-            allow_headers=["*"],
-        )
-
-        @fastapi_service.get("/")
-        def get_root():
-            return {"message": "Hello World!"}
-
-        run(fastapi_service, host=self.host, port=self.port)
-
-
-class Flow(LightningFlow):
-    def __init__(self):
-        super().__init__()
-        # In the cloud, all works defined without explicitly passing a CloudCompute object
-        # run on the default machine.
-        # This applies to `work_a`, `work_b` and the dynamically created `work_d`.
-
-        self.work_a = Work()
-        self.work_b = Work()
-
-        self.work_c = Work(cloud_compute=CloudCompute(name="cpu-small"))
-
-    def run(self):
-        if not hasattr(self, "work_d"):
-            self.work_d = Work()
-
-        for work in self.works():
-            work.run()
-
-    def configure_layout(self):
-        return [{"name": w.name, "content": w} for w in self.works()]
-
-
-app = LightningApp(Flow(), log_level="debug")
diff --git a/examples/app/works_on_default_machine/requirements.txt b/examples/app/works_on_default_machine/requirements.txt
deleted file mode 100644
index 12a4706528df6..0000000000000
--- a/examples/app/works_on_default_machine/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-streamlit
diff --git a/pyproject.toml b/pyproject.toml
index c24f27828fdd6..4189de8e9790a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -134,14 +134,7 @@ max-complexity = 10
 files = [
     "src/lightning",
 ]
-# This section is for folders with "-" as they are not valid python modules
-exclude = [
-    "src/lightning/app/cli/app-template",
-    "src/lightning/app/cli/component-template",
-    "src/lightning/app/cli/pl-app-template",
-    "src/lightning/app/cli/react-ui-template",
-    "src/lightning/app/launcher/utils.py",
-]
+
 install_types = "True"
 non_interactive = "True"
 disallow_untyped_defs = "True"
@@ -156,100 +149,6 @@ disable_error_code = "attr-defined"
 # style choices
 warn_no_return = "False"
 
-# Ignore mypy errors for these files
-# TODO: the goal is for this to be empty
-[[tool.mypy.overrides]]
-# the list can be generated with:
-# mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",'
-module = [
-    "lightning.app.api.http_methods",
-    "lightning.app.api.request_types",
-    "lightning.app.cli.cmd_install",
-    "lightning.app.cli.commands.app_commands",
-    "lightning.app.cli.commands.cd",
-    "lightning.app.cli.commands.cp",
-    "lightning.app.cli.commands.ls",
-    "lightning.app.cli.connect.app",
-    "lightning.app.components.database.client",
-    "lightning.app.components.database.server",
-    "lightning.app.components.database.utilities",
-    "lightning.app.components.multi_node.base",
-    "lightning.app.components.multi_node.fabric",
-    "lightning.app.components.multi_node.pytorch_spawn",
-    "lightning.app.components.multi_node.trainer",
-    "lightning.app.components.python.popen",
-    "lightning.app.components.python.tracer",
-
"lightning.app.components.serve.auto_scaler", - "lightning.app.components.serve.gradio_server", - "lightning.app.components.serve.python_server", - "lightning.app.components.serve.serve", - "lightning.app.components.serve.streamlit", - "lightning.app.components.serve.types.image", - "lightning.app.components.serve.types.type", - "lightning.app.components.training", - "lightning.app.frontend.panel.app_state_comm", - "lightning.app.frontend.panel.app_state_watcher", - "lightning.app.frontend.panel.panel_frontend", - "lightning.app.frontend.panel.panel_serve_render_fn", - "lightning.app.frontend.streamlit_base", - "lightning.app.frontend.stream_lit", - "lightning.app.frontend.utils", - "lightning.app.frontend.web", - "lightning.app.launcher.launcher", - "lightning.app.launcher.lightning_backend", - "lightning.app.launcher.lightning_hybrid_backend", - "lightning.app.pdb.pdb", - "lightning.app.runners.backends.backend", - "lightning.app.runners.backends.cloud", - "lightning.app.runners.backends.docker", - "lightning.app.runners.backends.mp_process", - "lightning.app.runners.cloud", - "lightning.app.runners.multiprocess", - "lightning.app.runners.runtime", - "lightning.app.source_code.copytree", - "lightning.app.source_code.hashing", - "lightning.app.source_code.local", - "lightning.app.source_code.tar", - "lightning.app.source_code.uploader", - "lightning.app.storage.copier", - "lightning.app.storage.drive", - "lightning.app.storage.filesystem", - "lightning.app.storage.orchestrator", - "lightning.app.storage.path", - "lightning.app.storage.payload", - "lightning.app.structures.dict", - "lightning.app.structures.list", - "lightning.app.testing.helpers", - "lightning.app.testing.testing", - "lightning.app.utilities.app_helpers", - "lightning.app.utilities.app_logs", - "lightning.app.utilities.cli_helpers", - "lightning.app.utilities.cloud", - "lightning.app.utilities.commands.base", - "lightning.app.utilities.component", - "lightning.app.utilities.enum", - "lightning.app.utilities.exceptions", - "lightning.app.utilities.git", - "lightning.app.utilities.imports", - "lightning.app.utilities.introspection", - "lightning.app.utilities.layout", - "lightning.app.utilities.load_app", - "lightning.app.utilities.log_helpers", - "lightning.app.utilities.login", - "lightning.app.utilities.name_generator", - "lightning.app.utilities.network", - "lightning.app.utilities.openapi", - "lightning.app.utilities.packaging.cloud_compute", - "lightning.app.utilities.packaging.lightning_utils", - "lightning.app.utilities.proxies", - "lightning.app.utilities.scheduler", - "lightning.app.utilities.state", - "lightning.app.utilities.tracer", - "lightning.app.utilities.tree", - "lightning.store.utils", -] -ignore_errors = "True" - [tool.coverage.report] exclude_lines = [ diff --git a/requirements.txt b/requirements.txt index bcb63693fbbe3..4910c7fbe7fc0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ # the default package dependencies --r ./requirements/app/app.txt -r ./requirements/fabric/base.txt -r ./requirements/pytorch/base.txt diff --git a/requirements/app/app.txt b/requirements/app/app.txt deleted file mode 100644 index 25c9bb893fe60..0000000000000 --- a/requirements/app/app.txt +++ /dev/null @@ -1,31 +0,0 @@ -lightning-cloud == 0.5.70 # Must be pinned to ensure compatibility -packaging -typing-extensions >=4.4.0, <4.10.0 -deepdiff >=5.7.0, <6.6.0 -fsspec[http] >=2022.5.0, <2023.11.0 -croniter >=1.3.0, <1.5.0 # strict; TODO: for now until we find something more robust. 
-traitlets >=5.3.0, <5.12.0
-arrow >=1.2.0, <1.3.0
-lightning-utilities >=0.10.0, <0.12.0
-beautifulsoup4 >=4.8.0, <4.13.0
-inquirer >=2.10.0, <3.2.0
-psutil <5.9.6
-click <8.2
-python-multipart >=0.0.5, <=0.0.6
-backoff >=2.2.1, <2.3.0
-
-fastapi >=0.92.0, <0.104.0
-starlette # https://fastapi.tiangolo.com/deployment/versions/#about-starlette
-pydantic >=1.7.4 # https://fastapi.tiangolo.com/deployment/versions/#about-pydantic
-
-dateutils <0.8.0
-Jinja2 <3.2.0
-PyYAML <=6.0.1
-requests <2.32.0
-rich >=12.3.0, <13.6.0
-urllib3 <2.0.0
-uvicorn <0.24.0
-websocket-client <1.7.0
-websockets <11.1.0
-numpy >=1.17.2, <2.0
-msgpack
diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt
deleted file mode 100644
index ad5d2d583d17f..0000000000000
--- a/requirements/app/cloud.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-redis >=4.0.1, <5.1.0
-docker >=5.0.0, <6.1.4
-s3fs >=2022.5.0, <2023.6.1
-# setuptools==59.5.0
diff --git a/requirements/app/components.txt b/requirements/app/components.txt
deleted file mode 100644
index 78509b6b0269e..0000000000000
--- a/requirements/app/components.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# deps required by components in the lightning app repository (src/lightning/app/components)
-lightning_api_access >=0.0.3 # serve
-aiohttp >=3.8.0, <3.9.0 # auto_scaler
-lightning-fabric >=1.9.0 # multinode
-pytorch-lightning >=1.9.0 # multinode
diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt
deleted file mode 100644
index f2db5000b9113..0000000000000
--- a/requirements/app/docs.txt
+++ /dev/null
@@ -1 +0,0 @@
--r ../docs.txt
diff --git a/requirements/app/test.txt b/requirements/app/test.txt
deleted file mode 100644
index fd9629649c89b..0000000000000
--- a/requirements/app/test.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-coverage ==7.3.1
-pytest ==7.4.0
-pytest-timeout ==2.1.0
-pytest-cov ==4.1.0
-pytest-doctestplus ==1.0.0
-pytest-asyncio ==0.21.1
-# pytest-random-order ==1.1.0
-pytest-rerunfailures ==12.0
-pytest-xdist ==3.3.1
-
-playwright ==1.38.0
-httpx ==0.25.0
-trio <0.22.0 # strict https://github.com/python-trio/trio/pull/2213
-pympler
-psutil <5.10.0
-setuptools <68.3.0
-requests-mock ==1.11.0
-pandas
diff --git a/requirements/app/ui.txt b/requirements/app/ui.txt
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/setup.py b/setup.py
index 698eaa5abe71e..bfc329bb8fe88 100755
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 
 There are three main scenarios considered for installing this project:
 
-1. Using PyPI registry when you can install `pytorch-lightning`, `lightning-app`, etc. or `lightning` for all.
+1. Using PyPI registry when you can install `pytorch-lightning`, etc. or `lightning` for all.
 
 2. Installation from source code after cloning repository.
    In such a case we recommend using the command `pip install .` or `pip install -e .` for a development version
@@ -26,12 +26,11 @@
 
    - for `pytorch-lightning` use `export PACKAGE_NAME=pytorch ; pip install .`
    - for `lightning-fabric` use `export PACKAGE_NAME=fabric ; pip install .`
-   - for `lightning-app` use `export PACKAGE_NAME=app ; pip install .`
 
 3. Building packages as sdist or binary wheel and installing or publishing to PyPI afterwards you use the command
    `python setup.py sdist` or `python setup.py bdist_wheel` accordingly.
    In case you want to build just a particular package you can set an environment variable:
-   `PACKAGE_NAME=lightning|pytorch|app|fabric python setup.py sdist|bdist_wheel`
+   `PACKAGE_NAME=lightning|pytorch|fabric python setup.py sdist|bdist_wheel`
 
4. Automated releasing with GitHub actions, a natural extension of 3), is composed of three consecutive steps:
    a) determine which packages shall be released based on version increment in `__version__.py` and eventually

@@ -57,7 +56,6 @@
 _PACKAGE_MAPPING = {
     "lightning": "lightning",
     "pytorch": "pytorch_lightning",
-    "app": "lightning_app",
     "fabric": "lightning_fabric",
 }
 # https://packaging.python.org/guides/single-sourcing-package-version/
diff --git a/src/app-ui-version.info b/src/app-ui-version.info
deleted file mode 100644
index ae39fab35ff1f..0000000000000
--- a/src/app-ui-version.info
+++ /dev/null
@@ -1 +0,0 @@
-v0.0.0
diff --git a/src/lightning/__init__.py b/src/lightning/__init__.py
index c191334d2c218..1b054ed6715f7 100644
--- a/src/lightning/__init__.py
+++ b/src/lightning/__init__.py
@@ -1,7 +1,6 @@
 """Root package info."""
 
 import logging
-import sys
 
 # explicitly don't set root logger's propagation and leave this to subpackages to manage
 _logger = logging.getLogger(__name__)
@@ -31,19 +30,3 @@
     "Fabric",
     "__version__",
 ]
-
-
-def _cli_entry_point() -> None:
-    from lightning_utilities.core.imports import ModuleAvailableCache, RequirementCache
-
-    if not (
-        ModuleAvailableCache("lightning.app")
-        if RequirementCache("lightning-utilities<0.10.0")
-        else RequirementCache(module="lightning.app")
-    ):
-        print("The `lightning` command requires additional dependencies: `pip install lightning[app]`")
-        sys.exit(1)
-
-    from lightning.app.cli.lightning_cli import main
-
-    main()
diff --git a/src/lightning/__main__.py b/src/lightning/__main__.py
deleted file mode 100644
index 57b27ab968c82..0000000000000
--- a/src/lightning/__main__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from lightning.app.cli.lightning_cli import main
-
-if __name__ == "__main__":
-    main()
diff --git a/src/lightning/__setup__.py b/src/lightning/__setup__.py
index 81eae48545180..4bc4dff23be50 100644
--- a/src/lightning/__setup__.py
+++ b/src/lightning/__setup__.py
@@ -45,12 +45,8 @@ def _prepare_extras() -> Dict[str, Any]:
     extras["fabric-dev"] = extras["fabric-all"] + extras["fabric-test"]
     extras["pytorch-all"] = extras["pytorch-extra"] + extras["pytorch-strategies"] + extras["pytorch-examples"]
     extras["pytorch-dev"] = extras["pytorch-all"] + extras["pytorch-test"]
-    extras["app-extra"] = extras["app-app"] + extras["app-cloud"] + extras["app-ui"] + extras["app-components"]
-    extras["app-all"] = extras["app-extra"]
-    extras["app-dev"] = extras["app-all"] + extras["app-test"]
-    extras["store-store"] = extras["app-app"]  # todo: consider cutting/leaning this dependency
-    # merge per-project extras of the same category, e.g. `app-test` + `fabric-test`
+    # merge per-project extras of the same category
     for extra in list(extras):
         name = "-".join(extra.split("-")[1:])
         extras[name] = extras.get(name, []) + extras[extra]
@@ -74,17 +70,6 @@ def _setup_args() -> Dict[str, Any]:
         _PROJECT_ROOT, homepage=about.__homepage__, version=version.version
     )
 
-    # TODO: remove this once lightning-ui package is ready as a dependency
-    ui_ver_file = os.path.join(_SOURCE_ROOT, "app-ui-version.info")
-    if os.path.isfile(ui_ver_file):
-        with open(ui_ver_file, encoding="utf-8") as fo:
-            ui_version = fo.readlines()[0].strip()
-        download_fe_version = {"version": ui_version}
-    else:
-        print(f"Missing file with FE version: {ui_ver_file}")
-        download_fe_version = {}
-    _ASSISTANT._download_frontend(os.path.join(_PACKAGE_ROOT, "app"), **download_fe_version)
-
     # TODO: consider invalidating some additional arguments from packages, for example if include data or safe to zip
     install_requires = _ASSISTANT.load_requirements(
@@ -114,7 +99,6 @@
             "console_scripts": [
                 "fabric = lightning.fabric.cli:_main",
                 "lightning = lightning.fabric.cli:_legacy_main",
-                "lightning_app = lightning:_cli_entry_point",
             ],
         },
         "setup_requires": [],
diff --git a/src/lightning/app/CHANGELOG.md b/src/lightning/app/CHANGELOG.md
deleted file mode 100644
index d09c302b22067..0000000000000
--- a/src/lightning/app/CHANGELOG.md
+++ /dev/null
@@ -1,608 +0,0 @@
-# Changelog
-
-All notable changes to this project will be documented in this file.
-
-The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
-
-## [2.2.0] - 2024-02-09
-
-### Changed
-
-- Renamed the `lightning` CLI to `lightning_app` ([#19440](https://github.com/Lightning-AI/pytorch-lightning/pull/19440))
-
-
-## [2.1.4] - 2024-01-31
-
-### Changed
-
-- Remove torch distributed for the Dataset Optimizer ([#19182](https://github.com/Lightning-AI/lightning/pull/19182))
-
-
-## [2.1.3] - 2023-12-21
-
-### Changed
-
-- Lightning App: Use the batch get endpoint ([#19180](https://github.com/Lightning-AI/lightning/pull/19180))
-- Drop starsessions from App's requirements ([#18470](https://github.com/Lightning-AI/lightning/pull/18470))
-- Optimize loading time for chunks to be there ([#19109](https://github.com/Lightning-AI/lightning/pull/19109))
-
-
-## [2.1.2] - 2023-11-15
-
-### Changed
-
-- Forced plugin server to use localhost ([#18976](https://github.com/Lightning-AI/lightning/pull/18976))
-- Enabled bundling additional files into app source ([#18980](https://github.com/Lightning-AI/lightning/pull/18980))
-- Limited rate of requests to http queue ([#18981](https://github.com/Lightning-AI/lightning/pull/18981))
-
-
-## [2.1.1] - 2023-11-06
-
-### Added
-
-- Added flow `fail()` ([#18883](https://github.com/Lightning-AI/lightning/pull/18883))
-
-### Fixed
-
-- Fixed failing lightning cli entry point ([#18821](https://github.com/Lightning-AI/lightning/pull/18821))
-
-
-## [2.1.0] - 2023-10-11
-
-### Added
-
-- Allow customizing `gradio` components with lightning colors ([#17054](https://github.com/Lightning-AI/lightning/pull/17054))
-
-### Changed
-
-- Changed `LocalSourceCodeDir` cache_location to not use home in certain cases ([#17491](https://github.com/Lightning-AI/lightning/pull/17491))
-
-### Removed
-
-- Remove cluster commands from the CLI ([#18151](https://github.com/Lightning-AI/lightning/pull/18151))
-
-
-## [2.0.9] - 2023-09-14
-
-### Fixed
-
-- Replace LightningClient with import from lightning_cloud ([#18544](https://github.com/Lightning-AI/lightning/pull/18544))
-
-
-## [2.0.8] - 2023-08-29
-
-### Changed
-
-- Change top folder ([#18212](https://github.com/Lightning-AI/lightning/pull/18212))
-- Remove `_handle_is_headless` calls in app run loop ([#18362](https://github.com/Lightning-AI/lightning/pull/18362))
-
-
-## [2.0.7] - 2023-08-14
-
-### Changed
-
-- Removed the top-level import `lightning.pdb`; import `lightning.app.pdb` instead ([#18177](https://github.com/Lightning-AI/lightning/pull/18177))
-- Client retries forever ([#18065](https://github.com/Lightning-AI/lightning/pull/18065))
-
-### Fixed
-
-- Fixed an issue that would prevent the user from setting the multiprocessing start method after importing lightning ([#18177](https://github.com/Lightning-AI/lightning/pull/18177))
-
-
-## [2.0.6] - 2023-07-20
-
-### Fixed
-
-- Fixed handling a `None` request in the file orchestration queue ([#18111](https://github.com/Lightning-AI/lightning/pull/18111))
-
-
-## [2.0.5] - 2023-07-07
-
-### Added
-
-- Plugin: store source app ([#17892](https://github.com/Lightning-AI/lightning/pull/17892))
-- Added colocation identifier ([#16796](https://github.com/Lightning-AI/lightning/pull/16796))
-- Added exponential backoff to HTTPQueue put ([#18013](https://github.com/Lightning-AI/lightning/pull/18013))
-- Content for plugins ([#17243](https://github.com/Lightning-AI/lightning/pull/17243))
-
-### Changed
-
-- Save a reference to created tasks, to avoid tasks disappearing ([#17946](https://github.com/Lightning-AI/lightning/pull/17946))
-
-
-## [2.0.4] - 2023-06-22
-
-### Fixed
-
-- Bumped several dependencies to address security vulnerabilities.
-
-
-## [2.0.3] - 2023-06-07
-
-### Added
-
-- Added the property `LightningWork.public_ip` that exposes the public IP of the `LightningWork` instance ([#17742](https://github.com/Lightning-AI/lightning/pull/17742))
-- Add missing python-multipart dependency ([#17244](https://github.com/Lightning-AI/lightning/pull/17244))
-
-### Changed
-
-- Made type hints public ([#17100](https://github.com/Lightning-AI/lightning/pull/17100))
-
-### Fixed
-
-- Fixed `LightningWork.internal_ip` that was mistakenly exposing the public IP instead; now exposes the private/internal IP address ([#17742](https://github.com/Lightning-AI/lightning/pull/17742))
-- Fixed resolution of latest version in CLI ([#17351](https://github.com/Lightning-AI/lightning/pull/17351))
-- Fixed property raised instead of returned ([#17595](https://github.com/Lightning-AI/lightning/pull/17595))
-- Fixed get project ([#17617](https://github.com/Lightning-AI/lightning/pull/17617), [#17666](https://github.com/Lightning-AI/lightning/pull/17666))
-
-
-## [2.0.2] - 2023-04-24
-
-### Fixed
-
-- Resolved Lightning App with remote storage ([#17426](https://github.com/Lightning-AI/lightning/pull/17426))
-- Fixed `AppState`, streamlit example ([#17452](https://github.com/Lightning-AI/lightning/pull/17452))
-
-
-## [2.0.1] - 2023-04-11
-
-### Fixed
-
-- Fix frontend hosts when running with multi-process in the cloud ([#17324](https://github.com/Lightning-AI/lightning/pull/17324))
-
-
-## [2.0.0] - 2023-03-15
-
-### Added
-
-- Added `--zip` option to the `lightning cp` command to copy content from the Cloud Platform Filesystem as a zipfile
-
-### Changed
-
-- Changed minimum supported version of `rich` from `10.14.0` to `12.13.0` ([#16798](https://github.com/Lightning-AI/lightning/pull/16798))
-
-### Removed
-
-- Removed support for Python 3.7 ([#16579](https://github.com/Lightning-AI/lightning/pull/16579))
-
-
-## [1.9.4] - 2023-03-01
-
-### Removed
-
-- Removed implicit ui testing with `testing.run_app_in_cloud` in favor of headless login and app selection ([#16741](https://github.com/Lightning-AI/lightning/pull/16741))
-
-
-## [1.9.3] - 2023-02-21
-
-### Fixed
-
-- Fixed `lightning open` command and improved redirects ([#16794](https://github.com/Lightning-AI/lightning/pull/16794))
-
-
-## [1.9.2] - 2023-02-15
-
-### Added
-
-- Added Storage Commands ([#16740](https://github.com/Lightning-AI/lightning/pull/16740))
-  * `rm`: Delete files from your Cloud Platform Filesystem
-- Added `lightning connect data` to register data connection to private s3 buckets ([#16738](https://github.com/Lightning-AI/lightning/pull/16738))
-
-
-## [1.9.1] - 2023-02-10
-
-### Added
-
-- Added `lightning open` command ([#16482](https://github.com/Lightning-AI/lightning/pull/16482))
-- Added experimental support for interruptible GPU in the cloud ([#16399](https://github.com/Lightning-AI/lightning/pull/16399))
-- Added FileSystem abstraction to simplify manipulation of files ([#16581](https://github.com/Lightning-AI/lightning/pull/16581))
-- Added Storage Commands ([#16606](https://github.com/Lightning-AI/lightning/pull/16606))
-  * `ls`: List files from your Cloud Platform Filesystem
-  * `cd`: Change the current directory within your Cloud Platform filesystem (terminal session based)
-  * `pwd`: Return the current folder in your Cloud Platform Filesystem
-  * `cp`: Copy files between your Cloud Platform Filesystem and local filesystem
-- Prevent `cd` into non-existent folders ([#16645](https://github.com/Lightning-AI/lightning/pull/16645))
-- Enabled `cp` (upload) at project level ([#16631](https://github.com/Lightning-AI/lightning/pull/16631))
-- Enabled `ls` and `cp` (download) at project level ([#16622](https://github.com/Lightning-AI/lightning/pull/16622))
-- Added `lightning connect data` to register data connection to s3 buckets ([#16670](https://github.com/Lightning-AI/lightning/pull/16670))
-- Added support for running with multiprocessing in the cloud ([#16624](https://github.com/Lightning-AI/lightning/pull/16624))
-- Initial plugin server ([#16523](https://github.com/Lightning-AI/lightning/pull/16523))
-- Connect and Disconnect node ([#16700](https://github.com/Lightning-AI/lightning/pull/16700))
-
-### Changed
-
-- Changed the default `LightningClient(retry=False)` to `retry=True` ([#16382](https://github.com/Lightning-AI/lightning/pull/16382))
-- Add support for async predict method in PythonServer and remove torch context ([#16453](https://github.com/Lightning-AI/lightning/pull/16453))
-- Renamed `lightning.app.components.LiteMultiNode` to `lightning.app.components.FabricMultiNode` ([#16505](https://github.com/Lightning-AI/lightning/pull/16505))
-- Changed the command `lightning connect` to `lightning connect app` for consistency ([#16670](https://github.com/Lightning-AI/lightning/pull/16670))
-- Refactor cloud dispatch and update to new API ([#16456](https://github.com/Lightning-AI/lightning/pull/16456))
-- Updated app URLs to the latest format ([#16568](https://github.com/Lightning-AI/lightning/pull/16568))
-
-### Fixed
-
-- Fixed a deadlock causing apps not to exit properly when running locally ([#16623](https://github.com/Lightning-AI/lightning/pull/16623))
-- Fixed the Drive root_folder not parsed properly ([#16454](https://github.com/Lightning-AI/lightning/pull/16454))
-- Fixed malformed path when downloading files using `lightning cp` ([#16626](https://github.com/Lightning-AI/lightning/pull/16626))
-- Fixed app name in URL ([#16575](https://github.com/Lightning-AI/lightning/pull/16575))
-
-
-## [1.9.0] - 2023-01-17
-
-### Added
-
-- Added the possibility to set up basic authentication for Lightning apps ([#16105](https://github.com/Lightning-AI/lightning/pull/16105))
-
-### Changed
-
-- The LoadBalancer now uses internal ip + port instead of the exposed URL ([#16119](https://github.com/Lightning-AI/lightning/pull/16119))
-- Added support for logging in different trainer stages with `DeviceStatsMonitor` ([#16002](https://github.com/Lightning-AI/lightning/pull/16002))
-- Changed `lightning.app.components.serve.gradio` to `lightning.app.components.serve.gradio_server` ([#16201](https://github.com/Lightning-AI/lightning/pull/16201))
-- Made cluster creation/deletion async by default ([#16185](https://github.com/Lightning-AI/lightning/pull/16185))
-- Expose `LightningFlow.stop` method to stop the flow similar to works ([#16378](https://github.com/Lightning-AI/lightning/pull/16378))
-
-### Fixed
-
-- Fixed not being able to run multiple lightning apps locally due to port collision ([#15819](https://github.com/Lightning-AI/lightning/pull/15819))
-- Avoid `relpath` bug on Windows ([#16164](https://github.com/Lightning-AI/lightning/pull/16164))
-- Avoid using the deprecated `LooseVersion` ([#16162](https://github.com/Lightning-AI/lightning/pull/16162))
-- Porting fixes to autoscaler component ([#16249](https://github.com/Lightning-AI/lightning/pull/16249))
-- Fixed a bug where `lightning login` with env variables would not correctly save the credentials ([#16339](https://github.com/Lightning-AI/lightning/pull/16339))
-
-
-## [1.8.6] - 2022-12-21
-
-### Added
-
-- Added partial support for fastapi `Request` annotation in `configure_api` handlers ([#16047](https://github.com/Lightning-AI/lightning/pull/16047))
-- Added a nicer UI with URL and examples for the autoscaler component ([#16063](https://github.com/Lightning-AI/lightning/pull/16063))
-- Enabled users to have more control over scaling out/in interval ([#16093](https://github.com/Lightning-AI/lightning/pull/16093))
-- Added more datatypes to serving component ([#16018](https://github.com/Lightning-AI/lightning/pull/16018))
-- Added `work.delete` method to delete the work ([#16103](https://github.com/Lightning-AI/lightning/pull/16103))
-- Added `display_name` property to LightningWork for the cloud ([#16095](https://github.com/Lightning-AI/lightning/pull/16095))
-- Added `ColdStartProxy` to the AutoScaler ([#16094](https://github.com/Lightning-AI/lightning/pull/16094))
-- Added status endpoint, enable `ready` ([#16075](https://github.com/Lightning-AI/lightning/pull/16075))
-- Implemented `ready` for components ([#16129](https://github.com/Lightning-AI/lightning/pull/16129))
-
-### Changed
-
-- The default `start_method` for creating Work processes locally on MacOS is now 'spawn' (previously 'fork') ([#16089](https://github.com/Lightning-AI/lightning/pull/16089))
-- The utility `lightning.app.utilities.cloud.is_running_in_cloud` now returns `True` during loading of the app locally when running with `--cloud` ([#16045](https://github.com/Lightning-AI/lightning/pull/16045))
-- Updated Multinode Warning ([#16091](https://github.com/Lightning-AI/lightning/pull/16091))
-- Updated app testing ([#16000](https://github.com/Lightning-AI/lightning/pull/16000))
-- Changed overwrite to `True` ([#16009](https://github.com/Lightning-AI/lightning/pull/16009))
-- Simplified messaging in cloud dispatch ([#16160](https://github.com/Lightning-AI/lightning/pull/16160))
-- Added annotations endpoint ([#16159](https://github.com/Lightning-AI/lightning/pull/16159))
-
-### Fixed
-
-- Fixed `PythonServer` messaging "Your app has started" ([#15989](https://github.com/Lightning-AI/lightning/pull/15989))
-- Fixed auto-batching to enable batching for requests that arrive after the batch interval but are already in the queue ([#16110](https://github.com/Lightning-AI/lightning/pull/16110))
-- Fixed a bug where `AutoScaler` would fail with min_replica=0 ([#16092](https://github.com/Lightning-AI/lightning/pull/16092))
-- Fixed a non-thread safe deepcopy in the scheduler ([#16114](https://github.com/Lightning-AI/lightning/pull/16114))
-- Fixed Http Queue sleeping for 1 sec by default if no delta were found ([#16114](https://github.com/Lightning-AI/lightning/pull/16114))
-- Fixed the endpoint info tab not showing up in `AutoScaler` UI ([#16128](https://github.com/Lightning-AI/lightning/pull/16128))
-- Fixed an issue where an exception would be raised in the logs when using a recent version of streamlit ([#16139](https://github.com/Lightning-AI/lightning/pull/16139))
-- Fixed e2e tests ([#16146](https://github.com/Lightning-AI/lightning/pull/16146))
-
-
-## [1.8.5] - 2022-12-15
-
-### Added
-
-- Added `Lightning{Flow,Work}.lightningignores` attributes to programmatically ignore files before uploading to the cloud ([#15818](https://github.com/Lightning-AI/lightning/pull/15818))
-- Added a progress bar while connecting to an app through the CLI ([#16035](https://github.com/Lightning-AI/lightning/pull/16035))
-- Support running on multiple clusters ([#16016](https://github.com/Lightning-AI/lightning/pull/16016))
-- Added guards to cluster deletion from cli ([#16053](https://github.com/Lightning-AI/lightning/pull/16053))
-
-### Changed
-
-- Cleanup cluster waiting ([#16054](https://github.com/Lightning-AI/lightning/pull/16054))
-
-### Fixed
-
-- Fixed `DDPStrategy` import in app framework ([#16029](https://github.com/Lightning-AI/lightning/pull/16029))
-- Fixed `AutoScaler` raising an exception when non-default cloud compute is specified ([#15991](https://github.com/Lightning-AI/lightning/pull/15991))
-- Fixed and improved the login flow ([#16052](https://github.com/Lightning-AI/lightning/pull/16052))
-- Fixed the debugger detection mechanism for lightning App in VSCode ([#16068](https://github.com/Lightning-AI/lightning/pull/16068))
-- Fixed bug where components that are re-instantiated several times failed to initialize if they were modifying `self.lightningignore` ([#16080](https://github.com/Lightning-AI/lightning/pull/16080))
-- Fixed a bug where apps that had previously been deleted could not be run again from the CLI ([#16082](https://github.com/Lightning-AI/lightning/pull/16082))
-- Fixed install/upgrade - removing single quote ([#16079](https://github.com/Lightning-AI/lightning/pull/16079))
-
-
-## [1.8.4] - 2022-12-08
-
-### Added
-
-- Add `code_dir` argument to tracer run ([#15771](https://github.com/Lightning-AI/lightning/pull/15771))
-- Added the CLI command `lightning run model` to launch a `LightningLite` accelerated script ([#15506](https://github.com/Lightning-AI/lightning/pull/15506))
-- Added the CLI command `lightning delete app` to delete a lightning app on the cloud ([#15783](https://github.com/Lightning-AI/lightning/pull/15783))
-- Added a CloudMultiProcessBackend which enables running a child App from within the Flow in the cloud ([#15800](https://github.com/Lightning-AI/lightning/pull/15800))
-- Utility for pickling work object safely even from a child process ([#15836](https://github.com/Lightning-AI/lightning/pull/15836))
-- Added `AutoScaler` component (
-    [#15769](https://github.com/Lightning-AI/lightning/pull/15769),
-    [#15971](https://github.com/Lightning-AI/lightning/pull/15971),
-    [#15966](https://github.com/Lightning-AI/lightning/pull/15966)
-)
-- Added the property `ready` of the LightningFlow to inform when the `Open App` should be visible ([#15921](https://github.com/Lightning-AI/lightning/pull/15921))
-- Added private work attribute `_start_method` to customize how to start the works ([#15923](https://github.com/Lightning-AI/lightning/pull/15923))
-- Added a `configure_layout` method to the `LightningWork` which can be used to control how the work is handled in the layout of a parent flow ([#15926](https://github.com/Lightning-AI/lightning/pull/15926))
-- Added the ability to run a Lightning App or Component directly from the Gallery using `lightning run app organization/name` ([#15941](https://github.com/Lightning-AI/lightning/pull/15941))
-- Added automatic conversion of list and dict of works and flows to structures ([#15961](https://github.com/Lightning-AI/lightning/pull/15961))
-
-### Changed
-
-- The `MultiNode` components now warn the user when running with `num_nodes > 1` locally ([#15806](https://github.com/Lightning-AI/lightning/pull/15806))
-- Cluster creation and deletion now waits by default ([#15458](https://github.com/Lightning-AI/lightning/pull/15458))
-- Running an app without a UI locally no longer opens the browser ([#15875](https://github.com/Lightning-AI/lightning/pull/15875))
-- Show a message when `BuildConfig(requirements=[...])` is passed but a `requirements.txt` file is already present in the Work ([#15799](https://github.com/Lightning-AI/lightning/pull/15799))
-- Show a message when `BuildConfig(dockerfile="...")` is passed but a `Dockerfile` file is already present in the Work ([#15799](https://github.com/Lightning-AI/lightning/pull/15799))
-- Dropped name column from cluster list ([#15721](https://github.com/Lightning-AI/lightning/pull/15721))
-- Apps without UIs no longer activate the "Open App" button when running in the cloud ([#15875](https://github.com/Lightning-AI/lightning/pull/15875))
-- Wait for full file to be transferred in Path / Payload ([#15934](https://github.com/Lightning-AI/lightning/pull/15934))
-
-### Removed
-
-- Removed the `SingleProcessRuntime` ([#15933](https://github.com/Lightning-AI/lightning/pull/15933))
-
-### Fixed
-
-- Fixed SSH CLI command listing stopped components ([#15810](https://github.com/Lightning-AI/lightning/pull/15810))
-- Fixed bug when launching apps on multiple clusters ([#15484](https://github.com/Lightning-AI/lightning/pull/15484))
-- Fixed Sigterm Handler causing thread lock which caused KeyboardInterrupt to hang ([#15881](https://github.com/Lightning-AI/lightning/pull/15881))
-- Fixed MPS error for multinode component (defaults to cpu on mps devices now as distributed operations are not supported by pytorch on mps) ([#15748](https://github.com/Lightning-AI/lightning/pull/15748))
-- Fixed the work not being stopped on success when passed directly to the LightningApp ([#15801](https://github.com/Lightning-AI/lightning/pull/15801))
-- Fixed the PyTorch Inference locally on GPU ([#15813](https://github.com/Lightning-AI/lightning/pull/15813))
-- Fixed the `enable_spawn` method of the `WorkRunExecutor` ([#15812](https://github.com/Lightning-AI/lightning/pull/15812))
-- Fixed require/import decorator ([#15849](https://github.com/Lightning-AI/lightning/pull/15849))
-- Fixed a bug where using `L.app.structures` would cause multiple apps to be opened and fail with an error in the cloud ([#15911](https://github.com/Lightning-AI/lightning/pull/15911)) -- Fixed PythonServer generating noise on M1 ([#15949](https://github.com/Lightning-AI/lightning/pull/15949)) -- Fixed multiprocessing breakpoint ([#15950](https://github.com/Lightning-AI/lightning/pull/15950)) -- Fixed detection of a Lightning App running in debug mode ([#15951](https://github.com/Lightning-AI/lightning/pull/15951)) -- Fixed `ImportError` on Multinode if package not present ([#15963](https://github.com/Lightning-AI/lightning/pull/15963)) -- Fixed MultiNode Component to use separate cloud computes ([#15965](https://github.com/Lightning-AI/lightning/pull/15965)) -- Fixed Registration for CloudComputes of Works in `L.app.structures` ([#15964](https://github.com/Lightning-AI/lightning/pull/15964)) -- Fixed a bug where auto-upgrading to the latest lightning via the CLI could get stuck in a loop ([#15984](https://github.com/Lightning-AI/lightning/pull/15984)) - - -## [1.8.3] - 2022-11-22 - -### Changed - -- Deduplicate top level lightning CLI command groups ([#15761](https://github.com/Lightning-AI/lightning/pull/15761)) - * `lightning add ssh-key` CLI command has been transitioned to `lightning create ssh-key` - * `lightning remove ssh-key` CLI command has been transitioned to `lightning delete ssh-key` -- Set Torch inference mode for prediction ([#15719](https://github.com/Lightning-AI/lightning/pull/15719)) -- Improved `LightningTrainerScript` start-up time ([#15751](https://github.com/Lightning-AI/lightning/pull/15751)) -- Disable XSRF protection in `StreamlitFrontend` to support upload in localhost ([#15684](https://github.com/Lightning-AI/lightning/pull/15684)) - -### Fixed - -- Fixed debugging with VSCode IDE ([#15747](https://github.com/Lightning-AI/lightning/pull/15747)) -- Fixed setting property to the `LightningFlow` ([#15750](https://github.com/Lightning-AI/lightning/pull/15750)) -- Fixed the PyTorch Inference locally on GPU ([#15813](https://github.com/Lightning-AI/lightning/pull/15813)) - - -## [1.8.2] - 2022-11-17 - -### Added - -- Added title and description to ServeGradio ([#15639](https://github.com/Lightning-AI/lightning/pull/15639)) -- Added a friendly error message when attempting to run the default cloud compute with a custom base image configured ([#14929](https://github.com/Lightning-AI/lightning/pull/14929)) - -### Changed - -- Improved support for running apps when dependencies aren't installed ([#15711](https://github.com/Lightning-AI/lightning/pull/15711)) -- Changed the root directory of the app (which gets uploaded) to be the folder containing the app file, rather than any parent folder containing a `.lightning` file ([#15654](https://github.com/Lightning-AI/lightning/pull/15654)) -- Enabled MultiNode Components to support state broadcasting ([#15607](https://github.com/Lightning-AI/lightning/pull/15607)) -- Prevent artefactual "running from outside your current environment" error ([#15647](https://github.com/Lightning-AI/lightning/pull/15647)) -- Rename failed -> error in tables ([#15608](https://github.com/Lightning-AI/lightning/pull/15608)) - -### Fixed - -- Fixed race condition to over-write the frontend with app infos ([#15398](https://github.com/Lightning-AI/lightning/pull/15398)) -- Fixed bi-directional queues sending delta with Drive Component name changes ([#15642](https://github.com/Lightning-AI/lightning/pull/15642)) -- Fixed CloudRuntime works collection 
with structures and accelerated multi node startup time ([#15650](https://github.com/Lightning-AI/lightning/pull/15650))
-- Fixed catimage import ([#15712](https://github.com/Lightning-AI/lightning/pull/15712))
-- Parse all lines in app file looking for shebangs to run commands ([#15714](https://github.com/Lightning-AI/lightning/pull/15714))
-
-
-## [1.8.1] - 2022-11-10
-
-### Added
-
-- Added the `start` method to the work ([#15523](https://github.com/Lightning-AI/lightning/pull/15523))
-- Added a `MultiNode` Component to run distributed computation with any framework ([#15524](https://github.com/Lightning-AI/lightning/pull/15524))
-- Expose `RunWorkExecutor` to the work and provide default ones for the `MultiNode` Component ([#15561](https://github.com/Lightning-AI/lightning/pull/15561))
-- Added a `start_with_flow` flag to the `LightningWork` which can be disabled to prevent the work from starting at the same time as the flow ([#15591](https://github.com/Lightning-AI/lightning/pull/15591))
-- Added support for running Lightning App with VSCode IDE debugger ([#15590](https://github.com/Lightning-AI/lightning/pull/15590))
-- Added `bi-directional` delta updates between the flow and the works ([#15582](https://github.com/Lightning-AI/lightning/pull/15582))
-- Added `--setup` flag to `lightning run app` CLI command allowing for dependency installation via app comments ([#15577](https://github.com/Lightning-AI/lightning/pull/15577))
-- Auto-upgrade / detect environment mismatch from the CLI ([#15434](https://github.com/Lightning-AI/lightning/pull/15434))
-- Added Serve component ([#15609](https://github.com/Lightning-AI/lightning/pull/15609))
-
-### Changed
-
-- Changed `flow.flows` to be recursive to align its behavior with `flow.works` ([#15466](https://github.com/Lightning-AI/lightning/pull/15466))
-- The `params` argument in `TracerPythonScript.run` no longer prepends `--` automatically to parameters ([#15518](https://github.com/Lightning-AI/lightning/pull/15518))
-- Only check versions / env when not in the cloud ([#15504](https://github.com/Lightning-AI/lightning/pull/15504))
-- Periodically sync database to the drive ([#15441](https://github.com/Lightning-AI/lightning/pull/15441))
-- Slightly safer multi node ([#15538](https://github.com/Lightning-AI/lightning/pull/15538))
-- Reuse existing commands when running connect more than once ([#15471](https://github.com/Lightning-AI/lightning/pull/15471))
-
-### Fixed
-
-- Fixed writing app name and id in connect.txt file for the command CLI ([#15443](https://github.com/Lightning-AI/lightning/pull/15443))
-- Fixed missing root flow among the flows of the app ([#15531](https://github.com/Lightning-AI/lightning/pull/15531))
-- Fixed bug with Multi Node Component and add some examples ([#15557](https://github.com/Lightning-AI/lightning/pull/15557))
-- Fixed a bug where payload would take a very long time locally ([#15557](https://github.com/Lightning-AI/lightning/pull/15557))
-- Fixed an issue with the `lightning` CLI taking a long time to error out when the cloud is not reachable ([#15412](https://github.com/Lightning-AI/lightning/pull/15412))
-
-
-## [1.8.0] - 2022-11-01
-
-### Added
-
-- Added `load_state_dict` and `state_dict` hooks for `LightningFlow` components ([#14100](https://github.com/Lightning-AI/lightning/pull/14100))
-- Added a `--secret` option to CLI to allow binding secrets to app environment variables when running in the cloud ([#14612](https://github.com/Lightning-AI/lightning/pull/14612))
-- Added support for running the works without cloud compute in the default container ([#14819](https://github.com/Lightning-AI/lightning/pull/14819))
-- Added an HTTPQueue as an optional replacement for the default redis queue ([#14978](https://github.com/Lightning-AI/lightning/pull/14978))
-- Added support for configuring flow cloud compute ([#14831](https://github.com/Lightning-AI/lightning/pull/14831))
-- Added support for adding descriptions to commands either through a docstring or the `DESCRIPTION` attribute ([#15193](https://github.com/Lightning-AI/lightning/pull/15193))
-- Added a try / catch mechanism around request processing to avoid killing the flow ([#15187](https://github.com/Lightning-AI/lightning/pull/15187))
-- Added a Database Component ([#14995](https://github.com/Lightning-AI/lightning/pull/14995))
-- Added authentication to HTTP queue ([#15202](https://github.com/Lightning-AI/lightning/pull/15202))
-- Added support to pass a `LightningWork` to the `LightningApp` ([#15215](https://github.com/Lightning-AI/lightning/pull/15215))
-- Added support getting CLI help for connected apps even if the app isn't running ([#15196](https://github.com/Lightning-AI/lightning/pull/15196))
-- Added support for adding requirements to commands and installing them when missing when running an app command ([#15198](https://github.com/Lightning-AI/lightning/pull/15198))
-- Added Lightning CLI Connection to be terminal session instead of global ([#15241](https://github.com/Lightning-AI/lightning/pull/15241))
-- Added support for managing SSH-keys via CLI ([#15291](https://github.com/Lightning-AI/lightning/pull/15291))
-- Add a `JustPyFrontend` to ease UI creation with `https://github.com/justpy-org/justpy` ([#15002](https://github.com/Lightning-AI/lightning/pull/15002))
-- Added a layout endpoint to the Rest API and enabled disabling pulling or pushing to the state ([#15367](https://github.com/Lightning-AI/lightning/pull/15367))
-- Added support for functions for `configure_api` and `configure_commands` to be executed in the Rest API process ([#15098](https://github.com/Lightning-AI/lightning/pull/15098))
-- Added support for accessing Lightning Apps via SSH ([#15310](https://github.com/Lightning-AI/lightning/pull/15310))
-- Added support to start lightning app on cloud without needing to install dependencies locally ([#15019](https://github.com/Lightning-AI/lightning/pull/15019))
-
-### Changed
-
-- Improved the show logs command to be standalone and reusable ([#15343](https://github.com/Lightning-AI/lightning/pull/15343))
-- Removed the `--instance-types` option when creating clusters ([#15314](https://github.com/Lightning-AI/lightning/pull/15314))
-
-### Fixed
-
-- Fixed an issue when using the CLI without arguments ([#14877](https://github.com/Lightning-AI/lightning/pull/14877))
-- Fixed a bug where the upload files endpoint would raise an error when running locally ([#14924](https://github.com/Lightning-AI/lightning/pull/14924))
-- Fixed BYOC cluster region selector -> hiding it from help since only us-east-1 has been tested and is recommended ([#15277](https://github.com/Lightning-AI/lightning/pull/15277))
-- Fixed a bug when launching an app on multiple clusters ([#15226](https://github.com/Lightning-AI/lightning/pull/15226))
-- Fixed a bug with a default CloudCompute for Lightning flows ([#15371](https://github.com/Lightning-AI/lightning/pull/15371))
-
-
-## [0.6.2] - 2022-09-21
-
-### Changed
-
-- Improved Lightning App connect logic by disconnecting automatically ([#14532](https://github.com/Lightning-AI/lightning/pull/14532))
([#14532](https://github.com/Lightning-AI/lightning/pull/14532)) -- Improved the error message when the `LightningWork` is missing the `run` method ([#14759](https://github.com/Lightning-AI/lightning/pull/14759)) -- Improved the error message when the root `LightningFlow` passed to `LightningApp` is missing the `run` method ([#14760](https://github.com/Lightning-AI/lightning/pull/14760)) - -### Fixed - -- Fixed a bug where the uploaded command file wasn't properly parsed ([#14532](https://github.com/Lightning-AI/lightning/pull/14532)) -- Fixed an issue where custom property setters were not being used in the `LightningWork` class ([#14259](https://github.com/Lightning-AI/lightning/pull/14259)) -- Fixed an issue where some terminals would display broken icons in the PL app CLI ([#14226](https://github.com/Lightning-AI/lightning/pull/14226)) - - -## [0.6.1] - 2022-09-19 - -### Added - -- Added support to upload files to the Drive through an asynchronous `upload_file` endpoint ([#14703](https://github.com/Lightning-AI/lightning/pull/14703)) - -### Changed - -- Moved the application storage prefix from `app_id` to `project_id/app_id` ([#14583](https://github.com/Lightning-AI/lightning/pull/14583)) -- Changed LightningCloud client calls to use keyword arguments instead of positional arguments ([#14685](https://github.com/Lightning-AI/lightning/pull/14685)) - -### Fixed - -- Made `threadpool` non-default in the LightningCloud client ([#14757](https://github.com/Lightning-AI/lightning/pull/14757)) -- Resolved a bug where the state change detection using DeepDiff wouldn't work with `Path` and `Drive` objects ([#14465](https://github.com/Lightning-AI/lightning/pull/14465)) -- Resolved a bug where the wrong client was passed to collect cloud logs ([#14684](https://github.com/Lightning-AI/lightning/pull/14684)) -- Resolved the memory leak issue with the Lightning Cloud package and bumped the requirements to use the latest version ([#14697](https://github.com/Lightning-AI/lightning/pull/14697)) -- Fixed the 5000 log line limitation for Lightning AI BYOC cluster logs ([#14458](https://github.com/Lightning-AI/lightning/pull/14458)) -- Fixed a bug where the uploaded command file wasn't properly parsed ([#14532](https://github.com/Lightning-AI/lightning/pull/14532)) -- Resolved an issue with `LightningApp(..., debug=True)` ([#14464](https://github.com/Lightning-AI/lightning/pull/14464)) - - -## [0.6.0] - 2022-09-08 - -### Added - -- Introduced `lightning connect` ([#14452](https://github.com/Lightning-AI/lightning/pull/14452)) -- Added `PanelFrontend` to easily create complex UIs in Python ([#13531](https://github.com/Lightning-AI/lightning/pull/13531)) -- Added support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) -- Added support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) -- Added support to see Lightning AI BYOC cluster logs ([#14334](https://github.com/Lightning-AI/lightning/pull/14334)) -- Added support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) -- Added support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987)) -- Added `LightningTrainerScript`. 
`LightningTrainerScript` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) -- Added support for printing application logs using the CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) -- Added support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945)) -- Added a warning when `configure_layout` returns URLs configured with http instead of https ([#14233](https://github.com/Lightning-AI/lightning/pull/14233)) -- Added `--app_args` support to the CLI ([#13625](https://github.com/Lightning-AI/lightning/pull/13625)) - -### Changed - -- Changed default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132)) -- Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076)) -- Increased DeepDiff's verbose level to properly handle dict changes ([#13960](https://github.com/Lightning-AI/lightning/pull/13960)) -- Setup: added requirement freeze for the next major version ([#14480](https://github.com/Lightning-AI/lightning/pull/14480)) - -### Fixed - -- Unified the app template: moved `app.py` to the root dir for the `lightning init app <app_name>` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) -- Fixed an issue with the `lightning --version` command ([#14433](https://github.com/Lightning-AI/lightning/pull/14433)) -- Fixed imports of `collections.abc` for Python 3.10 ([#14345](https://github.com/Lightning-AI/lightning/pull/14345)) - -## [0.5.7] - 2022-08-22 - -### Changed - -- Released LAI docs as stable ([#14250](https://github.com/Lightning-AI/lightning/pull/14250)) -- Compatibility for Python 3.10 - -### Fixed - -- Pinned starsessions to 1.x ([#14333](https://github.com/Lightning-AI/lightning/pull/14333)) -- Parsed local package versions ([#13933](https://github.com/Lightning-AI/lightning/pull/13933)) - - -## [0.5.6] - 2022-08-16 - -### Fixed - -- Resolved a bug where the `install` command was not installing the latest version of an app/component by default ([#14181](https://github.com/Lightning-AI/lightning/pull/14181)) -- Fixed the `examples/app_dag` example ([#14359](https://github.com/Lightning-AI/lightning/pull/14359)) - - -## [0.5.5] - 2022-08-09 - -### Deprecated - -- Deprecated the Sheety API ([#14004](https://github.com/Lightning-AI/lightning/pull/14004)) - -### Fixed - -- Resolved a bug where the work statuses would grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970)) -- Resolved a bug about a race condition when sending the work state through the caller_queue ([#14074](https://github.com/Lightning-AI/lightning/pull/14074)) -- Fixed starting a Lightning App on the cloud when the repo name begins with "Lightning" ([#14025](https://github.com/Lightning-AI/lightning/pull/14025)) - - -## [0.5.4] - 2022-08-01 - -### Changed - -- Wrapped imports for traceability ([#13924](https://github.com/Lightning-AI/lightning/pull/13924)) -- Set the version as today's date ([#13906](https://github.com/Lightning-AI/lightning/pull/13906)) - -### Fixed - -- Included app templates in the lightning and app packages ([#13731](https://github.com/Lightning-AI/lightning/pull/13731)) -- Added UI for install all ([#13732](https://github.com/Lightning-AI/lightning/pull/13732)) -- Fixed the build meta pkg flow 
([#13926](https://github.com/Lightning-AI/lightning/pull/13926)) - -## [0.5.3] - 2022-07-25 - -### Changed - -- Pruned requirements duplicity ([#13739](https://github.com/Lightning-AI/lightning/pull/13739)) - -### Fixed - -- Use correct python version in lightning component template ([#13790](https://github.com/Lightning-AI/lightning/pull/13790)) - -## [0.5.2] - 2022-07-18 - -### Added - -- Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537)) - -### Changed - -- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703)) diff --git a/src/lightning/app/__init__.py b/src/lightning/app/__init__.py deleted file mode 100644 index 5c904cc4a908c..0000000000000 --- a/src/lightning/app/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Root package info.""" - -import logging -import os - -from lightning_utilities.core.imports import module_available, package_available - -_root_logger = logging.getLogger() -_logger = logging.getLogger(__name__) -_logger.setLevel(logging.INFO) - -_console = logging.StreamHandler() -_console.setLevel(logging.INFO) - -formatter = logging.Formatter("%(levelname)s: %(message)s") -_console.setFormatter(formatter) - -# if root logger has handlers, propagate messages up and let root logger process them, -# otherwise use our own handler -if not _root_logger.hasHandlers(): - _logger.addHandler(_console) - _logger.propagate = False - - -if os.path.isfile(os.path.join(os.path.dirname(__file__), "__about__.py")): - from lightning.app.__about__ import * # noqa: F403 -if "__version__" not in locals(): - if os.path.isfile(os.path.join(os.path.dirname(__file__), "__version__.py")): - from lightning.app.__version__ import version as __version__ - elif package_available("lightning"): - from lightning import __version__ # noqa: F401 - -from lightning.app.core.app import LightningApp # noqa: E402 -from lightning.app.core.flow import LightningFlow # noqa: E402 -from lightning.app.core.work import LightningWork # noqa: E402 -from lightning.app.plugin.plugin import LightningPlugin # noqa: E402 -from lightning.app.utilities.packaging.build_config import BuildConfig # noqa: E402 -from lightning.app.utilities.packaging.cloud_compute import CloudCompute # noqa: E402 - -if module_available("lightning.app.components.demo"): - from lightning.app.components import demo # noqa: F401 - -__package_name__ = "lightning.app".split(".")[0] - -_PACKAGE_ROOT = os.path.dirname(__file__) -_PROJECT_ROOT = os.path.dirname(os.path.dirname(_PACKAGE_ROOT)) -if __package_name__ == "lightning": - _PACKAGE_ROOT = os.path.dirname(_PACKAGE_ROOT) - _PROJECT_ROOT = os.path.dirname(_PROJECT_ROOT) - -__all__ = ["LightningApp", "LightningFlow", "LightningWork", "LightningPlugin", "BuildConfig", "CloudCompute"] diff --git a/src/lightning/app/api/__init__.py b/src/lightning/app/api/__init__.py deleted file mode 100644 index d850e874da5a2..0000000000000 --- a/src/lightning/app/api/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from lightning.app.api.http_methods import Delete, Get, Post, Put - -__all__ = [ - "Delete", - "Get", - "Post", - "Put", -] diff --git a/src/lightning/app/api/http_methods.py b/src/lightning/app/api/http_methods.py deleted file mode 100644 index aa9e68528e487..0000000000000 --- a/src/lightning/app/api/http_methods.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright The Lightning AI team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import asyncio -import inspect -import time -from copy import deepcopy -from dataclasses import dataclass -from functools import wraps -from multiprocessing import Queue -from typing import Any, Callable, Dict, List, Optional -from uuid import uuid4 - -from fastapi import FastAPI, HTTPException, Request, status -from lightning_utilities.core.apply_func import apply_to_collection - -from lightning.app.api.request_types import _APIRequest, _CommandRequest, _RequestResponse -from lightning.app.utilities.app_helpers import Logger - -logger = Logger(__name__) - - -def _signature_proxy_function(): - pass - - -@dataclass -class _FastApiMockRequest: - """This class is meant to mock FastAPI Request class that isn't pickle-able. - - If a user relies on FastAPI Request annotation, the Lightning framework - patches the annotation before pickling and replace them right after. - - Finally, the FastAPI request is converted back to the _FastApiMockRequest - before being delivered to the users. - - Example: - - from lightning.app import LightningFlow - from fastapi import Request - from lightning.app.api import Post - - class Flow(LightningFlow): - - def request(self, request: Request) -> OutputRequestModel: - ... - - def configure_api(self): - return [Post("/api/v1/request", self.request)] - - """ - - _body: Optional[str] = None - _json: Optional[str] = None - _method: Optional[str] = None - _headers: Optional[Dict] = None - - @property - def receive(self): - raise NotImplementedError - - @property - def method(self): - return self._method - - @property - def headers(self): - return self._headers - - def body(self): - return self._body - - def json(self): - return self._json - - def stream(self): - raise NotImplementedError - - def form(self): - raise NotImplementedError - - def close(self): - raise NotImplementedError - - def is_disconnected(self): - raise NotImplementedError - - -async def _mock_fastapi_request(request: Request): - # TODO: Add more requests parameters. - return _FastApiMockRequest( - _body=await request.body(), - _json=await request.json(), - _headers=request.headers, - _method=request.method, - ) - - -class _HttpMethod: - def __init__( - self, route: str, method: Callable, method_name: Optional[str] = None, timeout: int = 30, **kwargs: Any - ): - """This class is used to inject user defined methods within the App Rest API. - - Arguments: - route: The path used to route the requests - method: The associated flow method - timeout: The time in seconds taken before raising a timeout exception. - - """ - self.route = route - self.attached_to_flow = hasattr(method, "__self__") - self.method_name = method_name or method.__name__ - self.method_annotations = method.__annotations__ - # TODO: Validate the signature contains only pydantic models. 
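- # The wrapped method's signature is captured so the proxy function registered with FastAPI in `add_route` exposes the same parameters.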
- self.method_signature = inspect.signature(method) - - if not self.attached_to_flow: - self.component_name = method.__name__ - self.method = method - else: - self.component_name = method.__self__.name - - self.timeout = timeout - self.kwargs = kwargs - - # Enable the users to rely on FastAPI annotation typing with Request. - # Note: Only a part of the Request functionalities are supported. - self._patch_fast_api_request() - - def add_route(self, app: FastAPI, request_queue: Queue, responses_store: Dict[str, Any]) -> None: - # 1: Get the route associated with the http method. - route = getattr(app, self.__class__.__name__.lower()) - - self._unpatch_fast_api_request() - - # 2: Create a proxy function with the signature of the wrapped method. - fn = deepcopy(_signature_proxy_function) - fn.__annotations__ = self.method_annotations - fn.__name__ = self.method_name - setattr(fn, "__signature__", self.method_signature) - - # Note: Handle requests differently if attached to a flow. - if not self.attached_to_flow: - # 3: Define the request handler. - @wraps(_signature_proxy_function) - async def _handle_request(*args: Any, **kwargs: Any): - if inspect.iscoroutinefunction(self.method): - return await self.method(*args, **kwargs) - return self.method(*args, **kwargs) - - else: - request_cls = _CommandRequest if self.route.startswith("/command/") else _APIRequest - - # 3: Define the request handler. - @wraps(_signature_proxy_function) - async def _handle_request(*args: Any, **kwargs: Any): - async def fn(*args: Any, **kwargs: Any): - args, kwargs = apply_to_collection((args, kwargs), Request, _mock_fastapi_request) - for k, v in kwargs.items(): - if hasattr(v, "__await__"): - kwargs[k] = await v - - request_id = str(uuid4()).split("-")[0] - logger.debug(f"Processing request {request_id} for route: {self.route}") - request_queue.put( - request_cls( - name=self.component_name, - method_name=self.method_name, - args=args, - kwargs=kwargs, - id=request_id, - ) - ) - - t0 = time.time() - while request_id not in responses_store: - await asyncio.sleep(0.01) - if (time.time() - t0) > self.timeout: - raise HTTPException( - status.HTTP_500_INTERNAL_SERVER_ERROR, - detail="The response was never received.", - ) - - logger.debug(f"Processed request {request_id} for route: {self.route}") - - return responses_store.pop(request_id) - - response: _RequestResponse = await asyncio.create_task(fn(*args, **kwargs)) - - if response.status_code != 200: - raise HTTPException(response.status_code, detail=response.content) - - return response.content - - # 4: Register the user-provided route to the Rest API.
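- # For a `Post` instance, `route` resolved to `app.post` above, so this call is equivalent to `app.post(self.route, **self.kwargs)(_handle_request)`.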
- route(self.route, **self.kwargs)(_handle_request) - - def _patch_fast_api_request(self): - """This function replaces the signature annotation for Request with its mock.""" - for k, v in self.method_annotations.items(): - if v == Request: - self.method_annotations[k] = _FastApiMockRequest - - for v in self.method_signature.parameters.values(): - if v._annotation == Request: - v._annotation = _FastApiMockRequest - - def _unpatch_fast_api_request(self): - """This function restores the signature annotation to the FastAPI Request.""" - for k, v in self.method_annotations.items(): - if v == _FastApiMockRequest: - self.method_annotations[k] = Request - - for v in self.method_signature.parameters.values(): - if v._annotation == _FastApiMockRequest: - v._annotation = Request - - -class Post(_HttpMethod): - pass - - -class Get(_HttpMethod): - pass - - -class Put(_HttpMethod): - pass - - -class Delete(_HttpMethod): - pass - - -def _add_tags_to_api(apis: List[_HttpMethod], tags: List[str]) -> None: - for api in apis: - if not api.kwargs.get("tags"): - api.kwargs["tags"] = tags - - -def _validate_api(apis: List[_HttpMethod]) -> None: - for api in apis: - if not isinstance(api, _HttpMethod): - raise Exception(f"The provided api should be one of [{Delete}, {Get}, {Post}, {Put}]") - if api.route.startswith("/command"): - raise Exception("The route `/command` is reserved for commands. Please, use something else.") diff --git a/src/lightning/app/api/request_types.py b/src/lightning/app/api/request_types.py deleted file mode 100644 index def50e3a20e10..0000000000000 --- a/src/lightning/app/api/request_types.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright The Lightning AI team.
- -from dataclasses import asdict, dataclass -from typing import Any, Optional - -from deepdiff import Delta - - -@dataclass -class _BaseRequest: - def to_dict(self): - return asdict(self) - - -@dataclass -class _DeltaRequest(_BaseRequest): - delta: Delta - - def to_dict(self): - return self.delta.to_dict() - - -@dataclass -class _CommandRequest(_BaseRequest): - id: str - name: str - method_name: str - args: Any - kwargs: Any - - -@dataclass -class _APIRequest(_BaseRequest): - id: str - name: str - method_name: str - args: Any - kwargs: Any - - -@dataclass -class _RequestResponse(_BaseRequest): - status_code: int - content: Optional[str] = None diff --git a/src/lightning/app/cli/__init__.py b/src/lightning/app/cli/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/app-template/.gitignore b/src/lightning/app/cli/app-template/.gitignore deleted file mode 100644 index 70ba25888435f..0000000000000 --- a/src/lightning/app/cli/app-template/.gitignore +++ /dev/null @@ -1,157 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -*install-app* - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg - -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Sphinx documentation -docs/_build/ -docs/source/api/ -docs/source/*.md - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.local_env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# PyCharm -.idea/ - -# Lightning logs -lightning_logs -*.gz -.DS_Store -.*_submit.py -.vscode - -MNIST -*.pt -.storage/ -.shared/ -infra -data -coverage.* -# Frontend build artifacts -*lightning/app/ui* -gradio_cached_examples -/docs/source/api_reference/generated/* -examples/my_own_leaderboard/submissions/* -docs/source/api_reference/generated/* -*.ckpt -redis-stable -node_modules -*.rdb -*.webm -*hars -examples/quick_start/* -examples/quick_start -examples/template_react_ui/* -examples/template_react_ui -# Ignore external components -lightning/app/components/* -!lightning/app/components/python -!lightning/app/components/serve -!lightning/app/components/__init__.py -!lightning/app/components/README.md -train_script.py -*return_values* -scratch -storage diff --git a/src/lightning/app/cli/app-template/LICENSE b/src/lightning/app/cli/app-template/LICENSE deleted file mode 100644 index 261eeb9e9f8b2..0000000000000 --- a/src/lightning/app/cli/app-template/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. 
For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/src/lightning/app/cli/app-template/README.md b/src/lightning/app/cli/app-template/README.md deleted file mode 100644 index 76c88e6cedb38..0000000000000 --- a/src/lightning/app/cli/app-template/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# placeholdername app - -This ⚡ [Lightning app](https://lightning.ai/) ⚡ was generated automatically with: - -```bash -lightning_app init app placeholdername -``` - -## To run placeholdername - -First, install placeholdername (warning: this app has not been officially approved on the lightning gallery): - -```bash -lightning_app install app https://github.com/theUser/placeholdername -``` - -Once the app is installed, run it locally with: - -```bash -lightning_app run app placeholdername/app.py -``` - -Run it on the [lightning cloud](https://lightning.ai/) with: - -```bash -lightning_app run app placeholdername/app.py --cloud -``` - -## to test and link - -Run flake to make sure all your styling is consistent (it keeps your team from going insane) - -```bash -flake8 . -``` - -To test, follow the README.md instructions in the tests folder. diff --git a/src/lightning/app/cli/app-template/app.py b/src/lightning/app/cli/app-template/app.py deleted file mode 100644 index 4b86551324ccc..0000000000000 --- a/src/lightning/app/cli/app-template/app.py +++ /dev/null @@ -1,16 +0,0 @@ -from lightning.app import LightningApp, LightningFlow -from placeholdername import ComponentA, ComponentB - - -class LitApp(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.component_a = ComponentA() - self.component_b = ComponentB() - - def run(self): - self.component_a.run() - self.component_b.run() - - -app = LightningApp(LitApp()) diff --git a/src/lightning/app/cli/app-template/placeholdername/__init__.py b/src/lightning/app/cli/app-template/placeholdername/__init__.py deleted file mode 100644 index cf954823e0315..0000000000000 --- a/src/lightning/app/cli/app-template/placeholdername/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from placeholdername.components.component_a import ComponentA -from placeholdername.components.component_b import ComponentB - -__all__ = ["ComponentA", "ComponentB"] diff --git a/src/lightning/app/cli/app-template/placeholdername/components/component_a/__init__.py b/src/lightning/app/cli/app-template/placeholdername/components/component_a/__init__.py deleted file mode 100644 index 82753954e0e03..0000000000000 --- a/src/lightning/app/cli/app-template/placeholdername/components/component_a/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from placeholdername.components.component_a.component_a import ComponentA - -__all__ = ["ComponentA"] diff --git a/src/lightning/app/cli/app-template/placeholdername/components/component_a/component_a.py b/src/lightning/app/cli/app-template/placeholdername/components/component_a/component_a.py deleted file mode 100644 index e11ff40c299db..0000000000000 --- a/src/lightning/app/cli/app-template/placeholdername/components/component_a/component_a.py +++ /dev/null @@ -1,6 +0,0 @@ -from lightning.app import LightningFlow - - -class ComponentA(LightningFlow): - def run(self): - print("hello from component A") diff --git a/src/lightning/app/cli/app-template/placeholdername/components/component_b/__init__.py b/src/lightning/app/cli/app-template/placeholdername/components/component_b/__init__.py deleted file mode 100644 index 876454576ad90..0000000000000 --- a/src/lightning/app/cli/app-template/placeholdername/components/component_b/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from 
placeholdername.components.component_b.component_a import ComponentB - -__all__ = ["ComponentB"] diff --git a/src/lightning/app/cli/app-template/placeholdername/components/component_b/component_a.py b/src/lightning/app/cli/app-template/placeholdername/components/component_b/component_a.py deleted file mode 100644 index d80505d986026..0000000000000 --- a/src/lightning/app/cli/app-template/placeholdername/components/component_b/component_a.py +++ /dev/null @@ -1,6 +0,0 @@ -from lightning.app import LightningFlow - - -class ComponentB(LightningFlow): - def run(self): - print("hello from component B") diff --git a/src/lightning/app/cli/app-template/requirements.txt b/src/lightning/app/cli/app-template/requirements.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/app-template/setup.py b/src/lightning/app/cli/app-template/setup.py deleted file mode 100644 index c398ca985f759..0000000000000 --- a/src/lightning/app/cli/app-template/setup.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python - -from setuptools import find_packages, setup - -setup( - name="placeholdername", - version="0.0.0", - description="⚡ Lightning app ⚡ generated with command: lightning init app", - author="", - author_email="", - # REPLACE WITH YOUR OWN GITHUB PROJECT LINK - url="https://github.com/Lightning-AI/lightning-app-template", - install_requires=[], - packages=find_packages(), -) diff --git a/src/lightning/app/cli/app-template/tests/README.md b/src/lightning/app/cli/app-template/tests/README.md deleted file mode 100644 index 85e8c7faa08f9..0000000000000 --- a/src/lightning/app/cli/app-template/tests/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Run tests - -To run the tests: - -```bash -# go to your app folder -cd placeholdername - -# go to tests folder -cd tests - -# install testing deps -pip install -r requirements.txt - -# run tests -pytest . -``` diff --git a/src/lightning/app/cli/app-template/tests/__init__.py b/src/lightning/app/cli/app-template/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/app-template/tests/requirements.txt b/src/lightning/app/cli/app-template/tests/requirements.txt deleted file mode 100644 index 3185d1c44f033..0000000000000 --- a/src/lightning/app/cli/app-template/tests/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -coverage -codecov>=2.1 -pytest>=5.0.0 -pytest-cov -pytest-flake8 -flake8 -check-manifest -twine==4.0.1 diff --git a/src/lightning/app/cli/app-template/tests/test_placeholdername_app.py b/src/lightning/app/cli/app-template/tests/test_placeholdername_app.py deleted file mode 100644 index 6c7743b93ce1e..0000000000000 --- a/src/lightning/app/cli/app-template/tests/test_placeholdername_app.py +++ /dev/null @@ -1,44 +0,0 @@ -r""" -To test a lightning app: -1. Use LightningTestApp which is a subclass of LightningApp. -2. Subclass run_once in LightningTestApp. -3. in run_once, come up with a way to verify the behavior you wanted. 
- -run_once runs your app through one cycle of the event loop and then terminates -""" - -import io -import os -from contextlib import redirect_stdout - -from lightning.app.testing.testing import LightningTestApp, application_testing - - -class LightningAppTestInt(LightningTestApp): - def run_once(self) -> bool: - f = io.StringIO() - with redirect_stdout(f): - super().run_once() - out = f.getvalue() - assert out == "hello from component A\nhello from component B\n" - return True - - -def test_templatename_app(): - start_dir = os.getcwd() - os.chdir("..") - - cwd = os.getcwd() - cwd = os.path.join(cwd, "placeholdername/app.py") - command_line = [ - cwd, - "--blocking", - "False", - "--open-ui", - "False", - ] - result = application_testing(LightningAppTestInt, command_line) - assert result.exit_code == 0 - - # reset dir - os.chdir(start_dir) diff --git a/src/lightning/app/cli/cmd_apps.py b/src/lightning/app/cli/cmd_apps.py deleted file mode 100644 index d8d7deace2bb4..0000000000000 --- a/src/lightning/app/cli/cmd_apps.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -from datetime import datetime -from typing import List, Optional - -from lightning_cloud.openapi import ( - Externalv1LightningappInstance, - Externalv1Lightningwork, - V1LightningappInstanceState, - V1LightningappInstanceStatus, -) -from rich.console import Console -from rich.table import Table -from rich.text import Text - -from lightning.app.cli.core import Formatable -from lightning.app.utilities.cloud import _get_project -from lightning.app.utilities.network import LightningClient - - -class _AppManager: - """_AppManager implements API calls specific to Lightning AI BYOC apps.""" - - def __init__(self) -> None: - self.api_client = LightningClient(retry=False) - - def get_app(self, app_id: str) -> Externalv1LightningappInstance: - project = _get_project(self.api_client) - return self.api_client.lightningapp_instance_service_get_lightningapp_instance( - project_id=project.project_id, id=app_id - ) - - def list_apps(self, limit: int = 100, phase_in: Optional[List[str]] = None) -> List[Externalv1LightningappInstance]: - phase_in = phase_in or [] - project = _get_project(self.api_client) - - kwargs = { - "project_id": project.project_id, - "limit": limit, - "phase_in": phase_in, - } - - resp = self.api_client.lightningapp_instance_service_list_lightningapp_instances(**kwargs) - apps = resp.lightningapps - while resp.next_page_token is not None and resp.next_page_token != "": - kwargs["page_token"] = resp.next_page_token - resp = self.api_client.lightningapp_instance_service_list_lightningapp_instances(**kwargs) - apps = apps + resp.lightningapps - return apps - - def list_components(self, app_id: str, phase_in: Optional[List[str]] = None) -> List[Externalv1Lightningwork]: - phase_in = phase_in or [] - project = _get_project(self.api_client) - resp = self.api_client.lightningwork_service_list_lightningwork( - project_id=project.project_id, - 
app_id=app_id, - phase_in=phase_in, - ) - return resp.lightningworks - - def list(self, limit: int = 100) -> None: - console = Console() - console.print(_AppList(self.list_apps(limit=limit)).as_table()) - - def delete(self, app_id: str) -> None: - project = _get_project(self.api_client) - self.api_client.lightningapp_instance_service_delete_lightningapp_instance( - project_id=project.project_id, - id=app_id, - ) - - -class _AppList(Formatable): - def __init__(self, apps: List[Externalv1LightningappInstance]) -> None: - self.apps = apps - - @staticmethod - def _textualize_state_transitions( - desired_state: V1LightningappInstanceState, current_state: V1LightningappInstanceStatus - ) -> Text: - phases = { - V1LightningappInstanceState.IMAGE_BUILDING: Text("building image", style="bold yellow"), - V1LightningappInstanceState.PENDING: Text("pending", style="bold yellow"), - V1LightningappInstanceState.RUNNING: Text("running", style="bold green"), - V1LightningappInstanceState.FAILED: Text("failed", style="bold red"), - V1LightningappInstanceState.STOPPED: Text("stopped"), - V1LightningappInstanceState.NOT_STARTED: Text("not started"), - V1LightningappInstanceState.DELETED: Text("deleted", style="bold red"), - V1LightningappInstanceState.UNSPECIFIED: Text("unspecified", style="bold red"), - } - - if current_state.phase == V1LightningappInstanceState.UNSPECIFIED and current_state.start_timestamp is None: - return Text("not yet started", style="bold yellow") - - if ( - desired_state == V1LightningappInstanceState.DELETED - and current_state.phase != V1LightningappInstanceState.DELETED - ): - return Text("terminating", style="bold red") - - if ( - any( - phase == current_state.phase - for phase in [V1LightningappInstanceState.PENDING, V1LightningappInstanceState.STOPPED] - ) - and desired_state == V1LightningappInstanceState.RUNNING - ): - return Text("restarting", style="bold yellow") - - return phases[current_state.phase] - - def as_json(self) -> str: - return json.dumps(self.apps) - - def as_table(self) -> Table: - table = Table("id", "name", "status", "created", show_header=True, header_style="bold green") - - for app in self.apps: - status = self._textualize_state_transitions(desired_state=app.spec.desired_state, current_state=app.status) - - # this guard is necessary only until 0.3.93 releases which includes the `created_at` - # field to the external API - created_at = datetime.now() - if hasattr(app, "created_at"): - created_at = app.created_at - - table.add_row( - app.id, - app.name, - status, - created_at.strftime("%Y-%m-%d") if created_at else "", - ) - return table diff --git a/src/lightning/app/cli/cmd_init.py b/src/lightning/app/cli/cmd_init.py deleted file mode 100644 index db83fd41e47d9..0000000000000 --- a/src/lightning/app/cli/cmd_init.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
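-# -# Note: this module scaffolds a new app or component by copying the bundled template directory and renaming every `placeholdername` occurrence.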
- -import os -import re -import shutil -from typing import List, Optional, Tuple - -from lightning.app.utilities.app_helpers import Logger - -logger = Logger(__name__) - - -def app(app_name: str) -> None: - if app_name is None: - app_name = _capture_valid_app_component_name(resource_type="app") - - # generate resource template - new_resource_name, _ = _make_resource(resource_dir="app-template", resource_name=app_name) - - m = f""" - ⚡ Lightning app template created! ⚡ - {new_resource_name} - - run your app with: - lightning run app {app_name}/app.py - - run it on the cloud to share with your collaborators: - lightning run app {app_name}/app.py --cloud - """ - logger.info(m) - - -def _make_resource(resource_dir: str, resource_name: str) -> Tuple[str, str]: - path = os.path.dirname(os.path.abspath(__file__)) - template_dir = os.path.join(path, resource_dir) - name_for_files = re.sub("-", "_", resource_name) - - new_resource_name = os.path.join(os.getcwd(), resource_name) - - # lay out scaffolding - logger.info(f"laying out component template at {new_resource_name}") - shutil.copytree(template_dir, new_resource_name) - - # rename main folder - os.rename(os.path.join(new_resource_name, "placeholdername"), os.path.join(new_resource_name, name_for_files)) - - # for each file, rename the word - trouble_names = {".DS_Store"} - files = _ls_recursively(new_resource_name) - for bad_file in files: - if bad_file.split("/")[-1] in trouble_names: - continue - # find the words and replace - with open(bad_file) as fo: - content = fo.read().replace("placeholdername", name_for_files) - with open(bad_file, "w") as fw: - fw.write(content) - - # rename files - for file_name in files: - new_file = re.sub("placeholdername", name_for_files, file_name) - os.rename(file_name, new_file) - - return new_resource_name, name_for_files - - -def _ls_recursively(dir_name: str) -> List[str]: - fname = [] - for root, d_names, f_names in os.walk(dir_name): - for f in f_names: - if "__pycache__" not in root: - fname.append(os.path.join(root, f)) - - return fname - - -def _capture_valid_app_component_name(value: Optional[str] = None, resource_type: str = "app") -> str: - prompt = f""" - ⚡ Creating Lightning {resource_type} ⚡ - """ - logger.info(prompt) - - try: - if value is None: - value = input(f"\nName your Lightning {resource_type} (example: the-{resource_type}-name) > ") - value = value.strip().lower() - unsafe_chars = set(re.findall(r"[^a-z0-9\-]", value)) - if len(unsafe_chars) > 0: - m = f""" - Error: your Lightning {resource_type} name: - {value} - - contains the following unsupported characters: - {unsafe_chars} - - A Lightning {resource_type} name can only contain letters (a-z) numbers (0-9) and the '-' character - - valid example: - lightning-{resource_type} - """ - raise SystemExit(m) - - except KeyboardInterrupt: - raise SystemExit( - f""" - ⚡ {resource_type} init aborted! ⚡ - """ - ) - - return value - - -def component(component_name: str) -> None: - if component_name is None: - component_name = _capture_valid_app_component_name(resource_type="component") - - # generate resource template - new_resource_name, name_for_files = _make_resource(resource_dir="component-template", resource_name=component_name) - - m = f""" - ⚡ Lightning component template created! ⚡ - {new_resource_name} - - ⚡ To use your component, first pip install it (with these 3 commands): ⚡ - cd {component_name} - pip install -r requirements.txt - pip install -e . 
- - ⚡ Use the component inside an app: ⚡ - - from {name_for_files} import TemplateComponent - import lightning.app as la - - class LitApp(la.LightningFlow): - def __init__(self) -> None: - super().__init__() - self.{name_for_files} = TemplateComponent() - - def run(self): - print('this is a simple Lightning app to verify your component is working as expected') - self.{name_for_files}.run() - - app = la.LightningApp(LitApp()) - - ⚡ Check out the demo app with your {component_name} component: ⚡ - lightning run app {component_name}/app.py - - ⚡ Tip: Publish your component to the Lightning Gallery to enable users to install it like so: - lightning install component YourLightningUserName/{component_name} - - so the Lightning community can use it like: - from {name_for_files} import TemplateComponent - - """ - logger.info(m) diff --git a/src/lightning/app/cli/cmd_install.py b/src/lightning/app/cli/cmd_install.py deleted file mode 100644 index b43aa3f88fac9..0000000000000 --- a/src/lightning/app/cli/cmd_install.py +++ /dev/null @@ -1,657 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import shutil -import subprocess -import sys -from typing import Dict, Optional, Tuple - -import click -import requests -from packaging.version import Version - -from lightning.app.core.constants import LIGHTNING_APPS_PUBLIC_REGISTRY, LIGHTNING_COMPONENT_PUBLIC_REGISTRY -from lightning.app.utilities.app_helpers import Logger - -logger = Logger(__name__) - - -@click.group(name="install") -def install() -> None: - """Install Lightning AI resources.""" - pass - - -@install.command("app") -@click.argument("name", type=str) -@click.option( - "--yes", - "-y", - is_flag=True, - help="disables prompt to ask permission to create env and run install cmds", -) -@click.option( - "--version", - "-v", - type=str, - help="Specify the version to install. By default it uses 'latest'", - default="latest", - show_default=True, -) -@click.option( - "--overwrite", - "-f", - is_flag=True, - default=False, - help="When set, overwrite the app directory without asking if it already exists.", -) -def install_app(name: str, yes: bool, version: str, overwrite: bool = False) -> None: - _install_app_command(name, yes, version, overwrite=overwrite) - - -@install.command("component") -@click.argument("name", type=str) -@click.option( - "--yes", - "-y", - is_flag=True, - help="disables prompt to ask permission to create env and run install cmds", -) -@click.option( - "--version", - "-v", - type=str, - help="Specify the version to install. By default it uses 'latest'", - default="latest", - show_default=True, -) -def install_component(name: str, yes: bool, version: str) -> None: - _install_component_command(name, yes, version) - - -def _install_app_command(name: str, yes: bool, version: str, overwrite: bool = False) -> None: - if "github.com" in name: - if version != "latest": - logger.warn( - "When installing from GitHub, only the 'latest' version is supported. 
" - f"The provided version ({version}) will be ignored." - ) - return non_gallery_app(name, yes, overwrite=overwrite) - - return gallery_app(name, yes, version, overwrite=overwrite) - - -def _install_component_command(name: str, yes: bool, version: str, overwrite: bool = False) -> None: - if "github.com" in name: - if version != "latest": - logger.warn( - "When installing from GitHub, only the 'latest' version is supported. " - f"The provided version ({version}) will be ignored." - ) - return non_gallery_component(name, yes) - - return gallery_component(name, yes, version) - - -def gallery_apps_and_components( - name: str, yes_arg: bool, version_arg: str, cwd: Optional[str] = None, overwrite: bool = False -) -> Optional[str]: - try: - org, app_or_component = name.split("/") - except Exception: - return None - - entry, kind = _resolve_entry(name, version_arg) - - if kind == "app": - # give the user the chance to do a manual install - source_url, git_url, folder_name, git_sha = _show_install_app_prompt( - entry, app_or_component, org, yes_arg, resource_type="app" - ) - # run installation if requested - _install_app_from_source(source_url, git_url, folder_name, cwd=cwd, overwrite=overwrite, git_sha=git_sha) - - return os.path.join(os.getcwd(), *entry["appEntrypointFile"].split("/")) - - if kind == "component": - # give the user the chance to do a manual install - source_url, git_url, folder_name, git_sha = _show_install_app_prompt( - entry, app_or_component, org, yes_arg, resource_type="component" - ) - if "@" in git_url: - git_url = git_url.split("git+")[1].split("@")[0] - # run installation if requested - _install_app_from_source(source_url, git_url, folder_name, cwd=cwd, overwrite=overwrite, git_sha=git_sha) - - return os.path.join(os.getcwd(), *entry["entrypointFile"].split("/")) - - return None - - -def gallery_component(name: str, yes_arg: bool, version_arg: str, cwd: Optional[str] = None) -> str: - # make sure org/component-name name is correct - org, component = _validate_name(name, resource_type="component", example="lightning/LAI-slack-component") - - # resolve registry (orgs can have a private registry through their environment variables) - registry_url = _resolve_component_registry() - - # load the component resource - component_entry = _resolve_resource(registry_url, name=name, version_arg=version_arg, resource_type="component") - - # give the user the chance to do a manual install - git_url = _show_install_component_prompt(component_entry, component, org, yes_arg) - - # run installation if requested - _install_component_from_source(git_url) - - return os.path.join(os.getcwd(), component_entry["entrypointFile"]) - - -def non_gallery_component(gh_url: str, yes_arg: bool, cwd: Optional[str] = None) -> None: - # give the user the chance to do a manual install - git_url = _show_non_gallery_install_component_prompt(gh_url, yes_arg) - - # run installation if requested - _install_component_from_source(git_url) - - -def gallery_app(name: str, yes_arg: bool, version_arg: str, cwd: Optional[str] = None, overwrite: bool = False) -> str: - # make sure org/app-name syntax is correct - org, app = _validate_name(name, resource_type="app", example="lightning/quick-start") - - # resolve registry (orgs can have a private registry through their environment variables) - registry_url = _resolve_app_registry() - - # load the app resource - app_entry = _resolve_resource(registry_url, name=name, version_arg=version_arg, resource_type="app") - - # give the user the chance to do a manual install - 
source_url, git_url, folder_name, git_sha = _show_install_app_prompt( - app_entry, app, org, yes_arg, resource_type="app" - ) - - # run installation if requested - _install_app_from_source(source_url, git_url, folder_name, cwd=cwd, overwrite=overwrite, git_sha=git_sha) - - return os.path.join(os.getcwd(), folder_name, app_entry["appEntrypointFile"]) - - -def non_gallery_app(gh_url: str, yes_arg: bool, cwd: Optional[str] = None, overwrite: bool = False) -> None: - # give the user the chance to do a manual install - repo_url, folder_name = _show_non_gallery_install_app_prompt(gh_url, yes_arg) - - # run installation if requested - _install_app_from_source(repo_url, repo_url, folder_name, cwd=cwd, overwrite=overwrite) - - -def _show_install_component_prompt(entry: Dict[str, str], component: str, org: str, yes_arg: bool) -> str: - git_url = entry["gitUrl"] - - # yes arg does not prompt the user for permission to install anything - # automatically creates env and sets up the project - if yes_arg: - return git_url - - prompt = f""" - ⚡ Installing Lightning component ⚡ - - component name : {component} - developer : {org} - - Installation runs the following command for you: - - pip install {git_url} - """ - logger.info(prompt) - - try: - value = input("\nPress enter to continue: ") - value = value.strip().lower() - should_install = len(value) == 0 or value in {"y", "yes", 1} - if not should_install: - raise KeyboardInterrupt() - - return git_url - except KeyboardInterrupt: - repo = entry["sourceUrl"] - raise SystemExit( - f""" - ⚡ Installation aborted! ⚡ - - Install the component yourself by visiting: - {repo} - """ - ) - - -def _show_non_gallery_install_component_prompt(gh_url: str, yes_arg: bool) -> str: - if ".git@" not in gh_url: - m = """ - Error, your github url must be in the following format: - git+https://github.com/OrgName/repo-name.git@ALongCommitSHAString - - Example: - git+https://github.com/Lightning-AI/LAI-slack-messenger.git@14f333456ffb6758bd19458e6fa0bf12cf5575e1 - """ - raise SystemExit(m) - - developer = gh_url.split("/")[3] - component_name = gh_url.split("/")[4].split(".git")[0] - repo_url = re.search(r"git\+(.*).git", gh_url).group(1) # type: ignore - - # yes arg does not prompt the user for permission to install anything - # automatically creates env and sets up the project - if yes_arg: - return gh_url - - prompt = f""" - ⚡ Installing Lightning component ⚡ - - component name : {component_name} - developer : {developer} - - ⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡ - WARNING: this is NOT an official Lightning Gallery component - Install at your own risk - ⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡ - - Installation runs the following command for you: - - pip install {gh_url} - """ - logger.info(prompt) - - try: - value = input("\nPress enter to continue: ") - value = value.strip().lower() - should_install = len(value) == 0 or value in {"y", "yes", 1} - if not should_install: - raise KeyboardInterrupt() - - return gh_url - except KeyboardInterrupt: - raise SystemExit( - f""" - ⚡ Installation aborted! 
-
-            Install the component yourself by visiting:
-            {repo_url}
-            """
-        )
-
-
-def _show_install_app_prompt(
-    entry: Dict[str, str], app: str, org: str, yes_arg: bool, resource_type: str
-) -> Tuple[str, str, str, Optional[str]]:
-    source_url = entry["sourceUrl"]  # This URL is used only to display the repo and extract folder name
-    full_git_url = entry["gitUrl"]  # Used to clone the repo (can include tokens for private repos)
-    git_url_parts = full_git_url.split("#ref=")
-    git_url = git_url_parts[0]
-    git_sha = git_url_parts[1] if len(git_url_parts) == 2 else None
-
-    folder_name = source_url.split("/")[-1]
-
-    # yes arg does not prompt the user for permission to install anything
-    # automatically creates env and sets up the project
-    if yes_arg:
-        return source_url, git_url, folder_name, git_sha
-
-    prompt = f"""
-    ⚡ Installing Lightning {resource_type} ⚡
-
-    {resource_type} name : {app}
-    developer : {org}
-
-    Installation creates and runs the following commands for you:
-
-    git clone {source_url}
-    cd {folder_name}
-    pip install -r requirements.txt
-    pip install -e .
-    """
-    logger.info(prompt)
-
-    try:
-        value = input("\nPress enter to continue: ")
-        value = value.strip().lower()
-        should_install = len(value) == 0 or value in {"y", "yes", "1"}
-        if not should_install:
-            raise KeyboardInterrupt()
-
-        return source_url, git_url, folder_name, git_sha
-    except KeyboardInterrupt:
-        repo = entry["sourceUrl"]
-        raise SystemExit(
-            f"""
-            ⚡ Installation aborted! ⚡
-
-            Install the {resource_type} yourself by visiting:
-            {repo}
-            """
-        )
-
-
-def _show_non_gallery_install_app_prompt(gh_url: str, yes_arg: bool) -> Tuple[str, str]:
-    try:
-        if gh_url.endswith(".git"):
-            # folder_name when it's a GH url with .git
-            folder_name = gh_url.split("/")[-1]
-            folder_name = folder_name[:-4]
-        else:
-            # the last part of the url is the folder name otherwise
-            folder_name = gh_url.split("/")[-1]
-
-        org = re.search(r"github.com\/(.*)\/", gh_url).group(1)  # type: ignore
-    except Exception:
-        raise SystemExit(
-            """
-            Your GitHub URL is not supported. Here's the supported format:
-            https://github.com/YourOrgName/your-repo-name
-
-            Example:
-            https://github.com/Lightning-AI/lightning
-            """
-        )
-
-    # yes arg does not prompt the user for permission to install anything
-    # automatically creates env and sets up the project
-    if yes_arg:
-        return gh_url, folder_name
-
-    prompt = f"""
-    ⚡ Installing Lightning app ⚡
-
-    app source : {gh_url}
-    developer : {org}
-
-    ⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡
-    WARNING: this is NOT an official Lightning Gallery app
-    Install at your own risk
-    ⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡⚡
-
-    Installation creates and runs the following commands for you:
-
-    git clone {gh_url}
-    cd {folder_name}
-    pip install -r requirements.txt
-    pip install -e .
-    """
-    logger.info(prompt)
-
-    try:
-        value = input("\nPress enter to continue: ")
-        value = value.strip().lower()
-        should_install = len(value) == 0 or value in {"y", "yes", "1"}
-        if not should_install:
-            raise KeyboardInterrupt()
-
-        return gh_url, folder_name
-    except KeyboardInterrupt:
-        raise SystemExit(
-            f"""
-            ⚡ Installation aborted! ⚡
-
-            Install the app yourself by visiting {gh_url}
-            """
-        )
-
-
-def _validate_name(name: str, resource_type: str, example: str) -> Tuple[str, str]:
-    # ensure resource identifier is properly formatted
-    try:
-        org, resource = name.split("/")
-    except Exception:
-        raise SystemExit(
-            f"""
-            {resource_type} name format must have organization/{resource_type}-name
-
-            Examples:
-            {example}
-            user/{resource_type}-name
-
-            You passed in: {name}
-            """
-        )
-    return org, resource
-
-
-def _resolve_entry(name, version_arg) -> Tuple[Optional[Dict], Optional[str]]:
-    entry = None
-    kind = None
-
-    # resolve registry (orgs can have a private registry through their environment variables)
-    registry_url = _resolve_app_registry()
-
-    # load the app resource
-    entry = _resolve_resource(registry_url, name=name, version_arg=version_arg, resource_type="app", raise_error=False)
-
-    if not entry:
-        registry_url = _resolve_component_registry()
-
-        # load the component resource
-        entry = _resolve_resource(
-            registry_url, name=name, version_arg=version_arg, resource_type="component", raise_error=False
-        )
-        kind = "component" if entry else None
-
-    else:
-        kind = "app"
-
-    return entry, kind
-
-
-def _resolve_resource(
-    registry_url: str, name: str, version_arg: str, resource_type: str, raise_error: bool = True
-) -> Optional[Dict[str, str]]:
-    gallery_entries = []
-    try:
-        response = requests.get(registry_url)
-        data = response.json()
-
-        if resource_type == "app":
-            gallery_entries = [a for a in data["apps"] if a["canDownloadSourceCode"]]
-
-        elif resource_type == "component":
-            gallery_entries = data["components"]
-    except requests.ConnectionError:
-        sys.tracebacklimit = 0
-        raise SystemError(
-            f"""
-            Network connection error, could not load list of available Lightning {resource_type}s.
-
-            Try again when you have a network connection!
-            """
-        )
-
-    entries = []
-    all_versions = []
-    for x in gallery_entries:
-        if name == x["name"]:
-            entries.append(x)
-            all_versions.append(x["version"])
-
-    if len(entries) == 0:
-        if raise_error:
-            raise SystemExit(f"{resource_type}: '{name}' is not available on ⚡ Lightning AI ⚡")
-
-        return None
-
-    entry = None
-    if version_arg == "latest":
-        entry = max(entries, key=lambda app: Version(app["version"]))
-    else:
-        for e in entries:
-            if e["version"] == version_arg:
-                entry = e
-                break
-        if entry is None:
-            if raise_error:
-                raise Exception(
-                    f"{resource_type}: 'Version {version_arg} for {name}' is not available on ⚡ Lightning AI ⚡. "
-                    f"Here is the list of all available versions:{os.linesep}{os.linesep.join(all_versions)}"
-                )
-            return None
-
-    return entry
-
-
-def _install_with_env(repo_url: str, folder_name: str, cwd: Optional[str] = None) -> None:
-    if not cwd:
-        cwd = os.getcwd()
-
-    # clone repo
-    logger.info(f"⚡ RUN: git clone {repo_url}")
-    subprocess.call(["git", "clone", repo_url])
-
-    # step into the repo folder
-    os.chdir(f"{folder_name}")
-    cwd = os.getcwd()
-
-    # create env
-    logger.info(f"⚡ CREATE: virtual env at {cwd}")
-    subprocess.call(["python", "-m", "venv", cwd])
-
-    # activate and install reqs
-    # TODO: remove shell=True... but need to run command in venv
-    logger.info("⚡ RUN: install requirements (pip install -r requirements.txt)")
-    subprocess.call("source bin/activate && pip install -r requirements.txt", shell=True)
-
-    # install project
-    # TODO: remove shell=True... but need to run command in venv
-    logger.info("⚡ RUN: setting up project (pip install -e .)")
-    subprocess.call("source bin/activate && pip install -e .", shell=True)
-
-    m = f"""
-    ⚡ Installed! ⚡ to use your app
-    go into the folder: cd {folder_name}
-    activate the environment: source bin/activate
-    run the app: lightning run app [the_app_file.py]
-    """
-    logger.info(m)
-
-
-def _install_app_from_source(
-    source_url: str,
-    git_url: str,
-    folder_name: str,
-    cwd: Optional[str] = None,
-    overwrite: bool = False,
-    git_sha: Optional[str] = None,
-) -> None:
-    """Install a Lightning app from the `git_url`.
-
-    Args:
-        source_url:
-            Source repo URL without any tokens and params; this param is used only for displaying.
-        git_url:
-            Repo URL that is used to clone; this can contain tokens.
-        folder_name:
-            Where to clone the repo.
-        cwd:
-            Working directory. If not specified, the current working directory is used.
-        overwrite:
-            If True, overwrite the app directory without asking if it already exists.
-        git_sha:
-            The git SHA for checking out the git repo of the app.
-
-    """
-
-    if not cwd:
-        cwd = os.getcwd()
-
-    destination = os.path.join(cwd, folder_name)
-    if os.path.exists(destination):
-        if not overwrite:
-            raise SystemExit(
-                f"Folder {folder_name} exists, please delete it and try again, "
-                f"or force to overwrite the existing folder by passing `--overwrite`.",
-            )
-        shutil.rmtree(destination)
-    # clone repo
-    logger.info(f"⚡ RUN: git clone {source_url}")
-    try:
-        subprocess.check_output(["git", "clone", git_url], stderr=subprocess.STDOUT)
-    except subprocess.CalledProcessError as ex:
-        if "Repository not found" in str(ex.output):
-            raise SystemExit(
-                f"""
-                Looks like the GitHub URL was not found or doesn't exist. Do you have a typo?
-                {source_url}
-                """
-            )
-        raise Exception(ex)
-
-    # step into the repo folder
-    os.chdir(f"{folder_name}")
-    cwd = os.getcwd()
-
-    try:
-        if git_sha:
-            subprocess.check_output(["git", "checkout", git_sha], stderr=subprocess.STDOUT)
-    except subprocess.CalledProcessError as ex:
-        if "did not match any" in str(ex.output):
-            raise SystemExit("Looks like the git SHA is not valid or doesn't exist in the app repo.")
-        raise Exception(ex)
-
-    # activate and install reqs
-    # TODO: remove shell=True... but need to run command in venv
-    logger.info("⚡ RUN: install requirements (pip install -r requirements.txt)")
-    subprocess.call("pip install -r requirements.txt", shell=True)
-
-    # install project
-    # TODO: remove shell=True... but need to run command in venv
-    logger.info("⚡ RUN: setting up project (pip install -e .)")
-    subprocess.call("pip install -e .", shell=True)
-
-    m = f"""
-    ⚡ Installed! ⚡ to use your app:
-
-    cd {folder_name}
-    lightning run app app.py
-    """
-    logger.info(m)
-
-
-def _install_component_from_source(git_url: str) -> None:
-    logger.info("⚡ RUN: pip install")
-
-    out = subprocess.check_output(["pip", "install", git_url])
-    possible_success_message = [x for x in str(out).split("\\n") if "Successfully installed" in x]
-    if len(possible_success_message) > 0:
-        uninstall_step = possible_success_message[0]
-        uninstall_step = re.sub("Successfully installed", "", uninstall_step).strip()
-        uninstall_step = re.sub("-0.0.0", "", uninstall_step).strip()
-        m = f"""
-        ⚡ Installed!
⚡ - - to use your component: - from the_component import TheClass - - make sure to add this entry to your Lightning APP requirements.txt file: - {git_url} - - if you want to uninstall, run this command: - pip uninstall {uninstall_step} - """ - logger.info(m) - - -def _resolve_app_registry() -> str: - return os.environ.get("LIGHTNING_APP_REGISTRY", LIGHTNING_APPS_PUBLIC_REGISTRY) - - -def _resolve_component_registry() -> str: - return os.environ.get("LIGHTNING_COMPONENT_REGISTRY", LIGHTNING_COMPONENT_PUBLIC_REGISTRY) diff --git a/src/lightning/app/cli/cmd_pl_init.py b/src/lightning/app/cli/cmd_pl_init.py deleted file mode 100644 index 2436c28179ef2..0000000000000 --- a/src/lightning/app/cli/cmd_pl_init.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pathlib -import re -import shutil -import subprocess -import sys -import tarfile -import urllib.request -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Any, Dict, List, Optional - -import click -from jinja2 import Environment, FileSystemLoader -from rich import print -from rich.panel import Panel -from rich.status import Status -from rich.text import Text -from rich.tree import Tree - -import lightning.app - -_REPORT_HELP_TEXTS = { - "core": "Important files for the app such as various components", - "source": "A copy of all your source code, including the PL script ⚡", - "tests": "This app comes with tests!", - "ui": "Source and build files for the user interface", - "app.py": "This is the main app file!", - "requirements.txt": "Lists the dependencies required to be installed before running the app", -} - -_REPORT_IGNORE_PATTERNS = [ - r"__pycache__", - r"__init__\.py", - r".*egg-info", - r"\..*", -] - - -def pl_app(source_dir: str, script_path: str, name: str, overwrite: bool) -> None: - source_dir = Path(source_dir).resolve() - script_path = Path(script_path).resolve() - - if not source_dir.is_dir(): - click.echo(f"The given source directory does not exist: {source_dir}", err=True) - raise SystemExit(1) - - if not script_path.exists(): - click.echo(f"The given script path does not exist: {script_path}", err=True) - raise SystemExit(1) - - if not script_path.is_file(): - click.echo(f"The given script path must be a file, you passed: {script_path}", err=True) - raise SystemExit(1) - - if source_dir not in script_path.parents: - click.echo( - "The given script path must be a subpath of the source directory. Example:" - " lightning init pl-app ./code ./code/scripts/train.py", - err=True, - ) - raise SystemExit(1) - - rel_script_path = script_path.relative_to(source_dir) - cwd = Path.cwd() - destination = cwd / name - - if destination.exists(): - if not overwrite: - click.echo( - f"There is already an app with the name {name} in the current working directory. 
Choose a different"
-                f" name with `--name` or force to overwrite the existing folder by passing `--overwrite`.",
-                err=True,
-            )
-            raise SystemExit(1)
-
-        shutil.rmtree(destination)
-
-    template_dir = Path(lightning.app.cli.__file__).parent / "pl-app-template"
-
-    with Status("[bold green]Copying app files"):
-        shutil.copytree(template_dir, destination, ignore=shutil.ignore_patterns("node_modules", "build"))
-        if (template_dir / "ui" / "build").exists():
-            shutil.copytree(template_dir / "ui" / "build", destination / "ui" / "build")
-        else:
-            download_frontend(destination / "ui" / "build")
-
-    with Status("[bold green]Copying source files"):
-        shutil.copytree(source_dir, destination / "source", ignore=shutil.ignore_patterns(name))
-        project_file_from_template(template_dir, destination, "app.py", script_path=str(rel_script_path))
-        project_file_from_template(template_dir, destination, "setup.py", app_name=name)
-
-    with Status("[bold green]Installing"):
-        subprocess.call(["pip", "install", "--quiet", "-e", str(destination)])
-    # TODO: download the ui files
-
-    print_pretty_report(
-        destination,
-        ignore_patterns=_REPORT_IGNORE_PATTERNS,
-        help_texts=_REPORT_HELP_TEXTS,
-    )
-
-
-def download_frontend(destination: Path) -> None:
-    # TODO: Update the URL to the release in GitHub once the PL app repo is public
-    url = "https://storage.googleapis.com/grid-packages/pytorch-lightning-app/v0.0.0/build.tar.gz"
-    build_dir_name = "build"
-    with TemporaryDirectory() as download_dir:
-        response = urllib.request.urlopen(url)  # noqa: S310
-        file = tarfile.open(fileobj=response, mode="r|gz")
-        file.extractall(path=download_dir)  # noqa: S202
-        shutil.move(str(Path(download_dir, build_dir_name)), destination)
-
-
-def project_file_from_template(template_dir: Path, destination_dir: Path, template_name: str, **kwargs: Any) -> None:
-    env = Environment(loader=FileSystemLoader(template_dir))  # noqa: S701
-    template = env.get_template(template_name)
-    rendered_template = template.render(**kwargs)
-    with open(destination_dir / template_name, "w") as file:
-        file.write(rendered_template)
-
-
-def print_pretty_report(
-    directory: pathlib.Path,
-    ignore_patterns: Optional[List[str]] = None,
-    help_texts: Optional[Dict[str, str]] = None,
-) -> None:
-    """Prints a report for the generated app."""
-    tree = Tree(
-        f":open_file_folder: [link file://{directory}]{directory}",
-        guide_style="bold bright_blue",
-    )
-
-    help_texts = {} if help_texts is None else help_texts
-
-    paths = sorted(
-        directory.glob("*"),
-        key=lambda p: (p.is_file(), p.name.lower()),
-    )
-    max_width = max(len(p.name) for p in paths)
-
-    patterns_to_ignore = [] if ignore_patterns is None else ignore_patterns
-    for path in paths:
-        if any(re.match(pattern, path.name) for pattern in patterns_to_ignore):
-            # Only display relevant files
-            continue
-
-        help_text = help_texts.get(path.name, "")
-        padding = " " * (max_width - len(path.name))
-
-        text_pathname = Text(path.name, "green")
-        text_pathname.highlight_regex(r"\..*$", "bold red")
-        text_pathname.stylize(f"link file://{path}")
-        text_pathname.append(f" {padding} {help_text}", "blue")
-
-        icon = "📂 " if path.is_dir() else "📄 "
-        icon = icon if _can_encode_icon(icon) else ""
-
-        tree.add(Text(icon) + text_pathname)
-
-    print("\n")
-    print("Done. The app is ready here:\n")
-    print(tree)
-    print("\nRun it:\n")
-    print(Panel(f"[red]lightning run app {directory.relative_to(Path.cwd()) / 'app.py'}"))
-
-
-def _can_encode_icon(icon: str) -> bool:
-    """Helper function to check whether an icon can be encoded."""
-    try:
-        icon.encode(sys.stdout.encoding)
-        return True
-    except UnicodeEncodeError:
-        return False
diff --git a/src/lightning/app/cli/cmd_react_ui_init.py b/src/lightning/app/cli/cmd_react_ui_init.py
deleted file mode 100644
index 22e668433e233..0000000000000
--- a/src/lightning/app/cli/cmd_react_ui_init.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright The Lightning AI team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import re
-import shutil
-import subprocess
-from typing import Optional
-
-from lightning.app.utilities.app_helpers import Logger
-
-logger = Logger(__name__)
-
-
-def react_ui(dest_dir: Optional[str] = None) -> None:
-    # verify all the prereqs for install are met
-    _check_react_prerequisites()
-
-    # copy template files to the dir
-    _copy_and_setup_react_ui(dest_dir)
-
-
-def _copy_and_setup_react_ui(dest_dir: Optional[str] = None) -> None:
-    logger.info("⚡ setting up react-ui template")
-    path = os.path.dirname(os.path.abspath(__file__))
-    template_dir = os.path.join(path, "react-ui-template")
-
-    if dest_dir is None:
-        dest_dir = os.path.join(os.getcwd(), "react-ui")
-
-    shutil.copytree(template_dir, dest_dir)
-
-    logger.info("⚡ install react project deps")
-    ui_path = os.path.join(dest_dir, "ui")
-    subprocess.run(f"cd {ui_path} && yarn install", shell=True)
-
-    logger.info("⚡ building react project")
-    subprocess.run(f"cd {ui_path} && yarn build", shell=True)
-
-    m = f"""
-    ⚡⚡ react-ui created! ⚡⚡
-
-    ⚡ Connect it to your component using `configure_layout`:
-
-    # Use a LightningFlow or LightningWork
-    class YourComponent(la.LightningFlow):
-        def configure_layout(self):
-            return la.frontend.StaticWebFrontend(Path(__file__).parent / "react-ui/src/dist")
-
-    ⚡ run the example_app.py to see it live!
-    lightning_app run app {dest_dir}/example_app.py
-
-    """
-    logger.info(m)
-
-
-def _check_react_prerequisites() -> None:
-    """Check that npm, node, and yarn are installed."""
-    missing_msgs = []
-    version_regex = r"\d{1,2}\.\d{1,2}\.\d{1,3}"
-
-    logger.info("Checking prerequisites for React")
-
-    # make sure npm is installed
-    npm_version = subprocess.check_output(["npm", "--version"])
-    has_npm = bool(re.search(version_regex, str(npm_version)))
-    npm_version = re.search(version_regex, str(npm_version))
-    npm_version = None if npm_version is None else npm_version.group(0)
-
-    if not has_npm:
-        m = """
-        This machine is missing 'npm'. Please install npm and rerun 'lightning_app init react-ui' again.
- - Install instructions: https://docs.npmjs.com/downloading-and-installing-node-js-and-npm - """ - missing_msgs.append(m) - - # make sure node is installed - node_version = subprocess.check_output(["node", "--version"]) - has_node = bool(re.search(version_regex, str(node_version))) - node_version = re.search(version_regex, str(node_version)) - node_version = None if node_version is None else node_version.group(0) - - if not has_node: - m = """ - This machine is missing 'node'. Please install node and rerun 'lightning_app init react-ui' again. - - Install instructions: https://docs.npmjs.com/downloading-and-installing-node-js-and-npm - """ - missing_msgs.append(m) - - # make sure yarn is installed - yarn_version = subprocess.check_output(["yarn", "--version"]) - has_yarn = bool(re.search(version_regex, str(yarn_version))) - yarn_version = re.search(version_regex, str(yarn_version)) - yarn_version = None if yarn_version is None else yarn_version.group(0) - - if not has_yarn: - m = """ - This machine is missing 'yarn'. Please install npm+node first, then run - - npm install --global yarn - - Full install instructions: https://classic.yarnpkg.com/lang/en/docs/install/#mac-stable - """ - missing_msgs.append(m) - - # exit or show success message - if len(missing_msgs) > 0: - missing_msg = "\n".join(missing_msgs) - raise SystemExit(missing_msg) - logger.info( - f""" - found npm version: {npm_version} - found node version: {node_version} - found yarn version: {yarn_version} - - Pre-requisites met! - """ - ) diff --git a/src/lightning/app/cli/commands/__init__.py b/src/lightning/app/cli/commands/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/commands/app_commands.py b/src/lightning/app/cli/commands/app_commands.py deleted file mode 100644 index bbecceabb6e28..0000000000000 --- a/src/lightning/app/cli/commands/app_commands.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import os
-import sys
-from typing import Dict, Optional
-
-import requests
-
-from lightning.app.cli.connect.app import (
-    _clean_lightning_connection,
-    _install_missing_requirements,
-    _resolve_command_path,
-)
-from lightning.app.utilities.cli_helpers import _LightningAppOpenAPIRetriever
-from lightning.app.utilities.commands.base import _download_command
-from lightning.app.utilities.enum import OpenAPITags
-
-
-def _is_running_help(argv) -> bool:
-    return argv[-1] in ["--help", "-h"] if argv else False
-
-
-def _run_app_command(app_name: str, app_id: Optional[str]):
-    """Execute a function in a running App from its name."""
-    # 1: Collect the url and comments from the running application
-    _clean_lightning_connection()
-
-    running_help = _is_running_help(sys.argv)
-
-    retriever = _LightningAppOpenAPIRetriever(app_id, use_cache=running_help)
-
-    if not running_help and (retriever.url is None or retriever.api_commands is None):
-        if app_name == "localhost":
-            print("The command couldn't be executed as your local Lightning App isn't running.")
-        else:
-            print(f"The command couldn't be executed as your cloud Lightning App `{app_name}` isn't running.")
-        sys.exit(0)
-
-    if not retriever.api_commands:
-        raise Exception("This application doesn't expose any commands yet.")
-
-    full_command = "_".join(sys.argv)
-
-    has_found = False
-    for command in list(retriever.api_commands):
-        if command in full_command:
-            has_found = True
-            for value in sys.argv:
-                if value == command and "_" in value:
-                    print(
-                        f"The command `{value}` was provided with an underscore and it isn't allowed. "
-                        f"Instead, use `lightning_app {value.replace('_', ' ')}`."
-                    )
-                    sys.exit(0)
-            break
-
-    if not has_found:
-        raise Exception(f"The provided command isn't available in {list(retriever.api_commands)}")
-
-    # 2: Send the command from the user
-    metadata = retriever.api_commands[command]
-
-    try:
-        # 3: Execute the command
-        if metadata["tag"] == OpenAPITags.APP_COMMAND:
-            _handle_command_without_client(command, metadata, retriever.url)
-        else:
-            _handle_command_with_client(command, metadata, app_name, app_id, retriever.url)
-    except ModuleNotFoundError:
-        _install_missing_requirements(retriever, fail_if_missing=True)
-
-    if running_help:
-        print("Your command execution was successful.")
-
-
-def _handle_command_without_client(command: str, metadata: Dict, url: str) -> None:
-    supported_params = list(metadata["parameters"])
-    if _is_running_help(sys.argv):
-        print(f"Usage: lightning_app {command} [ARGS]...")
-        print(" ")
-        print("Options")
-        for param in supported_params:
-            print(f" {param}: Add description")
-        return
-
-    provided_params = [param.replace("--", "") for param in sys.argv[1 + len(command.split("_")) :]]
-
-    # TODO: Add support for more argument types.
-    if any("=" not in param for param in provided_params):
-        raise Exception("Please, use --x=y syntax when providing the command arguments.")
-
-    if any(param.split("=")[0] not in supported_params for param in provided_params):
-        raise Exception(f"Some arguments need to be provided. The keys are {supported_params}.")
-
-    # TODO: Encode the parameters and validate their type.
-
-    query_parameters = "&".join(provided_params)
-    resp = requests.post(url + f"/command/{command}?{query_parameters}")
-    assert resp.status_code == 200, resp.json()
-    print(resp.json())
-
-
-def _handle_command_with_client(command: str, metadata: Dict, app_name: str, app_id: Optional[str], url: str):
-    debug_mode = bool(int(os.getenv("DEBUG", "0")))
-
-    if app_name == "localhost":
-        target_file = metadata["cls_path"]
-    else:
-        target_file = _resolve_command_path(command)
-
-    if debug_mode:
-        print(target_file)
-
-    client_command = _download_command(
-        command,
-        metadata["cls_path"],
-        metadata["cls_name"],
-        app_id,
-        debug_mode=debug_mode,
-        target_file=target_file if debug_mode else _resolve_command_path(command),
-    )
-    client_command._setup(command_name=command, app_url=url)
-    sys.argv = sys.argv[len(command.split("_")) :]
-    client_command.run()
diff --git a/src/lightning/app/cli/commands/cd.py b/src/lightning/app/cli/commands/cd.py
deleted file mode 100644
index 7f84b894bf155..0000000000000
--- a/src/lightning/app/cli/commands/cd.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Copyright The Lightning AI team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from typing import Optional, Tuple, Union
-
-import click
-from rich.live import Live
-from rich.spinner import Spinner
-from rich.text import Text
-
-from lightning.app.cli.commands import ls
-from lightning.app.cli.connect.app import _LIGHTNING_CONNECTION_FOLDER
-from lightning.app.utilities.app_helpers import Logger
-from lightning.app.utilities.cli_helpers import _error_and_exit
-
-logger = Logger(__name__)
-
-_HOME = os.path.expanduser("~")
-_CD_FILE = os.path.join(_LIGHTNING_CONNECTION_FOLDER, "cd.txt")
-
-
-@click.argument("path", nargs=-1)
-def cd(path: Optional[Union[Tuple[str], str]], verify: bool = True) -> None:
-    """Change the current directory within the Lightning Cloud filesystem."""
-    with Live(Spinner("point", text=Text("pending...", style="white")), transient=True) as live:
-        root = "/"
-
-        if isinstance(path, tuple) and len(path) > 0:
-            path = " ".join(path)
-
-        # handle ~/
-        if isinstance(path, str) and path.startswith(_HOME):
-            path = "/" + path.replace(_HOME, "")
-
-        # handle no path -> /
-        if path is None or len(path) == 0:
-            path = "/"
-
-        if not os.path.exists(_LIGHTNING_CONNECTION_FOLDER):
-            os.makedirs(_LIGHTNING_CONNECTION_FOLDER)
-
-        if not os.path.exists(_CD_FILE):
-            # Start from the root
-            if path.startswith(".."):
-                root = _apply_double_dots(root, path)
-
-            with open(_CD_FILE, "w") as f:
-                f.write(root + "\n")
-
-            live.stop()
-
-            print(f"cd {root}")
-
-            return root
-
-        # read from saved cd
-        with open(_CD_FILE) as f:
-            lines = f.readlines()
-            root = lines[0].replace("\n", "")
-
-        if verify:
-            if path.startswith("/"):
-                paths = [os.path.join(path, p) for p in ls.ls(path, print=False, use_live=False)]
-            else:
-                paths = [os.path.join(root, p) for p in ls.ls(root, print=False, use_live=False)]
-
-        # generate new root
-        if root == "/":
-            if path == "/":
-
root = "/" - elif not path.startswith(".."): - if not path.startswith("/"): - path = "/" + path - root = path - else: - root = _apply_double_dots(root, path) - else: - if path.startswith(".."): - root = _apply_double_dots(root, path) - elif path.startswith("~"): - root = path[2:] - else: - root = os.path.join(root, path) - - if verify and root != "/" and not any(p.startswith(root) or root.startswith(p) for p in paths): - _error_and_exit(f"no such file or directory: {path}") - - os.remove(_CD_FILE) - - # store new root - with open(_CD_FILE, "w") as f: - f.write(root + "\n") - - live.stop() - - print(f"cd {root}") - - return root - - -def _apply_double_dots(root: str, path: str) -> str: - splits = [split for split in path.split("/") if split != ""] - for split in splits: - root = "/" + os.path.join(*root.split("/")[:-1]) if split == ".." else os.path.join(root, split) - return root diff --git a/src/lightning/app/cli/commands/cp.py b/src/lightning/app/cli/commands/cp.py deleted file mode 100644 index 0b11a874b216d..0000000000000 --- a/src/lightning/app/cli/commands/cp.py +++ /dev/null @@ -1,350 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import concurrent -import contextlib -import os -import sys -from functools import partial -from multiprocessing.pool import ApplyResult -from pathlib import Path -from textwrap import dedent -from typing import Any, Optional, Tuple, Union - -import click -import requests -import urllib3 -from lightning_cloud.openapi import ( - Externalv1Cluster, - Externalv1LightningappInstance, - ProjectIdStorageBody, - V1CloudSpace, -) -from rich.live import Live -from rich.progress import BarColumn, DownloadColumn, Progress, TaskID, TextColumn -from rich.spinner import Spinner -from rich.text import Text - -from lightning.app.cli.commands.ls import _collect_artifacts, _get_prefix -from lightning.app.cli.commands.pwd import _pwd -from lightning.app.source_code import FileUploader -from lightning.app.utilities.app_helpers import Logger -from lightning.app.utilities.auth import _AuthTokenGetter -from lightning.app.utilities.cli_helpers import _error_and_exit -from lightning.app.utilities.network import LightningClient - -logger = Logger(__name__) - - -@click.argument("src_path", required=True) -@click.argument("dst_path", required=True) -@click.option("-r", required=False, hidden=True) -@click.option("--recursive", required=False, hidden=True) -@click.option("--zip", required=False, is_flag=True, default=False) -def cp(src_path: str, dst_path: str, r: bool = False, recursive: bool = False, zip: bool = False) -> None: - """Copy files between your local filesystem and the Lightning Cloud filesystem.""" - if sys.platform == "win32": - print("`cp` isn't supported on windows. 
Open an issue on Github.") - sys.exit(0) - - with Live(Spinner("point", text=Text("pending...", style="white")), transient=True) as live: - pwd = _pwd() - - client = LightningClient(retry=False) - - src_path, src_remote = _sanitize_path(src_path, pwd) - dst_path, dst_remote = _sanitize_path(dst_path, pwd) - - if src_remote and dst_remote: - return _error_and_exit("Moving files remotely isn't supported yet. Please, open a Github issue.") - - if not src_remote and dst_remote: - if dst_path == "/" or len(dst_path.split("/")) == 1: - return _error_and_exit("Uploading files at the project level isn't allowed yet.") - if zip: - return _error_and_exit("Zipping uploads isn't supported yet. Please, open a Github issue.") - _upload_files(live, client, src_path, dst_path, pwd) - return None - if src_remote and not dst_remote: - if zip: - return _zip_files(live, src_path, dst_path) - _download_files(live, client, src_path, dst_path, pwd) - return None - - return _error_and_exit("Moving files locally isn't supported yet. Please, open a Github issue.") - - -def _upload_files(live, client: LightningClient, local_src: str, remote_dst: str, pwd: str) -> str: - remote_splits = [split for split in remote_dst.split("/") if split != ""] - remote_dst = os.path.join(*remote_splits) - - if not os.path.exists(local_src): - return _error_and_exit(f"The provided source path {local_src} doesn't exist.") - - lit_resource = None - - if len(remote_splits) > 1: - project_id, lit_resource = _get_project_id_and_resource(pwd) - else: - project_id = _get_project_id_from_name(remote_dst) - - if len(remote_splits) > 2: - remote_dst = os.path.join(*remote_splits[2:]) - - local_src = Path(local_src).resolve() - upload_paths = [] - - if os.path.isdir(local_src): - for root_dir, _, paths in os.walk(local_src): - for path in paths: - upload_paths.append(os.path.join(root_dir, path)) - else: - upload_paths = [local_src] - - _upload_urls = [] - - clusters = client.projects_service_list_project_cluster_bindings(project_id) - - live.stop() - - for upload_path in upload_paths: - for cluster in clusters.clusters: - filename = str(upload_path).replace(str(os.getcwd()), "")[1:] - filename = _get_prefix(os.path.join(remote_dst, filename), lit_resource) if lit_resource else "/" + filename - - response = client.lightningapp_instance_service_upload_project_artifact( - project_id=project_id, - body=ProjectIdStorageBody(cluster_id=cluster.cluster_id, filename=filename), - async_req=True, - ) - _upload_urls.append(response) - - upload_urls = [] - for upload_url in _upload_urls: - upload_urls.extend(upload_url.get().urls) - - live.stop() - - if not upload_paths: - print("There were no files to upload.") - return None - - progress = _get_progress_bar() - - total_size = sum([Path(path).stat().st_size for path in upload_paths]) // max(len(clusters.clusters), 1) - task_id = progress.add_task("upload", filename="", total=total_size) - - progress.start() - - _upload_partial = partial(_upload, progress=progress, task_id=task_id) - - with concurrent.futures.ThreadPoolExecutor(4) as executor: - results = executor.map(_upload_partial, upload_paths, upload_urls) - - progress.stop() - - # Raise the first exception found - exception = next((e for e in results if isinstance(e, Exception)), None) - if exception: - _error_and_exit("We detected errors in uploading your files.") - return None - return None - - -def _upload(source_file: str, presigned_url: ApplyResult, progress: Progress, task_id: TaskID) -> Optional[Exception]: - source_file = Path(source_file) - 
file_uploader = FileUploader(
-        presigned_url,
-        source_file,
-        total_size=None,
-        name=str(source_file),
-    )
-    file_uploader.progress = progress
-    file_uploader.task_id = task_id
-    file_uploader.upload()
-
-
-def _zip_files(live: Live, remote_src: str, local_dst: str) -> None:
-    if len(remote_src.split("/")) < 3:
-        return _error_and_exit(
-            dedent(
-                f"""
-                The source path must be at least two levels deep (e.g. r:/my-project/my-lit-resource).
-
-                The path provided was: r:{remote_src}
-                """
-            )
-        )
-
-    if os.path.isdir(local_dst):
-        local_dst = os.path.join(local_dst, os.path.basename(remote_src) + ".zip")
-
-    project_id, lit_resource = _get_project_id_and_resource(remote_src)
-
-    # /my-project/my-lit-resource/artifact-path -> cloudspace/my-lit-resource-id/artifact-path
-    artifact = "/".join(remote_src.split("/")[3:])
-    prefix = _get_prefix(artifact, lit_resource)
-
-    token = _AuthTokenGetter(LightningClient().api_client)._get_api_token()
-    endpoint = f"/v1/projects/{project_id}/artifacts/download?prefix={prefix}&token={token}"
-
-    cluster = _cluster_from_lit_resource(lit_resource)
-    url = _storage_host(cluster) + endpoint
-
-    live.stop()
-    progress = _get_progress_bar(transient=True)
-    progress.start()
-    task_id = progress.add_task("download zip", total=None)
-
-    _download_file(local_dst, url, progress, task_id)
-    progress.stop()
-
-    click.echo(f"Downloaded to {local_dst}")
-    return None
-
-
-def _download_files(live, client, remote_src: str, local_dst: str, pwd: str):
-    project_id, lit_resource = _get_project_id_and_resource(pwd)
-
-    download_paths = []
-    download_urls = []
-    total_size = []
-
-    prefix = _get_prefix("/".join(pwd.split("/")[3:]), lit_resource) + "/"
-
-    for artifact in _collect_artifacts(client, project_id, prefix, include_download_url=True):
-        path = os.path.join(local_dst, artifact.filename.replace(remote_src, ""))
-        path = Path(path).resolve()
-        os.makedirs(path.parent, exist_ok=True)
-        download_paths.append(path)
-        download_urls.append(artifact.url)
-        total_size.append(int(artifact.size_bytes))
-
-    live.stop()
-
-    if not download_paths:
-        print("There were no files to download.")
-        return
-
-    progress = _get_progress_bar()
-
-    progress.start()
-
-    task_id = progress.add_task("download", filename="", total=sum(total_size))
-
-    _download_file_fn = partial(_download_file, progress=progress, task_id=task_id)
-
-    with concurrent.futures.ThreadPoolExecutor(4) as executor:
-        results = executor.map(_download_file_fn, download_paths, download_urls)
-
-    progress.stop()
-
-    # Raise the first exception found
-    exception = next((e for e in results if isinstance(e, Exception)), None)
-    if exception:
-        _error_and_exit("There was an error downloading your files.")
-
-
-def _download_file(path: str, url: str, progress: Progress, task_id: TaskID) -> None:
-    # Disable warning about making an insecure request
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-    with contextlib.suppress(ConnectionError):
-        request = requests.get(url, stream=True, verify=False)  # noqa: S501
-
-        chunk_size = 1024
-
-        with open(path, "wb") as fp:
-            for chunk in request.iter_content(chunk_size=chunk_size):
-                fp.write(chunk)  # type: ignore
-                progress.update(task_id, advance=len(chunk))
-
-
-def _sanitize_path(path: str, pwd: str) -> Tuple[str, bool]:
-    is_remote = _is_remote(path)
-    if is_remote:
-        path = _remove_remote(path)
-        path = pwd if path == "." else os.path.join(pwd, path)
-    return path, is_remote
-
-
-def _is_remote(path: str) -> bool:
-    return path.startswith("r:") or path.startswith("remote:")
-
-
-def _remove_remote(path: str) -> str:
-    return path.replace("r:", "").replace("remote:", "")
-
-
-def _get_project_id_and_resource(pwd: str) -> Tuple[str, Union[Externalv1LightningappInstance, V1CloudSpace]]:
-    """Convert a root path to a project id and app id."""
-    # TODO: Handle project level
-    project_name, resource_name, *_ = pwd.split("/")[1:3]
-
-    # 1. Collect the projects of the user
-    client = LightningClient()
-    projects = client.projects_service_list_memberships()
-    project_id = [project.project_id for project in projects.memberships if project.name == project_name][0]
-
-    # 2. Collect resources
-    lit_apps = client.lightningapp_instance_service_list_lightningapp_instances(project_id=project_id).lightningapps
-
-    lit_cloud_spaces = client.cloud_space_service_list_cloud_spaces(project_id=project_id).cloudspaces
-
-    lit_resources = [lit_resource for lit_resource in lit_cloud_spaces if lit_resource.name == resource_name]
-
-    if len(lit_resources) == 0:
-        lit_resources = [lit_resource for lit_resource in lit_apps if lit_resource.name == resource_name]
-
-    if len(lit_resources) == 0:
-        print(f"ERROR: There isn't any Lightning Resource matching the name {resource_name}.")
-        sys.exit(0)
-
-    return project_id, lit_resources[0]
-
-
-def _get_project_id_from_name(project_name: str) -> str:
-    # 1. Collect the projects of the user
-    client = LightningClient()
-    projects = client.projects_service_list_memberships()
-    return [project.project_id for project in projects.memberships if project.name == project_name][0]
-
-
-def _get_progress_bar(**kwargs: Any) -> Progress:
-    return Progress(
-        TextColumn("[bold blue]{task.description}", justify="left"),
-        BarColumn(bar_width=None),
-        "[progress.percentage]{task.percentage:>3.1f}%",
-        DownloadColumn(),
-        **kwargs,
-    )
-
-
-def _storage_host(cluster: Externalv1Cluster) -> str:
-    dev_host = os.environ.get("LIGHTNING_STORAGE_HOST")
-    if dev_host:
-        return dev_host
-    return f"https://storage.{cluster.spec.driver.kubernetes.root_domain_name}"
-
-
-def _cluster_from_lit_resource(lit_resource: Union[Externalv1LightningappInstance, V1CloudSpace]) -> Externalv1Cluster:
-    client = LightningClient()
-    if isinstance(lit_resource, Externalv1LightningappInstance):
-        return client.cluster_service_get_cluster(lit_resource.spec.cluster_id)
-
-    clusters = client.cluster_service_list_clusters()
-    for cluster in clusters.clusters:
-        if cluster.id == clusters.default_cluster:
-            return cluster
-    return None
diff --git a/src/lightning/app/cli/commands/logs.py b/src/lightning/app/cli/commands/logs.py
deleted file mode 100644
index 4587987ae5f17..0000000000000
--- a/src/lightning/app/cli/commands/logs.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright The Lightning AI team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from typing import List - -import click -import rich -from rich.color import ANSI_COLOR_NAMES - -from lightning.app.utilities.app_helpers import Logger -from lightning.app.utilities.app_logs import _app_logs_reader -from lightning.app.utilities.cloud import _get_project -from lightning.app.utilities.logs_socket_api import _LightningLogsSocketAPI -from lightning.app.utilities.network import LightningClient - -logger = Logger(__name__) - - -@click.argument("app_name", required=False) -@click.argument("components", nargs=-1, required=False) -@click.option("-f", "--follow", required=False, is_flag=True, help="Wait for new logs, to exit use CTRL+C.") -def logs(app_name: str, components: List[str], follow: bool) -> None: - """Show cloud application logs. By default, prints logs for all currently available components. - - Example uses: - - Print all application logs: - - $ lightning show logs my-application - - Print logs only from the flow (no work): - - $ lightning show logs my-application flow - - Print logs only from selected works: - - $ lightning show logs my-application root.work_a root.work_b - - """ - _show_logs(app_name, components, follow) - - -def _show_logs(app_name: str, components: List[str], follow: bool) -> None: - client = LightningClient(retry=False) - project = _get_project(client) - - apps = { - getattr(app, "display_name", None) or app.name: app - for app in client.lightningapp_instance_service_list_lightningapp_instances( - project_id=project.project_id - ).lightningapps - } - - if not apps: - raise click.ClickException( - "You don't have any application in the cloud. Please, run an application first with `--cloud`." - ) - - if not app_name: - raise click.ClickException( - f"You have not specified any Lightning App. Please select one of the following: [{', '.join(apps.keys())}]." - ) - - if app_name not in apps: - raise click.ClickException( - f"The Lightning App '{app_name}' does not exist. " - f"Please select one of the following: [{', '.join(apps.keys())}]." - ) - - # Fetch all lightning works from given application - # 'Flow' component is somewhat implicit, only one for whole app, - # and not listed in lightningwork API - so we add it directly to the list - works = client.lightningwork_service_list_lightningwork( - project_id=project.project_id, app_id=apps[app_name].id - ).lightningworks - - app_component_names = ["flow"] + [f.name for f in apps[app_name].spec.flow_servers] + [w.name for w in works] - - if not components: - components = app_component_names - - else: - - def add_prefix(c: str) -> str: - if c == "flow": - return c - if not c.startswith("root."): - return "root." 
+ c - return c - - components = [add_prefix(c) for c in components] - - for component in components: - if component not in app_component_names: - raise click.ClickException(f"Component '{component}' does not exist in app {app_name}.") - - log_reader = _app_logs_reader( - logs_api_client=_LightningLogsSocketAPI(client.api_client), - project_id=project.project_id, - app_id=apps[app_name].id, - component_names=components, - follow=follow, - ) - - rich_colors = list(ANSI_COLOR_NAMES) - colors = {c: rich_colors[i + 1] for i, c in enumerate(components)} - - for log_event in log_reader: - date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") - color = colors[log_event.component_name] - rich.print(f"[{color}]{log_event.component_name}[/{color}] {date} {log_event.message}") diff --git a/src/lightning/app/cli/commands/ls.py b/src/lightning/app/cli/commands/ls.py deleted file mode 100644 index e16a354f66d8c..0000000000000 --- a/src/lightning/app/cli/commands/ls.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import os -import sys -from contextlib import nullcontext -from typing import Generator, List, Optional - -import click -import lightning_cloud -import rich -from lightning_cloud.openapi import Externalv1LightningappInstance -from rich.console import Console -from rich.live import Live -from rich.spinner import Spinner -from rich.text import Text - -from lightning.app.cli.connect.app import _LIGHTNING_CONNECTION_FOLDER -from lightning.app.utilities.app_helpers import Logger -from lightning.app.utilities.cli_helpers import _error_and_exit -from lightning.app.utilities.network import LightningClient - -_FOLDER_COLOR = "sky_blue1" -_FILE_COLOR = "white" - -logger = Logger(__name__) - - -@click.argument("path", required=False) -def ls(path: Optional[str] = None, print: bool = True, use_live: bool = True) -> List[str]: - """List the contents of a folder in the Lightning Cloud Filesystem.""" - from lightning.app.cli.commands.cd import _CD_FILE - - if sys.platform == "win32": - _error_and_exit("`ls` isn't supported on windows. 
Open an issue on GitHub.")
-
-    root = "/"
-
-    context = (
-        Live(Spinner("point", text=Text("pending...", style="white")), transient=True) if use_live else nullcontext()
-    )
-
-    with context:
-        if not os.path.exists(_LIGHTNING_CONNECTION_FOLDER):
-            os.makedirs(_LIGHTNING_CONNECTION_FOLDER)
-
-        if not os.path.exists(_CD_FILE):
-            with open(_CD_FILE, "w") as f:
-                f.write(root + "\n")
-        else:
-            with open(_CD_FILE) as f:
-                lines = f.readlines()
-                root = lines[0].replace("\n", "")
-
-        client = LightningClient(retry=False)
-        projects = client.projects_service_list_memberships()
-
-        if root == "/":
-            project_names = [project.name for project in projects.memberships]
-            if print:
-                _print_names_with_colors(project_names, [_FOLDER_COLOR] * len(project_names))
-            return project_names
-
-        # Note: Root format has the following structure:
-        # /{PROJECT_NAME}/{APP_NAME}/{ARTIFACTS_PATHS}
-        splits = root.split("/")[1:]
-
-        project = [project for project in projects.memberships if project.name == splits[0]]
-
-        # This happens if the user changes cluster and the project doesn't exist.
-        if len(project) == 0:
-            return _error_and_exit(
-                f"There isn't any Lightning Project matching the name {splits[0]}." " HINT: Use `lightning_app cd`."
-            )
-
-        project_id = project[0].project_id
-
-        # Parallelise calls
-        lit_apps = client.lightningapp_instance_service_list_lightningapp_instances(
-            project_id=project_id, async_req=True
-        )
-        lit_cloud_spaces = client.cloud_space_service_list_cloud_spaces(project_id=project_id, async_req=True)
-
-        lit_apps = lit_apps.get().lightningapps
-        lit_cloud_spaces = lit_cloud_spaces.get().cloudspaces
-
-        if len(splits) == 1:
-            apps = [lit_app.name for lit_app in lit_apps]
-            cloud_spaces = [lit_cloud_space.name for lit_cloud_space in lit_cloud_spaces]
-            resource_names = sorted(set(cloud_spaces + apps))
-            if print:
-                _print_names_with_colors(resource_names, [_FOLDER_COLOR] * len(resource_names))
-            return resource_names
-
-        lit_resources = [lit_resource for lit_resource in lit_cloud_spaces if lit_resource.name == splits[1]]
-
-        if len(lit_resources) == 0:
-            lit_resources = [lit_resource for lit_resource in lit_apps if lit_resource.name == splits[1]]
-
-            if len(lit_resources) == 0:
-                _error_and_exit(f"There isn't any Lightning Resource matching the name {splits[1]}.")
-
-        lit_resource = lit_resources[0]
-
-        app_paths = []
-        app_colors = []
-
-        cloud_spaces_paths = []
-        cloud_spaces_colors = []
-
-        depth = len(splits)
-
-        prefix = "/".join(splits[2:])
-        prefix = _get_prefix(prefix, lit_resource)
-
-        for artifact in _collect_artifacts(client=client, project_id=project_id, prefix=prefix):
-            if str(artifact.filename).startswith("/"):
-                artifact.filename = artifact.filename[1:]
-
-            path = os.path.join(project_id, prefix[1:], artifact.filename)
-
-            artifact_splits = path.split("/")
-
-            if len(artifact_splits) <= depth + 1:
-                continue
-
-            path = artifact_splits[depth + 1]
-
-            paths = app_paths if isinstance(lit_resource, Externalv1LightningappInstance) else cloud_spaces_paths
-            colors = app_colors if isinstance(lit_resource, Externalv1LightningappInstance) else cloud_spaces_colors
-
-            if path not in paths:
-                paths.append(path)
-
-                # display files otherwise folders
-                colors.append(_FILE_COLOR if len(artifact_splits) == depth + 1 else _FOLDER_COLOR)
-
-        if print:
-            if app_paths and cloud_spaces_paths:
-                if app_paths:
-                    rich.print("Lightning App")
-                    _print_names_with_colors(app_paths, app_colors)
-
-                if cloud_spaces_paths:
-                    rich.print("Lightning CloudSpaces")
-
_print_names_with_colors(cloud_spaces_paths, cloud_spaces_colors) - else: - _print_names_with_colors(app_paths + cloud_spaces_paths, app_colors + cloud_spaces_colors) - - return app_paths + cloud_spaces_paths - - -def _add_colors(filename: str, color: Optional[str] = None) -> str: - return f"[{color}]{filename}[/{color}]" - - -def _print_names_with_colors(names: List[str], colors: List[str], padding: int = 5) -> None: - console = Console() - width = console.width - - max_L = max([len(name) for name in names] + [0]) + padding - - use_spacing = False - - if max_L * len(names) < width: - use_spacing = True - - num_cols = width // max_L - - columns = {} - for index, (name, color) in enumerate(zip(names, colors)): - row = index // num_cols - if row not in columns: - columns[row] = [] - columns[row].append((name, color)) - - for row_index in sorted(columns): - row = "" - for name, color in columns[row_index]: - spacing = padding if use_spacing else max_L - len(name) - spaces = " " * spacing - row += _add_colors(name, color) + spaces - rich.print(row) - - -def _collect_artifacts( - client: LightningClient, - project_id: str, - prefix: str = "", - page_token: Optional[str] = "", - cluster_id: Optional[str] = None, - page_size: int = 100_000, - tokens=None, - include_download_url: bool = False, -) -> Generator: - if tokens is None: - tokens = [] - - if cluster_id is None: - clusters = client.projects_service_list_project_cluster_bindings(project_id) - for cluster in clusters.clusters: - yield from _collect_artifacts( - client, - project_id, - prefix=prefix, - cluster_id=cluster.cluster_id, - page_token=page_token, - tokens=tokens, - page_size=page_size, - include_download_url=include_download_url, - ) - else: - if page_token in tokens: - return - - # Note: This is triggered when the request is wrong. - # This is currently happening due to looping through the user clusters. - with contextlib.suppress(lightning_cloud.openapi.rest.ApiException): - response = client.lightningapp_instance_service_list_project_artifacts( - project_id, - prefix=prefix, - cluster_id=cluster_id, - page_token=page_token, - include_download_url=include_download_url, - page_size=str(page_size), - ) - for artifact in response.artifacts: - if ".lightning-app-sync" in artifact.filename: - continue - yield artifact - - if response.next_page_token: - tokens.append(page_token) - yield from _collect_artifacts( - client, - project_id, - prefix=prefix, - cluster_id=cluster_id, - page_token=response.next_page_token, - tokens=tokens, - ) - - -def _add_resource_prefix(prefix: str, resource_path: str): - if resource_path in prefix: - return prefix - prefix = os.path.join(resource_path, prefix) - if not prefix.startswith("/"): - prefix = "/" + prefix - return prefix - - -def _get_prefix(prefix: str, lit_resource) -> str: - if isinstance(lit_resource, Externalv1LightningappInstance): - return _add_resource_prefix(prefix, f"lightningapps/{lit_resource.id}") - - return _add_resource_prefix(prefix, f"cloudspaces/{lit_resource.id}") diff --git a/src/lightning/app/cli/commands/pwd.py b/src/lightning/app/cli/commands/pwd.py deleted file mode 100644 index 7768309e4e6bb..0000000000000 --- a/src/lightning/app/cli/commands/pwd.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys - -from rich.live import Live -from rich.spinner import Spinner -from rich.text import Text - -from lightning.app.cli.commands.cd import _CD_FILE -from lightning.app.utilities.app_helpers import Logger - -logger = Logger(__name__) - - -def pwd() -> str: - """Print your current working directory in the Lightning Cloud filesystem.""" - if sys.platform == "win32": - print("`pwd` isn't supported on windows. Open an issue on Github.") - sys.exit(0) - - with Live(Spinner("point", text=Text("pending...", style="white")), transient=True): - root = _pwd() - - print(root) - - return root - - -def _pwd() -> str: - root = "/" - - if not os.path.exists(_CD_FILE): - with open(_CD_FILE, "w") as f: - f.write(root + "\n") - else: - with open(_CD_FILE) as f: - lines = f.readlines() - root = lines[0].replace("\n", "") - - return root diff --git a/src/lightning/app/cli/commands/rm.py b/src/lightning/app/cli/commands/rm.py deleted file mode 100644 index 587cc50469131..0000000000000 --- a/src/lightning/app/cli/commands/rm.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import contextlib -import os - -import click -import lightning_cloud -import rich - -from lightning.app.cli.commands.ls import _add_colors, _get_prefix -from lightning.app.cli.commands.pwd import _pwd -from lightning.app.utilities.app_helpers import Logger -from lightning.app.utilities.cli_helpers import _error_and_exit -from lightning.app.utilities.network import LightningClient - -logger = Logger(__name__) - - -@click.argument("rm_path", required=True) -@click.option("-r", required=False, hidden=True) -@click.option("--recursive", required=False, hidden=True) -def rm(rm_path: str, r: bool = False, recursive: bool = False) -> None: - """Delete files on the Lightning Cloud filesystem.""" - root = _pwd() - - if rm_path in (".", ".."): - return _error_and_exit('rm "." and ".." may not be removed') - - if ".." in rm_path: - return _error_and_exit('rm ".." or higher may not be removed') - - root = os.path.join(root, rm_path) - splits = [split for split in root.split("/") if split != ""] - - if root == "/" or len(splits) == 1: - return _error_and_exit("rm at the project level isn't supported") - - client = LightningClient(retry=False) - projects = client.projects_service_list_memberships() - - project = [project for project in projects.memberships if project.name == splits[0]] - - # This happens if the user changes cluster and the project doesn't exist. 
-    if len(project) == 0:
-        return _error_and_exit(
-            f"There isn't any Lightning Project matching the name {splits[0]}." " HINT: Use `lightning cd`."
-        )
-
-    project_id = project[0].project_id
-
-    # Parallelise calls
-    lit_apps = client.lightningapp_instance_service_list_lightningapp_instances(project_id=project_id, async_req=True)
-    lit_cloud_spaces = client.cloud_space_service_list_cloud_spaces(project_id=project_id, async_req=True)
-
-    lit_apps = lit_apps.get().lightningapps
-    lit_cloud_spaces = lit_cloud_spaces.get().cloudspaces
-
-    lit_resources = [lit_resource for lit_resource in lit_cloud_spaces if lit_resource.name == splits[1]]
-
-    if len(lit_resources) == 0:
-        lit_resources = [lit_resource for lit_resource in lit_apps if lit_resource.name == splits[1]]
-
-        if len(lit_resources) == 0:
-            _error_and_exit(f"There isn't any Lightning Resource matching the name {splits[1]}.")
-
-    lit_resource = lit_resources[0]
-
-    prefix = "/".join(splits[2:])
-    prefix = _get_prefix(prefix, lit_resource)
-
-    clusters = client.projects_service_list_project_cluster_bindings(project_id)
-    succeeded = False
-
-    for cluster in clusters.clusters:
-        with contextlib.suppress(lightning_cloud.openapi.rest.ApiException):
-            client.lightningapp_instance_service_delete_project_artifact(
-                project_id=project_id,
-                cluster_id=cluster.cluster_id,
-                filename=prefix,
-            )
-            succeeded = True
-            break
-
-    prefix = os.path.join(*splits)
-
-    if succeeded:
-        rich.print(_add_colors(f"Successfully deleted `{prefix}`.", color="green"))
-        return None
-
-    return _error_and_exit(f"No file or folder named `{prefix}` was found.")
diff --git a/src/lightning/app/cli/component-template/.github/workflows/ci-testing.yml b/src/lightning/app/cli/component-template/.github/workflows/ci-testing.yml
deleted file mode 100644
index 16abb8f418b89..0000000000000
--- a/src/lightning/app/cli/component-template/.github/workflows/ci-testing.yml
+++ /dev/null
@@ -1,79 +0,0 @@
-name: CI testing
-
-# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
-on:
-  # Trigger the workflow on push or pull request, but only for the main branch
-  push:
-    branches: [main]
-  pull_request:
-    branches: [main]
-
-jobs:
-  pytest:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-20.04, macOS-11, windows-2019]
-        python-version: [3.8]
-
-    # Timeout: https://stackoverflow.com/a/59076067/4521646
-    timeout-minutes: 35
-
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-
-      # GitHub Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646
-      - name: Setup macOS
-        if: runner.os == 'macOS'
-        run: |
-          brew install libomp  # https://github.com/pytorch/pytorch/issues/20030
-
-      - name: Get pip cache dir
-        id: pip-cache
-        run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
-
-      - name: Cache pip
-        uses: actions/cache@v2
-        with:
-          path: ${{ steps.pip-cache.outputs.dir }}
-          key: ${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-py${{ matrix.python-version }}-
-
-      - name: Clone Template React UI Repo
-        uses: actions/checkout@v3
-        with:
-          repository: Lightning-AI/lightning
-          token: ${{ secrets.PAT_GHOST }}
-          ref: "master"
-          path: lightning
-
-      - name: Install Lightning
-        run: |
-          cd lightning
-          pip install -r requirements.txt
-          pip install -e .
- shell: bash - - - name: Install dependencies - run: | - python --version - pip --version - pip install --requirement requirements.txt --upgrade --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html - pip install --requirement tests/requirements.txt --quiet - pip list - shell: bash - - - name: Tests - run: | - coverage run --source placeholdername -m py.test placeholdername tests -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml - - - name: Statistics - if: success() - run: | - coverage report diff --git a/src/lightning/app/cli/component-template/.gitignore b/src/lightning/app/cli/component-template/.gitignore deleted file mode 100644 index 70ba25888435f..0000000000000 --- a/src/lightning/app/cli/component-template/.gitignore +++ /dev/null @@ -1,157 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -*install-app* - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg - -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Sphinx documentation -docs/_build/ -docs/source/api/ -docs/source/*.md - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.local_env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# PyCharm -.idea/ - -# Lightning logs -lightning_logs -*.gz -.DS_Store -.*_submit.py -.vscode - -MNIST -*.pt -.storage/ -.shared/ -infra -data -coverage.* -# Frontend build artifacts -*lightning/app/ui* -gradio_cached_examples -/docs/source/api_reference/generated/* -examples/my_own_leaderboard/submissions/* -docs/source/api_reference/generated/* -*.ckpt -redis-stable -node_modules -*.rdb -*.webm -*hars -examples/quick_start/* -examples/quick_start -examples/template_react_ui/* -examples/template_react_ui -# Ignore external components -lightning/app/components/* -!lightning/app/components/python -!lightning/app/components/serve -!lightning/app/components/__init__.py -!lightning/app/components/README.md -train_script.py -*return_values* -scratch -storage diff --git a/src/lightning/app/cli/component-template/LICENSE b/src/lightning/app/cli/component-template/LICENSE deleted file mode 100644 index 261eeb9e9f8b2..0000000000000 --- a/src/lightning/app/cli/component-template/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
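For readers skimming this deletion, the component template removed in the files below boils down to one pattern: a `LightningWork` subclass that holds its own state, orchestrated by a `LightningFlow`. A condensed, self-contained sketch of that pattern (collapsing the template's `app.py` and `component.py` into one file; `placeholdername` is the template's own stand-in name):

```python
from lightning.app import LightningApp, LightningFlow, LightningWork


class TemplateComponent(LightningWork):
    def __init__(self) -> None:
        super().__init__()
        self.value = 0  # state that persists across run() calls

    def run(self):
        self.value += 1  # the work mutates its own state
        print("this is running inside a work")


class LitApp(LightningFlow):
    def __init__(self) -> None:
        super().__init__()
        self.placeholdername = TemplateComponent()

    def run(self):
        self.placeholdername.run()  # the flow orchestrates the work


app = LightningApp(LitApp())
```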
diff --git a/src/lightning/app/cli/component-template/README.md b/src/lightning/app/cli/component-template/README.md deleted file mode 100644 index 1d700e286461b..0000000000000 --- a/src/lightning/app/cli/component-template/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# placeholdername component - -This ⚡ [Lightning component](https://lightning.ai/) ⚡ was generated automatically with: - -```bash -lightning_app init component placeholdername -``` - -## To run placeholdername - -First, install placeholdername (warning: this component has not been officially approved on the lightning gallery): - -```bash -lightning_app install component https://github.com/theUser/placeholdername -``` - -Once the app is installed, use it in an app: - -```python -from placeholdername import TemplateComponent -import lightning as L - - -class LitApp(L.LightningFlow): - def __init__(self) -> None: - super().__init__() - self.placeholdername = TemplateComponent() - - def run(self): - print("this is a simple Lightning app to verify your component is working as expected") - self.placeholdername.run() - - -app = L.LightningApp(LitApp()) -``` diff --git a/src/lightning/app/cli/component-template/app.py b/src/lightning/app/cli/component-template/app.py deleted file mode 100644 index 0a10532204043..0000000000000 --- a/src/lightning/app/cli/component-template/app.py +++ /dev/null @@ -1,15 +0,0 @@ -from lightning.app import LightningApp, LightningFlow -from placeholdername import TemplateComponent - - -class LitApp(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.placeholdername = TemplateComponent() - - def run(self): - print("this is a simple Lightning app to verify your component is working as expected") - self.placeholdername.run() - - -app = LightningApp(LitApp()) diff --git a/src/lightning/app/cli/component-template/placeholdername/__init__.py b/src/lightning/app/cli/component-template/placeholdername/__init__.py deleted file mode 100644 index 92b4ef47d8062..0000000000000 --- a/src/lightning/app/cli/component-template/placeholdername/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from placeholdername.component import TemplateComponent - -__all__ = ["TemplateComponent"] diff --git a/src/lightning/app/cli/component-template/placeholdername/component.py b/src/lightning/app/cli/component-template/placeholdername/component.py deleted file mode 100644 index 251a4e10c6a9f..0000000000000 --- a/src/lightning/app/cli/component-template/placeholdername/component.py +++ /dev/null @@ -1,12 +0,0 @@ -from lightning.app import LightningWork - - -class TemplateComponent(LightningWork): - def __init__(self) -> None: - super().__init__() - self.value = 0 - - def run(self): - self.value += 1 - print("welcome to your work component") - print("this is running inside a work") diff --git a/src/lightning/app/cli/component-template/requirements.txt b/src/lightning/app/cli/component-template/requirements.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/component-template/setup.py b/src/lightning/app/cli/component-template/setup.py deleted file mode 100644 index 78631901190b2..0000000000000 --- a/src/lightning/app/cli/component-template/setup.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python - -from setuptools import find_packages, setup - -setup( - name="placeholdername", - version="0.0.0", - description="⚡ Lightning component ⚡ generated with command: lightning_app init component", - author="", - author_email="", - # REPLACE WITH YOUR OWN GITHUB PROJECT LINK - 
url="https://github.com/Lightning-AI/lightning-component-template", - install_requires=[], - packages=find_packages(), -) diff --git a/src/lightning/app/cli/component-template/tests/README.md b/src/lightning/app/cli/component-template/tests/README.md deleted file mode 100644 index bef681691185d..0000000000000 --- a/src/lightning/app/cli/component-template/tests/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Run tests - -To run the tests: - -```bash -# go to your component folder -cd placeholdername - -# go to tests folder -cd tests - -# install testing deps -pip install -r requirements.txt - -# run tests -pytest . -``` diff --git a/src/lightning/app/cli/component-template/tests/__init__.py b/src/lightning/app/cli/component-template/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/component-template/tests/requirements.txt b/src/lightning/app/cli/component-template/tests/requirements.txt deleted file mode 100644 index 3185d1c44f033..0000000000000 --- a/src/lightning/app/cli/component-template/tests/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -coverage -codecov>=2.1 -pytest>=5.0.0 -pytest-cov -pytest-flake8 -flake8 -check-manifest -twine==4.0.1 diff --git a/src/lightning/app/cli/component-template/tests/test_placeholdername_component.py b/src/lightning/app/cli/component-template/tests/test_placeholdername_component.py deleted file mode 100644 index 6b9c28845749c..0000000000000 --- a/src/lightning/app/cli/component-template/tests/test_placeholdername_component.py +++ /dev/null @@ -1,14 +0,0 @@ -r"""To test a lightning component: - -1. Init the component. -2. call .run() - -""" - -from placeholdername.component import TemplateComponent - - -def test_placeholder_component(): - messenger = TemplateComponent() - messenger.run() - assert messenger.value == 1 diff --git a/src/lightning/app/cli/connect/__init__.py b/src/lightning/app/cli/connect/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/connect/app.py b/src/lightning/app/cli/connect/app.py deleted file mode 100644 index e3f0d0b151bb5..0000000000000 --- a/src/lightning/app/cli/connect/app.py +++ /dev/null @@ -1,387 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import json
-import os
-import shutil
-import sys
-from subprocess import Popen
-from typing import List, Optional, Tuple
-
-import click
-import psutil
-from lightning_utilities.core.imports import package_available
-from rich.progress import Progress
-
-from lightning.app.utilities.cli_helpers import _get_app_display_name, _LightningAppOpenAPIRetriever
-from lightning.app.utilities.cloud import _get_project
-from lightning.app.utilities.enum import OpenAPITags
-from lightning.app.utilities.log import get_logfile
-from lightning.app.utilities.network import LightningClient
-
-_HOME = os.path.expanduser("~")
-_PPID = os.getenv("LIGHTNING_CONNECT_PPID", str(psutil.Process(os.getpid()).ppid()))
-_LIGHTNING_CONNECTION = os.path.join(_HOME, ".lightning", "lightning_connection")
-_LIGHTNING_CONNECTION_FOLDER = os.path.join(_LIGHTNING_CONNECTION, _PPID)
-
-
-@click.argument("app_name_or_id", required=True)
-def connect_app(app_name_or_id: str):
-    """Connect your local terminal to a running lightning app.
-
-    After connecting, the lightning CLI will respond to commands exposed by the app.
-
-    Example:
-
-    \b
-    # connect to an app named pizza-cooker-123
-    lightning connect pizza-cooker-123
-    \b
-    # this will now show the commands exposed by pizza-cooker-123
-    lightning --help
-    \b
-    # while connected, you can run the cook-pizza command exposed
-    # by pizza-cooker-123
-    lightning cook-pizza --flavor pineapple
-    \b
-    # once done, disconnect and go back to the standard lightning CLI commands
-    lightning disconnect
-
-    """
-    from lightning.app.utilities.commands.base import _download_command
-
-    _clean_lightning_connection()
-
-    if not os.path.exists(_LIGHTNING_CONNECTION_FOLDER):
-        os.makedirs(_LIGHTNING_CONNECTION_FOLDER)
-
-    connected_file = os.path.join(_LIGHTNING_CONNECTION_FOLDER, "connect.txt")
-
-    matched_connection_path = _scan_lightning_connections(app_name_or_id)
-
-    if os.path.exists(connected_file):
-        with open(connected_file) as f:
-            result = f.readlines()[0].replace("\n", "")
-
-        if result == app_name_or_id:
-            if app_name_or_id == "localhost":
-                click.echo("You are connected to the local Lightning App.")
-            else:
-                click.echo(f"You are already connected to the cloud Lightning App: {app_name_or_id}.")
-        else:
-            disconnect_app()
-            connect_app(app_name_or_id)
-
-    elif app_name_or_id.startswith("localhost"):
-        with Progress() as progress_bar:
-            connecting = progress_bar.add_task("[magenta]Setting things up for you...", total=1.0)
-
-            if app_name_or_id != "localhost":
-                raise Exception("You need to pass localhost to connect to the local Lightning App.")
-
-            retriever = _LightningAppOpenAPIRetriever(None)
-
-            if retriever.api_commands is None:
-                raise Exception(f"Connection wasn't successful. Is your app {app_name_or_id} running?")
-
-            increment = 1 / (1 + len(retriever.api_commands))
-
-            progress_bar.update(connecting, advance=increment)
-
-            commands_folder = os.path.join(_LIGHTNING_CONNECTION_FOLDER, "commands")
-            if not os.path.exists(commands_folder):
-                os.makedirs(commands_folder)
-
-            _write_commands_metadata(retriever.api_commands)
-
-            with open(os.path.join(commands_folder, "openapi.json"), "w") as f:
-                json.dump(retriever.openapi, f)
-
-            _install_missing_requirements(retriever)
-
-            for command_name, metadata in retriever.api_commands.items():
-                if "cls_path" in metadata:
-                    target_file = os.path.join(commands_folder, f"{command_name.replace(' ', '_')}.py")
-                    _download_command(
-                        command_name,
-                        metadata["cls_path"],
-                        metadata["cls_name"],
-                        None,
-                        target_file=target_file,
-                    )
-                else:
-                    with open(os.path.join(commands_folder, f"{command_name}.txt"), "w") as f:
-                        f.write(command_name)
-
-                progress_bar.update(connecting, advance=increment)
-
-        with open(connected_file, "w") as f:
-            f.write(app_name_or_id + "\n")
-
-        click.echo("The lightning App CLI now responds to app commands. Use 'lightning_app --help' to see them.")
-        click.echo(" ")
-
-        Popen(
-            f"LIGHTNING_CONNECT_PPID={_PPID} {sys.executable} -m lightning_app --help",
-            shell=True,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        ).wait()
-
-    elif matched_connection_path:
-        matched_connected_file = os.path.join(matched_connection_path, "connect.txt")
-        matched_commands = os.path.join(matched_connection_path, "commands")
-        if os.path.isdir(matched_commands):
-            commands = os.path.join(_LIGHTNING_CONNECTION_FOLDER, "commands")
-            shutil.copytree(matched_commands, commands)
-            shutil.copy(matched_connected_file, connected_file)
-
-        click.echo("The lightning App CLI now responds to app commands. Use 'lightning_app --help' to see them.")
-        click.echo(" ")
-
-        Popen(
-            f"LIGHTNING_CONNECT_PPID={_PPID} {sys.executable} -m lightning_app --help",
-            shell=True,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        ).wait()
-
-    else:
-        with Progress() as progress_bar:
-            connecting = progress_bar.add_task("[magenta]Setting things up for you...", total=1.0)
-
-            retriever = _LightningAppOpenAPIRetriever(app_name_or_id)
-
-            if not retriever.api_commands:
-                client = LightningClient(retry=False)
-                project = _get_project(client)
-                apps = client.lightningapp_instance_service_list_lightningapp_instances(project_id=project.project_id)
-                click.echo(
-                    "We didn't find a matching App. Here are the available Apps that you can "
-                    f"connect to {[_get_app_display_name(app) for app in apps.lightningapps]}."
- ) - return - - increment = 1 / (1 + len(retriever.api_commands)) - - progress_bar.update(connecting, advance=increment) - - _install_missing_requirements(retriever) - - commands_folder = os.path.join(_LIGHTNING_CONNECTION_FOLDER, "commands") - if not os.path.exists(commands_folder): - os.makedirs(commands_folder) - - _write_commands_metadata(retriever.api_commands) - - for command_name, metadata in retriever.api_commands.items(): - if "cls_path" in metadata: - target_file = os.path.join(commands_folder, f"{command_name}.py") - _download_command( - command_name, - metadata["cls_path"], - metadata["cls_name"], - retriever.app_id, - target_file=target_file, - ) - else: - with open(os.path.join(commands_folder, f"{command_name}.txt"), "w") as f: - f.write(command_name) - - progress_bar.update(connecting, advance=increment) - - with open(connected_file, "w") as f: - f.write(retriever.app_name + "\n") - f.write(retriever.app_id + "\n") - - click.echo("The lightning App CLI now responds to app commands. Use 'lightning_app --help' to see them.") - click.echo(" ") - - Popen( - f"LIGHTNING_CONNECT_PPID={_PPID} {sys.executable} -m lightning_app --help", - shell=True, - stdout=sys.stdout, - stderr=sys.stderr, - ).wait() - - -def disconnect_app(logout: bool = False): - """Disconnect from an App.""" - _clean_lightning_connection() - - connected_file = os.path.join(_LIGHTNING_CONNECTION_FOLDER, "connect.txt") - if os.path.exists(connected_file): - with open(connected_file) as f: - result = f.readlines()[0].replace("\n", "") - - os.remove(connected_file) - commands_folder = os.path.join(_LIGHTNING_CONNECTION_FOLDER, "commands") - if os.path.exists(commands_folder): - shutil.rmtree(commands_folder) - - if result == "localhost": - click.echo("You are disconnected from the local Lightning App.") - else: - click.echo(f"You are disconnected from the cloud Lightning App: {result}.") - else: - if not logout: - click.echo( - "You aren't connected to any Lightning App. " - "Please use `lightning_app connect app_name_or_id` to connect to one." 
- ) - - -def _read_connected_file(connected_file): - if os.path.exists(connected_file): - with open(connected_file) as f: - lines = [line.replace("\n", "") for line in f.readlines()] - if len(lines) == 2: - return lines[0], lines[1] - return lines[0], None - return None, None - - -def _retrieve_connection_to_an_app() -> Tuple[Optional[str], Optional[str]]: - connected_file = os.path.join(_LIGHTNING_CONNECTION_FOLDER, "connect.txt") - return _read_connected_file(connected_file) - - -def _get_commands_folder() -> str: - return os.path.join(_LIGHTNING_CONNECTION_FOLDER, "commands") - - -def _write_commands_metadata(api_commands): - metadata = dict(api_commands.items()) - metadata_path = os.path.join(_get_commands_folder(), ".meta.json") - with open(metadata_path, "w") as f: - json.dump(metadata, f) - - -def _get_commands_metadata(): - metadata_path = os.path.join(_get_commands_folder(), ".meta.json") - with open(metadata_path) as f: - return json.load(f) - - -def _resolve_command_path(command: str) -> str: - return os.path.join(_get_commands_folder(), f"{command}.py") - - -def _list_app_commands(echo: bool = True) -> List[str]: - metadata = _get_commands_metadata() - metadata = {key.replace("_", " "): value for key, value in metadata.items()} - - command_names = sorted(metadata.keys()) - if not command_names: - click.echo("The current Lightning App doesn't have commands.") - return [] - - app_info = metadata[command_names[0]].get("app_info", None) - - title, description, on_connect_end = "Lightning", None, None - if app_info: - title = app_info.get("title") - description = app_info.get("description") - on_connect_end = app_info.get("on_connect_end") - - if echo: - click.echo(f"{title} App") - if description: - click.echo("") - click.echo("Description:") - if description.endswith("\n"): - description = description[:-2] - click.echo(f" {description}") - click.echo("") - click.echo("Commands:") - max_length = max(len(n) for n in command_names) - for command_name in command_names: - padding = (max_length + 1 - len(command_name)) * " " - click.echo(f" {command_name}{padding}{metadata[command_name].get('description', '')}") - if "LIGHTNING_CONNECT_PPID" in os.environ and on_connect_end: - if on_connect_end.endswith("\n"): - on_connect_end = on_connect_end[:-2] - click.echo(on_connect_end) - return command_names - - -def _install_missing_requirements( - retriever: _LightningAppOpenAPIRetriever, - fail_if_missing: bool = False, -): - requirements = set() - for metadata in retriever.api_commands.values(): - if metadata["tag"] == OpenAPITags.APP_CLIENT_COMMAND: - for req in metadata.get("requirements", []) or []: - requirements.add(req) - - if requirements: - missing_requirements = [] - for req in requirements: - if not (package_available(req) or package_available(req.replace("-", "_"))): - missing_requirements.append(req) - - if missing_requirements: - if fail_if_missing: - missing_requirements = " ".join(missing_requirements) - print(f"The command failed as you are missing the following requirements: `{missing_requirements}`.") - sys.exit(0) - - for req in missing_requirements: - std_out_out = get_logfile("output.log") - with open(std_out_out, "wb") as stdout: - Popen( - f"{sys.executable} -m pip install {req}", - shell=True, - stdout=stdout, - stderr=stdout, - ).wait() - os.remove(std_out_out) - - -def _clean_lightning_connection(): - if not os.path.exists(_LIGHTNING_CONNECTION): - return - - for ppid in os.listdir(_LIGHTNING_CONNECTION): - try: - psutil.Process(int(ppid)) - except 
(psutil.NoSuchProcess, ValueError): - connection = os.path.join(_LIGHTNING_CONNECTION, str(ppid)) - if os.path.exists(connection): - shutil.rmtree(connection) - - -def _scan_lightning_connections(app_name_or_id): - if not os.path.exists(_LIGHTNING_CONNECTION): - return None - - for ppid in os.listdir(_LIGHTNING_CONNECTION): - try: - psutil.Process(int(ppid)) - except (psutil.NoSuchProcess, ValueError): - continue - - connection_path = os.path.join(_LIGHTNING_CONNECTION, str(ppid)) - - connected_file = os.path.join(connection_path, "connect.txt") - curr_app_name, curr_app_id = _read_connected_file(connected_file) - - if not curr_app_name: - continue - - if app_name_or_id in (curr_app_name, curr_app_id): - return connection_path - - return None diff --git a/src/lightning/app/cli/connect/data.py b/src/lightning/app/cli/connect/data.py deleted file mode 100644 index 6069432ddb187..0000000000000 --- a/src/lightning/app/cli/connect/data.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright The Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import ast -import sys - -import click -import lightning_cloud -import rich -from rich.live import Live -from rich.spinner import Spinner -from rich.text import Text - -from lightning.app.utilities.app_helpers import Logger -from lightning.app.utilities.cli_helpers import _error_and_exit -from lightning.app.utilities.cloud import _get_project -from lightning.app.utilities.network import LightningClient - -logger = Logger(__name__) - - -@click.argument("name", required=True) -@click.option("--region", help="The AWS region of your bucket. Example: `us-west-1`.", required=True) -@click.option( - "--source", help="The URL path to your AWS S3 folder. Example: `s3://pl-flash-data/images/`.", required=True -) -@click.option( - "--secret_arn_name", - help="The name of role stored as a secret on Lightning AI to access your data. " - "Learn more with https://gist.github.com/tchaton/12ad4b788012e83c0eb35e6223ae09fc. " - "Example: `my_role`.", - required=False, -) -@click.option( - "--destination", help="Where your data should appear in the cloud. Currently not supported.", required=False -) -@click.option("--project_name", help="The project name on which to create the data connection.", required=False) -def connect_data( - name: str, - region: str, - source: str, - secret_arn_name: str = "", - destination: str = "", - project_name: str = "", -) -> None: - """Create a new data connection.""" - - from lightning_cloud.openapi import Create, V1AwsDataConnection - - if sys.platform == "win32": - _error_and_exit("Data connection isn't supported on windows. 
Open an issue on Github.") - - with Live(Spinner("point", text=Text("pending...", style="white")), transient=True) as live: - live.stop() - - client = LightningClient(retry=False) - projects = client.projects_service_list_memberships() - - project_id = None - - for project in projects.memberships: - if project.name == project_name: - project_id = project.project_id - break - - if project_id is None: - project_id = _get_project(client).project_id - - if not source.startswith("s3://"): - return _error_and_exit( - "Only public S3 folders are supported for now. Please, open a Github issue with your use case." - ) - - try: - client.data_connection_service_create_data_connection( - body=Create( - name=name, - aws=V1AwsDataConnection( - region=region, - source=source, - destination=destination, - secret_arn_name=secret_arn_name, - ), - ), - project_id=project_id, - ) - - # Note: Expose through lightning show data {DATA_NAME} - # response = client.data_connection_service_list_data_connection_artifacts( - # project_id=project_id, - # id=response.id, - # ) - except lightning_cloud.openapi.rest.ApiException as e: - message = ast.literal_eval(e.body.decode("utf-8"))["message"] - _error_and_exit(f"The data connection creation failed. Message: {message}") - - rich.print(f"[green]Succeeded[/green]: You have created a new data connection {name}.") - return None diff --git a/src/lightning/app/cli/core.py b/src/lightning/app/cli/core.py deleted file mode 100644 index 6d54c31426ee1..0000000000000 --- a/src/lightning/app/cli/core.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import abc - -from rich.table import Table - - -class Formatable(abc.ABC): - @abc.abstractmethod - def as_table(self) -> Table: - pass - - @abc.abstractmethod - def as_json(self) -> str: - pass diff --git a/src/lightning/app/cli/lightning_cli.py b/src/lightning/app/cli/lightning_cli.py deleted file mode 100644 index 6aa84063ab93f..0000000000000 --- a/src/lightning/app/cli/lightning_cli.py +++ /dev/null @@ -1,503 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import sys -from pathlib import Path -from typing import Tuple, Union - -import click -from requests.exceptions import ConnectionError - -import lightning.app.core.constants as constants -from lightning.app import __version__ as ver -from lightning.app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init -from lightning.app.cli.commands.app_commands import _run_app_command -from lightning.app.cli.commands.cd import cd -from lightning.app.cli.commands.cp import cp -from lightning.app.cli.commands.logs import logs -from lightning.app.cli.commands.ls import ls -from lightning.app.cli.commands.pwd import pwd -from lightning.app.cli.commands.rm import rm -from lightning.app.cli.connect.app import ( - _list_app_commands, - _retrieve_connection_to_an_app, - connect_app, - disconnect_app, -) -from lightning.app.cli.connect.data import connect_data -from lightning.app.cli.lightning_cli_delete import delete -from lightning.app.cli.lightning_cli_launch import launch -from lightning.app.cli.lightning_cli_list import get_list -from lightning.app.core.constants import ( - APP_SERVER_HOST, - APP_SERVER_PORT, - ENABLE_APP_COMMENT_COMMAND_EXECUTION, - get_lightning_cloud_url, -) -from lightning.app.launcher.launcher import ( - run_lightning_flow, - run_lightning_work, - serve_frontend, - start_application_server, - start_flow_and_servers, -) -from lightning.app.runners.cloud import CloudRuntime -from lightning.app.runners.runtime import dispatch -from lightning.app.runners.runtime_type import RuntimeType -from lightning.app.utilities.app_commands import run_app_commands -from lightning.app.utilities.app_helpers import Logger -from lightning.app.utilities.cli_helpers import ( - _check_environment_and_redirect, - _check_version_and_upgrade, - _format_input_env_variables, -) -from lightning.app.utilities.exceptions import _ApiExceptionHandler -from lightning.app.utilities.login import Auth -from lightning.app.utilities.port import _find_lit_app_port - -logger = Logger(__name__) - - -def main() -> None: - # Check environment and versions if not in the cloud and not testing - is_testing = bool(int(os.getenv("LIGHTING_TESTING", "0"))) - if not is_testing and "LIGHTNING_APP_STATE_URL" not in os.environ: - try: - # Enforce running in PATH Python - _check_environment_and_redirect() - - # Check for newer versions and upgrade - _check_version_and_upgrade() - except SystemExit: - raise - except Exception: - # Note: We intentionally ignore all exceptions here so that we never panic if one of the above calls fails. - # If they fail for some reason users should still be able to continue with their command. - click.echo( - "We encountered an unexpected problem while checking your environment." - "We will still proceed with the command, however, there is a chance that errors may occur." - ) - - # 1: Handle connection to a Lightning App. - if len(sys.argv) > 1 and sys.argv[1] in ("connect", "disconnect", "logout"): - _main() - else: - # 2: Collect the connection a Lightning App. - app_name, app_id = _retrieve_connection_to_an_app() - if app_name: - # 3: Handle development use case. - is_local_app = app_name == "localhost" - if sys.argv[1:3] == ["run", "app"] or ( - sys.argv[1:3] == ["show", "logs"] and "show logs" not in _list_app_commands(False) - ): - _main() - else: - if is_local_app: - message = "You are connected to the local Lightning App." - else: - message = f"You are connected to the cloud Lightning App: {app_name}." 
- - if (len(sys.argv) > 1 and sys.argv[1] in ["-h", "--help"]) or len(sys.argv) == 1: - _list_app_commands() - else: - _run_app_command(app_name, app_id) - - click.echo() - click.echo(message + " Return to the primary CLI with `lightning_app disconnect`.") - else: - _main() - - -@click.group(cls=_ApiExceptionHandler) -@click.version_option(ver) -def _main() -> None: - pass - - -@_main.group() -def show() -> None: - """Show given resource.""" - pass - - -@_main.group() -def connect() -> None: - """Connect apps and data.""" - pass - - -@_main.group() -def disconnect() -> None: - """Disconnect apps.""" - pass - - -connect.command("app")(connect_app) -disconnect.command("app")(disconnect_app) -connect.command("data", hidden=True)(connect_data) -_main.command(hidden=True)(ls) -_main.command(hidden=True)(cd) -_main.command(hidden=True)(cp) -_main.command(hidden=True)(pwd) -_main.command(hidden=True)(rm) -show.command()(logs) - - -@_main.command() -def login() -> None: - """Log in to your lightning.ai account.""" - auth = Auth() - auth.clear() - - try: - auth.authenticate() - except ConnectionError: - click.echo(f"Unable to connect to {get_lightning_cloud_url()}. Please check your internet connection.") - exit(1) - - -@_main.command() -def logout() -> None: - """Log out of your lightning.ai account.""" - Auth().clear() - disconnect_app(logout=True) - - -def _run_app( - file: str, - cloud: bool, - without_server: bool, - no_cache: bool, - name: str, - blocking: bool, - open_ui: bool, - env: tuple, - secret: tuple, - run_app_comment_commands: bool, - enable_basic_auth: str, -) -> None: - if not os.path.exists(file): - original_file = file - file = cmd_install.gallery_apps_and_components(file, True, "latest", overwrite=True) # type: ignore[assignment] # E501 - if file is None: - click.echo(f"The provided entrypoint `{original_file}` doesn't exist.") - sys.exit(1) - run_app_comment_commands = True - - runtime_type = RuntimeType.CLOUD if cloud else RuntimeType.MULTIPROCESS - - # Cloud specific validations - if runtime_type != RuntimeType.CLOUD: - if no_cache: - raise click.ClickException( - "Caching is a property of apps running in cloud. " - "Using the flag --no-cache in local execution is not supported." - ) - if secret: - raise click.ClickException( - "Secrets can only be used for apps running in cloud. " - "Using the option --secret in local execution is not supported." - ) - if (ENABLE_APP_COMMENT_COMMAND_EXECUTION or run_app_comment_commands) and file is not None: - run_app_commands(str(file)) - - env_vars = _format_input_env_variables(env) - os.environ.update(env_vars) - - secrets = _format_input_env_variables(secret) - - port = _find_lit_app_port(constants.APP_SERVER_PORT) - constants.APP_SERVER_PORT = port - - click.echo("Your Lightning App is starting. This won't take long.") - - # TODO: Fixme when Grid utilities are available. 
- # And refactor test_lightning_run_app_cloud - file_path = Path(file) - dispatch( - file_path, - runtime_type, - start_server=not without_server, - no_cache=no_cache, - blocking=blocking, - open_ui=open_ui, - name=name, - env_vars=env_vars, - secrets=secrets, - run_app_comment_commands=run_app_comment_commands, - enable_basic_auth=enable_basic_auth, - port=port, - ) - if runtime_type == RuntimeType.CLOUD: - click.echo("Application is ready in the cloud") - - -@_main.group() -def run() -> None: - """Run a Lightning application locally or on the cloud.""" - - -@run.command("app") -@click.argument("file", type=str) -@click.option("--cloud", type=bool, default=False, is_flag=True) -@click.option("--name", help="The current application name", default="", type=str) -@click.option("--without-server", is_flag=True, default=False) -@click.option( - "--no-cache", - is_flag=True, - default=False, - help="Disable caching of packages " "installed from requirements.txt", -) -@click.option("--blocking", "blocking", type=bool, default=False) -@click.option( - "--open-ui", - type=bool, - default=True, - help="Decide whether to launch the app UI in a web browser", -) -@click.option("--env", type=str, default=[], multiple=True, help="Environment variables to be set for the app.") -@click.option("--secret", type=str, default=[], multiple=True, help="Secret variables to be set for the app.") -@click.option("--app_args", type=str, default=[], multiple=True, help="Collection of arguments for the app.") -@click.option( - "--setup", - "-s", - "run_app_comment_commands", - is_flag=True, - default=False, - help="run environment setup commands from the app comments.", -) -@click.option( - "--enable-basic-auth", - type=str, - default="", - help="Enable basic authentication for the app and use credentials provided in the format username:password", -) -def run_app( - file: str, - cloud: bool, - without_server: bool, - no_cache: bool, - name: str, - blocking: bool, - open_ui: bool, - env: tuple, - secret: tuple, - app_args: tuple, - run_app_comment_commands: bool, - enable_basic_auth: str, -) -> None: - """Run an app from a file.""" - _run_app( - file, - cloud, - without_server, - no_cache, - name, - blocking, - open_ui, - env, - secret, - run_app_comment_commands, - enable_basic_auth, - ) - - -@_main.command("open", hidden=True) -@click.argument("path", type=str, default=".") -@click.option("--name", help="The name to use for the CloudSpace", default="", type=str) -def open(path: str, name: str) -> None: - """Open files or folders from your machine on the cloud.""" - if not os.path.exists(path): - click.echo(f"The provided path `{path}` doesn't exist.") - sys.exit(1) - - runtime = CloudRuntime(entrypoint=Path(path)) - runtime.open(name) - - -_main.add_command(get_list) -_main.add_command(delete) -_main.add_command(launch) -_main.add_command(cmd_install.install) - - -@_main.group() -def init() -> None: - """Init a Lightning App and/or component.""" - - -@init.command("app") -@click.argument("name", type=str, required=False) -def init_app(name: str) -> None: - cmd_init.app(name) - - -@init.command("pl-app") -@click.argument("source", nargs=-1) -@click.option( - "--name", - "-n", - type=str, - default="pl-app", - help="The name of the folder where the app code will be. 
Default: pl-app", -) -@click.option( - "--overwrite", - "-f", - is_flag=True, - default=False, - help="When set, overwrite the output directory without asking if it already exists.", -) -def init_pl_app(source: Union[Tuple[str], Tuple[str, str]], name: str, overwrite: bool = False) -> None: - """Create an app from your PyTorch Lightning source files.""" - if len(source) == 1: - script_path = source[0] - source_dir = str(Path(script_path).resolve().parent) - elif len(source) == 2: - # enable type checking once https://github.com/python/mypy/issues/1178 is available - source_dir, script_path = source - else: - click.echo( - f"Incorrect number of arguments. You passed ({', '.join(source)}) but only either one argument" - f" (script path) or two arguments (root dir, script path) are allowed. Examples:\n" - f"lightning init pl-app ./path/to/script.py\n" - f"lightning init pl-app ./code ./code/path/to/script.py", - err=True, - ) - raise SystemExit(1) - - cmd_pl_init.pl_app(source_dir=source_dir, script_path=script_path, name=name, overwrite=overwrite) - - -@init.command("component") -@click.argument("name", type=str, required=False) -def init_component(name: str) -> None: - cmd_init.component(name) - - -@init.command("react-ui") -@click.option( - "--dest_dir", - "-dest_dir", - type=str, - help="optional destination directory to create the react ui", -) -def init_react_ui(dest_dir: str) -> None: - """Create a react UI to give a Lightning component a React.js web user interface (UI)""" - cmd_react_ui_init.react_ui(dest_dir) - - -def _prepare_file(file: str) -> str: - exists = os.path.exists(file) - if exists: - return file - - raise FileNotFoundError(f"The provided file {file} hasn't been found.") - - -@run.command("server") -@click.argument("file", type=click.Path(exists=True)) -@click.option("--queue-id", help="ID for identifying queue", default="", type=str) -@click.option("--host", help="Application running host", default=APP_SERVER_HOST, type=str) -@click.option("--port", help="Application running port", default=APP_SERVER_PORT, type=int) -def run_server(file: str, queue_id: str, host: str, port: int) -> None: - """It takes the application file as input, build the application object and then use that to run the application - server. - - This is used by the cloud runners to start the status server for the application - - """ - logger.debug(f"Run Server: {file} {queue_id} {host} {port}") - start_application_server(file, host, port, queue_id=queue_id) - - -@run.command("flow") -@click.argument("file", type=click.Path(exists=True)) -@click.option("--queue-id", help="ID for identifying queue", default="", type=str) -@click.option("--base-url", help="Base url at which the app server is hosted", default="") -def run_flow(file: str, queue_id: str, base_url: str) -> None: - """It takes the application file as input, build the application object, proxy all the work components and then run - the application flow defined in the root component. - - It does exactly what a singleprocess dispatcher would do but with proxied work components. 
- - """ - logger.debug(f"Run Flow: {file} {queue_id} {base_url}") - run_lightning_flow(file, queue_id=queue_id, base_url=base_url) - - -@run.command("work") -@click.argument("file", type=click.Path(exists=True)) -@click.option("--work-name", type=str) -@click.option("--queue-id", help="ID for identifying queue", default="", type=str) -def run_work(file: str, work_name: str, queue_id: str) -> None: - """Unlike other entrypoints, this command will take the file path or module details for a work component and run - that by fetching the states from the queues.""" - logger.debug(f"Run Work: {file} {work_name} {queue_id}") - run_lightning_work( - file=file, - work_name=work_name, - queue_id=queue_id, - ) - - -@run.command("frontend") -@click.argument("file", type=click.Path(exists=True)) -@click.option("--flow-name") -@click.option("--host") -@click.option("--port", type=int) -def run_frontend(file: str, flow_name: str, host: str, port: int) -> None: - """Serve the frontend specified by the given flow.""" - logger.debug(f"Run Frontend: {file} {flow_name} {host}") - serve_frontend(file=file, flow_name=flow_name, host=host, port=port) - - -@run.command("flow-and-servers") -@click.argument("file", type=click.Path(exists=True)) -@click.option("--queue-id", help="ID for identifying queue", default="", type=str) -@click.option("--base-url", help="Base url at which the app server is hosted", default="") -@click.option("--host", help="Application running host", default=APP_SERVER_HOST, type=str) -@click.option("--port", help="Application running port", default=APP_SERVER_PORT, type=int) -@click.option( - "--flow-port", - help="Pair of flow name and frontend port", - type=(str, int), - multiple=True, -) -def run_flow_and_servers( - file: str, - base_url: str, - queue_id: str, - host: str, - port: int, - flow_port: Tuple[Tuple[str, int]], -) -> None: - """It takes the application file as input, build the application object and then use that to run the application - flow defined in the root component, the application server and all the flow frontends. - - This is used by the cloud runners to start the flow, the status server and all frontends for the application - - """ - logger.debug(f"Run Flow: {file} {queue_id} {base_url}") - logger.debug(f"Run Server: {file} {queue_id} {host} {port}.") - logger.debug(f"Run Frontend's: {flow_port}") - start_flow_and_servers( - entrypoint_file=file, - base_url=base_url, - queue_id=queue_id, - host=host, - port=port, - flow_names_and_ports=flow_port, - ) diff --git a/src/lightning/app/cli/lightning_cli_delete.py b/src/lightning/app/cli/lightning_cli_delete.py deleted file mode 100644 index 179e5b6fc365d..0000000000000 --- a/src/lightning/app/cli/lightning_cli_delete.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
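The `delete app` command removed below guards its destructive action with an interactive `inquirer` confirmation. A minimal standalone sketch of that prompt pattern, assuming the `inquirer` package is installed:

```python
import inquirer
from inquirer.themes import GreenPassion

questions = [
    inquirer.Confirm(
        "confirm",
        message='Are you sure you want to delete app "my-app"?',
        default=False,  # destructive actions should default to "no"
    ),
]

# prompt() returns a dict keyed by question name; with
# raise_keyboard_interrupt=True, Ctrl-C raises instead of returning None.
answers = inquirer.prompt(questions, theme=GreenPassion(), raise_keyboard_interrupt=True)
if not answers["confirm"]:
    print("Aborted!")
```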
-
-import click
-import inquirer
-from inquirer.themes import GreenPassion
-from rich.console import Console
-
-from lightning.app.cli.cmd_apps import _AppManager
-
-
-@click.group("delete")
-def delete() -> None:
-    """Delete Lightning AI self-managed resources (e.g. apps)."""
-    pass
-
-
-def _find_selected_app_instance_id(app_name: str) -> str:
-    console = Console()
-    app_manager = _AppManager()
-
-    all_app_names_and_ids = {}
-    selected_app_instance_id = None
-
-    for app in app_manager.list_apps():
-        all_app_names_and_ids[app.name] = app.id
-        # figure out the ID of the given app_name
-        if app_name == app.name or app_name == app.id:
-            selected_app_instance_id = app.id
-            break
-
-    if selected_app_instance_id is None:
-        # when there is no app with the given app_name,
-        # ask the user which app they would like to delete.
-        console.print(f'[b][yellow]Cannot find app named "{app_name}"[/yellow][/b]')
-        try:
-            ask = [
-                inquirer.List(
-                    "app_name",
-                    message="Select the app name to delete",
-                    choices=list(all_app_names_and_ids.keys()),
-                ),
-            ]
-            app_name = inquirer.prompt(ask, theme=GreenPassion(), raise_keyboard_interrupt=True)["app_name"]
-            selected_app_instance_id = all_app_names_and_ids[app_name]
-        except KeyboardInterrupt:
-            console.print("[b][red]Cancelled by user![/b][/red]")
-            raise InterruptedError
-
-    return selected_app_instance_id
-
-
-def _delete_app_confirmation_prompt(app_name: str) -> None:
-    console = Console()
-
-    # when the --yes / -y flags were not passed, do a final
-    # confirmation that the user wants to delete the app.
-    try:
-        ask = [
-            inquirer.Confirm(
-                "confirm",
-                message=f'Are you sure you want to delete app "{app_name}"?',
-                default=False,
-            ),
-        ]
-        if inquirer.prompt(ask, theme=GreenPassion(), raise_keyboard_interrupt=True)["confirm"] is False:
-            console.print("[b][red]Aborted![/b][/red]")
-            raise InterruptedError
-    except KeyboardInterrupt:
-        console.print("[b][red]Cancelled by user![/b][/red]")
-        raise InterruptedError
-
-
-@delete.command("app")
-@click.argument("app-name", type=str)
-@click.option(
-    "skip_user_confirm_prompt",
-    "--yes",
-    "-y",
-    is_flag=True,
-    default=False,
-    help="Do not prompt for confirmation.",
-)
-def delete_app(app_name: str, skip_user_confirm_prompt: bool) -> None:
-    """Delete a Lightning app.
-
-    Deleting an app also deletes all app websites, works, artifacts, and logs. This permanently removes any record
-    of the app as well as any of its associated resources and data. This does not affect any resources and data
-    associated with other Lightning apps on your account.
-
-    """
-    console = Console()
-
-    try:
-        selected_app_instance_id = _find_selected_app_instance_id(app_name=app_name)
-        if not skip_user_confirm_prompt:
-            _delete_app_confirmation_prompt(app_name=app_name)
-    except InterruptedError:
-        return
-
-    try:
-        # Delete the app!
-        app_manager = _AppManager()
-        app_manager.delete(app_id=selected_app_instance_id)
-    except Exception as ex:
-        console.print(
-            f'[b][red]An issue occurred while deleting app "{app_name}". If the issue persists, please '
-            "reach out to us at [link=mailto:support@lightning.ai]support@lightning.ai[/link][/b][/red]."
-        )
-        raise click.ClickException(str(ex))
-
-    console.print(f'[b][green]App "{app_name}" has been successfully deleted![/green][/b]')
-    return
diff --git a/src/lightning/app/cli/lightning_cli_launch.py b/src/lightning/app/cli/lightning_cli_launch.py
deleted file mode 100644
index c171fd7b946f1..0000000000000
--- a/src/lightning/app/cli/lightning_cli_launch.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright The Lightning AI team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from typing import Tuple
-
-import click
-
-from lightning.app.core.constants import APP_SERVER_HOST, APP_SERVER_PORT
-from lightning.app.launcher.launcher import (
-    run_lightning_flow,
-    run_lightning_work,
-    serve_frontend,
-    start_application_server,
-    start_flow_and_servers,
-)
-
-logger = logging.getLogger(__name__)
-
-
-@click.group(name="launch", hidden=True)
-def launch() -> None:
-    """Launch your application."""
-
-
-@launch.command("server", hidden=True)
-@click.argument("file", type=click.Path(exists=True))
-@click.option("--queue-id", help="ID for identifying queue", default="", type=str)
-@click.option("--host", help="Application running host", default=APP_SERVER_HOST, type=str)
-@click.option("--port", help="Application running port", default=APP_SERVER_PORT, type=int)
-def run_server(file: str, queue_id: str, host: str, port: int) -> None:
-    """It takes the application file as input, builds the application object and then uses it to run the
-    application server.
-
-    This is used by the cloud runners to start the status server for the application.
-
-    """
-    logger.debug(f"Run Server: {file} {queue_id} {host} {port}")
-    start_application_server(file, host, port, queue_id=queue_id)
-
-
-@launch.command("flow", hidden=True)
-@click.argument("file", type=click.Path(exists=True))
-@click.option("--queue-id", help="ID for identifying queue", default="", type=str)
-@click.option("--base-url", help="Base url at which the app server is hosted", default="")
-def run_flow(file: str, queue_id: str, base_url: str) -> None:
-    """It takes the application file as input, builds the application object, proxies all the work components and
-    then runs the application flow defined in the root component.
-
-    It does exactly what a single-process dispatcher would do but with proxied work components.
- - """ - logger.debug(f"Run Flow: {file} {queue_id} {base_url}") - run_lightning_flow(file, queue_id=queue_id, base_url=base_url) - - -@launch.command("work", hidden=True) -@click.argument("file", type=click.Path(exists=True)) -@click.option("--work-name", type=str) -@click.option("--queue-id", help="ID for identifying queue", default="", type=str) -def run_work(file: str, work_name: str, queue_id: str) -> None: - """Unlike other entrypoints, this command will take the file path or module details for a work component and run - that by fetching the states from the queues.""" - logger.debug(f"Run Work: {file} {work_name} {queue_id}") - run_lightning_work( - file=file, - work_name=work_name, - queue_id=queue_id, - ) - - -@launch.command("frontend", hidden=True) -@click.argument("file", type=click.Path(exists=True)) -@click.option("--flow-name") -@click.option("--host") -@click.option("--port", type=int) -def run_frontend(file: str, flow_name: str, host: str, port: int) -> None: - """Serve the frontend specified by the given flow.""" - logger.debug(f"Run Frontend: {file} {flow_name} {host}") - serve_frontend(file=file, flow_name=flow_name, host=host, port=port) - - -@launch.command("flow-and-servers", hidden=True) -@click.argument("file", type=click.Path(exists=True)) -@click.option("--queue-id", help="ID for identifying queue", default="", type=str) -@click.option("--base-url", help="Base url at which the app server is hosted", default="") -@click.option("--host", help="Application running host", default=APP_SERVER_HOST, type=str) -@click.option("--port", help="Application running port", default=APP_SERVER_PORT, type=int) -@click.option( - "--flow-port", - help="Pair of flow name and frontend port", - type=(str, int), - multiple=True, -) -def run_flow_and_servers( - file: str, - base_url: str, - queue_id: str, - host: str, - port: int, - flow_port: Tuple[Tuple[str, int]], -) -> None: - """It takes the application file as input, build the application object and then use that to run the application - flow defined in the root component, the application server and all the flow frontends. - - This is used by the cloud runners to start the flow, the status server and all frontends for the application - - """ - logger.debug(f"Run Flow: {file} {queue_id} {base_url}") - logger.debug(f"Run Server: {file} {queue_id} {host} {port}.") - logger.debug(f"Run Frontend's: {flow_port}") - start_flow_and_servers( - entrypoint_file=file, - base_url=base_url, - queue_id=queue_id, - host=host, - port=port, - flow_names_and_ports=flow_port, - ) diff --git a/src/lightning/app/cli/lightning_cli_list.py b/src/lightning/app/cli/lightning_cli_list.py deleted file mode 100644 index 0cbc8e3cc1887..0000000000000 --- a/src/lightning/app/cli/lightning_cli_list.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright The Lightning AI team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Any - -import click - -from lightning.app.cli.cmd_apps import _AppManager - - -@click.group(name="list") -def get_list() -> None: - """List Lightning AI self-managed resources (e.g. apps)""" - pass - - -@get_list.command("apps") -def list_apps(**kwargs: Any) -> None: - """List your Lightning AI apps.""" - app_manager = _AppManager() - app_manager.list() diff --git a/src/lightning/app/cli/pl-app-template/.gitignore b/src/lightning/app/cli/pl-app-template/.gitignore deleted file mode 100644 index 01aa0091c3945..0000000000000 --- a/src/lightning/app/cli/pl-app-template/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.storage diff --git a/src/lightning/app/cli/pl-app-template/.lightningignore b/src/lightning/app/cli/pl-app-template/.lightningignore deleted file mode 100644 index 5895fd5187660..0000000000000 --- a/src/lightning/app/cli/pl-app-template/.lightningignore +++ /dev/null @@ -1,2 +0,0 @@ -.storage -ui/node_modules diff --git a/src/lightning/app/cli/pl-app-template/app.py b/src/lightning/app/cli/pl-app-template/app.py deleted file mode 100644 index b15ea21a02276..0000000000000 --- a/src/lightning/app/cli/pl-app-template/app.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -from typing import Dict, List, Optional, Union - -from core.components import TensorBoard, WeightsAndBiases -from core.components.script_runner import ScriptRunner -from lightning.app import LightningApp, LightningFlow -from lightning.app.frontend import StaticWebFrontend -from lightning.app.storage.path import Path -from lightning.app.utilities.packaging.cloud_compute import CloudCompute - - -class ReactUI(LightningFlow): - def configure_layout(self): - return StaticWebFrontend(str(Path(__file__).parent / "ui/build")) - - -class ScriptOrchestrator(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.script_runner: Optional[ScriptRunner] = None - self.triggered: bool = False - self.running: bool = False - self.succeeded: bool = False - self.failed: bool = False - self.script_args: List[str] = [] - self.cloud_compute_args: Dict[str, Union[str, int]] = {"name": "cpu-small"} - self.environment_variables: Dict[str, str] = {} - self.script_path = "{{ script_path }}" - - def run(self) -> None: - if not self.triggered: - return - - if self.script_runner is None: - self.script_runner = ScriptRunner( - root_path=str(Path(__file__).parent / "source"), - script_path=str(Path(__file__).parent / "source" / self.script_path), - script_args=self.script_args, - env=self._prepare_environment(), - parallel=True, - cloud_compute=CloudCompute(**self.cloud_compute_args), - raise_exception=False, - ) - self.script_runner.run() - - self.running = self.script_runner is not None and self.script_runner.has_started - self.succeeded = self.script_runner is not None and self.script_runner.has_succeeded - self.failed = self.script_runner is not None and self.script_runner.has_failed - - if self.succeeded or self.failed: - self.triggered = False - # TODO: support restarting - # self.script_runner = None - - def _prepare_environment(self) -> Dict[str, str]: - env = os.environ.copy() - env.update(self.environment_variables) - return env - - -class Main(LightningFlow): - def __init__(self) -> None: - super().__init__() - self.react_ui = ReactUI() - self.script_orchestrator = ScriptOrchestrator() - self.running_in_cloud = bool(os.environ.get("LIGHTNING_CLOUD_APP_ID", False)) - - def run(self) -> None: - self.react_ui.run() - self.script_orchestrator.run() - - if self.script_orchestrator.script_runner and 
self.script_orchestrator.script_runner.logger_metadatas:
-            if not getattr(self, "logger_component", None):
-                # TODO: Hack with hasattr and setattr until
-                # https://linear.app/gridai/issue/LAI2-8970/work-getting-set-to-none-in-state-update-from-appstate
-                # is resolved
-                logger_component = self._choose_logger_component()
-                if logger_component is not None:
-                    setattr(self, "logger_component", logger_component)
-            else:
-                self.logger_component.run()
-
-    def configure_layout(self):
-        tabs = [{"name": "Home", "content": self.react_ui}]
-        if hasattr(self, "logger_component"):
-            tabs.extend(self.logger_component.configure_layout())
-        return tabs
-
-    def _choose_logger_component(self) -> Optional[Union[TensorBoard, WeightsAndBiases]]:
-        logger_metadatas = self.script_orchestrator.script_runner.logger_metadatas
-        if not logger_metadatas:
-            return None
-        if logger_metadatas[0].get("class_name") == "TensorBoardLogger":
-            return TensorBoard(log_dir=self.script_orchestrator.script_runner.log_dir)
-        if logger_metadatas[0].get("class_name") == "WandbLogger":
-            return WeightsAndBiases(
-                username=logger_metadatas[0]["username"],
-                project_name=logger_metadatas[0]["project_name"],
-                run_id=logger_metadatas[0]["run_id"],
-                api_key=self.script_orchestrator.environment_variables.get("WANDB_API_KEY"),
-            )
-        return None
-
-
-app = LightningApp(Main())
diff --git a/src/lightning/app/cli/pl-app-template/core/__init__.py b/src/lightning/app/cli/pl-app-template/core/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/src/lightning/app/cli/pl-app-template/core/callbacks.py b/src/lightning/app/cli/pl-app-template/core/callbacks.py
deleted file mode 100644
index 87ec8e9bcbda2..0000000000000
--- a/src/lightning/app/cli/pl-app-template/core/callbacks.py
+++ /dev/null
@@ -1,319 +0,0 @@
-import inspect
-from typing import TYPE_CHECKING, Any, Dict, Union
-
-import lightning.pytorch as pl
-from lightning.app.storage.path import Path
-from lightning.app.utilities.app_helpers import Logger
-from lightning.pytorch import Callback
-from lightning.pytorch.callbacks.progress.progress_bar import get_standard_metrics
-from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
-from lightning.pytorch.utilities.parsing import collect_init_args
-
-from core.state import ProgressBarState, TrainerState
-
-if TYPE_CHECKING:
-    from core.components.script_runner import ScriptRunner
-
-
-_log = Logger(__name__)
-
-
-class PLAppProgressTracker(Callback):
-    """This callback tracks and communicates the Trainer's progress to the running PyTorch Lightning App."""
-
-    def __init__(self, work: "ScriptRunner", refresh_rate: int = 1) -> None:
-        super().__init__()
-        self.work = work
-        self.refresh_rate = refresh_rate
-        self.is_enabled = False
-        self._state = ProgressBarState()
-
-    def setup(
-        self,
-        trainer: "pl.Trainer",
-        pl_module: "pl.LightningModule",
-        stage: str,
-    ) -> None:
-        self.is_enabled = trainer.is_global_zero
-
-    def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
-        # We calculate the estimated stepping batches here instead of in the setup hook, because calling
-        # `Trainer.estimated_stepping_batches` too early would lead to a barrier() call in the case of DDP, and
-        # since this callback is only attached on rank 0, that would lead to a stall.
- self._state.fit.estimated_stepping_batches = trainer.estimated_stepping_batches - - def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", *_: Any) -> None: - self._state.fit.total_train_batches = self._total_train_batches(trainer) - self._state.fit.total_val_batches = self._total_val_batches(trainer) - self._state.fit.current_epoch = trainer.current_epoch - if self.is_enabled: - self._send_state() - - def on_train_batch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", *_: Any) -> None: - self._state.metrics = self._progress_bar_metrics(trainer, pl_module) - current = self._train_batch_idx(trainer) - self._state.fit.train_batch_idx = current - self._state.fit.global_step = trainer.global_step - if self._should_send(current, self._total_train_batches(trainer)): - self._send_state() - - def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.metrics = self._progress_bar_metrics(trainer, pl_module) - if self.is_enabled: - self._send_state() - - def on_validation_batch_start( - self, - trainer: "pl.Trainer", - pl_module: "pl.LightningModule", - batch: Any, - batch_idx: int, - dataloader_idx: int, - ) -> None: - if trainer.state.fn == "fit": - self._state.fit.val_dataloader_idx = dataloader_idx - self._state.fit.total_val_batches = self._total_val_batches(trainer) - if trainer.state.fn == "validate": - self._state.val.dataloader_idx = dataloader_idx - self._state.val.total_val_batches = self._total_val_batches(trainer) - - def on_validation_batch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", *_: Any) -> None: - self._state.metrics = self._progress_bar_metrics(trainer, pl_module) - current = self._val_batch_idx(trainer) - if trainer.state.fn == "fit": - self._state.fit.val_batch_idx = current - if trainer.state.fn == "validate": - self._state.val.val_batch_idx = current - if self._should_send(current, self._total_val_batches(trainer)): - self._send_state() - - def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.metrics = self._progress_bar_metrics(trainer, pl_module) - if self.is_enabled: - self._send_state() - - def on_test_batch_start( - self, - trainer: "pl.Trainer", - pl_module: "pl.LightningModule", - batch: Any, - batch_idx: int, - dataloader_idx: int, - ) -> None: - self._state.test.dataloader_idx = dataloader_idx - self._state.test.total_test_batches = trainer.num_test_batches[dataloader_idx] - - def on_test_batch_end( - self, - trainer: "pl.Trainer", - pl_module: "pl.LightningModule", - outputs: Any, - batch: Any, - batch_idx: int, - dataloader_idx: int = 0, - ) -> None: - self._state.metrics = self._progress_bar_metrics(trainer, pl_module) - current = self._test_batch_idx(trainer) - self._state.test.test_batch_idx = current - if self._should_send(current, trainer.num_test_batches[dataloader_idx]): - self._send_state() - - def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.metrics = self._progress_bar_metrics(trainer, pl_module) - if self.is_enabled: - self._send_state() - - def on_predict_batch_start( - self, - trainer: "pl.Trainer", - pl_module: "pl.LightningModule", - batch: Any, - batch_idx: int, - dataloader_idx: int, - ) -> None: - self._state.predict.dataloader_idx = dataloader_idx - self._state.predict.total_predict_batches = trainer.num_predict_batches[dataloader_idx] - - def on_predict_batch_end( - self, - trainer: "pl.Trainer", - pl_module: 
"pl.LightningModule", - outputs: Any, - batch: Any, - batch_idx: int, - dataloader_idx: int = 0, - ) -> None: - self._state.metrics = self._progress_bar_metrics(trainer, pl_module) - current = self._predict_batch_idx(trainer) - self._state.predict.predict_batch_idx = current - if self._should_send(current, trainer.num_predict_batches[dataloader_idx]): - self._send_state() - - def on_predict_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.metrics = self._progress_bar_metrics(trainer, pl_module) - if self.is_enabled: - self._send_state() - - def _train_batch_idx(self, trainer: "pl.Trainer") -> int: - return trainer.fit_loop.epoch_loop.batch_progress.current.processed - - def _val_batch_idx(self, trainer: "pl.Trainer") -> int: - loop = trainer.fit_loop.epoch_loop.val_loop if trainer.state.fn == "fit" else trainer.validate_loop - - return loop.epoch_loop.batch_progress.current.processed - - def _test_batch_idx(self, trainer: "pl.Trainer") -> int: - return trainer.test_loop.epoch_loop.batch_progress.current.processed - - def _predict_batch_idx(self, trainer: "pl.Trainer") -> int: - return trainer.predict_loop.epoch_loop.batch_progress.current.processed - - def _total_train_batches(self, trainer: "pl.Trainer") -> Union[int, float]: - return trainer.num_training_batches - - def _total_val_batches(self, trainer: "pl.Trainer") -> Union[int, float]: - return sum(trainer.num_val_batches) if trainer.fit_loop.epoch_loop._should_check_val_epoch() else 0 - - def _progress_bar_metrics( - self, trainer: "pl.Trainer", pl_module: "pl.LightningModule" - ) -> Dict[str, Union[str, float]]: - standard_metrics = get_standard_metrics(trainer, pl_module) - pbar_metrics = trainer.progress_bar_metrics - return {**standard_metrics, **pbar_metrics} - - def _send_state(self) -> None: - self.work.trainer_progress = self._state.dict() - - def _should_send(self, current: int, total: int) -> bool: - return self.is_enabled and current % self.refresh_rate == 0 or current == total - - -class PLAppTrainerStateTracker(Callback): - def __init__(self, work: "ScriptRunner") -> None: - super().__init__() - self.work = work - self._state = TrainerState() - - def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.fn = "fit" - self.work.trainer_state = self._state.dict() - - def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.fn = None - self.work.trainer_state = self._state.dict() - - def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.stage = "training" - self.work.trainer_state = self._state.dict() - - def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.stage = None - self.work.trainer_state = self._state.dict() - - def on_validation_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.stage = "validating" - self.work.trainer_state = self._state.dict() - - def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.stage = None - self.work.trainer_state = self._state.dict() - - def on_test_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.fn = "test" - self._state.stage = "testing" - self.work.trainer_state = self._state.dict() - - def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - self._state.fn = None - self._state.stage = 
-        self.work.trainer_state = self._state.dict()
-
-    def on_predict_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
-        self._state.fn = "predict"
-        self._state.stage = "predicting"
-        self.work.trainer_state = self._state.dict()
-
-    def on_predict_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
-        self._state.fn = None
-        self._state.stage = None
-        self.work.trainer_state = self._state.dict()
-
-
-class PLAppSummary(Callback):
-    def __init__(self, work: "ScriptRunner") -> None:
-        super().__init__()
-        self.work = work
-
-    def on_init_end(self, trainer: "pl.Trainer") -> None:
-        current_frame = inspect.currentframe()
-        # Trainer.__init__() -> Trainer._call_callback_hooks() -> Callback.on_init_end()
-        frame = current_frame.f_back.f_back
-        init_args = {}
-        for local_args in collect_init_args(frame, []):
-            init_args.update(local_args)
-
-        self.work.trainer_hparams = self._sanitize_trainer_init_args(init_args)
-
-    def setup(
-        self,
-        trainer: "pl.Trainer",
-        pl_module: "pl.LightningModule",
-        stage: str,
-    ) -> None:
-        self.work.model_hparams = self._sanitize_model_init_args(dict(**pl_module.hparams))
-
-    def _sanitize_trainer_init_args(self, init_args: Dict[str, Any]) -> Dict[str, str]:
-        if init_args["callbacks"]:
-            init_args["callbacks"] = [c.__class__.__name__ for c in init_args["callbacks"]]
-        return {k: str(v) for k, v in init_args.items()}
-
-    def _sanitize_model_init_args(self, init_args: Dict[str, Any]) -> Dict[str, str]:
-        return {k: str(v) for k, v in init_args.items()}
-
-
-class PLAppArtifactsTracker(Callback):
-    def __init__(self, work: "ScriptRunner") -> None:
-        super().__init__()
-        self.work = work
-
-    def setup(
-        self,
-        trainer: "pl.Trainer",
-        pl_module: "pl.LightningModule",
-        stage: str,
-    ) -> None:
-        log_dir = self._get_logdir(trainer)
-        self.work.log_dir = Path(log_dir) if log_dir is not None else None
-        self._collect_logger_metadata(trainer)
-
-    def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
-        if trainer.checkpoint_callback and trainer.checkpoint_callback.dirpath is not None:
-            self.work.checkpoint_dir = Path(trainer.checkpoint_callback.dirpath)
-
-    def _collect_logger_metadata(self, trainer: "pl.Trainer") -> None:
-        if not trainer.loggers:
-            return
-
-        for logger in trainer.loggers:
-            metadata = {"class_name": logger.__class__.__name__}
-            if isinstance(logger, WandbLogger) and not logger._offline:
-                metadata.update({
-                    "username": logger.experiment.entity,
-                    "project_name": logger.name,
-                    "run_id": logger.version,
-                })
-
-            if metadata and metadata not in self.work.logger_metadatas:
-                self.work.logger_metadatas.append(metadata)
-
-    @staticmethod
-    def _get_logdir(trainer: "pl.Trainer") -> str:
-        """The code here is the same as in ``Trainer.log_dir``, with the exception of the broadcast call."""
-        if len(trainer.loggers) == 1:
-            if isinstance(trainer.logger, TensorBoardLogger):
-                dirpath = trainer.logger.log_dir
-            else:
-                dirpath = trainer.logger.save_dir
-        else:
-            dirpath = trainer.default_root_dir
-        return dirpath
diff --git a/src/lightning/app/cli/pl-app-template/core/components/__init__.py b/src/lightning/app/cli/pl-app-template/core/components/__init__.py
deleted file mode 100644
index 75f49eb7da05e..0000000000000
--- a/src/lightning/app/cli/pl-app-template/core/components/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from core.components.logger.tensorboard import TensorBoard  # noqa: F401
-from core.components.logger.weights_and_biases import WeightsAndBiases  # noqa: F401
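All of the callbacks above share one mechanism: they hold a reference to the `ScriptRunner` work and, on each Trainer hook, mirror a small state object into a plain attribute on the work, which the flow then reads to drive the UI. A stripped-down sketch of that pattern, assuming only the public `lightning.pytorch.Callback` API (the `work` stand-in below is hypothetical, not the real `ScriptRunner`):

    from types import SimpleNamespace

    import lightning.pytorch as pl
    from lightning.pytorch import Callback


    class StageTracker(Callback):
        """Mirror the coarse Trainer stage into a dict on a work-like object."""

        def __init__(self, work) -> None:
            super().__init__()
            self.work = work
            self._state = {"fn": None, "stage": None}

        def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
            self._state["fn"] = "fit"
            self.work.trainer_state = dict(self._state)  # publish a copy, not the live dict

        def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
            self._state["fn"] = None
            self.work.trainer_state = dict(self._state)


    work = SimpleNamespace(trainer_state={})
    # trainer = pl.Trainer(callbacks=[StageTracker(work)])  # attached like any other callback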
diff --git a/src/lightning/app/cli/pl-app-template/core/components/logger/__init__.py b/src/lightning/app/cli/pl-app-template/core/components/logger/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/pl-app-template/core/components/logger/tensorboard.py b/src/lightning/app/cli/pl-app-template/core/components/logger/tensorboard.py deleted file mode 100644 index 0e1a536ff4859..0000000000000 --- a/src/lightning/app/cli/pl-app-template/core/components/logger/tensorboard.py +++ /dev/null @@ -1,49 +0,0 @@ -import subprocess -import time -from typing import Dict, List - -from lightning.app import BuildConfig, LightningFlow, LightningWork -from lightning.app.storage.path import Path - - -class TensorBoard(LightningFlow): - def __init__(self, log_dir: Path, sync_every_n_seconds: int = 5) -> None: - """This TensorBoard component synchronizes the log directory of an experiment and starts up the server. - - Args: - log_dir: The path to the directory where the TensorBoard log-files will appear. - sync_every_n_seconds: How often to sync the log directory (given as an argument to the run method) - - """ - super().__init__() - self.worker = TensorBoardWorker(log_dir=log_dir, sync_every_n_seconds=sync_every_n_seconds) - - def run(self) -> None: - self.worker.run() - - def configure_layout(self) -> List[Dict[str, str]]: - return [{"name": "Training Logs", "content": self.worker.url}] - - -class TensorBoardWorker(LightningWork): - def __init__(self, log_dir: Path, sync_every_n_seconds: int = 5) -> None: - super().__init__(cloud_build_config=BuildConfig(requirements=["tensorboard"])) - self.log_dir = log_dir - self._sync_every_n_seconds = sync_every_n_seconds - - def run(self) -> None: - subprocess.Popen([ - "tensorboard", - "--logdir", - str(self.log_dir), - "--host", - self.host, - "--port", - str(self.port), - ]) - - # Download the log directory periodically - while True: - time.sleep(self._sync_every_n_seconds) - if self.log_dir.exists_remote(): - self.log_dir.get(overwrite=True) diff --git a/src/lightning/app/cli/pl-app-template/core/components/logger/weights_and_biases.py b/src/lightning/app/cli/pl-app-template/core/components/logger/weights_and_biases.py deleted file mode 100644 index bf20d17de033c..0000000000000 --- a/src/lightning/app/cli/pl-app-template/core/components/logger/weights_and_biases.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -from typing import TYPE_CHECKING, Dict, List, Optional - -from lightning.app import LightningFlow - -if TYPE_CHECKING: - import wandb - - -class WeightsAndBiases(LightningFlow): - def __init__(self, username: str, project_name: str, run_id: str, api_key: Optional[str] = None) -> None: - super().__init__() - self.username = username - self.project_name = project_name - self.run_id = run_id - self._api_key = api_key - self._run: Optional[wandb.Run] = None - - def run(self) -> None: - if self._run is not None: - return - - if self._api_key: - os.environ["WANDB_API_KEY"] = self._api_key - - import wandb - - self._run = wandb.init(project=self.project_name, id=self.run_id, entity=self.username) - - def configure_layout(self) -> List[Dict[str, str]]: - if self._run is not None: - return [{"name": "Training Logs", "content": self._run.get_url()}] - return [] diff --git a/src/lightning/app/cli/pl-app-template/core/components/script_runner/__init__.py b/src/lightning/app/cli/pl-app-template/core/components/script_runner/__init__.py deleted file mode 100644 index b74bcabd5fbd7..0000000000000 --- 
a/src/lightning/app/cli/pl-app-template/core/components/script_runner/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from core.components.script_runner.script_runner import ScriptRunner # noqa: F401 diff --git a/src/lightning/app/cli/pl-app-template/core/components/script_runner/script_runner.py b/src/lightning/app/cli/pl-app-template/core/components/script_runner/script_runner.py deleted file mode 100644 index 0c2f09e372237..0000000000000 --- a/src/lightning/app/cli/pl-app-template/core/components/script_runner/script_runner.py +++ /dev/null @@ -1,76 +0,0 @@ -import sys -import traceback -from typing import Any, Dict, List, Optional, Tuple - -from lightning.app.components.python import TracerPythonScript -from lightning.app.storage.path import Path -from lightning.app.utilities.packaging.build_config import BuildConfig, load_requirements -from lightning.app.utilities.tracer import Tracer - - -class ScriptRunner(TracerPythonScript): - """The ScriptRunner executes the script using ``runpy`` and also patches the Trainer methods to inject additional - code.""" - - def __init__(self, root_path: str, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, cloud_build_config=self._get_build_config(root_path), **kwargs) - self.root_path = root_path - self.exception_message: str = "" - self.trainer_progress: dict = {} - self.trainer_state: dict = {} - self.trainer_hparams: dict = {} - self.model_hparams: dict = {} - self.log_dir: Optional[Path] = None - self.checkpoint_dir: Optional[Path] = None - self.logger_metadatas: List[Dict[str, str]] = [] - - def configure_tracer(self) -> Tracer: - from lightning.pytorch import Trainer - - from core.callbacks import PLAppArtifactsTracker, PLAppProgressTracker, PLAppSummary, PLAppTrainerStateTracker - - tracer = Tracer() - trainer_artifacts_tracker = PLAppArtifactsTracker(work=self) - trainer_state_tracker = PLAppTrainerStateTracker(work=self) - progress_tracker = PLAppProgressTracker(work=self) - summary = PLAppSummary(work=self) - - def pre_trainer_init(_, *args: Any, **kwargs: Any) -> Tuple[Dict, Tuple[Any, ...], Dict[str, Any]]: - kwargs.setdefault("callbacks", []) - kwargs["callbacks"].extend([ - trainer_artifacts_tracker, - trainer_state_tracker, - progress_tracker, - summary, - ]) - return {}, args, kwargs - - tracer.add_traced(Trainer, "__init__", pre_fn=pre_trainer_init) - return tracer - - def run(self) -> None: - self.exception_message = "" - # We need to set the module path both in sys.path and the PYTHONPATH env variable. 
- # The former is for the current process which is already running, and the env variable is needed in case - # the script launches subprocesses - sys.path.insert(0, self.root_path) - self.env["PYTHONPATH"] = self.root_path - super().run() - - def on_exception(self, exception: BaseException) -> None: - self.exception_message = traceback.format_exc() - super().on_exception(exception) - - @staticmethod - def _get_build_config(root_path: str) -> Optional[BuildConfig]: - # These are the requirements for the script runner itself - requirements = [ - "protobuf<4.21.0", - "pytorch-lightning<=1.6.3", - "pydantic<=1.9.0", - ] - if Path(root_path, "requirements.txt").exists(): - # Requirements from the user's code folder - requirements.extend(load_requirements(root_path, file_name="requirements.txt")) - - return BuildConfig(requirements=requirements) diff --git a/src/lightning/app/cli/pl-app-template/core/state.py b/src/lightning/app/cli/pl-app-template/core/state.py deleted file mode 100644 index 80a9f3d4e0619..0000000000000 --- a/src/lightning/app/cli/pl-app-template/core/state.py +++ /dev/null @@ -1,45 +0,0 @@ -from typing import Dict, Optional, Union - -from pydantic import BaseModel, Field - - -class FitProgress(BaseModel): - current_epoch: int = 0 - train_batch_idx: int = 0 - total_train_batches: int = 0 - val_dataloader_idx: int = 0 - val_batch_idx: int = 0 - total_val_batches: int = 0 - global_step: int = 0 - estimated_stepping_batches: int = 0 - - -class ValidateProgress(BaseModel): - dataloader_idx: int = 0 - val_batch_idx: int = 0 - total_val_batches: int = 0 - - -class TestProgress(BaseModel): - dataloader_idx: int = 0 - test_batch_idx: int = 0 - total_test_batches: int = 0 - - -class PredictProgress(BaseModel): - dataloader_idx: int = 0 - predict_batch_idx: int = 0 - total_predict_batches: int = 0 - - -class ProgressBarState(BaseModel): - fit: FitProgress = Field(default_factory=FitProgress) - val: ValidateProgress = Field(alias="validate", default_factory=ValidateProgress) - test: TestProgress = Field(default_factory=TestProgress) - predict: PredictProgress = Field(default_factory=PredictProgress) - metrics: Dict[str, Union[float, str]] = {} - - -class TrainerState(BaseModel): - fn: Optional[str] = None - stage: Optional[str] = None diff --git a/src/lightning/app/cli/pl-app-template/setup.py b/src/lightning/app/cli/pl-app-template/setup.py deleted file mode 100644 index dc223931779a2..0000000000000 --- a/src/lightning/app/cli/pl-app-template/setup.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from typing import List - -from setuptools import find_packages, setup - -_PROJECT_ROOT = os.path.dirname(__file__) - - -def _load_requirements(path_dir: str, file_name: str = "requirements.txt", comment_char: str = "#") -> List[str]: - """Load requirements from a file.""" - with open(os.path.join(path_dir, file_name)) as file: - lines = [ln.strip() for ln in file.readlines()] - reqs = [] - for ln in lines: - # filer all comments - if comment_char in ln: - ln = ln[: ln.index(comment_char)].strip() - # skip directly installed dependencies - if ln.startswith("http"): - continue - # skip index url - if ln.startswith("--extra-index-url"): - continue - if ln: # if requirement is not empty - reqs.append(ln) - return reqs - - -setup( - name="{{ app_name }}", - version="0.0.1", - packages=find_packages(exclude=["ui"]), - python_requires=">=3.8", -) diff --git a/src/lightning/app/cli/pl-app-template/tests/__init__.py b/src/lightning/app/cli/pl-app-template/tests/__init__.py deleted file mode 100644 index 
e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/pl-app-template/tests/core/__init__.py b/src/lightning/app/cli/pl-app-template/tests/core/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/src/lightning/app/cli/pl-app-template/tests/core/test_callbacks.py b/src/lightning/app/cli/pl-app-template/tests/core/test_callbacks.py deleted file mode 100644 index a211da35e6a90..0000000000000 --- a/src/lightning/app/cli/pl-app-template/tests/core/test_callbacks.py +++ /dev/null @@ -1,68 +0,0 @@ -import os.path -from unittest.mock import Mock - -import pytest -from core.callbacks import PLAppArtifactsTracker, PLAppProgressTracker, PLAppSummary -from core.components.script_runner import ScriptRunner -from lightning.app.storage.path import Path -from lightning.pytorch import LightningModule, Trainer -from lightning.pytorch.loggers import TensorBoardLogger - - -@pytest.mark.parametrize("rank", [0, 1]) -def test_progress_tracker_enabled(rank): - trainer = Mock() - trainer.global_rank = rank - trainer.is_global_zero = rank == 0 - work = Mock() - tracker = PLAppProgressTracker(work) - assert not tracker.is_enabled - tracker.setup(trainer, Mock(), Mock()) - assert tracker.is_enabled == trainer.is_global_zero - - -def test_summary_callback_tracks_hyperparameters(): - class ModelWithParameters(LightningModule): - def __init__(self, float_arg=0.1, int_arg=5, bool_arg=True, string_arg="string"): - super().__init__() - self.save_hyperparameters() - - model = ModelWithParameters() - work = Mock() - summary = PLAppSummary(work) - trainer = Trainer(max_epochs=22, callbacks=[summary]) # this triggers the `Callback.on_init_end` hook - summary.setup(trainer, model) - assert work.model_hparams == { - "float_arg": "0.1", - "int_arg": "5", - "bool_arg": "True", - "string_arg": "string", - } - - assert work.trainer_hparams["max_epochs"] == "22" - assert work.trainer_hparams["logger"] == "True" - assert "ModelCheckpoint" in work.trainer_hparams["callbacks"] - assert "PLAppSummary" in work.trainer_hparams["callbacks"] - - -def test_artifacts_tracker(tmpdir): - work = ScriptRunner(root_path=os.path.dirname(__file__), script_path=__file__) - tracker = PLAppArtifactsTracker(work=work) - trainer = Mock() - - trainer.loggers = [] - trainer.default_root_dir = "default_root_dir" - tracker.setup(trainer=trainer, pl_module=Mock()) - assert work.log_dir == Path("default_root_dir") - assert not work.logger_metadatas - - trainer.loggers = [TensorBoardLogger(save_dir=tmpdir)] - trainer.logger = trainer.loggers[0] - tracker.setup(trainer=trainer, pl_module=Mock()) - assert work.log_dir == Path(tmpdir / "lightning_logs" / "version_0") - assert len(work.logger_metadatas) == 1 - assert work.logger_metadatas[0] == {"class_name": "TensorBoardLogger"} - - # call setup a second time and the metadata length should not change - tracker.setup(trainer=trainer, pl_module=Mock()) - assert len(work.logger_metadatas) == 1 diff --git a/src/lightning/app/cli/pl-app-template/tests/test_app.py b/src/lightning/app/cli/pl-app-template/tests/test_app.py deleted file mode 100644 index 3fc14bfcdbf69..0000000000000 --- a/src/lightning/app/cli/pl-app-template/tests/test_app.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest - - -@pytest.mark.skip() -def test_is_running_in_cloud(monkeypatch): - from app import Main - - monkeypatch.setenv("LIGHTNING_CLOUD_APP_ID", "anything") - app = Main() - assert app.running_in_cloud - - monkeypatch.delenv("LIGHTNING_CLOUD_APP_ID", raising=False) - app = Main() - assert not 
app.running_in_cloud diff --git a/src/lightning/app/cli/pl-app-template/ui/.gitignore b/src/lightning/app/cli/pl-app-template/ui/.gitignore deleted file mode 100644 index 6c2d44cd3ba13..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/.gitignore +++ /dev/null @@ -1,25 +0,0 @@ -# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. - -# dependencies -/node_modules -/.pnp -.pnp.js - -# testing -/coverage - -# production -/build - -# misc -.DS_Store - -npm-debug.log* -yarn-debug.log* -yarn-error.log* - -/cypress/videos -/cypress/screenshots -/cypress/downloads - -.eslintcache diff --git a/src/lightning/app/cli/pl-app-template/ui/.prettierignore b/src/lightning/app/cli/pl-app-template/ui/.prettierignore deleted file mode 100644 index 2ea70f096046d..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/.prettierignore +++ /dev/null @@ -1,3 +0,0 @@ -resources -build -node_modules diff --git a/src/lightning/app/cli/pl-app-template/ui/.prettierrc b/src/lightning/app/cli/pl-app-template/ui/.prettierrc deleted file mode 100644 index cad1459af3548..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/.prettierrc +++ /dev/null @@ -1,24 +0,0 @@ -{ - "jsxSingleQuote": false, - "arrowParens": "avoid", - "tabWidth": 2, - "useTabs": false, - "printWidth": 119, - "singleQuote": false, - "semi": true, - "endOfLine": "lf", - "proseWrap": "always", - "bracketSameLine": true, - "quoteProps": "consistent", - "trailingComma": "all", - "bracketSpacing": true, - "importOrder": [ - "^react$", - "", - "^(components|hooks|resources|utils|lightning-.*)", - "^tests", - "^[./]" - ], - "importOrderSeparation": true, - "importOrderSortSpecifiers": true -} diff --git a/src/lightning/app/cli/pl-app-template/ui/craco.config.js b/src/lightning/app/cli/pl-app-template/ui/craco.config.js deleted file mode 100644 index 979d08985ff19..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/craco.config.js +++ /dev/null @@ -1,29 +0,0 @@ -const path = require("path"); -const fs = require("fs"); -const cracoBabelLoader = require("craco-babel-loader"); - -// manage relative paths to packages -const appDirectory = fs.realpathSync(process.cwd()); -const resolvePackage = relativePath => path.resolve(appDirectory, relativePath); - -module.exports = { - devServer: { - // When launching `yarn start dev`, write the files to the build folder too - devMiddleware: { writeToDisk: true }, - }, - webpack: { - configure: { - output: { - publicPath: "./", - }, - }, - }, - plugins: [ - { - plugin: cracoBabelLoader, - options: { - includes: [resolvePackage("node_modules/lightning-ui")], - }, - }, - ], -}; diff --git a/src/lightning/app/cli/pl-app-template/ui/package.json b/src/lightning/app/cli/pl-app-template/ui/package.json deleted file mode 100644 index 690bd85b6498c..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/package.json +++ /dev/null @@ -1,95 +0,0 @@ -{ - "name": "pytorch-lightning-app", - "version": "0.1.0", - "private": true, - "dependencies": { - "@emotion/react": "^11.7.1", - "@emotion/styled": "^11.6.0", - "@mui/icons-material": "^5.6.2", - "@mui/lab": "^5.0.0-alpha.64", - "@mui/material": "^5.2.7", - "@reduxjs/toolkit": "^1.8.0", - "@stripe/stripe-js": "^1.29.0", - "axios": "^0.25.0", - "boring-avatars": "^1.6.3", - "filter-material-ui": "2.7.0", - "fontfaceobserver": "^2.1.0", - "lightning-ui": "git+ssh://git@github.com/gridai/lightning-ui.git#35f4124cc8a16a313174fe63ec82cb74af388c6b", - "lodash": "^4.17.21", - "notistack": "^2.0.4", - "query-string": 
"^7.1.0", - "react": "^17.0.2", - "react-dom": "^17.0.2", - "react-github-btn": "^1.2.1", - "react-hook-form": "^7.27.1", - "react-query": "^3.34.7", - "react-router-dom": "^6.2.1", - "react-scripts": "5.0.0", - "react-spring": "^9.4.4", - "react-table": "^7.7.0", - "rxjs": "^7.5.2", - "typescript": "^4.4.2", - "use-debounce": "^7.0.1", - "web-vitals": "^2.1.0", - "xterm": "^4.18.0", - "xterm-addon-fit": "^0.5.0", - "xterm-addon-search": "^0.8.2" - }, - "scripts": { - "start": "craco start", - "build": "craco build", - "lint": "eslint --cache --max-warnings=0 . && prettier -c .", - "lint:fix": "eslint --cache --max-warnings=0 . --fix && prettier -w .", - "eject": "react-scripts eject" - }, - "lint-staged": { - "**/*": "prettier --write --ignore-unknown" - }, - "eslintConfig": { - "extends": [ - "react-app" - ], - "ignorePatterns": [ - "node_modules/**", - "build/**" - ], - "rules": { - "react/jsx-sort-props": [ - "off", - { - "callbacksLast": true, - "ignoreCase": true, - "noSortAlphabetically": false, - "reservedFirst": true, - "shorthandFirst": true - } - ], - "react/jsx-pascal-case": "warn" - } - }, - "browserslist": { - "production": [ - ">0.2%", - "not dead", - "not op_mini all" - ], - "development": [ - "last 1 chrome version", - "last 1 firefox version", - "last 1 safari version" - ] - }, - "devDependencies": { - "@craco/craco": "^6.4.3", - "@trivago/prettier-plugin-sort-imports": "^3.1.1", - "@types/fontfaceobserver": "^2.1.0", - "@types/lodash": "^4.14.182", - "@types/node": "^16.7.13", - "@types/react": "^17.0.20", - "@types/react-dom": "^17.0.9", - "@types/react-table": "^7.7.9", - "craco-babel-loader": "^1.0.3", - "lint-staged": "^12.3.2", - "prettier": "2.5.1" - } -} diff --git a/src/lightning/app/cli/pl-app-template/ui/public/favicon.svg b/src/lightning/app/cli/pl-app-template/ui/public/favicon.svg deleted file mode 100644 index 94a65989d0b4b..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/public/favicon.svg +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - diff --git a/src/lightning/app/cli/pl-app-template/ui/public/index.html b/src/lightning/app/cli/pl-app-template/ui/public/index.html deleted file mode 100644 index 0f2384212e1d3..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/public/index.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - - - - - - - - - PyTorch Lightning App - - - - - - -
- - - diff --git a/src/lightning/app/cli/pl-app-template/ui/public/manifest.json b/src/lightning/app/cli/pl-app-template/ui/public/manifest.json deleted file mode 100644 index 4a97d68d6b7ba..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/public/manifest.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "short_name": "PL App", - "name": "PyTorch Lightning App", - "icons": [ - { - "src": "favicon.svg", - "sizes": "512x512 192x192 64x64 32x32 24x24 16x16", - "type": "image/svg+xml" - } - ], - "start_url": ".", - "display": "standalone", - "theme_color": "#000000", - "background_color": "#ffffff" -} diff --git a/src/lightning/app/cli/pl-app-template/ui/public/robots.txt b/src/lightning/app/cli/pl-app-template/ui/public/robots.txt deleted file mode 100644 index e9e57dc4d41b9..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/public/robots.txt +++ /dev/null @@ -1,3 +0,0 @@ -# https://www.robotstxt.org/robotstxt.html -User-agent: * -Disallow: diff --git a/src/lightning/app/cli/pl-app-template/ui/src/App.tsx b/src/lightning/app/cli/pl-app-template/ui/src/App.tsx deleted file mode 100644 index 717984216f298..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/src/App.tsx +++ /dev/null @@ -1,126 +0,0 @@ -import { useEffect } from "react"; - -import { QueryClient, QueryClientProvider } from "react-query"; -import { BrowserRouter } from "react-router-dom"; - -import ErrorPanel from "components/ErrorPanel"; -import HyperparameterSummary from "components/HyperparameterSummary"; -import Launcher from "components/Launcher"; -import ProgressBarGroup from "components/ProgressBarGroup"; -import { - Breadcrumbs, - Card, - CardContent, - CardHeader, - Grid, - SnackbarProvider, - Stack, - useSnackbar, -} from "lightning-ui/src/design-system/components"; -import ThemeProvider from "lightning-ui/src/design-system/theme"; - -import ExecutionSummary from "./components/ExecutionSummary"; -import { useLightningState } from "./hooks/useLightningState"; - -const queryClient = new QueryClient(); - -function AppContainer() { - const { lightningState } = useLightningState(); - - const trainer_progress = lightningState?.flows.script_orchestrator.works.script_runner?.vars.trainer_progress; - const trainer_state = lightningState?.flows.script_orchestrator.works.script_runner?.vars.trainer_state; - const trainer_hparams = lightningState?.flows.script_orchestrator.works.script_runner?.vars.trainer_hparams; - const model_hparams = lightningState?.flows.script_orchestrator.works.script_runner?.vars.model_hparams; - - const script_running = lightningState?.flows.script_orchestrator.vars.running; - const script_succeeded = lightningState?.flows.script_orchestrator.vars.succeeded; - const script_failed = lightningState?.flows.script_orchestrator.vars.failed; - const start_triggered = lightningState?.flows.script_orchestrator.vars.triggered; - const script_path = lightningState?.flows.script_orchestrator.vars.script_path; - const running_in_cloud = lightningState?.vars.running_in_cloud; - - const breadCrumbItems = [ - { title: "Users", href: "url/to/href/1" }, - { title: "adrian", href: "url/to/href/2" }, - { title: "projects", href: "url/to/href/3" }, - { title: "app_name", href: "url/to/href/4" }, - { title: "source", href: "url/to/href/5" }, - { title: "train.py", href: "url/to/href/6" }, - ]; - - const { enqueueSnackbar } = useSnackbar(); - const exception_message = lightningState?.flows.script_orchestrator.works.script_runner?.vars?.exception_message; - useEffect(() => { - if (exception_message) { - 
enqueueSnackbar({ - title: "The script failed to complete", - severity: "error", - children: "See the error message", - }); - } - }, [exception_message]); - - return ( - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ); -} - -function App() { - return ( - - - - - - - - - - ); -} - -export default App; diff --git a/src/lightning/app/cli/pl-app-template/ui/src/components/EnvironmentConfigurator.tsx b/src/lightning/app/cli/pl-app-template/ui/src/components/EnvironmentConfigurator.tsx deleted file mode 100644 index 2d26f86ad9965..0000000000000 --- a/src/lightning/app/cli/pl-app-template/ui/src/components/EnvironmentConfigurator.tsx +++ /dev/null @@ -1,67 +0,0 @@ -import { Button, Stack, TextField } from "lightning-ui/src/design-system/components"; - -interface Data { - [key: string]: string; -} - -export function data2dict(data: Data[]) { - var dict: Data = {}; - for (var i = 0; i < data.length; i++) { - if (data[i]["name"] === "") { - continue; - } - dict[data[i]["name"]] = data[i]["value"]; - } - return dict; -} - -export default function EnvironmentConfigurator(props: any) { - const data: Data[] = props.data; - const setData = props.setData; - const addItemAllowed = data[data.length - 1].name.length > 0; - - const onItemAdd = () => { - setData([...data, { name: "", value: "" }]); - }; - - const onItemChange = (fieldName: string, index: number, text: any) => { - let newData = [...data]; - - text = text.trim(); - if (fieldName == "name") { - text = text.replace(/[^0-9a-zA-Z_]+/gi, "").toUpperCase(); - } - - newData[index][fieldName] = text; - setData(newData); - }; - - return ( - - {data.map((entry, index) => ( - - onItemChange("name", index, e)} - placeholder="KEY" - size="medium" - statusText="" - type="text" - value={entry.name || ""} - /> - onItemChange("value", index, e)} - placeholder="VALUE" - size="medium" - statusText="" - type="text" - value={entry.value || ""} - /> - - ))} - -