feat: enable uploading Torch's nn.Module

Borda · Borda · commit 35ee01734fd2 · 2024-11-29T21:30:54.000+01:00
diff --git a/examples/demo-upload-download.py b/examples/demo-upload-download.py
@@ -10,13 +10,13 @@
     torch.save(model.state_dict(), "./boring-checkpoint.pt")
 
     # Upload the model checkpoint
-    litmodels.upload_model(
+    litmodels.upload_model_files(
         "./boring-checkpoint.pt",
         "jirka/kaggle/boring-model",
     )
 
     # Download the model checkpoint
-    model_path = litmodels.download_model("jirka/kaggle/boring-model", download_dir="./my-models")
+    model_path = litmodels.download_model_files("jirka/kaggle/boring-model", download_dir="./my-models")
     print(f"Model downloaded to {model_path}")
 
     # Load the model checkpoint
diff --git a/examples/train-callback.py b/examples/train-callback.py
@@ -1,7 +1,7 @@
 import torch.utils.data as data
 import torchvision as tv
 from lightning import Callback, Trainer
-from litmodels import upload_model
+from litmodels import upload_model_files
 from sample_model import LitAutoEncoder
 
 
@@ -11,7 +11,7 @@ def on_train_epoch_end(self, trainer, pl_module):
         best_model_path = trainer.checkpoint_callback.best_model_path
         if best_model_path:
             print(f"Uploading model: {best_model_path}")
-            upload_model(path=best_model_path, name="jirka/kaggle/lit-auto-encoder-callback")
+            upload_model_files(path=best_model_path, name="jirka/kaggle/lit-auto-encoder-callback")
 
 
 if __name__ == "__main__":
diff --git a/examples/train-resume.py b/examples/train-resume.py
@@ -1,14 +1,14 @@
 import torch.utils.data as data
 import torchvision as tv
 from lightning import Trainer
-from litmodels import download_model
+from litmodels import download_model_files
 from sample_model import LitAutoEncoder
 
 if __name__ == "__main__":
     dataset = tv.datasets.MNIST(".", download=True, transform=tv.transforms.ToTensor())
     train, val = data.random_split(dataset, [55000, 5000])
 
-    model_path = download_model(name="jirka/kaggle/lit-auto-encoder-simple", download_dir="my_models")
+    model_path = download_model_files(name="jirka/kaggle/lit-auto-encoder-simple", download_dir="my_models")
     print(f"model: {model_path}")
     # autoencoder = LitAutoEncoder.load_from_checkpoint(checkpoint_path=model_path)
 
diff --git a/examples/train-simple.py b/examples/train-simple.py
@@ -2,7 +2,7 @@
 import torchvision as tv
 from lightning import Trainer
 from lightning.pytorch.callbacks import ModelCheckpoint
-from litmodels import upload_model
+from litmodels import upload_model_files
 from sample_model import LitAutoEncoder
 
 if __name__ == "__main__":
@@ -30,4 +30,4 @@
         data.DataLoader(val, batch_size=256),
     )
     print(f"last: {vars(checkpoint_callback)}")
-    upload_model(path=checkpoint_callback.last_model_path, name="jirka/kaggle/lit-auto-encoder-simple")
+    upload_model_files(path=checkpoint_callback.last_model_path, name="jirka/kaggle/lit-auto-encoder-simple")
diff --git a/src/litmodels/__init__.py b/src/litmodels/__init__.py
@@ -7,6 +7,6 @@
 _PACKAGE_ROOT = os.path.dirname(__file__)
 _PROJECT_ROOT = os.path.dirname(_PACKAGE_ROOT)
 
-from litmodels.cloud_io import download_model, upload_model
+from litmodels.cloud_io import download_model_files, upload_model, upload_model_files
 
-__all__ = ["download_model", "upload_model"]
+__all__ = ["download_model_files", "upload_model", "upload_model_files"]
diff --git a/src/litmodels/cloud_io.py b/src/litmodels/cloud_io.py
@@ -2,12 +2,31 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
-
-from typing import Optional, Tuple
+import os
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 from lightning_sdk.api.teamspace_api import UploadedModelInfo
 from lightning_sdk.teamspace import Teamspace
 from lightning_sdk.utils import resolve as sdk_resolvers
+from lightning_utilities import module_available
+
+if TYPE_CHECKING:
+    from torch import nn
+
+if module_available("torch"):
+    import torch
+    from torch import nn
+else:
+    torch = None
+
+# if module_available("lightning"):
+#     from lightning import LightningModule
+# elif module_available("pytorch_lightning"):
+#     from pytorch_lightning import LightningModule
+# else:
+#     LightningModule = None
 
 
 def _parse_name(name: str) -> Tuple[str, str, str]:
@@ -45,6 +64,48 @@ def _get_teamspace(name: str, organization: str) -> Teamspace:
 
 
 def upload_model(
+    model: Union[str, Path, "nn.Module"],  # quoted: `nn` is not defined at runtime when torch is missing
+    name: str,
+    progress_bar: bool = True,
+    cluster_id: Optional[str] = None,
+    staging_dir: Optional[str] = None,
+) -> UploadedModelInfo:
+    """Upload a checkpoint file or an in-memory PyTorch module to the model store.
+
+    Args:
+        model: The model to upload: a path to a checkpoint file (``str`` or ``Path``) or a ``torch.nn.Module``,
+            whose ``state_dict`` is saved to ``<staging_dir>/<ClassName>.pth`` and then uploaded.
+        name: Name tag of the model to upload. Must be in the format 'organization/teamspace/modelname'
+            where entity is either your username or the name of an organization you are part of.
+        progress_bar: Whether to show a progress bar for the upload.
+        cluster_id: The name of the cluster to use. Only required if it can't be determined
+            automatically.
+        staging_dir: Directory where a module's ``state_dict`` is saved before the upload. If not provided, a
+            temporary directory is created, but only when ``model`` is a module.
+
+    Raises:
+        ValueError: If ``model`` is neither a path nor a ``torch.nn.Module``.
+
+    """
+    # Resolve the file to upload. The staging-dir default is created lazily inside the module branch:
+    # chaining the type checks to `if not staging_dir` via `elif` left `path` unbound whenever it was omitted.
+    if isinstance(model, (str, Path)):
+        path = str(model)
+    elif torch and isinstance(model, nn.Module):
+        staging_dir = staging_dir or tempfile.mkdtemp()
+        path = os.path.join(staging_dir, f"{model.__class__.__name__}.pth")
+        torch.save(model.state_dict(), path)
+    else:
+        raise ValueError(f"Unsupported model type {type(model)}")
+    return upload_model_files(
+        path=path,
+        name=name,
+        progress_bar=progress_bar,
+        cluster_id=cluster_id,
+    )
+
+
+def upload_model_files(
     path: str,
     name: str,
     progress_bar: bool = True,
@@ -71,7 +132,7 @@ def upload_model(
     )
 
 
-def download_model(
+def download_model_files(
     name: str,
     download_dir: str = ".",
     progress_bar: bool = True,
diff --git a/tests/test_cloud_io.py b/tests/test_cloud_io.py
@@ -1,30 +1,41 @@
 from unittest import mock
 
 import pytest
-from litmodels.cloud_io import download_model, upload_model
+from litmodels.cloud_io import download_model_files, upload_model, upload_model_files
+from torch.nn import Module
 
 
 @pytest.mark.parametrize("name", ["org/model", "model-name", "/too/many/slashes"])
 def test_wrong_model_name(name):
     with pytest.raises(ValueError, match=r".*organization/teamspace/model.*"):
-        upload_model(path="path/to/checkpoint", name=name)
+        upload_model_files(path="path/to/checkpoint", name=name)
     with pytest.raises(ValueError, match=r".*organization/teamspace/model.*"):
-        download_model(name=name)
-
-
-def test_upload_model(mocker):
+        download_model_files(name=name)
+
+
+@pytest.mark.parametrize(
+    "model, model_path",
+    [
+        ("path/to/checkpoint", "path/to/checkpoint"),
+        # (BoringModel(), "%s/BoringModel.ckpt"),
+        (Module(), "%s/Module.pth"),
+    ],
+)
+def test_upload_model(mocker, tmpdir, model, model_path):
     # mocking the _get_teamspace to return another mock
     ts_mock = mock.MagicMock()
     mocker.patch("litmodels.cloud_io._get_teamspace", return_value=ts_mock)
 
     # The lit-logger function is just a wrapper around the SDK function
     upload_model(
-        path="path/to/checkpoint",
+        model,
         name="org-name/teamspace/model-name",
         cluster_id="cluster_id",
+        staging_dir=tmpdir,
     )
+    expected_path = model_path % str(tmpdir) if "%" in model_path else model_path
     ts_mock.upload_model.assert_called_once_with(
-        path="path/to/checkpoint",
+        path=expected_path,
         name="model-name",
         cluster_id="cluster_id",
         progress_bar=True,
@@ -36,7 +47,7 @@ def test_download_model(mocker):
     ts_mock = mock.MagicMock()
     mocker.patch("litmodels.cloud_io._get_teamspace", return_value=ts_mock)
     # The lit-logger function is just a wrapper around the SDK function
-    download_model(
+    download_model_files(
         name="org-name/teamspace/model-name",
         download_dir="where/to/download",
     )