Commit d2dd64c

Merge pull request #651 from transformerlab/add/userinfo-job-remote
Store user info in a REMOTE job and Job ID should be formatted inside the cluster name
2 parents 9301f1f + 04f1749 commit d2dd64c
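
The PR title describes embedding the job ID inside the orchestrator cluster name; that change lives in the third changed file, which is not shown below. A minimal, hypothetical sketch of such a naming scheme (the function name and separator are assumptions, not taken from the repo):

    def format_cluster_name(cluster_name: str, job_id: str) -> str:
        # Embed the local TransformerLab job ID in the remote cluster name so the
        # orchestrator cluster can be traced back to the job record later.
        return f"{cluster_name}-{job_id}"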

File tree

3 files changed (+89, -262 lines)

test/api/test_remote.py

Lines changed: 0 additions & 244 deletions
@@ -117,211 +117,11 @@ def test_validate_env_vars_missing_port(self, monkeypatch):
         assert "GPU_ORCHESTRATION_SERVER_PORT" in error_response["message"]


-class TestCreateRemoteJob:
-    """Test the /remote/create-job endpoint"""
-
-    def test_create_remote_job_success(self, client, gpu_orchestration_env_vars, mock_experiment_id, job_cleanup):
-        """Test creating a remote job successfully"""
-        response = client.post(
-            f"/remote/create-job?experimentId={mock_experiment_id}",
-            data={
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-                "task_name": "test-task",
-            },
-        )
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "success"
-        assert "job_id" in data
-        assert data["message"] == "Remote job created successfully"
-        # Track job for cleanup
-        job_cleanup.append((data["job_id"], mock_experiment_id))
-
-    def test_create_remote_job_with_optional_params(self, client, gpu_orchestration_env_vars, mock_experiment_id, job_cleanup):
-        """Test creating a remote job with optional parameters"""
-        response = client.post(
-            f"/remote/create-job?experimentId={mock_experiment_id}",
-            data={
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-                "cpus": "4",
-                "memory": "8GB",
-                "disk_space": "100GB",
-                "accelerators": "1xV100",
-                "num_nodes": 2,
-            },
-        )
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "success"
-        assert "job_id" in data
-        # Track job for cleanup
-        job_cleanup.append((data["job_id"], mock_experiment_id))
-
-
-class TestLaunchRemote:
-    """Test the /remote/launch endpoint"""
-
-    @patch("transformerlab.routers.remote.httpx.AsyncClient")
-    def test_launch_remote_success(self, mock_client_class, client, gpu_orchestration_env_vars, mock_experiment_id, job_cleanup):
-        """Test launching a remote job successfully"""
-        # Mock the async client and response
-        mock_response = MagicMock()
-        mock_response.status_code = 200
-        mock_response.json.return_value = {
-            "request_id": "test-request-123",
-            "cluster_name": "test-cluster",
-            "status": "launched",
-        }
-
-        # Set up the async context manager protocol for httpx.AsyncClient
-        mock_httpx_client = AsyncMock()
-        mock_httpx_client.post = AsyncMock(return_value=mock_response)
-        # AsyncMock automatically handles __aenter__ and __aexit__, but we can be explicit
-        mock_httpx_client.__aenter__.return_value = mock_httpx_client
-        mock_httpx_client.__aexit__.return_value = None
-        mock_client_class.return_value = mock_httpx_client
-
-        response = client.post(
-            f"/remote/launch?experimentId={mock_experiment_id}",
-            data={
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-            },
-        )
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "success"
-        assert "job_id" in data
-        assert data["data"]["request_id"] == "test-request-123"
-        # Track job for cleanup
-        job_cleanup.append((data["job_id"], mock_experiment_id))
-
-    @patch("transformerlab.routers.remote.httpx.AsyncClient")
-    def test_launch_remote_with_existing_job_id(self, mock_client_class, client, gpu_orchestration_env_vars, mock_experiment_id, job_cleanup):
-        """Test launching with an existing job_id"""
-        # First create a job
-        create_response = client.post(
-            f"/remote/create-job?experimentId={mock_experiment_id}",
-            data={
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-            },
-        )
-        job_id = create_response.json()["job_id"]
-        # Track job for cleanup
-        job_cleanup.append((job_id, mock_experiment_id))
-
-        # Mock the async client and response
-        mock_response = MagicMock()
-        mock_response.status_code = 200
-        mock_response.json.return_value = {
-            "request_id": "test-request-456",
-            "cluster_name": "test-cluster",
-        }
-
-        mock_httpx_client = AsyncMock()
-        mock_httpx_client.post = AsyncMock(return_value=mock_response)
-        mock_httpx_client.__aenter__.return_value = mock_httpx_client
-        mock_httpx_client.__aexit__.return_value = None
-        mock_client_class.return_value = mock_httpx_client
-
-        response = client.post(
-            f"/remote/launch?experimentId={mock_experiment_id}",
-            data={
-                "job_id": job_id,
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-            },
-        )
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "success"
-        assert data["job_id"] == str(job_id)
-
-    def test_launch_remote_missing_env_vars(self, client, no_gpu_orchestration_env, mock_experiment_id):
-        """Test launching when GPU orchestration env vars are not set"""
-        response = client.post(
-            f"/remote/launch?experimentId={mock_experiment_id}",
-            data={
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-            },
-        )
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "error"
-        assert "GPU_ORCHESTRATION_SERVER" in data["message"]
-
-    @patch("transformerlab.routers.remote.httpx.AsyncClient")
-    def test_launch_remote_orchestrator_error(self, mock_client_class, client, gpu_orchestration_env_vars, mock_experiment_id, job_cleanup):
-        """Test handling orchestrator error response"""
-        mock_response = MagicMock()
-        mock_response.status_code = 500
-        mock_response.text = "Internal Server Error"
-
-        mock_httpx_client = AsyncMock()
-        mock_httpx_client.post = AsyncMock(return_value=mock_response)
-        mock_httpx_client.__aenter__.return_value = mock_httpx_client
-        mock_httpx_client.__aexit__.return_value = None
-        mock_client_class.return_value = mock_httpx_client
-
-        response = client.post(
-            f"/remote/launch?experimentId={mock_experiment_id}",
-            data={
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-            },
-        )
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "error"
-        assert "500" in data["message"]
-        # Even if launch failed, a job might have been created, so track it if present
-        if "job_id" in data:
-            job_cleanup.append((data["job_id"], mock_experiment_id))


 class TestStopRemote:
     """Test the /remote/stop endpoint"""

-    @patch("transformerlab.routers.remote.httpx.AsyncClient")
-    def test_stop_remote_success(self, mock_client_class, client, gpu_orchestration_env_vars, mock_experiment_id, job_cleanup):
-        """Test stopping a remote job successfully"""
-        # Create a job first
-        create_response = client.post(
-            f"/remote/create-job?experimentId={mock_experiment_id}",
-            data={
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-            },
-        )
-        job_id = create_response.json()["job_id"]
-        # Track job for cleanup
-        job_cleanup.append((job_id, mock_experiment_id))
-
-        # Mock the async client and response
-        mock_response = MagicMock()
-        mock_response.status_code = 200
-        mock_response.json.return_value = {"status": "stopped"}
-
-        mock_httpx_client = AsyncMock()
-        mock_httpx_client.post = AsyncMock(return_value=mock_response)
-        mock_httpx_client.__aenter__.return_value = mock_httpx_client
-        mock_httpx_client.__aexit__.return_value = None
-        mock_client_class.return_value = mock_httpx_client
-
-        response = client.post(
-            "/remote/stop",
-            data={
-                "job_id": job_id,
-                "cluster_name": "test-cluster",
-            },
-        )
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "success"

     def test_stop_remote_missing_env_vars(self, client, no_gpu_orchestration_env):
         """Test stopping when GPU orchestration env vars are not set"""
@@ -400,50 +200,6 @@ def test_check_status_no_launching_jobs(self, mock_client_class, client, gpu_orc
         assert "updated_jobs" in data
         assert data["updated_jobs"] == []

-    @patch("transformerlab.routers.remote.httpx.AsyncClient")
-    def test_check_status_with_jobs(self, mock_client_class, client, gpu_orchestration_env_vars, mock_experiment_id, job_cleanup):
-        """Test checking status with LAUNCHING jobs"""
-        # Create a remote job in LAUNCHING state
-        create_response = client.post(
-            f"/remote/create-job?experimentId={mock_experiment_id}",
-            data={
-                "cluster_name": "test-cluster",
-                "command": "echo 'test'",
-            },
-        )
-        # Track job for cleanup
-        job_id = None
-        if create_response.status_code == 200:
-            job_data = create_response.json()
-            if "job_id" in job_data:
-                job_id = job_data["job_id"]
-                job_cleanup.append((job_id, mock_experiment_id))
-
-        # Verify job was created
-        assert job_id is not None, "Job should have been created"
-
-        # Mock the async client and response for status check
-        # This mocks the call to check_remote_job_status which calls the orchestrator
-        mock_response = MagicMock()
-        mock_response.status_code = 200
-        mock_response.json.return_value = {
-            "jobs": [
-                {"status": "SUCCEEDED"},
-            ],
-        }
-
-        mock_httpx_client = AsyncMock()
-        mock_httpx_client.get = AsyncMock(return_value=mock_response)
-        mock_httpx_client.__aenter__.return_value = mock_httpx_client
-        mock_httpx_client.__aexit__.return_value = None
-        mock_client_class.return_value = mock_httpx_client
-
-        response = client.get("/remote/check-status")
-        assert response.status_code == 200
-        data = response.json()
-        assert data["status"] == "success"
-        assert "updated_jobs" in data
-

 class TestGetOrchestratorLogs:
     """Test the /remote/logs/{request_id} endpoint"""

transformerlab/routers/experiment/jobs.py

Lines changed: 37 additions & 14 deletions
@@ -526,7 +526,6 @@ async def get_checkpoints(job_id: str, request: Request):
         return {"checkpoints": []}

     job_data = job["job_data"]
-
     # First try to use the new SDK method to get checkpoints
     try:
         from lab.job import Job
@@ -535,26 +534,33 @@ async def get_checkpoints(job_id: str, request: Request):
         sdk_job = Job(job_id)
         checkpoint_paths = sdk_job.get_checkpoint_paths()

+
         if checkpoint_paths and len(checkpoint_paths) > 0:
             checkpoints = []
             for checkpoint_path in checkpoint_paths:
                 try:
-                    stat = os.stat(checkpoint_path)
-                    modified_time = stat.st_mtime
-                    filesize = stat.st_size
-                    # Format the timestamp as ISO 8601 string
-                    formatted_time = datetime.fromtimestamp(modified_time).isoformat()
+                    if os.path.isdir(checkpoint_path):
+                        # Dont set formatted_time and filesize for directories (os.stat messes it up for fused filesystems)
+                        formatted_time = None
+                        filesize = None
+                    else:
+                        stat = os.stat(checkpoint_path)
+                        modified_time = stat.st_mtime
+                        filesize = stat.st_size
+                        # Format the timestamp as ISO 8601 string
+                        formatted_time = datetime.fromtimestamp(modified_time).isoformat()
+
                     filename = os.path.basename(checkpoint_path)
                     checkpoints.append({"filename": filename, "date": formatted_time, "size": filesize})
                 except Exception as e:
-                    logging.error(f"Error getting stat for checkpoint {checkpoint_path}: {e}")
+                    print(f"Error getting stat for checkpoint {checkpoint_path}: {e}")
                     continue

             # Sort checkpoints by filename in reverse (descending) order for consistent ordering
             checkpoints.sort(key=lambda x: x["filename"], reverse=True)
             return {"checkpoints": checkpoints}
     except Exception as e:
-        logging.info(f"SDK checkpoint method failed for job {job_id}, falling back to legacy method: {e}")
+        print(f"SDK checkpoint method failed for job {job_id}, falling back to legacy method: {e}")

     # Fallback to the original logic if SDK method doesn't work or returns nothing
     # Check if the job has a supports_checkpoints flag
@@ -577,18 +583,35 @@ async def get_checkpoints(job_id: str, request: Request):
     default_adaptor_dir = os.path.join(workspace_dir, "adaptors", secure_filename(model_name), adaptor_name)

     # print(f"Default adaptor directory: {default_adaptor_dir}")
-
-    checkpoints_dir = job_data.get("checkpoints_dir", default_adaptor_dir)
+    # Get job directory from t
+    checkpoints_dir = job_data.get("checkpoints_dir")
+    if not checkpoints_dir:
+        from lab.dirs import get_job_checkpoints_dir
+        checkpoints_dir = get_job_checkpoints_dir(job_id)
     if not checkpoints_dir or not os.path.exists(checkpoints_dir):
-        # print(f"Checkpoints directory does not exist: {checkpoints_dir}")
         return {"checkpoints": []}
+    elif os.path.isdir(checkpoints_dir):
+        checkpoints = []
+        if len(os.listdir(checkpoints_dir)) > 0:
+            for filename in os.listdir(checkpoints_dir):
+                if fnmatch(filename, "*_adapters.safetensors"):
+                    file_path = os.path.join(checkpoints_dir, filename)
+                    stat = os.stat(file_path)
+                    modified_time = stat.st_mtime
+                    filesize = stat.st_size
+                    checkpoints.append({"filename": filename, "date": modified_time, "size": filesize})
+                # allow directories too
+                elif os.path.isdir(os.path.join(checkpoints_dir, filename)):
+                    checkpoints.append({"filename": filename, "date": None, "size": None})
+        return {"checkpoints": checkpoints}
+

+    # Fallback to using default adaptor directory as checkpoints directory
+    checkpoints_dir = default_adaptor_dir
     checkpoints_file_filter = job_data.get("checkpoints_file_filter", "*_adapters.safetensors")
     if not checkpoints_file_filter:
         checkpoints_file_filter = "*_adapters.safetensors"

-    # print(f"Checkpoints directory: {checkpoints_dir}")
-    # print(f"Checkpoints file filter: {checkpoints_file_filter}")

     checkpoints = []
     try:
@@ -607,7 +630,7 @@ async def get_checkpoints(job_id: str, request: Request):
                 filesize = None
             checkpoints.append({"filename": filename, "date": formatted_time, "size": filesize})
     except OSError as e:
-        logging.error(f"Error reading checkpoints directory {checkpoints_dir}: {e}")
+        print(f"Error reading checkpoints directory {checkpoints_dir}: {e}")

     # Sort checkpoints by filename in reverse (descending) order for consistent ordering
     checkpoints.sort(key=lambda x: x["filename"], reverse=True)
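
The new `os.path.isdir` guard in this diff skips os.stat() metadata for directories because stat results can be unreliable on fused or remote filesystems. A standalone sketch of that guard, with an illustrative helper name (not the repo's API):

    import os
    from datetime import datetime

    def describe_checkpoint(path: str) -> dict:
        # Only regular files get a date and size; directories report None for both,
        # mirroring the guard added to get_checkpoints above.
        if os.path.isdir(path):
            formatted_time, filesize = None, None
        else:
            stat = os.stat(path)
            formatted_time = datetime.fromtimestamp(stat.st_mtime).isoformat()
            filesize = stat.st_size
        return {"filename": os.path.basename(path), "date": formatted_time, "size": filesize}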
