Skip to content
This repository was archived by the owner on Dec 16, 2025. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/lattice/routes/instances/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ async def launch_instance(
experiment_id: Optional[str] = Form(None),
job_name: Optional[str] = Form(None),
tlab_job_id: Optional[str] = Form(None),
tlab_parent_job_id: Optional[str] = Form(None),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since both of these would always come together, is it possible to send them in a single param instead, so we don't add too many parameters to this route and it's easier to port things to other endpoints later when necessary?

tlab_checkpoint_name: Optional[str] = Form(None),
disabled_mandatory_mounts: Optional[bool] = Form(False),
yaml_file: Optional[UploadFile] = File(None),
user: dict = Depends(get_user_or_api_key),
Expand Down Expand Up @@ -440,6 +442,12 @@ async def launch_instance(
# Set _TFL_JOB_ID environment variable if tlab_job_id is provided
if tlab_job_id:
hook_env_vars["_TFL_JOB_ID"] = tlab_job_id

# Set checkpoint-related environment variables for training resume
if tlab_parent_job_id:
hook_env_vars["_TFL_PARENT_JOB_ID"] = tlab_parent_job_id
if tlab_checkpoint_name:
hook_env_vars["_TFL_CHECKPOINT_NAME"] = tlab_checkpoint_name

# Pre-calculate requested GPU count and preserve selected RunPod option for pricing
# (RunPod mapping below may clear 'accelerators')
Expand Down
1 change: 1 addition & 0 deletions src/lattice/routes/instances/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ def launch_cluster_with_skypilot(
except Exception:
effective_num_nodes = 1


task = sky.Task(
name=name,
run=command,
Expand Down