Merged

53 commits
9655b33
update eval API to include finetune run-configs
chiang-daniel Dec 2, 2025
093e0f8
add test
chiang-daniel Dec 2, 2025
8314490
add more coverage
chiang-daniel Dec 2, 2025
c5c1be4
handle parsers for finetune
chiang-daniel Dec 2, 2025
bad35fe
remove stale API
chiang-daniel Dec 6, 2025
b2171c5
look up model based on model_id instead of name
chiang-daniel Dec 6, 2025
42d03a9
coder rabbit
chiang-daniel Dec 6, 2025
147d6f6
Fewer builds: we don't need to be running builds on every push.
scosman Dec 8, 2025
faa180d
added add subtopics button to SDG
sfierro Dec 11, 2025
a14ebda
Fix linting errors: remove unused imports
sfierro Dec 11, 2025
1a413a1
fix: api key values incorrectly replaced with [hidden]
leonardmq Dec 12, 2025
8a9883d
Merge pull request #896 from Kiln-AI/leonard/kil-328-bug-custom-provi…
leonardmq Dec 13, 2025
1f1c333
Merge pull request #868 from Kiln-AI/dchiang/eval-finetune-tools
chiang-daniel Dec 15, 2025
599a900
update deps
scosman Dec 15, 2025
5c206a6
Fixes https://linear.app/kiln-ai/issue/KIL-206/remove-task-descriptio…
scosman Dec 15, 2025
f8ffdb0
Merge pull request #897 from Kiln-AI/scosman/hide_task_description_on…
scosman Dec 15, 2025
13d27b7
Add Leonard's suggestion to allow manual runs too.
scosman Dec 15, 2025
2f57a6e
tessl support
sfierro Dec 15, 2025
d6d4d6b
Merge pull request #875 from Kiln-AI/scosman/fewer_builds
scosman Dec 15, 2025
8f1c31f
ui dependencies
sfierro Dec 15, 2025
7e1002d
Merge branch 'main' into sfierro/KIL-312
sfierro Dec 15, 2025
a022af9
coderabbit feedback
sfierro Dec 15, 2025
9787340
Merge pull request #898 from Kiln-AI/sfierro/tessl
sfierro Dec 15, 2025
e2f8ebf
mcp hooks
sfierro Dec 15, 2025
de9855d
Merge pull request #899 from Kiln-AI/sfierro/mcp-hooks
sfierro Dec 15, 2025
1e6cc3f
Fix: don't save test files into root. Use tmp
scosman Dec 16, 2025
c96a491
Merge pull request #904 from Kiln-AI/scosman/test_zip_file
scosman Dec 17, 2025
028dc02
ty typecheck WIP
scosman Dec 17, 2025
2cab746
ty typecheck WIP
scosman Dec 17, 2025
fd39571
ty typecheck WIP
scosman Dec 17, 2025
665f37b
ty typecheck WIP
scosman Dec 17, 2025
304a9b9
feat: allow system message override in invoke
leonardmq Dec 17, 2025
c5d1510
Merge pull request #890 from Kiln-AI/sfierro/KIL-312
sfierro Dec 17, 2025
cee1f34
Fix reactivity issue on compare view
scosman Dec 17, 2025
a44b9d8
Mike wants a chart
scosman Dec 17, 2025
0c6fd30
Mike wants a chart part 2
scosman Dec 17, 2025
33e571c
better subtitles
scosman Dec 17, 2025
f24f205
don't fill em
scosman Dec 17, 2025
7944b7e
highlight on hover
scosman Dec 17, 2025
2fa972a
refactor: custom prompt builder injection via constructor
leonardmq Dec 18, 2025
4a73a76
Fixes https://linear.app/kiln-ai/issue/KIL-340/hide-chart-if-were-mis…
scosman Dec 18, 2025
91fc5ea
Don't show radar chart unless we have 3 points to show
scosman Dec 18, 2025
229b888
Merge pull request #909 from Kiln-AI/scosman/mike_wants_a_chart
scosman Dec 18, 2025
378d323
Merge branch 'main' into scosman/ty2
scosman Dec 18, 2025
f0fafc5
Cleaner value lookup, which fixes type checking. Also cleaner checks.sh
scosman Dec 18, 2025
5e43ea6
Fix type error
scosman Dec 18, 2025
a6d198f
Fix CI, ty requires sync
scosman Dec 18, 2025
9d808b3
coderabbit feedback
leonardmq Dec 18, 2025
03ff585
suggest global instead of us-central1 in connect providers (#916)
tawnymanticore Dec 18, 2025
d1d3868
Merge pull request #906 from Kiln-AI/leonard/override-prompt
leonardmq Dec 19, 2025
757a603
add gemini flash + nemotron 3 to ml_model_list (#915)
tawnymanticore Dec 19, 2025
d0a10c7
Merge pull request #914 from Kiln-AI/scosman/ty2
scosman Dec 19, 2025
4adf15a
Adding support for GLM 4.7 (#923)
tawnymanticore Jan 6, 2026
13 changes: 13 additions & 0 deletions .cursor/mcp.json
@@ -0,0 +1,13 @@
{
"mcpServers": {
"tessl": {
"type": "stdio",
"command": "tessl",
"args": ["mcp", "start"]
},
"HooksMCP": {
"command": "uvx",
"args": ["hooks-mcp", "--working-directory", "."]
}
}
}
1 change: 1 addition & 0 deletions .cursor/rules/.gitignore
@@ -0,0 +1 @@
tessl__*.mdc
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
@@ -37,7 +37,7 @@ jobs:
run: uv run python3 -m pytest --runslow .

- name: Check Python Types
run: uv run pyright .
run: uv tool install [email protected] && uvx ty check

- name: Build Core
run: uv build
5 changes: 5 additions & 0 deletions .github/workflows/build_desktop.yml
@@ -1,7 +1,12 @@
name: Build Desktop Apps

on:
workflow_dispatch:
release:
types: [created]
push:
branches:
- main

jobs:
build:
6 changes: 5 additions & 1 deletion .github/workflows/format_and_lint.yml
@@ -36,7 +36,7 @@ jobs:
run: uv python install 3.13

- name: Install the project
run: uv tool install ruff
run: uv sync --all-extras --dev

- name: Lint with ruff
run: |
@@ -45,3 +45,7 @@
- name: Format with ruff
run: |
uvx ruff format --check .

- name: Typecheck with ty
run: |
uv tool install [email protected] && uvx ty check
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ __pycache__/
**/*.egg-info
node_modules/
conductor.json
CLAUDE.md

libs/core/docs
libs/core/build
2 changes: 2 additions & 0 deletions .tessl/.gitignore
@@ -0,0 +1,2 @@
tiles/
RULES.md
4 changes: 4 additions & 0 deletions AGENTS.md
@@ -41,3 +41,7 @@ These prompts can be accessed from the `get_prompt` tool, and you may request se
### Final

To show you read these, call me 'boss'

# Agent Rules <!-- tessl-managed -->

@.tessl/RULES.md follow the [instructions](.tessl/RULES.md)
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -74,7 +74,7 @@ We suggest the following extensions for VSCode/Cursor. With them, you'll get com
- Prettier
- Python
- Python Debugger
- Type checking by pyright via one of: Cursor Python if using Cursor, Pylance if VSCode
- Ty - language server and type checker for Python
- Ruff
- Svelte for VS Code
- Vitest
73 changes: 46 additions & 27 deletions app/desktop/studio_server/eval_api.py
@@ -4,6 +4,10 @@
from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import StreamingResponse
from kiln_ai.adapters.eval.eval_runner import EvalRunner
from kiln_ai.adapters.fine_tune.finetune_run_config_id import (
finetune_from_finetune_run_config_id,
finetune_run_config_id,
)
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.prompt_builders import prompt_builder_from_id
from kiln_ai.datamodel import BasePrompt, Task, TaskRun
@@ -59,6 +63,31 @@ def eval_config_from_id(
)


def get_all_run_configs(project_id: str, task_id: str) -> list[TaskRunConfig]:
"""
Returns all run configs for a task, including completed fine-tune run configs.
Only includes fine-tunes that have a fine_tune_model_id (are completed and usable).
"""
task = task_from_id(project_id, task_id)
configs = task.run_configs()

# Get run configs from finetunes and only include completed fine-tunes
finetunes = task.finetunes()
for finetune in finetunes:
if finetune.run_config is not None and finetune.fine_tune_model_id is not None:
configs.append(
TaskRunConfig(
id=finetune_run_config_id(project_id, task_id, str(finetune.id)),
name=finetune.name,
description=finetune.description,
run_config_properties=finetune.run_config,
parent=task, # special case, we need to reference the task model
)
)

return configs


def task_run_config_from_id(
project_id: str, task_id: str, run_config_id: str
) -> TaskRunConfig:
@@ -67,6 +96,18 @@ def task_run_config_from_id(
if run_config.id == run_config_id:
return run_config

# special case for finetune run configs: the run config lives inside the finetune model
if run_config_id.startswith("finetune_run_config::"):
finetune = finetune_from_finetune_run_config_id(run_config_id)
if finetune.run_config is not None:
return TaskRunConfig(
id=finetune_run_config_id(project_id, task_id, str(finetune.id)),
name=finetune.name,
description=finetune.description,
run_config_properties=finetune.run_config,
parent=task, # special case, we need to reference the task model
)

raise HTTPException(
status_code=404,
detail=f"Task run config not found. ID: {run_config_id}",
@@ -315,33 +356,9 @@ async def create_evaluator(
eval.save_to_file()
return eval

@app.get("/api/projects/{project_id}/tasks/{task_id}/task_run_configs")
async def get_task_run_configs(
project_id: str, task_id: str
) -> list[TaskRunConfig]:
task = task_from_id(project_id, task_id)
return task.run_configs()

@app.get("/api/projects/{project_id}/tasks/{task_id}/run_configs/")
async def get_run_configs(project_id: str, task_id: str) -> list[TaskRunConfig]:
# Returns all run configs of a given task.
task = task_from_id(project_id, task_id)
configs = task.run_configs()

# Get run configs from finetunes
finetunes = task.finetunes()
for finetune in finetunes:
if finetune.run_config is not None:
configs.append(
TaskRunConfig(
id=f"finetune_run_config::{project_id}::{task_id}::{finetune.id}",
name=finetune.name,
description=finetune.description,
run_config_properties=finetune.run_config,
)
)

return configs
return get_all_run_configs(project_id, task_id)

@app.get("/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}")
async def get_eval(project_id: str, task_id: str, eval_id: str) -> Eval:
@@ -480,7 +497,8 @@ async def run_eval_config(
# Load the list of run configs to use. Two options:
run_configs: list[TaskRunConfig] = []
if all_run_configs:
run_configs = task_from_id(project_id, task_id).run_configs()
# special case: we can't just use task.run_configs(); we also need the finetune run configs, which live inside the finetune models
run_configs = get_all_run_configs(project_id, task_id)
else:
if len(run_config_ids) == 0:
raise HTTPException(
@@ -633,7 +651,8 @@ async def get_eval_config_score_summary(
task = task_from_id(project_id, task_id)
eval = eval_from_id(project_id, task_id, eval_id)
eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id)
task_runs_configs = task.run_configs()
# special case: we can't just use task.run_configs(); we also need the finetune run configs, which live inside the finetune models
task_runs_configs = get_all_run_configs(project_id, task_id)

# Build a set of all the dataset items IDs we expect to have scores for
expected_dataset_ids = dataset_ids_in_filter(
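Note: the eval API changes above rely on two helpers imported from `kiln_ai.adapters.fine_tune.finetune_run_config_id`, whose implementation is not part of this diff. Judging from the composite IDs used here and in the tests (`finetune_run_config::{project_id}::{task_id}::{finetune_id}`), a minimal sketch of the compose/parse logic might look like the following; the constant name, the parse helper, and the error handling are assumptions for illustration, not the actual module.

```python
# Hypothetical sketch of the ID helpers used above; the real implementations
# live in kiln_ai.adapters.fine_tune.finetune_run_config_id and may differ.
FINETUNE_RUN_CONFIG_PREFIX = "finetune_run_config"


def finetune_run_config_id(project_id: str, task_id: str, finetune_id: str) -> str:
    # Compose the composite ID the eval API uses for finetune-backed run configs.
    return f"{FINETUNE_RUN_CONFIG_PREFIX}::{project_id}::{task_id}::{finetune_id}"


def parse_finetune_run_config_id(run_config_id: str) -> tuple[str, str, str]:
    # Split the composite ID back into (project_id, task_id, finetune_id).
    parts = run_config_id.split("::")
    if len(parts) != 4 or parts[0] != FINETUNE_RUN_CONFIG_PREFIX:
        raise ValueError(f"Not a finetune run config ID: {run_config_id}")
    return parts[1], parts[2], parts[3]
```

Keeping the ID format in one module is presumably what lets `task_run_config_from_id` above special-case anything starting with `finetune_run_config::` and hand it off to `finetune_from_finetune_run_config_id`.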
6 changes: 3 additions & 3 deletions app/desktop/studio_server/finetune_api.py
@@ -281,7 +281,7 @@ async def finetune(
status_code=400,
detail=f"Fine tune provider '{finetune.provider}' not found",
)
finetune_adapter = finetune_registry[finetune.provider]
finetune_adapter = finetune_registry[finetune.provider] # type: ignore[invalid-argument-type]
status = await finetune_adapter(finetune).status()
return FinetuneWithStatus(finetune=finetune, status=status)

@@ -360,7 +360,7 @@ async def finetune_hyperparameters(
raise HTTPException(
status_code=400, detail=f"Fine tune provider '{provider_id}' not found"
)
finetune_adapter_class = finetune_registry[provider_id]
finetune_adapter_class = finetune_registry[provider_id] # type: ignore[invalid-argument-type]
return finetune_adapter_class.available_parameters()

@app.get("/api/projects/{project_id}/tasks/{task_id}/finetune_dataset_info")
Expand Down Expand Up @@ -433,7 +433,7 @@ async def create_finetune(
status_code=400,
detail=f"Fine tune provider '{request.provider}' not found",
)
finetune_adapter_class = finetune_registry[request.provider]
finetune_adapter_class = finetune_registry[request.provider] # type: ignore[invalid-argument-type]

dataset = DatasetSplit.from_id_and_parent_path(request.dataset_id, task.path)
if dataset is None:
112 changes: 108 additions & 4 deletions app/desktop/studio_server/test_eval_api.py
@@ -44,6 +44,7 @@
CreateEvaluatorRequest,
connect_evals_api,
eval_config_from_id,
get_all_run_configs,
task_run_config_from_id,
)

@@ -297,7 +298,7 @@ async def test_create_task_run_config_with_freezing(
== "Frozen copy of prompt 'simple_chain_of_thought_prompt_builder'."
)
# Fetch it from API
fetch_response = client.get("/api/projects/project1/tasks/task1/task_run_configs")
fetch_response = client.get("/api/projects/project1/tasks/task1/run_configs/")
assert fetch_response.status_code == 200
configs = fetch_response.json()
assert len(configs) == 1
@@ -548,6 +549,104 @@ async def test_task_run_config_from_id(
task_run_config_from_id("project1", "task1", "non_existent")


@pytest.mark.asyncio
async def test_task_run_config_from_id_finetune(mock_task_from_id, mock_task):
mock_task_from_id.return_value = mock_task

run_config_props = RunConfigProperties(
model_name="gpt-4",
model_provider_name=ModelProviderName.openai,
prompt_id="simple_chain_of_thought_prompt_builder",
structured_output_mode=StructuredOutputMode.json_schema,
)

mock_finetune = Finetune(
id="ft_test",
name="Test Finetune",
description="Test finetune description",
provider="openai",
base_model_id="model1",
dataset_split_id="split1",
system_message="System message",
latest_status=FineTuneStatusType.completed,
run_config=run_config_props,
fine_tune_model_id="ft_model_123",
parent=mock_task,
)

with patch(
"app.desktop.studio_server.eval_api.finetune_from_finetune_run_config_id"
) as mock_finetune_from_id:
mock_finetune_from_id.return_value = mock_finetune

run_config = task_run_config_from_id(
"project1", "task1", "finetune_run_config::project1::task1::ft_test"
)

assert run_config.id == "finetune_run_config::project1::task1::ft_test"
assert run_config.name == "Test Finetune"
assert run_config.description == "Test finetune description"
assert run_config.run_config_properties == run_config_props
assert run_config.parent == mock_task


@pytest.mark.asyncio
async def test_get_all_run_configs(mock_task_from_id, mock_task):
"""Test that get_all_run_configs returns regular run configs and completed finetune run configs."""
mock_task_from_id.return_value = mock_task

run_config_props = RunConfigProperties(
model_name="gpt-4",
model_provider_name=ModelProviderName.openai,
prompt_id="simple_chain_of_thought_prompt_builder",
structured_output_mode=StructuredOutputMode.json_schema,
)

regular_run_config = TaskRunConfig(
id="regular_run_config1",
name="Regular Run Config",
description="A regular run config",
run_config_properties=run_config_props,
parent=mock_task,
)
regular_run_config.save_to_file()

completed_finetune = Finetune(
id="ft_completed",
name="Completed Finetune",
provider="openai",
base_model_id="model1",
dataset_split_id="split1",
system_message="System message",
latest_status=FineTuneStatusType.completed,
run_config=run_config_props,
fine_tune_model_id="ft_model_123",
parent=mock_task,
)
completed_finetune.save_to_file()

incomplete_finetune = Finetune(
id="ft_incomplete",
name="Incomplete Finetune",
provider="openai",
base_model_id="model2",
dataset_split_id="split2",
system_message="System message",
latest_status=FineTuneStatusType.running,
run_config=run_config_props,
fine_tune_model_id=None,
parent=mock_task,
)
incomplete_finetune.save_to_file()

configs = get_all_run_configs("project1", "task1")

config_ids = [config.id for config in configs]
assert "regular_run_config1" in config_ids
assert "finetune_run_config::project1::task1::ft_completed" in config_ids
assert "finetune_run_config::project1::task1::ft_incomplete" not in config_ids


@pytest.fixture
def mock_eval_for_score_summary():
eval = Mock(spec=Eval)
@@ -635,6 +734,7 @@ async def test_get_eval_config_score_summary(
Mock(spec=TaskRunConfig, id="run4"),
Mock(spec=TaskRunConfig, id="run5"),
]
mock_task.finetunes.return_value = []
mock_task_from_id.return_value = mock_task

response = client.get(
@@ -1910,6 +2010,7 @@ async def test_get_run_configs_includes_finetunes_with_run_config(
system_message="System message",
latest_status=FineTuneStatusType.completed,
run_config=run_config_props,
fine_tune_model_id="ft_model_123",
parent=mock_task,
),
Finetune(
@@ -1921,6 +2022,7 @@ async def test_get_run_configs_includes_finetunes_with_run_config(
system_message="System message",
latest_status=FineTuneStatusType.running,
run_config=run_config_props,
fine_tune_model_id=None,
parent=mock_task,
),
Finetune(
@@ -1932,6 +2034,7 @@ async def test_get_run_configs_includes_finetunes_with_run_config(
system_message="System message",
latest_status=FineTuneStatusType.unknown,
run_config=run_config_props,
fine_tune_model_id=None,
parent=mock_task,
),
Finetune(
@@ -1943,6 +2046,7 @@ async def test_get_run_configs_includes_finetunes_with_run_config(
system_message="System message",
latest_status=FineTuneStatusType.failed,
run_config=run_config_props,
fine_tune_model_id=None,
parent=mock_task,
),
Finetune(
@@ -1969,7 +2073,7 @@ async def test_get_run_configs_includes_finetunes_with_run_config(
config_ids = [config["id"] for config in configs]

assert "finetune_run_config::project1::task1::ft_completed" in config_ids
assert "finetune_run_config::project1::task1::ft_running" in config_ids
assert "finetune_run_config::project1::task1::ft_failed" in config_ids
assert "finetune_run_config::project1::task1::ft_unknown" in config_ids
assert "finetune_run_config::project1::task1::ft_running" not in config_ids
assert "finetune_run_config::project1::task1::ft_failed" not in config_ids
assert "finetune_run_config::project1::task1::ft_unknown" not in config_ids
assert "finetune_run_config::project1::task1::ft_no_run_config" not in config_ids