Kiln-AI · scosman · Jul 29, 2025 · Jul 29, 2025 · Jul 29, 2025 · Jul 29, 2025
diff --git a/.github/workflows/developer_content_check.yml b/.github/workflows/developer_content_check.yml
@@ -0,0 +1,51 @@
+name: Developer Content Check
+
+# Fails CI when leftover developer content (console.log calls, Python
+# print( calls, or TODO/FIXME notes) is found in the repository.
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+# The job only reads the checked-out sources.
+permissions:
+  contents: read
+
+jobs:
+  developer-check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Search for leftover developer content
+        run: |
+          # grep exits 1 when nothing matches; keep going so every check runs.
+          set +e
+          found=0
+
+          # search LABEL GREP_ARGS...: recursively grep the repo, skipping
+          # dependency/build/VCS directories and binary files (-I); print any
+          # matches and flag the run as failed.
+          search() {
+            label=$1
+            shift
+            echo "Checking for $label"
+            matches=$(grep -nRI \
+              --exclude-dir=node_modules --exclude-dir=.venv --exclude-dir=.git \
+              --exclude-dir=.github --exclude-dir=build --exclude-dir=dist \
+              --exclude-dir=.svelte-kit "$@" . || true)
+            if [ -n "$matches" ]; then
+              echo "$matches"
+              found=1
+            fi
+          }
+
+          search "console.log statements" -e 'console\.log'
+          search "python print statements" --include='*.py' -e 'print('
+          search "TODO or FIXME" -e 'TODO' -e 'FIXME'
+
+          if [ "$found" -ne 0 ]; then
+            echo "Developer content found" >&2
+            exit 1
+          else
+            echo "No developer content found"
+          fi
diff --git a/.github/workflows/windows_release_build.yml b/.github/workflows/windows_release_build.yml
@@ -42,7 +42,7 @@ jobs:
           files-folder: ${{ github.workspace }}/app/desktop/build/dist
           files-folder-recurse: true
           files-folder-filter: exe
-          # TODO: consider signing dlls as well. But for testing, we don't want to use all our quota.
+          # We should consider signing dlls as well, but for testing we don't want to use all our quota.
           # files-folder-filter: exe,dll
           file-digest: SHA256
           timestamp-rfc3161: http://timestamp.acs.microsoft.com

diff --git a/app/desktop/build_desktop_app.sh b/app/desktop/build_desktop_app.sh
@@ -62,7 +62,7 @@ else
 fi
 
 # Builds the desktop app
-# TODO: use a spec instead of long winded command line
+# We should use a spec instead of a long-winded command line
 pyinstaller $(printf %s "$PLATFORM_OPTS")  \
   --add-data "./taskbar.png:." --add-data "../../web_ui/build:./web_ui/build" \
   --noconfirm --distpath=./desktop/build/dist --workpath=./desktop/build/work \

diff --git a/app/desktop/desktop.py b/app/desktop/desktop.py
@@ -17,7 +17,7 @@
 from app.desktop.custom_tray import KilnTray
 from app.desktop.desktop_server import ThreadedServer, server_config
 
-# TODO: remove this and all other globals in this file
+# We should remove this and all other globals in this file
 root = None  # type: tk.Tk | None
 tray = None  # type: ignore
 

diff --git a/app/desktop/studio_server/test_generate_docs.py b/app/desktop/studio_server/test_generate_docs.py
@@ -1,3 +1,4 @@
+import logging
 from typing import List
 
 import pytest
@@ -77,11 +78,13 @@ async def test_generate_model_table():
         row = f"| {model.friendly_name} | {provider_names} | {structured_output} | {reasoning} | {data_gen} | {finetune} |"
         table.append(row)
 
-    # Print the table (useful for documentation)
-    print("\nModel Capability Matrix:\n")
-    print("\n".join(table))
-    print("\n\nFireworks models remaining:\n")
-    print("- " + "\n- ".join(f"{m.name}" for m in fireworks_models), "\n\n")
+    # Log the table (useful for documentation)
+    logger = logging.getLogger(__name__)
+    logger.info("\nModel Capability Matrix:\n%s", "\n".join(table))
+    logger.info(
+        "\n\nFireworks models remaining:\n- %s\n\n",
+        "\n- ".join(f"{m.name}" for m in fireworks_models),
+    )
 
     # Basic assertions to ensure the table is well-formed
     assert len(table) > 2, "Table should have header and at least one row"

diff --git a/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte b/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte
@@ -29,7 +29,7 @@
   // @ts-expect-error list_page is not a property of PageState
   $: list_page = ($page.state.list_page || []) as string[]
 
-  // TODO: we need to remove task_id from the URL, or load it by ID. $current_task is a lie
+  // We should remove task_id from the URL, or load it by ID. $current_task is a lie
   let run: TaskRun | null = null
   let loading = true
   let load_error: KilnError | null = null

diff --git a/app/web_ui/src/routes/(app)/generate/[project_id]/[task_id]/synth_data_guidance_datamodel.ts b/app/web_ui/src/routes/(app)/generate/[project_id]/[task_id]/synth_data_guidance_datamodel.ts
@@ -152,7 +152,7 @@ export class SynthDataGuidanceDataModel {
 
   private apply_selected_template(template: string) {
     if (template == "custom") {
-      // TODO make each unique
+      // We should make each unique
       this.topic_guidance.set(null)
       this.input_guidance.set(null)
       this.output_guidance.set(null)

diff --git a/app/web_ui/src/routes/(app)/run/+page.svelte b/app/web_ui/src/routes/(app)/run/+page.svelte
@@ -24,9 +24,9 @@
   import Collapse from "$lib/ui/collapse.svelte"
   import posthog from "posthog-js"
 
-  // TODO: implement checking input content
+  // We should implement checking input content
   // let warn_before_unload
-  // TODO UI for errors
+  // We should add a UI for errors
   let error: KilnError | null = null
   let submitting = false
   let run_complete = false

diff --git a/app/web_ui/src/routes/(app)/run/run.svelte b/app/web_ui/src/routes/(app)/run/run.svelte
@@ -56,7 +56,7 @@
   let show_raw_data = false
   let save_rating_error: KilnError | null = null
   let show_create_tag = false
-  // TODO warn_before_unload
+  // We should implement warn_before_unload
 
   type RatingValue = number | null
   let overall_rating: RatingValue = null

diff --git a/libs/core/kiln_ai/adapters/data_gen/data_gen_task.py b/libs/core/kiln_ai/adapters/data_gen/data_gen_task.py
@@ -77,7 +77,7 @@ class DataGenCategoriesTask(Task, parent_of={}):
     """
 
     def __init__(self, gen_type: Literal["training", "eval"], guidance: str | None):
-        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
+        # Keep the typechecker happy. We should be able to avoid this and parent_of above.
         tmp_project = Project(name="DataGen")
 
         instruction = generate_topic_tree_prompt(gen_type=gen_type, guidance=guidance)
@@ -181,7 +181,7 @@ def __init__(
         gen_type: Literal["training", "eval"],
         guidance: str | None,
     ):
-        # Keep the typechecker happy. TODO: shouldn't need this or parent_of above.
+        # Keep the typechecker happy. We should be able to avoid this and parent_of above.
         tmp_project = Project(name="DataGenSample")
 
         instruction = generate_sample_generation_prompt(

diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval_data.py b/libs/core/kiln_ai/adapters/eval/test_g_eval_data.py
diff --git a/libs/core/kiln_ai/adapters/ml_model_list.py b/libs/core/kiln_ai/adapters/ml_model_list.py
@@ -191,7 +191,7 @@ class KilnModelProvider(BaseModel):
     suggested_for_uncensored_data_gen: bool = False
     tuned_chat_strategy: ChatStrategy | None = None
 
-    # TODO P1: Need a more generalized way to handle custom provider parameters.
+    # We need a more generalized way to handle custom provider parameters.
     # Making them quite declarative here for now, isolating provider specific logic
     # to this file. Later I should be able to override anything in this file via config.
     r1_openrouter_options: bool = False
@@ -1332,7 +1332,7 @@ class KilnModel(BaseModel):
             KilnModelProvider(
                 name=ModelProviderName.vertex,
                 model_id="meta/llama-3.3-70b-instruct-maas",
-                # Doesn't work, TODO to debug
+                # Doesn't work yet; needs debugging
                 supports_structured_output=False,
                 supports_data_gen=False,
             ),
@@ -1486,7 +1486,7 @@ class KilnModel(BaseModel):
             ),
             KilnModelProvider(
                 name=ModelProviderName.openrouter,
-                # TODO: swap to non-free model when available (more reliable)
+                # Swap to non-free model when available (more reliable)
                 model_id="google/gemma-3-1b-it:free",
                 supports_structured_output=False,
                 supports_data_gen=False,
@@ -1507,7 +1507,7 @@ class KilnModel(BaseModel):
             KilnModelProvider(
                 name=ModelProviderName.openrouter,
                 structured_output_mode=StructuredOutputMode.json_instruction_and_object,
-                # TODO: swap to non-free model when available (more reliable)
+                # Swap to non-free model when available (more reliable)
                 model_id="google/gemma-3-4b-it:free",
             ),
         ],
@@ -1525,7 +1525,7 @@ class KilnModel(BaseModel):
             KilnModelProvider(
                 name=ModelProviderName.openrouter,
                 structured_output_mode=StructuredOutputMode.json_instruction_and_object,
-                # TODO: swap to non-free model when available (more reliable)
+                # Swap to non-free model when available (more reliable)
                 model_id="google/gemma-3-12b-it:free",
             ),
         ],

diff --git a/libs/core/kiln_ai/adapters/model_adapters/litellm_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/litellm_adapter.py
@@ -235,7 +235,7 @@ def tool_call_params(self, strict: bool) -> dict[str, Any]:
         }
 
     def build_extra_body(self, provider: KilnModelProvider) -> dict[str, Any]:
-        # TODO P1: Don't love having this logic here. But it's a usability improvement
+        # We don't love having this logic here, but it's a usability improvement,
         # so better to keep it than exclude it. Should figure out how I want to isolate
         # this sort of logic so it's config driven and can be overridden
 

diff --git a/libs/core/kiln_ai/adapters/repair/repair_task.py b/libs/core/kiln_ai/adapters/repair/repair_task.py
@@ -6,7 +6,7 @@
 from kiln_ai.datamodel import Priority, Project, Task, TaskRequirement, TaskRun
 
 
-# TODO add evaluator rating
+# We should add evaluator rating
 class RepairTaskInput(BaseModel):
     original_prompt: str
     original_input: str

diff --git a/libs/core/kiln_ai/adapters/test_adapter_registry.py b/libs/core/kiln_ai/adapters/test_adapter_registry.py
@@ -109,7 +109,7 @@ def test_openai_compatible_adapter_creation(mock_config, basic_task, provider):
     assert adapter.run_config.model_name == "test-model"
 
 
-# TODO should run for all cases
+# We should run for all cases
 def test_custom_prompt_builder(mock_config, basic_task):
     adapter = adapter_for_task(
         kiln_task=basic_task,
@@ -124,7 +124,7 @@ def test_custom_prompt_builder(mock_config, basic_task):
     assert adapter.run_config.prompt_id == "simple_chain_of_thought_prompt_builder"
 
 
-# TODO should run for all cases
+# We should run for all cases
 def test_tags_passed_through(mock_config, basic_task):
     tags = ["test-tag-1", "test-tag-2"]
     adapter = adapter_for_task(

diff --git a/libs/core/kiln_ai/adapters/test_remote_config.py b/libs/core/kiln_ai/adapters/test_remote_config.py
@@ -160,12 +160,11 @@ def test_backwards_compatibility_with_v0_18(tmp_path):
         for provider in model.providers:
             assert hasattr(provider, 'name')
 
-    print("SUCCESS: v0.18 successfully parsed JSON from current version")
-    print(f"Parsed {{len(models)}} models")
+    # Indicate success
+    pass
 
 except Exception as e:
-    print(f"ERROR: {{e}}")
-    sys.exit(1)
+    raise SystemExit(f"ERROR: {{e}}")
 '''
 
     try:

diff --git a/libs/core/kiln_ai/datamodel/basemodel.py b/libs/core/kiln_ai/datamodel/basemodel.py
@@ -470,7 +470,7 @@ def validate_and_save_with_subrelations(
             ValidationError: If validation fails for the model or any of its children
         """
         # Validate first, then save. Don't want error half way through, and partly persisted
-        # TODO P2: save to tmp dir, then move atomically. But need to merge directories so later.
+        # Ideally we'd save to a tmp dir and move atomically, but that requires merging directories, so it is deferred.
         cls._validate_nested(data, save=False, path=path, parent=parent)
         instance = cls._validate_nested(data, save=True, path=path, parent=parent)
         return instance

diff --git a/libs/core/kiln_ai/datamodel/project.py b/libs/core/kiln_ai/datamodel/project.py
@@ -18,6 +18,6 @@ class Project(KilnParentModel, parent_of={"tasks": Task}):
         description="A description of the project for you and your team. Will not be used in prompts/training/validation.",
     )
 
-    # Needed for typechecking. TODO P2: fix this in KilnParentModel
+    # Needed for typechecking. We should fix this in KilnParentModel
     def tasks(self) -> list[Task]:
         return super().tasks()  # type: ignore
diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py
@@ -216,7 +216,7 @@ def input_schema(self) -> Dict | None:
             return None
         return schema_from_json_str(self.input_json_schema)
 
-    # These wrappers help for typechecking. TODO P2: fix this in KilnParentModel
+    # These wrappers help for typechecking. We should fix this in KilnParentModel
     def runs(self, readonly: bool = False) -> list[TaskRun]:
         return super().runs(readonly=readonly)  # type: ignore
 

diff --git a/libs/core/kiln_ai/datamodel/test_model_perf.py b/libs/core/kiln_ai/datamodel/test_model_perf.py
@@ -1,3 +1,4 @@
+import logging
 import shutil
 import uuid
 
@@ -121,6 +122,7 @@ def test_benchmark_load_from_file(benchmark, task_run):
 
     # I get 8k ops per second on my MBP. Lower value here for CI and parallel testing.
     # Prior to optimization was 290 ops per second.
-    print(f"Ops per second: {ops_per_second:.6f}")
+    logger = logging.getLogger(__name__)
+    logger.info("Ops per second: %.6f", ops_per_second)
     if ops_per_second < 500:
         pytest.fail(f"Ops per second: {ops_per_second:.6f}, expected more than 1k ops")