context-machine-lab
diff --git a/‎src/sleepless_agent/config.yaml‎
Lines changed: 79 additions & 29 deletions b/‎src/sleepless_agent/config.yaml‎
Lines changed: 79 additions & 29 deletions
diff --git a/‎src/sleepless_agent/core/daemon.py‎
Lines changed: 14 additions & 5 deletions b/‎src/sleepless_agent/core/daemon.py‎
Lines changed: 14 additions & 5 deletions
diff --git a/‎src/sleepless_agent/core/executor.py‎
Lines changed: 7 additions & 6 deletions b/‎src/sleepless_agent/core/executor.py‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎src/sleepless_agent/core/queue.py‎
Lines changed: 8 additions & 8 deletions b/‎src/sleepless_agent/core/queue.py‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎src/sleepless_agent/core/task_runtime.py‎
Lines changed: 39 additions & 19 deletions b/‎src/sleepless_agent/core/task_runtime.py‎
Lines changed: 39 additions & 19 deletions
@@ -3,13 +3,14 @@ claude_code:
   model: claude-sonnet-4-5-20250929
   night_start_hour: 1
   night_end_hour: 9
-  threshold_day: 100.0
-  threshold_night: 100.0
+  threshold_day: 20.0
+  threshold_night: 80.0
   usage_command: claude /usage
 
 git:
   use_remote_repo: true
   remote_repo_url: [email protected]:TimeLovercc/sleepless-agent.git
+  auto_create_repo: true
 
 agent:
   workspace_root: ./workspace
@@ -29,33 +30,82 @@ multi_agent_workflow:
 auto_generation:
   enabled: true
   prompts:
-    - name: default_improvement
+    - name: refine_focused
       prompt: |-
-        You are a software development assistant. Generate ONE specific, actionable improvement idea for a Generic Python project.
-
-        Generate task ideas in categories like:
-        - Code quality (refactoring, optimization, testing)
-        - Documentation (docstrings, README, examples)
-        - Features (new functionality, enhancements)
-        - Architecture (design improvements, modularity)
-        - Performance (caching, algorithms, database queries)
-        - Security (input validation, authentication, encryption)
-
-        IMPORTANT: Classify the task type and prefix your response with [NEW] or [REFINE].
-        Respond with the type prefix followed by a single task description in 1-2 sentences.
-      weight: 0.7
-    - name: bug_fix_maintenance
+        The workspace has multiple ongoing projects and tasks that need attention.
+
+        ## Current State
+        - Active tasks: {task_count} ({pending_count} pending, {in_progress_count} in progress)
+        - Many tasks are already in progress or pending
+
+        ## Recent Work & Context
+        {recent_work}
+
+        ## Task Generation
+        Generate ONE REFINE task to continue or improve existing work:
+        - Complete partial/incomplete tasks mentioned above
+        - Follow up on outstanding items and recommendations
+        - Enhance or improve existing projects in the workspace
+        - Fix issues or improve quality of current work
+        - Add missing components to existing implementations
+        - Expand documentation or analysis from previous tasks
+
+        IMPORTANT: Your response MUST start with [REFINE] followed by a specific, actionable task description in 1-2 sentences.
+
+        Focus on completing or improving what already exists in the workspace rather than starting new projects.
+      weight: 0.45
+    - name: balanced
+      prompt: |-
+        Review the workspace state and generate a valuable task.
+
+        ## Current State
+        - Active tasks: {task_count} ({pending_count} pending, {in_progress_count} in progress)
+
+        ## Recent Work & Context
+        {recent_work}
+
+        ## Task Generation
+        Generate ONE valuable task (NEW or REFINE):
+        - For REFINE: improve existing work, complete partial tasks, enhance current projects
+        - For NEW: create something useful, interesting, or educational
+
+        Task categories to consider:
+        - Software development (applications, scripts, tools, APIs)
+        - Data analysis and visualization projects
+        - Research and documentation (technical guides, comparisons, best practices)
+        - Creative writing (stories, tutorials, technical articles)
+        - System design and architecture documents
+        - Educational content and examples
+        - Automation and productivity improvements
+        - Analysis and evaluation reports
+
+        IMPORTANT: Prefix your response with [NEW] or [REFINE] followed by a specific, actionable task description in 1-2 sentences.
+      weight: 0.35
+    - name: new_friendly
       prompt: |-
-        You are a software development assistant. Generate ONE specific bug fix or maintenance task for a Generic Python project.
-
-        Focus on areas like:
-        - Bug fixes (edge cases, error handling, race conditions)
-        - Technical debt (outdated dependencies, deprecated APIs)
-        - Code maintenance (cleanup, refactoring for clarity)
-        - Robustness (input validation, error recovery)
-        - Edge case handling (boundary conditions, null checks)
-
-        IMPORTANT: Classify the task type and prefix your response with [NEW] or [REFINE].
-        Respond with the type prefix followed by a single task description in 1-2 sentences.
-      weight: 0.3
+        Generate an interesting and valuable task for the workspace.
+
+        ## Current State
+        - Active tasks: {task_count} ({pending_count} pending, {in_progress_count} in progress)
+        - Few tasks in queue - good time for new projects!
+
+        ## Task Generation
+        Generate ONE innovative task that creates value.
+
+        Areas to explore:
+        - Build practical tools and utilities
+        - Create educational content and tutorials
+        - Develop software applications or scripts
+        - Write comprehensive documentation or guides
+        - Design systems and architectures
+        - Analyze and compare technologies or approaches
+        - Generate creative content (technical writing, examples)
+        - Research and summarize complex topics
+        - Create data visualizations or analysis
+        - Develop proof-of-concepts or experiments
+
+        Can be NEW (fresh project) or REFINE (improve existing work) - choose what would be most valuable.
+
+        IMPORTANT: Prefix your response with [NEW] or [REFINE] followed by a specific, actionable task description in 1-2 sentences.
+      weight: 0.20
 
@@ -5,7 +5,7 @@
 import asyncio
 import signal
 import sys
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 
 from sqlalchemy.orm import sessionmaker
@@ -99,7 +99,11 @@ def __init__(self) -> None:
             str(self.config.agent.results_path),
         )
 
-        self.git = GitManager(workspace_root=str(self.config.agent.workspace_root))
+        auto_create_repo = git_config.get("auto_create_repo", False) if git_config else False
+        self.git = GitManager(
+            workspace_root=str(self.config.agent.workspace_root),
+            auto_create_repo=auto_create_repo,
+        )
         self.git.init_repo()
         if self.use_remote_repo and self.remote_repo_url:
             try:
@@ -210,7 +214,13 @@ async def run(self) -> None:
         logger.info("Sleepless Agent starting...")
 
         try:
-            self.bot.start()
+            # Start bot in background thread to avoid blocking the async event loop
+            # The Slack SDK's connect() is synchronous and would block forever
+            import threading
+            bot_thread = threading.Thread(target=self.bot.start, daemon=True, name="SlackBot")
+            bot_thread.start()
+            await asyncio.sleep(0.5)  # Give bot time to initialize
+            logger.info("Slack bot started in background thread")
         except Exception as exc:
             logger.error(f"Failed to start bot: {exc}")
             return
@@ -259,13 +269,12 @@ async def _process_tasks(self) -> None:
                     break
 
                 await self.task_runtime.execute(task)
-                self.scheduler.log_task_execution(task.id)
                 await asyncio.sleep(1)
         except Exception as exc:
             logger.error(f"Error in task processing loop: {exc}")
 
     def _check_and_summarize_daily_reports(self) -> None:
-        now = datetime.utcnow()
+        now = datetime.now(timezone.utc).replace(tzinfo=None)
         end_of_day = now.replace(hour=23, minute=59, second=0, microsecond=0)
 
         if self.last_daily_summarization is None or self.last_daily_summarization.date() != now.date():
 
@@ -5,7 +5,7 @@
 import subprocess
 import time
 from collections import OrderedDict
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Optional, Tuple, List, Dict
 import shutil
@@ -243,7 +243,7 @@ def _ensure_readme_exists(self, workspace: Path, task_id: int, task_description:
                 PRIORITY="serious" if project_id else "random",
                 PRIORITY_LABEL="SERIOUS" if project_id else "RANDOM",
                 PROJECT_NAME=project_name or "None",
-                CREATED_AT=datetime.utcnow().isoformat(),
+                CREATED_AT=datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
             )
 
             readme_path.write_text(content)
@@ -340,7 +340,7 @@ def _update_readme_task_history(self, workspace: Path, task_id: int,
             status_icon = "✅" if status == "completed" else "❌"
             git_line = f"\n- Git: {git_info}" if git_info else ""
 
-            update = f"\n\n### Execution {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')}\n"
+            update = f"\n\n### Execution {datetime.now(timezone.utc).replace(tzinfo=None).strftime('%Y-%m-%d %H:%M:%S')}\n"
             update += f"- Status: {status_icon} {status.upper()}\n"
             update += f"- Files Modified: {files_modified}\n"
             update += f"- Duration: {execution_time}s"
@@ -1176,7 +1176,7 @@ async def execute_task(
         project_id: Optional[str] = None,
         project_name: Optional[str] = None,
         workspace_task_type: Optional[str] = None,
-    ) -> Tuple[str, List[str], List[str], int]:
+    ) -> Tuple[str, List[str], List[str], int, Dict, Optional[str]]:
         """Execute task with Claude Code SDK
 
         Args:
@@ -1190,7 +1190,8 @@ async def execute_task(
             workspace_task_type: Workspace task type ("new" or "refine") - for workspace initialization
 
         Returns:
-            Tuple of (output_text, files_modified, commands_executed, exit_code, usage_metrics)
+            Tuple of (output_text, files_modified, commands_executed, exit_code, usage_metrics, eval_status)
+            eval_status can be: "COMPLETE", "PARTIAL", "INCOMPLETE", "FAILED", or None if evaluator disabled
         """
         timeout = timeout or self.default_timeout
 
@@ -1472,7 +1473,7 @@ async def execute_task(
                 commands=len(all_commands_executed),
             )
 
-            return output_text, all_modified_files, all_commands_executed, final_exit_code, combined_metrics
+            return output_text, all_modified_files, all_commands_executed, final_exit_code, combined_metrics, eval_status
 
         except CLINotFoundError:
             self._live_update(
 
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import json
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import List, Optional
 
 from sqlalchemy import case
@@ -112,7 +112,7 @@ def _op(session: Session) -> Optional[Task]:
             task = session.query(Task).filter(Task.id == task_id).first()
             if task:
                 task.status = TaskStatus.IN_PROGRESS
-                task.started_at = datetime.utcnow()
+                task.started_at = datetime.now(timezone.utc).replace(tzinfo=None)
                 task.attempt_count += 1
             return task
 
@@ -128,7 +128,7 @@ def _op(session: Session) -> Optional[Task]:
             task = session.query(Task).filter(Task.id == task_id).first()
             if task:
                 task.status = TaskStatus.COMPLETED
-                task.completed_at = datetime.utcnow()
+                task.completed_at = datetime.now(timezone.utc).replace(tzinfo=None)
                 task.result_id = result_id
             return task
 
@@ -146,7 +146,7 @@ def _op(session: Session) -> Optional[Task]:
                 task.status = TaskStatus.FAILED
                 task.error_message = error_message
                 if not task.completed_at:
-                    task.completed_at = datetime.utcnow()
+                    task.completed_at = datetime.now(timezone.utc).replace(tzinfo=None)
             return task
 
         task = self._run_write(_op)
@@ -161,7 +161,7 @@ def _op(session: Session) -> Optional[Task]:
             task = session.query(Task).filter(Task.id == task_id).first()
             if task and task.status == TaskStatus.PENDING:
                 task.status = TaskStatus.CANCELLED
-                task.deleted_at = datetime.utcnow()
+                task.deleted_at = datetime.now(timezone.utc).replace(tzinfo=None)
             return task
 
         task = self._run_write(_op)
@@ -247,7 +247,7 @@ def timeout_expired_tasks(self, max_age_seconds: int) -> List[Task]:
             return []
 
         def _op(session: Session) -> List[Task]:
-            cutoff = datetime.utcnow() - timedelta(seconds=max_age_seconds)
+            cutoff = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(seconds=max_age_seconds)
             tasks = (
                 session.query(Task)
                 .filter(
@@ -261,7 +261,7 @@ def _op(session: Session) -> List[Task]:
             if not tasks:
                 return []
 
-            now = datetime.utcnow()
+            now = datetime.now(timezone.utc).replace(tzinfo=None)
             for task in tasks:
                 task.status = TaskStatus.FAILED
                 task.completed_at = now
@@ -360,7 +360,7 @@ def _op(session: Session) -> int:
             for task in tasks:
                 if task.status == TaskStatus.PENDING:
                     task.status = TaskStatus.CANCELLED
-                    task.deleted_at = datetime.utcnow()
+                    task.deleted_at = datetime.now(timezone.utc).replace(tzinfo=None)
                     count += 1
             return count
 
 
@@ -3,7 +3,7 @@
 import asyncio
 import os
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Iterable, List, Optional, Set, TYPE_CHECKING
 
@@ -92,6 +92,7 @@ async def execute(self, task) -> None:
                 commands_executed,
                 exit_code,
                 usage_metrics,
+                eval_status,
             ) = await self._run_task_with_timeout(task)
 
             processing_time = int(time.time() - start_time)
@@ -106,6 +107,7 @@ async def execute(self, task) -> None:
                 duration_s=processing_time,
                 total_cost_usd=usage_metrics.get("total_cost_usd"),
                 turns=usage_metrics.get("num_turns"),
+                eval_status=eval_status,
             )
 
             if exit_code != 0:
@@ -157,23 +159,41 @@ async def execute(self, task) -> None:
             else:
                 task_log.warning("task.git.skipped", reason="workspace_missing")
 
-            self.task_queue.mark_completed(task.id, result_id=result.id)
-            self._log_success_metrics(
-                task=task,
-                processing_time=processing_time,
-                files_modified=files_modified,
-                commands_executed=commands_executed,
-                git_commit_sha=git_commit_sha,
-                git_pr_url=git_pr_url,
-                usage_metrics=usage_metrics,
-                result_output=result_output,
-            )
-            task_log.info(
-                "task.complete",
-                status="completed",
-                duration_s=processing_time,
-                git_commit=git_commit_sha,
-            )
+            # Check evaluator status before marking as completed
+            # Fail only on INCOMPLETE/FAILED; COMPLETE, PARTIAL, or no status (evaluator disabled) is marked completed
+            if eval_status and eval_status.upper() in ["INCOMPLETE", "FAILED"]:
+                task_log.warning(
+                    "task.evaluator_incomplete",
+                    eval_status=eval_status,
+                    message="Task marked as failed due to evaluator status"
+                )
+                self.task_queue.mark_failed(task.id, f"Evaluator status: {eval_status}")
+                self._log_failure_metrics(task=task, duration=processing_time, error=f"Evaluator: {eval_status}")
+                task_log.info(
+                    "task.complete",
+                    status="failed",
+                    duration_s=processing_time,
+                    eval_status=eval_status,
+                )
+            else:
+                self.task_queue.mark_completed(task.id, result_id=result.id)
+                self._log_success_metrics(
+                    task=task,
+                    processing_time=processing_time,
+                    files_modified=files_modified,
+                    commands_executed=commands_executed,
+                    git_commit_sha=git_commit_sha,
+                    git_pr_url=git_pr_url,
+                    usage_metrics=usage_metrics,
+                    result_output=result_output,
+                )
+                task_log.info(
+                    "task.complete",
+                    status="completed",
+                    duration_s=processing_time,
+                    git_commit=git_commit_sha,
+                    eval_status=eval_status,
+                )
         except PauseException as pause:
             await self._handle_pause_exception(
                 task=task,
@@ -529,7 +549,7 @@ async def _handle_pause_exception(
 
         sleep_seconds = 0.0
         if pause.reset_time:
-            now = datetime.utcnow()
+            now = datetime.now(timezone.utc).replace(tzinfo=None)
             sleep_seconds = max(0.0, (pause.reset_time - now).total_seconds())
             task_log.info("task.pause.reset_time", reset_at=reset_time_iso)
         else: