From 424616f81330789fa9c3642aed2222ddeef98484 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Fri, 21 Nov 2025 15:58:20 +0800 Subject: [PATCH] Use preemptive EDF scheduler with ecall-based switch This commit introduces a preemptive Earliest Deadline First (EDF) scheduler that uses RISC-V ecall instructions for voluntary context switches while preserving the existing cooperative scheduling mode. The preemptive scheduler required several architectural changes. Tasks now maintain separate stack pointer (sp) fields for ISR-based context switching, distinct from the jmp_buf context used in cooperative mode. The dispatcher accepts a from_timer parameter to distinguish timer-driven preemption from voluntary yields, ensuring tick counters only increment on actual timer interrupts. Context switching in preemptive mode builds ISR stack frames with mepc pointing to task entry points, allowing mret to resume execution. The ecall handler invokes the dispatcher directly, enabling tasks to yield without relying on setjmp/longjmp which are incompatible with interrupt contexts. The cooperative mode preserves its setjmp/longjmp semantics. The dispatcher always calls hal_context_restore() even when the same task continues, because the longjmp completes the save/restore cycle initiated by hal_context_save(). The hal_interrupt_tick() function enables interrupts on a task's first run by detecting when the entry point still resides in the context's return address slot. Real-time scheduling support includes EDF with deadline-based priority calculation, configurable through mo_task_rt_priority(). The RT scheduler hook in KCB allows custom scheduling policies. Delay handling was enhanced with batch updates to minimize critical section duration. The logger subsystem gained a direct_mode flag for ISR-safe output, and printf was made flush-aware to support synchronous output when needed. Exception handling uses trap_puts() to avoid printf deadlock in trap context. Close #26 --- app/rtsched.c | 545 +++++++++++++++++++++++++++++++++++-------- arch/riscv/boot.c | 9 +- arch/riscv/hal.c | 262 +++++++++++++++++---- arch/riscv/hal.h | 17 +- include/sys/logger.h | 18 ++ include/sys/task.h | 29 ++- kernel/logger.c | 69 +++++- kernel/main.c | 5 + kernel/task.c | 208 +++++++++++++++-- lib/stdio.c | 35 ++- 10 files changed, 1013 insertions(+), 184 deletions(-) diff --git a/app/rtsched.c b/app/rtsched.c index ffb98876..56402814 100644 --- a/app/rtsched.c +++ b/app/rtsched.c @@ -1,144 +1,491 @@ #include -/* Task 4: Simple task that prints a message and waits for scheduling */ -static void task4(void) +/* Extended task statistics for fairness validation */ +typedef struct { + uint32_t executions; /* Total number of job executions */ + uint32_t deadline_misses; /* Count of missed deadlines (RT tasks only) */ + uint32_t total_response; /* Sum of response times (release to start) */ + uint32_t max_response, min_response; /* Max/Min response time observed */ + uint32_t period; /* Task period (0 for non-RT) */ + uint32_t deadline; /* Relative deadline (0 for non-RT) */ +} task_stats_t; + +/* Global statistics - indexed by task number 0-4 */ +static task_stats_t task_stats[5]; +static volatile uint32_t test_start_time = 0; + +/* Flag to indicate test has started. + * Note: This has a benign race condition - multiple tasks may observe + * test_started=0 simultaneously and attempt to set test_start_time. 
+ * This is acceptable because: (1) all tasks set similar values (current tick), + * and (2) EDF ensures the highest-priority task runs first anyway. + * A proper fix would use a mutex, but the overhead is unnecessary here. + */ +static volatile int test_started = 0; +static uint32_t test_duration = 50; /* Run for 50 ticks for better statistics */ + +/* Set to 1+ to enable workload simulation for response time testing */ +#define WORKLOAD_TICKS 0 + +/* Simulate workload: busy-wait for WORKLOAD_TICKS timer periods. + * When WORKLOAD_TICKS=0 (default), this is a no-op. + */ +#if WORKLOAD_TICKS > 0 +static void simulate_workload(void) { - while (1) { - printf("Task 4 running\n"); - mo_task_wfi(); /* Wait for interrupt to yield control */ + uint32_t start = mo_ticks(); + while (mo_ticks() - start < WORKLOAD_TICKS) { + /* Busy wait to consume CPU time */ + for (volatile int i = 0; i < 1000; i++) + ; } } +#else +#define simulate_workload() ((void) 0) +#endif -/* Task 3: Simple task that prints a message and waits for scheduling */ -static void task3(void) +/* Task 0: RT task with period=10 */ +static void task0(void) { - while (1) { - printf("Task 3 running\n"); - mo_task_wfi(); /* Wait for interrupt to yield control */ + int idx = 0; + uint32_t period = 10; + uint32_t theoretical_release; + uint32_t job_start; + uint32_t response_time; + + /* Initialize stats */ + task_stats[idx].period = period; + task_stats[idx].deadline = period; /* implicit deadline = period */ + task_stats[idx].min_response = UINT32_MAX; + + /* Initialize test_start_time on first task execution */ + if (!test_started) { + test_start_time = mo_ticks(); + test_started = 1; + } + + /* First job theoretical release time = test start */ + theoretical_release = test_start_time; + + while (mo_ticks() - test_start_time < test_duration) { + /* Record actual start time */ + job_start = mo_ticks(); + + /* Response time = actual start - theoretical release */ + response_time = job_start - theoretical_release; + + /* Track response time statistics */ + task_stats[idx].total_response += response_time; + if (response_time > task_stats[idx].max_response) + task_stats[idx].max_response = response_time; + if (response_time < task_stats[idx].min_response) + task_stats[idx].min_response = response_time; + + /* Check deadline miss (response > deadline) */ + if (response_time > task_stats[idx].deadline) + task_stats[idx].deadline_misses++; + + task_stats[idx].executions++; + + /* Simulate workload to test response time measurement */ + simulate_workload(); + + /* Calculate next theoretical release time */ + theoretical_release += period; + + /* Delay until next period */ + uint32_t now = mo_ticks(); + if (now < theoretical_release) + mo_task_delay(theoretical_release - now); + } + + /* Clear RT priority so EDF stops selecting this task */ + mo_task_rt_priority(mo_task_id(), NULL); + while (1) + mo_task_wfi(); +} + +/* Task 1: RT task with period=15 */ +static void task1(void) +{ + int idx = 1; + uint32_t period = 15; + uint32_t theoretical_release; + uint32_t job_start; + uint32_t response_time; + + /* Initialize stats */ + task_stats[idx].period = period; + task_stats[idx].deadline = period; + task_stats[idx].min_response = UINT32_MAX; + + /* Wait for test to start */ + while (!test_started) + mo_task_delay(1); + + /* First job theoretical release time = test start */ + theoretical_release = test_start_time; + + while (mo_ticks() - test_start_time < test_duration) { + job_start = mo_ticks(); + response_time = job_start - theoretical_release; + 
+ task_stats[idx].total_response += response_time; + if (response_time > task_stats[idx].max_response) + task_stats[idx].max_response = response_time; + if (response_time < task_stats[idx].min_response) + task_stats[idx].min_response = response_time; + if (response_time > task_stats[idx].deadline) + task_stats[idx].deadline_misses++; + + task_stats[idx].executions++; + + /* Simulate workload */ + simulate_workload(); + + theoretical_release += period; + uint32_t now = mo_ticks(); + if (now < theoretical_release) + mo_task_delay(theoretical_release - now); } + + mo_task_rt_priority(mo_task_id(), NULL); + while (1) + mo_task_wfi(); } -/* Task 2: Prints task ID and an incrementing counter, then waits */ +/* Task 2: RT task with period=20 */ static void task2(void) { - int32_t cnt = 300000; + int idx = 2; + uint32_t period = 20; + uint32_t theoretical_release; + uint32_t job_start; + uint32_t response_time; - while (1) { - printf("[Task %d: %ld]\n", mo_task_id(), cnt++); - mo_task_wfi(); /* Yield control to scheduler */ + /* Initialize stats */ + task_stats[idx].period = period; + task_stats[idx].deadline = period; + task_stats[idx].min_response = UINT32_MAX; + + /* Wait for test to start */ + while (!test_started) + mo_task_delay(1); + + /* First job theoretical release time = test start */ + theoretical_release = test_start_time; + + while (mo_ticks() - test_start_time < test_duration) { + job_start = mo_ticks(); + response_time = job_start - theoretical_release; + + task_stats[idx].total_response += response_time; + if (response_time > task_stats[idx].max_response) + task_stats[idx].max_response = response_time; + if (response_time < task_stats[idx].min_response) + task_stats[idx].min_response = response_time; + if (response_time > task_stats[idx].deadline) + task_stats[idx].deadline_misses++; + + task_stats[idx].executions++; + + /* Simulate workload */ + simulate_workload(); + + theoretical_release += period; + uint32_t now = mo_ticks(); + if (now < theoretical_release) + mo_task_delay(theoretical_release - now); } + + mo_task_rt_priority(mo_task_id(), NULL); + while (1) + mo_task_wfi(); } -/* Task 1: Prints task ID and an incrementing counter, then waits */ -static void task1(void) +/* Task 3: Non-RT background task */ +static void task3(void) { - int32_t cnt = 200000; + int idx = 3; + uint32_t period = 25; - while (1) { - printf("[Task %d: %ld]\n", mo_task_id(), cnt++); - mo_task_wfi(); /* Yield control to scheduler */ + task_stats[idx].period = period; + + while (!test_started) + mo_task_delay(1); + + while (mo_ticks() - test_start_time < test_duration) { + task_stats[idx].executions++; + mo_task_delay(period); } + while (1) + mo_task_wfi(); } -/* Task 0: Prints task ID and an incrementing counter, then waits */ -static void task0(void) +/* Print scheduling statistics using stdio with flush for ordered output */ +static void print_stats(void) +{ + /* Flush pending logger output to ensure report appears in order */ + mo_logger_flush(); + + printf("\n========================================\n"); + printf(" EDF Scheduler Statistics Report \n"); + printf("========================================\n"); + printf("Test duration: %lu ticks\n\n", (unsigned long) test_duration); + + printf("--- RT Task Statistics ---\n"); + for (int i = 0; i < 3; i++) { + /* Ceiling division: task at t=0 runs once even for partial periods */ + uint32_t expected = + (test_duration + task_stats[i].period - 1) / task_stats[i].period; + printf("Task %d (period=%lu, deadline=%lu):\n", i, + (unsigned long) 
task_stats[i].period, + (unsigned long) task_stats[i].deadline); + printf(" Executions: %lu (expected: %lu)\n", + (unsigned long) task_stats[i].executions, + (unsigned long) expected); + printf(" Deadline misses: %lu\n", + (unsigned long) task_stats[i].deadline_misses); + + if (task_stats[i].executions > 0) { + uint32_t avg_response = + task_stats[i].total_response / task_stats[i].executions; + uint32_t jitter = + task_stats[i].max_response - task_stats[i].min_response; + printf(" Response time - min: %lu, max: %lu, avg: %lu\n", + (unsigned long) task_stats[i].min_response, + (unsigned long) task_stats[i].max_response, + (unsigned long) avg_response); + printf(" Jitter (max-min): %lu ticks\n", (unsigned long) jitter); + } + printf("\n"); + } + + printf("--- Non-RT Task Statistics ---\n"); + for (int i = 3; i < 5; i++) { + printf("Task %d (period=%lu):\n", i, + (unsigned long) task_stats[i].period); + printf(" Executions: %lu\n\n", + (unsigned long) task_stats[i].executions); + } + + printf("--- Fairness Analysis ---\n"); + + /* 1. Deadline miss check */ + uint32_t total_deadline_misses = 0; + for (int i = 0; i < 3; i++) + total_deadline_misses += task_stats[i].deadline_misses; + printf("1. Deadline misses: %lu %s\n", + (unsigned long) total_deadline_misses, + total_deadline_misses == 0 ? "[PASS]" : "[FAIL]"); + + /* 2. Execution count fairness */ + int exec_ok = 1; + for (int i = 0; i < 3; i++) { + /* Ceiling division: task at t=0 runs once even for partial periods */ + uint32_t expected = + (test_duration + task_stats[i].period - 1) / task_stats[i].period; + uint32_t actual = task_stats[i].executions; + /* Avoid underflow: check actual+1 < expected */ + if (actual + 1 < expected || actual > expected + 1) + exec_ok = 0; + } + printf("2. Execution count: %s\n", exec_ok ? "[PASS] within expected range" + : "[FAIL] unexpected count"); + + /* 3. Response time bounded by deadline */ + int response_ok = 1; + for (int i = 0; i < 3; i++) { + if (task_stats[i].max_response > task_stats[i].deadline) + response_ok = 0; + } + printf("3. Response bounded: %s\n", + response_ok ? "[PASS] max_response <= deadline" + : "[FAIL] response exceeded deadline"); + + /* 4. Jitter analysis */ + int jitter_ok = 1; + for (int i = 0; i < 3; i++) { + if (task_stats[i].executions > 0) { + uint32_t jitter = + task_stats[i].max_response - task_stats[i].min_response; + if (jitter > task_stats[i].period / 2) + jitter_ok = 0; + } + } + printf("4. Jitter acceptable: %s\n", jitter_ok + ? "[PASS] jitter < 50% period" + : "[WARN] high jitter detected"); + + /* 5. Non-RT task starvation check */ + int starvation_ok = + (task_stats[3].executions > 0 || task_stats[4].executions > 0); + printf("5. Non-RT starvation: %s\n", starvation_ok + ? "[PASS] non-RT tasks executed" + : "[FAIL] non-RT tasks starved"); + + /* Overall verdict */ + printf("\n--- Overall Verdict ---\n"); + printf("EDF Scheduler: %s\n", (total_deadline_misses == 0 && exec_ok && + response_ok && starvation_ok) + ? 
"All tests passed" + : "Some tests failed"); + printf("========================================\n"); + + /* Re-enable async logging for any subsequent output */ + mo_logger_async_resume(); +} + +/* Task 4: Statistics collector and reporter */ +static void task4(void) { - int32_t cnt = 100000; + int idx = 4; + + task_stats[idx].period = 1; /* Runs every tick */ + /* Wait for test to start */ + while (!test_started) + mo_task_delay(1); + + /* Monitor test progress */ + while (mo_ticks() - test_start_time < test_duration) { + task_stats[idx].executions++; + mo_task_delay(1); + } + + /* Wait a bit for other tasks to complete */ + mo_task_delay(5); + + /* Print comprehensive statistics */ + print_stats(); + + while (1) + mo_task_wfi(); +} + +/* IDLE task: Always ready, runs when all other tasks blocked */ +static void idle_task(void) +{ while (1) { - printf("[Task %d: %ld]\n", mo_task_id(), cnt++); - mo_task_wfi(); /* Yield control to scheduler */ + /* Just burn CPU cycles - don't yield or delay */ + for (volatile int i = 0; i < 100; i++) + ; } } typedef struct { - unsigned credits; - unsigned remaining; -} custom_prio_t; - -/* A simple credit-based real-time scheduler - * – Every RT task carries a custom_prio_t record via its tcb_t::rt_prio field. - * – Each time the scheduler selects a task it decrements "remaining". - * When "remaining" reaches zero it is reloaded from "credits" on the task’s - * next turn. - * – The function returns the ID of the selected RT task, or –1 when no RT task - * is ready so the kernel should fall back to its round-robin scheduler. + uint32_t period; /* Task period in ticks */ + uint32_t deadline; /* Absolute deadline (ticks) */ +} edf_prio_t; + +/* Earliest Deadline First (EDF) real-time scheduler + * – Every RT task carries an edf_prio_t record via its tcb_t::rt_prio field. + * – The scheduler selects the READY RT task with the earliest absolute + * deadline. – When a task is selected, its deadline advances to the next + * period. – Returns the ID of the selected RT task, or -1 when no RT task is + * ready. + * + * Deadline Update Strategy: + * – Deadline advances (deadline += period) when a task is selected from READY. + * – For periodic tasks that delay for their period (mo_task_delay(period)), + * this approximates correct EDF semantics: tasks become READY at period + * boundaries, get selected shortly after, and deadline advances correctly. + * – This approach is simpler than tracking job releases separately. + * – Tasks must delay for their period to ensure correct periodic behavior. + * + * EDF is optimal for single-core systems: if any scheduler can meet all + * deadlines, EDF can. Complexity: O(n) where n = number of RT tasks. 
*/ -static int32_t custom_sched(void) +static int32_t edf_sched(void) { - static list_node_t *task_node = NULL; /* resume point */ - - /* If we have no starting point or we’ve wrapped, begin at head->next */ - if (!task_node) - task_node = list_next(kcb->tasks->head); - - /* Scan at most one full loop of the list */ - list_node_t *start = task_node; - do { - if (!task_node) /* empty list */ - return -1; - - /* Skip head/tail sentinels and NULL-data nodes */ - if (task_node == kcb->tasks->head || task_node == kcb->tasks->tail || - !task_node->data) { - task_node = list_next(task_node); + tcb_t *earliest = NULL; + uint32_t earliest_deadline = UINT32_MAX; + + /* Scan all tasks to find the one with earliest deadline */ + list_node_t *node = list_next(kcb->tasks->head); + while (node && node != kcb->tasks->tail) { + if (!node->data) { + node = list_next(node); continue; } - /* Safe: data is non-NULL here */ - tcb_t *task = (tcb_t *) task_node->data; - - /* READY + RT-eligible ? */ - if (task->state == TASK_READY && task->rt_prio) { - /* Consume one credit */ - custom_prio_t *cp = (custom_prio_t *) task->rt_prio; - if (cp->remaining == 0) - cp->remaining = cp->credits; - cp->remaining--; - - /* Next time resume with the following node */ - task_node = list_next(task_node); - if (task_node == kcb->tasks->head || task_node == kcb->tasks->tail) - task_node = list_next(task_node); /* skip sentinel */ - return task->id; + tcb_t *task = (tcb_t *) node->data; + + /* Consider both READY and RUNNING RT tasks for preemptive scheduling */ + if ((task->state == TASK_READY || task->state == TASK_RUNNING) && + task->rt_prio) { + edf_prio_t *edf = (edf_prio_t *) task->rt_prio; + + /* Track task with earliest deadline */ + if (edf->deadline < earliest_deadline) { + earliest_deadline = edf->deadline; + earliest = task; + } } - /* Otherwise advance */ - task_node = list_next(task_node); - } while (task_node != start); /* one full lap */ + node = list_next(node); + } + + /* DON'T advance deadline here - that would happen on EVERY scheduler call! + * Deadline should only advance when task actually releases next job. + * For now, just return the selected task. Deadline advancement will happen + * when task becomes READY again after delay expires. + */ - /* No READY RT task this cycle */ - task_node = NULL; /* restart next */ - return -1; + /* Return selected task ID, or -1 if no RT task is ready */ + return earliest ? earliest->id : -1; } /* Application Entry Point: Initializes tasks and scheduler * - * Spawns five tasks, assigns real-time priorities to tasks 0, 1, and 2, - * and sets up the custom credit-based scheduler. Enables preemptive mode. 
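+ * Spawns five worker tasks plus an always-ready idle task, installs the
+ * EDF hook in kcb->rt_sched, and assigns EDF priorities to tasks 0-2.
+ *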
+ * RT Task Configuration (EDF scheduling): + * - Task 0: period = 10 ticks, utilization = 10% + * - Task 1: period = 15 ticks, utilization = 6.7% + * - Task 2: period = 20 ticks, utilization = 5% + * - Task 3: Non-RT background task (period = 25 ticks) + * - Task 4: Non-RT background task (period = 25 ticks) + * + * Total RT Utilization: ~21.7% (well under EDF's 100% bound) */ int32_t app_main(void) { - /* Define RT task priorities with initial credit values */ - static custom_prio_t priorities[3] = { - {.credits = 3, .remaining = 3}, /* Task 0 */ - {.credits = 4, .remaining = 4}, /* Task 1 */ - {.credits = 5, .remaining = 5}, /* Task 2 */ - }; - - /* Spawn tasks with default stack size */ - mo_task_spawn(task0, DEFAULT_STACK_SIZE); - mo_task_spawn(task1, DEFAULT_STACK_SIZE); - mo_task_spawn(task2, DEFAULT_STACK_SIZE); - mo_task_spawn(task3, DEFAULT_STACK_SIZE); - mo_task_spawn(task4, DEFAULT_STACK_SIZE); - - /* Configure custom scheduler and assign RT priorities */ - kcb->rt_sched = custom_sched; - mo_task_rt_priority(0, &priorities[0]); - mo_task_rt_priority(1, &priorities[1]); - mo_task_rt_priority(2, &priorities[2]); - - /* preemptive scheduling */ + /* test_start_time will be initialized by first task that runs */ + + /* Spawn all 5 RT/background tasks first */ + int32_t tid0 = mo_task_spawn(task0, DEFAULT_STACK_SIZE); + int32_t tid1 = mo_task_spawn(task1, DEFAULT_STACK_SIZE); + int32_t tid2 = mo_task_spawn(task2, DEFAULT_STACK_SIZE); + (void) mo_task_spawn(task3, DEFAULT_STACK_SIZE); /* Non-RT task 3 */ + /* Non-RT task 4 - displays stats */ + (void) mo_task_spawn(task4, DEFAULT_STACK_SIZE); + + /* Spawn IDLE task LAST so it's at end of round-robin list. + * This ensures other ready tasks get scheduled before IDLE. + */ + (void) mo_task_spawn(idle_task, DEFAULT_STACK_SIZE); + + /* Configure EDF priorities for RT tasks 0-2 with deadlines relative to + * current time */ + uint32_t now = mo_ticks(); + static edf_prio_t priorities[3]; + priorities[0].period = 10; + priorities[0].deadline = now + 10; + priorities[1].period = 15; + priorities[1].deadline = now + 15; + priorities[2].period = 20; + priorities[2].deadline = now + 20; + + /* Install EDF scheduler BEFORE setting priorities */ + kcb->rt_sched = edf_sched; + + mo_task_rt_priority(tid0, &priorities[0]); + mo_task_rt_priority(tid1, &priorities[1]); + mo_task_rt_priority(tid2, &priorities[2]); + + /* Tasks 3-4 are non-RT, will use round-robin when no RT tasks ready */ + + printf("[RTSCHED] Current tick: %lu\n", (unsigned long) mo_ticks()); + + /* Return 1 for preemptive mode */ return 1; } diff --git a/arch/riscv/boot.c b/arch/riscv/boot.c index ef025de8..37978025 100644 --- a/arch/riscv/boot.c +++ b/arch/riscv/boot.c @@ -156,12 +156,19 @@ __attribute__((naked, aligned(4))) void _isr(void) /* Save trap-related CSRs and prepare arguments for do_trap */ "csrr a0, mcause\n" /* Arg 1: cause */ "csrr a1, mepc\n" /* Arg 2: epc */ + "mv a2, sp\n" /* Arg 3: isr_sp (current stack frame) */ "sw a0, 30*4(sp)\n" "sw a1, 31*4(sp)\n" - /* Call the high-level C trap handler */ + /* Call the high-level C trap handler. + * Returns: a0 = SP to use for restoring context (may be different + * task's stack if context switch occurred). + */ "call do_trap\n" + /* Use returned SP for context restore (enables context switching) */ + "mv sp, a0\n" + /* Restore context. 
mepc might have been modified by the handler */ "lw a1, 31*4(sp)\n" "csrw mepc, a1\n" diff --git a/arch/riscv/hal.c b/arch/riscv/hal.c index ae10a653..04ecc7d0 100644 --- a/arch/riscv/hal.c +++ b/arch/riscv/hal.c @@ -42,6 +42,19 @@ */ #define ISR_STACK_FRAME_SIZE 128 +/* Global variable to hold the new stack pointer for pending context switch. + * When a context switch is needed, hal_switch_stack() saves the current SP + * and stores the new SP here. The ISR epilogue then uses this value. + * NULL means no context switch is pending, use current SP. + */ +static void *pending_switch_sp = NULL; + +/* Global variable to hold the ISR frame SP for the current trap. + * Set at the start of do_trap() so hal_switch_stack() can save the correct + * SP to the previous task (the ISR frame SP, not the current function's SP). + */ +static uint32_t current_isr_frame_sp = 0; + /* NS16550A UART0 - Memory-mapped registers for the QEMU 'virt' machine's serial * port. */ @@ -248,31 +261,48 @@ void hal_cpu_idle(void) /* Interrupt and Trap Handling */ +/* Direct UART output for trap context (avoids printf deadlock) */ +extern int _putchar(int c); +static void trap_puts(const char *s) +{ + while (*s) + _putchar(*s++); +} + +/* Exception message table per RISC-V Privileged Spec */ +static const char *exc_msg[] = { + [0] = "Instruction address misaligned", + [1] = "Instruction access fault", + [2] = "Illegal instruction", + [3] = "Breakpoint", + [4] = "Load address misaligned", + [5] = "Load access fault", + [6] = "Store/AMO address misaligned", + [7] = "Store/AMO access fault", + [8] = "Environment call from U-mode", + [9] = "Environment call from S-mode", + [10] = "Reserved", + [11] = "Environment call from M-mode", + [12] = "Instruction page fault", + [13] = "Load page fault", + [14] = "Reserved", + [15] = "Store/AMO page fault", +}; + /* C-level trap handler, called by the '_isr' assembly routine. * @cause : The value of the 'mcause' CSR, indicating the reason for the trap. * @epc : The value of the 'mepc' CSR, the PC at the time of the trap. + * @isr_sp: The stack pointer pointing to the ISR frame. + * + * Returns The SP to use for restoring context (same or new task's frame). */ -void do_trap(uint32_t cause, uint32_t epc) +uint32_t do_trap(uint32_t cause, uint32_t epc, uint32_t isr_sp) { - static const char *exc_msg[] = { - /* For printing helpful debug messages */ - [0] = "Instruction address misaligned", - [1] = "Instruction access fault", - [2] = "Illegal instruction", - [3] = "Breakpoint", - [4] = "Load address misaligned", - [5] = "Load access fault", - [6] = "Store/AMO address misaligned", - [7] = "Store/AMO access fault", - [8] = "Environment call from U-mode", - [9] = "Environment call from S-mode", - [10] = "Reserved", - [11] = "Environment call from M-mode", - [12] = "Instruction page fault", - [13] = "Load page fault", - [14] = "Reserved", - [15] = "Store/AMO page fault", - }; + /* Reset pending switch at start of every trap */ + pending_switch_sp = NULL; + + /* Store ISR frame SP so hal_switch_stack() can save it to prev task */ + current_isr_frame_sp = isr_sp; if (MCAUSE_IS_INTERRUPT(cause)) { /* Asynchronous Interrupt */ uint32_t int_code = MCAUSE_GET_CODE(cause); @@ -282,28 +312,64 @@ void do_trap(uint32_t cause, uint32_t epc) * consistent tick frequency even with interrupt latency. 
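 * (Reloading from the previous mtimecmp value, rather than from the
 * current mtime, keeps the tick period drift-free: latency in handling
 * one interrupt shortens the next interval instead of accumulating.)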
*/ mtimecmp_w(mtimecmp_r() + (F_CPU / F_TIMER)); - dispatcher(); /* Invoke the OS scheduler */ + /* Invoke scheduler - parameter 1 = from timer, increment ticks */ + dispatcher(1); } else { /* All other interrupt sources are unexpected and fatal */ - printf("[UNHANDLED INTERRUPT] code=%u, cause=%08x, epc=%08x\n", - int_code, cause, epc); hal_panic(); } } else { /* Synchronous Exception */ uint32_t code = MCAUSE_GET_CODE(cause); - const char *reason = "Unknown exception"; + + /* Handle ecall from M-mode - used for yielding in preemptive mode */ + if (code == MCAUSE_ECALL_MMODE) { + /* Advance mepc past the ecall instruction (4 bytes) */ + uint32_t new_epc = epc + 4; + write_csr(mepc, new_epc); + + /* Also update mepc in the ISR frame on the stack! + * The ISR epilogue will restore mepc from the frame (offset 31*4 = + * 124 bytes). If we don't update the frame, mret will jump back to + * the ecall instruction! + */ + uint32_t *isr_frame = (uint32_t *) isr_sp; + isr_frame[31] = new_epc; + + /* Invoke dispatcher for context switch - parameter 0 = from ecall, + * don't increment ticks. + */ + dispatcher(0); + + /* Return the SP to use - new task's frame or current frame */ + return pending_switch_sp ? (uint32_t) pending_switch_sp : isr_sp; + } + + /* Print exception info via direct UART (safe in trap context) */ + trap_puts("[EXCEPTION] "); if (code < ARRAY_SIZE(exc_msg) && exc_msg[code]) - reason = exc_msg[code]; - printf("[EXCEPTION] code=%u (%s), epc=%08x, cause=%08x\n", code, reason, - epc, cause); + trap_puts(exc_msg[code]); + else + trap_puts("Unknown"); + trap_puts(" epc=0x"); + for (int i = 28; i >= 0; i -= 4) { + uint32_t nibble = (epc >> i) & 0xF; + _putchar(nibble < 10 ? '0' + nibble : 'A' + nibble - 10); + } + trap_puts("\r\n"); + hal_panic(); } + + /* Return the SP to use for context restore - new task's frame or current */ + return pending_switch_sp ? (uint32_t) pending_switch_sp : isr_sp; } /* Enables the machine-level timer interrupt source */ void hal_timer_enable(void) { - mtimecmp_w(mtime_r() + (F_CPU / F_TIMER)); + uint64_t now = mtime_r(); + uint64_t target = now + (F_CPU / F_TIMER); + mtimecmp_w(target); write_csr(mie, read_csr(mie) | MIE_MTIE); } @@ -313,20 +379,66 @@ void hal_timer_disable(void) write_csr(mie, read_csr(mie) & ~MIE_MTIE); } -/* Hook called by the scheduler after a context switch. - * Its primary purpose is to enable global interrupts ('mstatus.MIE') only - * AFTER the first task has been launched. This ensures interrupts are not - * globally enabled until the OS is fully running in a valid task context. +/* Enable timer interrupt bit only - does NOT reset mtimecmp. + * Use this for NOSCHED_LEAVE to avoid pushing the interrupt deadline forward. */ -void hal_interrupt_tick(void) +void hal_timer_irq_enable(void) { - tcb_t *task = kcb->task_current->data; - if (unlikely(!task)) - hal_panic(); /* Fatal error - invalid task state */ + write_csr(mie, read_csr(mie) | MIE_MTIE); +} - /* The task's entry point is still in RA, so this is its very first run */ - if ((uint32_t) task->entry == task->context[CONTEXT_RA]) - _ei(); /* Enable global interrupts now that execution is in a task */ +/* Disable timer interrupt bit only - does NOT touch mtimecmp. + * Use this for NOSCHED_ENTER to temporarily disable preemption. + */ +void hal_timer_irq_disable(void) +{ + write_csr(mie, read_csr(mie) & ~MIE_MTIE); +} + +/* Linker script symbols - needed for task initialization */ +extern uint32_t _gp, _end; + +/* Build initial ISR frame on task stack for preemptive mode. 
+ * Returns the stack pointer that points to the frame. + * When ISR restores from this frame, it will jump to task_entry. + * + * CRITICAL: ISR deallocates the frame before mret (sp += 128). + * We place the frame such that after deallocation, SP is at a safe location. + * + * ISR Stack Frame Layout (must match boot.c _isr): + * 0: ra, 4: gp, 8: tp, 12: t0, ... 116: t6 + * 120: mcause, 124: mepc + */ +void *hal_build_initial_frame(void *stack_top, void (*task_entry)(void)) +{ +#define INITIAL_STACK_RESERVE \ + 256 /* Reserve space below stack_top for task startup */ + + /* Place frame deeper in stack so after ISR deallocates (sp += 128), + * SP will be at (stack_top - INITIAL_STACK_RESERVE), not at stack_top. + */ + uint32_t *frame = + (uint32_t *) ((uint8_t *) stack_top - INITIAL_STACK_RESERVE - + ISR_STACK_FRAME_SIZE); + + /* Zero out entire frame */ + for (int i = 0; i < 32; i++) { + frame[i] = 0; + } + + /* Compute tp value same as boot.c: aligned to 64 bytes from _end */ + uint32_t tp_val = ((uint32_t) &_end + 63) & ~63U; + + /* Initialize critical registers for proper task startup: + * - frame[1] = gp: Global pointer, required for accessing global variables + * - frame[2] = tp: Thread pointer, required for thread-local storage + * - frame[31] = mepc: Task entry point, where mret will jump to + */ + frame[1] = (uint32_t) &_gp; /* gp - global pointer */ + frame[2] = tp_val; /* tp - thread pointer */ + frame[31] = (uint32_t) task_entry; /* mepc - entry point */ + + return (void *) frame; } /* Context Switching */ @@ -468,6 +580,18 @@ __attribute__((noreturn)) void hal_context_restore(jmp_buf env, int32_t val) if (unlikely(!env)) hal_panic(); /* Cannot proceed with invalid context */ + /* Validate RA is in text section (simple sanity check) */ + uint32_t ra = env[15]; /* CONTEXT_RA = 15 */ + if (ra < 0x80000000 || ra > 0x80010000) { + trap_puts("[CTX_ERR] Bad RA=0x"); + for (int i = 28; i >= 0; i -= 4) { + uint32_t nibble = (ra >> i) & 0xF; + _putchar(nibble < 10 ? '0' + nibble : 'A' + nibble - 10); + } + trap_puts("\r\n"); + hal_panic(); + } + if (val == 0) val = 1; /* Must return a non-zero value after restore */ @@ -503,12 +627,60 @@ __attribute__((noreturn)) void hal_context_restore(jmp_buf env, int32_t val) __builtin_unreachable(); /* Tell compiler this point is never reached */ } +/* Stack pointer switching for preemptive context switch. + * Saves current SP to *old_sp and loads new SP from new_sp. + * Called by dispatcher when switching tasks in preemptive mode. + * After this returns, ISR will restore registers from the new stack. + * + * @old_sp: Pointer to location where current SP should be saved + * @new_sp: New stack pointer to switch to + */ +void hal_switch_stack(void **old_sp, void *new_sp) +{ + /* Save the ISR frame SP (NOT current SP which is deep in call stack!) + * to prev task. DO NOT change SP here - that would corrupt the C call + * stack! Instead, store new_sp in pending_switch_sp for ISR epilogue. + */ + *old_sp = (void *) current_isr_frame_sp; + + /* Set pending switch - ISR epilogue will use this SP for restore */ + pending_switch_sp = new_sp; +} + +/* Enable interrupts on first run of a task. + * Checks if task's return address still points to entry (meaning it hasn't + * run yet), and if so, enables global interrupts. 
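+ *
+ * (hal_context_init() stores the entry point in CONTEXT_RA, and the first
+ * hal_context_save() overwrites that slot with a real return address, so
+ * this comparison only matches before the task has ever run.)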
+ */ +void hal_interrupt_tick(void) +{ + tcb_t *task = kcb->task_current->data; + if (unlikely(!task)) + hal_panic(); + + /* The task's entry point is still in RA, so this is its very first run */ + if ((uint32_t) task->entry == task->context[CONTEXT_RA]) + _ei(); +} + /* Low-level context restore helper. Expects a pointer to a 'jmp_buf' in 'a0'. - * Restores the GPRs and jumps to the restored return address. + * Restores the GPRs, mstatus, and jumps to the restored return address. + * + * This function must restore mstatus from the context to be + * consistent with hal_context_restore(). The first task context is initialized + * with MSTATUS_MIE | MSTATUS_MPP_MACH by hal_context_init(), which enables + * interrupts. Failing to restore this value would create an inconsistency + * where the first task inherits the kernel's mstatus instead of its own. */ static void __attribute__((naked, used)) __dispatch_init(void) { asm volatile( + /* Restore mstatus FIRST to ensure correct processor state. + * This is critical for interrupt enable state (MSTATUS_MIE). + * Context was initialized with MIE=1 by hal_context_init(). + */ + "lw t0, 16*4(a0)\n" + "csrw mstatus, t0\n" + /* Now restore all general-purpose registers */ "lw s0, 0*4(a0)\n" "lw s1, 1*4(a0)\n" "lw s2, 2*4(a0)\n" @@ -536,6 +708,7 @@ __attribute__((noreturn)) void hal_dispatch_init(jmp_buf env) if (kcb->preemptive) hal_timer_enable(); + _ei(); /* Enable global interrupts just before launching the first task */ asm volatile( @@ -574,6 +747,15 @@ void hal_context_init(jmp_buf *ctx, size_t sp, size_t ss, size_t ra) /* Zero the context for predictability */ memset(ctx, 0, sizeof(*ctx)); + /* Compute tp value same as boot.c: aligned to 64 bytes from _end */ + uint32_t tp_val = ((uint32_t) &_end + 63) & ~63U; + + /* Set global pointer and thread pointer for proper task execution. + * These are critical for accessing global variables and TLS. + */ + (*ctx)[CONTEXT_GP] = (uint32_t) &_gp; + (*ctx)[CONTEXT_TP] = tp_val; + /* Set the essential registers for a new task: * - SP is set to the prepared top of the task's stack. * - RA is set to the task's entry point. diff --git a/arch/riscv/hal.h b/arch/riscv/hal.h index 8354264e..45a16409 100644 --- a/arch/riscv/hal.h +++ b/arch/riscv/hal.h @@ -76,6 +76,12 @@ int32_t hal_context_save(jmp_buf env); void hal_context_restore(jmp_buf env, int32_t val); void hal_dispatch_init(jmp_buf env); +/* Stack switching for preemptive context switch. + * Saves current SP to *old_sp and loads new SP from new_sp. + * Used by dispatcher when switching tasks in preemptive mode. + */ +void hal_switch_stack(void **old_sp, void *new_sp); + /* Provides a blocking, busy-wait delay. * This function monopolizes the CPU and should only be used for very short * delays or in pre-scheduling initialization code. @@ -92,7 +98,14 @@ uint64_t _read_us(void); void hal_hardware_init(void); void hal_timer_enable(void); void hal_timer_disable(void); -void hal_interrupt_tick(void); +void hal_timer_irq_enable( + void); /* Enable timer interrupt bit only (for NOSCHED) */ +void hal_timer_irq_disable( + void); /* Disable timer interrupt bit only (for NOSCHED) */ +void hal_interrupt_tick(void); /* Enable interrupts on first task run */ +void *hal_build_initial_frame( + void *stack_top, + void (*task_entry)(void)); /* Build ISR frame for preemptive mode */ /* Initializes the context structure for a new task. * @ctx : Pointer to jmp_buf to initialize (must be non-NULL). 
@@ -109,4 +122,4 @@ void hal_panic(void); void hal_cpu_idle(void); /* Default stack size for new tasks if not otherwise specified */ -#define DEFAULT_STACK_SIZE 4096 +#define DEFAULT_STACK_SIZE 8192 diff --git a/include/sys/logger.h b/include/sys/logger.h index eb0ecf7a..e4a3df75 100644 --- a/include/sys/logger.h +++ b/include/sys/logger.h @@ -58,3 +58,21 @@ uint32_t mo_logger_queue_depth(void); * Returns total dropped message count since logger init */ uint32_t mo_logger_dropped_count(void); + +/* Check if logger is in direct output mode. + * Lock-free read for performance - safe to call frequently. + * Returns true if printf/puts should bypass the queue. + */ +bool mo_logger_direct_mode(void); + +/* Flush all pending messages and enter direct output mode. + * Drains the queue directly from caller's context. + * After flush, printf/puts bypass the queue for ordered output. + * Call mo_logger_async_resume() to re-enable async logging. + */ +void mo_logger_flush(void); + +/* Re-enable async logging after a flush. + * Call this after completing ordered output that required direct mode. + */ +void mo_logger_async_resume(void); diff --git a/include/sys/task.h b/include/sys/task.h index dc6410af..0d3aaa4d 100644 --- a/include/sys/task.h +++ b/include/sys/task.h @@ -67,6 +67,7 @@ enum task_states { typedef struct tcb { /* Context and Stack Management */ jmp_buf context; /* Saved CPU context (GPRs, SP, PC) for task switching */ + void *sp; /* Saved stack pointer for preemptive context switch */ void *stack; /* Pointer to base of task's allocated stack memory */ size_t stack_sz; /* Total size of the stack in bytes */ void (*entry)(void); /* Task's entry point function */ @@ -145,21 +146,29 @@ extern kcb_t *kcb; _ei(); \ } while (0) +/* Flag indicating scheduler has started - prevents timer IRQ during early + * initializations. + */ +extern volatile bool scheduler_started; + /* Disable/enable ONLY the scheduler timer interrupt. * Lighter-weight critical section that prevents task preemption but allows * other hardware interrupts (e.g., UART) to be serviced, minimizing latency. * Use when protecting data shared between tasks. + * + * NOSCHED_LEAVE only enables timer if scheduler has started, preventing + * premature timer interrupts during early initialization (e.g., logger init). */ -#define NOSCHED_ENTER() \ - do { \ - if (kcb->preemptive) \ - hal_timer_disable(); \ +#define NOSCHED_ENTER() \ + do { \ + if (kcb->preemptive) \ + hal_timer_irq_disable(); \ } while (0) -#define NOSCHED_LEAVE() \ - do { \ - if (kcb->preemptive) \ - hal_timer_enable(); \ +#define NOSCHED_LEAVE() \ + do { \ + if (kcb->preemptive && scheduler_started) \ + hal_timer_irq_enable(); \ } while (0) /* Core Kernel and Task Management API */ @@ -169,8 +178,8 @@ extern kcb_t *kcb; /* Prints a fatal error message and halts the system */ void panic(int32_t ecode); -/* Main scheduler dispatch function, called by the timer ISR */ -void dispatcher(void); +/* Main scheduler dispatch function, called by timer ISR or ecall */ +void dispatcher(int from_timer); /* Architecture-specific context switch implementations */ void _dispatch(void); diff --git a/kernel/logger.c b/kernel/logger.c index 701a36f7..f2fe2733 100644 --- a/kernel/logger.c +++ b/kernel/logger.c @@ -28,6 +28,12 @@ typedef struct { mutex_t lock; /* Protects queue manipulation, not UART output */ int32_t task_id; bool initialized; + + /* When true, printf bypasses queue. + * volatile: prevent compiler caching for lock-free read. 
Written under + * mutex, read without - safe on single-core. + */ + volatile bool direct_mode; } logger_state_t; static logger_state_t logger; @@ -75,8 +81,8 @@ int32_t mo_logger_init(void) if (mo_mutex_init(&logger.lock) != ERR_OK) return ERR_FAIL; - /* 512B stack: simple operations only (no printf/recursion/ISR use) */ - logger.task_id = mo_task_spawn(logger_task, 512); + /* 1024B stack: space for log_entry_t (130B) + ISR frame (128B) + calls */ + logger.task_id = mo_task_spawn(logger_task, 1024); if (logger.task_id < 0) { mo_mutex_destroy(&logger.lock); return ERR_FAIL; @@ -149,3 +155,62 @@ uint32_t mo_logger_dropped_count(void) return dropped; } + +/* Check if logger is in direct output mode. + * Lock-free read: safe because direct_mode is only set atomically by flush + * and cleared by async_resume, both under mutex protection. Reading a stale + * value is benign (worst case: one extra direct output or one queued message). + */ +bool mo_logger_direct_mode(void) +{ + return logger.initialized && logger.direct_mode; +} + +/* Flush all pending messages and enter direct output mode. + * Drains the queue directly from caller's context, bypassing logger task. + * After flush, printf/puts bypass the queue for ordered output. + * Call mo_logger_async_resume() to re-enable async logging. + */ +void mo_logger_flush(void) +{ + if (!logger.initialized) + return; + + log_entry_t entry; + + while (1) { + bool have_message = false; + + mo_mutex_lock(&logger.lock); + if (logger.count > 0) { + memcpy(&entry, &logger.queue[logger.tail], sizeof(log_entry_t)); + logger.tail = (logger.tail + 1) % LOG_QSIZE; + logger.count--; + have_message = true; + } else { + /* Queue drained: enter direct mode while still holding lock */ + logger.direct_mode = true; + } + mo_mutex_unlock(&logger.lock); + + if (!have_message) + break; + + /* Output outside lock */ + for (uint16_t i = 0; i < entry.length; i++) + _putchar(entry.data[i]); + } +} + +/* Re-enable async logging after a flush. + * Call this after completing ordered output that required direct mode. + */ +void mo_logger_async_resume(void) +{ + if (!logger.initialized) + return; + + mo_mutex_lock(&logger.lock); + logger.direct_mode = false; + mo_mutex_unlock(&logger.lock); +} diff --git a/kernel/main.c b/kernel/main.c index c1583951..ce0dc08a 100644 --- a/kernel/main.c +++ b/kernel/main.c @@ -67,6 +67,11 @@ int32_t main(void) if (!first_task) panic(ERR_NO_TASKS); + /* Mark scheduler as started - enables timer IRQ in NOSCHED_LEAVE. + * Must be set before hal_dispatch_init() which enables preemption. + */ + scheduler_started = true; + hal_dispatch_init(first_task->context); /* This line should be unreachable. */ diff --git a/kernel/task.c b/kernel/task.c index de17913b..84f048a3 100644 --- a/kernel/task.c +++ b/kernel/task.c @@ -29,6 +29,12 @@ static kcb_t kernel_state = { }; kcb_t *kcb = &kernel_state; +/* Flag to track if scheduler has started - prevents timer IRQ during early + * init. NOSCHED_LEAVE checks this to avoid enabling timer before scheduler is + * ready. + */ +volatile bool scheduler_started = false; + /* timer work management for reduced latency */ static volatile uint32_t timer_work_pending = 0; /* timer work types */ static volatile uint32_t timer_work_generation = 0; /* counter for coalescing */ @@ -178,6 +184,22 @@ static list_node_t *delay_update_batch(list_node_t *node, void *arg) if (t->delay > 0) { if (--t->delay == 0) { t->state = TASK_READY; + + /* If this is an RT task, set its deadline for the next job. 
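+ * (For example, a task with period = 10 whose delay expires at tick 47
+ * gets deadline 57 from the kcb->ticks + period update below.)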
+ * For periodic tasks, deadline should be current_time + period. + * This ensures tasks are scheduled based on their actual deadlines, + * not inflated values from previous scheduler calls. + */ + if (t->rt_prio) { + typedef struct { + uint32_t period; + uint32_t deadline; + } edf_prio_t; + edf_prio_t *edf = (edf_prio_t *) t->rt_prio; + extern kcb_t *kcb; + edf->deadline = kcb->ticks + edf->period; + } + /* Add to appropriate priority ready queue */ sched_enqueue_task(t); (*ready_count)++; @@ -369,9 +391,13 @@ void sched_tick_current_task(void) if (current_task->time_slice > 0) current_task->time_slice--; - /* If time slice expired, force immediate rescheduling */ + /* If time slice expired, mark task as ready for rescheduling. + * Don't call _dispatch() here - let the normal dispatcher() flow handle it. + * Calling _dispatch() from within dispatcher() causes double-dispatch bug. + */ if (current_task->time_slice == 0) { - _dispatch(); + if (current_task->state == TASK_RUNNING) + current_task->state = TASK_READY; } } @@ -443,7 +469,29 @@ uint16_t sched_select_next_task(void) } while (node != start_node && ++iterations < SCHED_IMAX); - /* No ready tasks found - this should not happen in normal operation */ + /* No ready tasks found in preemptive mode - all tasks are blocked. + * This is normal for periodic RT tasks waiting for their next period. + * We CANNOT return a BLOCKED task as that would cause it to run. + * Instead, find ANY task (even blocked) as a placeholder, then wait for + * interrupt. + */ + if (kcb->preemptive) { + /* Select any task as placeholder (dispatcher won't actually switch to + * it if blocked) */ + list_node_t *any_node = list_next(kcb->tasks->head); + while (any_node && any_node != kcb->tasks->tail) { + if (any_node->data) { + kcb->task_current = any_node; + tcb_t *any_task = any_node->data; + return any_task->id; + } + any_node = list_next(any_node); + } + /* No tasks at all - this is a real error */ + panic(ERR_NO_TASKS); + } + + /* In cooperative mode, having no ready tasks is an error */ panic(ERR_NO_TASKS); return 0; } @@ -454,10 +502,14 @@ static int32_t noop_rtsched(void) return -1; } -/* The main entry point from the system tick interrupt. */ -void dispatcher(void) +/* The main entry point from interrupts (timer or ecall). + * Parameter: from_timer = 1 if called from timer ISR (increment ticks), + * = 0 if called from ecall (don't increment ticks) + */ +void dispatcher(int from_timer) { - kcb->ticks++; + if (from_timer) + kcb->ticks++; /* Handle time slice for current task */ sched_tick_current_task(); @@ -475,12 +527,15 @@ void dispatch(void) if (unlikely(!kcb || !kcb->task_current || !kcb->task_current->data)) panic(ERR_NO_TASKS); - /* Save current context using dedicated HAL routine that handles both - * execution context and processor state for context switching. - * Returns immediately if this is the restore path. + /* Save current context - only needed for cooperative mode. + * In preemptive mode, ISR already saved context to stack, + * so we skip this step to avoid interference. 
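+ * (hal_context_save() follows setjmp semantics: it returns 0 on the save
+ * path and non-zero when resumed via hal_context_restore(), so the early
+ * return below fires only on the resume path.)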
*/ - if (hal_context_save(((tcb_t *) kcb->task_current->data)->context) != 0) - return; + if (!kcb->preemptive) { + /* Cooperative mode: use setjmp/longjmp mechanism */ + if (hal_context_save(((tcb_t *) kcb->task_current->data)->context) != 0) + return; + } #if CONFIG_STACK_PROTECTION /* Do stack check less frequently to reduce overhead */ @@ -488,18 +543,99 @@ void dispatch(void) task_stack_check(); #endif - /* Batch process task delays for better efficiency */ + /* Batch process task delays for better efficiency. + * Only process delays if tick has advanced to avoid decrementing multiple + * times per tick when dispatch() is called multiple times. + */ uint32_t ready_count = 0; - list_foreach(kcb->tasks, delay_update_batch, &ready_count); + static uint32_t last_delay_update_tick = 0; + if (kcb->ticks != last_delay_update_tick) { + list_foreach(kcb->tasks, delay_update_batch, &ready_count); + last_delay_update_tick = kcb->ticks; + } /* Hook for real-time scheduler - if it selects a task, use it */ - if (kcb->rt_sched() < 0) - sched_select_next_task(); /* Use O(1) priority scheduler */ + tcb_t *prev_task = kcb->task_current->data; + int32_t rt_task_id = kcb->rt_sched(); + + if (rt_task_id < 0) { + sched_select_next_task(); /* Use O(n) round-robin scheduler */ + } else { + /* RT scheduler selected a task - update current task pointer */ + list_node_t *rt_node = find_task_node_by_id((uint16_t) rt_task_id); + if (rt_node && rt_node->data) { + tcb_t *rt_task = rt_node->data; + /* Different task - perform context switch */ + if (rt_node != kcb->task_current) { + if (kcb->task_current && kcb->task_current->data) { + tcb_t *prev = kcb->task_current->data; + if (prev->state == TASK_RUNNING) + prev->state = TASK_READY; + } + /* Switch to RT task */ + kcb->task_current = rt_node; + rt_task->state = TASK_RUNNING; + rt_task->time_slice = + get_priority_timeslice(rt_task->prio_level); + } + /* If same task selected, fall through to do_context_switch + * which will check if task is blocked and handle appropriately */ + } else { + /* RT task not found, fall back to round-robin */ + sched_select_next_task(); + } + } - hal_interrupt_tick(); + /* Check if we're still on the same task (no actual switch needed) */ + tcb_t *next_task = kcb->task_current->data; - /* Restore next task context */ - hal_context_restore(((tcb_t *) kcb->task_current->data)->context, 1); + /* In preemptive mode, if selected task has pending delay, keep trying to + * find ready task. We check delay > 0 instead of state == BLOCKED because + * schedulers already modified state to RUNNING. + */ + if (kcb->preemptive) { + int attempts = 0; + while (next_task->delay > 0 && attempts < 10) { + /* Try next task in round-robin */ + kcb->task_current = list_cnext(kcb->tasks, kcb->task_current); + if (!kcb->task_current || !kcb->task_current->data) + kcb->task_current = list_next(kcb->tasks->head); + next_task = kcb->task_current->data; + attempts++; + } + + /* If still has delay after all attempts, all tasks are blocked. + * Just select this task anyway - it will resume and immediately yield + * again, creating a busy-wait ecall loop until timer interrupt fires + * and decrements delays. 
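+ *
+ * (The always-ready idle task spawned by the rtsched demo avoids this
+ * busy loop in practice: the round-robin fallback then always finds a
+ * runnable task.)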
+ */ + } + + /* Update task state and time slice before context switch */ + if (next_task->state != TASK_RUNNING) + next_task->state = TASK_RUNNING; + next_task->time_slice = get_priority_timeslice(next_task->prio_level); + + /* Perform context switch based on scheduling mode */ + if (kcb->preemptive) { + /* Same task - no context switch needed */ + if (next_task == prev_task) + return; /* ISR will restore from current stack naturally */ + + /* Preemptive mode: Switch stack pointer. + * ISR already saved context to prev_task's stack. + * Switch SP to next_task's stack. + * When we return, ISR will restore from next_task's stack. + */ + hal_switch_stack(&prev_task->sp, next_task->sp); + } else { + /* Cooperative mode: Always call hal_context_restore() because it uses + * setjmp/longjmp mechanism. Even if same task continues, we must + * longjmp back to complete the context save/restore cycle. + */ + hal_interrupt_tick(); + hal_context_restore(next_task->context, 1); + } } /* Cooperative context switch */ @@ -511,7 +647,24 @@ void yield(void) /* Process deferred timer work during yield */ process_deferred_timer_work(); - /* HAL context switching is used for preemptive scheduling. */ + /* In preemptive mode, can't use setjmp/longjmp - incompatible with ISR + * stack frames. Trigger dispatcher via ecall, then wait until task becomes + * READY again. + */ + if (kcb->preemptive) { + /* Trigger one dispatcher call - this will context switch to another + * task. When we return here (after being rescheduled), our delay will + * have expired. + */ + __asm__ volatile("ecall"); + + /* After ecall returns, we've been context-switched back, meaning we're + * READY. No need to check state - if we're executing, we're ready. + */ + return; + } + + /* Cooperative mode: use setjmp/longjmp mechanism */ if (hal_context_save(((tcb_t *) kcb->task_current->data)->context) != 0) return; @@ -520,8 +673,7 @@ void yield(void) #endif /* In cooperative mode, delays are only processed on an explicit yield. */ - if (!kcb->preemptive) - list_foreach(kcb->tasks, delay_update, NULL); + list_foreach(kcb->tasks, delay_update, NULL); sched_select_next_task(); /* Use O(1) priority scheduler */ hal_context_restore(((tcb_t *) kcb->task_current->data)->context, 1); @@ -627,6 +779,12 @@ int32_t mo_task_spawn(void *task_entry, uint16_t stack_size_req) hal_context_init(&tcb->context, (size_t) tcb->stack, new_stack_size, (size_t) task_entry); + /* Initialize SP for preemptive mode. + * Build initial ISR frame on stack with mepc pointing to task entry. + */ + void *stack_top = (void *) ((uint8_t *) tcb->stack + new_stack_size); + tcb->sp = hal_build_initial_frame(stack_top, task_entry); + printf("task %u: entry=%p stack=%p size=%u prio_level=%u time_slice=%u\n", tcb->id, task_entry, tcb->stack, (unsigned int) new_stack_size, tcb->prio_level, tcb->time_slice); @@ -807,6 +965,7 @@ int32_t mo_task_rt_priority(uint16_t id, void *priority) } task->rt_prio = priority; + CRITICAL_LEAVE(); return ERR_OK; } @@ -838,9 +997,16 @@ void mo_task_wfi(void) if (!kcb->preemptive) return; + /* Enable interrupts before WFI - we're in ISR context with interrupts + * disabled. WFI needs interrupts enabled to wake up on timer interrupt. 
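+ * (Hardware clears mstatus.MIE on trap entry and only mret restores it,
+ * so without _ei() here the timer handler could not run and the
+ * kcb->ticks check below would spin forever.)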
+ */ + _ei(); + volatile uint32_t current_ticks = kcb->ticks; while (current_ticks == kcb->ticks) hal_cpu_idle(); + + /* Note: Interrupts will be re-disabled when we return to ISR caller */ } uint16_t mo_task_count(void) diff --git a/lib/stdio.c b/lib/stdio.c index 65f0eba4..f5eec7d7 100644 --- a/lib/stdio.c +++ b/lib/stdio.c @@ -294,7 +294,11 @@ int vsnprintf(char *str, size_t size, const char *fmt, va_list args) /* Formatted output to stdout. * Uses a fixed stack buffer - very long output will be truncated. * Thread-safe: Uses deferred logging via logger task. - * Falls back to direct output during early boot or if queue is full. + * Falls back to direct output during early boot, queue full, or after flush. + * + * Flush-aware behavior: After mo_logger_flush(), printf() outputs directly + * to UART (direct_mode flag set), ensuring ordered output for multi-line + * reports. Call mo_logger_async_resume() to re-enable async logging. */ int32_t printf(const char *fmt, ...) { @@ -305,13 +309,22 @@ int32_t printf(const char *fmt, ...) int32_t len = vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - /* Try deferred logging only if message fits (avoids silent truncation). - * Long messages fall back to direct output for completeness. + /* Handle vsnprintf error (negative return indicates encoding error) */ + if (len < 0) + return len; + + /* Try deferred logging only if: + * 1. Message fits in log entry (avoids silent truncation) + * 2. Not in direct mode (set by mo_logger_flush) + * 3. Enqueue succeeds (queue not full) */ - if (len <= LOG_ENTRY_SZ - 1 && mo_logger_enqueue(buf, len) == 0) + if (len <= LOG_ENTRY_SZ - 1 && !mo_logger_direct_mode() && + mo_logger_enqueue(buf, len) == 0) return len; /* Successfully enqueued */ - /* Fallback to direct output (early boot, queue full, or too long) */ + /* Direct output: early boot, direct mode (post-flush), queue full, or too + * long. + */ char *p = buf; while (*p) _putchar(*p++); @@ -336,7 +349,8 @@ int32_t snprintf(char *str, size_t size, const char *fmt, ...) /* Writes a string to stdout, followed by a newline. * Thread-safe: Uses deferred logging via logger task. - * Falls back to direct output during early boot or if queue is full. + * Falls back to direct output during early boot, queue full, or after flush. + * Same flush-aware behavior as printf() for ordered multi-line output. */ int32_t puts(const char *str) { @@ -349,11 +363,14 @@ int32_t puts(const char *str) buf[len++] = '\n'; buf[len] = '\0'; - /* Try deferred logging only if message fits (avoids silent truncation) */ - if (len <= LOG_ENTRY_SZ - 1 && mo_logger_enqueue(buf, len) == 0) + /* Try deferred logging only if not in direct mode */ + if (len <= LOG_ENTRY_SZ - 1 && !mo_logger_direct_mode() && + mo_logger_enqueue(buf, len) == 0) return 0; /* Successfully enqueued */ - /* Fallback to direct output (early boot, queue full, or too long) */ + /* Direct output: early boot, direct mode (post-flush), queue full, or too + * long. + */ char *p = buf; while (*p) _putchar(*p++);