diff --git a/app/rtsched.c b/app/rtsched.c index ffb98876..56402814 100644 --- a/app/rtsched.c +++ b/app/rtsched.c @@ -1,144 +1,491 @@ #include -/* Task 4: Simple task that prints a message and waits for scheduling */ -static void task4(void) +/* Extended task statistics for fairness validation */ +typedef struct { + uint32_t executions; /* Total number of job executions */ + uint32_t deadline_misses; /* Count of missed deadlines (RT tasks only) */ + uint32_t total_response; /* Sum of response times (release to start) */ + uint32_t max_response, min_response; /* Max/Min response time observed */ + uint32_t period; /* Task period (0 for non-RT) */ + uint32_t deadline; /* Relative deadline (0 for non-RT) */ +} task_stats_t; + +/* Global statistics - indexed by task number 0-4 */ +static task_stats_t task_stats[5]; +static volatile uint32_t test_start_time = 0; + +/* Flag to indicate test has started. + * Note: This has a benign race condition - multiple tasks may observe + * test_started=0 simultaneously and attempt to set test_start_time. + * This is acceptable because: (1) all tasks set similar values (current tick), + * and (2) EDF ensures the highest-priority task runs first anyway. + * A proper fix would use a mutex, but the overhead is unnecessary here. + */ +static volatile int test_started = 0; +static uint32_t test_duration = 50; /* Run for 50 ticks for better statistics */ + +/* Set to 1+ to enable workload simulation for response time testing */ +#define WORKLOAD_TICKS 0 + +/* Simulate workload: busy-wait for WORKLOAD_TICKS timer periods. + * When WORKLOAD_TICKS=0 (default), this is a no-op. + */ +#if WORKLOAD_TICKS > 0 +static void simulate_workload(void) { - while (1) { - printf("Task 4 running\n"); - mo_task_wfi(); /* Wait for interrupt to yield control */ + uint32_t start = mo_ticks(); + while (mo_ticks() - start < WORKLOAD_TICKS) { + /* Busy wait to consume CPU time */ + for (volatile int i = 0; i < 1000; i++) + ; } } +#else +#define simulate_workload() ((void) 0) +#endif -/* Task 3: Simple task that prints a message and waits for scheduling */ -static void task3(void) +/* Task 0: RT task with period=10 */ +static void task0(void) { - while (1) { - printf("Task 3 running\n"); - mo_task_wfi(); /* Wait for interrupt to yield control */ + int idx = 0; + uint32_t period = 10; + uint32_t theoretical_release; + uint32_t job_start; + uint32_t response_time; + + /* Initialize stats */ + task_stats[idx].period = period; + task_stats[idx].deadline = period; /* implicit deadline = period */ + task_stats[idx].min_response = UINT32_MAX; + + /* Initialize test_start_time on first task execution */ + if (!test_started) { + test_start_time = mo_ticks(); + test_started = 1; + } + + /* First job theoretical release time = test start */ + theoretical_release = test_start_time; + + while (mo_ticks() - test_start_time < test_duration) { + /* Record actual start time */ + job_start = mo_ticks(); + + /* Response time = actual start - theoretical release */ + response_time = job_start - theoretical_release; + + /* Track response time statistics */ + task_stats[idx].total_response += response_time; + if (response_time > task_stats[idx].max_response) + task_stats[idx].max_response = response_time; + if (response_time < task_stats[idx].min_response) + task_stats[idx].min_response = response_time; + + /* Check deadline miss (response > deadline) */ + if (response_time > task_stats[idx].deadline) + task_stats[idx].deadline_misses++; + + task_stats[idx].executions++; + + /* Simulate workload to test 
response time measurement */ + simulate_workload(); + + /* Calculate next theoretical release time */ + theoretical_release += period; + + /* Delay until next period */ + uint32_t now = mo_ticks(); + if (now < theoretical_release) + mo_task_delay(theoretical_release - now); + } + + /* Clear RT priority so EDF stops selecting this task */ + mo_task_rt_priority(mo_task_id(), NULL); + while (1) + mo_task_wfi(); +} + +/* Task 1: RT task with period=15 */ +static void task1(void) +{ + int idx = 1; + uint32_t period = 15; + uint32_t theoretical_release; + uint32_t job_start; + uint32_t response_time; + + /* Initialize stats */ + task_stats[idx].period = period; + task_stats[idx].deadline = period; + task_stats[idx].min_response = UINT32_MAX; + + /* Wait for test to start */ + while (!test_started) + mo_task_delay(1); + + /* First job theoretical release time = test start */ + theoretical_release = test_start_time; + + while (mo_ticks() - test_start_time < test_duration) { + job_start = mo_ticks(); + response_time = job_start - theoretical_release; + + task_stats[idx].total_response += response_time; + if (response_time > task_stats[idx].max_response) + task_stats[idx].max_response = response_time; + if (response_time < task_stats[idx].min_response) + task_stats[idx].min_response = response_time; + if (response_time > task_stats[idx].deadline) + task_stats[idx].deadline_misses++; + + task_stats[idx].executions++; + + /* Simulate workload */ + simulate_workload(); + + theoretical_release += period; + uint32_t now = mo_ticks(); + if (now < theoretical_release) + mo_task_delay(theoretical_release - now); } + + mo_task_rt_priority(mo_task_id(), NULL); + while (1) + mo_task_wfi(); } -/* Task 2: Prints task ID and an incrementing counter, then waits */ +/* Task 2: RT task with period=20 */ static void task2(void) { - int32_t cnt = 300000; + int idx = 2; + uint32_t period = 20; + uint32_t theoretical_release; + uint32_t job_start; + uint32_t response_time; - while (1) { - printf("[Task %d: %ld]\n", mo_task_id(), cnt++); - mo_task_wfi(); /* Yield control to scheduler */ + /* Initialize stats */ + task_stats[idx].period = period; + task_stats[idx].deadline = period; + task_stats[idx].min_response = UINT32_MAX; + + /* Wait for test to start */ + while (!test_started) + mo_task_delay(1); + + /* First job theoretical release time = test start */ + theoretical_release = test_start_time; + + while (mo_ticks() - test_start_time < test_duration) { + job_start = mo_ticks(); + response_time = job_start - theoretical_release; + + task_stats[idx].total_response += response_time; + if (response_time > task_stats[idx].max_response) + task_stats[idx].max_response = response_time; + if (response_time < task_stats[idx].min_response) + task_stats[idx].min_response = response_time; + if (response_time > task_stats[idx].deadline) + task_stats[idx].deadline_misses++; + + task_stats[idx].executions++; + + /* Simulate workload */ + simulate_workload(); + + theoretical_release += period; + uint32_t now = mo_ticks(); + if (now < theoretical_release) + mo_task_delay(theoretical_release - now); } + + mo_task_rt_priority(mo_task_id(), NULL); + while (1) + mo_task_wfi(); } -/* Task 1: Prints task ID and an incrementing counter, then waits */ -static void task1(void) +/* Task 3: Non-RT background task */ +static void task3(void) { - int32_t cnt = 200000; + int idx = 3; + uint32_t period = 25; - while (1) { - printf("[Task %d: %ld]\n", mo_task_id(), cnt++); - mo_task_wfi(); /* Yield control to scheduler */ + task_stats[idx].period = 
period; + + while (!test_started) + mo_task_delay(1); + + while (mo_ticks() - test_start_time < test_duration) { + task_stats[idx].executions++; + mo_task_delay(period); } + while (1) + mo_task_wfi(); } -/* Task 0: Prints task ID and an incrementing counter, then waits */ -static void task0(void) +/* Print scheduling statistics using stdio with flush for ordered output */ +static void print_stats(void) +{ + /* Flush pending logger output to ensure report appears in order */ + mo_logger_flush(); + + printf("\n========================================\n"); + printf(" EDF Scheduler Statistics Report \n"); + printf("========================================\n"); + printf("Test duration: %lu ticks\n\n", (unsigned long) test_duration); + + printf("--- RT Task Statistics ---\n"); + for (int i = 0; i < 3; i++) { + /* Ceiling division: task at t=0 runs once even for partial periods */ + uint32_t expected = + (test_duration + task_stats[i].period - 1) / task_stats[i].period; + printf("Task %d (period=%lu, deadline=%lu):\n", i, + (unsigned long) task_stats[i].period, + (unsigned long) task_stats[i].deadline); + printf(" Executions: %lu (expected: %lu)\n", + (unsigned long) task_stats[i].executions, + (unsigned long) expected); + printf(" Deadline misses: %lu\n", + (unsigned long) task_stats[i].deadline_misses); + + if (task_stats[i].executions > 0) { + uint32_t avg_response = + task_stats[i].total_response / task_stats[i].executions; + uint32_t jitter = + task_stats[i].max_response - task_stats[i].min_response; + printf(" Response time - min: %lu, max: %lu, avg: %lu\n", + (unsigned long) task_stats[i].min_response, + (unsigned long) task_stats[i].max_response, + (unsigned long) avg_response); + printf(" Jitter (max-min): %lu ticks\n", (unsigned long) jitter); + } + printf("\n"); + } + + printf("--- Non-RT Task Statistics ---\n"); + for (int i = 3; i < 5; i++) { + printf("Task %d (period=%lu):\n", i, + (unsigned long) task_stats[i].period); + printf(" Executions: %lu\n\n", + (unsigned long) task_stats[i].executions); + } + + printf("--- Fairness Analysis ---\n"); + + /* 1. Deadline miss check */ + uint32_t total_deadline_misses = 0; + for (int i = 0; i < 3; i++) + total_deadline_misses += task_stats[i].deadline_misses; + printf("1. Deadline misses: %lu %s\n", + (unsigned long) total_deadline_misses, + total_deadline_misses == 0 ? "[PASS]" : "[FAIL]"); + + /* 2. Execution count fairness */ + int exec_ok = 1; + for (int i = 0; i < 3; i++) { + /* Ceiling division: task at t=0 runs once even for partial periods */ + uint32_t expected = + (test_duration + task_stats[i].period - 1) / task_stats[i].period; + uint32_t actual = task_stats[i].executions; + /* Avoid underflow: check actual+1 < expected */ + if (actual + 1 < expected || actual > expected + 1) + exec_ok = 0; + } + printf("2. Execution count: %s\n", exec_ok ? "[PASS] within expected range" + : "[FAIL] unexpected count"); + + /* 3. Response time bounded by deadline */ + int response_ok = 1; + for (int i = 0; i < 3; i++) { + if (task_stats[i].max_response > task_stats[i].deadline) + response_ok = 0; + } + printf("3. Response bounded: %s\n", + response_ok ? "[PASS] max_response <= deadline" + : "[FAIL] response exceeded deadline"); + + /* 4. Jitter analysis */ + int jitter_ok = 1; + for (int i = 0; i < 3; i++) { + if (task_stats[i].executions > 0) { + uint32_t jitter = + task_stats[i].max_response - task_stats[i].min_response; + if (jitter > task_stats[i].period / 2) + jitter_ok = 0; + } + } + printf("4. Jitter acceptable: %s\n", jitter_ok + ? 
"[PASS] jitter < 50% period" + : "[WARN] high jitter detected"); + + /* 5. Non-RT task starvation check */ + int starvation_ok = + (task_stats[3].executions > 0 || task_stats[4].executions > 0); + printf("5. Non-RT starvation: %s\n", starvation_ok + ? "[PASS] non-RT tasks executed" + : "[FAIL] non-RT tasks starved"); + + /* Overall verdict */ + printf("\n--- Overall Verdict ---\n"); + printf("EDF Scheduler: %s\n", (total_deadline_misses == 0 && exec_ok && + response_ok && starvation_ok) + ? "All tests passed" + : "Some tests failed"); + printf("========================================\n"); + + /* Re-enable async logging for any subsequent output */ + mo_logger_async_resume(); +} + +/* Task 4: Statistics collector and reporter */ +static void task4(void) { - int32_t cnt = 100000; + int idx = 4; + + task_stats[idx].period = 1; /* Runs every tick */ + /* Wait for test to start */ + while (!test_started) + mo_task_delay(1); + + /* Monitor test progress */ + while (mo_ticks() - test_start_time < test_duration) { + task_stats[idx].executions++; + mo_task_delay(1); + } + + /* Wait a bit for other tasks to complete */ + mo_task_delay(5); + + /* Print comprehensive statistics */ + print_stats(); + + while (1) + mo_task_wfi(); +} + +/* IDLE task: Always ready, runs when all other tasks blocked */ +static void idle_task(void) +{ while (1) { - printf("[Task %d: %ld]\n", mo_task_id(), cnt++); - mo_task_wfi(); /* Yield control to scheduler */ + /* Just burn CPU cycles - don't yield or delay */ + for (volatile int i = 0; i < 100; i++) + ; } } typedef struct { - unsigned credits; - unsigned remaining; -} custom_prio_t; - -/* A simple credit-based real-time scheduler - * – Every RT task carries a custom_prio_t record via its tcb_t::rt_prio field. - * – Each time the scheduler selects a task it decrements "remaining". - * When "remaining" reaches zero it is reloaded from "credits" on the task’s - * next turn. - * – The function returns the ID of the selected RT task, or –1 when no RT task - * is ready so the kernel should fall back to its round-robin scheduler. + uint32_t period; /* Task period in ticks */ + uint32_t deadline; /* Absolute deadline (ticks) */ +} edf_prio_t; + +/* Earliest Deadline First (EDF) real-time scheduler + * – Every RT task carries an edf_prio_t record via its tcb_t::rt_prio field. + * – The scheduler selects the READY RT task with the earliest absolute + * deadline. – When a task is selected, its deadline advances to the next + * period. – Returns the ID of the selected RT task, or -1 when no RT task is + * ready. + * + * Deadline Update Strategy: + * – Deadline advances (deadline += period) when a task is selected from READY. + * – For periodic tasks that delay for their period (mo_task_delay(period)), + * this approximates correct EDF semantics: tasks become READY at period + * boundaries, get selected shortly after, and deadline advances correctly. + * – This approach is simpler than tracking job releases separately. + * – Tasks must delay for their period to ensure correct periodic behavior. + * + * EDF is optimal for single-core systems: if any scheduler can meet all + * deadlines, EDF can. Complexity: O(n) where n = number of RT tasks. 
*/ -static int32_t custom_sched(void) +static int32_t edf_sched(void) { - static list_node_t *task_node = NULL; /* resume point */ - - /* If we have no starting point or we’ve wrapped, begin at head->next */ - if (!task_node) - task_node = list_next(kcb->tasks->head); - - /* Scan at most one full loop of the list */ - list_node_t *start = task_node; - do { - if (!task_node) /* empty list */ - return -1; - - /* Skip head/tail sentinels and NULL-data nodes */ - if (task_node == kcb->tasks->head || task_node == kcb->tasks->tail || - !task_node->data) { - task_node = list_next(task_node); + tcb_t *earliest = NULL; + uint32_t earliest_deadline = UINT32_MAX; + + /* Scan all tasks to find the one with earliest deadline */ + list_node_t *node = list_next(kcb->tasks->head); + while (node && node != kcb->tasks->tail) { + if (!node->data) { + node = list_next(node); continue; } - /* Safe: data is non-NULL here */ - tcb_t *task = (tcb_t *) task_node->data; - - /* READY + RT-eligible ? */ - if (task->state == TASK_READY && task->rt_prio) { - /* Consume one credit */ - custom_prio_t *cp = (custom_prio_t *) task->rt_prio; - if (cp->remaining == 0) - cp->remaining = cp->credits; - cp->remaining--; - - /* Next time resume with the following node */ - task_node = list_next(task_node); - if (task_node == kcb->tasks->head || task_node == kcb->tasks->tail) - task_node = list_next(task_node); /* skip sentinel */ - return task->id; + tcb_t *task = (tcb_t *) node->data; + + /* Consider both READY and RUNNING RT tasks for preemptive scheduling */ + if ((task->state == TASK_READY || task->state == TASK_RUNNING) && + task->rt_prio) { + edf_prio_t *edf = (edf_prio_t *) task->rt_prio; + + /* Track task with earliest deadline */ + if (edf->deadline < earliest_deadline) { + earliest_deadline = edf->deadline; + earliest = task; + } } - /* Otherwise advance */ - task_node = list_next(task_node); - } while (task_node != start); /* one full lap */ + node = list_next(node); + } + + /* DON'T advance deadline here - that would happen on EVERY scheduler call! + * Deadline should only advance when task actually releases next job. + * For now, just return the selected task. Deadline advancement will happen + * when task becomes READY again after delay expires. + */ - /* No READY RT task this cycle */ - task_node = NULL; /* restart next */ - return -1; + /* Return selected task ID, or -1 if no RT task is ready */ + return earliest ? earliest->id : -1; } /* Application Entry Point: Initializes tasks and scheduler * - * Spawns five tasks, assigns real-time priorities to tasks 0, 1, and 2, - * and sets up the custom credit-based scheduler. Enables preemptive mode. 
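+ * Spawns five application tasks plus an always-ready idle task, assigns EDF
+ * priorities to tasks 0-2, and installs edf_sched() as the kernel RT
+ * scheduler hook. The utilization figures below assume roughly one tick of
+ * execution per job (U = C/T with C = 1).
+ *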
+ * RT Task Configuration (EDF scheduling): + * - Task 0: period = 10 ticks, utilization = 10% + * - Task 1: period = 15 ticks, utilization = 6.7% + * - Task 2: period = 20 ticks, utilization = 5% + * - Task 3: Non-RT background task (period = 25 ticks) + * - Task 4: Non-RT background task (period = 25 ticks) + * + * Total RT Utilization: ~21.7% (well under EDF's 100% bound) */ int32_t app_main(void) { - /* Define RT task priorities with initial credit values */ - static custom_prio_t priorities[3] = { - {.credits = 3, .remaining = 3}, /* Task 0 */ - {.credits = 4, .remaining = 4}, /* Task 1 */ - {.credits = 5, .remaining = 5}, /* Task 2 */ - }; - - /* Spawn tasks with default stack size */ - mo_task_spawn(task0, DEFAULT_STACK_SIZE); - mo_task_spawn(task1, DEFAULT_STACK_SIZE); - mo_task_spawn(task2, DEFAULT_STACK_SIZE); - mo_task_spawn(task3, DEFAULT_STACK_SIZE); - mo_task_spawn(task4, DEFAULT_STACK_SIZE); - - /* Configure custom scheduler and assign RT priorities */ - kcb->rt_sched = custom_sched; - mo_task_rt_priority(0, &priorities[0]); - mo_task_rt_priority(1, &priorities[1]); - mo_task_rt_priority(2, &priorities[2]); - - /* preemptive scheduling */ + /* test_start_time will be initialized by first task that runs */ + + /* Spawn all 5 RT/background tasks first */ + int32_t tid0 = mo_task_spawn(task0, DEFAULT_STACK_SIZE); + int32_t tid1 = mo_task_spawn(task1, DEFAULT_STACK_SIZE); + int32_t tid2 = mo_task_spawn(task2, DEFAULT_STACK_SIZE); + (void) mo_task_spawn(task3, DEFAULT_STACK_SIZE); /* Non-RT task 3 */ + /* Non-RT task 4 - displays stats */ + (void) mo_task_spawn(task4, DEFAULT_STACK_SIZE); + + /* Spawn IDLE task LAST so it's at end of round-robin list. + * This ensures other ready tasks get scheduled before IDLE. + */ + (void) mo_task_spawn(idle_task, DEFAULT_STACK_SIZE); + + /* Configure EDF priorities for RT tasks 0-2 with deadlines relative to + * current time */ + uint32_t now = mo_ticks(); + static edf_prio_t priorities[3]; + priorities[0].period = 10; + priorities[0].deadline = now + 10; + priorities[1].period = 15; + priorities[1].deadline = now + 15; + priorities[2].period = 20; + priorities[2].deadline = now + 20; + + /* Install EDF scheduler BEFORE setting priorities */ + kcb->rt_sched = edf_sched; + + mo_task_rt_priority(tid0, &priorities[0]); + mo_task_rt_priority(tid1, &priorities[1]); + mo_task_rt_priority(tid2, &priorities[2]); + + /* Tasks 3-4 are non-RT, will use round-robin when no RT tasks ready */ + + printf("[RTSCHED] Current tick: %lu\n", (unsigned long) mo_ticks()); + + /* Return 1 for preemptive mode */ return 1; } diff --git a/arch/riscv/boot.c b/arch/riscv/boot.c index ef025de8..37978025 100644 --- a/arch/riscv/boot.c +++ b/arch/riscv/boot.c @@ -156,12 +156,19 @@ __attribute__((naked, aligned(4))) void _isr(void) /* Save trap-related CSRs and prepare arguments for do_trap */ "csrr a0, mcause\n" /* Arg 1: cause */ "csrr a1, mepc\n" /* Arg 2: epc */ + "mv a2, sp\n" /* Arg 3: isr_sp (current stack frame) */ "sw a0, 30*4(sp)\n" "sw a1, 31*4(sp)\n" - /* Call the high-level C trap handler */ + /* Call the high-level C trap handler. + * Returns: a0 = SP to use for restoring context (may be different + * task's stack if context switch occurred). + */ "call do_trap\n" + /* Use returned SP for context restore (enables context switching) */ + "mv sp, a0\n" + /* Restore context. 
mepc might have been modified by the handler */
        "lw a1, 31*4(sp)\n"
        "csrw mepc, a1\n"
diff --git a/arch/riscv/hal.c b/arch/riscv/hal.c
index ae10a653..04ecc7d0 100644
--- a/arch/riscv/hal.c
+++ b/arch/riscv/hal.c
@@ -42,6 +42,19 @@
  */
 #define ISR_STACK_FRAME_SIZE 128
 
+/* Global variable to hold the new stack pointer for a pending context switch.
+ * When a context switch is needed, hal_switch_stack() saves the current SP
+ * and stores the new SP here. The ISR epilogue then uses this value.
+ * NULL means no context switch is pending; use the current SP.
+ */
+static void *pending_switch_sp = NULL;
+
+/* Global variable to hold the ISR frame SP for the current trap.
+ * Set at the start of do_trap() so hal_switch_stack() can save the correct
+ * SP to the previous task (the ISR frame SP, not the current function's SP).
+ */
+static uint32_t current_isr_frame_sp = 0;
+
 /* NS16550A UART0 - Memory-mapped registers for the QEMU 'virt' machine's serial
  * port. */
@@ -248,31 +261,48 @@ void hal_cpu_idle(void)
 
 /* Interrupt and Trap Handling */
 
+/* Direct UART output for trap context (avoids printf deadlock) */
+extern int _putchar(int c);
+static void trap_puts(const char *s)
+{
+    while (*s)
+        _putchar(*s++);
+}
+
+/* Exception message table per RISC-V Privileged Spec */
+static const char *exc_msg[] = {
+    [0] = "Instruction address misaligned",
+    [1] = "Instruction access fault",
+    [2] = "Illegal instruction",
+    [3] = "Breakpoint",
+    [4] = "Load address misaligned",
+    [5] = "Load access fault",
+    [6] = "Store/AMO address misaligned",
+    [7] = "Store/AMO access fault",
+    [8] = "Environment call from U-mode",
+    [9] = "Environment call from S-mode",
+    [10] = "Reserved",
+    [11] = "Environment call from M-mode",
+    [12] = "Instruction page fault",
+    [13] = "Load page fault",
+    [14] = "Reserved",
+    [15] = "Store/AMO page fault",
+};
+
 /* C-level trap handler, called by the '_isr' assembly routine.
  * @cause : The value of the 'mcause' CSR, indicating the reason for the trap.
  * @epc   : The value of the 'mepc' CSR, the PC at the time of the trap.
+ * @isr_sp: The stack pointer pointing to the ISR frame.
+ *
+ * Returns the SP to use for restoring context (same or new task's frame).
  */
-void do_trap(uint32_t cause, uint32_t epc)
+uint32_t do_trap(uint32_t cause, uint32_t epc, uint32_t isr_sp)
 {
-    static const char *exc_msg[] = {
-        /* For printing helpful debug messages */
-        [0] = "Instruction address misaligned",
-        [1] = "Instruction access fault",
-        [2] = "Illegal instruction",
-        [3] = "Breakpoint",
-        [4] = "Load address misaligned",
-        [5] = "Load access fault",
-        [6] = "Store/AMO address misaligned",
-        [7] = "Store/AMO access fault",
-        [8] = "Environment call from U-mode",
-        [9] = "Environment call from S-mode",
-        [10] = "Reserved",
-        [11] = "Environment call from M-mode",
-        [12] = "Instruction page fault",
-        [13] = "Load page fault",
-        [14] = "Reserved",
-        [15] = "Store/AMO page fault",
-    };
+    /* Reset pending switch at start of every trap */
+    pending_switch_sp = NULL;
+
+    /* Store ISR frame SP so hal_switch_stack() can save it to prev task */
+    current_isr_frame_sp = isr_sp;
 
     if (MCAUSE_IS_INTERRUPT(cause)) { /* Asynchronous Interrupt */
         uint32_t int_code = MCAUSE_GET_CODE(cause);
@@ -282,28 +312,64 @@ void do_trap(uint32_t cause, uint32_t epc)
          * consistent tick frequency even with interrupt latency.
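+         * For example, assuming F_CPU = 10000000 (a 10 MHz mtime rate, as on
+         * QEMU 'virt'; the exact value is platform-dependent) and
+         * F_TIMER = 100, each reload advances mtimecmp by 100000 counts,
+         * i.e. one 10 ms tick.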
*/ mtimecmp_w(mtimecmp_r() + (F_CPU / F_TIMER)); - dispatcher(); /* Invoke the OS scheduler */ + /* Invoke scheduler - parameter 1 = from timer, increment ticks */ + dispatcher(1); } else { /* All other interrupt sources are unexpected and fatal */ - printf("[UNHANDLED INTERRUPT] code=%u, cause=%08x, epc=%08x\n", - int_code, cause, epc); hal_panic(); } } else { /* Synchronous Exception */ uint32_t code = MCAUSE_GET_CODE(cause); - const char *reason = "Unknown exception"; + + /* Handle ecall from M-mode - used for yielding in preemptive mode */ + if (code == MCAUSE_ECALL_MMODE) { + /* Advance mepc past the ecall instruction (4 bytes) */ + uint32_t new_epc = epc + 4; + write_csr(mepc, new_epc); + + /* Also update mepc in the ISR frame on the stack! + * The ISR epilogue will restore mepc from the frame (offset 31*4 = + * 124 bytes). If we don't update the frame, mret will jump back to + * the ecall instruction! + */ + uint32_t *isr_frame = (uint32_t *) isr_sp; + isr_frame[31] = new_epc; + + /* Invoke dispatcher for context switch - parameter 0 = from ecall, + * don't increment ticks. + */ + dispatcher(0); + + /* Return the SP to use - new task's frame or current frame */ + return pending_switch_sp ? (uint32_t) pending_switch_sp : isr_sp; + } + + /* Print exception info via direct UART (safe in trap context) */ + trap_puts("[EXCEPTION] "); if (code < ARRAY_SIZE(exc_msg) && exc_msg[code]) - reason = exc_msg[code]; - printf("[EXCEPTION] code=%u (%s), epc=%08x, cause=%08x\n", code, reason, - epc, cause); + trap_puts(exc_msg[code]); + else + trap_puts("Unknown"); + trap_puts(" epc=0x"); + for (int i = 28; i >= 0; i -= 4) { + uint32_t nibble = (epc >> i) & 0xF; + _putchar(nibble < 10 ? '0' + nibble : 'A' + nibble - 10); + } + trap_puts("\r\n"); + hal_panic(); } + + /* Return the SP to use for context restore - new task's frame or current */ + return pending_switch_sp ? (uint32_t) pending_switch_sp : isr_sp; } /* Enables the machine-level timer interrupt source */ void hal_timer_enable(void) { - mtimecmp_w(mtime_r() + (F_CPU / F_TIMER)); + uint64_t now = mtime_r(); + uint64_t target = now + (F_CPU / F_TIMER); + mtimecmp_w(target); write_csr(mie, read_csr(mie) | MIE_MTIE); } @@ -313,20 +379,66 @@ void hal_timer_disable(void) write_csr(mie, read_csr(mie) & ~MIE_MTIE); } -/* Hook called by the scheduler after a context switch. - * Its primary purpose is to enable global interrupts ('mstatus.MIE') only - * AFTER the first task has been launched. This ensures interrupts are not - * globally enabled until the OS is fully running in a valid task context. +/* Enable timer interrupt bit only - does NOT reset mtimecmp. + * Use this for NOSCHED_LEAVE to avoid pushing the interrupt deadline forward. */ -void hal_interrupt_tick(void) +void hal_timer_irq_enable(void) { - tcb_t *task = kcb->task_current->data; - if (unlikely(!task)) - hal_panic(); /* Fatal error - invalid task state */ + write_csr(mie, read_csr(mie) | MIE_MTIE); +} - /* The task's entry point is still in RA, so this is its very first run */ - if ((uint32_t) task->entry == task->context[CONTEXT_RA]) - _ei(); /* Enable global interrupts now that execution is in a task */ +/* Disable timer interrupt bit only - does NOT touch mtimecmp. + * Use this for NOSCHED_ENTER to temporarily disable preemption. + */ +void hal_timer_irq_disable(void) +{ + write_csr(mie, read_csr(mie) & ~MIE_MTIE); +} + +/* Linker script symbols - needed for task initialization */ +extern uint32_t _gp, _end; + +/* Build initial ISR frame on task stack for preemptive mode. 
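+ * Sketch of the resulting layout (stack grows downward; offsets are bytes
+ * relative to the returned frame pointer):
+ *
+ *   stack_top
+ *     ... INITIAL_STACK_RESERVE bytes of startup headroom ...
+ *   frame + 128  <- SP after the ISR epilogue pops the frame
+ *   frame + 124  <- mepc slot, preloaded with the task entry point
+ *   frame + 0    <- returned frame pointer (stored in tcb->sp)
+ *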
+ * Returns the stack pointer that points to the frame. + * When ISR restores from this frame, it will jump to task_entry. + * + * CRITICAL: ISR deallocates the frame before mret (sp += 128). + * We place the frame such that after deallocation, SP is at a safe location. + * + * ISR Stack Frame Layout (must match boot.c _isr): + * 0: ra, 4: gp, 8: tp, 12: t0, ... 116: t6 + * 120: mcause, 124: mepc + */ +void *hal_build_initial_frame(void *stack_top, void (*task_entry)(void)) +{ +#define INITIAL_STACK_RESERVE \ + 256 /* Reserve space below stack_top for task startup */ + + /* Place frame deeper in stack so after ISR deallocates (sp += 128), + * SP will be at (stack_top - INITIAL_STACK_RESERVE), not at stack_top. + */ + uint32_t *frame = + (uint32_t *) ((uint8_t *) stack_top - INITIAL_STACK_RESERVE - + ISR_STACK_FRAME_SIZE); + + /* Zero out entire frame */ + for (int i = 0; i < 32; i++) { + frame[i] = 0; + } + + /* Compute tp value same as boot.c: aligned to 64 bytes from _end */ + uint32_t tp_val = ((uint32_t) &_end + 63) & ~63U; + + /* Initialize critical registers for proper task startup: + * - frame[1] = gp: Global pointer, required for accessing global variables + * - frame[2] = tp: Thread pointer, required for thread-local storage + * - frame[31] = mepc: Task entry point, where mret will jump to + */ + frame[1] = (uint32_t) &_gp; /* gp - global pointer */ + frame[2] = tp_val; /* tp - thread pointer */ + frame[31] = (uint32_t) task_entry; /* mepc - entry point */ + + return (void *) frame; } /* Context Switching */ @@ -468,6 +580,18 @@ __attribute__((noreturn)) void hal_context_restore(jmp_buf env, int32_t val) if (unlikely(!env)) hal_panic(); /* Cannot proceed with invalid context */ + /* Validate RA is in text section (simple sanity check) */ + uint32_t ra = env[15]; /* CONTEXT_RA = 15 */ + if (ra < 0x80000000 || ra > 0x80010000) { + trap_puts("[CTX_ERR] Bad RA=0x"); + for (int i = 28; i >= 0; i -= 4) { + uint32_t nibble = (ra >> i) & 0xF; + _putchar(nibble < 10 ? '0' + nibble : 'A' + nibble - 10); + } + trap_puts("\r\n"); + hal_panic(); + } + if (val == 0) val = 1; /* Must return a non-zero value after restore */ @@ -503,12 +627,60 @@ __attribute__((noreturn)) void hal_context_restore(jmp_buf env, int32_t val) __builtin_unreachable(); /* Tell compiler this point is never reached */ } +/* Stack pointer switching for preemptive context switch. + * Saves current SP to *old_sp and loads new SP from new_sp. + * Called by dispatcher when switching tasks in preemptive mode. + * After this returns, ISR will restore registers from the new stack. + * + * @old_sp: Pointer to location where current SP should be saved + * @new_sp: New stack pointer to switch to + */ +void hal_switch_stack(void **old_sp, void *new_sp) +{ + /* Save the ISR frame SP (NOT current SP which is deep in call stack!) + * to prev task. DO NOT change SP here - that would corrupt the C call + * stack! Instead, store new_sp in pending_switch_sp for ISR epilogue. + */ + *old_sp = (void *) current_isr_frame_sp; + + /* Set pending switch - ISR epilogue will use this SP for restore */ + pending_switch_sp = new_sp; +} + +/* Enable interrupts on first run of a task. + * Checks if task's return address still points to entry (meaning it hasn't + * run yet), and if so, enables global interrupts. 
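+ * This works because hal_context_init() seeds CONTEXT_RA with the task's
+ * entry point; once the task has been switched out at least once, RA instead
+ * holds a return address inside the kernel, so the check fails from then on.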
+ */ +void hal_interrupt_tick(void) +{ + tcb_t *task = kcb->task_current->data; + if (unlikely(!task)) + hal_panic(); + + /* The task's entry point is still in RA, so this is its very first run */ + if ((uint32_t) task->entry == task->context[CONTEXT_RA]) + _ei(); +} + /* Low-level context restore helper. Expects a pointer to a 'jmp_buf' in 'a0'. - * Restores the GPRs and jumps to the restored return address. + * Restores the GPRs, mstatus, and jumps to the restored return address. + * + * This function must restore mstatus from the context to be + * consistent with hal_context_restore(). The first task context is initialized + * with MSTATUS_MIE | MSTATUS_MPP_MACH by hal_context_init(), which enables + * interrupts. Failing to restore this value would create an inconsistency + * where the first task inherits the kernel's mstatus instead of its own. */ static void __attribute__((naked, used)) __dispatch_init(void) { asm volatile( + /* Restore mstatus FIRST to ensure correct processor state. + * This is critical for interrupt enable state (MSTATUS_MIE). + * Context was initialized with MIE=1 by hal_context_init(). + */ + "lw t0, 16*4(a0)\n" + "csrw mstatus, t0\n" + /* Now restore all general-purpose registers */ "lw s0, 0*4(a0)\n" "lw s1, 1*4(a0)\n" "lw s2, 2*4(a0)\n" @@ -536,6 +708,7 @@ __attribute__((noreturn)) void hal_dispatch_init(jmp_buf env) if (kcb->preemptive) hal_timer_enable(); + _ei(); /* Enable global interrupts just before launching the first task */ asm volatile( @@ -574,6 +747,15 @@ void hal_context_init(jmp_buf *ctx, size_t sp, size_t ss, size_t ra) /* Zero the context for predictability */ memset(ctx, 0, sizeof(*ctx)); + /* Compute tp value same as boot.c: aligned to 64 bytes from _end */ + uint32_t tp_val = ((uint32_t) &_end + 63) & ~63U; + + /* Set global pointer and thread pointer for proper task execution. + * These are critical for accessing global variables and TLS. + */ + (*ctx)[CONTEXT_GP] = (uint32_t) &_gp; + (*ctx)[CONTEXT_TP] = tp_val; + /* Set the essential registers for a new task: * - SP is set to the prepared top of the task's stack. * - RA is set to the task's entry point. diff --git a/arch/riscv/hal.h b/arch/riscv/hal.h index 8354264e..45a16409 100644 --- a/arch/riscv/hal.h +++ b/arch/riscv/hal.h @@ -76,6 +76,12 @@ int32_t hal_context_save(jmp_buf env); void hal_context_restore(jmp_buf env, int32_t val); void hal_dispatch_init(jmp_buf env); +/* Stack switching for preemptive context switch. + * Saves current SP to *old_sp and loads new SP from new_sp. + * Used by dispatcher when switching tasks in preemptive mode. + */ +void hal_switch_stack(void **old_sp, void *new_sp); + /* Provides a blocking, busy-wait delay. * This function monopolizes the CPU and should only be used for very short * delays or in pre-scheduling initialization code. @@ -92,7 +98,14 @@ uint64_t _read_us(void); void hal_hardware_init(void); void hal_timer_enable(void); void hal_timer_disable(void); -void hal_interrupt_tick(void); +void hal_timer_irq_enable( + void); /* Enable timer interrupt bit only (for NOSCHED) */ +void hal_timer_irq_disable( + void); /* Disable timer interrupt bit only (for NOSCHED) */ +void hal_interrupt_tick(void); /* Enable interrupts on first task run */ +void *hal_build_initial_frame( + void *stack_top, + void (*task_entry)(void)); /* Build ISR frame for preemptive mode */ /* Initializes the context structure for a new task. * @ctx : Pointer to jmp_buf to initialize (must be non-NULL). 
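+ * The context is zeroed, then GP and TP are preloaded (from the linker's
+ * _gp and the 64-byte-aligned _end, matching boot.c) so the task can reach
+ * globals and thread-local storage from its very first instruction.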
@@ -109,4 +122,4 @@ void hal_panic(void); void hal_cpu_idle(void); /* Default stack size for new tasks if not otherwise specified */ -#define DEFAULT_STACK_SIZE 4096 +#define DEFAULT_STACK_SIZE 8192 diff --git a/include/sys/logger.h b/include/sys/logger.h index eb0ecf7a..e4a3df75 100644 --- a/include/sys/logger.h +++ b/include/sys/logger.h @@ -58,3 +58,21 @@ uint32_t mo_logger_queue_depth(void); * Returns total dropped message count since logger init */ uint32_t mo_logger_dropped_count(void); + +/* Check if logger is in direct output mode. + * Lock-free read for performance - safe to call frequently. + * Returns true if printf/puts should bypass the queue. + */ +bool mo_logger_direct_mode(void); + +/* Flush all pending messages and enter direct output mode. + * Drains the queue directly from caller's context. + * After flush, printf/puts bypass the queue for ordered output. + * Call mo_logger_async_resume() to re-enable async logging. + */ +void mo_logger_flush(void); + +/* Re-enable async logging after a flush. + * Call this after completing ordered output that required direct mode. + */ +void mo_logger_async_resume(void); diff --git a/include/sys/task.h b/include/sys/task.h index dc6410af..0d3aaa4d 100644 --- a/include/sys/task.h +++ b/include/sys/task.h @@ -67,6 +67,7 @@ enum task_states { typedef struct tcb { /* Context and Stack Management */ jmp_buf context; /* Saved CPU context (GPRs, SP, PC) for task switching */ + void *sp; /* Saved stack pointer for preemptive context switch */ void *stack; /* Pointer to base of task's allocated stack memory */ size_t stack_sz; /* Total size of the stack in bytes */ void (*entry)(void); /* Task's entry point function */ @@ -145,21 +146,29 @@ extern kcb_t *kcb; _ei(); \ } while (0) +/* Flag indicating scheduler has started - prevents timer IRQ during early + * initializations. + */ +extern volatile bool scheduler_started; + /* Disable/enable ONLY the scheduler timer interrupt. * Lighter-weight critical section that prevents task preemption but allows * other hardware interrupts (e.g., UART) to be serviced, minimizing latency. * Use when protecting data shared between tasks. + * + * NOSCHED_LEAVE only enables timer if scheduler has started, preventing + * premature timer interrupts during early initialization (e.g., logger init). */ -#define NOSCHED_ENTER() \ - do { \ - if (kcb->preemptive) \ - hal_timer_disable(); \ +#define NOSCHED_ENTER() \ + do { \ + if (kcb->preemptive) \ + hal_timer_irq_disable(); \ } while (0) -#define NOSCHED_LEAVE() \ - do { \ - if (kcb->preemptive) \ - hal_timer_enable(); \ +#define NOSCHED_LEAVE() \ + do { \ + if (kcb->preemptive && scheduler_started) \ + hal_timer_irq_enable(); \ } while (0) /* Core Kernel and Task Management API */ @@ -169,8 +178,8 @@ extern kcb_t *kcb; /* Prints a fatal error message and halts the system */ void panic(int32_t ecode); -/* Main scheduler dispatch function, called by the timer ISR */ -void dispatcher(void); +/* Main scheduler dispatch function, called by timer ISR or ecall */ +void dispatcher(int from_timer); /* Architecture-specific context switch implementations */ void _dispatch(void); diff --git a/kernel/logger.c b/kernel/logger.c index 701a36f7..f2fe2733 100644 --- a/kernel/logger.c +++ b/kernel/logger.c @@ -28,6 +28,12 @@ typedef struct { mutex_t lock; /* Protects queue manipulation, not UART output */ int32_t task_id; bool initialized; + + /* When true, printf bypasses queue. + * volatile: prevent compiler caching for lock-free read. 
Written under + * mutex, read without - safe on single-core. + */ + volatile bool direct_mode; } logger_state_t; static logger_state_t logger; @@ -75,8 +81,8 @@ int32_t mo_logger_init(void) if (mo_mutex_init(&logger.lock) != ERR_OK) return ERR_FAIL; - /* 512B stack: simple operations only (no printf/recursion/ISR use) */ - logger.task_id = mo_task_spawn(logger_task, 512); + /* 1024B stack: space for log_entry_t (130B) + ISR frame (128B) + calls */ + logger.task_id = mo_task_spawn(logger_task, 1024); if (logger.task_id < 0) { mo_mutex_destroy(&logger.lock); return ERR_FAIL; @@ -149,3 +155,62 @@ uint32_t mo_logger_dropped_count(void) return dropped; } + +/* Check if logger is in direct output mode. + * Lock-free read: safe because direct_mode is only set atomically by flush + * and cleared by async_resume, both under mutex protection. Reading a stale + * value is benign (worst case: one extra direct output or one queued message). + */ +bool mo_logger_direct_mode(void) +{ + return logger.initialized && logger.direct_mode; +} + +/* Flush all pending messages and enter direct output mode. + * Drains the queue directly from caller's context, bypassing logger task. + * After flush, printf/puts bypass the queue for ordered output. + * Call mo_logger_async_resume() to re-enable async logging. + */ +void mo_logger_flush(void) +{ + if (!logger.initialized) + return; + + log_entry_t entry; + + while (1) { + bool have_message = false; + + mo_mutex_lock(&logger.lock); + if (logger.count > 0) { + memcpy(&entry, &logger.queue[logger.tail], sizeof(log_entry_t)); + logger.tail = (logger.tail + 1) % LOG_QSIZE; + logger.count--; + have_message = true; + } else { + /* Queue drained: enter direct mode while still holding lock */ + logger.direct_mode = true; + } + mo_mutex_unlock(&logger.lock); + + if (!have_message) + break; + + /* Output outside lock */ + for (uint16_t i = 0; i < entry.length; i++) + _putchar(entry.data[i]); + } +} + +/* Re-enable async logging after a flush. + * Call this after completing ordered output that required direct mode. + */ +void mo_logger_async_resume(void) +{ + if (!logger.initialized) + return; + + mo_mutex_lock(&logger.lock); + logger.direct_mode = false; + mo_mutex_unlock(&logger.lock); +} diff --git a/kernel/main.c b/kernel/main.c index c1583951..ce0dc08a 100644 --- a/kernel/main.c +++ b/kernel/main.c @@ -67,6 +67,11 @@ int32_t main(void) if (!first_task) panic(ERR_NO_TASKS); + /* Mark scheduler as started - enables timer IRQ in NOSCHED_LEAVE. + * Must be set before hal_dispatch_init() which enables preemption. + */ + scheduler_started = true; + hal_dispatch_init(first_task->context); /* This line should be unreachable. */ diff --git a/kernel/task.c b/kernel/task.c index de17913b..84f048a3 100644 --- a/kernel/task.c +++ b/kernel/task.c @@ -29,6 +29,12 @@ static kcb_t kernel_state = { }; kcb_t *kcb = &kernel_state; +/* Flag to track if scheduler has started - prevents timer IRQ during early + * init. NOSCHED_LEAVE checks this to avoid enabling timer before scheduler is + * ready. + */ +volatile bool scheduler_started = false; + /* timer work management for reduced latency */ static volatile uint32_t timer_work_pending = 0; /* timer work types */ static volatile uint32_t timer_work_generation = 0; /* counter for coalescing */ @@ -178,6 +184,22 @@ static list_node_t *delay_update_batch(list_node_t *node, void *arg) if (t->delay > 0) { if (--t->delay == 0) { t->state = TASK_READY; + + /* If this is an RT task, set its deadline for the next job. 
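+             * (The edf_prio_t layout re-declared below must stay in sync
+             * with the record the application registers via
+             * mo_task_rt_priority(); the kernel only ever sees rt_prio as an
+             * opaque void pointer.)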
+ * For periodic tasks, deadline should be current_time + period. + * This ensures tasks are scheduled based on their actual deadlines, + * not inflated values from previous scheduler calls. + */ + if (t->rt_prio) { + typedef struct { + uint32_t period; + uint32_t deadline; + } edf_prio_t; + edf_prio_t *edf = (edf_prio_t *) t->rt_prio; + extern kcb_t *kcb; + edf->deadline = kcb->ticks + edf->period; + } + /* Add to appropriate priority ready queue */ sched_enqueue_task(t); (*ready_count)++; @@ -369,9 +391,13 @@ void sched_tick_current_task(void) if (current_task->time_slice > 0) current_task->time_slice--; - /* If time slice expired, force immediate rescheduling */ + /* If time slice expired, mark task as ready for rescheduling. + * Don't call _dispatch() here - let the normal dispatcher() flow handle it. + * Calling _dispatch() from within dispatcher() causes double-dispatch bug. + */ if (current_task->time_slice == 0) { - _dispatch(); + if (current_task->state == TASK_RUNNING) + current_task->state = TASK_READY; } } @@ -443,7 +469,29 @@ uint16_t sched_select_next_task(void) } while (node != start_node && ++iterations < SCHED_IMAX); - /* No ready tasks found - this should not happen in normal operation */ + /* No ready tasks found in preemptive mode - all tasks are blocked. + * This is normal for periodic RT tasks waiting for their next period. + * We CANNOT return a BLOCKED task as that would cause it to run. + * Instead, find ANY task (even blocked) as a placeholder, then wait for + * interrupt. + */ + if (kcb->preemptive) { + /* Select any task as placeholder (dispatcher won't actually switch to + * it if blocked) */ + list_node_t *any_node = list_next(kcb->tasks->head); + while (any_node && any_node != kcb->tasks->tail) { + if (any_node->data) { + kcb->task_current = any_node; + tcb_t *any_task = any_node->data; + return any_task->id; + } + any_node = list_next(any_node); + } + /* No tasks at all - this is a real error */ + panic(ERR_NO_TASKS); + } + + /* In cooperative mode, having no ready tasks is an error */ panic(ERR_NO_TASKS); return 0; } @@ -454,10 +502,14 @@ static int32_t noop_rtsched(void) return -1; } -/* The main entry point from the system tick interrupt. */ -void dispatcher(void) +/* The main entry point from interrupts (timer or ecall). + * Parameter: from_timer = 1 if called from timer ISR (increment ticks), + * = 0 if called from ecall (don't increment ticks) + */ +void dispatcher(int from_timer) { - kcb->ticks++; + if (from_timer) + kcb->ticks++; /* Handle time slice for current task */ sched_tick_current_task(); @@ -475,12 +527,15 @@ void dispatch(void) if (unlikely(!kcb || !kcb->task_current || !kcb->task_current->data)) panic(ERR_NO_TASKS); - /* Save current context using dedicated HAL routine that handles both - * execution context and processor state for context switching. - * Returns immediately if this is the restore path. + /* Save current context - only needed for cooperative mode. + * In preemptive mode, ISR already saved context to stack, + * so we skip this step to avoid interference. 
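+     * (A hal_context_save() here would capture the dispatcher's own call
+     * path rather than the task's trap-time state; the ISR frame on the
+     * task stack is already the authoritative copy of that state.)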
*/ - if (hal_context_save(((tcb_t *) kcb->task_current->data)->context) != 0) - return; + if (!kcb->preemptive) { + /* Cooperative mode: use setjmp/longjmp mechanism */ + if (hal_context_save(((tcb_t *) kcb->task_current->data)->context) != 0) + return; + } #if CONFIG_STACK_PROTECTION /* Do stack check less frequently to reduce overhead */ @@ -488,18 +543,99 @@ void dispatch(void) task_stack_check(); #endif - /* Batch process task delays for better efficiency */ + /* Batch process task delays for better efficiency. + * Only process delays if tick has advanced to avoid decrementing multiple + * times per tick when dispatch() is called multiple times. + */ uint32_t ready_count = 0; - list_foreach(kcb->tasks, delay_update_batch, &ready_count); + static uint32_t last_delay_update_tick = 0; + if (kcb->ticks != last_delay_update_tick) { + list_foreach(kcb->tasks, delay_update_batch, &ready_count); + last_delay_update_tick = kcb->ticks; + } /* Hook for real-time scheduler - if it selects a task, use it */ - if (kcb->rt_sched() < 0) - sched_select_next_task(); /* Use O(1) priority scheduler */ + tcb_t *prev_task = kcb->task_current->data; + int32_t rt_task_id = kcb->rt_sched(); + + if (rt_task_id < 0) { + sched_select_next_task(); /* Use O(n) round-robin scheduler */ + } else { + /* RT scheduler selected a task - update current task pointer */ + list_node_t *rt_node = find_task_node_by_id((uint16_t) rt_task_id); + if (rt_node && rt_node->data) { + tcb_t *rt_task = rt_node->data; + /* Different task - perform context switch */ + if (rt_node != kcb->task_current) { + if (kcb->task_current && kcb->task_current->data) { + tcb_t *prev = kcb->task_current->data; + if (prev->state == TASK_RUNNING) + prev->state = TASK_READY; + } + /* Switch to RT task */ + kcb->task_current = rt_node; + rt_task->state = TASK_RUNNING; + rt_task->time_slice = + get_priority_timeslice(rt_task->prio_level); + } + /* If same task selected, fall through to do_context_switch + * which will check if task is blocked and handle appropriately */ + } else { + /* RT task not found, fall back to round-robin */ + sched_select_next_task(); + } + } - hal_interrupt_tick(); + /* Check if we're still on the same task (no actual switch needed) */ + tcb_t *next_task = kcb->task_current->data; - /* Restore next task context */ - hal_context_restore(((tcb_t *) kcb->task_current->data)->context, 1); + /* In preemptive mode, if selected task has pending delay, keep trying to + * find ready task. We check delay > 0 instead of state == BLOCKED because + * schedulers already modified state to RUNNING. + */ + if (kcb->preemptive) { + int attempts = 0; + while (next_task->delay > 0 && attempts < 10) { + /* Try next task in round-robin */ + kcb->task_current = list_cnext(kcb->tasks, kcb->task_current); + if (!kcb->task_current || !kcb->task_current->data) + kcb->task_current = list_next(kcb->tasks->head); + next_task = kcb->task_current->data; + attempts++; + } + + /* If still has delay after all attempts, all tasks are blocked. + * Just select this task anyway - it will resume and immediately yield + * again, creating a busy-wait ecall loop until timer interrupt fires + * and decrements delays. 
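+         * The always-ready idle task spawned by the application (see
+         * app/rtsched.c) exists to absorb exactly this case, so in practice
+         * the busy-wait fallback is rarely reached.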
+ */ + } + + /* Update task state and time slice before context switch */ + if (next_task->state != TASK_RUNNING) + next_task->state = TASK_RUNNING; + next_task->time_slice = get_priority_timeslice(next_task->prio_level); + + /* Perform context switch based on scheduling mode */ + if (kcb->preemptive) { + /* Same task - no context switch needed */ + if (next_task == prev_task) + return; /* ISR will restore from current stack naturally */ + + /* Preemptive mode: Switch stack pointer. + * ISR already saved context to prev_task's stack. + * Switch SP to next_task's stack. + * When we return, ISR will restore from next_task's stack. + */ + hal_switch_stack(&prev_task->sp, next_task->sp); + } else { + /* Cooperative mode: Always call hal_context_restore() because it uses + * setjmp/longjmp mechanism. Even if same task continues, we must + * longjmp back to complete the context save/restore cycle. + */ + hal_interrupt_tick(); + hal_context_restore(next_task->context, 1); + } } /* Cooperative context switch */ @@ -511,7 +647,24 @@ void yield(void) /* Process deferred timer work during yield */ process_deferred_timer_work(); - /* HAL context switching is used for preemptive scheduling. */ + /* In preemptive mode, can't use setjmp/longjmp - incompatible with ISR + * stack frames. Trigger dispatcher via ecall, then wait until task becomes + * READY again. + */ + if (kcb->preemptive) { + /* Trigger one dispatcher call - this will context switch to another + * task. When we return here (after being rescheduled), our delay will + * have expired. + */ + __asm__ volatile("ecall"); + + /* After ecall returns, we've been context-switched back, meaning we're + * READY. No need to check state - if we're executing, we're ready. + */ + return; + } + + /* Cooperative mode: use setjmp/longjmp mechanism */ if (hal_context_save(((tcb_t *) kcb->task_current->data)->context) != 0) return; @@ -520,8 +673,7 @@ void yield(void) #endif /* In cooperative mode, delays are only processed on an explicit yield. */ - if (!kcb->preemptive) - list_foreach(kcb->tasks, delay_update, NULL); + list_foreach(kcb->tasks, delay_update, NULL); sched_select_next_task(); /* Use O(1) priority scheduler */ hal_context_restore(((tcb_t *) kcb->task_current->data)->context, 1); @@ -627,6 +779,12 @@ int32_t mo_task_spawn(void *task_entry, uint16_t stack_size_req) hal_context_init(&tcb->context, (size_t) tcb->stack, new_stack_size, (size_t) task_entry); + /* Initialize SP for preemptive mode. + * Build initial ISR frame on stack with mepc pointing to task entry. + */ + void *stack_top = (void *) ((uint8_t *) tcb->stack + new_stack_size); + tcb->sp = hal_build_initial_frame(stack_top, task_entry); + printf("task %u: entry=%p stack=%p size=%u prio_level=%u time_slice=%u\n", tcb->id, task_entry, tcb->stack, (unsigned int) new_stack_size, tcb->prio_level, tcb->time_slice); @@ -807,6 +965,7 @@ int32_t mo_task_rt_priority(uint16_t id, void *priority) } task->rt_prio = priority; + CRITICAL_LEAVE(); return ERR_OK; } @@ -838,9 +997,16 @@ void mo_task_wfi(void) if (!kcb->preemptive) return; + /* Enable interrupts before WFI - we're in ISR context with interrupts + * disabled. WFI needs interrupts enabled to wake up on timer interrupt. 
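+     * Note: kcb->ticks only advances inside the trap handler, so if MIE
+     * stayed clear the wait loop below would spin forever; wfi alone may
+     * simply fall through without the interrupt ever being taken.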
+ */ + _ei(); + volatile uint32_t current_ticks = kcb->ticks; while (current_ticks == kcb->ticks) hal_cpu_idle(); + + /* Note: Interrupts will be re-disabled when we return to ISR caller */ } uint16_t mo_task_count(void) diff --git a/lib/stdio.c b/lib/stdio.c index 65f0eba4..f5eec7d7 100644 --- a/lib/stdio.c +++ b/lib/stdio.c @@ -294,7 +294,11 @@ int vsnprintf(char *str, size_t size, const char *fmt, va_list args) /* Formatted output to stdout. * Uses a fixed stack buffer - very long output will be truncated. * Thread-safe: Uses deferred logging via logger task. - * Falls back to direct output during early boot or if queue is full. + * Falls back to direct output during early boot, queue full, or after flush. + * + * Flush-aware behavior: After mo_logger_flush(), printf() outputs directly + * to UART (direct_mode flag set), ensuring ordered output for multi-line + * reports. Call mo_logger_async_resume() to re-enable async logging. */ int32_t printf(const char *fmt, ...) { @@ -305,13 +309,22 @@ int32_t printf(const char *fmt, ...) int32_t len = vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - /* Try deferred logging only if message fits (avoids silent truncation). - * Long messages fall back to direct output for completeness. + /* Handle vsnprintf error (negative return indicates encoding error) */ + if (len < 0) + return len; + + /* Try deferred logging only if: + * 1. Message fits in log entry (avoids silent truncation) + * 2. Not in direct mode (set by mo_logger_flush) + * 3. Enqueue succeeds (queue not full) */ - if (len <= LOG_ENTRY_SZ - 1 && mo_logger_enqueue(buf, len) == 0) + if (len <= LOG_ENTRY_SZ - 1 && !mo_logger_direct_mode() && + mo_logger_enqueue(buf, len) == 0) return len; /* Successfully enqueued */ - /* Fallback to direct output (early boot, queue full, or too long) */ + /* Direct output: early boot, direct mode (post-flush), queue full, or too + * long. + */ char *p = buf; while (*p) _putchar(*p++); @@ -336,7 +349,8 @@ int32_t snprintf(char *str, size_t size, const char *fmt, ...) /* Writes a string to stdout, followed by a newline. * Thread-safe: Uses deferred logging via logger task. - * Falls back to direct output during early boot or if queue is full. + * Falls back to direct output during early boot, queue full, or after flush. + * Same flush-aware behavior as printf() for ordered multi-line output. */ int32_t puts(const char *str) { @@ -349,11 +363,14 @@ int32_t puts(const char *str) buf[len++] = '\n'; buf[len] = '\0'; - /* Try deferred logging only if message fits (avoids silent truncation) */ - if (len <= LOG_ENTRY_SZ - 1 && mo_logger_enqueue(buf, len) == 0) + /* Try deferred logging only if not in direct mode */ + if (len <= LOG_ENTRY_SZ - 1 && !mo_logger_direct_mode() && + mo_logger_enqueue(buf, len) == 0) return 0; /* Successfully enqueued */ - /* Fallback to direct output (early boot, queue full, or too long) */ + /* Direct output: early boot, direct mode (post-flush), queue full, or too + * long. + */ char *p = buf; while (*p) _putchar(*p++);