richardr1126
diff --git a/‎Dockerfile‎
Lines changed: 41 additions & 5 deletions b/‎Dockerfile‎
Lines changed: 41 additions & 5 deletions
diff --git a/‎README.md‎
Lines changed: 19 additions & 45 deletions b/‎README.md‎
Lines changed: 19 additions & 45 deletions
diff --git a/‎package.json‎
Lines changed: 1 addition & 1 deletion b/‎package.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎…c/app/api/audio/convert/chapter/route.ts‎ ‎src/app/api/audiobook/chapter/route.ts‎src/app/api/audio/convert/chapter/route.ts renamed to src/app/api/audiobook/chapter/route.ts b/‎…c/app/api/audio/convert/chapter/route.ts‎ ‎src/app/api/audiobook/chapter/route.ts‎src/app/api/audio/convert/chapter/route.ts renamed to src/app/api/audiobook/chapter/route.ts
diff --git a/‎src/app/api/audio/convert/route.ts‎ ‎src/app/api/audiobook/route.ts‎src/app/api/audio/convert/route.ts renamed to src/app/api/audiobook/route.ts
Lines changed: 38 additions & 7 deletions b/‎src/app/api/audio/convert/route.ts‎ ‎src/app/api/audiobook/route.ts‎src/app/api/audio/convert/route.ts renamed to src/app/api/audiobook/route.ts
Lines changed: 38 additions & 7 deletions
diff --git a/‎…/app/api/audio/convert/chapters/route.ts‎ ‎src/app/api/audiobook/status/route.ts‎src/app/api/audio/convert/chapters/route.ts renamed to src/app/api/audiobook/status/route.ts
Lines changed: 3 additions & 30 deletions b/‎…/app/api/audio/convert/chapters/route.ts‎ ‎src/app/api/audiobook/status/route.ts‎src/app/api/audio/convert/chapters/route.ts renamed to src/app/api/audiobook/status/route.ts
Lines changed: 3 additions & 30 deletions
diff --git a/‎src/app/api/tts/route.ts‎
Lines changed: 9 additions & 8 deletions b/‎src/app/api/tts/route.ts‎
Lines changed: 9 additions & 8 deletions
@@ -1,8 +1,18 @@
-# Use Node.js slim image
-FROM node:current-alpine
+# Stage 1: build whisper.cpp (no model download – the app handles that)
+# A dedicated stage keeps git, cmake and the compiler toolchain out of later stages;
+# only the files explicitly copied with COPY --from=whisper-builder leave this stage.
+FROM alpine:3.20 AS whisper-builder
+
+# Tools needed to fetch (git) and compile (cmake, build-base) whisper.cpp
+RUN apk add --no-cache git cmake build-base
+
+WORKDIR /opt
+
+# Shallow-clone (--depth 1) the repository's default branch, configure into
+# /opt/whisper.cpp/build, then compile with parallel jobs (-j).
+# NOTE(review): the clone is not pinned to a tag or commit, so each image build
+# picks up whatever upstream HEAD is at that moment — confirm this is intended.
+RUN git clone --depth 1 https://github.com/ggml-org/whisper.cpp.git && \
+    cd whisper.cpp && \
+    cmake -B build && \
+    cmake --build build -j --config Release
 
-# Add ffmpeg and libreoffice using Alpine package manager
-RUN apk add --no-cache ffmpeg libreoffice-writer
+
+# Stage 2: build the Next.js app
+FROM node:lts-alpine AS app-builder
 
 # Install pnpm globally
 RUN npm install -g pnpm
@@ -23,8 +33,34 @@ COPY . .
 RUN pnpm exec next telemetry disable
 RUN pnpm build
 
+
+# Stage 3: minimal runtime image
+# Uses the same Node release line (lts) as the app-builder stage: /app, including
+# node_modules, is copied over as-is, so build and runtime must agree on the
+# Node version for the installed dependencies to behave as built.
+FROM node:lts-alpine AS runner
+
+# Add runtime OS dependencies:
+# - ffmpeg: required for audiobook export and word-by-word alignment (/api/whisper)
+# - libreoffice-writer: required for DOCX → PDF conversion
+RUN apk add --no-cache ffmpeg libreoffice-writer
+
+# Install pnpm globally for running the app (CMD uses `pnpm start`)
+RUN npm install -g pnpm
+
+# App runtime directory
+WORKDIR /app
+
+# Copy built app and dependencies from the builder stage
+COPY --from=app-builder /app ./
+
+# Copy the compiled whisper.cpp build output into the runtime image
+# (includes whisper-cli and its shared libraries, e.g. libwhisper.so, libggml.so)
+# The destination path matches the build path so anything that recorded the
+# build location still resolves.
+COPY --from=whisper-builder /opt/whisper.cpp/build /opt/whisper.cpp/build
+
+# Point the app at the compiled whisper-cli binary and ensure its libs are discoverable
+ENV WHISPER_CPP_BIN=/opt/whisper.cpp/build/bin/whisper-cli
+ENV LD_LIBRARY_PATH=/opt/whisper.cpp/build
+
+
 # Expose the port the app runs on
 EXPOSE 3003
 
 # Start the application
-CMD ["pnpm", "start"]
+CMD ["pnpm", "start"]
@@ -11,65 +11,25 @@
 
 OpenReader WebUI is an open-source text-to-speech document reader web app built using Next.js, offering a TTS read-along experience with narration for **EPUB, PDF, TXT, MD, and DOCX documents**. It supports multiple TTS providers including OpenAI, Deepinfra, and custom OpenAI-compatible endpoints like [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI) and [Orpheus-FastAPI](https://github.com/Lex-au/Orpheus-FastAPI).
 
-- 🧠 *(New)* **Smart Sentence-Aware Narration** merges sentences across pages/chapters for smoother TTS
-- 🎧 *(New)* **Reliable Audiobook Export** in **m4b/mp3**, with resumable, chapter-based export and regeneration
 - 🎯 *(New)* **Multi-Provider TTS Support**
   - [**Kokoro-FastAPI**](https://github.com/remsky/Kokoro-FastAPI): Supporting multi-voice combinations (like `af_heart+af_bella`)
   - [**Orpheus-FastAPI**](https://github.com/Lex-au/Orpheus-FastAPI)
   - **Custom OpenAI-compatible**: Any TTS API with `/v1/audio/voices` and `/v1/audio/speech` endpoints
   - **Cloud TTS Providers (requiring API keys)**
     - [**Deepinfra**](https://deepinfra.com/models/text-to-speech): Kokoro-82M + models with support for cloned voices and more
     - [**OpenAI API ($$)**](https://platform.openai.com/docs/pricing#transcription-and-speech): tts-1, tts-1-hd, and gpt-4o-mini-tts w/ instructions
-- 🚀 *(New)* **Optimized Next.js TTS Proxy** with audio caching and optimized repeat playback
-- 💾 *(Updated)* **Local-First Architecture** stores documents and more in-browser with Dexie.js
 - 📖 *(Updated)* **Read Along Experience** providing real-time text highlighting during playback (PDF/EPUB)
+  - *(New)* **Word-by-word** highlighting uses per-word timestamps generated server-side with [*whisper.cpp*](https://github.com/ggml-org/whisper.cpp) (optional)
+- 🧠 *(New)* **Smart Sentence-Aware Narration** merges sentences across pages/chapters for smoother TTS
+- 🎧 *(New)* **Reliable Audiobook Export** in **m4b/mp3**, with resumable, chapter-based export and regeneration
+- 🚀 *(New)* **Optimized Next.js TTS Proxy** with audio caching and optimized repeat playback
+- 💾 **Local-First Architecture** stores documents and more in-browser with Dexie.js
 - 🛜 **Optional Server-side documents** using backend `/docstore` for all users
 - 🎨 **Customizable Experience**
   - 🎨 Multiple app theme options
   - ⚙️ Various TTS and document handling settings
   - And more ...
 
-<details>
-<summary>
-
-### 🆕 What's New in v1.0.0
-
-</summary>
-
-- 🧠 **Smart sentence continuation**  
-  - Improved NLP handling of complex structures and quoted dialogue provides more natural sentence boundaries and a smoother audio-text flow.  
-  - EPUB and PDF playback now use smarter sentence splitting and continuation metadata so sentences that cross page/chapter boundaries are merged before hitting the TTS API.  
-  - This yields more natural narration and fewer awkward pauses when a sentence spans multiple pages or EPUB spine items.
-- 📄 **Modernized PDF text highlighting pipeline**  
-  - Real-time PDF text highlighting is now offloaded to a dedicated Web Worker so scrolling and playback controls remain responsive during narration.  
-  - A new overlay-based highlighting system draws independent highlight layers on top of the PDF, avoiding interference with the underlying text layer.  
-  - Upgraded fuzzy matching with Dice-based similarity improves the accuracy of mapping spoken words to on-screen text.  
-  - A new per-device setting lets you enable or disable real-time PDF highlighting during playback for a more tailored reading experience.  
-- 🎧 **Chapter/page-based audiobook export with resume & regeneration**  
-  - Per-chapter/per-page generation to disk with persistent `bookId`  
-  - Resumable generation (can cancel and continue later)  
-  - Per-chapter regeneration & deletion  
-  - Final combined **M4B** or **MP3** download with embedded chapter metadata.  
-- 💾 **Dexie-backed local storage & sync**  
-  - All document types (PDF, EPUB, TXT/MD-as-HTML) and config are stored via a unified Dexie layer on top of IndexedDB.  
-  - Document lists use live Dexie queries (no manual refresh needed), and server sync now correctly includes text/markdown documents as part of the library backup.  
-- 🗣️ **Kokoro multi-voice selection & utilities**  
-  - Kokoro models now support multi-voice combination, with provider-aware limits and helpers (not supported on OpenAI or Deepinfra)
-- ⚡ **Faster, more efficient TTS backend proxy**  
-  - In-memory **LRU caching** for audio responses with configurable size/TTL  
-  - **ETag** support (`304` on cache hits) + `X-Cache` headers (`HIT` / `MISS` / `INFLIGHT`)  
-- 📄 **More robust DOCX → PDF conversion**  
-  - DOCX conversion now uses isolated per-job LibreOffice profiles and temp directories, polls for a stable output file size, and aggressively cleans up temp files.  
-  - This reduces cross-job interference and flakiness when converting multiple DOCX files in parallel.
-- ♿ **Accessibility & layout improvements**  
-  - Dialogs and folder toggles expose proper roles and ARIA attributes.  
-  - PDF/EPUB/HTML readers use a full-height app shell with a sticky bottom TTS bar, improved scrollbars, and refined focus styles.
-- ✅ **End-to-end Playwright test suite with TTS mocks**  
-  - Deterministic TTS responses in tests via a reusable Playwright route mock.  
-  - Coverage for accessibility, upload, navigation, folder management, deletion flows, audiobook generation/export and playback across all document types.
-
-</details>
-
 ## 🐳 Docker Quick Start
 
 ### Prerequisites
@@ -194,6 +154,20 @@ Optionally required for different features:
     ```bash
     brew install libreoffice
     ```
+- [whisper.cpp](https://github.com/ggml-org/whisper.cpp) (optional, required for word-by-word highlighting)
+    ```bash
+    # clone and build whisper.cpp (no model download needed – OpenReader handles that)
+    git clone https://github.com/ggml-org/whisper.cpp.git
+    cd whisper.cpp
+    cmake -B build
+    cmake --build build -j --config Release
+
+    # print the WHISPER_CPP_BIN line to add to your .env so OpenReader can find the compiled whisper-cli binary
+    echo WHISPER_CPP_BIN=\"$(pwd)/build/bin/whisper-cli\"
+    ```
+
+    > **Note:** The `WHISPER_CPP_BIN` path should be set in your `.env` file for OpenReader to use word-by-word highlighting features.
+    
 ### Steps
 
 1. Clone the repository:
 
@@ -1,6 +1,6 @@
 {
   "name": "openreader-webui",
-  "version": "v1.0.1",
+  "version": "v1.1.0",
   "private": true,
   "scripts": {
     "dev": "next dev --turbopack -p 3003",
 
@@ -1,15 +1,16 @@
 import { NextRequest, NextResponse } from 'next/server';
 import { spawn } from 'child_process';
-import { writeFile, readFile, mkdir, unlink, readdir } from 'fs/promises';
+import { writeFile, readFile, mkdir, unlink, readdir, rm } from 'fs/promises';
 import { existsSync, createReadStream } from 'fs';
 import { join } from 'path';
 import { randomUUID } from 'crypto';
+import type { TTSAudioBytes, TTSAudiobookFormat } from '@/types/tts';
 
 interface ConversionRequest {
   chapterTitle: string;
-  buffer: number[];
+  buffer: TTSAudioBytes;
   bookId?: string;
-  format?: 'mp3' | 'm4b';
+  format?: TTSAudiobookFormat;
   chapterIndex?: number;
 }
 
@@ -206,9 +207,12 @@ export async function POST(request: NextRequest) {
     await unlink(inputPath).catch(console.error);
 
     return NextResponse.json({ 
+      index: chapterIndex,
+      title: data.chapterTitle,
+      duration,
+      status: 'completed' as const,
       bookId,
-      chapterIndex,
-      duration
+      format
     });
 
   } catch (error) {
@@ -229,7 +233,7 @@ export async function POST(request: NextRequest) {
 export async function GET(request: NextRequest) {
   try {
     const bookId = request.nextUrl.searchParams.get('bookId');
-    const requestedFormat = request.nextUrl.searchParams.get('format') as 'mp3' | 'm4b' | null;
+    const requestedFormat = request.nextUrl.searchParams.get('format') as TTSAudiobookFormat | null;
     if (!bookId) {
       return NextResponse.json({ error: 'Missing bookId parameter' }, { status: 400 });
     }
@@ -378,4 +382,31 @@ function streamFile(filePath: string, format: string) {
       'Cache-Control': 'no-cache',
     },
   });
-}
+}
+
+/**
+ * DELETE handler: resets an audiobook by removing its `<bookId>-audiobook`
+ * directory, and everything inside it, from the `docstore` folder under the
+ * process working directory.
+ *
+ * Query parameters:
+ * - `bookId` (required): identifier of the audiobook to reset.
+ *
+ * Responses:
+ * - 400 `{ error }` when `bookId` is missing, or contains a path separator or NUL byte.
+ * - 200 `{ success: true, existed }` where `existed` tells whether the directory
+ *   was present when the request was handled.
+ * - 500 `{ error }` when the deletion throws.
+ */
+export async function DELETE(request: NextRequest) {
+  try {
+    const bookId = request.nextUrl.searchParams.get('bookId');
+    if (!bookId) {
+      return NextResponse.json({ error: 'Missing bookId parameter' }, { status: 400 });
+    }
+
+    // bookId comes straight from the query string and is interpolated into a
+    // path that is handed to a recursive, forced delete. Without separators the
+    // joined path is always a direct child of docstoreDir, so rejecting them
+    // (plus NUL, which fs APIs refuse anyway) prevents deleting directories
+    // outside docstore via values such as "x/../../target".
+    if (bookId.includes('/') || bookId.includes('\\') || bookId.includes('\0')) {
+      return NextResponse.json({ error: 'Invalid bookId parameter' }, { status: 400 });
+    }
+
+    const docstoreDir = join(process.cwd(), 'docstore');
+    const intermediateDir = join(docstoreDir, `${bookId}-audiobook`);
+
+    // If directory doesn't exist, consider it already reset
+    if (!existsSync(intermediateDir)) {
+      return NextResponse.json({ success: true, existed: false });
+    }
+
+    // Recursively delete the entire audiobook directory; `force` keeps this
+    // from throwing if the directory disappears between the check and the delete.
+    await rm(intermediateDir, { recursive: true, force: true });
+
+    return NextResponse.json({ success: true, existed: true });
+  } catch (error) {
+    console.error('Error resetting audiobook:', error);
+    return NextResponse.json(
+      { error: 'Failed to reset audiobook' },
+      { status: 500 }
+    );
+  }
+}
@@ -1,7 +1,8 @@
 import { NextRequest, NextResponse } from 'next/server';
-import { readdir, readFile, rm } from 'fs/promises';
+import { readdir, readFile } from 'fs/promises';
 import { existsSync } from 'fs';
 import { join } from 'path';
+import type { TTSAudiobookFormat } from '@/types/tts';
 
 export async function GET(request: NextRequest) {
   try {
@@ -26,7 +27,7 @@ export async function GET(request: NextRequest) {
       duration?: number;
       status: 'completed' | 'error';
       bookId: string;
-      format?: 'mp3' | 'm4b';
+      format?: TTSAudiobookFormat;
     }> = [];
 
     for (const metaFile of metaFiles) {
@@ -68,31 +69,3 @@ export async function GET(request: NextRequest) {
     );
   }
 }
-
-export async function DELETE(request: NextRequest) {
-  try {
-    const bookId = request.nextUrl.searchParams.get('bookId');
-    if (!bookId) {
-      return NextResponse.json({ error: 'Missing bookId parameter' }, { status: 400 });
-    }
-
-    const docstoreDir = join(process.cwd(), 'docstore');
-    const intermediateDir = join(docstoreDir, `${bookId}-audiobook`);
-
-    // If directory doesn't exist, consider it already reset
-    if (!existsSync(intermediateDir)) {
-      return NextResponse.json({ success: true, existed: false });
-    }
-
-    // Recursively delete the entire audiobook directory
-    await rm(intermediateDir, { recursive: true, force: true });
-
-    return NextResponse.json({ success: true, existed: true });
-  } catch (error) {
-    console.error('Error resetting audiobook:', error);
-    return NextResponse.json(
-      { error: 'Failed to reset audiobook' },
-      { status: 500 }
-    );
-  }
-}
@@ -4,7 +4,8 @@ import { SpeechCreateParams } from 'openai/resources/audio/speech.mjs';
 import { isKokoroModel } from '@/utils/voice';
 import { LRUCache } from 'lru-cache';
 import { createHash } from 'crypto';
-import type { TTSRequestPayload, TTSError } from '@/types/tts';
+import type { TTSRequestPayload } from '@/types/client';
+import type { TTSError, TTSAudioBuffer } from '@/types/tts';
 
 export const runtime = 'nodejs';
 
@@ -13,7 +14,7 @@ type ExtendedSpeechParams = Omit<SpeechCreateParams, 'voice'> & {
   voice: SpeechCreateParams['voice'] | CustomVoice;
   instructions?: string;
 };
-type AudioBufferValue = ArrayBuffer;
+type AudioBufferValue = TTSAudioBuffer;
 
 const TTS_CACHE_MAX_SIZE_BYTES = Number(process.env.TTS_CACHE_MAX_SIZE_BYTES || 256 * 1024 * 1024); // 256MB
 const TTS_CACHE_TTL_MS = Number(process.env.TTS_CACHE_TTL_MS || 1000 * 60 * 30); // 30 minutes
@@ -25,7 +26,7 @@ const ttsAudioCache = new LRUCache<string, AudioBufferValue>({
 });
 
 type InflightEntry = {
-  promise: Promise<ArrayBuffer>;
+  promise: Promise<TTSAudioBuffer>;
   controller: AbortController;
   consumers: number;
 };
@@ -40,7 +41,7 @@ async function fetchTTSBufferWithRetry(
   openai: OpenAI,
   createParams: ExtendedSpeechParams,
   signal: AbortSignal
-): Promise<ArrayBuffer> {
+): Promise<TTSAudioBuffer> {
   let attempt = 0;
   const maxRetries = Number(process.env.TTS_MAX_RETRIES ?? 2);
   let delay = Number(process.env.TTS_RETRY_INITIAL_MS ?? 250);
@@ -135,15 +136,15 @@ export async function POST(req: NextRequest) {
       voice: normalizedVoice,
       input: text,
       speed: speed,
-      response_format: format === 'aac' ? 'aac' : 'mp3',
+      response_format: format,
     };
     // Only add instructions if model is gpt-4o-mini-tts and instructions are provided
     if ((model as string) === 'gpt-4o-mini-tts' && instructions) {
       createParams.instructions = instructions;
     }
 
     // Compute cache key and check LRU before making provider call
-    const contentType = format === 'aac' ? 'audio/aac' : 'audio/mpeg';
+    const contentType = 'audio/mpeg';
 
     // Preserve voice string as-is for cache key (no weight stripping)
     const voiceForKey = typeof createParams.voice === 'string'
@@ -245,7 +246,7 @@ export async function POST(req: NextRequest) {
     };
     req.signal.addEventListener('abort', onAbort, { once: true });
 
-    let buffer: ArrayBuffer;
+    let buffer: TTSAudioBuffer;
     try {
       buffer = await entry.promise;
     } finally {
@@ -280,4 +281,4 @@ export async function POST(req: NextRequest) {
       { status: 500 }
     );
   }
-}
+}
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "openreader-webui",`
`3`		`- "version": "v1.0.1",`
	`3`	`+ "version": "v1.1.0",`
`4`	`4`	`"private": true,`
`5`	`5`	`"scripts": {`
`6`	`6`	`"dev": "next dev --turbopack -p 3003",`