SPACESODA
diff --git a/.gitignore b/.gitignore
Lines changed: 0 additions & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 3 additions & 0 deletions
diff --git a/app/epub2txt.js b/app/epub2txt.js
Lines changed: 108 additions & 27 deletions
diff --git a/epub2txt.py b/epub2txt.py
Lines changed: 43 additions & 8 deletions
diff --git a/index.html b/index.html
Lines changed: 2 additions & 1 deletion
@@ -11,7 +11,6 @@ env/
 
 # Content Files
 *.epub
-!/test-data/*.epub
 *.txt
 !requirements.txt
 
 
@@ -20,6 +20,7 @@
 * **Batch Processing**: Convert multiple files or top-level folders in one run (non-recursive).
 * **Formatting**: Adds blank lines between paragraphs.
 * **Text Extraction**: Strips images, styles, scripts, and metadata—keeps only text from `.html/.htm/.xhtml` files.
+* **Smart List Handling**: Converts ordered and unordered lists into clean, indented bullet points, preserving nested structures.
 * **Interactive Mode**: Run without arguments to enter interactive mode. Supports dragging multiple files and folders.
 * **Output Handling**: If `-o` points to a new folder, it will be created; `-o` is for single inputs only.
 
@@ -115,6 +116,7 @@ If you prefer to run it manually or don't want to use the helper scripts:
 * **一括処理**: 複数のファイルやトップレベルのフォルダを一度に変換します (再帰的ではありません)。
 * **整形**: 段落間に空行を追加します。
 * **テキスト抽出**: 画像、スタイル、スクリプト、メタデータを削除し、`.html/.htm/.xhtml` ファイルからテキストのみを保持します。
+* **リストの整形**: 箇条書き・番号付きリストを、階層を保ったまま読みやすく整形します。
 * **インタラクティブモード**: 引数なしで実行するとインタラクティブモードに入ります。複数のファイルやフォルダのドラッグ＆ドロップに対応しています。
 * **出力処理**: `-o` で存在しないフォルダを指定した場合は作成します。`-o` は単一入力専用です。
 
@@ -217,6 +219,7 @@ If you prefer to run it manually or don't want to use the helper scripts:
 * **批量處理**: 一次轉換多個檔案或資料夾 (僅掃描第一層)。
 * **段落排版**: 在段落之間添加空行。
 * **文字提取**: 移除圖片、樣式、腳本與中繼資料，只保留 `.html/.htm/.xhtml` 文字。
+* **列表格式優化**: 自動保留列表的層級結構，並轉換為整齊易讀的縮排格式。
 * **互動模式**: 無參數執行即可進入互動模式，支援拖放多個檔案與資料夾。
 * **輸出處理**: 若 `-o` 指向的新資料夾不存在會自動建立；`-o` 只適用單檔輸入。
 
 
@@ -5,28 +5,28 @@ document.addEventListener('DOMContentLoaded', () => {
 
     const ERRORS = {
         en: {
-            tooLarge: (size) => `File too large. Please use an EPUB under ${size}MB.`,
-            tooManyFiles: "EPUB has too many content files to process safely.",
-            noContent: "No readable HTML/XHTML content found in EPUB.",
-            missingOpf: "Invalid EPUB: OPF file declared in container.xml not found.",
-            invalidEpub: "Invalid EPUB/ZIP file.",
-            invalidOpf: "Invalid EPUB: OPF file is missing required sections."
+            tooLarge: (size) => `File too large. Please use an EPUB under ${size}MB`,
+            tooManyFiles: "EPUB has too many content files to process safely",
+            noContent: "No readable HTML/XHTML content found in EPUB",
+            missingOpf: "Invalid EPUB: OPF file declared in container.xml not found",
+            invalidEpub: "Invalid EPUB/ZIP file",
+            invalidOpf: "Invalid EPUB: OPF file is missing required sections"
         },
         ja: {
-            tooLarge: (size) => `ファイルサイズが大きすぎます。${size}MB未満のEPUBを使用してください。`,
-            tooManyFiles: "EPUBに含まれるコンテンツファイルが多すぎるため、安全に処理できません。",
-            noContent: "EPUB内に読み取り可能なHTML/XHTMLコンテンツが見つかりません。",
-            missingOpf: "無効なEPUBです: container.xmlで指定されたOPFファイルが見つかりません。",
-            invalidEpub: "無効なEPUB/ZIPファイルです。",
-            invalidOpf: "無効なEPUBです: OPFファイルに必要なセクションが欠落しています。"
+            tooLarge: (size) => `ファイルサイズが大きすぎます。${size}MB未満のEPUBを使用してください`,
+            tooManyFiles: "EPUBに含まれるコンテンツファイルが多すぎるため、安全に処理できません",
+            noContent: "EPUB内に読み取り可能なHTML/XHTMLコンテンツが見つかりません",
+            missingOpf: "無効なEPUBです: container.xmlで指定されたOPFファイルが見つかりません",
+            invalidEpub: "無効なEPUB/ZIPファイルです",
+            invalidOpf: "無効なEPUBです: OPFファイルに必要なセクションが欠落しています"
         },
         zh: {
-            tooLarge: (size) => `檔案過大，請使用小於 ${size}MB 的 EPUB。`,
-            tooManyFiles: "此 EPUB 內容檔案過多，無法安全處理。",
-            noContent: "EPUB 中沒有可讀的 HTML/XHTML 內容。",
-            missingOpf: "無效的 EPUB: container.xml 指定的 OPF 檔案不存在。",
-            invalidEpub: "無效的 EPUB/ZIP 檔案。",
-            invalidOpf: "無效的 EPUB: OPF 檔案缺少必要的區段。"
+            tooLarge: (size) => `檔案過大，請使用小於 ${size}MB 的 EPUB`,
+            tooManyFiles: "此 EPUB 內容檔案過多，無法安全處理",
+            noContent: "EPUB 中沒有可讀的 HTML/XHTML 內容",
+            missingOpf: "無效的 EPUB: container.xml 指定的 OPF 檔案不存在",
+            invalidEpub: "無效的 EPUB/ZIP 檔案",
+            invalidOpf: "無效的 EPUB: OPF 檔案缺少必要的區段"
         }
     };
 
@@ -80,8 +80,8 @@ document.addEventListener('DOMContentLoaded', () => {
             packaging: "Packaging ZIP...",
             processingFile: (current, total) => `File ${current}/${total}:`,
             errorPrefix: "Error: ",
-            onlyEpub: "Only .epub files are supported.",
-            genericError: "An unexpected error occurred.",
+            onlyEpub: "Only .epub files are supported",
+            genericError: "An unexpected error occurred",
             convertAnother: "Drag other .epub files to convert",
             selectFile: "select file(s)",
             downloadTxt: "Download TXT",
@@ -98,8 +98,8 @@ document.addEventListener('DOMContentLoaded', () => {
             packaging: "ZIPを作成中...",
             processingFile: (current, total) => `ファイル ${current}/${total}:`,
             errorPrefix: "エラー: ",
-            onlyEpub: ".epubファイルのみ対応しています。",
-            genericError: "予期しないエラーが発生しました。",
+            onlyEpub: ".epubファイルのみ対応しています",
+            genericError: "予期しないエラーが発生しました",
             convertAnother: "他の .epub ファイルをドラッグして変換",
             selectFile: "ファイルを選択",
             downloadTxt: "TXTをダウンロード",
@@ -116,8 +116,8 @@ document.addEventListener('DOMContentLoaded', () => {
             packaging: "正在打包 ZIP...",
             processingFile: (current, total) => `檔案 ${current}/${total}:`,
             errorPrefix: "錯誤: ",
-            onlyEpub: "請選擇 .epub 檔案。",
-            genericError: "發生未預期的錯誤。",
+            onlyEpub: "僅支援 .epub 檔案",
+            genericError: "發生未預期的錯誤",
             convertAnother: "拖放其他 .epub 檔案以轉換",
             selectFile: "選擇檔案",
             downloadTxt: "下載 TXT",
@@ -279,7 +279,13 @@ document.addEventListener('DOMContentLoaded', () => {
             // Handle missing files in spine gracefully
             let content;
             try {
-                content = await zip.file(path).async("string");
+                const entry = zip.file(path);
+                if (!entry) {
+                    console.warn("Could not read file:", path);
+                    continue;
+                }
+                const bytes = await entry.async("uint8array");
+                content = decodeBytesToString(bytes);
             } catch (e) {
                 console.warn("Could not read file:", path);
                 continue;
@@ -342,6 +348,44 @@ document.addEventListener('DOMContentLoaded', () => {
         return stack.join('/');
     }
 
+    function decodeBytesToString(bytes) { // Decode raw bytes to a string, using the encoding that sniffEncoding reports for them.
+        const encoding = sniffEncoding(bytes) || 'utf-8'; // A null result (no declaration found) falls back to UTF-8.
+        try {
+            return new TextDecoder(encoding).decode(bytes);
+        } catch (e) { // Reached when the TextDecoder call above throws, e.g. for an encoding label it does not recognise.
+            return new TextDecoder('utf-8').decode(bytes); // Best-effort retry as UTF-8; the caught error is not logged.
+        }
+    }
+
+    function sniffEncoding(bytes) { // Search the start of the bytes for an encoding declaration; returns a normalized name or null.
+        if (!bytes || !bytes.length) return null; // Missing or empty input: nothing to inspect.
+        const headerBytes = bytes.subarray(0, 2048); // Only the first 2048 bytes are examined.
+        let headerText = '';
+        try {
+            headerText = new TextDecoder('utf-8').decode(headerBytes); // Provisional UTF-8 decode, used only for the pattern searches below.
+        } catch (e) { // NOTE(review): no byte-order-mark check happens before this decode, so UTF-16 input presumably never matches below - confirm.
+            return null;
+        }
+
+        const xmlMatch = headerText.match(/<\?xml[^>]*encoding=["']([^"']+)["']/i); // First choice: encoding attribute of an XML declaration.
+        if (xmlMatch) return normalizeEncodingName(xmlMatch[1]);
+
+        const metaCharsetMatch = headerText.match(/<meta[^>]*charset=["']?\s*([^"'\s/>]+)/i); // Second choice: charset= inside a meta tag, quoted or unquoted.
+        if (metaCharsetMatch) return normalizeEncodingName(metaCharsetMatch[1]);
+
+        const metaHttpEquivMatch = headerText.match(/<meta[^>]*http-equiv=["']content-type["'][^>]*content=["'][^"']*charset=([^"']+)["']/i); // Third choice: http-equiv content-type meta. NOTE(review): the previous pattern looks like it already matches this form - confirm this branch is reachable.
+        if (metaHttpEquivMatch) return normalizeEncodingName(metaHttpEquivMatch[1]);
+
+        return null; // No declaration found in the scanned header.
+    }
+
+    function normalizeEncodingName(name) { // Canonicalise an encoding label: trimmed, lower-cased, underscores replaced by hyphens; null for falsy input.
+        if (!name) return null;
+        const cleaned = String(name).trim().toLowerCase().replace(/_/g, '-'); // NOTE(review): this also rewrites labels whose standard spelling contains an underscore (e.g. ks_c_5601-1987) - confirm TextDecoder still accepts the hyphenated result.
+        if (cleaned === 'utf8') return 'utf-8'; // Map the hyphen-less spelling to utf-8.
+        return cleaned;
+    }
+
     function resolveZipPath(opfDir, href) {
         const cleaned = href.split('#')[0];
         if (!cleaned) return null;
@@ -610,7 +654,17 @@ document.addEventListener('DOMContentLoaded', () => {
         return combined;
     }
 
-    function collectTextSegments(element, inPre = false, segments = [], state = null) {
+    /**
+     * Recursive function to traverse the DOM and collect text segments.
+     * Mirrors the logic in the Python script's `get_clean_text`.
+     * 
+     * @param {Node} element - The DOM node to traverse.
+     * @param {boolean} inPre - Whether the current node is inside a <pre> tag.
+     * @param {Array} segments - Accumulator for text segments.
+     * @param {Object} state - Tracks state across recursion (e.g., hasContent, lastWasSeparator).
+     * @param {number} listDepth - Current nesting level of lists (for indentation).
+     */
+    function collectTextSegments(element, inPre = false, segments = [], state = null, listDepth = 0) {
         if (!element) return segments;
         if (!state) {
             state = { hasContent: false, lastWasSeparator: false };
@@ -645,6 +699,25 @@ document.addEventListener('DOMContentLoaded', () => {
                     return;
                 }
 
+                // Handle Lists
+                if (tagName === 'UL' || tagName === 'OL') {
+                    if (!inPre) pushSegment("\n", false);
+                    collectTextSegments(node, inPre, segments, state, listDepth + 1);
+                    if (!inPre) pushSegment("\n", false);
+                    return;
+                }
+
+                if (tagName === 'LI') {
+                    if (!inPre) {
+                        pushSegment("\n", false);
+                        const indent = "  ".repeat(Math.max(0, listDepth - 1));
+                        pushSegment(indent + "- ", true);
+                    }
+                    collectTextSegments(node, inPre, segments, state, listDepth);
+                    if (!inPre) pushSegment("\n", false);
+                    return;
+                }
+
                 const headingLevel = HEADING_TAGS[tagName];
                 if (headingLevel && !inPre) {
                     const headingText = node.textContent.replace(/\s+/g, ' ').trim();
@@ -664,7 +737,7 @@ document.addEventListener('DOMContentLoaded', () => {
                     pushSegment("\n", false);
                 }
 
-                collectTextSegments(node, nextPre, segments, state);
+                collectTextSegments(node, nextPre, segments, state, listDepth);
 
                 if (isBlock && !inPre) {
                     pushSegment("\n", false);
@@ -698,6 +771,10 @@ document.addEventListener('DOMContentLoaded', () => {
         return elements.length ? elements[0] : null;
     }
 
+    /**
+     * Handles the creation of a temporary Object URL for downloading.
+     * Revokes any existing URL to prevent memory leaks before creating a new one.
+     */
     function prepareBlobDownload(blob, filename, downloadType) {
         safeRevokeBlob();
         currentBlobUrl = URL.createObjectURL(blob);
@@ -769,6 +846,10 @@ document.addEventListener('DOMContentLoaded', () => {
         }
     }
 
+    /**
+     * Generates a unique filename by appending a counter if the name already exists.
+     * e.g., "book.txt" -> "book (2).txt" -> "book (3).txt"
+     */
     function makeUniqueFilename(name, usedNames) {
         if (!usedNames.has(name)) return name;
         const dotIndex = name.lastIndexOf('.');
 
@@ -126,15 +126,20 @@ def parse_opf(zip_ref: zipfile.ZipFile, opf_path: str):
     root = ET.fromstring(opf_content)
 
     # Create the OPF namespace map dynamically to handle varying versions (2.0 vs 3.0).
-    # Grabs the namespace from the root tag itself.
-    ns = {'pkg': root.tag.split('}')[0].strip('{')}
+    # Some OPFs are un-namespaced; handle both cases.
+    has_namespace = '}' in root.tag
+    ns = {'pkg': root.tag.split('}')[0].strip('{')} if has_namespace else {}
 
     # 1. Parse Manifest: Map ID -> Href (File Path)
     # Creates a dictionary where valid IDs point to their actual file locations.
     manifest_items = {}
     nav_href = None
     ncx_href = None
-    for item in root.findall(".//pkg:manifest/pkg:item", ns):
+    if has_namespace:
+        manifest_items_iter = root.findall(".//pkg:manifest/pkg:item", ns)
+    else:
+        manifest_items_iter = root.findall(".//manifest/item")
+    for item in manifest_items_iter:
         item_id = item.attrib.get('id')
         href = item.attrib.get('href')
         if not item_id or not href:
@@ -150,12 +155,16 @@ def parse_opf(zip_ref: zipfile.ZipFile, opf_path: str):
     # 2. Parse Spine: Get linear reading order
     # The spine tells the parser the order in which to display the items found in the manifest.
     spine_hrefs = []
-    spine = root.find(".//pkg:spine", ns)
+    spine = root.find(".//pkg:spine", ns) if has_namespace else root.find(".//spine")
     if spine is not None:
         toc_id = spine.attrib.get('toc')
         if toc_id and toc_id in manifest_items:
             ncx_href = manifest_items[toc_id]
-        for itemref in spine.findall(".//pkg:itemref", ns):
+        if has_namespace:
+            spine_items = spine.findall(".//pkg:itemref", ns)
+        else:
+            spine_items = spine.findall(".//itemref")
+        for itemref in spine_items:
             item_id = itemref.attrib.get('idref')
             if item_id in manifest_items:
                 spine_hrefs.append(manifest_items[item_id])
@@ -396,7 +405,7 @@ def epub_to_text(epub_path: str, output_txt_path: str) -> None:
                             element.decompose()
 
                         # Step 4: Extract text
-                        # Use our custom function to handle spacing intelligently
+                        # Use helper function to handle spacing intelligently
                         normalized_path = posixpath.normpath(file_path)
                         anchor_ids = chapter_anchors.get(normalized_path, [])
                         insert_anchor_markers(soup, anchor_ids)
@@ -469,6 +478,11 @@ def get_clean_text(soup: BeautifulSoup) -> str:
     """
     Extract text from BeautifulSoup object with intelligent whitespace handling.
     Preserves sentence structure for LLMs while maintaining paragraph separation.
+    
+    This function traverses the DOM tree recursively:
+    - Block elements (p, div, etc.) trigger line breaks.
+    - Lists are flattened with indentation to preserve hierarchy.
+    - Script/Style/Meta tags are ignored.
     """
     root = soup.body or soup
     if not root:
@@ -490,7 +504,7 @@ def add_separator():
             parts.append(("\n\n---\n\n", False))
             state['last_sep'] = True
 
-    def walk(node, in_pre: bool = False):
+    def walk(node, in_pre: bool = False, list_depth: int = 0):
         for child in node.children:
             if isinstance(child, NavigableString):
                 text = str(child)
@@ -513,6 +527,27 @@ def walk(node, in_pre: bool = False):
                 if name == 'br':
                     add_text("\n", in_pre)
                     continue
+
+                # Handle Lists
+                if name in ('ul', 'ol'):
+                    if not in_pre:
+                        add_text("\n", False)
+                    walk(child, in_pre, list_depth + 1)
+                    if not in_pre:
+                        add_text("\n", False)
+                    continue
+
+                if name == 'li':
+                    if not in_pre:
+                        add_text("\n", False)
+                        # Indent based on depth (depth 1 = no indent, depth 2 = 2 spaces, etc.)
+                        indent = "  " * max(0, list_depth - 1)
+                        add_text(indent + "- ", True)
+                    walk(child, in_pre, list_depth)
+                    if not in_pre:
+                        add_text("\n", False)
+                    continue
+
                 heading_level = HEADING_TAGS.get(name)
                 if heading_level and not in_pre:
                     heading_text = child.get_text(" ", strip=True)
@@ -528,7 +563,7 @@ def walk(node, in_pre: bool = False):
                 if is_block and not in_pre:
                     add_text("\n", False)
 
-                walk(child, next_pre)
+                walk(child, next_pre, list_depth)
 
                 if is_block and not in_pre:
                     add_text("\n", False)
 
@@ -10,6 +10,7 @@
     <link rel="icon" type="image/png" sizes="16x16" href="assets/favicon-16x16.png">
     <link rel="shortcut icon" href="assets/favicon.ico">
     <title>epub2txt - Convert EPUB to Text</title>
+    <link rel="canonical" href="https://spacesoda.github.io/epub2txt/" />
     <link rel="alternate" hreflang="en" href="https://spacesoda.github.io/epub2txt/" />
     <link rel="alternate" hreflang="ja" href="https://spacesoda.github.io/epub2txt/ja/" />
     <link rel="alternate" hreflang="zh-TW" href="https://spacesoda.github.io/epub2txt/zh/" />
@@ -130,7 +131,7 @@ <h2 id="success-filename">book.txt</h2>
                     </div>
                     <div id="error-state" class="hidden">
                         <div class="icon error">⚠️</div>
-                        <p id="error-msg">Only .epub files are supported.</p>
+                        <p id="error-msg">An unexpected error occurred</p>
                         <button id="retry-btn" class="cta-button secondary">Try Again</button>
                     </div>
                 </div>