imadreamerboy · mspinolaeie · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,27 +1,82 @@
 name: CI
 
-on:
-  push:
-    branches: [main, master, dev]
-  pull_request:
-    branches: [main, master, dev]
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v4
-        with:
+on:
+  push:
+    branches: [main, master, dev]
+  pull_request:
+    branches: [main, master, dev]
+  workflow_dispatch:  # manual trigger
+    inputs:
+      run_packaging_smoke:  # checked by the packaging-smoke job's `if:` condition
+        description: "Run the slow PyInstaller smoke test"
+        required: false
+        default: false
+        type: boolean
+
+concurrency:  # one active run per workflow and ref
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true  # a newer run on the same ref cancels the older one
+
+jobs:
+  fast-tests:  # everything not marked runtime_ui, pdf_real, or packaging
+    name: Fast Tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5  # v5 runs on Node 20 like checkout@v4; v4 targets the deprecated Node 16 runtime
+        with:
           python-version: '3.10'
       - name: Install system dependencies
         run: |
           sudo apt-get update
           sudo apt-get install -y libgl1 libegl1 libxkbcommon-x11-0
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install ".[test]"
-      - name: Run tests
-        run: |
-          pytest -q
+      - name: Install dependencies  # the project plus its "test" extra
+        run: |
+          python -m pip install --upgrade pip
+          pip install ".[test]"
+      - name: Run fast tests
+        run: |
+          pytest -q -m "not runtime_ui and not pdf_real and not packaging"
+
+  integration-tests:  # runs only the tests marked runtime_ui or pdf_real
+    name: Runtime And Real PDF Tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5  # v5 runs on Node 20 like checkout@v4; v4 targets the deprecated Node 16 runtime
+        with:
+          python-version: '3.10'  # quoted so YAML does not read it as the float 3.1
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libgl1 libegl1 libxkbcommon-x11-0
+      - name: Install dependencies  # the project plus its "test" extra
+        run: |
+          python -m pip install --upgrade pip
+          pip install ".[test]"
+      - name: Run runtime and real PDF tests
+        run: |
+          pytest -q -m "runtime_ui or pdf_real"
+
+  packaging-smoke:  # opt-in: only on manual dispatch with run_packaging_smoke enabled
+    name: Packaging Smoke
+    if: ${{ github.event_name == 'workflow_dispatch' && inputs.run_packaging_smoke }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5  # v5 runs on Node 20 like checkout@v4; v4 targets the deprecated Node 16 runtime
+        with:
+          python-version: '3.10'  # quoted so YAML does not read it as the float 3.1
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libgl1 libegl1 libxkbcommon-x11-0
+      - name: Install dependencies  # this job installs the "dev" extra, not "test"
+        run: |
+          python -m pip install --upgrade pip
+          pip install ".[dev]"
+      - name: Run packaging smoke test
+        env:
+          MARKITDOWNGUI_RUN_PYINSTALLER_SMOKE: "1"  # presumably the opt-in switch read by the packaging tests — confirm
+        run: |
+          pytest -q -m packaging
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -1,9 +1,13 @@
 name: Build and Release
 
-on:
-  push:
-    tags:
-      - 'v*'  # Runs when a tag like v1.0.0 is pushed
+on:
+  push:
+    tags:
+      - 'v*'  # Runs when a tag like v1.0.0 is pushed
+
+concurrency:  # one active run per workflow and tag ref
+  group: release-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true  # NOTE(review): re-pushing a tag cancels a release still in progress — confirm intended
 
 permissions:
   contents: write
@@ -60,11 +64,12 @@ jobs:
           python -m zipfile -c "dist_output/${ARTIFACT_NAME}" dist/MarkItDown
         shell: bash
 
-      - name: Upload artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: MarkItDown-${{ runner.os }}
-          path: dist_output/*.zip
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: MarkItDown-${{ runner.os }}  # artifact name carries the runner OS
+          path: dist_output/*.zip
+          retention-days: 1  # expires after one day; presumably only the release job consumes it — confirm
 
   release:
     name: Create GitHub Release

diff --git a/MarkItDown.spec b/MarkItDown.spec
@@ -1,15 +1,16 @@
 # -*- mode: python ; coding: utf-8 -*-
-import os
-from PyInstaller.utils.hooks import collect_data_files, collect_submodules
-from markitdowngui.build_config import build_datas, build_hiddenimports
-
-hiddenimports = build_hiddenimports(collect_submodules, warn=print)
-datas = build_datas(collect_data_files, warn=print)
+import os
+from PyInstaller.utils.hooks import collect_data_files, collect_dynamic_libs, collect_submodules
+from markitdowngui.build_config import build_binaries, build_datas, build_hiddenimports
+
+hiddenimports = build_hiddenimports(collect_submodules, warn=print)  # warnings are printed to the build log
+datas = build_datas(collect_data_files, warn=print)
+binaries = build_binaries(collect_dynamic_libs, warn=print)  # handed to Analysis(binaries=...) below
 
 a = Analysis(
     ["markitdowngui/main.py"],
     pathex=[],
-    binaries=[],
+    binaries=binaries,
     datas=datas,
     hiddenimports=hiddenimports,
     hookspath=[],

diff --git a/README.md b/README.md
@@ -7,17 +7,20 @@ It focuses on fast multi-file conversion to Markdown with a modern Fluent-style
 
 ![Current UI screenshot](image.png)
 
-## Features
-
-- Queue-based file workflow with drag and drop.
-- Batch conversion with start, pause/resume, cancel, and progress feedback.
-- Results view with per-file selection and Markdown preview.
-- Preview modes: rendered Markdown view and raw Markdown view.
-- Save modes: export as one combined file or separate files.
-- Quick actions: copy Markdown, save output, back to queue, start over.
-- Optional OCR for scanned PDFs and image files, with Azure Document Intelligence first and local Tesseract fallback.
-- Settings for output folder, batch size, header style, table style, OCR, and theme mode (light/dark/system).
-- Built-in shortcuts dialog, update check action, and about dialog.
+## Features
+
+- Queue-based file workflow with drag and drop.
+- Batch conversion with start, pause/resume, cancel, and progress feedback.
+- Results view with per-file selection and Markdown preview.
+- Preview modes: rendered Markdown view and raw Markdown view.
+- Save modes: export as one combined file or separate files.
+- Quick actions: copy Markdown, save output, back to queue, start over.
+- Optional OCR for scanned PDFs and image files, with Azure Document Intelligence first and local Tesseract fallback.
+- PDF pipeline toggle: `MarkItDown` for conservative wrapper behavior, or `PyMuPDF` for alternate PDF parsing.
+- Optional `Preserve PDF images in Markdown` export with `separate` or `single` asset layouts.
+- `PyMuPDF` PDF mode can place extracted images near the closest preceding text block and show them in preview before save.
+- Settings for output folder, batch size, header style, table style, OCR, PDF behavior, and theme mode (light/dark/system).
+- Built-in shortcuts dialog, update check action, and about dialog.
 
 ## Installation
 
@@ -40,14 +43,24 @@ Alternative:
 pip install -e .[dev]
 ```
 
-### OCR Notes
-
-- OCR is optional and disabled by default.
-- Local OCR requires a system `tesseract` binary. Install it from the [official Tesseract project](https://github.com/tesseract-ocr/tesseract). If it is not on your `PATH`, set the executable path in Settings.
-- Azure OCR requires an Azure Document Intelligence endpoint in Settings.
-- Azure Document Intelligence pricing includes [500 free pages per month](https://azure.microsoft.com/en-us/products/ai-foundry/tools/document-intelligence#Pricing) at the time of writing.
-- For API-key auth, set `AZURE_OCR_API_KEY`.
-- If `AZURE_OCR_API_KEY` is not set, Azure OCR falls back to Azure identity credentials supported by `DefaultAzureCredential`.
+### OCR Notes
+
+- OCR is optional and disabled by default.
+- Local OCR requires a system `tesseract` binary. Install it from the [official Tesseract project](https://github.com/tesseract-ocr/tesseract). If it is not on your `PATH`, set the executable path in Settings.
+- Azure OCR requires an Azure Document Intelligence endpoint in Settings.
+- Azure Document Intelligence pricing includes [500 free pages per month](https://azure.microsoft.com/en-us/products/ai-foundry/tools/document-intelligence#Pricing) at the time of writing.
+- For API-key auth, set `AZURE_OCR_API_KEY`.
+- If `AZURE_OCR_API_KEY` is not set, Azure OCR falls back to Azure identity credentials supported by `DefaultAzureCredential`.
+- `PyMuPDF` is used internally for local PDF OCR, PDF image extraction, and the alternate PDF parsing pipeline.
+
+### PDF Notes
+
+- `MarkItDown` remains the default PDF pipeline and keeps the conservative wrapper behavior.
+- `PyMuPDF` is the alternate PDF pipeline and is the only mode that supports best-effort inline placement of extracted PDF images near the closest preceding text block.
+- Inline image placement is based on page coordinates and is best-effort, not a full layout reconstruction.
+- If no reliable preceding text block is found, the image is placed at the end of that page instead of in a global trailing image section.
+- If PDF conversion falls back to Azure OCR or local OCR text extraction, inline image placement is not preserved because that fallback path does not carry a shared page-layout model.
+- When image preservation is enabled, extracted images are saved as files and linked from the Markdown; preview materializes those assets before final save.
 
 ## Run the App
 
@@ -68,13 +81,14 @@ uv run python -m markitdowngui.main
 
 ## Build a Standalone Executable
 
-```sh
-uv pip install -e .[dev]
-pyinstaller MarkItDown.spec --clean --noconfirm
-```
-
-The default spec builds an `onedir` app in `dist/MarkItDown/`.
-Release workflows package this folder into platform-specific `.zip` artifacts.
+```sh
+uv pip install -e .[dev]
+pyinstaller MarkItDown.spec --clean --noconfirm
+```
+
+The default spec builds an `onedir` app in `dist/MarkItDown/`.
+The bundled spec also collects the runtime pieces required for `PyMuPDF` / `fitz`.
+Release workflows package this folder into platform-specific `.zip` artifacts.
 
 ## License
 
@@ -92,14 +106,14 @@ This follows the non-commercial licensing requirements of `PySide6-Fluent-Widget
 uv pip install -e .[dev]
 ```
 
-3. Make your changes.
-4. Run tests:
-
-```sh
-uv run pytest -q
-```
-
-5. Open a pull request with a clear summary.
+3. Make your changes.
+4. Run tests:
+
+```sh
+python -m pytest
+```
+
+5. Open a pull request with a clear summary.
 
 ## Credits
 

diff --git a/markitdowngui/build_config.py b/markitdowngui/build_config.py
@@ -13,24 +13,30 @@
     "markitdown",
     "charset_normalizer",
 )
-OPTIONAL_HIDDENIMPORT_PACKAGES = (
-    "azure.ai.documentintelligence",
-    "azure.identity",
-    "pypdfium2",
-    "pypdfium2_raw",
-    "pytesseract",
-)
+OPTIONAL_HIDDENIMPORT_PACKAGES = (
+    "azure.ai.documentintelligence",
+    "azure.identity",
+    "fitz",  # PyMuPDF's older import name; listed alongside "pymupdf"
+    "pymupdf",
+    "pypdfium2",
+    "pypdfium2_raw",
+    "pytesseract",
+)
 BASE_DATAS = (
     ("markitdowngui/resources/markitdown-gui.ico", "markitdowngui/resources"),
     ("markitdowngui/resources/moon.svg", "markitdowngui/resources"),
     ("markitdowngui/resources/sun.svg", "markitdowngui/resources"),
     ("LICENSE", "."),
 )
-OPTIONAL_DATA_PACKAGES = (
-    "magika",
-    "pypdfium2",
-    "pypdfium2_raw",
-)
+OPTIONAL_DATA_PACKAGES = (  # packages build_datas tries to collect data files from
+    "magika",
+    "pypdfium2",
+    "pypdfium2_raw",
+)
+OPTIONAL_BINARY_PACKAGES = (  # packages build_binaries collects dynamic libraries from
+    "fitz",  # PyMuPDF's older import name; listed alongside "pymupdf"
+    "pymupdf",
+)
 
 
 def _dedupe(items: list[str]) -> list[str]:
@@ -59,11 +65,11 @@ def build_hiddenimports(
     return _dedupe(hiddenimports)
 
 
-def build_datas(
-    collect_data_files: Callable[[str], list[tuple[str, str]]],
-    *,
-    warn: Callable[[str], None] | None = None,
-) -> list[tuple[str, str]]:
+def build_datas(
+    collect_data_files: Callable[[str], list[tuple[str, str]]],
+    *,
+    warn: Callable[[str], None] | None = None,
+) -> list[tuple[str, str]]:
     datas = list(BASE_DATAS)
 
     for package in OPTIONAL_DATA_PACKAGES:
@@ -72,5 +78,24 @@ def build_datas(
         except Exception as exc:
             if warn is not None:
                 warn(f"Warning: Could not collect data files for {package}: {exc}")
-
-    return datas
+
+    return datas
+
+
+def build_binaries(  # gathers dynamic libraries for every name in OPTIONAL_BINARY_PACKAGES
+    collect_dynamic_libs: Callable[[str], list[tuple[str, str]]],  # called once per package name
+    *,
+    warn: Callable[[str], None] | None = None,  # optional sink for warning text; None stays silent
+) -> list[tuple[str, str]]:
+    binaries: list[tuple[str, str]] = []
+
+    for package in OPTIONAL_BINARY_PACKAGES:
+        try:
+            binaries.extend(collect_dynamic_libs(package))
+        except Exception as exc:  # best-effort: a failing package is skipped, not fatal
+            if warn is not None:
+                warn(
+                    f"Warning: Could not collect dynamic libraries for {package}: {exc}"
+                )
+
+    return binaries  # entries from the packages that succeeded, in package order