Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 75 additions & 20 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,27 +1,82 @@
name: CI

on:
push:
branches: [main, master, dev]
pull_request:
branches: [main, master, dev]

jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
# Triggers: pushes and pull requests targeting main, master or dev, plus a
# manual run (workflow_dispatch) that exposes one optional boolean input.
on:
push:
branches: [main, master, dev]
pull_request:
branches: [main, master, dev]
workflow_dispatch:
inputs:
# Off by default; the packaging-smoke job's `if:` condition reads this input.
run_packaging_smoke:
description: "Run the slow PyInstaller smoke test"
required: false
default: false
type: boolean

# One concurrency group per workflow + ref; a newer run in the same group
# cancels the run that is still in progress.
concurrency:
group: ci-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
fast-tests:
name: Fast Tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libgl1 libegl1 libxkbcommon-x11-0
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ".[test]"
- name: Run tests
run: |
pytest -q
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ".[test]"
- name: Run fast tests
run: |
pytest -q -m "not runtime_ui and not pdf_real and not packaging"

# Runs only the tests selected by the `runtime_ui` or `pdf_real` pytest markers.
integration-tests:
name: Runtime And Real PDF Tests
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# The version is quoted so YAML keeps it as the string '3.10' (unquoted it would parse as the float 3.1).
- uses: actions/setup-python@v4
with:
python-version: '3.10'
# Native libraries installed through apt before the Python dependencies.
# NOTE(review): presumably needed by the GUI toolkit at import time - confirm.
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libgl1 libegl1 libxkbcommon-x11-0
# Installs the project itself together with its `test` extra.
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ".[test]"
# Marker expression selects tests tagged runtime_ui or pdf_real.
- name: Run runtime and real PDF tests
run: |
pytest -q -m "runtime_ui or pdf_real"

# Opt-in job: runs only when the workflow is started manually (workflow_dispatch)
# with the run_packaging_smoke input enabled.
packaging-smoke:
name: Packaging Smoke
if: ${{ github.event_name == 'workflow_dispatch' && inputs.run_packaging_smoke }}
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# The version is quoted so YAML keeps it as the string '3.10' (unquoted it would parse as the float 3.1).
- uses: actions/setup-python@v4
with:
python-version: '3.10'
# Native libraries installed through apt before the Python dependencies.
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libgl1 libegl1 libxkbcommon-x11-0
# Installs the `dev` extra here rather than `test`.
# NOTE(review): presumably because the dev extra provides PyInstaller - confirm in the project metadata.
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ".[dev]"
# Sets MARKITDOWNGUI_RUN_PYINSTALLER_SMOKE=1 for this step and runs only tests marked `packaging`.
# NOTE(review): presumably the test suite skips the smoke test unless this variable is set - confirm.
- name: Run packaging smoke test
env:
MARKITDOWNGUI_RUN_PYINSTALLER_SMOKE: "1"
run: |
pytest -q -m packaging
23 changes: 14 additions & 9 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
name: Build and Release

on:
push:
tags:
- 'v*' # Runs when a tag like v1.0.0 is pushed
on:
push:
tags:
- 'v*' # Runs when a tag like v1.0.0 is pushed

concurrency:
group: release-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: write
Expand Down Expand Up @@ -60,11 +64,12 @@ jobs:
python -m zipfile -c "dist_output/${ARTIFACT_NAME}" dist/MarkItDown
shell: bash

- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: MarkItDown-${{ runner.os }}
path: dist_output/*.zip
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: MarkItDown-${{ runner.os }}
path: dist_output/*.zip
retention-days: 1

release:
name: Create GitHub Release
Expand Down
15 changes: 8 additions & 7 deletions MarkItDown.spec
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# -*- mode: python ; coding: utf-8 -*-
import os
from PyInstaller.utils.hooks import collect_data_files, collect_submodules
from markitdowngui.build_config import build_datas, build_hiddenimports

hiddenimports = build_hiddenimports(collect_submodules, warn=print)
datas = build_datas(collect_data_files, warn=print)
import os
from PyInstaller.utils.hooks import collect_data_files, collect_dynamic_libs, collect_submodules
from markitdowngui.build_config import build_binaries, build_datas, build_hiddenimports

hiddenimports = build_hiddenimports(collect_submodules, warn=print)
datas = build_datas(collect_data_files, warn=print)
binaries = build_binaries(collect_dynamic_libs, warn=print)

a = Analysis(
["markitdowngui/main.py"],
pathex=[],
binaries=[],
binaries=binaries,
datas=datas,
hiddenimports=hiddenimports,
hookspath=[],
Expand Down
82 changes: 48 additions & 34 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,20 @@ It focuses on fast multi-file conversion to Markdown with a modern Fluent-style

![Current UI screenshot](image.png)

## Features

- Queue-based file workflow with drag and drop.
- Batch conversion with start, pause/resume, cancel, and progress feedback.
- Results view with per-file selection and Markdown preview.
- Preview modes: rendered Markdown view and raw Markdown view.
- Save modes: export as one combined file or separate files.
- Quick actions: copy Markdown, save output, back to queue, start over.
- Optional OCR for scanned PDFs and image files, with Azure Document Intelligence first and local Tesseract fallback.
- Settings for output folder, batch size, header style, table style, OCR, and theme mode (light/dark/system).
- Built-in shortcuts dialog, update check action, and about dialog.
## Features

- Queue-based file workflow with drag and drop.
- Batch conversion with start, pause/resume, cancel, and progress feedback.
- Results view with per-file selection and Markdown preview.
- Preview modes: rendered Markdown view and raw Markdown view.
- Save modes: export as one combined file or separate files.
- Quick actions: copy Markdown, save output, back to queue, start over.
- Optional OCR for scanned PDFs and image files, with Azure Document Intelligence first and local Tesseract fallback.
- PDF pipeline toggle: `MarkItDown` for conservative wrapper behavior, or `PyMuPDF` for alternate PDF parsing.
- Optional `Preserve PDF images in Markdown` export with `separate` or `single` asset layouts.
- `PyMuPDF` PDF mode can place extracted images near the closest preceding text block and show them in the preview before saving.
- Settings for output folder, batch size, header style, table style, OCR, PDF behavior, and theme mode (light/dark/system).
- Built-in shortcuts dialog, update check action, and about dialog.

## Installation

Expand All @@ -40,14 +43,24 @@ Alternative:
pip install -e .[dev]
```

### OCR Notes

- OCR is optional and disabled by default.
- Local OCR requires a system `tesseract` binary. Install it from the [official Tesseract project](https://github.com/tesseract-ocr/tesseract). If it is not on your `PATH`, set the executable path in Settings.
- Azure OCR requires an Azure Document Intelligence endpoint in Settings.
- Azure Document Intelligence pricing includes [500 free pages per month](https://azure.microsoft.com/en-us/products/ai-foundry/tools/document-intelligence#Pricing) at the time of writing.
- For API-key auth, set `AZURE_OCR_API_KEY`.
- If `AZURE_OCR_API_KEY` is not set, Azure OCR falls back to Azure identity credentials supported by `DefaultAzureCredential`.
### OCR Notes

- OCR is optional and disabled by default.
- Local OCR requires a system `tesseract` binary. Install it from the [official Tesseract project](https://github.com/tesseract-ocr/tesseract). If it is not on your `PATH`, set the executable path in Settings.
- Azure OCR requires an Azure Document Intelligence endpoint in Settings.
- Azure Document Intelligence pricing includes [500 free pages per month](https://azure.microsoft.com/en-us/products/ai-foundry/tools/document-intelligence#Pricing) at the time of writing.
- For API-key auth, set `AZURE_OCR_API_KEY`.
- If `AZURE_OCR_API_KEY` is not set, Azure OCR falls back to Azure identity credentials supported by `DefaultAzureCredential`.
- `PyMuPDF` is used internally for local PDF OCR, PDF image extraction, and the alternate PDF parsing pipeline.

### PDF Notes

- `MarkItDown` remains the default PDF pipeline and keeps the conservative wrapper behavior.
- `PyMuPDF` is the alternate PDF pipeline and is the only mode that supports best-effort inline placement of extracted PDF images near the closest preceding text block.
- Inline image placement is based on page coordinates and is best-effort, not a full layout reconstruction.
- If no reliable preceding text block is found, the image is placed at the end of that page instead of in a global trailing image section.
- If PDF conversion falls back to Azure OCR or local OCR text extraction, inline image placement is not preserved because that fallback path does not carry a shared page-layout model.
- When image preservation is enabled, extracted images are saved as files and linked from the Markdown; the preview materializes those assets before the final save.

## Run the App

Expand All @@ -68,13 +81,14 @@ uv run python -m markitdowngui.main

## Build a Standalone Executable

```sh
uv pip install -e .[dev]
pyinstaller MarkItDown.spec --clean --noconfirm
```

The default spec builds an `onedir` app in `dist/MarkItDown/`.
Release workflows package this folder into platform-specific `.zip` artifacts.
```sh
uv pip install -e .[dev]
pyinstaller MarkItDown.spec --clean --noconfirm
```

The default spec builds an `onedir` app in `dist/MarkItDown/`.
The bundled spec also collects the runtime pieces required for `PyMuPDF` / `fitz`.
Release workflows package this folder into platform-specific `.zip` artifacts.

## License

Expand All @@ -92,14 +106,14 @@ This follows the non-commercial licensing requirements of `PySide6-Fluent-Widget
uv pip install -e .[dev]
```

3. Make your changes.
4. Run tests:
```sh
uv run pytest -q
```
5. Open a pull request with a clear summary.
3. Make your changes.
4. Run tests:

```sh
python -m pytest
```

5. Open a pull request with a clear summary.

## Credits

Expand Down
63 changes: 44 additions & 19 deletions markitdowngui/build_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,30 @@
"markitdown",
"charset_normalizer",
)
OPTIONAL_HIDDENIMPORT_PACKAGES = (
"azure.ai.documentintelligence",
"azure.identity",
"pypdfium2",
"pypdfium2_raw",
"pytesseract",
)
OPTIONAL_HIDDENIMPORT_PACKAGES = (
"azure.ai.documentintelligence",
"azure.identity",
"fitz",
"pymupdf",
"pypdfium2",
"pypdfium2_raw",
"pytesseract",
)
BASE_DATAS = (
("markitdowngui/resources/markitdown-gui.ico", "markitdowngui/resources"),
("markitdowngui/resources/moon.svg", "markitdowngui/resources"),
("markitdowngui/resources/sun.svg", "markitdowngui/resources"),
("LICENSE", "."),
)
OPTIONAL_DATA_PACKAGES = (
"magika",
"pypdfium2",
"pypdfium2_raw",
)
OPTIONAL_DATA_PACKAGES = (
"magika",
"pypdfium2",
"pypdfium2_raw",
)
# Packages whose dynamic libraries build_binaries() requests from the
# collector callable it is given; a package that fails to collect is
# reported through the optional warn callback and skipped.
OPTIONAL_BINARY_PACKAGES = (
"fitz",
"pymupdf",
)


def _dedupe(items: list[str]) -> list[str]:
Expand Down Expand Up @@ -59,11 +65,11 @@ def build_hiddenimports(
return _dedupe(hiddenimports)


def build_datas(
collect_data_files: Callable[[str], list[tuple[str, str]]],
*,
warn: Callable[[str], None] | None = None,
) -> list[tuple[str, str]]:
def build_datas(
collect_data_files: Callable[[str], list[tuple[str, str]]],
*,
warn: Callable[[str], None] | None = None,
) -> list[tuple[str, str]]:
datas = list(BASE_DATAS)

for package in OPTIONAL_DATA_PACKAGES:
Expand All @@ -72,5 +78,24 @@ def build_datas(
except Exception as exc:
if warn is not None:
warn(f"Warning: Could not collect data files for {package}: {exc}")

return datas

return datas


def build_binaries(
    collect_dynamic_libs: Callable[[str], list[tuple[str, str]]],
    *,
    warn: Callable[[str], None] | None = None,
) -> list[tuple[str, str]]:
    """Gather dynamic-library entries for every optional binary package.

    Each name in ``OPTIONAL_BINARY_PACKAGES`` is passed to
    ``collect_dynamic_libs`` and whatever it returns is appended to the
    result, in package order.

    Args:
        collect_dynamic_libs: Callable taking a package name and returning
            a list of 2-tuples of strings for that package.
        warn: Optional callback that receives a message whenever
            collection for a package raises. When ``None`` the failure is
            ignored silently.

    Returns:
        The combined list of entries from every package that was
        collected without raising.
    """
    collected: list[tuple[str, str]] = []

    for package in OPTIONAL_BINARY_PACKAGES:
        try:
            collected += collect_dynamic_libs(package)
        except Exception as exc:
            # Best effort: a package that cannot be collected must not
            # abort the build; move on to the next one.
            if warn is None:
                continue
            warn(f"Warning: Could not collect dynamic libraries for {package}: {exc}")

    return collected
Loading
Loading