Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/install-system-deps/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ runs:
path: |
C:\Program Files\Tesseract-OCR
C:\ProgramData\chocolatey\lib\tesseract
key: tesseract-windows-${{ runner.arch }}-v5
key: tesseract-windows-${{ runner.arch }}-v5-data
restore-keys: |
tesseract-windows-${{ runner.arch }}-

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ci-go.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,7 @@ jobs:
- name: Move FFI library to correct location
shell: bash
run: scripts/ci/go/move-downloaded-ffi-library.sh
# Note: Header copy is handled by move-downloaded-ffi-library.sh

- name: Verify FFI library location and permissions
shell: bash
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/ci-ruby.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,11 @@ jobs:
echo "Cleaning stale build artifacts to prevent fingerprint errors"
cargo clean --release -p kreuzberg-ffi 2>/dev/null || true
cargo clean --release -p kreuzberg 2>/dev/null || true
# Clean rb_sys build directory to prevent stale fingerprints in packages/ruby/tmp
if [ -d "packages/ruby/tmp" ]; then
echo "Cleaning rb_sys build directory: packages/ruby/tmp"
rm -rf packages/ruby/tmp
fi
else
echo "No target directory found, skipping cache cleanup"
fi
Expand Down Expand Up @@ -525,6 +530,11 @@ jobs:
echo "Cleaning stale build artifacts to prevent fingerprint errors"
cargo clean --release -p kreuzberg-ffi 2>/dev/null || true
cargo clean --release -p kreuzberg 2>/dev/null || true
# Clean rb_sys build directory to prevent stale fingerprints in packages/ruby/tmp
if [ -d "packages/ruby/tmp" ]; then
echo "Cleaning rb_sys build directory: packages/ruby/tmp"
rm -rf packages/ruby/tmp
fi
else
echo "No target directory found, skipping cache cleanup"
fi
Expand Down
29 changes: 21 additions & 8 deletions .github/workflows/publish-docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ jobs:
runs-on: ubuntu-latest
permissions:
contents: read
packages: read
outputs:
core_exists: ${{ steps.core.outputs.exists }}
full_exists: ${{ steps.full.outputs.exists }}
Expand All @@ -126,17 +127,24 @@ jobs:
with:
ref: ${{ needs.prepare.outputs.tag }}

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Check core image tag
id: core
env:
DOCKER_TAG: ${{ 'goldziher/kreuzberg' }}:${{ needs.prepare.outputs.version }}-core
DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg:${{ needs.prepare.outputs.version }}-core
SUMMARY_LABEL: core
run: scripts/publish/check-docker-tag.sh

- name: Check full image tag
id: full
env:
DOCKER_TAG: ${{ 'goldziher/kreuzberg' }}:${{ needs.prepare.outputs.version }}
DOCKER_TAG: ghcr.io/kreuzberg-dev/kreuzberg:${{ needs.prepare.outputs.version }}
SUMMARY_LABEL: full
run: scripts/publish/check-docker-tag.sh

Expand All @@ -156,12 +164,12 @@ jobs:
include:
- variant: core
dockerfile: docker/Dockerfile.core
image: goldziher/kreuzberg
image: ghcr.io/kreuzberg-dev/kreuzberg
tag_suffix: "-core"
extra_tag: "core"
- variant: full
dockerfile: docker/Dockerfile.full
image: goldziher/kreuzberg
image: ghcr.io/kreuzberg-dev/kreuzberg
tag_suffix: ""
extra_tag: "latest"
if: ${{ needs.prepare.outputs.release_docker == 'true' }}
Expand Down Expand Up @@ -198,12 +206,13 @@ jobs:
if: ${{ needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true') }}
run: ./scripts/test_docker.sh --skip-build --image kreuzberg-publish:${{ matrix.variant }}-test --variant ${{ matrix.variant }} --verbose

- name: Log in to Docker Hub
- name: Log in to GitHub Container Registry
if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true')) }}
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract Docker metadata
if: ${{ needs.prepare.outputs.dry_run != 'true' && (needs.prepare.outputs.force_republish == 'true' || (matrix.variant == 'core' && needs.check-docker.outputs.core_exists != 'true') || (matrix.variant == 'full' && needs.check-docker.outputs.full_exists != 'true')) }}
Expand All @@ -226,7 +235,11 @@ jobs:
PDFIUM_VERSION=${{ env.PDFIUM_VERSION }}
ONNXRUNTIME_VERSION=${{ env.ORT_VERSION }}
tags: ${{ steps.docker_meta.outputs.tags }}
labels: ${{ steps.docker_meta.outputs.labels }}
labels: |
${{ steps.docker_meta.outputs.labels }}
org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
org.opencontainers.image.description=Kreuzberg document intelligence - ${{ matrix.variant }} variant
org.opencontainers.image.licenses=MIT
platforms: linux/amd64,linux/arm64
cache-from: type=gha
cache-to: type=gha,mode=max,scope=publish-docker-${{ matrix.variant }}
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ exclude: ^docs/snippets/|vendor/|node_modules/|target/|invalid*|dist/|artifacts/
repos:
# Commit message linting
- repo: https://github.com/Goldziher/gitfluff
rev: v0.7.0
rev: v0.7.1
hooks:
- id: gitfluff-lint
args: ["--write"]
Expand Down
46 changes: 46 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,52 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

---

## [Unreleased]

### Changed

#### Docker
- **Docker registry migration**: Migrated from Docker Hub to GitHub Container Registry
- New image location: `ghcr.io/kreuzberg-dev/kreuzberg` (was `goldziher/kreuzberg`)
- Core variant: `ghcr.io/kreuzberg-dev/kreuzberg:VERSION-core` or `:core`
- Full variant: `ghcr.io/kreuzberg-dev/kreuzberg:VERSION` or `:latest`
- Added OCI labels for better container metadata and repository linking
- Updated all documentation, examples, and test configurations
- Images remain publicly accessible and support linux/amd64 and linux/arm64

---

## [4.0.8] - 2026-01-17

### Fixed

#### CI/CD
- **Ruby CI cache cleanup**: Fixed Cargo fingerprint errors caused by stale rb_sys build artifacts
- Added cleanup of `packages/ruby/tmp/` directory in "Detect partial cache hit and clean stale fingerprints" step
- Prevents fingerprint mismatches when GitHub Actions restores partial Cargo cache
- Applied to both build-ruby-gem and test-ruby jobs

#### C#
- **HtmlConversionOptions serialization with no values**: Fixed JSON serialization to write empty object `{}` instead of `null` when HtmlConversionOptions has no values set
- Rust FFI expects an object type, not null value
- Changed `WriteNullValue()` to `WriteStartObject()` + `WriteEndObject()` for empty options
- Resolves "Runtime error: html_options must be an object" error on all HtmlToMarkdown calls with default options

#### Python
- **Type completions now working**: Fixed missing `_internal_bindings.pyi` type stub file in Python wheels ([#298](https://github.com/kreuzberg-dev/kreuzberg/issues/298))
- Added `.pyi` file to Maturin include configuration in `pyproject.toml`
- Removed redundant `MANIFEST.in` (Maturin uses `pyproject.toml` include list)
- IDEs and type checkers now have full type information for all Rust bindings
- Resolves "Type completions not working" error in PyCharm, VS Code, and mypy

#### Homebrew
- **Bottle checksum mismatches**: Fixed formula update script to download bottles from GitHub Release and compute checksums from actual uploaded files
- Formula checksums now match what users download, preventing "Bottle reports different checksum" errors
- Script downloads bottles from release instead of using local artifacts that may differ
- Ensures checksums are accurate even when bottles are re-uploaded with `--clobber` flag

---

## [4.0.6] - 2026-01-14

### Fixed
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
<a href="https://rubygems.org/gems/kreuzberg">
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
</a>

<!-- Project Info -->
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
Expand Down
3 changes: 3 additions & 0 deletions crates/kreuzberg-node/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
<a href="https://rubygems.org/gems/kreuzberg">
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
</a>

<!-- Project Info -->
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
Expand Down
7 changes: 5 additions & 2 deletions crates/kreuzberg-wasm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
<a href="https://rubygems.org/gems/kreuzberg">
<img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
</a>
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
</a>

<!-- Project Info -->
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
Expand Down Expand Up @@ -184,7 +187,7 @@ interface DocumentJob {
mimeType: string;
}

async function processBatch(documents: DocumentJob[], concurrency: number = 3) {
async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
await initWasm();

const results: Record<string, string> = {};
Expand Down Expand Up @@ -441,7 +444,7 @@ interface DocumentJob {
mimeType: string;
}

async function processBatch(documents: DocumentJob[], concurrency: number = 3) {
async function _processBatch(documents: DocumentJob[], concurrency: number = 3) {
await initWasm();

const results: Record<string, string> = {};
Expand Down
5 changes: 5 additions & 0 deletions docker/Dockerfile.core
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ RUN --mount=type=cache,target=/usr/local/cargo/registry \
# =============================================================================
FROM debian:trixie-slim

# OCI labels for container metadata
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg"
LABEL org.opencontainers.image.description="Kreuzberg document intelligence - core variant"
LABEL org.opencontainers.image.licenses="MIT"

WORKDIR /app

# Download and install dependencies (Core version - without LibreOffice)
Expand Down
5 changes: 5 additions & 0 deletions docker/Dockerfile.full
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ RUN --mount=type=cache,target=/usr/local/cargo/registry \
# =============================================================================
FROM debian:trixie-slim

# OCI labels for container metadata
LABEL org.opencontainers.image.source="https://github.com/kreuzberg-dev/kreuzberg"
LABEL org.opencontainers.image.description="Kreuzberg document intelligence - full variant"
LABEL org.opencontainers.image.licenses="MIT"

WORKDIR /app

# Download and install dependencies
Expand Down
11 changes: 7 additions & 4 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,13 @@ IMAGE_NAME=kreuzberg:full ./scripts/test_docker.sh

## GitHub Actions

The `.github/workflows/docker.yaml` workflow builds and publishes both variants:
- `kreuzberg:v4-core` - Core image without LibreOffice
- `kreuzberg:v4-full` - Full image with LibreOffice
- `kreuzberg:v4`, `kreuzberg:latest` - Aliases for full image
The `.github/workflows/publish-docker.yaml` workflow builds and publishes both variants to GitHub Container Registry:
- `ghcr.io/kreuzberg-dev/kreuzberg:VERSION-core` - Core image without LibreOffice
- `ghcr.io/kreuzberg-dev/kreuzberg:core` - Latest core image
- `ghcr.io/kreuzberg-dev/kreuzberg:VERSION` - Full image with LibreOffice
- `ghcr.io/kreuzberg-dev/kreuzberg:latest` - Latest full image

For local development, use the local tags shown in the build commands above.

## Recommendations

Expand Down
8 changes: 4 additions & 4 deletions docs/cli/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -386,19 +386,19 @@ kreuzberg detect document.pdf

```bash title="Terminal"
# Extract document using Docker with mounted directory
docker run -v $(pwd):/data goldziher/kreuzberg:latest \
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg:latest \
extract /data/document.pdf

# Extract and save output to host directory using shell redirection
docker run -v $(pwd):/data goldziher/kreuzberg:latest \
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg:latest \
extract /data/document.pdf > output.txt
```

### Docker with OCR

```bash title="Terminal"
# Extract with OCR using Docker
docker run -v $(pwd):/data goldziher/kreuzberg:latest \
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg:latest \
extract /data/scanned.pdf --ocr true
```

Expand All @@ -411,7 +411,7 @@ version: '3.8'

services:
kreuzberg:
image: goldziher/kreuzberg:latest
image: ghcr.io/kreuzberg-dev/kreuzberg:latest
volumes:
- ./documents:/input
command: extract /input/document.pdf --ocr true
Expand Down
6 changes: 3 additions & 3 deletions docs/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -880,12 +880,12 @@ choco install tesseract libreoffice
Pre-built Docker images are available on GitHub Container Registry:

**Variants:**
- `goldziher/kreuzberg:latest` - Core + Tesseract
- `goldziher/kreuzberg:latest-all` - All features
- `ghcr.io/kreuzberg-dev/kreuzberg:core` - Core + Tesseract
- `ghcr.io/kreuzberg-dev/kreuzberg:latest` - All features

**Usage:**
```bash title="Terminal"
docker run -v $(pwd):/data goldziher/kreuzberg:latest \
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg:latest \
extract /data/document.pdf --ocr
```

Expand Down
4 changes: 2 additions & 2 deletions docs/getting-started/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -379,8 +379,8 @@ cargo install kreuzberg-cli
Docker image:

```bash title="Terminal"
docker pull goldziher/kreuzberg:latest # Core image with essential features
docker pull goldziher/kreuzberg:latest-all # Full image with all extensions
docker pull ghcr.io/kreuzberg-dev/kreuzberg:core # Core image with essential features
docker pull ghcr.io/kreuzberg-dev/kreuzberg:latest # Full image with all extensions
```

Next steps: [CLI Usage](../cli/usage.md) • [API Server Guide](../guides/api-server.md)
Expand Down
4 changes: 2 additions & 2 deletions docs/guides/api-server.md
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ version: '3.8'

services:
kreuzberg-api:
image: goldziher/kreuzberg:latest
image: ghcr.io/kreuzberg-dev/kreuzberg:latest
ports:
- "8000:8000"
environment:
Expand Down Expand Up @@ -671,7 +671,7 @@ spec:
spec:
containers:
- name: kreuzberg
image: goldziher/kreuzberg:latest
image: ghcr.io/kreuzberg-dev/kreuzberg:latest
ports:
- containerPort: 8000
env:
Expand Down
Loading
Loading