13 changes: 13 additions & 0 deletions .github/dependabot.yml
@@ -0,0 +1,13 @@
version: 2
updates:
- package-ecosystem: 'github-actions'
directory: '/'
schedule:
interval: 'weekly'
open-pull-requests-limit: 10

- package-ecosystem: 'pip'
directory: '/'
schedule:
interval: 'weekly'
open-pull-requests-limit: 10
51 changes: 51 additions & 0 deletions .github/workflows/linters.yml
@@ -0,0 +1,51 @@
name: Linters

on:
pull_request:
push:
branches:
- main
workflow_dispatch:

jobs:
linters:
name: Run Linters
runs-on: ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v5

- name: Debug GitHub Variables
run: |
echo "github.event_name: ${{ github.event_name }}"
echo "github.ref_name: ${{ github.ref_name }}"
echo "github.event.repository.default_branch: ${{ github.event.repository.default_branch }}"

- name: Setup Python 3 (with caching)
uses: actions/setup-python@v6
id: setup-python
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: |
pyproject.toml

- name: Install Linting Requirements
run: |
python -m pip install --upgrade pip
pip install --group dev -e .

- name: Cache pre-commit
uses: actions/cache@v4
with:
path: |
~/.cache/pre-commit
key: ${{ runner.os }}-lint-py${{ steps.setup-python.outputs.python-version || '3.x' }}-${{ hashFiles('**/.pre-commit-config.yaml', '**/pyproject.toml') }}
restore-keys: |
${{ runner.os }}-lint-

- name: Run pre-commit
uses: pre-commit/[email protected]

- uses: pre-commit-ci/[email protected]
if: always()
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -29,4 +29,4 @@ jobs:
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
run: twine upload dist/*
run: twine upload dist/*
52 changes: 52 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,52 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
- id: trailing-whitespace
exclude: '.*\.md$'
- id: end-of-file-fixer
exclude: '.*\.md$'
- id: check-yaml
- id: check-toml
- id: check-added-large-files
- id: check-ast
- id: check-merge-conflict

- repo: https://github.com/rhysd/actionlint
rev: v1.7.8
hooks:
- id: actionlint

# Note: shellcheck cannot directly parse YAML; actionlint extracts workflow
# shell blocks and calls shellcheck when available.
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.11.0.1
hooks:
- id: shellcheck
# Match by detected shell file type (extensions or shebang)
types: [shell]
args: ['-x']

- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
additional_dependencies:
- tomli

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.14.4
hooks:
# Run the linter.
- id: ruff-check
args: [--fix]
# Run the formatter.
- id: ruff-format

ci:
autofix_prs: true
autoupdate_schedule: weekly
skip: []
submodules: false
28 changes: 13 additions & 15 deletions AGENTS.md
@@ -38,26 +38,22 @@ This document provides comprehensive guidelines for AI coding agents contributing

This project uses the following tools to maintain consistent code quality:

- **Black**: Automatic code formatter with a 100-character line length (configured in `pyproject.toml`)
- **isort**: Import statement organizer with Black-compatible profile
- **flake8**: Style guide enforcement (configured in `pyproject.toml` to ignore `E501` and `W503`)
- **ruff**: Automatic code formatter with options configured in `pyproject.toml`

### Formatting Workflow

Before committing code changes:

```bash
./.venv/bin/black <file_or_directory>
./.venv/bin/isort <file_or_directory>
./.venv/bin/flake8 <file_or_directory>
./.venv/bin/ruff check --fix <file_or_directory>
./.venv/bin/ruff format <file_or_directory>
```

Alternatively, format the entire project:

```bash
./.venv/bin/black app/ tests/
./.venv/bin/isort app/ tests/
./.venv/bin/flake8 app/ tests/
./.venv/bin/ruff check --fix app/ tests/
./.venv/bin/ruff format app/ tests/
```

---
@@ -68,6 +64,7 @@ Alternatively, format the entire project:

- **Mandatory typing**: Add type annotations to all function signatures, method signatures, and class attributes.
- **Return types**: Always specify return types, including `None` when applicable.
- **Minimize `Any`**: Do not use `Any` merely to silence type errors. Prefer precise annotations, and fall back to `Any` only when it is genuinely warranted (e.g., when more than three distinct return types are possible).
- **Forward references**: Use `from __future__ import annotations` to defer evaluation of type annotations, allowing forward references without string literals.
- **Python 3.11+ type hints**: Use built-in generic types instead of typing module equivalents (e.g., `dict[str, Any]` instead of `Dict[str, Any]`, `list[str]` instead of `List[str]`).
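The conventions above can be sketched in a few lines (the function and names here are hypothetical, shown only to illustrate the annotation style):

```python
from __future__ import annotations


def count_tokens(texts: list[str]) -> dict[str, int]:
    """Count occurrences of each whitespace-separated token across the given texts."""
    # Built-in generics (list[str], dict[str, int]) instead of typing.List/Dict,
    # and an explicit return type on the signature.
    counts: dict[str, int] = {}
    for text in texts:
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
    return counts
```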

@@ -179,7 +176,9 @@ except json.JSONDecodeError as e:

### Logging Practices

- **Use loguru**: The project uses `loguru` for logging. Import it as `from loguru import logger`.
- **Use loguru**:
- The project uses `loguru` for logging. Import it as `from loguru import logger`.
- Use f-strings to format log messages.
- **Appropriate log levels**:
- `logger.debug()`: Detailed diagnostic information
- `logger.info()`: General informational messages (startup, shutdown, major operations)
@@ -209,7 +208,7 @@ except ModelLoadError as e:

- **Explicit user request required**: Only create new branches or open pull requests when the user explicitly asks for it **or** when the user includes the hashtag `#github-pull-request-agent` in their request.
- **Asynchronous agent handoff**: The `#github-pull-request-agent` hashtag signals that the task should be handed off to the asynchronous GitHub Copilot coding agent after all planning, analysis, and preparation are complete.
- **Default behavior**: By default, work directly on the current branch and commit changes locally without creating PRs.
- **No staging or committing without permission**: Agents must **not** stage (`git add`) or commit changes unless the user explicitly requests it. Only make code changes to files; leave git operations to the user.

### Commit Messages

@@ -271,7 +270,7 @@ When an agent cannot or chooses not to follow one or more guidelines in this document
**Example disclosure:**

> **⚠️ Deviation Notice:**
> The code was not formatted with Black/isort because the dev dependencies are not installed in the current environment. Run `./.venv/bin/pip install -e '.[dev]'` to enable linting/formatting tools.
> The code was not formatted with ruff because the dev dependencies are not installed in the current environment. Run `./.venv/bin/pip install -e '.[dev]'` to enable linting/formatting tools.

### Communication Principles

@@ -309,10 +308,9 @@ When an agent cannot or chooses not to follow one or more guidelines in this document
Before finalizing any code contribution, verify:

- ✅ Virtual environment (`./.venv`) is used for all operations
- ✅ Code is formatted with Black and isort
- ✅ Code passes flake8 linting
- ✅ Code passes ruff linting and formatting
- ✅ Type annotations are present on all functions/methods
- ✅ Docstrings follow PEP 257 conventions
- ✅ Docstrings follow NumPy style conventions
- ✅ Specific exceptions are caught (not bare `Exception`)
- ✅ Appropriate logging is in place
- ✅ Existing comments are preserved
2 changes: 1 addition & 1 deletion Makefile
@@ -7,4 +7,4 @@ run:
--queue-size 100

install:
pip install -e .
pip install -e .
51 changes: 45 additions & 6 deletions README.md
@@ -43,6 +43,7 @@ This repository hosts a high-performance API server that provides OpenAI-compatible
- 🎛️ **LoRA adapter support** for fine-tuned image generation
- ⚡ **Configurable quantization** (4-bit, 8-bit, 16-bit) for optimal performance
- 🧠 **Customizable context length** for memory optimization and performance tuning
- ♻️ **JIT loading with idle auto-unload** to reclaim VRAM when the server is idle

---

@@ -108,7 +109,7 @@ The server supports six types of MLX models:

### Flux-Series Image Models

> **⚠️ Note:** Image generation and editing capabilities require manual installation of `mflux`: `pip install git+https://github.com/cubist38/mflux.git`
> **⚠️ Note:** Image generation and editing capabilities require installation of `mflux`: `pip install mlx-openai-server[image-generation]` or `pip install git+https://github.com/cubist38/mflux.git`

The server supports multiple Flux model configurations for advanced image generation and editing:

@@ -173,8 +174,8 @@ Follow these steps to set up the MLX-powered server:
cd mlx-openai-server
pip install -e .

# Optional: For image generation/editing support, also install mflux
pip install git+https://github.com/cubist38/mflux.git
# Optional: For image generation/editing support
pip install -e .[image-generation]
```

### Using Conda (Recommended)
@@ -210,8 +211,8 @@ For better environment management and to avoid architecture issues, we recommend
cd mlx-openai-server
pip install -e .

# Optional: For image generation/editing support, also install mflux
pip install git+https://github.com/cubist38/mflux.git
# Optional: For image generation/editing support
pip install -e .[image-generation]
```

### Optional Dependencies
@@ -229,8 +230,14 @@ pip install mlx-openai-server
- All core API endpoints and functionality

#### Image Generation & Editing Support
For image generation and editing capabilities, you need to install `mflux` manually:
For image generation and editing capabilities, install with the image-generation extra:

```bash
# Install with image generation support
pip install mlx-openai-server[image-generation]
```

Or install manually:
```bash
# First install the base server
pip install mlx-openai-server
@@ -247,6 +254,21 @@

> **Note:** If you try to use image generation or editing without `mflux` installed, you'll receive a clear error message directing you to install it.

#### Enhanced Caching Support
For enhanced caching and performance when working with complex ML models and objects, install with the enhanced-caching extra:

```bash
# Install with enhanced caching support
pip install mlx-openai-server[enhanced-caching]
```

This enables better serialization and caching of objects from:
- spaCy (NLP processing)
- regex (regular expressions)
- tiktoken (tokenization)
- torch (PyTorch tensors and models)
- transformers (Hugging Face models)

#### Whisper Models Support
For whisper models to work properly, you need to install ffmpeg:

@@ -380,6 +402,23 @@ mlx-openai-server launch \

```

#### Enabling JIT Loading & Auto-Unload

Use the `--jit` flag to defer model initialization until the first request arrives. Pair it with
`--auto-unload-minutes <minutes>` to automatically unload the model after a period of inactivity and
reclaim VRAM. Example:

```bash
mlx-openai-server launch \
--model-path <path-to-mlx-model> \
--model-type lm \
--jit \
--auto-unload-minutes 30
```

When JIT mode is active, the `/health` endpoint reports `status="ok"` with
`model_status="unloaded"` while the model is idle; the model is then loaded on demand when the
next request arrives.

#### Server Parameters
- `--model-path`: Path to the MLX model directory (local path or Hugging Face model repository). Required for `lm`, `multimodal`, `embeddings`, `image-generation`, `image-edit`, and `whisper` model types.
- `--model-type`: Type of model to run:
8 changes: 1 addition & 7 deletions app/__init__.py
@@ -1,7 +1 @@
import os
from .version import __version__

# Suppress transformers warnings
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

__all__ = ["__version__"]
"""MLX OpenAI Server package."""
2 changes: 1 addition & 1 deletion app/api/__init__.py
@@ -1 +1 @@

"""API endpoints for the MLX OpenAI server."""