google
diff --git a/‎.github/workflows/deploy.yml‎
Lines changed: 46 additions & 0 deletions b/‎.github/workflows/deploy.yml‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎.github/workflows/maturin_ci.yml‎
Lines changed: 6 additions & 1 deletion b/‎.github/workflows/maturin_ci.yml‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎.github/workflows/mypy.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/mypy.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/piptest.yml‎
Lines changed: 11 additions & 2 deletions b/‎.github/workflows/piptest.yml‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎.github/workflows/pytest.yml‎
Lines changed: 30 additions & 16 deletions b/‎.github/workflows/pytest.yml‎
Lines changed: 30 additions & 16 deletions
diff --git a/‎README.md‎
Lines changed: 8 additions & 4 deletions b/‎README.md‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎docs/tutorials/quick_start/mnist_read_jax.py‎
Lines changed: 188 additions & 0 deletions b/‎docs/tutorials/quick_start/mnist_read_jax.py‎
Lines changed: 188 additions & 0 deletions
@@ -0,0 +1,46 @@
+name: Deploy to GitHub Pages
+
+on:
+  # Trigger the workflow on every push to the `main` branch that changes a file
+  # under `website/`. Using a different branch name? Replace `main` with it.
+  push:
+    branches: [ main ]
+    paths: [ 'website/**' ]
+  merge_group:
+  # Allows you to run this workflow manually from the Actions tab on GitHub.
+  workflow_dispatch:
+
+# Allow this job to clone the repo and create a page deployment
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout your repository using git
+        uses: actions/checkout@v4
+      - name: Install, build, and upload your site
+        uses: withastro/action@v3
+        with:
+          path: website # The root location of your Astro project inside the repository. (optional)
+          # node-version: 20 # The specific version of Node that should be used to build your site. Defaults to 20. (optional)
+          # package-manager: pnpm@latest # The Node package manager that should be used to install dependencies and build your site. Automatically detected based on your lockfile. (optional)
+
+  deploy:
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
@@ -3,14 +3,19 @@
 #
 #    maturin generate-ci github
 #
+# Prefer to run matrix of unit-tests in the pytest workflow until the following
+# issue is solved: https://github.com/PyO3/maturin/issues/1971
 name: Maturin CI
 
 on:
   push:
     paths:
       - '**/*.py'
       - '**/*.rs'
-      - 'pytest.ini'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'rust/Cargo.toml'
+      - 'rust/Cargo.lock'
   merge_group:
   pull_request:
     types: [opened, synchronize, reopened]
 
@@ -50,4 +50,4 @@ jobs:
           pip install -r test_requirements.txt
           mkdir -p .mypy_cache
           mypy --version
-          mypy --no-color-output --install-types --non-interactive src
+          mypy --no-color-output --install-types --non-interactive src docs
@@ -12,7 +12,16 @@ on:
 
 jobs:
   piptesting:
-    runs-on: ubuntu-22.04
+    runs-on: ${{ matrix.platform.runner }}
+    strategy:
+      matrix:
+        # ubuntu-24.04-arm is not stable enough
+        platform:
+          - runner: ubuntu-latest  # x64
+          - runner: windows-latest  # x64
+          - runner: macos-13  # Intel
+          - runner: macos-14  # arm64
+          - runner: macos-latest  # arm64
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python 3.10
@@ -26,4 +35,4 @@ jobs:
       - name: Run tutorial using sedpack pip package
         run: |
           python docs/tutorials/quick_start/mnist_save.py -d mnist_dataset
-          python docs/tutorials/quick_start/mnist_read.py -d mnist_dataset
+          python docs/tutorials/quick_start/mnist_read_keras.py -d mnist_dataset
@@ -13,7 +13,16 @@ on:
 
 jobs:
   unittesting:
-    runs-on: ubuntu-22.04
+    runs-on: ${{ matrix.platform.runner }}
+    strategy:
+      matrix:
+        # ubuntu-24.04-arm was not stable enough when testing
+        platform:
+          - runner: ubuntu-latest  # x64
+          - runner: windows-latest  # x64
+          - runner: macos-13  # Intel
+          - runner: macos-14  # arm64
+          - runner: macos-latest  # arm64
     if: github.event_name != 'schedule'
     steps:
       - uses: actions/checkout@v4
@@ -33,25 +42,30 @@ jobs:
         with:
           path: ${{ steps.pip-cache.outputs.dir }}
           # The cache key depends on requirements.txt
-          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-pip-
+          key: ${{ matrix.platform.runner }}-pip-${{ hashFiles('pyproject.toml') }}-${{ hashFiles('requirements*.txt') }}-${{ hashFiles('test_requirements*.txt') }}
       # Build a virtualenv, but only if it doesn't already exist
       - name: Populate pip cache
-        run: python -m pip install --require-hashes --no-deps -r requirements.txt
+        # requirements.txt is not reliable since across different platforms and
+        # their versions the pip package versions might vary. We regenerate it
+        # again from pyproject.toml every time when pyproject.toml or
+        # requirements.txt changes. The pinned versions in requirements.txt are
+        # tested by coverage since that is running on ubuntu which is also used
+        # to produce the main requirements.txt file.
+        run: |
+          pip install pip-tools
+          pip-compile --generate-hashes pyproject.toml > requirements.txt
+          pip install -r requirements.txt
+          pip install -r test_requirements.txt
+        if: steps.cache.outputs.cache-hit != 'true'
       - name: Save cache
         id: cache-save
         uses: actions/cache/save@v4
         with:
           path: ${{ steps.pip-cache.outputs.dir }}
           key: ${{ steps.cache.outputs.cache-primary-key }}
         if: steps.cache.outputs.cache-hit != 'true'
-      - name: Installing test requirements and sedpack
-        # Start by "installing" sedpack to be sure all dependencies are listed
-        run: |
-          pip install -r test_requirements.txt
-          pip install --editable .
-          echo "PYTHONPATH=./src:$PYTHONPATH" >> $GITHUB_ENV
+      - name: Install sedpack locally
+        run: pip install --editable .
       - name: Running unit tests
         run: |
           python -m pytest
@@ -76,12 +90,13 @@ jobs:
         with:
           path: ${{ steps.pip-cache.outputs.dir }}
           # The cache key depends on requirements.txt
-          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-pip-
+          key: ${{ runner.os }}-pip-${{ hashFiles('requirements*.txt') }}-${{ hashFiles('test_requirements*.txt') }}
       # Build a virtualenv, but only if it doesn't already exist
       - name: Populate pip cache
-        run: python -m pip install --require-hashes --no-deps -r requirements.txt
+        run: |
+          python -m pip install --require-hashes --no-deps -r requirements.txt
+          pip install -r test_requirements.txt
+        if: steps.cache.outputs.cache-hit != 'true'
       - name: Save cache
         id: cache-save
         uses: actions/cache/save@v4
@@ -92,7 +107,6 @@ jobs:
       - name: Installing test requirements and sedpack
         # Start by "installing" sedpack to be sure all dependencies are listed
         run: |
-          pip install -r test_requirements.txt
           pip install --editable .
           echo "PYTHONPATH=./src:$PYTHONPATH" >> $GITHUB_ENV
       - name: Install workflow dependencies
 
@@ -2,6 +2,8 @@
 
 [![Coverage Status](https://coveralls.io/repos/github/google/sedpack/badge.svg?branch=main)](https://coveralls.io/github/google/sedpack?branch=main)
 
+[Documentation](https://google.github.io/sedpack/)
+
 Mainly refactored from the [SCAAML](https://github.com/google/scaaml) project.
 
 ## Available components
@@ -53,10 +55,12 @@ Update: `pip-compile pyproject.toml --generate-hashes --upgrade` and commit requ
 
 ### Tutorial
 
-Tutorials available in the docs/tutorials/ directory.  For a "hello world" see
-[docs/tutorials/quick_start/mnist_save.py](https://github.com/google/sedpack/blob/main/docs/tutorials/quick_start/mnist_save.py)
-and
-[docs/tutorials/quick_start/mnist_save.py](https://github.com/google/sedpack/blob/main/docs/tutorials/quick_start/mnist_read.py).
+A tutorial and documentation are available at
+[https://google.github.io/sedpack/](https://google.github.io/sedpack/).
+
+Code for the tutorials is available in the `docs/tutorials` directory. For a
+"hello world" see
+[https://google.github.io/sedpack/tutorials/mnist/](https://google.github.io/sedpack/tutorials/mnist/).
 
 ## Disclaimer
 
 
@@ -0,0 +1,188 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Read MNIST data and feed it to a neural network. For a tutorial with
+explanations see: https://google.github.io/sedpack/tutorials/mnist
+
+Inspired by https://flax.readthedocs.io/en/latest/mnist_tutorial.html
+
+Example use:
+    python mnist_save.py -d "~/Datasets/my_new_dataset/"
+    python mnist_read_jax.py -d "~/Datasets/my_new_dataset/"
+"""
+import argparse
+from functools import partial
+from typing import Any
+
+from jax import Array
+from jax.typing import ArrayLike
+from flax import nnx
+import jax.numpy as jnp
+import optax
+from tqdm import tqdm
+
+from sedpack.io import Dataset
+
+
+def process_batch(d: Any) -> dict[str, Array]:
+    """Turn the NumPy arrays into JAX arrays and reshape the input to have a
+    channel.
+    """
+    batch_size: int = d["input"].shape[0]
+    return {
+        "input": jnp.array(d["input"]).reshape(batch_size, 28, 28, 1),
+        "digit": jnp.array(d["digit"], jnp.int32),
+    }
+
+
+class CNN(nnx.Module):  # type: ignore[misc]
+    """FLAX CNN model.
+
+    Two convolution + ReLU + average-pooling stages followed by two linear
+    layers; the last layer has 10 outputs.
+    """
+
+    def __init__(self, *, rngs: nnx.Rngs) -> None:
+        """Create the layers.
+
+        Args:
+          rngs: Random number generators passed to every parametrized layer.
+        """
+        self.conv1 = nnx.Conv(1, 32, kernel_size=(3, 3), rngs=rngs)
+        self.conv2 = nnx.Conv(32, 64, kernel_size=(3, 3), rngs=rngs)
+        # The same 2x2 window / stride 2 pooling is applied after both
+        # convolutions.
+        self.avg_pool = partial(nnx.avg_pool,
+                                window_shape=(2, 2),
+                                strides=(2, 2))
+        # 3_136 == 7 * 7 * 64; presumably the flattened feature size for
+        # 28x28 inputs after the two poolings -- TODO confirm against the
+        # default padding of nnx.Conv.
+        self.linear1 = nnx.Linear(3_136, 256, rngs=rngs)
+        self.linear2 = nnx.Linear(256, 10, rngs=rngs)
+
+    def __call__(self, x: Array) -> Array:
+        """Apply the network to a batch.
+
+        Args:
+          x: Batch of inputs with the batch dimension first.
+
+        Returns:
+          The output of the last linear layer; no softmax is applied here.
+        """
+        x = self.avg_pool(nnx.relu(self.conv1(x)))
+        x = self.avg_pool(nnx.relu(self.conv2(x)))
+        x = x.reshape(x.shape[0], -1)  # flatten
+        x = nnx.relu(self.linear1(x))
+        x = self.linear2(x)
+        return x
+
+
+def loss_fn(model: CNN, batch: dict[str, Array]) -> tuple[Array, Array]:
+    logits = model(batch["input"])
+    loss = optax.softmax_cross_entropy_with_integer_labels(
+        logits=logits, labels=batch["digit"]).mean()
+    return loss, logits
+
+
+@nnx.jit  # type: ignore[misc]
+def train_step(model: CNN, optimizer: nnx.Optimizer, metrics: nnx.MultiMetric,
+               batch: dict[str, ArrayLike]) -> None:
+    """Train for a single step.
+    """
+    grad_fn = nnx.value_and_grad(loss_fn, has_aux=True)
+    (loss, logits), grads = grad_fn(model, batch)
+    metrics.update(loss=loss, logits=logits, labels=batch["digit"])
+    optimizer.update(grads)
+
+
+@nnx.jit  # type: ignore[misc]
+def eval_step(
+    model: CNN,
+    metrics: nnx.MultiMetric,
+    batch: dict[str, Array],
+) -> None:
+    loss, logits = loss_fn(model, batch)
+    metrics.update(loss=loss, logits=logits, labels=batch["digit"])
+
+
+def main() -> None:
+    """Train a neural network on the MNIST dataset saved in the sedpack
+    format.
+    """
+    parser = argparse.ArgumentParser(
+        description=
+        "Read MNIST in dataset-lib format and train a small neural network.")
+    parser.add_argument("--dataset_directory",
+                        "-d",
+                        help="Where to load the dataset",
+                        required=True)
+    parser.add_argument("--ascii_evaluations",
+                        "-e",
+                        help="How many images to print and evaluate",
+                        type=int,
+                        default=10)
+    args = parser.parse_args()
+
+    model = CNN(rngs=nnx.Rngs(0))
+    nnx.display(model)
+
+    learning_rate: float = 0.005
+    momentum: float = 0.9
+    optimizer = nnx.Optimizer(model, optax.adamw(learning_rate, momentum))
+    metrics = nnx.MultiMetric(
+        accuracy=nnx.metrics.Accuracy(),
+        loss=nnx.metrics.Average("loss"),
+    )
+    nnx.display(optimizer)
+
+    metrics_history: dict[str, list[Array]] = {
+        "train_loss": [],
+        "train_accuracy": [],
+        "test_loss": [],
+        "test_accuracy": [],
+    }
+
+    dataset = Dataset(args.dataset_directory)  # Load the dataset
+    batch_size = 32
+    train_data = dataset.as_tfdataset(
+        "train",
+        batch_size=batch_size,
+        shuffle=1_000,
+    )
+    validation_data = dataset.as_tfdataset(
+        "test",  # validation split
+        batch_size=batch_size,
+        shuffle=1_000,
+        repeat=False,
+    )
+    train_steps: int = 1_200
+    eval_every: int = 200
+
+    for step, batch in enumerate(tqdm(train_data)):
+        if step > train_steps:
+            break
+
+        # Run the optimization for one step and make a stateful update to the
+        # following:
+        # - The train state's model parameters
+        # - The optimizer state
+        # - The training loss and accuracy batch metrics
+        batch = process_batch(batch)
+        train_step(model, optimizer, metrics, batch)
+
+        if step > 0 and (step % eval_every == 0 or step
+                         == train_steps - 1):  # One training epoch has passed.
+            # Log the training metrics.
+            # Compute the metrics.
+            for metric, value in metrics.compute().items():
+                # Record the metrics.
+                metrics_history[f"train_{metric}"].append(value)
+                print(f"{metric} = {value}", end=" ")
+            metrics.reset()  # Reset the metrics for the test set.
+            print()
+
+            # Compute the metrics on the test set after each training epoch.
+            for test_batch in validation_data.as_numpy_iterator():
+                test_batch = process_batch(test_batch)
+                eval_step(model, metrics, test_batch)
+
+            # Log the test metrics.
+            for metric, value in metrics.compute().items():
+                metrics_history[f"test_{metric}"].append(value)
+                print(f"test {metric} = {value}", end=" ")
+            metrics.reset()  # Reset the metrics for the next training epoch.
+            print()
+
+
+if __name__ == "__main__":
+    main()