diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..cd842b7
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,49 @@
+name: build
+
+on:
+  push:
+    branches: [ main ]
+    tags: [ "v*" ]
+  pull_request:
+
+jobs:
+  ci:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install
+        run: |
+          python -m pip install -U pip
+          python -m pip install .
+          python -m pip install pytest
+      - name: Test
+        run: |
+          pytest -q
+
+  publish:
+    needs: ci
+    if: startsWith(github.ref, 'refs/tags/v')
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Build
+        run: |
+          python -m pip install -U pip build
+          python -m build
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
new file mode 100644
index 0000000..8bd86ea
--- /dev/null
+++ b/.github/workflows/pr.yml
@@ -0,0 +1,35 @@
+name: pr
+
+on:
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+      - name: Install
+        run: |
+          python -m pip install -U pip
+          python -m pip install .
+          python -m pip install pytest ruff black build twine pip-audit
+      - name: Lint
+        run: |
+          ruff check .
+          black --check .
+      - name: Test
+        run: pytest -q
+      - name: Build and verify
+        run: |
+          python -m build
+          twine check dist/* || true
+      - name: Security audit
+        run: |
+          pip-audit -r requirements.txt || true
\ No newline at end of file
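The `Build and verify` step calls `twine`, which must be installed next to `build` (hence both appear in the install step). The same verification can be run locally before opening a PR:

```bash
python -m pip install -U pip build twine
python -m build          # writes sdist and wheel into dist/
twine check dist/*       # validates the package metadata PyPI will see
```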
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..4208745
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,43 @@
+name: release
+
+on:
+  push:
+    branches: [ main ]
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+      - name: Install
+        run: |
+          python -m pip install -U pip
+          python -m pip install .
+          python -m pip install pytest ruff black build twine "python-semantic-release<8" pip-audit
+      - name: Lint
+        run: |
+          ruff check .
+          black --check .
+      - name: Test
+        run: pytest -q
+      - name: Build and verify
+        run: |
+          python -m build
+          twine check dist/* || true
+      - name: Security audit
+        run: |
+          pip-audit -r requirements.txt || true
+      - name: Semantic Release (version, tag, GitHub release, PyPI)
+        env:
+          PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: semantic-release publish
\ No newline at end of file
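`python-semantic-release` is pinned below v8 here because the `semantic-release publish` invocation and the `[tool.semantic_release]` keys used in `pyproject.toml` (`version_variable`, `upload_to_pypi`) follow the v7 interface; v8 renamed both the commands and the configuration schema. Versioning is driven by conventional commit messages, for example (hypothetical messages):

```bash
git commit -m "fix: handle timezone-naive expireAt values"   # patch: 0.1.0 -> 0.1.1
git commit -m "feat: add CSV export to analyze-kinds"        # minor: 0.1.0 -> 0.2.0
git commit -m "feat!: drop Python 3.8 support"               # major: 0.1.0 -> 1.0.0
```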
diff --git a/.gitignore b/.gitignore
index b7faf40..e54dbfc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -205,3 +205,10 @@ cython_debug/
 marimo/_static/
 marimo/_lsp/
 __marimo__/
+
+# Local configuration
+config.yaml
+
+# Editor/OS
+.DS_Store
+Thumbs.db
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..6f4b308
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,21 @@
+# Contributing
+
+Thanks for your interest in contributing!
+
+- Open an issue to discuss substantial changes.
+- Fork and create feature branches from `main`.
+- Run formatting and tests before submitting a PR.
+
+## Dev setup
+
+```bash
+python -m venv .venv && source .venv/bin/activate
+pip install -U pip
+pip install -e .
+```
+
+## Testing
+
+```bash
+python -m pytest -q
+```
\ No newline at end of file
diff --git a/README.md b/README.md
index 5d34160..23df618 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,111 @@
 # local-storage-utils
-Set of scripts and tools for managing GCP Datastore data in local
+
+Utilities for analyzing and managing local Datastore/Firestore (Datastore mode) data. Works with both the Datastore Emulator and GCP using Application Default Credentials.
+
+## Install (PyPI)
+
+```bash
+pip install local-storage-utils
+```
+
+This installs the `lsu` CLI.
+
+## Install (from source)
+
+```bash
+git clone <repository-url>
+cd local-storage-utils
+python3 -m venv .venv
+source .venv/bin/activate
+python -m pip install -U pip
+pip install -e .
+```
+
+### Troubleshooting local installs
+- If you see "Command 'python' not found", use `python3 -m venv .venv` (above). Inside the venv, `python` will point to Python 3.
+- If you see "externally-managed-environment", you are attempting a system-wide install. Always install into a virtual environment:
+  - Create a venv: `python3 -m venv .venv && source .venv/bin/activate`
+  - Then use the venv pip: `python -m pip install -U pip && pip install -e .`
+- If `python3 -m venv` fails because the `venv` module is missing (Debian/Ubuntu):
+  ```bash
+  sudo apt-get update && sudo apt-get install -y python3-venv
+  ```
+
+## Configuration
+
+- Create a local `config.yaml` in your working directory. It is gitignored and not included in the repo.
+- Any CLI flag overrides values from `config.yaml`.
+- If neither config nor flags provide a value, the tool falls back to environment variables (for emulator detection) or sensible defaults.
+
+Example `config.yaml`:
+
+```yaml
+project_id: "my-project"          # If omitted, ADC/env will be used
+emulator_host: "localhost:8010"   # If set, uses Datastore Emulator
+
+# Explicit filters (empty means all)
+namespaces: [""]   # Empty -> iterate all namespaces (including default "")
+kinds: []          # Empty -> iterate all kinds per namespace
+
+# Optional defaults
+kind: "SourceCollectionStateEntity"   # Default for analyze-fields
+
+# Cleanup
+ttl_field: "expireAt"
+delete_missing_ttl: true
+batch_size: 500
+
+# Analysis
+group_by_field: null
+
+# Logging
+log_level: "INFO"
+```
+
+## CLI usage
+
+```bash
+# Kind-level counts and size estimates
+lsu analyze-kinds --project my-project
+
+# All namespaces/kinds are processed by default; restrict to specific kinds with -k/--kind
+lsu analyze-kinds --kind SourceCollectionStateEntity
+
+# Field contribution analysis (falls back to config.kind/config.namespace if not provided)
+lsu analyze-fields --kind SourceCollectionStateEntity --namespace "" --group-by batchId
+
+# TTL cleanup across namespaces/kinds (dry-run)
+lsu cleanup --ttl-field expireAt --dry-run
+
+# TTL cleanup restricted to specific kinds
+lsu cleanup --kind pipeline-job
+```
+
+Use `--help` on any command for full options. Config can be provided via `config.yaml` or flags.
+
+## Development
+
+- Create a virtual environment and install in editable mode as shown above
+- Run tests:
+
+```bash
+python -m pip install pytest
+pytest -q
+```
+
+- Lint/format (optional if you use pre-commit/CI):
+```bash
+python -m pip install ruff black
+ruff check .
+black .
+```
+
+## Publishing
+
+- Automated: pushing to `main` triggers versioning, tagging, GitHub release, and PyPI publish via semantic-release.
+- Prerequisites:
+  - Add a PyPI token to repo secrets as `PYPI_API_TOKEN`.
+  - Use conventional commits for proper versioning.
+- Pushing a `v*` tag directly also triggers the `build` workflow's PyPI publish job.
+
+Main branch should be protected (require PRs, disallow direct pushes) in repository settings.
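A few more invocations composed from flags that `cli.py` defines; the kind and field names here are placeholders:

```bash
# Restrict field analysis to specific fields and keep the raw numbers
lsu analyze-fields --kind SourceCollectionStateEntity \
    --only-field payload --only-field metadata \
    --output-json fields.json

# Export kind-level stats to CSV instead of printing a table
lsu analyze-kinds --output kinds.csv
```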
localhost:8010")] +LogLevelOpt = Annotated[Optional[str], typer.Option("--log-level", help="Logging level")] +KindsOpt = Annotated[ + Optional[List[str]], + typer.Option("--kind", "-k", help="Kinds to process (omit or empty to process all in each namespace)") +] +SingleKindOpt = Annotated[Optional[str], typer.Option("--kind", "-k", help="Kind to analyze (falls back to config.kind)")] + +def _load_cfg( + config_path: Optional[str], + project: Optional[str], + emulator_host: Optional[str], + log_level: Optional[str], +) -> AppConfig: + overrides = {} + if project: + overrides["project_id"] = project + if emulator_host: + overrides["emulator_host"] = emulator_host + if log_level: + overrides["log_level"] = log_level + return load_config(config_path, overrides) + +@app.command("analyze-kinds") +def cmd_analyze_kinds( + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + kind: KindsOpt = None, + output: Annotated[Optional[str], typer.Option("--output", help="Output CSV file path")] = None, +): + cfg = _load_cfg(config, project, emulator_host, log_level) + + if kind is not None: + # Normalise: treat [""] as empty (all kinds) + cfg.kinds = [k for k in kind if k] # drop empty strings + rows = analyze_kinds(cfg) + + if output: + with open(output, "w", encoding="utf-8") as fh: + fh.write("namespace,kind,count,size,bytes\n") + for r in rows: + ns = r.get("namespace") or "" + fh.write(f"{ns},{r['kind']},{r['count']},{r['size']},{r['bytes']}\n") + typer.echo(f"Wrote {len(rows)} rows to {output}") + else: + print_summary_table(rows) + +@app.command("analyze-fields") +def cmd_analyze_fields( + kind: SingleKindOpt = None, + namespace: Annotated[Optional[str], typer.Option("--namespace", "-n", help="Namespace to query (omit to use all)")] = None, + group_by: Annotated[Optional[str], typer.Option("--group-by", help="Group results by this field value (falls back to config.group_by_field)")] = None, + only_field: Annotated[Optional[List[str]], typer.Option("--only-field", help="Only consider these fields")] = None, + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + output_json: Annotated[Optional[str], typer.Option("--output-json", help="Write raw JSON results to file")] = None, +): + cfg = _load_cfg(config, project, emulator_host, log_level) + + target_kind = kind or cfg.kind + target_namespace = namespace if namespace is not None else cfg.namespace + group_by_field = group_by if group_by is not None else cfg.group_by_field + + if not target_kind: + raise typer.BadParameter("--kind is required (either via flag or config.kind)") + + result = analyze_field_contributions( + cfg, + kind=target_kind, + namespace=target_namespace, + group_by_field=group_by_field, + only_fields=[f for f in only_field] if only_field else None, + ) + + if output_json: + with open(output_json, "w", encoding="utf-8") as fh: + json.dump(result, fh, indent=2) + typer.echo(f"Wrote JSON results to {output_json}") + else: + print_field_summary(result) + +@app.command("cleanup") +def cmd_cleanup( + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + kind: KindsOpt = None, + ttl_field: Annotated[Optional[str], typer.Option("--ttl-field", help="TTL field name (falls back to config.ttl_field)")] = None, + delete_missing_ttl: Annotated[Optional[bool], typer.Option("--delete-missing-ttl", help="Delete when 
diff --git a/commands/__init__.py b/commands/__init__.py
new file mode 100644
index 0000000..7493223
--- /dev/null
+++ b/commands/__init__.py
@@ -0,0 +1,18 @@
+from .config import AppConfig, load_config, build_client, list_namespaces, list_kinds, format_size
+from . import analyze_kinds as analyze_kinds
+from . import analyze_entity_fields as analyze_entity_fields
+from . import cleanup_expired as cleanup_expired
+from . import config as config
+
+__all__ = [
+    "AppConfig",
+    "load_config",
+    "build_client",
+    "list_namespaces",
+    "list_kinds",
+    "format_size",
+    "analyze_kinds",
+    "analyze_entity_fields",
+    "cleanup_expired",
+    "config",
+]
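Note the two import styles this `__init__` exposes: `load_config` and friends are re-exported functions, while `analyze_kinds` is the submodule, whose function of the same name lives inside it. The tests below rely on that distinction:

```python
from commands import load_config, analyze_kinds

cfg = load_config(None, {"project_id": "demo-project"})  # re-exported function
rows = analyze_kinds.analyze_kinds(cfg)                  # submodule.function
```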
or "(default)", + group_by_field, + ) + grouped_entities: DefaultDict[str, List[datastore.Entity]] = defaultdict(list) + for entity in query.fetch(): + group_val = entity.get(group_by_field) + key = str(group_val) if group_val is not None else "" + grouped_entities[key].append(entity) + + results: Dict[str, Dict] = {} + for group_key, ents in grouped_entities.items(): + field_totals, total_size, entity_count = _estimate_field_contributions( + ents, target_fields=only_fields + ) + results[group_key] = { + "namespace": namespace, + "kind": kind, + "group": group_key, + "entity_count": entity_count, + "total_bytes": total_size, + "total_size": format_size(total_size), + "fields": { + f: { + "bytes": b, + "avg_per_entity": (b / entity_count) if entity_count else 0.0, + "human": format_size(b), + } + for f, b in sorted(field_totals.items(), key=lambda x: x[1], reverse=True) + }, + } + return {"grouped": results} + + # Ungrouped path + logger.info( + "Analyzing field contributions for kind=%s, namespace=%s", + kind, + namespace or "(default)", + ) + field_totals, total_size, entity_count = _estimate_field_contributions( + query.fetch(), target_fields=only_fields + ) + return { + "namespace": namespace, + "kind": kind, + "entity_count": entity_count, + "total_bytes": total_size, + "total_size": format_size(total_size), + "fields": { + f: { + "bytes": b, + "avg_per_entity": (b / entity_count) if entity_count else 0.0, + "human": format_size(b), + } + for f, b in sorted(field_totals.items(), key=lambda x: x[1], reverse=True) + }, + } + + +def analyze_field_contributions( + config: AppConfig, + kind: str, + namespace: Optional[str] = None, + group_by_field: Optional[str] = None, + only_fields: Optional[List[str]] = None, +) -> Dict: + client = build_client(config) + + # If no namespace provided, or config.namespaces is None/empty, iterate all namespaces + if namespace is None: + if hasattr(config, "namespaces") and (not config.namespaces): + ns_list = list_namespaces(client) + else: + ns_list = [namespace] if namespace else list_namespaces(client) + results: Dict[str, Dict] = {} + for ns in ns_list: + results[ns or ""] = _analyze_single_namespace( + client, kind=kind, namespace=ns, group_by_field=group_by_field, only_fields=only_fields + ) + return {"by_namespace": results} + + # Single namespace + return _analyze_single_namespace( + client, kind=kind, namespace=namespace, group_by_field=group_by_field, only_fields=only_fields + ) + + +def print_field_summary(result: Dict) -> None: + if "by_namespace" in result: + for ns, data in result["by_namespace"].items(): + print(f"\n=== namespace: {ns or '(default)'} ===") + print_field_summary(data) + return + + if "grouped" in result: + for group_key, data in result["grouped"].items(): + ns = data.get("namespace") or "" + print(f"\n[group={group_key}] ns={ns} kind={data['kind']} entities={data['entity_count']} total={data['total_size']}") + for field, stats in data["fields"].items(): + avg = stats["avg_per_entity"] + print(f" {field:30} {stats['human']:>12} ({avg:.1f} bytes avg)") + else: + ns = result.get("namespace") or "" + print( + f"ns={ns} kind={result['kind']} entities={result['entity_count']} total={result['total_size']}" + ) + for field, stats in result["fields"].items(): + avg = stats["avg_per_entity"] + print(f" {field:30} {stats['human']:>12} ({avg:.1f} bytes avg)") diff --git a/commands/analyze_kinds.py b/commands/analyze_kinds.py new file mode 100644 index 0000000..9807532 --- /dev/null +++ b/commands/analyze_kinds.py @@ -0,0 +1,114 @@ +from 
diff --git a/commands/analyze_kinds.py b/commands/analyze_kinds.py
new file mode 100644
index 0000000..9807532
--- /dev/null
+++ b/commands/analyze_kinds.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+import logging
+from typing import Dict, List, Optional, Tuple
+
+from google.cloud.datastore.helpers import entity_to_protobuf
+from tqdm import tqdm
+
+from .config import (
+    AppConfig,
+    build_client,
+    list_namespaces,
+    list_kinds,
+    format_size,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def get_kind_stats(client, kind: str, namespace: Optional[str] = None) -> Tuple[Optional[int], Optional[int]]:
+    """
+    Returns (count, bytes) for the given kind/namespace using Datastore statistics.
+    Falls back to None if not found.
+    """
+    if namespace:
+        stats_kind = "__Stat_Kind_Ns__"
+        query = client.query(kind=stats_kind)
+        query.add_filter("kind_name", "=", kind)
+        query.add_filter("namespace_name", "=", namespace)
+    else:
+        stats_kind = "__Stat_Kind__"
+        query = client.query(kind=stats_kind)
+        query.add_filter("kind_name", "=", kind)
+
+    results = list(query.fetch(limit=1))
+    if results:
+        return results[0]["count"], results[0]["bytes"]
+    return None, None
+
+
+ """ + client = build_client(config) + + # Decide method priority: parameter > config > default + method = method or getattr(config, "method", None) or "stats" + + # If namespaces is None or empty, iterate all available namespaces + if not config.namespaces: + namespaces = list_namespaces(client) + else: + namespaces = config.namespaces + + print(f"Found namespaces: {namespaces}") + from tqdm import tqdm + results: List[Dict] = [] + for ns in namespaces: + kinds = config.kinds or list_kinds(client, ns) + print(f"Namespace '{ns}': found kinds: {kinds}") + logger.info("Analyzing namespace=%s, %d kinds", ns or "(default)", len(kinds)) + for kind in tqdm(kinds, desc=f"Analyzing kinds in ns={ns or '(default)'}", unit="kind"): + if method == "stats": + count, total_bytes = get_kind_stats(client, kind, ns) + if count is None: + logger.warning("Stats not found for kind=%s, ns=%s — falling back to scan", kind, ns or "(default)") + count, total_bytes = estimate_entity_count_and_size(client, kind, ns) + elif method == "scan": + count, total_bytes = estimate_entity_count_and_size(client, kind, ns) + else: + raise ValueError(f"Unknown method: {method}") + + results.append( + { + "namespace": ns, + "kind": kind, + "count": count, + "bytes": total_bytes, + "size": format_size(total_bytes), + } + ) + return results + + +def print_summary_table(rows: List[Dict]) -> None: + # Plain stdout table for wide compatibility + print("namespace,kind,count,size,bytes") + for r in rows: + ns = r.get("namespace") or "" + print(f"{ns},{r['kind']},{r['count']},{r['size']},{r['bytes']}") diff --git a/commands/cleanup_expired.py b/commands/cleanup_expired.py new file mode 100644 index 0000000..4d8b9d4 --- /dev/null +++ b/commands/cleanup_expired.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import logging +from datetime import datetime, timezone +from typing import Dict, List, Optional + +from google.cloud import datastore + +from .config import ( + AppConfig, + build_client, + list_namespaces, + list_kinds, + chunked, +) + +logger = logging.getLogger(__name__) + + +def _delete_in_batches(client: datastore.Client, keys: List[datastore.Key], batch_size: int) -> int: + deleted = 0 + for batch in chunked(keys, batch_size): + client.delete_multi(batch) # type: ignore[arg-type] + deleted += len(batch) + return deleted + + +def cleanup_expired( + config: AppConfig, + dry_run: bool = False, +) -> Dict[str, int]: + client = build_client(config) + + # If namespaces is None or empty, iterate all available namespaces + if not config.namespaces: + namespaces = list_namespaces(client) + else: + namespaces = config.namespaces + + totals: Dict[str, int] = {} + now = datetime.now(timezone.utc) + + for ns in namespaces: + # Determine kinds: explicit list, or all in namespace + kinds = config.kinds if config.kinds else list_kinds(client, ns) + + for kind in kinds: + query = client.query(kind=kind, namespace=ns or None) + to_delete: List[datastore.Key] = [] + entities = list(query.fetch()) + from tqdm import tqdm + for entity in tqdm(entities, desc=f"Scanning {kind} in ns={ns or '(default)'}", unit="entity"): + expire_at = entity.get(config.ttl_field) + expired = expire_at is None if config.delete_missing_ttl else False + if not expired and expire_at is not None: + try: + expired = expire_at < now + except Exception: + # If unparsable or timezone-less, skip + expired = False + if expired: + to_delete.append(entity.key) + + if dry_run: + logger.info( + "[DRY-RUN] ns=%s kind=%s would delete %d entities", + ns or "(default)", + 
diff --git a/commands/cleanup_expired.py b/commands/cleanup_expired.py
new file mode 100644
index 0000000..4d8b9d4
--- /dev/null
+++ b/commands/cleanup_expired.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import logging
+from datetime import datetime, timezone
+from typing import Dict, List, Optional
+
+from google.cloud import datastore
+from tqdm import tqdm
+
+from .config import (
+    AppConfig,
+    build_client,
+    list_namespaces,
+    list_kinds,
+    chunked,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _delete_in_batches(client: datastore.Client, keys: List[datastore.Key], batch_size: int) -> int:
+    deleted = 0
+    for batch in chunked(keys, batch_size):
+        client.delete_multi(batch)  # type: ignore[arg-type]
+        deleted += len(batch)
+    return deleted
+
+
+def cleanup_expired(
+    config: AppConfig,
+    dry_run: bool = False,
+) -> Dict[str, int]:
+    client = build_client(config)
+
+    # If namespaces is None or empty, iterate all available namespaces
+    if not config.namespaces:
+        namespaces = list_namespaces(client)
+    else:
+        namespaces = config.namespaces
+
+    totals: Dict[str, int] = {}
+    now = datetime.now(timezone.utc)
+
+    for ns in namespaces:
+        # Determine kinds: explicit list, or all in namespace
+        kinds = config.kinds if config.kinds else list_kinds(client, ns)
+
+        for kind in kinds:
+            query = client.query(kind=kind, namespace=ns or None)
+            to_delete: List[datastore.Key] = []
+            entities = list(query.fetch())
+            for entity in tqdm(entities, desc=f"Scanning {kind} in ns={ns or '(default)'}", unit="entity"):
+                expire_at = entity.get(config.ttl_field)
+                expired = expire_at is None if config.delete_missing_ttl else False
+                if not expired and expire_at is not None:
+                    try:
+                        expired = expire_at < now
+                    except Exception:
+                        # If unparsable or timezone-less, skip
+                        expired = False
+                if expired:
+                    to_delete.append(entity.key)
+
+            if dry_run:
+                logger.info(
+                    "[DRY-RUN] ns=%s kind=%s would delete %d entities",
+                    ns or "(default)",
+                    kind,
+                    len(to_delete),
+                )
+                totals[f"{ns}:{kind}"] = len(to_delete)
+            else:
+                deleted = 0
+                if to_delete:
+                    for batch in tqdm(list(chunked(to_delete, config.batch_size)), desc=f"Deleting {kind} in ns={ns or '(default)'}", unit="batch"):
+                        client.delete_multi(batch)
+                        deleted += len(batch)
+                logger.info(
+                    "ns=%s kind=%s deleted %d expired entities",
+                    ns or "(default)",
+                    kind,
+                    deleted,
+                )
+                totals[f"{ns}:{kind}"] = deleted
+
+    return totals
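The expiry comparison uses a timezone-aware `now`, so TTL values should be written timezone-aware; naive datetimes raise on comparison, land in the `except` branch, and are skipped. A sketch of writing a compliant `expireAt` (hypothetical kind):

```python
from datetime import datetime, timedelta, timezone
from google.cloud import datastore

client = datastore.Client(project="demo-project")
entity = datastore.Entity(key=client.key("SessionEntity", "abc"))
entity["expireAt"] = datetime.now(timezone.utc) + timedelta(days=7)  # tz-aware
client.put(entity)
```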
diff --git a/commands/config.py b/commands/config.py
new file mode 100644
index 0000000..420c993
--- /dev/null
+++ b/commands/config.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import os
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, Iterable, List, Optional, Sequence
+
+import yaml
+from google.cloud import datastore
+
+@dataclass
+class AppConfig:
+    project_id: Optional[str] = None
+    emulator_host: Optional[str] = None
+
+    # Explicit filters (when empty -> use all)
+    namespaces: List[str] = field(default_factory=list)
+    kinds: List[str] = field(default_factory=list)
+
+    # Optional defaults for commands that need them (e.g., analyze-fields)
+    kind: Optional[str] = None
+    namespace: Optional[str] = None
+
+    # Cleanup settings
+    ttl_field: str = "expireAt"
+    delete_missing_ttl: bool = True
+    batch_size: int = 500
+
+    # Analysis settings
+    group_by_field: Optional[str] = None
+
+    # Logging
+    log_level: str = "INFO"
+
+
+def _as_list(value: Optional[Iterable[str]]) -> List[str]:
+    if value is None:
+        return []
+    if isinstance(value, (list, tuple)):
+        return [str(v) for v in value]
+    return [str(value)]
+
+
+def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) -> AppConfig:
+    config = AppConfig()
+
+    # Load YAML if provided or if default exists
+    data: Dict = {}
+    candidate = path or os.getenv("LSU_CONFIG")
+    if not candidate and os.path.exists("config.yaml"):
+        candidate = "config.yaml"
+
+    if candidate and os.path.exists(candidate):
+        with open(candidate, "r", encoding="utf-8") as fh:
+            data = yaml.safe_load(fh) or {}
+
+    overrides = overrides or {}
+    merged = {**data, **overrides}
+
+    config.project_id = merged.get("project_id") or os.getenv("DATASTORE_PROJECT_ID")
+    config.emulator_host = merged.get("emulator_host") or os.getenv("DATASTORE_EMULATOR_HOST")
+
+    # Explicit lists (no include/exclude). Empty -> all
+    config.namespaces = _as_list(merged.get("namespaces"))
+    config.kinds = _as_list(merged.get("kinds"))
+
+    # 🛠 Normalise: treat [""] as empty (_as_list never returns None)
+    if config.namespaces == [""]:
+        config.namespaces = []
+    if config.kinds == [""]:
+        config.kinds = []
+
+    # Optional defaults used by some commands
+    config.kind = merged.get("kind")
+    config.namespace = merged.get("namespace")
+
+    config.ttl_field = merged.get("ttl_field", config.ttl_field)
+    config.delete_missing_ttl = bool(merged.get("delete_missing_ttl", config.delete_missing_ttl))
+    config.batch_size = int(merged.get("batch_size", config.batch_size))
+
+    config.group_by_field = merged.get("group_by_field", config.group_by_field)
+
+    config.log_level = str(merged.get("log_level", config.log_level)).upper()
+
+    _configure_logging(config.log_level)
+    return config
+
+
+def _configure_logging(level: str) -> None:
+    level_value = getattr(logging, level.upper(), logging.INFO)
+    logging.basicConfig(level=level_value, format="%(asctime)s | %(levelname)s | %(message)s")
+
+
+def build_client(config: AppConfig) -> datastore.Client:
+    # Prefer explicit emulator_host if provided, otherwise env decides
+    if config.emulator_host:
+        os.environ["DATASTORE_EMULATOR_HOST"] = config.emulator_host
+        # Project id is required in emulator; optional on GCP (ADC will detect)
+        if config.project_id:
+            os.environ.setdefault("DATASTORE_PROJECT_ID", config.project_id)
+
+    if os.getenv("DATASTORE_EMULATOR_HOST"):
+        # When using emulator, ensure a project ID is present
+        project_id = os.getenv("DATASTORE_PROJECT_ID") or config.project_id or "local-dev"
+        os.environ["DATASTORE_PROJECT_ID"] = project_id
+        return datastore.Client(project=project_id)
+
+    # GCP path, relies on ADC if project not provided
+    return datastore.Client(project=config.project_id)
+
+
+def list_namespaces(client: datastore.Client) -> List[str]:
+    """
+    Return all namespaces in the datastore, including the default ("").
+    Always queries __namespace__ in the root context so it works in emulator/GCP.
+    """
+    # Include default namespace "" first
+    namespaces: List[str] = [""]
+
+    # Force namespace=None to query the metadata root
+    query = client.query(kind="__namespace__", namespace=None)
+    query.keys_only()
+
+    for entity in query.fetch():
+        name = entity.key.name or ""
+        if name != "":
+            namespaces.append(name)
+
+    return namespaces
+
+
+def list_kinds(client: datastore.Client, namespace: Optional[str]) -> List[str]:
+    query = client.query(kind="__kind__", namespace=namespace or None)
+    query.keys_only()
+    return [e.key.name for e in query.fetch()]
+
+
+def chunked(iterable: Sequence, chunk_size: int):
+    for i in range(0, len(iterable), max(1, chunk_size)):
+        yield iterable[i : i + chunk_size]
+
+
+def format_size(bytes_size: int) -> str:
+    size = float(bytes_size)
+    for unit in ["B", "KB", "MB", "GB", "TB"]:
+        if size < 1024:
+            return f"{size:.2f} {unit}"
+        size /= 1024
+    return f"{size:.2f} PB"
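`load_config` resolves the config path in order: an explicit path argument, the `LSU_CONFIG` environment variable, then `./config.yaml` if present. So an alternate config can be selected per invocation (hypothetical path):

```bash
LSU_CONFIG=/etc/lsu/staging.yaml lsu analyze-kinds
```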
+ """ + # Include default namespace "" first + namespaces: List[str] = [""] + + # Force namespace=None to query the metadata root + query = client.query(kind="__namespace__", namespace=None) + query.keys_only() + + for entity in query.fetch(): + name = entity.key.name or "" + if name != "": + namespaces.append(name) + + return namespaces + + +def list_kinds(client: datastore.Client, namespace: Optional[str]) -> List[str]: + query = client.query(kind="__kind__", namespace=namespace or None) + query.keys_only() + return [e.key.name for e in query.fetch()] + + +def chunked(iterable: Sequence, chunk_size: int): + for i in range(0, len(iterable), max(1, chunk_size)): + yield iterable[i : i + chunk_size] + + +def format_size(bytes_size: int) -> str: + size = float(bytes_size) + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size < 1024: + return f"{size:.2f} {unit}" + size /= 1024 + return f"{size:.2f} PB" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0ff44d9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,60 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "local-storage-utils" +version = "0.1.0" +description = "Utilities for analyzing and managing local Datastore/Firestore (Datastore mode) data" +authors = [ + { name = "Your Name", email = "you@example.com" }, +] +readme = "README.md" +requires-python = ">=3.9" +license = { file = "LICENSE" } +keywords = ["google-cloud-datastore", "firestore", "emulator", "cleanup", "analysis"] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +dependencies = [ + "google-cloud-datastore>=2.19.0", + "PyYAML>=6.0.1", + "typer>=0.12.3", +] + +[project.optional-dependencies] +rich = ["rich>=13.7.0"] + +[project.scripts] +lsu = "cli:app" +local-storage-utils = "cli:app" + +[tool.setuptools.packages.find] +where = ["."] +include = ["commands*"] + +[tool.black] +line-length = 100 + +[tool.ruff] +line-length = 100 + +[tool.semantic_release] +version_variable = "pyproject.toml:version" +branch = "main" +upload_to_pypi = true +dist_path = "dist" +build_command = "python -m build" +commit_message = "chore(release): {version} [skip ci]" +changelog_sections = "feature,fix,perf,refactor,docs,style,build,ci,chore" + +[tool.semantic_release.remote] +name = "origin" +repo_url = "https://github.com/your-org/local-storage-utils" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e8f4d09 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +google-cloud-datastore>=2.19.0 +PyYAML>=6.0.1 +typer>=0.12.3 \ No newline at end of file diff --git a/tests/test_commands.py b/tests/test_commands.py new file mode 100644 index 0000000..70c05a3 --- /dev/null +++ b/tests/test_commands.py @@ -0,0 +1,52 @@ +import sys +import os +import pytest +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from commands import analyze_kinds, analyze_entity_fields, cleanup_expired, config +from commands.config import AppConfig, build_client, list_namespaces + +# Dummy config for testing (adjust as needed for emulator) +def make_dummy_config(): + return AppConfig( + 
project_id="dummy-project", + emulator_host="localhost:8080", + namespaces=[""], + kinds=["TestKind"], + ttl_field="expireAt", + delete_missing_ttl=True, + batch_size=10, + group_by_field=None, + log_level="INFO", + ) + +def test_analyze_kinds_runs(): + cfg = make_dummy_config() + try: + result = analyze_kinds(cfg) + assert isinstance(result, list) + except Exception as e: + pytest.skip(f"analyze_kinds requires emulator: {e}") + +def test_analyze_fields_runs(): + cfg = make_dummy_config() + try: + result = analyze_entity_fields.analyze_field_contributions(cfg, kind="TestKind") + assert isinstance(result, dict) + except Exception as e: + pytest.skip(f"analyze_fields requires emulator: {e}") + +def test_cleanup_expired_runs(): + cfg = make_dummy_config() + try: + result = cleanup_expired.cleanup_expired(cfg, dry_run=True) + assert isinstance(result, dict) + except Exception as e: + pytest.skip(f"cleanup_expired requires emulator: {e}") + +def test_list_namespaces_returns_default_and_any_custom(): + cfg = AppConfig(project_id="dummy-project", emulator_host="localhost:8010") + client = build_client(cfg) + namespaces = list_namespaces(client) + assert "" in namespaces # default namespace always present + # This test will pass if at least the default namespace is present + # Add more asserts if you know your emulator has more namespaces diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 0000000..c30178d --- /dev/null +++ b/tests/test_import.py @@ -0,0 +1,10 @@ +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +def test_imports(): + import commands + from commands import analyze_kinds, analyze_entity_fields, cleanup_expired, config + + assert commands is not None + assert hasattr(config, "AppConfig") \ No newline at end of file