From 18be2a2dc2563a4ffc75df79d6ecb42dbb5fa0db Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 12 Aug 2025 15:03:58 +0000 Subject: [PATCH 01/12] Initial project setup for local Datastore/Firestore management utilities Co-authored-by: isisosirishorus --- .github/workflows/build.yml | 46 +++++++++ CONTRIBUTING.md | 21 ++++ README.md | 24 ++++- cli.py | 128 ++++++++++++++++++++++++ config.yaml | 23 +++++ gcd_tools/__init__.py | 18 ++++ gcd_tools/analyze_entity_fields.py | 135 +++++++++++++++++++++++++ gcd_tools/analyze_kinds.py | 73 ++++++++++++++ gcd_tools/cleanup_expired.py | 81 +++++++++++++++ gcd_tools/config.py | 153 +++++++++++++++++++++++++++++ pyproject.toml | 47 +++++++++ requirements.txt | 3 + tests/test_import.py | 6 ++ 13 files changed, 757 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/build.yml create mode 100644 CONTRIBUTING.md create mode 100644 cli.py create mode 100644 config.yaml create mode 100644 gcd_tools/__init__.py create mode 100644 gcd_tools/analyze_entity_fields.py create mode 100644 gcd_tools/analyze_kinds.py create mode 100644 gcd_tools/cleanup_expired.py create mode 100644 gcd_tools/config.py create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 tests/test_import.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..91cf7cc --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,46 @@ +name: build + +on: + push: + branches: [ main ] + tags: [ "v*" ] + pull_request: + +jobs: + ci: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install + run: | + python -m pip install -U pip + python -m pip install . + python -m pip install pytest + - name: Test + run: | + pytest -q + + publish: + needs: ci + if: startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Build + run: | + python -m pip install -U pip build + python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..6f4b308 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,21 @@ +# Contributing + +Thanks for your interest in contributing! + +- Open an issue to discuss substantial changes. +- Fork and create feature branches from `main`. +- Run formatting and tests before submitting a PR. + +## Dev setup + +```bash +python -m venv .venv && source .venv/bin/activate +pip install -U pip +pip install -e . +``` + +## Testing + +```bash +python -m pytest -q +``` \ No newline at end of file diff --git a/README.md b/README.md index 5d34160..ce7ed6f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,24 @@ # local-storage-utils -Set of scripts and tools for managing GCP Datastore data in local + +Utilities for analyzing and managing local Datastore/Firestore (Datastore mode) data. Works with both the Datastore Emulator and GCP using Application Default Credentials. + +## Install + +```bash +pip install -e . 
+``` + +## CLI + +```bash +# Kind-level counts and size estimates +lsu analyze-kinds --project my-project + +# Field contribution analysis for a kind +lsu analyze-fields --kind SourceCollectionStateEntity --namespace "" --group-by batchId + +# TTL cleanup across kinds/namespaces (dry-run) +lsu cleanup --ttl-field expireAt --dry-run +``` + +Use `--help` on any command for full options. Config can be provided via `config.yaml` or flags. diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..9e6cb26 --- /dev/null +++ b/cli.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import json +from typing import List, Optional + +import typer + +from gcd_tools.config import AppConfig, load_config, format_size +from gcd_tools.analyze_kinds import analyze_kinds, print_summary_table +from gcd_tools.analyze_entity_fields import analyze_field_contributions, print_field_summary +from gcd_tools.cleanup_expired import cleanup_expired + +app = typer.Typer(help="Utilities for analyzing and managing local Datastore/Firestore (Datastore mode)") + + +def _load_cfg( + config_path: Optional[str], + project: Optional[str], + emulator_host: Optional[str], + log_level: Optional[str], +) -> AppConfig: + overrides = {} + if project: + overrides["project_id"] = project + if emulator_host: + overrides["emulator_host"] = emulator_host + if log_level: + overrides["log_level"] = log_level + return load_config(config_path, overrides) + + +@app.command("analyze-kinds") +def cmd_analyze_kinds( + config: Optional[str] = typer.Option(None, help="Path to config.yaml"), + project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), + emulator_host: Optional[str] = typer.Option(None, help="Emulator host, e.g. localhost:8010"), + log_level: Optional[str] = typer.Option(None, help="Logging level"), + namespace: Optional[List[str]] = typer.Option(None, "--namespace", "-n", help="Namespaces to include"), + exclude_namespace: Optional[List[str]] = typer.Option(None, "--exclude-namespace", help="Namespaces to exclude"), + kind: Optional[List[str]] = typer.Option(None, "--kind", "-k", help="Kinds to include"), + exclude_kind: Optional[List[str]] = typer.Option(None, "--exclude-kind", help="Kinds to exclude"), + output: Optional[str] = typer.Option(None, help="Output CSV file path"), +): + cfg = _load_cfg(config, project, emulator_host, log_level) + if namespace: + cfg.namespace_include = list(namespace) + if exclude_namespace: + cfg.namespace_exclude = list(exclude_namespace) + if kind: + cfg.kinds_include = list(kind) + if exclude_kind: + cfg.kinds_exclude = list(exclude_kind) + + rows = analyze_kinds(cfg) + if output: + with open(output, "w", encoding="utf-8") as fh: + fh.write("namespace,kind,count,size,bytes\n") + for r in rows: + ns = r.get("namespace") or "" + fh.write(f"{ns},{r['kind']},{r['count']},{r['size']},{r['bytes']}\n") + typer.echo(f"Wrote {len(rows)} rows to {output}") + else: + print_summary_table(rows) + + +@app.command("analyze-fields") +def cmd_analyze_fields( + kind: str = typer.Option(..., "--kind", "-k", help="Kind to analyze"), + namespace: Optional[str] = typer.Option(None, "--namespace", "-n", help="Namespace to query"), + group_by: Optional[str] = typer.Option(None, help="Group results by this field value"), + only_field: Optional[List[str]] = typer.Option(None, "--only-field", help="Only consider these fields"), + config: Optional[str] = typer.Option(None, help="Path to config.yaml"), + project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), + 
emulator_host: Optional[str] = typer.Option(None, help="Emulator host, e.g. localhost:8010"), + log_level: Optional[str] = typer.Option(None, help="Logging level"), + output_json: Optional[str] = typer.Option(None, help="Write raw JSON results to file"), +): + cfg = _load_cfg(config, project, emulator_host, log_level) + result = analyze_field_contributions( + cfg, kind=kind, namespace=namespace, group_by_field=group_by, only_fields=list(only_field) if only_field else None + ) + + if output_json: + with open(output_json, "w", encoding="utf-8") as fh: + json.dump(result, fh, indent=2) + typer.echo(f"Wrote JSON results to {output_json}") + else: + print_field_summary(result) + + +@app.command("cleanup") +def cmd_cleanup( + config: Optional[str] = typer.Option(None, help="Path to config.yaml"), + project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), + emulator_host: Optional[str] = typer.Option(None, help="Emulator host, e.g. localhost:8010"), + log_level: Optional[str] = typer.Option(None, help="Logging level"), + namespace: Optional[List[str]] = typer.Option(None, "--namespace", "-n", help="Namespaces to include"), + exclude_namespace: Optional[List[str]] = typer.Option(None, "--exclude-namespace", help="Namespaces to exclude"), + kind: Optional[List[str]] = typer.Option(None, "--kind", "-k", help="Kinds to include"), + exclude_kind: Optional[List[str]] = typer.Option(None, "--exclude-kind", help="Kinds to exclude"), + ttl_field: Optional[str] = typer.Option(None, help="TTL field name"), + delete_missing_ttl: bool = typer.Option(True, help="Delete when TTL field is missing"), + batch_size: Optional[int] = typer.Option(None, help="Delete batch size"), + dry_run: bool = typer.Option(False, help="Only report counts; do not delete"), +): + cfg = _load_cfg(config, project, emulator_host, log_level) + + if namespace: + cfg.namespace_include = list(namespace) + if exclude_namespace: + cfg.namespace_exclude = list(exclude_namespace) + if kind: + cfg.kinds_include = list(kind) + if exclude_kind: + cfg.kinds_exclude = list(exclude_kind) + if ttl_field: + cfg.ttl_field = ttl_field + cfg.delete_missing_ttl = delete_missing_ttl + if batch_size: + cfg.batch_size = batch_size + + totals = cleanup_expired(cfg, dry_run=dry_run) + deleted_sum = sum(totals.values()) + typer.echo(f"Total entities {'to delete' if dry_run else 'deleted'}: {deleted_sum}") + + +if __name__ == "__main__": + app() \ No newline at end of file diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..7537311 --- /dev/null +++ b/config.yaml @@ -0,0 +1,23 @@ +# Default configuration for local-storage-utils +# Values here can be overridden via CLI flags or environment variables. + +# Core connection +project_id: null # e.g. "my-project"; if omitted, ADC or env will be used +emulator_host: null # e.g. "localhost:8010"; if set, emulator mode is enabled + +# Filters +namespace_include: [] # e.g. ["", "tenant-a"]; empty string means default namespace +namespace_exclude: [] +kinds_include: [] +kinds_exclude: [] + +# Cleanup +ttl_field: "expireAt" +delete_missing_ttl: true +batch_size: 500 + +# Analysis +group_by_field: null + +# Logging +log_level: "INFO" \ No newline at end of file diff --git a/gcd_tools/__init__.py b/gcd_tools/__init__.py new file mode 100644 index 0000000..a1c8e85 --- /dev/null +++ b/gcd_tools/__init__.py @@ -0,0 +1,18 @@ +from .config import AppConfig, load_config, build_client, list_namespaces, list_kinds, format_size +from . import analyze_kinds as analyze_kinds +from . 
import analyze_entity_fields as analyze_entity_fields +from . import cleanup_expired as cleanup_expired +from . import config as config + +__all__ = [ + "AppConfig", + "load_config", + "build_client", + "list_namespaces", + "list_kinds", + "format_size", + "analyze_kinds", + "analyze_entity_fields", + "cleanup_expired", + "config", +] \ No newline at end of file diff --git a/gcd_tools/analyze_entity_fields.py b/gcd_tools/analyze_entity_fields.py new file mode 100644 index 0000000..e4bddcd --- /dev/null +++ b/gcd_tools/analyze_entity_fields.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +import logging +from collections import defaultdict +from typing import DefaultDict, Dict, Iterable, List, Optional, Tuple + +from google.cloud import datastore +from google.cloud.datastore.helpers import entity_to_protobuf + +from .config import AppConfig, build_client, format_size + +logger = logging.getLogger(__name__) + + +def _clone_without_field(entity: datastore.Entity, exclude_field: str) -> datastore.Entity: + new_entity = datastore.Entity(key=entity.key) + for k, v in entity.items(): + if k != exclude_field: + new_entity[k] = v + return new_entity + + +def _estimate_field_contributions( + entities: Iterable[datastore.Entity], + target_fields: Optional[List[str]] = None, +) -> Tuple[Dict[str, int], int, int]: + field_totals: DefaultDict[str, int] = defaultdict(int) + total_size = 0 + entity_count = 0 + + for entity in entities: + entity_count += 1 + proto = entity_to_protobuf(entity)._pb + full_size = len(proto.SerializeToString()) + total_size += full_size + + for field in (target_fields or list(entity.keys())): + if field not in entity: + continue + reduced_entity = _clone_without_field(entity, field) + reduced_size = len(entity_to_protobuf(reduced_entity)._pb.SerializeToString()) + field_totals[field] += max(0, full_size - reduced_size) + + return dict(field_totals), total_size, entity_count + + +def analyze_field_contributions( + config: AppConfig, + kind: str, + namespace: Optional[str] = None, + group_by_field: Optional[str] = None, + only_fields: Optional[List[str]] = None, +) -> Dict: + client = build_client(config) + + query = client.query(kind=kind, namespace=namespace or None) + + if group_by_field: + logger.info( + "Analyzing field contributions for kind=%s, namespace=%s grouped by %s", + kind, + namespace or "(default)", + group_by_field, + ) + grouped_entities: DefaultDict[str, List[datastore.Entity]] = defaultdict(list) + for entity in query.fetch(): + group_val = entity.get(group_by_field) + key = str(group_val) if group_val is not None else "" + grouped_entities[key].append(entity) + + results: Dict[str, Dict] = {} + for group_key, ents in grouped_entities.items(): + field_totals, total_size, entity_count = _estimate_field_contributions( + ents, target_fields=only_fields + ) + results[group_key] = { + "namespace": namespace, + "kind": kind, + "group": group_key, + "entity_count": entity_count, + "total_bytes": total_size, + "total_size": format_size(total_size), + "fields": { + f: { + "bytes": b, + "avg_per_entity": (b / entity_count) if entity_count else 0.0, + "human": format_size(b), + } + for f, b in sorted(field_totals.items(), key=lambda x: x[1], reverse=True) + }, + } + return {"grouped": results} + + # Ungrouped path + logger.info( + "Analyzing field contributions for kind=%s, namespace=%s", + kind, + namespace or "(default)", + ) + field_totals, total_size, entity_count = _estimate_field_contributions( + query.fetch(), target_fields=only_fields + ) + return { + 
"namespace": namespace, + "kind": kind, + "entity_count": entity_count, + "total_bytes": total_size, + "total_size": format_size(total_size), + "fields": { + f: { + "bytes": b, + "avg_per_entity": (b / entity_count) if entity_count else 0.0, + "human": format_size(b), + } + for f, b in sorted(field_totals.items(), key=lambda x: x[1], reverse=True) + }, + } + + +def print_field_summary(result: Dict) -> None: + if "grouped" in result: + for group_key, data in result["grouped"].items(): + ns = data.get("namespace") or "" + print(f"\n[group={group_key}] ns={ns} kind={data['kind']} entities={data['entity_count']} total={data['total_size']}") + for field, stats in data["fields"].items(): + avg = stats["avg_per_entity"] + print(f" {field:30} {stats['human']:>12} ({avg:.1f} bytes avg)") + else: + ns = result.get("namespace") or "" + print( + f"ns={ns} kind={result['kind']} entities={result['entity_count']} total={result['total_size']}" + ) + for field, stats in result["fields"].items(): + avg = stats["avg_per_entity"] + print(f" {field:30} {stats['human']:>12} ({avg:.1f} bytes avg)") \ No newline at end of file diff --git a/gcd_tools/analyze_kinds.py b/gcd_tools/analyze_kinds.py new file mode 100644 index 0000000..323f318 --- /dev/null +++ b/gcd_tools/analyze_kinds.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import logging +from typing import Dict, List, Optional, Tuple + +from google.cloud import datastore +from google.cloud.datastore.helpers import entity_to_protobuf + +from .config import ( + AppConfig, + build_client, + list_namespaces, + list_kinds, + apply_kind_filters, + apply_namespace_filters, + format_size, +) + + +logger = logging.getLogger(__name__) + + +def estimate_entity_count_and_size( + client: datastore.Client, kind: str, namespace: Optional[str] +) -> Tuple[int, int]: + query = client.query(kind=kind, namespace=namespace or None) + total_size = 0 + count = 0 + for entity in query.fetch(): + try: + raw_proto = entity_to_protobuf(entity)._pb + total_size += len(raw_proto.SerializeToString()) + except Exception: + # Fallback: count only + pass + count += 1 + return count, total_size + + +def analyze_kinds(config: AppConfig) -> List[Dict]: + client = build_client(config) + + all_namespaces = list_namespaces(client) + namespaces = apply_namespace_filters( + all_namespaces, config.namespace_include, config.namespace_exclude + ) + + results: List[Dict] = [] + for ns in namespaces: + kinds = list_kinds(client, ns) + kinds = apply_kind_filters(kinds, config.kinds_include, config.kinds_exclude) + + logger.info("Analyzing namespace=%s, %d kinds", ns or "(default)", len(kinds)) + for kind in kinds: + count, total_bytes = estimate_entity_count_and_size(client, kind, ns) + results.append( + { + "namespace": ns, + "kind": kind, + "count": count, + "bytes": total_bytes, + "size": format_size(total_bytes), + } + ) + return results + + +def print_summary_table(rows: List[Dict]) -> None: + # Plain stdout table for wide compatibility + print("namespace,kind,count,size,bytes") + for r in rows: + ns = r.get("namespace") or "" + print(f"{ns},{r['kind']},{r['count']},{r['size']},{r['bytes']}") \ No newline at end of file diff --git a/gcd_tools/cleanup_expired.py b/gcd_tools/cleanup_expired.py new file mode 100644 index 0000000..0062dac --- /dev/null +++ b/gcd_tools/cleanup_expired.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import logging +from datetime import datetime, timezone +from typing import Dict, Iterable, List, Optional + +from google.cloud import datastore + 
+from .config import ( + AppConfig, + build_client, + list_namespaces, + list_kinds, + apply_kind_filters, + apply_namespace_filters, + chunked, +) + +logger = logging.getLogger(__name__) + + +def _delete_in_batches(client: datastore.Client, keys: List[datastore.Key], batch_size: int) -> int: + deleted = 0 + for batch in chunked(keys, batch_size): + client.delete_multi(batch) # type: ignore[arg-type] + deleted += len(batch) + return deleted + + +def cleanup_expired( + config: AppConfig, + dry_run: bool = False, +) -> Dict[str, int]: + client = build_client(config) + + all_namespaces = list_namespaces(client) + namespaces = apply_namespace_filters( + all_namespaces, config.namespace_include, config.namespace_exclude + ) + + totals: Dict[str, int] = {} + now = datetime.now(timezone.utc) + + for ns in namespaces: + kinds = list_kinds(client, ns) + kinds = apply_kind_filters(kinds, config.kinds_include, config.kinds_exclude) + + for kind in kinds: + query = client.query(kind=kind, namespace=ns or None) + to_delete: List[datastore.Key] = [] + for entity in query.fetch(): + expire_at = entity.get(config.ttl_field) + expired = expire_at is None if config.delete_missing_ttl else False + if not expired and expire_at is not None: + try: + expired = expire_at < now + except Exception: + # If unparsable or timezone-less, skip + expired = False + if expired: + to_delete.append(entity.key) + + if dry_run: + logger.info( + "[DRY-RUN] ns=%s kind=%s would delete %d entities", + ns or "(default)", + kind, + len(to_delete), + ) + totals[f"{ns}:{kind}"] = len(to_delete) + else: + deleted = _delete_in_batches(client, to_delete, config.batch_size) if to_delete else 0 + logger.info( + "ns=%s kind=%s deleted %d expired entities", + ns or "(default)", + kind, + deleted, + ) + totals[f"{ns}:{kind}"] = deleted + + return totals \ No newline at end of file diff --git a/gcd_tools/config.py b/gcd_tools/config.py new file mode 100644 index 0000000..d1c3d81 --- /dev/null +++ b/gcd_tools/config.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +import os +import logging +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Optional, Sequence, Tuple + +import yaml +from google.cloud import datastore + + +@dataclass +class AppConfig: + project_id: Optional[str] = None + emulator_host: Optional[str] = None + + # Filters + namespace_include: List[str] = field(default_factory=list) + namespace_exclude: List[str] = field(default_factory=list) + kinds_include: List[str] = field(default_factory=list) + kinds_exclude: List[str] = field(default_factory=list) + + # Cleanup settings + ttl_field: str = "expireAt" + delete_missing_ttl: bool = True + batch_size: int = 500 + + # Analysis settings + group_by_field: Optional[str] = None + + # Logging + log_level: str = "INFO" + + +def _as_list(value: Optional[Iterable[str]]) -> List[str]: + if value is None: + return [] + if isinstance(value, (list, tuple)): + return [str(v) for v in value] + return [str(value)] + + +def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) -> AppConfig: + config = AppConfig() + + # Load YAML if provided or if default exists + data: Dict = {} + candidate = path or os.getenv("LSU_CONFIG") + if not candidate and os.path.exists("config.yaml"): + candidate = "config.yaml" + + if candidate and os.path.exists(candidate): + with open(candidate, "r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + + overrides = overrides or {} + merged = {**data, **overrides} + + config.project_id = 
merged.get("project_id") or os.getenv("DATASTORE_PROJECT_ID") + config.emulator_host = merged.get("emulator_host") or os.getenv("DATASTORE_EMULATOR_HOST") + + config.namespace_include = _as_list(merged.get("namespace_include")) + config.namespace_exclude = _as_list(merged.get("namespace_exclude")) + config.kinds_include = _as_list(merged.get("kinds_include")) + config.kinds_exclude = _as_list(merged.get("kinds_exclude")) + + config.ttl_field = merged.get("ttl_field", config.ttl_field) + config.delete_missing_ttl = bool(merged.get("delete_missing_ttl", config.delete_missing_ttl)) + config.batch_size = int(merged.get("batch_size", config.batch_size)) + + config.group_by_field = merged.get("group_by_field", config.group_by_field) + + config.log_level = str(merged.get("log_level", config.log_level)).upper() + + _configure_logging(config.log_level) + return config + + +def _configure_logging(level: str) -> None: + level_value = getattr(logging, level.upper(), logging.INFO) + logging.basicConfig(level=level_value, format="%(asctime)s | %(levelname)s | %(message)s") + + +def build_client(config: AppConfig) -> datastore.Client: + # Prefer explicit emulator_host if provided, otherwise env decides + if config.emulator_host: + os.environ["DATASTORE_EMULATOR_HOST"] = config.emulator_host + # Project id is required in emulator; optional on GCP (ADC will detect) + if config.project_id: + os.environ.setdefault("DATASTORE_PROJECT_ID", config.project_id) + + if os.getenv("DATASTORE_EMULATOR_HOST"): + # When using emulator, ensure a project ID is present + project_id = os.getenv("DATASTORE_PROJECT_ID") or config.project_id or "local-dev" + os.environ["DATASTORE_PROJECT_ID"] = project_id + return datastore.Client(project=project_id) + + # GCP path, relies on ADC if project not provided + return datastore.Client(project=config.project_id) + + +def list_namespaces(client: datastore.Client) -> List[str]: + # Include default namespace as "" first + namespaces: List[str] = [""] + query = client.query(kind="__namespace__") + query.keys_only() + for entity in query.fetch(): + name = entity.key.name or "" + if name != "": + namespaces.append(name) + return namespaces + + +def list_kinds(client: datastore.Client, namespace: Optional[str]) -> List[str]: + query = client.query(kind="__kind__", namespace=namespace or None) + query.keys_only() + return [e.key.name for e in query.fetch()] + + +def apply_namespace_filters(all_namespaces: Sequence[str], include: Sequence[str], exclude: Sequence[str]) -> List[str]: + selected = list(all_namespaces) + if include: + include_set = set(include) + selected = [ns for ns in selected if ns in include_set] + if exclude: + exclude_set = set(exclude) + selected = [ns for ns in selected if ns not in exclude_set] + return selected + + +def apply_kind_filters(all_kinds: Sequence[str], include: Sequence[str], exclude: Sequence[str]) -> List[str]: + selected = list(all_kinds) + if include: + include_set = set(include) + selected = [k for k in selected if k in include_set] + if exclude: + exclude_set = set(exclude) + selected = [k for k in selected if k not in exclude_set] + return selected + + +def chunked(iterable: Sequence, chunk_size: int) -> Iterable[Sequence]: + for i in range(0, len(iterable), max(1, chunk_size)): + yield iterable[i : i + chunk_size] + + +def format_size(bytes_size: int) -> str: + size = float(bytes_size) + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size < 1024: + return f"{size:.2f} {unit}" + size /= 1024 + return f"{size:.2f} PB" \ No newline at end of file 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b9fcc3a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,47 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "local-storage-utils" +version = "0.1.0" +description = "Utilities for analyzing and managing local Datastore/Firestore (Datastore mode) data" +authors = [ + { name = "Your Name", email = "you@example.com" }, +] +readme = "README.md" +requires-python = ">=3.9" +license = { file = "LICENSE" } +keywords = ["google-cloud-datastore", "firestore", "emulator", "cleanup", "analysis"] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +dependencies = [ + "google-cloud-datastore>=2.19.0", + "PyYAML>=6.0.1", + "typer>=0.12.3", +] + +[project.optional-dependencies] +rich = ["rich>=13.7.0"] + +[project.scripts] +lsu = "cli:app" +local-storage-utils = "cli:app" + +[tool.setuptools.packages.find] +where = ["."] +include = ["gcd_tools*"] + +[tool.black] +line-length = 100 + +[tool.ruff] +line-length = 100 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e8f4d09 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +google-cloud-datastore>=2.19.0 +PyYAML>=6.0.1 +typer>=0.12.3 \ No newline at end of file diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 0000000..b7f63dd --- /dev/null +++ b/tests/test_import.py @@ -0,0 +1,6 @@ +def test_imports(): + import gcd_tools + from gcd_tools import analyze_kinds, analyze_entity_fields, cleanup_expired, config + + assert gcd_tools is not None + assert hasattr(config, "AppConfig") \ No newline at end of file From 0619748b6ffa8b21bbf83134cfdedda23e2b800e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 12 Aug 2025 20:11:47 +0000 Subject: [PATCH 02/12] Refactor config handling, improve CLI flexibility, and update documentation Co-authored-by: isisosirishorus --- .github/workflows/build.yml | 3 + README.md | 89 ++++++++++++++++++++++++++++-- cli.py | 60 ++++++++++---------- config.yaml | 14 +++-- gcd_tools/analyze_entity_fields.py | 44 ++++++++++++--- gcd_tools/analyze_kinds.py | 12 ++-- gcd_tools/cleanup_expired.py | 14 ++--- gcd_tools/config.py | 49 ++++++---------- 8 files changed, 187 insertions(+), 98 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 91cf7cc..cd842b7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -30,6 +30,9 @@ jobs: needs: ci if: startsWith(github.ref, 'refs/tags/v') runs-on: ubuntu-latest + permissions: + id-token: write + contents: read steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 diff --git a/README.md b/README.md index ce7ed6f..59b25c2 100644 --- a/README.md +++ b/README.md @@ -2,23 +2,104 @@ Utilities for analyzing and managing local Datastore/Firestore (Datastore mode) data. Works with both the Datastore Emulator and GCP using Application Default Credentials. -## Install +## Install (PyPI) ```bash +pip install local-storage-utils +``` + +This installs the `lsu` CLI. 
+ +## Install (from source) + +```bash +git clone +cd local-storage-utils +python -m venv .venv +source .venv/bin/activate +pip install -U pip pip install -e . ``` -## CLI +## Configuration + +- By default, the CLI loads `config.yaml` from the current directory if present. +- Any CLI flag overrides values from `config.yaml`. +- If neither config nor flags provide a value, the tool falls back to environment variables (for emulator detection) or sensible defaults. + +Key settings in `config.yaml`: + +```yaml +project_id: "my-project" # If omitted, ADC/env will be used +emulator_host: "localhost:8010" # If set, uses Datastore Emulator + +# Explicit filters (empty means all) +namespaces: [""] # Empty -> iterate all namespaces (including default "") +kinds: [] # Empty -> iterate all kinds per namespace + +# Optional defaults +kind: "SourceCollectionStateEntity" # Default for analyze-fields +namespace: "" # Default namespace for analyze-fields + +# Cleanup +ttl_field: "expireAt" +delete_missing_ttl: true +batch_size: 500 + +# Analysis +group_by_field: null + +# Logging +log_level: "INFO" +``` + +## CLI usage ```bash # Kind-level counts and size estimates lsu analyze-kinds --project my-project -# Field contribution analysis for a kind +# Use all namespaces/kinds by default, or restrict explicitly +lsu analyze-kinds --namespace "" --namespace tenant-a --kind SourceCollectionStateEntity + +# Field contribution analysis (falls back to config.kind/config.namespace if not provided) lsu analyze-fields --kind SourceCollectionStateEntity --namespace "" --group-by batchId -# TTL cleanup across kinds/namespaces (dry-run) +# TTL cleanup across namespaces/kinds (dry-run) lsu cleanup --ttl-field expireAt --dry-run + +# TTL cleanup restricted to specific namespaces/kinds +lsu cleanup --namespace "" --namespace tenant-a --kind pipeline-job ``` Use `--help` on any command for full options. Config can be provided via `config.yaml` or flags. + +## Development + +- Create a virtual environment and install in editable mode as shown above +- Run tests: + +```bash +python -m pip install pytest +pytest -q +``` + +- Lint/format (optional if you use pre-commit/CI): +```bash +python -m pip install ruff black +ruff check . +black . +``` + +## Publishing + +- CI is configured to publish to PyPI on tags `v*`. +- Create a PyPI token and add it to repository secrets as `PYPI_API_TOKEN`. +- Tag and push: + +```bash +git tag v0.1.0 +git push origin v0.1.0 +``` + +The GitHub Actions workflow will build and upload the package to PyPI. diff --git a/cli.py b/cli.py index 9e6cb26..e5c7c16 100644 --- a/cli.py +++ b/cli.py @@ -5,7 +5,7 @@ import typer -from gcd_tools.config import AppConfig, load_config, format_size +from gcd_tools.config import AppConfig, load_config from gcd_tools.analyze_kinds import analyze_kinds, print_summary_table from gcd_tools.analyze_entity_fields import analyze_field_contributions, print_field_summary from gcd_tools.cleanup_expired import cleanup_expired @@ -35,21 +35,16 @@ def cmd_analyze_kinds( project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), emulator_host: Optional[str] = typer.Option(None, help="Emulator host, e.g. 
localhost:8010"), log_level: Optional[str] = typer.Option(None, help="Logging level"), - namespace: Optional[List[str]] = typer.Option(None, "--namespace", "-n", help="Namespaces to include"), - exclude_namespace: Optional[List[str]] = typer.Option(None, "--exclude-namespace", help="Namespaces to exclude"), - kind: Optional[List[str]] = typer.Option(None, "--kind", "-k", help="Kinds to include"), - exclude_kind: Optional[List[str]] = typer.Option(None, "--exclude-kind", help="Kinds to exclude"), + namespace: Optional[List[str]] = typer.Option(None, "--namespace", "-n", help="Namespaces to process (omit to process all)"), + kind: Optional[List[str]] = typer.Option(None, "--kind", "-k", help="Kinds to process (omit to process all in each namespace)"), output: Optional[str] = typer.Option(None, help="Output CSV file path"), ): cfg = _load_cfg(config, project, emulator_host, log_level) + if namespace: - cfg.namespace_include = list(namespace) - if exclude_namespace: - cfg.namespace_exclude = list(exclude_namespace) + cfg.namespaces = list(namespace) if kind: - cfg.kinds_include = list(kind) - if exclude_kind: - cfg.kinds_exclude = list(exclude_kind) + cfg.kinds = list(kind) rows = analyze_kinds(cfg) if output: @@ -65,9 +60,9 @@ def cmd_analyze_kinds( @app.command("analyze-fields") def cmd_analyze_fields( - kind: str = typer.Option(..., "--kind", "-k", help="Kind to analyze"), - namespace: Optional[str] = typer.Option(None, "--namespace", "-n", help="Namespace to query"), - group_by: Optional[str] = typer.Option(None, help="Group results by this field value"), + kind: Optional[str] = typer.Option(None, "--kind", "-k", help="Kind to analyze (falls back to config.kind)"), + namespace: Optional[str] = typer.Option(None, "--namespace", "-n", help="Namespace to query (falls back to config.namespace; omit to use all)"), + group_by: Optional[str] = typer.Option(None, help="Group results by this field value (falls back to config.group_by_field)"), only_field: Optional[List[str]] = typer.Option(None, "--only-field", help="Only consider these fields"), config: Optional[str] = typer.Option(None, help="Path to config.yaml"), project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), @@ -76,8 +71,16 @@ def cmd_analyze_fields( output_json: Optional[str] = typer.Option(None, help="Write raw JSON results to file"), ): cfg = _load_cfg(config, project, emulator_host, log_level) + + target_kind = kind or cfg.kind + target_namespace = namespace if namespace is not None else cfg.namespace + group_by_field = group_by if group_by is not None else cfg.group_by_field + + if not target_kind: + raise typer.BadParameter("--kind is required (either via flag or config.kind)") + result = analyze_field_contributions( - cfg, kind=kind, namespace=namespace, group_by_field=group_by, only_fields=list(only_field) if only_field else None + cfg, kind=target_kind, namespace=target_namespace, group_by_field=group_by_field, only_fields=list(only_field) if only_field else None ) if output_json: @@ -94,29 +97,24 @@ def cmd_cleanup( project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), emulator_host: Optional[str] = typer.Option(None, help="Emulator host, e.g. 
localhost:8010"), log_level: Optional[str] = typer.Option(None, help="Logging level"), - namespace: Optional[List[str]] = typer.Option(None, "--namespace", "-n", help="Namespaces to include"), - exclude_namespace: Optional[List[str]] = typer.Option(None, "--exclude-namespace", help="Namespaces to exclude"), - kind: Optional[List[str]] = typer.Option(None, "--kind", "-k", help="Kinds to include"), - exclude_kind: Optional[List[str]] = typer.Option(None, "--exclude-kind", help="Kinds to exclude"), - ttl_field: Optional[str] = typer.Option(None, help="TTL field name"), - delete_missing_ttl: bool = typer.Option(True, help="Delete when TTL field is missing"), - batch_size: Optional[int] = typer.Option(None, help="Delete batch size"), + namespace: Optional[List[str]] = typer.Option(None, "--namespace", "-n", help="Namespaces to process (omit to process all)"), + kind: Optional[List[str]] = typer.Option(None, "--kind", "-k", help="Kinds to process (omit to process all in each namespace)"), + ttl_field: Optional[str] = typer.Option(None, help="TTL field name (falls back to config.ttl_field)"), + delete_missing_ttl: Optional[bool] = typer.Option(None, help="Delete when TTL field is missing (falls back to config.delete_missing_ttl)"), + batch_size: Optional[int] = typer.Option(None, help="Delete batch size (falls back to config.batch_size)"), dry_run: bool = typer.Option(False, help="Only report counts; do not delete"), ): cfg = _load_cfg(config, project, emulator_host, log_level) if namespace: - cfg.namespace_include = list(namespace) - if exclude_namespace: - cfg.namespace_exclude = list(exclude_namespace) + cfg.namespaces = list(namespace) if kind: - cfg.kinds_include = list(kind) - if exclude_kind: - cfg.kinds_exclude = list(exclude_kind) - if ttl_field: + cfg.kinds = list(kind) + if ttl_field is not None: cfg.ttl_field = ttl_field - cfg.delete_missing_ttl = delete_missing_ttl - if batch_size: + if delete_missing_ttl is not None: + cfg.delete_missing_ttl = delete_missing_ttl + if batch_size is not None: cfg.batch_size = batch_size totals = cleanup_expired(cfg, dry_run=dry_run) diff --git a/config.yaml b/config.yaml index 7537311..721dff4 100644 --- a/config.yaml +++ b/config.yaml @@ -5,11 +5,15 @@ project_id: null # e.g. "my-project"; if omitted, ADC or env will be used emulator_host: null # e.g. "localhost:8010"; if set, emulator mode is enabled -# Filters -namespace_include: [] # e.g. ["", "tenant-a"]; empty string means default namespace -namespace_exclude: [] -kinds_include: [] -kinds_exclude: [] +# Explicit filters (empty == all) +# Empty list of namespaces means: iterate over all namespaces (including default "") +namespaces: [] # e.g. ["", "tenant-a"] +# Empty list of kinds means: iterate over all kinds within each namespace +kinds: [] # e.g. 
["SourceCollectionStateEntity"] + +# Optional defaults used by some commands +kind: null # default kind for analyze-fields +namespace: null # default namespace for analyze-fields # Cleanup ttl_field: "expireAt" diff --git a/gcd_tools/analyze_entity_fields.py b/gcd_tools/analyze_entity_fields.py index e4bddcd..6de862d 100644 --- a/gcd_tools/analyze_entity_fields.py +++ b/gcd_tools/analyze_entity_fields.py @@ -7,7 +7,7 @@ from google.cloud import datastore from google.cloud.datastore.helpers import entity_to_protobuf -from .config import AppConfig, build_client, format_size +from .config import AppConfig, build_client, format_size, list_namespaces logger = logging.getLogger(__name__) @@ -44,15 +44,13 @@ def _estimate_field_contributions( return dict(field_totals), total_size, entity_count -def analyze_field_contributions( - config: AppConfig, +def _analyze_single_namespace( + client: datastore.Client, kind: str, - namespace: Optional[str] = None, - group_by_field: Optional[str] = None, - only_fields: Optional[List[str]] = None, + namespace: Optional[str], + group_by_field: Optional[str], + only_fields: Optional[List[str]], ) -> Dict: - client = build_client(config) - query = client.query(kind=kind, namespace=namespace or None) if group_by_field: @@ -117,7 +115,37 @@ def analyze_field_contributions( } +def analyze_field_contributions( + config: AppConfig, + kind: str, + namespace: Optional[str] = None, + group_by_field: Optional[str] = None, + only_fields: Optional[List[str]] = None, +) -> Dict: + client = build_client(config) + + # If no namespace provided, iterate across all namespaces + if namespace is None: + results: Dict[str, Dict] = {} + for ns in list_namespaces(client): + results[ns or ""] = _analyze_single_namespace( + client, kind=kind, namespace=ns, group_by_field=group_by_field, only_fields=only_fields + ) + return {"by_namespace": results} + + # Single namespace + return _analyze_single_namespace( + client, kind=kind, namespace=namespace, group_by_field=group_by_field, only_fields=only_fields + ) + + def print_field_summary(result: Dict) -> None: + if "by_namespace" in result: + for ns, data in result["by_namespace"].items(): + print(f"\n=== namespace: {ns or '(default)'} ===") + print_field_summary(data) + return + if "grouped" in result: for group_key, data in result["grouped"].items(): ns = data.get("namespace") or "" diff --git a/gcd_tools/analyze_kinds.py b/gcd_tools/analyze_kinds.py index 323f318..31aac1a 100644 --- a/gcd_tools/analyze_kinds.py +++ b/gcd_tools/analyze_kinds.py @@ -11,8 +11,6 @@ build_client, list_namespaces, list_kinds, - apply_kind_filters, - apply_namespace_filters, format_size, ) @@ -40,15 +38,13 @@ def estimate_entity_count_and_size( def analyze_kinds(config: AppConfig) -> List[Dict]: client = build_client(config) - all_namespaces = list_namespaces(client) - namespaces = apply_namespace_filters( - all_namespaces, config.namespace_include, config.namespace_exclude - ) + # Determine namespaces: explicit list, or all + namespaces = config.namespaces if config.namespaces else list_namespaces(client) results: List[Dict] = [] for ns in namespaces: - kinds = list_kinds(client, ns) - kinds = apply_kind_filters(kinds, config.kinds_include, config.kinds_exclude) + # Determine kinds: explicit list, or all in namespace + kinds = config.kinds if config.kinds else list_kinds(client, ns) logger.info("Analyzing namespace=%s, %d kinds", ns or "(default)", len(kinds)) for kind in kinds: diff --git a/gcd_tools/cleanup_expired.py b/gcd_tools/cleanup_expired.py index 
0062dac..7ef36ef 100644 --- a/gcd_tools/cleanup_expired.py +++ b/gcd_tools/cleanup_expired.py @@ -2,7 +2,7 @@ import logging from datetime import datetime, timezone -from typing import Dict, Iterable, List, Optional +from typing import Dict, List, Optional from google.cloud import datastore @@ -11,8 +11,6 @@ build_client, list_namespaces, list_kinds, - apply_kind_filters, - apply_namespace_filters, chunked, ) @@ -33,17 +31,15 @@ def cleanup_expired( ) -> Dict[str, int]: client = build_client(config) - all_namespaces = list_namespaces(client) - namespaces = apply_namespace_filters( - all_namespaces, config.namespace_include, config.namespace_exclude - ) + # Determine namespaces: explicit list, or all + namespaces = config.namespaces if config.namespaces else list_namespaces(client) totals: Dict[str, int] = {} now = datetime.now(timezone.utc) for ns in namespaces: - kinds = list_kinds(client, ns) - kinds = apply_kind_filters(kinds, config.kinds_include, config.kinds_exclude) + # Determine kinds: explicit list, or all in namespace + kinds = config.kinds if config.kinds else list_kinds(client, ns) for kind in kinds: query = client.query(kind=kind, namespace=ns or None) diff --git a/gcd_tools/config.py b/gcd_tools/config.py index d1c3d81..568fd0e 100644 --- a/gcd_tools/config.py +++ b/gcd_tools/config.py @@ -3,7 +3,7 @@ import os import logging from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Dict, Iterable, List, Optional, Sequence import yaml from google.cloud import datastore @@ -14,11 +14,13 @@ class AppConfig: project_id: Optional[str] = None emulator_host: Optional[str] = None - # Filters - namespace_include: List[str] = field(default_factory=list) - namespace_exclude: List[str] = field(default_factory=list) - kinds_include: List[str] = field(default_factory=list) - kinds_exclude: List[str] = field(default_factory=list) + # Explicit filters (when empty -> use all) + namespaces: List[str] = field(default_factory=list) + kinds: List[str] = field(default_factory=list) + + # Optional defaults for commands that need them (e.g., analyze-fields) + kind: Optional[str] = None + namespace: Optional[str] = None # Cleanup settings ttl_field: str = "expireAt" @@ -59,10 +61,13 @@ def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) -> config.project_id = merged.get("project_id") or os.getenv("DATASTORE_PROJECT_ID") config.emulator_host = merged.get("emulator_host") or os.getenv("DATASTORE_EMULATOR_HOST") - config.namespace_include = _as_list(merged.get("namespace_include")) - config.namespace_exclude = _as_list(merged.get("namespace_exclude")) - config.kinds_include = _as_list(merged.get("kinds_include")) - config.kinds_exclude = _as_list(merged.get("kinds_exclude")) + # Explicit lists (no include/exclude). 
Empty -> all + config.namespaces = _as_list(merged.get("namespaces")) + config.kinds = _as_list(merged.get("kinds")) + + # Optional defaults used by some commands + config.kind = merged.get("kind") + config.namespace = merged.get("namespace") config.ttl_field = merged.get("ttl_field", config.ttl_field) config.delete_missing_ttl = bool(merged.get("delete_missing_ttl", config.delete_missing_ttl)) @@ -117,29 +122,7 @@ def list_kinds(client: datastore.Client, namespace: Optional[str]) -> List[str]: return [e.key.name for e in query.fetch()] -def apply_namespace_filters(all_namespaces: Sequence[str], include: Sequence[str], exclude: Sequence[str]) -> List[str]: - selected = list(all_namespaces) - if include: - include_set = set(include) - selected = [ns for ns in selected if ns in include_set] - if exclude: - exclude_set = set(exclude) - selected = [ns for ns in selected if ns not in exclude_set] - return selected - - -def apply_kind_filters(all_kinds: Sequence[str], include: Sequence[str], exclude: Sequence[str]) -> List[str]: - selected = list(all_kinds) - if include: - include_set = set(include) - selected = [k for k in selected if k in include_set] - if exclude: - exclude_set = set(exclude) - selected = [k for k in selected if k not in exclude_set] - return selected - - -def chunked(iterable: Sequence, chunk_size: int) -> Iterable[Sequence]: +def chunked(iterable: Sequence, chunk_size: int): for i in range(0, len(iterable), max(1, chunk_size)): yield iterable[i : i + chunk_size] From 359add6eacd4abc9facbdd19281eccfe99391189 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 12 Aug 2025 20:29:36 +0000 Subject: [PATCH 03/12] Refactor CLI options using Annotated for improved type hints and readability Co-authored-by: isisosirishorus --- cli.py | 84 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/cli.py b/cli.py index e5c7c16..329262b 100644 --- a/cli.py +++ b/cli.py @@ -2,6 +2,7 @@ import json from typing import List, Optional +from typing import Annotated import typer @@ -12,6 +13,30 @@ app = typer.Typer(help="Utilities for analyzing and managing local Datastore/Firestore (Datastore mode)") +# Reusable option aliases +ConfigOpt = Annotated[Optional[str], typer.Option(None, "--config", help="Path to config.yaml")] +ProjectOpt = Annotated[Optional[str], typer.Option(None, "--project", help="GCP/Emulator project id")] +EmulatorHostOpt = Annotated[ + Optional[str], typer.Option(None, "--emulator-host", help="Emulator host, e.g. 
localhost:8010") +] +LogLevelOpt = Annotated[Optional[str], typer.Option(None, "--log-level", help="Logging level")] +NamespacesOpt = Annotated[ + Optional[List[str]], + typer.Option(None, "--namespace", "-n", help="Namespaces to process (omit to process all)"), +] +KindsOpt = Annotated[ + Optional[List[str]], + typer.Option( + None, "--kind", "-k", help="Kinds to process (omit to process all in each namespace)" + ), +] +SingleNamespaceOpt = Annotated[ + Optional[str], typer.Option(None, "--namespace", "-n", help="Namespace to query (omit to use all)") +] +SingleKindOpt = Annotated[ + Optional[str], typer.Option(None, "--kind", "-k", help="Kind to analyze (falls back to config.kind)") +] + def _load_cfg( config_path: Optional[str], @@ -31,12 +56,12 @@ def _load_cfg( @app.command("analyze-kinds") def cmd_analyze_kinds( - config: Optional[str] = typer.Option(None, help="Path to config.yaml"), - project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), - emulator_host: Optional[str] = typer.Option(None, help="Emulator host, e.g. localhost:8010"), - log_level: Optional[str] = typer.Option(None, help="Logging level"), - namespace: Optional[List[str]] = typer.Option(None, "--namespace", "-n", help="Namespaces to process (omit to process all)"), - kind: Optional[List[str]] = typer.Option(None, "--kind", "-k", help="Kinds to process (omit to process all in each namespace)"), + config: ConfigOpt, + project: ProjectOpt, + emulator_host: EmulatorHostOpt, + log_level: LogLevelOpt, + namespace: NamespacesOpt, + kind: KindsOpt, output: Optional[str] = typer.Option(None, help="Output CSV file path"), ): cfg = _load_cfg(config, project, emulator_host, log_level) @@ -60,15 +85,15 @@ def cmd_analyze_kinds( @app.command("analyze-fields") def cmd_analyze_fields( - kind: Optional[str] = typer.Option(None, "--kind", "-k", help="Kind to analyze (falls back to config.kind)"), - namespace: Optional[str] = typer.Option(None, "--namespace", "-n", help="Namespace to query (falls back to config.namespace; omit to use all)"), - group_by: Optional[str] = typer.Option(None, help="Group results by this field value (falls back to config.group_by_field)"), - only_field: Optional[List[str]] = typer.Option(None, "--only-field", help="Only consider these fields"), - config: Optional[str] = typer.Option(None, help="Path to config.yaml"), - project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), - emulator_host: Optional[str] = typer.Option(None, help="Emulator host, e.g. 
localhost:8010"), - log_level: Optional[str] = typer.Option(None, help="Logging level"), - output_json: Optional[str] = typer.Option(None, help="Write raw JSON results to file"), + kind: SingleKindOpt, + namespace: SingleNamespaceOpt, + group_by: Annotated[Optional[str], typer.Option(None, help="Group results by this field value (falls back to config.group_by_field)")], + only_field: Annotated[Optional[List[str]], typer.Option(None, "--only-field", help="Only consider these fields")], + config: ConfigOpt, + project: ProjectOpt, + emulator_host: EmulatorHostOpt, + log_level: LogLevelOpt, + output_json: Annotated[Optional[str], typer.Option(None, help="Write raw JSON results to file")], ): cfg = _load_cfg(config, project, emulator_host, log_level) @@ -80,7 +105,11 @@ def cmd_analyze_fields( raise typer.BadParameter("--kind is required (either via flag or config.kind)") result = analyze_field_contributions( - cfg, kind=target_kind, namespace=target_namespace, group_by_field=group_by_field, only_fields=list(only_field) if only_field else None + cfg, + kind=target_kind, + namespace=target_namespace, + group_by_field=group_by_field, + only_fields=list(only_field) if only_field else None, ) if output_json: @@ -93,16 +122,19 @@ def cmd_analyze_fields( @app.command("cleanup") def cmd_cleanup( - config: Optional[str] = typer.Option(None, help="Path to config.yaml"), - project: Optional[str] = typer.Option(None, help="GCP/Emulator project id"), - emulator_host: Optional[str] = typer.Option(None, help="Emulator host, e.g. localhost:8010"), - log_level: Optional[str] = typer.Option(None, help="Logging level"), - namespace: Optional[List[str]] = typer.Option(None, "--namespace", "-n", help="Namespaces to process (omit to process all)"), - kind: Optional[List[str]] = typer.Option(None, "--kind", "-k", help="Kinds to process (omit to process all in each namespace)"), - ttl_field: Optional[str] = typer.Option(None, help="TTL field name (falls back to config.ttl_field)"), - delete_missing_ttl: Optional[bool] = typer.Option(None, help="Delete when TTL field is missing (falls back to config.delete_missing_ttl)"), - batch_size: Optional[int] = typer.Option(None, help="Delete batch size (falls back to config.batch_size)"), - dry_run: bool = typer.Option(False, help="Only report counts; do not delete"), + config: ConfigOpt, + project: ProjectOpt, + emulator_host: EmulatorHostOpt, + log_level: LogLevelOpt, + namespace: NamespacesOpt, + kind: KindsOpt, + ttl_field: Annotated[Optional[str], typer.Option(None, help="TTL field name (falls back to config.ttl_field)")], + delete_missing_ttl: Annotated[ + Optional[bool], + typer.Option(None, help="Delete when TTL field is missing (falls back to config.delete_missing_ttl)"), + ], + batch_size: Annotated[Optional[int], typer.Option(None, help="Delete batch size (falls back to config.batch_size)")], + dry_run: Annotated[bool, typer.Option(False, help="Only report counts; do not delete")], ): cfg = _load_cfg(config, project, emulator_host, log_level) From 953df966b10dbb17f0c5e2171070b40f48409e95 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 12 Aug 2025 23:36:17 +0000 Subject: [PATCH 04/12] Checkpoint before follow-up message Co-authored-by: isisosirishorus --- .github/workflows/pr.yml | 23 +++++++++++++++++++++++ .github/workflows/release.yml | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 .github/workflows/pr.yml create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/pr.yml 
b/.github/workflows/pr.yml new file mode 100644 index 0000000..2ba4f57 --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,23 @@ +name: pr + +on: + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install + run: | + python -m pip install -U pip + python -m pip install . + python -m pip install pytest + - name: Test + run: pytest -q \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..9d7dbde --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,34 @@ +name: release + +on: + push: + branches: [ main ] + +jobs: + release: + runs-on: ubuntu-latest + permissions: + contents: write # to push tags and release notes + id-token: write # for PyPI trusted publishing (optional) + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # full history for semantic-release + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install build and release tooling + run: | + python -m pip install -U pip + python -m pip install . + python -m pip install build python-semantic-release + - name: Run tests + run: | + python -m pip install pytest + pytest -q + - name: Semantic Release + env: + PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + semantic-release publish \ No newline at end of file From fbf5abb01796ca6965407355e8b46929c152d07d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 12 Aug 2025 23:38:39 +0000 Subject: [PATCH 05/12] Enhance CI/CD workflows with linting, security checks, and release configuration Co-authored-by: isisosirishorus --- .github/workflows/pr.yml | 16 ++++++++++++++-- .github/workflows/release.yml | 31 ++++++++++++++++++++----------- .gitignore | 7 +++++++ pyproject.toml | 15 ++++++++++++++- 4 files changed, 55 insertions(+), 14 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 2ba4f57..8bd86ea 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -14,10 +14,22 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: 'pip' - name: Install run: | python -m pip install -U pip python -m pip install . - python -m pip install pytest + python -m pip install pytest ruff black build pip-audit + - name: Lint + run: | + ruff check . + black --check . - name: Test - run: pytest -q \ No newline at end of file + run: pytest -q + - name: Build and verify + run: | + python -m build + twine check dist/* || true + - name: Security audit + run: | + pip-audit -r requirements.txt || true \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9d7dbde..4208745 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,27 +8,36 @@ jobs: release: runs-on: ubuntu-latest permissions: - contents: write # to push tags and release notes - id-token: write # for PyPI trusted publishing (optional) + contents: write + id-token: write steps: - uses: actions/checkout@v4 with: - fetch-depth: 0 # full history for semantic-release + fetch-depth: 0 - uses: actions/setup-python@v5 with: python-version: '3.11' - - name: Install build and release tooling + cache: 'pip' + - name: Install run: | python -m pip install -U pip python -m pip install . 
- python -m pip install build python-semantic-release - - name: Run tests + python -m pip install pytest ruff black build python-semantic-release pip-audit + - name: Lint run: | - python -m pip install pytest - pytest -q - - name: Semantic Release + ruff check . + black --check . + - name: Test + run: pytest -q + - name: Build and verify + run: | + python -m build + twine check dist/* || true + - name: Security audit + run: | + pip-audit -r requirements.txt || true + - name: Semantic Release (version, tag, GitHub release, PyPI) env: PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - semantic-release publish \ No newline at end of file + run: semantic-release publish \ No newline at end of file diff --git a/.gitignore b/.gitignore index b7faf40..e54dbfc 100644 --- a/.gitignore +++ b/.gitignore @@ -205,3 +205,10 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# Local configuration +config.yaml + +# Editor/OS +.DS_Store +Thumbs.db diff --git a/pyproject.toml b/pyproject.toml index b9fcc3a..862abd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,4 +44,17 @@ include = ["gcd_tools*"] line-length = 100 [tool.ruff] -line-length = 100 \ No newline at end of file +line-length = 100 + +[tool.semantic_release] +version_variable = "pyproject.toml:version" +branch = "main" +upload_to_pypi = true +dist_path = "dist" +build_command = "python -m build" +commit_message = "chore(release): {version} [skip ci]" +changelog_sections = "feature,fix,perf,refactor,docs,style,build,ci,chore" + +[tool.semantic_release.remote] +name = "origin" +repo_url = "https://github.com/your-org/local-storage-utils" \ No newline at end of file From 0198bad33930355ebbfec95b056bc68b6b7e6150 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 12 Aug 2025 23:44:49 +0000 Subject: [PATCH 06/12] Update README with config guidance and publishing workflow details Co-authored-by: isisosirishorus --- README.md | 18 +++++++----------- config.yaml | 27 --------------------------- 2 files changed, 7 insertions(+), 38 deletions(-) delete mode 100644 config.yaml diff --git a/README.md b/README.md index 59b25c2..cc76d7f 100644 --- a/README.md +++ b/README.md @@ -23,11 +23,11 @@ pip install -e . ## Configuration -- By default, the CLI loads `config.yaml` from the current directory if present. +- Create a local `config.yaml` in your working directory. It is gitignored and not included in the repo. - Any CLI flag overrides values from `config.yaml`. - If neither config nor flags provide a value, the tool falls back to environment variables (for emulator detection) or sensible defaults. -Key settings in `config.yaml`: +Example `config.yaml`: ```yaml project_id: "my-project" # If omitted, ADC/env will be used @@ -93,13 +93,9 @@ black . ## Publishing -- CI is configured to publish to PyPI on tags `v*`. -- Create a PyPI token and add it to repository secrets as `PYPI_API_TOKEN`. -- Tag and push: +- Automated: pushing to `main` triggers versioning, tagging, GitHub release, and PyPI publish via semantic-release. +- Prerequisites: + - Add a PyPI token to repo secrets as `PYPI_API_TOKEN`. + - Use conventional commits for proper versioning. -```bash -git tag v0.1.0 -git push origin v0.1.0 -``` - -The GitHub Actions workflow will build and upload the package to PyPI. +Main branch should be protected (require PRs, disallow direct pushes) in repository settings. 
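Conventional commit messages are what drive the version bump in this flow; a few illustrative examples (the messages themselves are hypothetical, not actual commits in this repo):

```bash
# Illustrative conventional commits and the release each one triggers
git commit -m "fix: skip entities whose TTL value cannot be compared"        # patch release
git commit -m "feat: add --output-json to analyze-fields"                    # minor release
git commit -m "feat!: replace include/exclude filters with explicit lists"   # major release
```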
diff --git a/config.yaml b/config.yaml deleted file mode 100644 index 721dff4..0000000 --- a/config.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Default configuration for local-storage-utils -# Values here can be overridden via CLI flags or environment variables. - -# Core connection -project_id: null # e.g. "my-project"; if omitted, ADC or env will be used -emulator_host: null # e.g. "localhost:8010"; if set, emulator mode is enabled - -# Explicit filters (empty == all) -# Empty list of namespaces means: iterate over all namespaces (including default "") -namespaces: [] # e.g. ["", "tenant-a"] -# Empty list of kinds means: iterate over all kinds within each namespace -kinds: [] # e.g. ["SourceCollectionStateEntity"] - -# Optional defaults used by some commands -kind: null # default kind for analyze-fields -namespace: null # default namespace for analyze-fields - -# Cleanup -ttl_field: "expireAt" -delete_missing_ttl: true -batch_size: 500 - -# Analysis -group_by_field: null - -# Logging -log_level: "INFO" \ No newline at end of file From 6c88a552fdcbd238fd2a793d5b982228239c77f9 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 12 Aug 2025 23:53:09 +0000 Subject: [PATCH 07/12] Update README with Python installation and troubleshooting guidance Co-authored-by: isisosirishorus --- README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cc76d7f..f472b72 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,22 @@ This installs the `lsu` CLI. ```bash git clone cd local-storage-utils -python -m venv .venv +python3 -m venv .venv source .venv/bin/activate -pip install -U pip +python -m pip install -U pip pip install -e . ``` +### Troubleshooting local installs +- If you see "Command 'python' not found", use `python3 -m venv .venv` (above). Inside the venv, `python` will point to Python 3. +- If you see "externally-managed-environment", you are attempting a system-wide install. Always install into a virtual environment: + - Create a venv: `python3 -m venv .venv && source .venv/bin/activate` + - Then use the venv pip: `python -m pip install -U pip && pip install -e .` +- If venv creation fails with "ensurepip is not available", install venv tooling on Debian/Ubuntu and retry: + ```bash + sudo apt-get update && sudo apt-get install -y python3-venv + ``` + ## Configuration - Create a local `config.yaml` in your working directory. It is gitignored and not included in the repo. From 23cf174ea23ff8d3ee9da3262422889af84d2547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20P=C3=A9rez?= Date: Tue, 26 Aug 2025 00:01:29 +0200 Subject: [PATCH 08/12] Fix cli dependencies --- cli.py | 86 +++++++++++++++++++++++++--------------------------------- 1 file changed, 37 insertions(+), 49 deletions(-) diff --git a/cli.py b/cli.py index 329262b..17d84bd 100644 --- a/cli.py +++ b/cli.py @@ -1,8 +1,7 @@ from __future__ import annotations import json -from typing import List, Optional -from typing import Annotated +from typing import List, Optional, Annotated import typer @@ -13,29 +12,21 @@ app = typer.Typer(help="Utilities for analyzing and managing local Datastore/Firestore (Datastore mode)") -# Reusable option aliases -ConfigOpt = Annotated[Optional[str], typer.Option(None, "--config", help="Path to config.yaml")] -ProjectOpt = Annotated[Optional[str], typer.Option(None, "--project", help="GCP/Emulator project id")] -EmulatorHostOpt = Annotated[ - Optional[str], typer.Option(None, "--emulator-host", help="Emulator host, e.g. 
localhost:8010") -] -LogLevelOpt = Annotated[Optional[str], typer.Option(None, "--log-level", help="Logging level")] +# Aliases with flags only — no defaults here +ConfigOpt = Annotated[Optional[str], typer.Option("--config", help="Path to config.yaml")] +ProjectOpt = Annotated[Optional[str], typer.Option("--project", help="GCP/Emulator project id")] +EmulatorHostOpt = Annotated[Optional[str], typer.Option("--emulator-host", help="Emulator host, e.g. localhost:8010")] +LogLevelOpt = Annotated[Optional[str], typer.Option("--log-level", help="Logging level")] NamespacesOpt = Annotated[ Optional[List[str]], - typer.Option(None, "--namespace", "-n", help="Namespaces to process (omit to process all)"), + typer.Option("--namespace", "-n", help="Namespaces to process (omit to process all)") ] KindsOpt = Annotated[ Optional[List[str]], - typer.Option( - None, "--kind", "-k", help="Kinds to process (omit to process all in each namespace)" - ), -] -SingleNamespaceOpt = Annotated[ - Optional[str], typer.Option(None, "--namespace", "-n", help="Namespace to query (omit to use all)") -] -SingleKindOpt = Annotated[ - Optional[str], typer.Option(None, "--kind", "-k", help="Kind to analyze (falls back to config.kind)") + typer.Option("--kind", "-k", help="Kinds to process (omit to process all in each namespace)") ] +SingleNamespaceOpt = Annotated[Optional[str], typer.Option("--namespace", "-n", help="Namespace to query (omit to use all)")] +SingleKindOpt = Annotated[Optional[str], typer.Option("--kind", "-k", help="Kind to analyze (falls back to config.kind)")] def _load_cfg( @@ -56,13 +47,13 @@ def _load_cfg( @app.command("analyze-kinds") def cmd_analyze_kinds( - config: ConfigOpt, - project: ProjectOpt, - emulator_host: EmulatorHostOpt, - log_level: LogLevelOpt, - namespace: NamespacesOpt, - kind: KindsOpt, - output: Optional[str] = typer.Option(None, help="Output CSV file path"), + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + namespace: NamespacesOpt = None, + kind: KindsOpt = None, + output: Annotated[Optional[str], typer.Option("--output", help="Output CSV file path")] = None, ): cfg = _load_cfg(config, project, emulator_host, log_level) @@ -85,15 +76,15 @@ def cmd_analyze_kinds( @app.command("analyze-fields") def cmd_analyze_fields( - kind: SingleKindOpt, - namespace: SingleNamespaceOpt, - group_by: Annotated[Optional[str], typer.Option(None, help="Group results by this field value (falls back to config.group_by_field)")], - only_field: Annotated[Optional[List[str]], typer.Option(None, "--only-field", help="Only consider these fields")], - config: ConfigOpt, - project: ProjectOpt, - emulator_host: EmulatorHostOpt, - log_level: LogLevelOpt, - output_json: Annotated[Optional[str], typer.Option(None, help="Write raw JSON results to file")], + kind: SingleKindOpt = None, + namespace: SingleNamespaceOpt = None, + group_by: Annotated[Optional[str], typer.Option("--group-by", help="Group results by this field value (falls back to config.group_by_field)")] = None, + only_field: Annotated[Optional[List[str]], typer.Option("--only-field", help="Only consider these fields")] = None, + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + output_json: Annotated[Optional[str], typer.Option("--output-json", help="Write raw JSON results to file")] = None, ): cfg = _load_cfg(config, project, emulator_host, log_level) @@ -122,19 +113,16 @@ def 
cmd_analyze_fields( @app.command("cleanup") def cmd_cleanup( - config: ConfigOpt, - project: ProjectOpt, - emulator_host: EmulatorHostOpt, - log_level: LogLevelOpt, - namespace: NamespacesOpt, - kind: KindsOpt, - ttl_field: Annotated[Optional[str], typer.Option(None, help="TTL field name (falls back to config.ttl_field)")], - delete_missing_ttl: Annotated[ - Optional[bool], - typer.Option(None, help="Delete when TTL field is missing (falls back to config.delete_missing_ttl)"), - ], - batch_size: Annotated[Optional[int], typer.Option(None, help="Delete batch size (falls back to config.batch_size)")], - dry_run: Annotated[bool, typer.Option(False, help="Only report counts; do not delete")], + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + namespace: NamespacesOpt = None, + kind: KindsOpt = None, + ttl_field: Annotated[Optional[str], typer.Option("--ttl-field", help="TTL field name (falls back to config.ttl_field)")] = None, + delete_missing_ttl: Annotated[Optional[bool], typer.Option("--delete-missing-ttl", help="Delete when TTL field is missing (falls back to config.delete_missing_ttl)")] = None, + batch_size: Annotated[Optional[int], typer.Option("--batch-size", help="Delete batch size (falls back to config.batch_size)")] = None, + dry_run: Annotated[bool, typer.Option("--dry-run", help="Only report counts; do not delete")] = False, ): cfg = _load_cfg(config, project, emulator_host, log_level) @@ -155,4 +143,4 @@ def cmd_cleanup( if __name__ == "__main__": - app() \ No newline at end of file + app() From 378ff3cb1b9e1f30cdb53ce1d3e6dbcdf9a9b8c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20P=C3=A9rez?= Date: Tue, 26 Aug 2025 00:47:19 +0200 Subject: [PATCH 09/12] Using list of namespaces --- README.md | 1 - cli.py | 33 +++++++++------------------------ gcd_tools/analyze_kinds.py | 14 ++++---------- gcd_tools/config.py | 19 +++++++++++++++++-- 4 files changed, 30 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index f472b72..19ad010 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,6 @@ kinds: [] # Empty -> iterate all kinds per namespace # Optional defaults kind: "SourceCollectionStateEntity" # Default for analyze-fields -namespace: "" # Default namespace for analyze-fields # Cleanup ttl_field: "expireAt" diff --git a/cli.py b/cli.py index 17d84bd..243d6b4 100644 --- a/cli.py +++ b/cli.py @@ -17,18 +17,12 @@ ProjectOpt = Annotated[Optional[str], typer.Option("--project", help="GCP/Emulator project id")] EmulatorHostOpt = Annotated[Optional[str], typer.Option("--emulator-host", help="Emulator host, e.g. 
localhost:8010")] LogLevelOpt = Annotated[Optional[str], typer.Option("--log-level", help="Logging level")] -NamespacesOpt = Annotated[ - Optional[List[str]], - typer.Option("--namespace", "-n", help="Namespaces to process (omit to process all)") -] KindsOpt = Annotated[ Optional[List[str]], - typer.Option("--kind", "-k", help="Kinds to process (omit to process all in each namespace)") + typer.Option("--kind", "-k", help="Kinds to process (omit or empty to process all in each namespace)") ] -SingleNamespaceOpt = Annotated[Optional[str], typer.Option("--namespace", "-n", help="Namespace to query (omit to use all)")] SingleKindOpt = Annotated[Optional[str], typer.Option("--kind", "-k", help="Kind to analyze (falls back to config.kind)")] - def _load_cfg( config_path: Optional[str], project: Optional[str], @@ -44,25 +38,22 @@ def _load_cfg( overrides["log_level"] = log_level return load_config(config_path, overrides) - @app.command("analyze-kinds") def cmd_analyze_kinds( config: ConfigOpt = None, project: ProjectOpt = None, emulator_host: EmulatorHostOpt = None, log_level: LogLevelOpt = None, - namespace: NamespacesOpt = None, kind: KindsOpt = None, output: Annotated[Optional[str], typer.Option("--output", help="Output CSV file path")] = None, ): cfg = _load_cfg(config, project, emulator_host, log_level) - if namespace: - cfg.namespaces = list(namespace) - if kind: - cfg.kinds = list(kind) - + if kind is not None: + # Normalise: treat [""] as empty (all kinds) + cfg.kinds = [k for k in kind if k] # drop empty strings rows = analyze_kinds(cfg) + if output: with open(output, "w", encoding="utf-8") as fh: fh.write("namespace,kind,count,size,bytes\n") @@ -73,11 +64,10 @@ def cmd_analyze_kinds( else: print_summary_table(rows) - @app.command("analyze-fields") def cmd_analyze_fields( kind: SingleKindOpt = None, - namespace: SingleNamespaceOpt = None, + namespace: Annotated[Optional[str], typer.Option("--namespace", "-n", help="Namespace to query (omit to use all)")] = None, group_by: Annotated[Optional[str], typer.Option("--group-by", help="Group results by this field value (falls back to config.group_by_field)")] = None, only_field: Annotated[Optional[List[str]], typer.Option("--only-field", help="Only consider these fields")] = None, config: ConfigOpt = None, @@ -100,7 +90,7 @@ def cmd_analyze_fields( kind=target_kind, namespace=target_namespace, group_by_field=group_by_field, - only_fields=list(only_field) if only_field else None, + only_fields=[f for f in only_field] if only_field else None, ) if output_json: @@ -110,14 +100,12 @@ def cmd_analyze_fields( else: print_field_summary(result) - @app.command("cleanup") def cmd_cleanup( config: ConfigOpt = None, project: ProjectOpt = None, emulator_host: EmulatorHostOpt = None, log_level: LogLevelOpt = None, - namespace: NamespacesOpt = None, kind: KindsOpt = None, ttl_field: Annotated[Optional[str], typer.Option("--ttl-field", help="TTL field name (falls back to config.ttl_field)")] = None, delete_missing_ttl: Annotated[Optional[bool], typer.Option("--delete-missing-ttl", help="Delete when TTL field is missing (falls back to config.delete_missing_ttl)")] = None, @@ -126,10 +114,8 @@ def cmd_cleanup( ): cfg = _load_cfg(config, project, emulator_host, log_level) - if namespace: - cfg.namespaces = list(namespace) - if kind: - cfg.kinds = list(kind) + if kind is not None: + cfg.kinds = [k for k in kind if k] if ttl_field is not None: cfg.ttl_field = ttl_field if delete_missing_ttl is not None: @@ -141,6 +127,5 @@ def cmd_cleanup( deleted_sum = 
sum(totals.values()) typer.echo(f"Total entities {'to delete' if dry_run else 'deleted'}: {deleted_sum}") - if __name__ == "__main__": app() diff --git a/gcd_tools/analyze_kinds.py b/gcd_tools/analyze_kinds.py index 31aac1a..7f3e621 100644 --- a/gcd_tools/analyze_kinds.py +++ b/gcd_tools/analyze_kinds.py @@ -14,10 +14,8 @@ format_size, ) - logger = logging.getLogger(__name__) - def estimate_entity_count_and_size( client: datastore.Client, kind: str, namespace: Optional[str] ) -> Tuple[int, int]: @@ -34,18 +32,15 @@ def estimate_entity_count_and_size( count += 1 return count, total_size - def analyze_kinds(config: AppConfig) -> List[Dict]: client = build_client(config) - # Determine namespaces: explicit list, or all - namespaces = config.namespaces if config.namespaces else list_namespaces(client) + # Thanks to config.py normalisation, [] is the only “all” case + namespaces = config.namespaces or list_namespaces(client) results: List[Dict] = [] for ns in namespaces: - # Determine kinds: explicit list, or all in namespace - kinds = config.kinds if config.kinds else list_kinds(client, ns) - + kinds = config.kinds or list_kinds(client, ns) logger.info("Analyzing namespace=%s, %d kinds", ns or "(default)", len(kinds)) for kind in kinds: count, total_bytes = estimate_entity_count_and_size(client, kind, ns) @@ -60,10 +55,9 @@ def analyze_kinds(config: AppConfig) -> List[Dict]: ) return results - def print_summary_table(rows: List[Dict]) -> None: # Plain stdout table for wide compatibility print("namespace,kind,count,size,bytes") for r in rows: ns = r.get("namespace") or "" - print(f"{ns},{r['kind']},{r['count']},{r['size']},{r['bytes']}") \ No newline at end of file + print(f"{ns},{r['kind']},{r['count']},{r['size']},{r['bytes']}") diff --git a/gcd_tools/config.py b/gcd_tools/config.py index 568fd0e..071ce9e 100644 --- a/gcd_tools/config.py +++ b/gcd_tools/config.py @@ -65,6 +65,12 @@ def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) -> config.namespaces = _as_list(merged.get("namespaces")) config.kinds = _as_list(merged.get("kinds")) + # 🛠 Normalise: treat [""] as empty + if config.namespaces == [""] or config.namespaces is None: + config.namespaces = [] + if config.kinds == [""] or config.kinds is None: + config.kinds = [] + # Optional defaults used by some commands config.kind = merged.get("kind") config.namespace = merged.get("namespace") @@ -81,6 +87,7 @@ def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) -> return config + def _configure_logging(level: str) -> None: level_value = getattr(logging, level.upper(), logging.INFO) logging.basicConfig(level=level_value, format="%(asctime)s | %(levelname)s | %(message)s") @@ -105,14 +112,22 @@ def build_client(config: AppConfig) -> datastore.Client: def list_namespaces(client: datastore.Client) -> List[str]: - # Include default namespace as "" first + """ + Return all namespaces in the datastore, including the default (""). + Always queries __namespace__ in the root context so it works in emulator/GCP. 
+ """ + # Include default namespace "" first namespaces: List[str] = [""] - query = client.query(kind="__namespace__") + + # Force namespace=None to query the metadata root + query = client.query(kind="__namespace__", namespace=None) query.keys_only() + for entity in query.fetch(): name = entity.key.name or "" if name != "": namespaces.append(name) + return namespaces From 14bfe22e06fe4962d9999b53ef864365b901c663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20P=C3=A9rez?= Date: Tue, 26 Aug 2025 21:25:58 +0200 Subject: [PATCH 10/12] using stats --- gcd_tools/analyze_kinds.py | 81 +++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/gcd_tools/analyze_kinds.py b/gcd_tools/analyze_kinds.py index 7f3e621..5b31f87 100644 --- a/gcd_tools/analyze_kinds.py +++ b/gcd_tools/analyze_kinds.py @@ -16,25 +16,60 @@ logger = logging.getLogger(__name__) -def estimate_entity_count_and_size( - client: datastore.Client, kind: str, namespace: Optional[str] -) -> Tuple[int, int]: - query = client.query(kind=kind, namespace=namespace or None) - total_size = 0 - count = 0 - for entity in query.fetch(): - try: - raw_proto = entity_to_protobuf(entity)._pb - total_size += len(raw_proto.SerializeToString()) - except Exception: - # Fallback: count only - pass - count += 1 - return count, total_size - -def analyze_kinds(config: AppConfig) -> List[Dict]: + +def get_kind_stats(client, kind: str, namespace: Optional[str] = None) -> Tuple[Optional[int], Optional[int]]: + """ + Returns (count, bytes) for the given kind/namespace using Datastore statistics. + Falls back to None if not found. + """ + if namespace: + stats_kind = "__Stat_Kind_Ns__" + query = client.query(kind=stats_kind) + query.add_filter("kind_name", "=", kind) + query.add_filter("namespace_name", "=", namespace) + else: + stats_kind = "__Stat_Kind__" + query = client.query(kind=stats_kind) + query.add_filter("kind_name", "=", kind) + + results = list(query.fetch(limit=1)) + if results: + return results[0]["count"], results[0]["bytes"] + return None, None + + +def estimate_entity_count_and_size(client, kind: str, namespace: Optional[str], sample_size: int = 100) -> Tuple[int, int]: + """ + Original keys-only method: exact count, approximate bytes via sampling. + """ + # Count with keys-only + count_query = client.query(kind=kind, namespace=namespace or None) + count_query.keys_only() + total_count = sum(1 for _ in count_query.fetch()) + + # Sample for size + sample_query = client.query(kind=kind, namespace=namespace or None) + sample_entities = list(sample_query.fetch(limit=sample_size)) + if sample_entities: + avg_size = sum(len(entity_to_protobuf(e)._pb.SerializeToString()) for e in sample_entities) / len(sample_entities) + else: + avg_size = 0 + + return total_count, int(avg_size * total_count) + + +def analyze_kinds(config: AppConfig, method: Optional[str] = None) -> List[Dict]: + """ + Analyze kinds using either: + - 'stats' (default) => fast built-in Datastore statistics + - 'scan' => keys-only scan with sampling + Falls back to 'scan' if stats are missing for a kind. 
+ """ client = build_client(config) + # Decide method priority: parameter > config > default + method = method or getattr(config, "method", None) or "stats" + # Thanks to config.py normalisation, [] is the only “all” case namespaces = config.namespaces or list_namespaces(client) @@ -43,7 +78,16 @@ def analyze_kinds(config: AppConfig) -> List[Dict]: kinds = config.kinds or list_kinds(client, ns) logger.info("Analyzing namespace=%s, %d kinds", ns or "(default)", len(kinds)) for kind in kinds: - count, total_bytes = estimate_entity_count_and_size(client, kind, ns) + if method == "stats": + count, total_bytes = get_kind_stats(client, kind, ns) + if count is None: + logger.warning("Stats not found for kind=%s, ns=%s — falling back to scan", kind, ns or "(default)") + count, total_bytes = estimate_entity_count_and_size(client, kind, ns) + elif method == "scan": + count, total_bytes = estimate_entity_count_and_size(client, kind, ns) + else: + raise ValueError(f"Unknown method: {method}") + results.append( { "namespace": ns, @@ -55,6 +99,7 @@ def analyze_kinds(config: AppConfig) -> List[Dict]: ) return results + def print_summary_table(rows: List[Dict]) -> None: # Plain stdout table for wide compatibility print("namespace,kind,count,size,bytes") From f7635b5e7e33028cdcedc55f7399c01383ad98ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20P=C3=A9rez?= Date: Fri, 17 Oct 2025 01:08:23 +0200 Subject: [PATCH 11/12] First working version --- README.md | 3 -- gcd_tools/analyze_entity_fields.py | 3 +- gcd_tools/analyze_kinds.py | 3 +- gcd_tools/cleanup_expired.py | 25 +++++++++++------ tests/test_commands.py | 45 ++++++++++++++++++++++++++++++ tests/test_import.py | 4 +++ 6 files changed, 69 insertions(+), 14 deletions(-) create mode 100644 tests/test_commands.py diff --git a/README.md b/README.md index 19ad010..23df618 100644 --- a/README.md +++ b/README.md @@ -12,21 +12,18 @@ This installs the `lsu` CLI. ## Install (from source) -```bash git clone cd local-storage-utils python3 -m venv .venv source .venv/bin/activate python -m pip install -U pip pip install -e . -``` ### Troubleshooting local installs - If you see "Command 'python' not found", use `python3 -m venv .venv` (above). Inside the venv, `python` will point to Python 3. - If you see "externally-managed-environment", you are attempting a system-wide install. 
Always install into a virtual environment: - Create a venv: `python3 -m venv .venv && source .venv/bin/activate` - Then use the venv pip: `python -m pip install -U pip && pip install -e .` -- If venv creation fails with "ensurepip is not available", install venv tooling on Debian/Ubuntu and retry: ```bash sudo apt-get update && sudo apt-get install -y python3-venv ``` diff --git a/gcd_tools/analyze_entity_fields.py b/gcd_tools/analyze_entity_fields.py index 6de862d..d23c5e0 100644 --- a/gcd_tools/analyze_entity_fields.py +++ b/gcd_tools/analyze_entity_fields.py @@ -28,7 +28,8 @@ def _estimate_field_contributions( total_size = 0 entity_count = 0 - for entity in entities: + from tqdm import tqdm + for entity in tqdm(list(entities), desc="Analyzing field contributions", unit="entity"): entity_count += 1 proto = entity_to_protobuf(entity)._pb full_size = len(proto.SerializeToString()) diff --git a/gcd_tools/analyze_kinds.py b/gcd_tools/analyze_kinds.py index 5b31f87..a679f5c 100644 --- a/gcd_tools/analyze_kinds.py +++ b/gcd_tools/analyze_kinds.py @@ -73,11 +73,12 @@ def analyze_kinds(config: AppConfig, method: Optional[str] = None) -> List[Dict] # Thanks to config.py normalisation, [] is the only “all” case namespaces = config.namespaces or list_namespaces(client) + from tqdm import tqdm results: List[Dict] = [] for ns in namespaces: kinds = config.kinds or list_kinds(client, ns) logger.info("Analyzing namespace=%s, %d kinds", ns or "(default)", len(kinds)) - for kind in kinds: + for kind in tqdm(kinds, desc=f"Analyzing kinds in ns={ns or '(default)'}", unit="kind"): if method == "stats": count, total_bytes = get_kind_stats(client, kind, ns) if count is None: diff --git a/gcd_tools/cleanup_expired.py b/gcd_tools/cleanup_expired.py index 7ef36ef..0f522b9 100644 --- a/gcd_tools/cleanup_expired.py +++ b/gcd_tools/cleanup_expired.py @@ -6,6 +6,8 @@ from google.cloud import datastore +from tqdm import tqdm + from .config import ( AppConfig, build_client, @@ -44,7 +46,8 @@ def cleanup_expired( for kind in kinds: query = client.query(kind=kind, namespace=ns or None) to_delete: List[datastore.Key] = [] - for entity in query.fetch(): + entities = list(query.fetch()) + for entity in tqdm(entities, desc=f"Scanning {kind} in ns={ns or '(default)'}", unit="entity"): expire_at = entity.get(config.ttl_field) expired = expire_at is None if config.delete_missing_ttl else False if not expired and expire_at is not None: @@ -65,13 +68,17 @@ def cleanup_expired( ) totals[f"{ns}:{kind}"] = len(to_delete) else: - deleted = _delete_in_batches(client, to_delete, config.batch_size) if to_delete else 0 - logger.info( - "ns=%s kind=%s deleted %d expired entities", - ns or "(default)", - kind, - deleted, - ) - totals[f"{ns}:{kind}"] = deleted + deleted = 0 + if to_delete: + for batch in tqdm(list(chunked(to_delete, config.batch_size)), desc=f"Deleting {kind} in ns={ns or '(default)'}", unit="batch"): + client.delete_multi(batch) + deleted += len(batch) + logger.info( + "ns=%s kind=%s deleted %d expired entities", + ns or "(default)", + kind, + deleted, + ) + totals[f"{ns}:{kind}"] = deleted return totals \ No newline at end of file diff --git a/tests/test_commands.py b/tests/test_commands.py new file mode 100644 index 0000000..0742187 --- /dev/null +++ b/tests/test_commands.py @@ -0,0 +1,45 @@ + +import sys +import os +import pytest +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from gcd_tools import analyze_kinds, analyze_entity_fields, cleanup_expired, config +from gcd_tools.config 
import AppConfig + +# Dummy config for testing (adjust as needed for emulator) +def make_dummy_config(): + return AppConfig( + project_id="dummy-project", + emulator_host="localhost:8080", + namespaces=[""], + kinds=["TestKind"], + ttl_field="expireAt", + delete_missing_ttl=True, + batch_size=10, + group_by_field=None, + log_level="INFO", + ) + +def test_analyze_kinds_runs(): + cfg = make_dummy_config() + try: + result = analyze_kinds(cfg) + assert isinstance(result, list) + except Exception as e: + pytest.skip(f"analyze_kinds requires emulator: {e}") + +def test_analyze_fields_runs(): + cfg = make_dummy_config() + try: + result = analyze_entity_fields.analyze_field_contributions(cfg, kind="TestKind") + assert isinstance(result, dict) + except Exception as e: + pytest.skip(f"analyze_fields requires emulator: {e}") + +def test_cleanup_expired_runs(): + cfg = make_dummy_config() + try: + result = cleanup_expired.cleanup_expired(cfg, dry_run=True) + assert isinstance(result, dict) + except Exception as e: + pytest.skip(f"cleanup_expired requires emulator: {e}") diff --git a/tests/test_import.py b/tests/test_import.py index b7f63dd..e87bc50 100644 --- a/tests/test_import.py +++ b/tests/test_import.py @@ -1,3 +1,7 @@ +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + def test_imports(): import gcd_tools from gcd_tools import analyze_kinds, analyze_entity_fields, cleanup_expired, config From 8ca20feb918504cb4d9975b8365a31a40efbd096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20P=C3=A9rez?= Date: Fri, 17 Oct 2025 02:02:28 +0200 Subject: [PATCH 12/12] Move to command folder --- cli.py | 8 ++--- {gcd_tools => commands}/__init__.py | 22 ++++++------ .../analyze_entity_fields.py | 10 ++++-- {gcd_tools => commands}/analyze_kinds.py | 9 +++-- {gcd_tools => commands}/cleanup_expired.py | 36 ++++++++++--------- {gcd_tools => commands}/config.py | 4 +-- pyproject.toml | 2 +- tests/test_commands.py | 13 +++++-- tests/test_import.py | 6 ++-- 9 files changed, 63 insertions(+), 47 deletions(-) rename {gcd_tools => commands}/__init__.py (59%) rename {gcd_tools => commands}/analyze_entity_fields.py (94%) rename {gcd_tools => commands}/analyze_kinds.py (92%) rename {gcd_tools => commands}/cleanup_expired.py (71%) rename {gcd_tools => commands}/config.py (99%) diff --git a/cli.py b/cli.py index 243d6b4..8d97b20 100644 --- a/cli.py +++ b/cli.py @@ -5,10 +5,10 @@ import typer -from gcd_tools.config import AppConfig, load_config -from gcd_tools.analyze_kinds import analyze_kinds, print_summary_table -from gcd_tools.analyze_entity_fields import analyze_field_contributions, print_field_summary -from gcd_tools.cleanup_expired import cleanup_expired +from commands.config import AppConfig, load_config +from commands.analyze_kinds import analyze_kinds, print_summary_table +from commands.analyze_entity_fields import analyze_field_contributions, print_field_summary +from commands.cleanup_expired import cleanup_expired app = typer.Typer(help="Utilities for analyzing and managing local Datastore/Firestore (Datastore mode)") diff --git a/gcd_tools/__init__.py b/commands/__init__.py similarity index 59% rename from gcd_tools/__init__.py rename to commands/__init__.py index a1c8e85..7493223 100644 --- a/gcd_tools/__init__.py +++ b/commands/__init__.py @@ -5,14 +5,14 @@ from . 
import config as config __all__ = [ - "AppConfig", - "load_config", - "build_client", - "list_namespaces", - "list_kinds", - "format_size", - "analyze_kinds", - "analyze_entity_fields", - "cleanup_expired", - "config", -] \ No newline at end of file + "AppConfig", + "load_config", + "build_client", + "list_namespaces", + "list_kinds", + "format_size", + "analyze_kinds", + "analyze_entity_fields", + "cleanup_expired", + "config", +] diff --git a/gcd_tools/analyze_entity_fields.py b/commands/analyze_entity_fields.py similarity index 94% rename from gcd_tools/analyze_entity_fields.py rename to commands/analyze_entity_fields.py index d23c5e0..fa70fc6 100644 --- a/gcd_tools/analyze_entity_fields.py +++ b/commands/analyze_entity_fields.py @@ -125,10 +125,14 @@ def analyze_field_contributions( ) -> Dict: client = build_client(config) - # If no namespace provided, iterate across all namespaces + # If no namespace provided, or config.namespaces is None/empty, iterate all namespaces if namespace is None: + if hasattr(config, "namespaces") and (not config.namespaces): + ns_list = list_namespaces(client) + else: + ns_list = [namespace] if namespace else list_namespaces(client) results: Dict[str, Dict] = {} - for ns in list_namespaces(client): + for ns in ns_list: results[ns or ""] = _analyze_single_namespace( client, kind=kind, namespace=ns, group_by_field=group_by_field, only_fields=only_fields ) @@ -161,4 +165,4 @@ def print_field_summary(result: Dict) -> None: ) for field, stats in result["fields"].items(): avg = stats["avg_per_entity"] - print(f" {field:30} {stats['human']:>12} ({avg:.1f} bytes avg)") \ No newline at end of file + print(f" {field:30} {stats['human']:>12} ({avg:.1f} bytes avg)") diff --git a/gcd_tools/analyze_kinds.py b/commands/analyze_kinds.py similarity index 92% rename from gcd_tools/analyze_kinds.py rename to commands/analyze_kinds.py index a679f5c..9807532 100644 --- a/gcd_tools/analyze_kinds.py +++ b/commands/analyze_kinds.py @@ -70,13 +70,18 @@ def analyze_kinds(config: AppConfig, method: Optional[str] = None) -> List[Dict] # Decide method priority: parameter > config > default method = method or getattr(config, "method", None) or "stats" - # Thanks to config.py normalisation, [] is the only “all” case - namespaces = config.namespaces or list_namespaces(client) + # If namespaces is None or empty, iterate all available namespaces + if not config.namespaces: + namespaces = list_namespaces(client) + else: + namespaces = config.namespaces + print(f"Found namespaces: {namespaces}") from tqdm import tqdm results: List[Dict] = [] for ns in namespaces: kinds = config.kinds or list_kinds(client, ns) + print(f"Namespace '{ns}': found kinds: {kinds}") logger.info("Analyzing namespace=%s, %d kinds", ns or "(default)", len(kinds)) for kind in tqdm(kinds, desc=f"Analyzing kinds in ns={ns or '(default)'}", unit="kind"): if method == "stats": diff --git a/gcd_tools/cleanup_expired.py b/commands/cleanup_expired.py similarity index 71% rename from gcd_tools/cleanup_expired.py rename to commands/cleanup_expired.py index 0f522b9..4d8b9d4 100644 --- a/gcd_tools/cleanup_expired.py +++ b/commands/cleanup_expired.py @@ -6,8 +6,6 @@ from google.cloud import datastore -from tqdm import tqdm - from .config import ( AppConfig, build_client, @@ -33,8 +31,11 @@ def cleanup_expired( ) -> Dict[str, int]: client = build_client(config) - # Determine namespaces: explicit list, or all - namespaces = config.namespaces if config.namespaces else list_namespaces(client) + # If namespaces is None or empty, iterate 
all available namespaces + if not config.namespaces: + namespaces = list_namespaces(client) + else: + namespaces = config.namespaces totals: Dict[str, int] = {} now = datetime.now(timezone.utc) @@ -47,6 +48,7 @@ def cleanup_expired( query = client.query(kind=kind, namespace=ns or None) to_delete: List[datastore.Key] = [] entities = list(query.fetch()) + from tqdm import tqdm for entity in tqdm(entities, desc=f"Scanning {kind} in ns={ns or '(default)'}", unit="entity"): expire_at = entity.get(config.ttl_field) expired = expire_at is None if config.delete_missing_ttl else False @@ -68,17 +70,17 @@ def cleanup_expired( ) totals[f"{ns}:{kind}"] = len(to_delete) else: - deleted = 0 - if to_delete: - for batch in tqdm(list(chunked(to_delete, config.batch_size)), desc=f"Deleting {kind} in ns={ns or '(default)'}", unit="batch"): - client.delete_multi(batch) - deleted += len(batch) - logger.info( - "ns=%s kind=%s deleted %d expired entities", - ns or "(default)", - kind, - deleted, - ) - totals[f"{ns}:{kind}"] = deleted + deleted = 0 + if to_delete: + for batch in tqdm(list(chunked(to_delete, config.batch_size)), desc=f"Deleting {kind} in ns={ns or '(default)'}", unit="batch"): + client.delete_multi(batch) + deleted += len(batch) + logger.info( + "ns=%s kind=%s deleted %d expired entities", + ns or "(default)", + kind, + deleted, + ) + totals[f"{ns}:{kind}"] = deleted - return totals \ No newline at end of file + return totals diff --git a/gcd_tools/config.py b/commands/config.py similarity index 99% rename from gcd_tools/config.py rename to commands/config.py index 071ce9e..420c993 100644 --- a/gcd_tools/config.py +++ b/commands/config.py @@ -8,7 +8,6 @@ import yaml from google.cloud import datastore - @dataclass class AppConfig: project_id: Optional[str] = None @@ -87,7 +86,6 @@ def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) -> return config - def _configure_logging(level: str) -> None: level_value = getattr(logging, level.upper(), logging.INFO) logging.basicConfig(level=level_value, format="%(asctime)s | %(levelname)s | %(message)s") @@ -148,4 +146,4 @@ def format_size(bytes_size: int) -> str: if size < 1024: return f"{size:.2f} {unit}" size /= 1024 - return f"{size:.2f} PB" \ No newline at end of file + return f"{size:.2f} PB" diff --git a/pyproject.toml b/pyproject.toml index 862abd0..0ff44d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ local-storage-utils = "cli:app" [tool.setuptools.packages.find] where = ["."] -include = ["gcd_tools*"] +include = ["commands*"] [tool.black] line-length = 100 diff --git a/tests/test_commands.py b/tests/test_commands.py index 0742187..70c05a3 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -1,10 +1,9 @@ - import sys import os import pytest sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -from gcd_tools import analyze_kinds, analyze_entity_fields, cleanup_expired, config -from gcd_tools.config import AppConfig +from commands import analyze_kinds, analyze_entity_fields, cleanup_expired, config +from commands.config import AppConfig, build_client, list_namespaces # Dummy config for testing (adjust as needed for emulator) def make_dummy_config(): @@ -43,3 +42,11 @@ def test_cleanup_expired_runs(): assert isinstance(result, dict) except Exception as e: pytest.skip(f"cleanup_expired requires emulator: {e}") + +def test_list_namespaces_returns_default_and_any_custom(): + cfg = AppConfig(project_id="dummy-project", emulator_host="localhost:8010") + client = 
build_client(cfg)
+    try:
+        namespaces = list_namespaces(client)
+    except Exception as e:
+        pytest.skip(f"list_namespaces requires emulator: {e}")
+    assert "" in namespaces  # default namespace always present
+    # Add more asserts if your emulator is known to contain custom namespaces
diff --git a/tests/test_import.py b/tests/test_import.py
index e87bc50..c30178d 100644
--- a/tests/test_import.py
+++ b/tests/test_import.py
@@ -3,8 +3,8 @@
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 
 def test_imports():
-    import gcd_tools
-    from gcd_tools import analyze_kinds, analyze_entity_fields, cleanup_expired, config
+    import commands
+    from commands import analyze_kinds, analyze_entity_fields, cleanup_expired, config
 
-    assert gcd_tools is not None
+    assert commands is not None
     assert hasattr(config, "AppConfig")
\ No newline at end of file
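The emulator-dependent tests above skip themselves when no Datastore emulator is reachable, so they only exercise the real query and cleanup paths once an emulator is running (for example via `gcloud beta emulators datastore start --project=dummy-project --host-port=localhost:8010`) and holds some data. A minimal seeding sketch, assuming that emulator address and reusing the `TestKind`/`expireAt` names from the dummy test config; it talks to the public google-cloud-datastore client directly rather than any helper from this repository:

```python
# Seed one already-expired TestKind entity into the local emulator so that
# analyze-kinds and cleanup have something to find.
import os
from datetime import datetime, timedelta, timezone

from google.cloud import datastore

os.environ["DATASTORE_EMULATOR_HOST"] = "localhost:8010"  # must match the dummy config
client = datastore.Client(project="dummy-project")

entity = datastore.Entity(key=client.key("TestKind"))  # incomplete key; the emulator assigns an id
entity.update({
    "expireAt": datetime.now(timezone.utc) - timedelta(days=1),  # in the past, so cleanup treats it as expired
    "batchId": "seed-1",
})
client.put(entity)
print("seeded:", entity.key)
```

With the emulator running and at least one entity seeded, `python -m pytest -q` runs the command tests against real data instead of skipping.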