diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..cd842b7
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,49 @@
+name: build
+
+on:
+  push:
+    branches: [ main ]
+    tags: [ "v*" ]
+  pull_request:
+
+jobs:
+  ci:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install
+        run: |
+          python -m pip install -U pip
+          python -m pip install .
+          python -m pip install pytest
+      - name: Test
+        run: |
+          pytest -q
+
+  publish:
+    needs: ci
+    if: startsWith(github.ref, 'refs/tags/v')
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Build
+        run: |
+          python -m pip install -U pip build
+          python -m build
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
new file mode 100644
index 0000000..8bd86ea
--- /dev/null
+++ b/.github/workflows/pr.yml
@@ -0,0 +1,35 @@
+name: pr
+
+on:
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+      - name: Install
+        run: |
+          python -m pip install -U pip
+          python -m pip install .
+          python -m pip install pytest ruff black build twine pip-audit
+      - name: Lint
+        run: |
+          ruff check .
+          black --check .
+      - name: Test
+        run: pytest -q
+      - name: Build and verify
+        run: |
+          python -m build
+          twine check dist/* || true
+      - name: Security audit
+        run: |
+          pip-audit -r requirements.txt || true
\ No newline at end of file
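The `Build and verify` step calls `twine`, which must be installed next to `build` (hence both appear in the install step). The same verification can be run locally before opening a PR:

```bash
python -m pip install -U pip build twine
python -m build          # writes sdist and wheel into dist/
twine check dist/*       # validates the package metadata PyPI will see
```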
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..4208745
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,43 @@
+name: release
+
+on:
+  push:
+    branches: [ main ]
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+      - name: Install
+        run: |
+          python -m pip install -U pip
+          python -m pip install .
+          python -m pip install pytest ruff black build twine "python-semantic-release<8" pip-audit
+      - name: Lint
+        run: |
+          ruff check .
+          black --check .
+      - name: Test
+        run: pytest -q
+      - name: Build and verify
+        run: |
+          python -m build
+          twine check dist/* || true
+      - name: Security audit
+        run: |
+          pip-audit -r requirements.txt || true
+      - name: Semantic Release (version, tag, GitHub release, PyPI)
+        env:
+          PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: semantic-release publish
\ No newline at end of file
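`python-semantic-release` is pinned below v8 here because the `semantic-release publish` invocation and the `[tool.semantic_release]` keys used in `pyproject.toml` (`version_variable`, `upload_to_pypi`) follow the v7 interface; v8 renamed both the commands and the configuration schema. Versioning is driven by conventional commit messages, for example (hypothetical messages):

```bash
git commit -m "fix: handle timezone-naive expireAt values"   # patch: 0.1.0 -> 0.1.1
git commit -m "feat: add CSV export to analyze-kinds"        # minor: 0.1.0 -> 0.2.0
git commit -m "feat!: drop Python 3.8 support"               # major: 0.1.0 -> 1.0.0
```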
diff --git a/.gitignore b/.gitignore
index b7faf40..e54dbfc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -205,3 +205,10 @@ cython_debug/
 marimo/_static/
 marimo/_lsp/
 __marimo__/
+
+# Local configuration
+config.yaml
+
+# Editor/OS
+.DS_Store
+Thumbs.db
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..6f4b308
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,21 @@
+# Contributing
+
+Thanks for your interest in contributing!
+
+- Open an issue to discuss substantial changes.
+- Fork and create feature branches from `main`.
+- Run formatting and tests before submitting a PR.
+
+## Dev setup
+
+```bash
+python -m venv .venv && source .venv/bin/activate
+pip install -U pip
+pip install -e .
+```
+
+## Testing
+
+```bash
+python -m pytest -q
+```
\ No newline at end of file
diff --git a/README.md b/README.md
index 5d34160..23df618 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,111 @@
 # local-storage-utils
-Set of scripts and tools for managing GCP Datastore data in local
+
+Utilities for analyzing and managing local Datastore/Firestore (Datastore mode) data. Works with both the Datastore Emulator and GCP using Application Default Credentials.
+
+## Install (PyPI)
+
+```bash
+pip install local-storage-utils
+```
+
+This installs the `lsu` CLI.
+
+## Install (from source)
+
+```bash
+git clone <repository-url>
+cd local-storage-utils
+python3 -m venv .venv
+source .venv/bin/activate
+python -m pip install -U pip
+pip install -e .
+```
+
+### Troubleshooting local installs
+- If you see "Command 'python' not found", use `python3 -m venv .venv` (above). Inside the venv, `python` will point to Python 3.
+- If you see "externally-managed-environment", you are attempting a system-wide install. Always install into a virtual environment:
+  - Create a venv: `python3 -m venv .venv && source .venv/bin/activate`
+  - Then use the venv pip: `python -m pip install -U pip && pip install -e .`
+- If `python3 -m venv` fails because the `venv` module is missing (Debian/Ubuntu):
+  ```bash
+  sudo apt-get update && sudo apt-get install -y python3-venv
+  ```
+
+## Configuration
+
+- Create a local `config.yaml` in your working directory. It is gitignored and not included in the repo.
+- Any CLI flag overrides values from `config.yaml`.
+- If neither config nor flags provide a value, the tool falls back to environment variables (for emulator detection) or sensible defaults.
+
+Example `config.yaml`:
+
+```yaml
+project_id: "my-project"          # If omitted, ADC/env will be used
+emulator_host: "localhost:8010"   # If set, uses Datastore Emulator
+
+# Explicit filters (empty means all)
+namespaces: [""]   # Empty -> iterate all namespaces (including default "")
+kinds: []          # Empty -> iterate all kinds per namespace
+
+# Optional defaults
+kind: "SourceCollectionStateEntity"   # Default for analyze-fields
+
+# Cleanup
+ttl_field: "expireAt"
+delete_missing_ttl: true
+batch_size: 500
+
+# Analysis
+group_by_field: null
+
+# Logging
+log_level: "INFO"
+```
+
+## CLI usage
+
+```bash
+# Kind-level counts and size estimates
+lsu analyze-kinds --project my-project
+
+# All namespaces/kinds are processed by default; restrict to specific kinds with -k/--kind
+lsu analyze-kinds --kind SourceCollectionStateEntity
+
+# Field contribution analysis (falls back to config.kind/config.namespace if not provided)
+lsu analyze-fields --kind SourceCollectionStateEntity --namespace "" --group-by batchId
+
+# TTL cleanup across namespaces/kinds (dry-run)
+lsu cleanup --ttl-field expireAt --dry-run
+
+# TTL cleanup restricted to specific kinds
+lsu cleanup --kind pipeline-job
+```
+
+Use `--help` on any command for full options. Config can be provided via `config.yaml` or flags.
+
+## Development
+
+- Create a virtual environment and install in editable mode as shown above
+- Run tests:
+
+```bash
+python -m pip install pytest
+pytest -q
+```
+
+- Lint/format (optional if you use pre-commit/CI):
+```bash
+python -m pip install ruff black
+ruff check .
+black .
+```
+
+## Publishing
+
+- Automated: pushing to `main` triggers versioning, tagging, GitHub release, and PyPI publish via semantic-release.
+- Prerequisites:
+  - Add a PyPI token to repo secrets as `PYPI_API_TOKEN`.
+  - Use conventional commits for proper versioning.
+- Pushing a `v*` tag directly also triggers the `build` workflow's PyPI publish job.
+
+Main branch should be protected (require PRs, disallow direct pushes) in repository settings.
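A few more invocations composed from flags that `cli.py` defines; the kind and field names here are placeholders:

```bash
# Restrict field analysis to specific fields and keep the raw numbers
lsu analyze-fields --kind SourceCollectionStateEntity \
    --only-field payload --only-field metadata \
    --output-json fields.json

# Export kind-level stats to CSV instead of printing a table
lsu analyze-kinds --output kinds.csv
```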
localhost:8010")] +LogLevelOpt = Annotated[Optional[str], typer.Option("--log-level", help="Logging level")] +KindsOpt = Annotated[ + Optional[List[str]], + typer.Option("--kind", "-k", help="Kinds to process (omit or empty to process all in each namespace)") +] +SingleKindOpt = Annotated[Optional[str], typer.Option("--kind", "-k", help="Kind to analyze (falls back to config.kind)")] + +def _load_cfg( + config_path: Optional[str], + project: Optional[str], + emulator_host: Optional[str], + log_level: Optional[str], +) -> AppConfig: + overrides = {} + if project: + overrides["project_id"] = project + if emulator_host: + overrides["emulator_host"] = emulator_host + if log_level: + overrides["log_level"] = log_level + return load_config(config_path, overrides) + +@app.command("analyze-kinds") +def cmd_analyze_kinds( + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + kind: KindsOpt = None, + output: Annotated[Optional[str], typer.Option("--output", help="Output CSV file path")] = None, +): + cfg = _load_cfg(config, project, emulator_host, log_level) + + if kind is not None: + # Normalise: treat [""] as empty (all kinds) + cfg.kinds = [k for k in kind if k] # drop empty strings + rows = analyze_kinds(cfg) + + if output: + with open(output, "w", encoding="utf-8") as fh: + fh.write("namespace,kind,count,size,bytes\n") + for r in rows: + ns = r.get("namespace") or "" + fh.write(f"{ns},{r['kind']},{r['count']},{r['size']},{r['bytes']}\n") + typer.echo(f"Wrote {len(rows)} rows to {output}") + else: + print_summary_table(rows) + +@app.command("analyze-fields") +def cmd_analyze_fields( + kind: SingleKindOpt = None, + namespace: Annotated[Optional[str], typer.Option("--namespace", "-n", help="Namespace to query (omit to use all)")] = None, + group_by: Annotated[Optional[str], typer.Option("--group-by", help="Group results by this field value (falls back to config.group_by_field)")] = None, + only_field: Annotated[Optional[List[str]], typer.Option("--only-field", help="Only consider these fields")] = None, + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + output_json: Annotated[Optional[str], typer.Option("--output-json", help="Write raw JSON results to file")] = None, +): + cfg = _load_cfg(config, project, emulator_host, log_level) + + target_kind = kind or cfg.kind + target_namespace = namespace if namespace is not None else cfg.namespace + group_by_field = group_by if group_by is not None else cfg.group_by_field + + if not target_kind: + raise typer.BadParameter("--kind is required (either via flag or config.kind)") + + result = analyze_field_contributions( + cfg, + kind=target_kind, + namespace=target_namespace, + group_by_field=group_by_field, + only_fields=[f for f in only_field] if only_field else None, + ) + + if output_json: + with open(output_json, "w", encoding="utf-8") as fh: + json.dump(result, fh, indent=2) + typer.echo(f"Wrote JSON results to {output_json}") + else: + print_field_summary(result) + +@app.command("cleanup") +def cmd_cleanup( + config: ConfigOpt = None, + project: ProjectOpt = None, + emulator_host: EmulatorHostOpt = None, + log_level: LogLevelOpt = None, + kind: KindsOpt = None, + ttl_field: Annotated[Optional[str], typer.Option("--ttl-field", help="TTL field name (falls back to config.ttl_field)")] = None, + delete_missing_ttl: Annotated[Optional[bool], typer.Option("--delete-missing-ttl", help="Delete when 
diff --git a/commands/__init__.py b/commands/__init__.py
new file mode 100644
index 0000000..7493223
--- /dev/null
+++ b/commands/__init__.py
@@ -0,0 +1,18 @@
+from .config import AppConfig, load_config, build_client, list_namespaces, list_kinds, format_size
+from . import analyze_kinds as analyze_kinds
+from . import analyze_entity_fields as analyze_entity_fields
+from . import cleanup_expired as cleanup_expired
+from . import config as config
+
+__all__ = [
+    "AppConfig",
+    "load_config",
+    "build_client",
+    "list_namespaces",
+    "list_kinds",
+    "format_size",
+    "analyze_kinds",
+    "analyze_entity_fields",
+    "cleanup_expired",
+    "config",
+]
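Note the two import styles this `__init__` exposes: `load_config` and friends are re-exported functions, while `analyze_kinds` is the submodule, whose function of the same name lives inside it. The tests below rely on that distinction:

```python
from commands import load_config, analyze_kinds

cfg = load_config(None, {"project_id": "demo-project"})  # re-exported function
rows = analyze_kinds.analyze_kinds(cfg)                  # submodule.function
```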
or "(default)", + group_by_field, + ) + grouped_entities: DefaultDict[str, List[datastore.Entity]] = defaultdict(list) + for entity in query.fetch(): + group_val = entity.get(group_by_field) + key = str(group_val) if group_val is not None else "" + grouped_entities[key].append(entity) + + results: Dict[str, Dict] = {} + for group_key, ents in grouped_entities.items(): + field_totals, total_size, entity_count = _estimate_field_contributions( + ents, target_fields=only_fields + ) + results[group_key] = { + "namespace": namespace, + "kind": kind, + "group": group_key, + "entity_count": entity_count, + "total_bytes": total_size, + "total_size": format_size(total_size), + "fields": { + f: { + "bytes": b, + "avg_per_entity": (b / entity_count) if entity_count else 0.0, + "human": format_size(b), + } + for f, b in sorted(field_totals.items(), key=lambda x: x[1], reverse=True) + }, + } + return {"grouped": results} + + # Ungrouped path + logger.info( + "Analyzing field contributions for kind=%s, namespace=%s", + kind, + namespace or "(default)", + ) + field_totals, total_size, entity_count = _estimate_field_contributions( + query.fetch(), target_fields=only_fields + ) + return { + "namespace": namespace, + "kind": kind, + "entity_count": entity_count, + "total_bytes": total_size, + "total_size": format_size(total_size), + "fields": { + f: { + "bytes": b, + "avg_per_entity": (b / entity_count) if entity_count else 0.0, + "human": format_size(b), + } + for f, b in sorted(field_totals.items(), key=lambda x: x[1], reverse=True) + }, + } + + +def analyze_field_contributions( + config: AppConfig, + kind: str, + namespace: Optional[str] = None, + group_by_field: Optional[str] = None, + only_fields: Optional[List[str]] = None, +) -> Dict: + client = build_client(config) + + # If no namespace provided, or config.namespaces is None/empty, iterate all namespaces + if namespace is None: + if hasattr(config, "namespaces") and (not config.namespaces): + ns_list = list_namespaces(client) + else: + ns_list = [namespace] if namespace else list_namespaces(client) + results: Dict[str, Dict] = {} + for ns in ns_list: + results[ns or ""] = _analyze_single_namespace( + client, kind=kind, namespace=ns, group_by_field=group_by_field, only_fields=only_fields + ) + return {"by_namespace": results} + + # Single namespace + return _analyze_single_namespace( + client, kind=kind, namespace=namespace, group_by_field=group_by_field, only_fields=only_fields + ) + + +def print_field_summary(result: Dict) -> None: + if "by_namespace" in result: + for ns, data in result["by_namespace"].items(): + print(f"\n=== namespace: {ns or '(default)'} ===") + print_field_summary(data) + return + + if "grouped" in result: + for group_key, data in result["grouped"].items(): + ns = data.get("namespace") or "" + print(f"\n[group={group_key}] ns={ns} kind={data['kind']} entities={data['entity_count']} total={data['total_size']}") + for field, stats in data["fields"].items(): + avg = stats["avg_per_entity"] + print(f" {field:30} {stats['human']:>12} ({avg:.1f} bytes avg)") + else: + ns = result.get("namespace") or "" + print( + f"ns={ns} kind={result['kind']} entities={result['entity_count']} total={result['total_size']}" + ) + for field, stats in result["fields"].items(): + avg = stats["avg_per_entity"] + print(f" {field:30} {stats['human']:>12} ({avg:.1f} bytes avg)") diff --git a/commands/analyze_kinds.py b/commands/analyze_kinds.py new file mode 100644 index 0000000..9807532 --- /dev/null +++ b/commands/analyze_kinds.py @@ -0,0 +1,114 @@ +from 
diff --git a/commands/analyze_kinds.py b/commands/analyze_kinds.py
new file mode 100644
index 0000000..9807532
--- /dev/null
+++ b/commands/analyze_kinds.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+import logging
+from typing import Dict, List, Optional, Tuple
+
+from google.cloud.datastore.helpers import entity_to_protobuf
+from tqdm import tqdm
+
+from .config import (
+    AppConfig,
+    build_client,
+    list_namespaces,
+    list_kinds,
+    format_size,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def get_kind_stats(client, kind: str, namespace: Optional[str] = None) -> Tuple[Optional[int], Optional[int]]:
+    """
+    Returns (count, bytes) for the given kind/namespace using Datastore statistics.
+    Falls back to None if not found.
+    """
+    if namespace:
+        stats_kind = "__Stat_Kind_Ns__"
+        query = client.query(kind=stats_kind)
+        query.add_filter("kind_name", "=", kind)
+        query.add_filter("namespace_name", "=", namespace)
+    else:
+        stats_kind = "__Stat_Kind__"
+        query = client.query(kind=stats_kind)
+        query.add_filter("kind_name", "=", kind)
+
+    results = list(query.fetch(limit=1))
+    if results:
+        return results[0]["count"], results[0]["bytes"]
+    return None, None
+
+
+ """ + client = build_client(config) + + # Decide method priority: parameter > config > default + method = method or getattr(config, "method", None) or "stats" + + # If namespaces is None or empty, iterate all available namespaces + if not config.namespaces: + namespaces = list_namespaces(client) + else: + namespaces = config.namespaces + + print(f"Found namespaces: {namespaces}") + from tqdm import tqdm + results: List[Dict] = [] + for ns in namespaces: + kinds = config.kinds or list_kinds(client, ns) + print(f"Namespace '{ns}': found kinds: {kinds}") + logger.info("Analyzing namespace=%s, %d kinds", ns or "(default)", len(kinds)) + for kind in tqdm(kinds, desc=f"Analyzing kinds in ns={ns or '(default)'}", unit="kind"): + if method == "stats": + count, total_bytes = get_kind_stats(client, kind, ns) + if count is None: + logger.warning("Stats not found for kind=%s, ns=%s — falling back to scan", kind, ns or "(default)") + count, total_bytes = estimate_entity_count_and_size(client, kind, ns) + elif method == "scan": + count, total_bytes = estimate_entity_count_and_size(client, kind, ns) + else: + raise ValueError(f"Unknown method: {method}") + + results.append( + { + "namespace": ns, + "kind": kind, + "count": count, + "bytes": total_bytes, + "size": format_size(total_bytes), + } + ) + return results + + +def print_summary_table(rows: List[Dict]) -> None: + # Plain stdout table for wide compatibility + print("namespace,kind,count,size,bytes") + for r in rows: + ns = r.get("namespace") or "" + print(f"{ns},{r['kind']},{r['count']},{r['size']},{r['bytes']}") diff --git a/commands/cleanup_expired.py b/commands/cleanup_expired.py new file mode 100644 index 0000000..4d8b9d4 --- /dev/null +++ b/commands/cleanup_expired.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import logging +from datetime import datetime, timezone +from typing import Dict, List, Optional + +from google.cloud import datastore + +from .config import ( + AppConfig, + build_client, + list_namespaces, + list_kinds, + chunked, +) + +logger = logging.getLogger(__name__) + + +def _delete_in_batches(client: datastore.Client, keys: List[datastore.Key], batch_size: int) -> int: + deleted = 0 + for batch in chunked(keys, batch_size): + client.delete_multi(batch) # type: ignore[arg-type] + deleted += len(batch) + return deleted + + +def cleanup_expired( + config: AppConfig, + dry_run: bool = False, +) -> Dict[str, int]: + client = build_client(config) + + # If namespaces is None or empty, iterate all available namespaces + if not config.namespaces: + namespaces = list_namespaces(client) + else: + namespaces = config.namespaces + + totals: Dict[str, int] = {} + now = datetime.now(timezone.utc) + + for ns in namespaces: + # Determine kinds: explicit list, or all in namespace + kinds = config.kinds if config.kinds else list_kinds(client, ns) + + for kind in kinds: + query = client.query(kind=kind, namespace=ns or None) + to_delete: List[datastore.Key] = [] + entities = list(query.fetch()) + from tqdm import tqdm + for entity in tqdm(entities, desc=f"Scanning {kind} in ns={ns or '(default)'}", unit="entity"): + expire_at = entity.get(config.ttl_field) + expired = expire_at is None if config.delete_missing_ttl else False + if not expired and expire_at is not None: + try: + expired = expire_at < now + except Exception: + # If unparsable or timezone-less, skip + expired = False + if expired: + to_delete.append(entity.key) + + if dry_run: + logger.info( + "[DRY-RUN] ns=%s kind=%s would delete %d entities", + ns or "(default)", + 
diff --git a/commands/cleanup_expired.py b/commands/cleanup_expired.py
new file mode 100644
index 0000000..4d8b9d4
--- /dev/null
+++ b/commands/cleanup_expired.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import logging
+from datetime import datetime, timezone
+from typing import Dict, List, Optional
+
+from google.cloud import datastore
+from tqdm import tqdm
+
+from .config import (
+    AppConfig,
+    build_client,
+    list_namespaces,
+    list_kinds,
+    chunked,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _delete_in_batches(client: datastore.Client, keys: List[datastore.Key], batch_size: int) -> int:
+    deleted = 0
+    for batch in chunked(keys, batch_size):
+        client.delete_multi(batch)  # type: ignore[arg-type]
+        deleted += len(batch)
+    return deleted
+
+
+def cleanup_expired(
+    config: AppConfig,
+    dry_run: bool = False,
+) -> Dict[str, int]:
+    client = build_client(config)
+
+    # If namespaces is None or empty, iterate all available namespaces
+    if not config.namespaces:
+        namespaces = list_namespaces(client)
+    else:
+        namespaces = config.namespaces
+
+    totals: Dict[str, int] = {}
+    now = datetime.now(timezone.utc)
+
+    for ns in namespaces:
+        # Determine kinds: explicit list, or all in namespace
+        kinds = config.kinds if config.kinds else list_kinds(client, ns)
+
+        for kind in kinds:
+            query = client.query(kind=kind, namespace=ns or None)
+            to_delete: List[datastore.Key] = []
+            entities = list(query.fetch())
+            for entity in tqdm(entities, desc=f"Scanning {kind} in ns={ns or '(default)'}", unit="entity"):
+                expire_at = entity.get(config.ttl_field)
+                expired = expire_at is None if config.delete_missing_ttl else False
+                if not expired and expire_at is not None:
+                    try:
+                        expired = expire_at < now
+                    except Exception:
+                        # If unparsable or timezone-less, skip
+                        expired = False
+                if expired:
+                    to_delete.append(entity.key)
+
+            if dry_run:
+                logger.info(
+                    "[DRY-RUN] ns=%s kind=%s would delete %d entities",
+                    ns or "(default)",
+                    kind,
+                    len(to_delete),
+                )
+                totals[f"{ns}:{kind}"] = len(to_delete)
+            else:
+                deleted = 0
+                if to_delete:
+                    for batch in tqdm(list(chunked(to_delete, config.batch_size)), desc=f"Deleting {kind} in ns={ns or '(default)'}", unit="batch"):
+                        client.delete_multi(batch)
+                        deleted += len(batch)
+                logger.info(
+                    "ns=%s kind=%s deleted %d expired entities",
+                    ns or "(default)",
+                    kind,
+                    deleted,
+                )
+                totals[f"{ns}:{kind}"] = deleted
+
+    return totals
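The expiry comparison uses a timezone-aware `now`, so TTL values should be written timezone-aware; naive datetimes raise on comparison, land in the `except` branch, and are skipped. A sketch of writing a compliant `expireAt` (hypothetical kind):

```python
from datetime import datetime, timedelta, timezone
from google.cloud import datastore

client = datastore.Client(project="demo-project")
entity = datastore.Entity(key=client.key("SessionEntity", "abc"))
entity["expireAt"] = datetime.now(timezone.utc) + timedelta(days=7)  # tz-aware
client.put(entity)
```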
diff --git a/commands/config.py b/commands/config.py
new file mode 100644
index 0000000..420c993
--- /dev/null
+++ b/commands/config.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import os
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, Iterable, List, Optional, Sequence
+
+import yaml
+from google.cloud import datastore
+
+@dataclass
+class AppConfig:
+    project_id: Optional[str] = None
+    emulator_host: Optional[str] = None
+
+    # Explicit filters (when empty -> use all)
+    namespaces: List[str] = field(default_factory=list)
+    kinds: List[str] = field(default_factory=list)
+
+    # Optional defaults for commands that need them (e.g., analyze-fields)
+    kind: Optional[str] = None
+    namespace: Optional[str] = None
+
+    # Cleanup settings
+    ttl_field: str = "expireAt"
+    delete_missing_ttl: bool = True
+    batch_size: int = 500
+
+    # Analysis settings
+    group_by_field: Optional[str] = None
+
+    # Logging
+    log_level: str = "INFO"
+
+
+def _as_list(value: Optional[Iterable[str]]) -> List[str]:
+    if value is None:
+        return []
+    if isinstance(value, (list, tuple)):
+        return [str(v) for v in value]
+    return [str(value)]
+
+
+def load_config(path: Optional[str] = None, overrides: Optional[Dict] = None) -> AppConfig:
+    config = AppConfig()
+
+    # Load YAML if provided or if default exists
+    data: Dict = {}
+    candidate = path or os.getenv("LSU_CONFIG")
+    if not candidate and os.path.exists("config.yaml"):
+        candidate = "config.yaml"
+
+    if candidate and os.path.exists(candidate):
+        with open(candidate, "r", encoding="utf-8") as fh:
+            data = yaml.safe_load(fh) or {}
+
+    overrides = overrides or {}
+    merged = {**data, **overrides}
+
+    config.project_id = merged.get("project_id") or os.getenv("DATASTORE_PROJECT_ID")
+    config.emulator_host = merged.get("emulator_host") or os.getenv("DATASTORE_EMULATOR_HOST")
+
+    # Explicit lists (no include/exclude). Empty -> all
+    config.namespaces = _as_list(merged.get("namespaces"))
+    config.kinds = _as_list(merged.get("kinds"))
+
+    # 🛠 Normalise: treat [""] as empty (_as_list never returns None)
+    if config.namespaces == [""]:
+        config.namespaces = []
+    if config.kinds == [""]:
+        config.kinds = []
+
+    # Optional defaults used by some commands
+    config.kind = merged.get("kind")
+    config.namespace = merged.get("namespace")
+
+    config.ttl_field = merged.get("ttl_field", config.ttl_field)
+    config.delete_missing_ttl = bool(merged.get("delete_missing_ttl", config.delete_missing_ttl))
+    config.batch_size = int(merged.get("batch_size", config.batch_size))
+
+    config.group_by_field = merged.get("group_by_field", config.group_by_field)
+
+    config.log_level = str(merged.get("log_level", config.log_level)).upper()
+
+    _configure_logging(config.log_level)
+    return config
+
+
+def _configure_logging(level: str) -> None:
+    level_value = getattr(logging, level.upper(), logging.INFO)
+    logging.basicConfig(level=level_value, format="%(asctime)s | %(levelname)s | %(message)s")
+
+
+def build_client(config: AppConfig) -> datastore.Client:
+    # Prefer explicit emulator_host if provided, otherwise env decides
+    if config.emulator_host:
+        os.environ["DATASTORE_EMULATOR_HOST"] = config.emulator_host
+        # Project id is required in emulator; optional on GCP (ADC will detect)
+        if config.project_id:
+            os.environ.setdefault("DATASTORE_PROJECT_ID", config.project_id)
+
+    if os.getenv("DATASTORE_EMULATOR_HOST"):
+        # When using emulator, ensure a project ID is present
+        project_id = os.getenv("DATASTORE_PROJECT_ID") or config.project_id or "local-dev"
+        os.environ["DATASTORE_PROJECT_ID"] = project_id
+        return datastore.Client(project=project_id)
+
+    # GCP path, relies on ADC if project not provided
+    return datastore.Client(project=config.project_id)
+
+
+def list_namespaces(client: datastore.Client) -> List[str]:
+    """
+    Return all namespaces in the datastore, including the default ("").
+    Always queries __namespace__ in the root context so it works in emulator/GCP.
+    """
+    # Include default namespace "" first
+    namespaces: List[str] = [""]
+
+    # Force namespace=None to query the metadata root
+    query = client.query(kind="__namespace__", namespace=None)
+    query.keys_only()
+
+    for entity in query.fetch():
+        name = entity.key.name or ""
+        if name != "":
+            namespaces.append(name)
+
+    return namespaces
+
+
+def list_kinds(client: datastore.Client, namespace: Optional[str]) -> List[str]:
+    query = client.query(kind="__kind__", namespace=namespace or None)
+    query.keys_only()
+    return [e.key.name for e in query.fetch()]
+
+
+def chunked(iterable: Sequence, chunk_size: int):
+    for i in range(0, len(iterable), max(1, chunk_size)):
+        yield iterable[i : i + chunk_size]
+
+
+def format_size(bytes_size: int) -> str:
+    size = float(bytes_size)
+    for unit in ["B", "KB", "MB", "GB", "TB"]:
+        if size < 1024:
+            return f"{size:.2f} {unit}"
+        size /= 1024
+    return f"{size:.2f} PB"
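`load_config` resolves the config path in order: an explicit path argument, the `LSU_CONFIG` environment variable, then `./config.yaml` if present. So an alternate config can be selected per invocation (hypothetical path):

```bash
LSU_CONFIG=/etc/lsu/staging.yaml lsu analyze-kinds
```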
+ """ + # Include default namespace "" first + namespaces: List[str] = [""] + + # Force namespace=None to query the metadata root + query = client.query(kind="__namespace__", namespace=None) + query.keys_only() + + for entity in query.fetch(): + name = entity.key.name or "" + if name != "": + namespaces.append(name) + + return namespaces + + +def list_kinds(client: datastore.Client, namespace: Optional[str]) -> List[str]: + query = client.query(kind="__kind__", namespace=namespace or None) + query.keys_only() + return [e.key.name for e in query.fetch()] + + +def chunked(iterable: Sequence, chunk_size: int): + for i in range(0, len(iterable), max(1, chunk_size)): + yield iterable[i : i + chunk_size] + + +def format_size(bytes_size: int) -> str: + size = float(bytes_size) + for unit in ["B", "KB", "MB", "GB", "TB"]: + if size < 1024: + return f"{size:.2f} {unit}" + size /= 1024 + return f"{size:.2f} PB" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0ff44d9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,60 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "local-storage-utils" +version = "0.1.0" +description = "Utilities for analyzing and managing local Datastore/Firestore (Datastore mode) data" +authors = [ + { name = "Your Name", email = "you@example.com" }, +] +readme = "README.md" +requires-python = ">=3.9" +license = { file = "LICENSE" } +keywords = ["google-cloud-datastore", "firestore", "emulator", "cleanup", "analysis"] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +dependencies = [ + "google-cloud-datastore>=2.19.0", + "PyYAML>=6.0.1", + "typer>=0.12.3", +] + +[project.optional-dependencies] +rich = ["rich>=13.7.0"] + +[project.scripts] +lsu = "cli:app" +local-storage-utils = "cli:app" + +[tool.setuptools.packages.find] +where = ["."] +include = ["commands*"] + +[tool.black] +line-length = 100 + +[tool.ruff] +line-length = 100 + +[tool.semantic_release] +version_variable = "pyproject.toml:version" +branch = "main" +upload_to_pypi = true +dist_path = "dist" +build_command = "python -m build" +commit_message = "chore(release): {version} [skip ci]" +changelog_sections = "feature,fix,perf,refactor,docs,style,build,ci,chore" + +[tool.semantic_release.remote] +name = "origin" +repo_url = "https://github.com/your-org/local-storage-utils" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e8f4d09 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +google-cloud-datastore>=2.19.0 +PyYAML>=6.0.1 +typer>=0.12.3 \ No newline at end of file diff --git a/tests/test_commands.py b/tests/test_commands.py new file mode 100644 index 0000000..70c05a3 --- /dev/null +++ b/tests/test_commands.py @@ -0,0 +1,52 @@ +import sys +import os +import pytest +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from commands import analyze_kinds, analyze_entity_fields, cleanup_expired, config +from commands.config import AppConfig, build_client, list_namespaces + +# Dummy config for testing (adjust as needed for emulator) +def make_dummy_config(): + return AppConfig( + 
project_id="dummy-project", + emulator_host="localhost:8080", + namespaces=[""], + kinds=["TestKind"], + ttl_field="expireAt", + delete_missing_ttl=True, + batch_size=10, + group_by_field=None, + log_level="INFO", + ) + +def test_analyze_kinds_runs(): + cfg = make_dummy_config() + try: + result = analyze_kinds(cfg) + assert isinstance(result, list) + except Exception as e: + pytest.skip(f"analyze_kinds requires emulator: {e}") + +def test_analyze_fields_runs(): + cfg = make_dummy_config() + try: + result = analyze_entity_fields.analyze_field_contributions(cfg, kind="TestKind") + assert isinstance(result, dict) + except Exception as e: + pytest.skip(f"analyze_fields requires emulator: {e}") + +def test_cleanup_expired_runs(): + cfg = make_dummy_config() + try: + result = cleanup_expired.cleanup_expired(cfg, dry_run=True) + assert isinstance(result, dict) + except Exception as e: + pytest.skip(f"cleanup_expired requires emulator: {e}") + +def test_list_namespaces_returns_default_and_any_custom(): + cfg = AppConfig(project_id="dummy-project", emulator_host="localhost:8010") + client = build_client(cfg) + namespaces = list_namespaces(client) + assert "" in namespaces # default namespace always present + # This test will pass if at least the default namespace is present + # Add more asserts if you know your emulator has more namespaces diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 0000000..c30178d --- /dev/null +++ b/tests/test_import.py @@ -0,0 +1,10 @@ +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +def test_imports(): + import commands + from commands import analyze_kinds, analyze_entity_fields, cleanup_expired, config + + assert commands is not None + assert hasattr(config, "AppConfig") \ No newline at end of file