Skip to content

Commit ac2d52e

Browse files
committed
🔇 Try to tame down procrastinate logging a bit
1 parent 39d1bf6 commit ac2d52e

File tree

3 files changed

+159
-0
lines changed

3 files changed

+159
-0
lines changed

openaleph_procrastinate/app.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from procrastinate import connector, testing, utils
1010
from psycopg_pool import AsyncConnectionPool, ConnectionPool
1111

12+
from openaleph_procrastinate.logging import patch_procrastinate_logging
1213
from openaleph_procrastinate.settings import OpenAlephSettings
1314

1415
log = get_logger(__name__)
@@ -109,6 +110,7 @@ def make_app(tasks_module: str | None = None, sync: bool | None = False) -> App:
109110
settings = OpenAlephSettings()
110111
db_uri = mask_uri(settings.procrastinate_db_uri)
111112
configure_logging()
113+
patch_procrastinate_logging()
112114
import_paths = [tasks_module] if tasks_module else None
113115
connector = get_connector(sync=sync)
114116
log.info(

openaleph_procrastinate/logging.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
from __future__ import annotations
2+
3+
from banal import ensure_dict
4+
5+
6+
def extract_job_summary(task_kwargs: dict) -> dict:
    """Build a compact, log-friendly summary from a job's task kwargs.

    Copies ``dataset``/``batch`` through when present. If the payload
    carries a non-empty ``entities`` list, adds the entity count, the
    min/max entity ids, the full id list for small batches (fewer than
    11 ids), and the min/max ``contentHash`` values found in entity
    properties.
    """
    summary: dict = {}
    # Pass simple identifying fields straight through when truthy.
    for key in ("dataset", "batch"):
        value = task_kwargs.get(key)
        if value:
            summary[key] = value

    payload = task_kwargs.get("payload")
    if not isinstance(payload, dict):
        return summary
    entities = payload.get("entities")
    if not (isinstance(entities, list) and entities):
        return summary

    summary["entities_count"] = len(entities)
    ids = [ent["id"] for ent in entities if isinstance(ent, dict) and ent.get("id")]
    if ids:
        summary["entity_id_min"] = min(ids)
        summary["entity_id_max"] = max(ids)
        # Only list every id for small batches to keep log lines short.
        if len(ids) < 11:
            summary["entity_ids"] = ids

    hashes: list = []
    for ent in entities:
        if isinstance(ent, dict):
            hashes.extend(ensure_dict(ent.get("properties")).get("contentHash", []))
    if hashes:
        summary["content_hash_min"] = min(hashes)
        summary["content_hash_max"] = max(hashes)
    return summary
38+
39+
40+
def patch_procrastinate_logging() -> None:
    """Patch procrastinate to produce concise, structured job logs.

    Upstream procrastinate logs full repr() of all task kwargs in
    Job.call_string, creating extremely noisy log lines when payloads
    contain entity data. This patch:
    1. Shortens call_string to just task_name[id]
    2. Strips the full payload from log_context()
    3. Promotes dataset/entity/hash summary to top-level structlog kwargs

    Safe to call more than once: a marker attribute guards against
    re-wrapping the already-patched methods.
    """
    from procrastinate.jobs import Job
    from procrastinate.worker import Worker

    # Idempotence guard: this function is called from make_app(), which
    # may run several times per process. Without the guard each call
    # would wrap the previously wrapped log_context/_log_extra again,
    # stacking wrappers and re-applying the summary on every log line.
    if getattr(Job, "_openaleph_logging_patched", False):
        return

    @property
    def call_string(self):
        # Short identifier instead of upstream's full kwargs repr.
        return f"{self.task_name}[{self.id}]"

    Job.call_string = call_string

    _original_log_context = Job.log_context

    def log_context(self):
        # Drop the raw (potentially huge) kwargs; keep a structured summary.
        ctx = _original_log_context(self)
        ctx.pop("task_kwargs", None)
        ctx.update(extract_job_summary(self.task_kwargs))
        return ctx

    Job.log_context = log_context

    _original_log_extra = Worker._log_extra

    def _log_extra(self, action, context, job_result, **kwargs):
        extra = _original_log_extra(
            self, action=action, context=context, job_result=job_result, **kwargs
        )
        if context:
            # Promote identifying fields to the top level for structlog.
            extra["task_name"] = context.job.task_name
            extra["queue_name"] = context.job.queue
            extra.update(extract_job_summary(context.job.task_kwargs))
        return extra

    Worker._log_extra = _log_extra
    Job._openaleph_logging_patched = True

tests/test_logging.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from openaleph_procrastinate.logging import (
2+
extract_job_summary,
3+
patch_procrastinate_logging,
4+
)
5+
6+
7+
def test_extract_job_summary_with_entities():
    """Summary carries dataset/batch plus entity count, id and hash ranges."""
    docs = [
        {
            "id": "doc-001",
            "schema": "Document",
            "properties": {"contentHash": ["abc123"]},
        },
        {
            "id": "doc-002",
            "schema": "Document",
            "properties": {"contentHash": ["def456"]},
        },
    ]
    summary = extract_job_summary(
        {
            "dataset": "my-dataset",
            "batch": "batch-1",
            "payload": {"entities": docs},
        }
    )
    expected = {
        "dataset": "my-dataset",
        "batch": "batch-1",
        "entities_count": 2,
        "entity_id_min": "doc-001",
        "entity_id_max": "doc-002",
        "entity_ids": ["doc-001", "doc-002"],
        "content_hash_min": "abc123",
        "content_hash_max": "def456",
    }
    for key, value in expected.items():
        assert summary[key] == value
34+
35+
36+
def test_extract_job_summary_empty_payload():
    """No summary fields when the payload is missing, malformed, or empty."""
    for task_kwargs in (
        {},
        {"payload": "not-a-dict"},
        {"payload": {}},
        {"payload": {"entities": []}},
    ):
        assert extract_job_summary(task_kwargs) == {}
41+
42+
43+
def test_patch_procrastinate_logging():
    """The monkey-patch yields a short call_string and a summarized log_context."""
    from procrastinate.jobs import Job as ProcrastinateJob

    patch_procrastinate_logging()

    kwargs = {
        "dataset": "investigation",
        "payload": {
            "entities": [
                {"id": "ent-1", "properties": {"contentHash": ["aaa"]}},
                {"id": "ent-2", "properties": {"contentHash": ["bbb"]}},
            ]
        },
    }
    job = ProcrastinateJob(
        id=42,
        queue="test-queue",
        lock="lock",
        queueing_lock=None,
        task_name="mylib.tasks.process",
        task_kwargs=kwargs,
        scheduled_at=None,
        attempts=0,
    )

    # Concise form: no full payload repr in the call string.
    assert job.call_string == "mylib.tasks.process[42]"

    # Raw task_kwargs are stripped; summary fields take their place.
    context = job.log_context()
    assert "task_kwargs" not in context
    assert context["dataset"] == "investigation"
    assert context["entities_count"] == 2
    assert context["entity_ids"] == ["ent-1", "ent-2"]

0 commit comments

Comments
 (0)