Skip to content

Commit ac2d52e

Browse files
committed
🔇 Try to tame down procrastinate logging a bit
1 parent 39d1bf6 commit ac2d52e

File tree

3 files changed

+159
-0
lines changed

3 files changed

+159
-0
lines changed

openaleph_procrastinate/app.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from procrastinate import connector, testing, utils
1010
from psycopg_pool import AsyncConnectionPool, ConnectionPool
1111

12+
from openaleph_procrastinate.logging import patch_procrastinate_logging
1213
from openaleph_procrastinate.settings import OpenAlephSettings
1314

1415
log = get_logger(__name__)
@@ -109,6 +110,7 @@ def make_app(tasks_module: str | None = None, sync: bool | None = False) -> App:
109110
settings = OpenAlephSettings()
110111
db_uri = mask_uri(settings.procrastinate_db_uri)
111112
configure_logging()
113+
patch_procrastinate_logging()
112114
import_paths = [tasks_module] if tasks_module else None
113115
connector = get_connector(sync=sync)
114116
log.info(

openaleph_procrastinate/logging.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
from __future__ import annotations
2+
3+
from banal import ensure_dict
4+
5+
6+
def extract_job_summary(task_kwargs: dict) -> dict:
    """Build a compact, log-friendly summary from a job's task kwargs.

    Copies ``dataset``/``batch`` through when present. If the payload
    carries a non-empty ``entities`` list, adds the entity count, the
    min/max entity ids, the full id list for small batches (fewer than
    11 ids), and the min/max ``contentHash`` values found in entity
    properties.
    """
    summary: dict = {}
    # Pass simple identifying fields straight through when truthy.
    for key in ("dataset", "batch"):
        value = task_kwargs.get(key)
        if value:
            summary[key] = value

    payload = task_kwargs.get("payload")
    if not isinstance(payload, dict):
        return summary
    entities = payload.get("entities")
    if not (isinstance(entities, list) and entities):
        return summary

    summary["entities_count"] = len(entities)
    ids = [ent["id"] for ent in entities if isinstance(ent, dict) and ent.get("id")]
    if ids:
        summary["entity_id_min"] = min(ids)
        summary["entity_id_max"] = max(ids)
        # Only list every id for small batches to keep log lines short.
        if len(ids) < 11:
            summary["entity_ids"] = ids

    hashes: list = []
    for ent in entities:
        if isinstance(ent, dict):
            hashes.extend(ensure_dict(ent.get("properties")).get("contentHash", []))
    if hashes:
        summary["content_hash_min"] = min(hashes)
        summary["content_hash_max"] = max(hashes)
    return summary
38+
39+
40+
def patch_procrastinate_logging() -> None:
    """Patch procrastinate to produce concise, structured job logs.

    Upstream procrastinate logs full repr() of all task kwargs in
    Job.call_string, creating extremely noisy log lines when payloads
    contain entity data. This patch:
    1. Shortens call_string to just task_name[id]
    2. Strips the full payload from log_context()
    3. Promotes dataset/entity/hash summary to top-level structlog kwargs

    Safe to call more than once: a marker attribute guards against
    re-wrapping the already-patched methods.
    """
    from procrastinate.jobs import Job
    from procrastinate.worker import Worker

    # Idempotence guard: this function is called from make_app(), which
    # may run several times per process. Without the guard each call
    # would wrap the previously wrapped log_context/_log_extra again,
    # stacking wrappers and re-applying the summary on every log line.
    if getattr(Job, "_openaleph_logging_patched", False):
        return

    @property
    def call_string(self):
        # Short identifier instead of upstream's full kwargs repr.
        return f"{self.task_name}[{self.id}]"

    Job.call_string = call_string

    _original_log_context = Job.log_context

    def log_context(self):
        # Drop the raw (potentially huge) kwargs; keep a structured summary.
        ctx = _original_log_context(self)
        ctx.pop("task_kwargs", None)
        ctx.update(extract_job_summary(self.task_kwargs))
        return ctx

    Job.log_context = log_context

    _original_log_extra = Worker._log_extra

    def _log_extra(self, action, context, job_result, **kwargs):
        extra = _original_log_extra(
            self, action=action, context=context, job_result=job_result, **kwargs
        )
        if context:
            # Promote identifying fields to the top level for structlog.
            extra["task_name"] = context.job.task_name
            extra["queue_name"] = context.job.queue
            extra.update(extract_job_summary(context.job.task_kwargs))
        return extra

    Worker._log_extra = _log_extra
    Job._openaleph_logging_patched = True

tests/test_logging.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from openaleph_procrastinate.logging import (
2+
extract_job_summary,
3+
patch_procrastinate_logging,
4+
)
5+
6+
7+
def test_extract_job_summary_with_entities():
    """Summary carries dataset/batch plus entity count, id and hash ranges."""
    docs = [
        {
            "id": "doc-001",
            "schema": "Document",
            "properties": {"contentHash": ["abc123"]},
        },
        {
            "id": "doc-002",
            "schema": "Document",
            "properties": {"contentHash": ["def456"]},
        },
    ]
    summary = extract_job_summary(
        {
            "dataset": "my-dataset",
            "batch": "batch-1",
            "payload": {"entities": docs},
        }
    )
    expected = {
        "dataset": "my-dataset",
        "batch": "batch-1",
        "entities_count": 2,
        "entity_id_min": "doc-001",
        "entity_id_max": "doc-002",
        "entity_ids": ["doc-001", "doc-002"],
        "content_hash_min": "abc123",
        "content_hash_max": "def456",
    }
    for key, value in expected.items():
        assert summary[key] == value
34+
35+
36+
def test_extract_job_summary_empty_payload():
    """No summary fields when the payload is missing, malformed, or empty."""
    for task_kwargs in (
        {},
        {"payload": "not-a-dict"},
        {"payload": {}},
        {"payload": {"entities": []}},
    ):
        assert extract_job_summary(task_kwargs) == {}
41+
42+
43+
def test_patch_procrastinate_logging():
    """The monkey-patch yields a short call_string and a summarized log_context."""
    from procrastinate.jobs import Job as ProcrastinateJob

    patch_procrastinate_logging()

    kwargs = {
        "dataset": "investigation",
        "payload": {
            "entities": [
                {"id": "ent-1", "properties": {"contentHash": ["aaa"]}},
                {"id": "ent-2", "properties": {"contentHash": ["bbb"]}},
            ]
        },
    }
    job = ProcrastinateJob(
        id=42,
        queue="test-queue",
        lock="lock",
        queueing_lock=None,
        task_name="mylib.tasks.process",
        task_kwargs=kwargs,
        scheduled_at=None,
        attempts=0,
    )

    # Concise form: no full payload repr in the call string.
    assert job.call_string == "mylib.tasks.process[42]"

    # Raw task_kwargs are stripped; summary fields take their place.
    context = job.log_context()
    assert "task_kwargs" not in context
    assert context["dataset"] == "investigation"
    assert context["entities_count"] == 2
    assert context["entity_ids"] == ["ent-1", "ent-2"]

0 commit comments

Comments
 (0)