Refactor auto-update script: log exceptions, simplify channel discovery

Fedir-Yatsenko · claude · Fedir-Yatsenko · commit fbdf1a15a863 · 2026-03-05T14:17:14.000+02:00
- Log exceptions from asyncio.gather in job processing and deduplication
- Move auto-update channel filtering from ChannelService to the script
- Remove unused get_auto_update_channels and selectinload import
- Use most_common() for deterministic result summary ordering
- Batch commit instead of per-iteration flush when creating jobs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/statgpt/admin/auto_update.py b/statgpt/admin/auto_update.py
@@ -24,13 +24,18 @@ async def _discover_and_create_jobs() -> list[schemas.AutoUpdateJob]:
     """Find auto-update channels and create jobs for their datasets."""
     _log.info(_SEPARATOR)
     async with get_session_contex_manager() as session:
-        channels = await AdminPortalChannelService(session).get_auto_update_channels()
-        _log.info(f"Found {len(channels)} channel(s) with auto-update enabled")
-
-        if not channels:
+        channel_service = AdminPortalChannelService(session)
+        all_channels = await channel_service.get_channels_schemas(limit=None, offset=0)
+        channel_ids = [
+            ch.id
+            for ch in all_channels
+            if (dq := ch.details.data_query) is not None and dq.details.allow_auto_update
+        ]
+        _log.info(f"Found {len(channel_ids)} channel(s) with auto-update enabled")
+
+        if not channel_ids:
             return []
 
-        channel_ids = [ch.id for ch in channels]
         return await AdminPortalDataSetService(session).create_auto_update_jobs(channel_ids)
 
 
@@ -39,13 +44,16 @@ async def _process_jobs(jobs: list[schemas.AutoUpdateJob], auth_context: AuthCon
     _log.info(_SEPARATOR)
     _log.info(f"Created {len(jobs)} auto-update job(s), starting processing...")
 
-    await asyncio.gather(
+    results = await asyncio.gather(
         *(
             auto_update_in_background_task(auto_update_job_id=job.id, auth_context=auth_context)
             for job in jobs
         ),
         return_exceptions=True,
     )
+    for job, result in zip(jobs, results):
+        if isinstance(result, Exception):
+            _log.error(f"Auto-update job {job.id} failed with exception:", exc_info=result)
 
 
 async def _get_reindex_channel_ids(job_ids: list[int]) -> set[int]:
@@ -81,7 +89,7 @@ async def _deduplicate_channels(channel_ids: set[int], auth_context: AuthContext
         f"Running deduplication for {len(channel_ids)} channel(s) "
         f"with reindex: {sorted(channel_ids)}"
     )
-    await asyncio.gather(
+    results = await asyncio.gather(
         *(
             deduplicate_dimensions_in_background_task(
                 channel_id=channel_id, auth_context=auth_context
@@ -90,6 +98,11 @@ async def _deduplicate_channels(channel_ids: set[int], auth_context: AuthContext
         ),
         return_exceptions=True,
     )
+    for channel_id, result in zip(channel_ids, results):
+        if isinstance(result, Exception):
+            _log.error(
+                f"Deduplication for channel {channel_id} failed with exception:", exc_info=result
+            )
     _log.info("Deduplication complete")
 
 
diff --git a/statgpt/admin/services/dataset.py b/statgpt/admin/services/dataset.py
@@ -2020,8 +2020,8 @@ async def create_auto_update_jobs(self, channel_ids: list[int]) -> list[schemas.
                 status=StatusEnum.QUEUED,
             )
             self._session.add(job)
-            await self._session.flush()
             jobs.append(job)
+        await self._session.commit()
 
         # Log per-channel summary
         channels_by_id = {cd.channel.id: cd.channel for cd in channel_datasets}
@@ -2033,7 +2033,6 @@ async def create_auto_update_jobs(self, channel_ids: list[int]) -> list[schemas.
                 f"'{ch.deployment_id}' (id={ch_id})"
             )
 
-        await self._session.commit()
         return [schemas.AutoUpdateJob.model_validate(job, from_attributes=True) for job in jobs]
 
     async def get_reindex_channel_ids(self, job_ids: list[int]) -> set[int]:
@@ -2097,7 +2096,7 @@ def _format_result_summary(jobs: list[models.AutoUpdateJob]) -> str:
         result_counts = Counter(job_statuses)
 
         parts: list[str] = []
-        for job_status, count in result_counts.items():
+        for job_status, count in result_counts.most_common():
             part = f"{count} {job_status}"
             if job_status in reindex_statuses:
                 breakdown = ", ".join(f"{c} {s}" for s, c in reindex_statuses[job_status].items())
diff --git a/statgpt/common/services/channel.py b/statgpt/common/services/channel.py
@@ -1,7 +1,6 @@
 from fastapi import HTTPException, status
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import selectinload
 from sqlalchemy.sql.expression import func
 
 import statgpt.common.models as models
@@ -28,7 +27,7 @@ async def get_channels_db(self, limit: int | None, offset: int) -> list[models.C
         q_result = await self._session.execute(query)
         return [item for item in q_result.scalars().all()]
 
-    async def get_channels_schemas(self, limit: int, offset: int) -> list[schemas.Channel]:
+    async def get_channels_schemas(self, limit: int | None, offset: int) -> list[schemas.Channel]:
         channels = await self.get_channels_db(limit, offset)
         return [ChannelSerializer.db_to_schema(item) for item in channels]
 
@@ -60,19 +59,3 @@ def is_channel_hybrid(channel: models.Channel) -> bool:
             return False
         indexer_version = channel_config.data_query.details.indexer_version
         return indexer_version == schemas.IndexerVersion.hybrid
-
-    @staticmethod
-    def _is_auto_update_enabled(channel: models.Channel) -> bool:
-        """Returns `True` if the channel has auto-update enabled in data_query config."""
-        config = schemas.ChannelConfig.model_validate(channel.details)
-        if config.data_query is None:
-            return False
-        return config.data_query.details.allow_auto_update
-
-    async def get_auto_update_channels(self) -> list[models.Channel]:
-        """Get all channels with auto-update enabled, with mapped_datasets eager-loaded."""
-        result = await self._session.execute(
-            select(models.Channel).options(selectinload(models.Channel.mapped_datasets))
-        )
-        channels = list(result.scalars().all())
-        return [ch for ch in channels if self._is_auto_update_enabled(ch)]