Skip to content
Merged
1 change: 1 addition & 0 deletions changes/7803.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Implement `ErrorLog` Service, Repository Layer
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changelog filename is '7803.feature.md' but the PR description mentions resolving issue #7754. Verify that the changelog filename matches the correct issue number. If #7754 is the correct issue, the file should be renamed to '7754.feature.md'.

Copilot uses AI. Check for mistakes.
2 changes: 2 additions & 0 deletions src/ai/backend/common/metrics/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ class LayerType(enum.StrEnum):
CONTAINER_REGISTRY_REPOSITORY = "container_registry_repository"
DEPLOYMENT_REPOSITORY = "deployment_repository"
DOMAIN_REPOSITORY = "domain_repository"
ERROR_LOG_REPOSITORY = "error_log_repository"
GROUP_REPOSITORY = "group_repository"
HUGGINGFACE_REGISTRY_REPOSITORY = "huggingface_registry_repository"
IMAGE_REPOSITORY = "image_repository"
Expand Down Expand Up @@ -438,6 +439,7 @@ class LayerType(enum.StrEnum):
AUTH_DB_SOURCE = "auth_db_source"
AGENT_DB_SOURCE = "agent_db_source"
DEPLOYMENT_DB_SOURCE = "deployment_db_source"
ERROR_LOG_DB_SOURCE = "error_log_db_source"
PERMISSION_CONTROLLER_DB_SOURCE = "permission_controller_db_source"
RESOURCE_PRESET_DB_SOURCE = "resource_preset_db_source"
SCHEDULE_DB_SOURCE = "schedule_db_source"
Expand Down
8 changes: 8 additions & 0 deletions src/ai/backend/manager/data/error_log/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from .types import ErrorLogContent, ErrorLogData, ErrorLogMeta, ErrorLogSeverity

__all__ = (
"ErrorLogContent",
"ErrorLogData",
"ErrorLogMeta",
"ErrorLogSeverity",
)
40 changes: 40 additions & 0 deletions src/ai/backend/manager/data/error_log/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from __future__ import annotations

import enum
import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import Any


class ErrorLogSeverity(enum.StrEnum):
    """Severity levels that an error log entry can carry."""

    CRITICAL = "critical"
    ERROR = "error"
    WARNING = "warning"


@dataclass
class ErrorLogMeta:
    """Metadata fields of an error log entry, kept apart from its content."""

    created_at: datetime
    # May be None; presumably the ID of the user tied to the error — confirm.
    user: uuid.UUID | None
    source: str
    is_read: bool
    is_cleared: bool
    context_lang: str
    context_env: dict[str, Any]
    # Request details are optional; either may be None.
    request_url: str | None
    request_status: int | None


@dataclass
class ErrorLogContent:
    """Severity, message, and optional traceback of an error log entry."""

    severity: ErrorLogSeverity
    message: str
    # None when no traceback was recorded.
    traceback: str | None


@dataclass
class ErrorLogData:
    """A complete error log entry: its ID together with metadata and content."""

    id: uuid.UUID
    meta: ErrorLogMeta
    content: ErrorLogContent
Comment on lines +36 to +40
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's in the code.

72 changes: 70 additions & 2 deletions src/ai/backend/manager/models/error_logs.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
from __future__ import annotations

import uuid
from datetime import datetime
from typing import Any

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

from .base import GUID, IDColumn, metadata
from ai.backend.manager.data.error_log.types import (
ErrorLogContent,
ErrorLogData,
ErrorLogMeta,
ErrorLogSeverity,
)

from .base import GUID, Base, IDColumn, mapper_registry

__all__ = [
"error_logs",
"ErrorLogRow",
]

error_logs = sa.Table(
"error_logs",
metadata,
mapper_registry.metadata,
IDColumn(),
sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now(), index=True),
sa.Column(
Expand All @@ -26,3 +40,57 @@
sa.Column("request_status", sa.Integer, nullable=True),
sa.Column("traceback", sa.Text, nullable=True),
)


class ErrorLogRow(Base):
__table__ = error_logs

def __init__(
    self,
    severity: ErrorLogSeverity,
    source: str,
    message: str,
    context_lang: str,
    context_env: dict[str, Any],
    user: uuid.UUID | None = None,
    is_read: bool = False,
    is_cleared: bool = False,
    request_url: str | None = None,
    request_status: int | None = None,
    traceback: str | None = None,
    created_at: datetime | None = None,
) -> None:
    """Populate a new error log row from the given values.

    Args:
        severity: Severity of the error; its string value is stored.
        source: Stored as-is in the ``source`` column.
        message: Stored as-is in the ``message`` column.
        context_lang: Stored as-is in the ``context_lang`` column.
        context_env: Stored as-is in the ``context_env`` column.
        user: Optional user UUID.
        is_read: Initial value of the ``is_read`` flag.
        is_cleared: Initial value of the ``is_cleared`` flag.
        request_url: Optional request URL.
        request_status: Optional request status code.
        traceback: Optional traceback text.
        created_at: Explicit creation timestamp. When omitted, the column is
            left unset so the table's server-side default applies.
    """
    # The column holds the plain string, not the enum member.
    self.severity = severity.value
    self.source = source
    self.user = user
    self.is_read = is_read
    self.is_cleared = is_cleared
    self.message = message
    self.context_lang = context_lang
    self.context_env = context_env
    self.request_url = request_url
    self.request_status = request_status
    self.traceback = traceback
    # Compare against None explicitly: the question is "was a timestamp
    # supplied", not "is it truthy". Leaving the attribute unset lets the
    # column's ``server_default=sa.func.now()`` fill it in on insert.
    if created_at is not None:
        self.created_at = created_at
Comment on lines +46 to +75
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove the table code and migrate to SQLAlchemy 2.0. Let's address this in a follow-up.


def to_dataclass(self) -> ErrorLogData:
    """Convert this row into an ``ErrorLogData`` value object.

    The row's columns are split into an ``ErrorLogMeta`` part and an
    ``ErrorLogContent`` part; the stored severity string is turned back
    into an ``ErrorLogSeverity`` member.
    """
    row_id = self.id
    meta = ErrorLogMeta(
        created_at=self.created_at,
        user=self.user,
        source=self.source,
        is_read=self.is_read,
        is_cleared=self.is_cleared,
        context_lang=self.context_lang,
        context_env=self.context_env,
        request_url=self.request_url,
        request_status=self.request_status,
    )
    content = ErrorLogContent(
        severity=ErrorLogSeverity(self.severity),
        message=self.message,
        traceback=self.traceback,
    )
    return ErrorLogData(id=row_id, meta=meta, content=content)
9 changes: 9 additions & 0 deletions src/ai/backend/manager/repositories/error_log/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .creators import ErrorLogCreatorSpec
from .repositories import ErrorLogRepositories
from .repository import ErrorLogRepository

__all__ = (
"ErrorLogCreatorSpec",
"ErrorLogRepositories",
"ErrorLogRepository",
)
45 changes: 45 additions & 0 deletions src/ai/backend/manager/repositories/error_log/creators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from __future__ import annotations

import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import Any, override

from ai.backend.manager.data.error_log.types import ErrorLogSeverity
from ai.backend.manager.models.error_logs import ErrorLogRow
from ai.backend.manager.repositories.base import CreatorSpec

__all__ = ("ErrorLogCreatorSpec",)


@dataclass
class ErrorLogCreatorSpec(CreatorSpec[ErrorLogRow]):
    """Values for a new error log entry, buildable into an ``ErrorLogRow``.

    The fields and their defaults mirror the keyword arguments that
    ``build_row`` forwards to the ``ErrorLogRow`` constructor.
    """

    # Required values.
    severity: ErrorLogSeverity
    source: str
    message: str
    context_lang: str
    context_env: dict[str, Any]
    # Optional values.
    user: uuid.UUID | None = None
    is_read: bool = False
    is_cleared: bool = False
    request_url: str | None = None
    request_status: int | None = None
    traceback: str | None = None
    created_at: datetime | None = None

    @override
    def build_row(self) -> ErrorLogRow:
        """Return a new ``ErrorLogRow`` populated from this spec's fields."""
        row = ErrorLogRow(
            severity=self.severity,
            source=self.source,
            message=self.message,
            context_lang=self.context_lang,
            context_env=self.context_env,
            user=self.user,
            is_read=self.is_read,
            is_cleared=self.is_cleared,
            request_url=self.request_url,
            request_status=self.request_status,
            traceback=self.traceback,
            created_at=self.created_at,
        )
        return row
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .db_source import ErrorLogDBSource

__all__ = ("ErrorLogDBSource",)
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from ai.backend.common.exception import BackendAIError
from ai.backend.common.metrics.metric import DomainType, LayerType
from ai.backend.common.resilience.policies.metrics import MetricArgs, MetricPolicy
from ai.backend.common.resilience.policies.retry import BackoffStrategy, RetryArgs, RetryPolicy
from ai.backend.common.resilience.resilience import Resilience
from ai.backend.manager.data.error_log.types import ErrorLogData
from ai.backend.manager.models.error_logs import ErrorLogRow
from ai.backend.manager.repositories.base import (
Creator,
execute_creator,
)

if TYPE_CHECKING:
from ai.backend.manager.models.utils import ExtendedAsyncSAEngine

__all__ = ("ErrorLogDBSource",)

# Resilience configuration applied to ErrorLogDBSource methods: a metric
# policy tagged DB_SOURCE / ERROR_LOG_DB_SOURCE combined with a fixed-backoff
# retry policy.
# NOTE(review): ErrorLogRepository.create, which calls into this DB source,
# is wrapped by its own retry policy (max_retries=10), so retries may nest —
# confirm this is intended.
error_log_db_source_resilience = Resilience(
    policies=[
        MetricPolicy(MetricArgs(domain=DomainType.DB_SOURCE, layer=LayerType.ERROR_LOG_DB_SOURCE)),
        RetryPolicy(
            RetryArgs(
                max_retries=5,
                retry_delay=0.1,  # presumably seconds — confirm
                backoff_strategy=BackoffStrategy.FIXED,
                # presumably BackendAIError propagates without being retried — confirm
                non_retryable_exceptions=(BackendAIError,),
            )
        ),
    ]
)


class ErrorLogDBSource:
    """Database-facing operations for error logs.

    Holds the ``ExtendedAsyncSAEngine`` used to open sessions.
    """

    _db: ExtendedAsyncSAEngine

    def __init__(self, db: ExtendedAsyncSAEngine) -> None:
        self._db = db

    @error_log_db_source_resilience.apply()
    async def create(self, creator: Creator[ErrorLogRow]) -> ErrorLogData:
        """Run ``creator`` in a new session and return the resulting row as data.

        The row is converted to ``ErrorLogData`` while the session is still
        open.
        """
        async with self._db.begin_session() as session:
            created = await execute_creator(session, creator)
            data = created.row.to_dataclass()
        return data
Comment on lines +22 to +47
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Redundant resilience policies are applied at both the Repository and DBSource layers. This pattern differs from existing repositories (e.g., ImageRepository) where resilience is only applied at the Repository layer, not at the DBSource layer. Having resilience at both layers could result in excessive retries (up to 50 attempts: 10 repository retries * 5 DBSource retries). Remove the resilience decorator and policies from ErrorLogDBSource to align with the established pattern.

Copilot uses AI. Check for mistakes.
20 changes: 20 additions & 0 deletions src/ai/backend/manager/repositories/error_log/repositories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Self

if TYPE_CHECKING:
from ai.backend.manager.repositories.types import RepositoryArgs

from .repository import ErrorLogRepository


@dataclass
class ErrorLogRepositories:
    """Container for the error log repository instance."""

    repository: ErrorLogRepository

    @classmethod
    def create(cls, args: RepositoryArgs) -> Self:
        """Build the container, wiring the repository to ``args.db``."""
        error_log_repository = ErrorLogRepository(db=args.db)
        return cls(repository=error_log_repository)
46 changes: 46 additions & 0 deletions src/ai/backend/manager/repositories/error_log/repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from ai.backend.common.exception import BackendAIError
from ai.backend.common.metrics.metric import DomainType, LayerType
from ai.backend.common.resilience.policies.metrics import MetricArgs, MetricPolicy
from ai.backend.common.resilience.policies.retry import BackoffStrategy, RetryArgs, RetryPolicy
from ai.backend.common.resilience.resilience import Resilience
from ai.backend.manager.data.error_log.types import ErrorLogData
from ai.backend.manager.models.error_logs import ErrorLogRow
from ai.backend.manager.repositories.base import Creator

from .db_source import ErrorLogDBSource

if TYPE_CHECKING:
from ai.backend.manager.models.utils import ExtendedAsyncSAEngine

__all__ = ("ErrorLogRepository",)

# Resilience configuration applied to ErrorLogRepository methods: a metric
# policy tagged REPOSITORY / ERROR_LOG_REPOSITORY combined with a
# fixed-backoff retry policy.
error_log_repository_resilience = Resilience(
    policies=[
        MetricPolicy(
            MetricArgs(domain=DomainType.REPOSITORY, layer=LayerType.ERROR_LOG_REPOSITORY)
        ),
        RetryPolicy(
            RetryArgs(
                max_retries=10,
                retry_delay=0.1,  # presumably seconds — confirm
                backoff_strategy=BackoffStrategy.FIXED,
                # presumably BackendAIError propagates without being retried — confirm
                non_retryable_exceptions=(BackendAIError,),
            )
        ),
    ]
)


class ErrorLogRepository:
    """Repository for error logs; delegates to an ``ErrorLogDBSource``."""

    _db_source: ErrorLogDBSource

    def __init__(self, db: ExtendedAsyncSAEngine) -> None:
        self._db_source = ErrorLogDBSource(db)

    @error_log_repository_resilience.apply()
    async def create(self, creator: Creator[ErrorLogRow]) -> ErrorLogData:
        """Create an error log via the DB source and return it as data."""
        created = await self._db_source.create(creator)
        return created
4 changes: 4 additions & 0 deletions src/ai/backend/manager/repositories/repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from ai.backend.manager.repositories.deployment.repositories import DeploymentRepositories
from ai.backend.manager.repositories.domain.repositories import DomainRepositories
from ai.backend.manager.repositories.error_log.repositories import ErrorLogRepositories
from ai.backend.manager.repositories.group.repositories import GroupRepositories
from ai.backend.manager.repositories.huggingface_registry.repositories import (
HuggingFaceRegistryRepositories,
Expand Down Expand Up @@ -63,6 +64,7 @@ class Repositories:
container_registry: ContainerRegistryRepositories
deployment: DeploymentRepositories
domain: DomainRepositories
error_log: ErrorLogRepositories
group: GroupRepositories
image: ImageRepositories
keypair_resource_policy: KeypairResourcePolicyRepositories
Expand Down Expand Up @@ -97,6 +99,7 @@ def create(cls, args: RepositoryArgs) -> Self:
container_registry_repositories = ContainerRegistryRepositories.create(args)
deployment_repositories = DeploymentRepositories.create(args)
domain_repositories = DomainRepositories.create(args)
error_log_repositories = ErrorLogRepositories.create(args)
group_repositories = GroupRepositories.create(args)
image_repositories = ImageRepositories.create(args)
keypair_resource_policy_repositories = KeypairResourcePolicyRepositories.create(args)
Expand Down Expand Up @@ -130,6 +133,7 @@ def create(cls, args: RepositoryArgs) -> Self:
container_registry=container_registry_repositories,
deployment=deployment_repositories,
domain=domain_repositories,
error_log=error_log_repositories,
group=group_repositories,
image=image_repositories,
keypair_resource_policy=keypair_resource_policy_repositories,
Expand Down
12 changes: 12 additions & 0 deletions src/ai/backend/manager/services/error_log/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from __future__ import annotations

from .actions import CreateErrorLogAction, CreateErrorLogActionResult
from .processors import ErrorLogProcessors
from .service import ErrorLogService

__all__ = (
"CreateErrorLogAction",
"CreateErrorLogActionResult",
"ErrorLogProcessors",
"ErrorLogService",
)
8 changes: 8 additions & 0 deletions src/ai/backend/manager/services/error_log/actions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from .base import ErrorLogAction
from .create import CreateErrorLogAction, CreateErrorLogActionResult

__all__ = (
"ErrorLogAction",
"CreateErrorLogAction",
"CreateErrorLogActionResult",
)
Loading
Loading