-
Notifications
You must be signed in to change notification settings - Fork 10
Collections: Add dynamic batching #718
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
614801c
f8f9c2f
f9b3365
a074e36
1106abc
ff0726f
c7da9f0
8437442
f94998f
ed1ee38
5ec17be
3246e88
292be60
86e555c
c1f997c
5295849
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| """add columns to collection job and documents table | ||
|
|
||
| Revision ID: 050 | ||
| Revises: 049 | ||
| Create Date: 2026-03-25 10:09:47.318575 | ||
|
|
||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
|
|
||
|
|
||
# revision identifiers, used by Alembic.
# These module-level names are read by Alembic's migration machinery and must
# keep exactly these names and values.
revision = "050"  # this migration's ID (see the migration docstring above)
down_revision = "049"  # migration applied immediately before this one
branch_labels = None  # not part of a named branch
depends_on = None  # no cross-branch dependencies
|
|
||
|
|
||
def upgrade():
    """Apply migration 050.

    Adds nullable tracking columns:
      * ``collection_jobs.docs_num``   -- number of documents in the job
      * ``collection_jobs.total_size`` -- combined upload size for the job
      * ``document.file_size``         -- size of a single document in bytes

    All columns are nullable so existing rows need no backfill.
    """
    # The two collection_jobs columns share everything but name and comment,
    # so drive them from a small spec table.
    # NOTE(review): sizes are in bytes; sa.Integer is 32-bit signed on most
    # backends (~2 GiB max) -- confirm expected upload sizes fit.
    job_columns = (
        ("docs_num", "Total number of documents to be processed in this job"),
        ("total_size", "Total size of documents being uploaded to collection"),
    )
    for column_name, column_comment in job_columns:
        op.add_column(
            "collection_jobs",
            sa.Column(
                column_name,
                sa.Integer(),
                nullable=True,
                comment=column_comment,
            ),
        )

    op.add_column(
        "document",
        sa.Column(
            "file_size",
            sa.Integer(),
            nullable=True,
            comment="Size of the document in bytes",
        ),
    )
|
|
||
|
|
||
def downgrade():
    """Revert migration 050.

    Drops the columns added by :func:`upgrade`, in the reverse order of
    their creation.
    """
    for table_name, column_name in (
        ("document", "file_size"),
        ("collection_jobs", "total_size"),
        ("collection_jobs", "docs_num"),
    ):
        op.drop_column(table_name, column_name)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,7 @@ | |
| CollectionCrud, | ||
| CollectionJobCrud, | ||
| DocumentCollectionCrud, | ||
| DocumentCrud, | ||
| ) | ||
| from app.core.cloud import get_cloud_storage | ||
| from app.models import ( | ||
|
|
@@ -95,12 +96,25 @@ def create_collection( | |
| if request.name: | ||
| ensure_unique_name(session, current_user.project_.id, request.name) | ||
|
|
||
| # Calculate total size of all documents | ||
| document_crud = DocumentCrud(session, current_user.project_.id) | ||
| total_size = 0 | ||
| for doc_id in request.documents: | ||
| doc = document_crud.read_one(doc_id) | ||
|
||
| total_size += doc.file_size or 0 | ||
|
|
||
| logger.info( | ||
| f"[create_collection] Calculated total size | {{'total_documents': {len(request.documents)}, 'total_size_bytes': {total_size}, 'total_size_mb': {round(total_size / (1024 * 1024), 2)}}}" | ||
|
||
| ) | ||
|
|
||
| collection_job_crud = CollectionJobCrud(session, current_user.project_.id) | ||
| collection_job = collection_job_crud.create( | ||
| CollectionJobCreate( | ||
| action_type=CollectionActionType.CREATE, | ||
| project_id=current_user.project_.id, | ||
| status=CollectionJobStatus.PENDING, | ||
| docs_num=len(request.documents), | ||
| total_size=total_size, | ||
| ) | ||
| ) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,8 @@ | ||
| from datetime import datetime | ||
| from typing import Any | ||
| from uuid import UUID, uuid4 | ||
|
|
||
| from pydantic import model_serializer | ||
|
||
| from sqlmodel import Field, SQLModel | ||
|
|
||
| from app.core.util import now | ||
|
|
@@ -41,6 +43,11 @@ class Document(DocumentBase, table=True): | |
| default=False, | ||
| sa_column_kwargs={"comment": "Soft delete flag"}, | ||
| ) | ||
| file_size: int | None = Field( | ||
| default=None, | ||
| description="The size of the document in bytes", | ||
| sa_column_kwargs={"comment": "Size of the document in bytes"}, | ||
| ) | ||
|
|
||
| # Foreign keys | ||
| source_document_id: UUID | None = Field( | ||
|
|
@@ -80,9 +87,6 @@ class DocumentPublic(DocumentBase): | |
| updated_at: datetime = Field( | ||
| description="The timestamp when the document was last updated" | ||
| ) | ||
| signed_url: str | None = Field( | ||
| default=None, description="A signed URL for accessing the document" | ||
| ) | ||
|
|
||
|
|
||
| class TransformedDocumentPublic(DocumentPublic): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,7 +10,7 @@ | |
|
|
||
| from app.crud import DocumentCrud, CollectionCrud | ||
| from app.api.deps import SessionDep | ||
| from app.models import DocumentCollection, Collection, CollectionPublic | ||
| from app.models import DocumentCollection, Collection, CollectionPublic, Document | ||
|
|
||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
@@ -56,24 +56,51 @@ def extract_error_message(err: Exception) -> str: | |
|
|
||
|
|
||
def batch_documents(
    document_crud: DocumentCrud, documents: List[UUID]
) -> List[List[Document]]:
    """
    Batch documents dynamically based on size and count limits.

    Documents are loaded one at a time via ``document_crud.read_one`` and
    packed greedily, in input order, into batches. A new batch is started
    when adding the next document would either:

    - push the batch's total size past 30 MB (31,457,280 bytes), or
    - push the batch's document count past 200.

    Edge cases:
    - A document whose ``file_size`` is ``None`` counts as 0 bytes.
    - A single document larger than the size limit still gets a batch of
      its own -- a batch is never left empty.
    - An empty ``documents`` list yields an empty result.

    Args:
        document_crud: CRUD accessor used to load each document by ID.
        documents: Ordered document IDs to batch.

    Returns:
        List of document batches, preserving the input order.
    """
    MAX_BATCH_SIZE_BYTES = 30 * 1024 * 1024  # 30 MB in bytes
    MAX_BATCH_COUNT = 200  # Maximum documents per batch

    log = logging.getLogger(__name__)

    docs_batches: List[List[Document]] = []
    current_batch: List[Document] = []
    current_batch_size = 0

    for doc_id in documents:
        # NOTE(review): one query per document (N+1). DocumentCrud.read_each
        # could bulk-load if this becomes a bottleneck -- confirm its ordering
        # and missing-ID semantics before switching.
        doc = document_crud.read_one(doc_id)
        doc_size = doc.file_size or 0

        would_exceed_size = (current_batch_size + doc_size) > MAX_BATCH_SIZE_BYTES
        would_exceed_count = len(current_batch) >= MAX_BATCH_COUNT

        if current_batch and (would_exceed_size or would_exceed_count):
            # Lazy %-style args so the message is only formatted when the
            # INFO level is actually enabled.
            log.info(
                "[batch_documents] Batch completed | {'batch_num': %d, "
                "'doc_count': %d, 'batch_size_bytes': %d, 'batch_size_mb': %s}",
                len(docs_batches) + 1,
                len(current_batch),
                current_batch_size,
                round(current_batch_size / (1024 * 1024), 2),
            )
            docs_batches.append(current_batch)
            current_batch = []
            current_batch_size = 0

        current_batch.append(doc)
        current_batch_size += doc_size

    # Flush the final, partially filled batch.
    if current_batch:
        docs_batches.append(current_batch)

    log.info(
        "[batch_documents] Batching complete | {'total_batches': %d, "
        "'total_documents': %d}",
        len(docs_batches),
        len(documents),
    )
    return docs_batches
|
|
||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.