Skip to content

Commit c442eba

Browse files
Feature/GitHub permission sync (onyx-dot-app#4996)
* github perm sync initial draft * introduce github doc sync and perm sync * remove specific start time check * Refactor GitHub connector to use SlimCheckpointOutputWrapper for improved document handling * Update GitHub sync frequency defaults from 30 minutes to 5 minutes * Add stop signal handling and progress reporting in GitHub document sync * Refactor tests for Confluence and Google Drive connectors to use a mock fetch function for document access * change the doc_sync approach * add static typing for ocument columns and where clause * remove prefix logic in connector runner * mypy fix * code review changes * mypy fix * fix review comments * add sort order * Implement merge heads migration for Alembic and update Confluence and Google Drive test * github unit tests fix * delete merge head and rebase the docmetadata field migration --------- Co-authored-by: Subash <[email protected]>
1 parent 56f16d1 commit c442eba

File tree

33 files changed

+1282
-96
lines changed

33 files changed

+1282
-96
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""add_doc_metadata_field_in_document_model
2+
3+
Revision ID: 3fc5d75723b3
4+
Revises: 2f95e36923e6
5+
Create Date: 2025-07-28 18:45:37.985406
6+
7+
"""
8+
9+
from alembic import op
10+
import sqlalchemy as sa
11+
from sqlalchemy.dialects import postgresql
12+
13+
# revision identifiers, used by Alembic.
14+
revision = "3fc5d75723b3"
15+
down_revision = "2f95e36923e6"
16+
branch_labels = None
17+
depends_on = None
18+
19+
20+
def upgrade() -> None:
21+
op.add_column(
22+
"document",
23+
sa.Column(
24+
"doc_metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True
25+
),
26+
)
27+
28+
29+
def downgrade() -> None:
30+
op.drop_column("document", "doc_metadata")

backend/ee/onyx/background/celery/tasks/doc_permission_syncing/tasks.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from onyx.db.connector import mark_cc_pair_as_permissions_synced
4848
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
4949
from onyx.db.document import get_document_ids_for_connector_credential_pair
50+
from onyx.db.document import get_documents_for_connector_credential_pair_limited_columns
5051
from onyx.db.document import upsert_document_by_connector_credential_pair
5152
from onyx.db.engine.sql_engine import get_session_with_current_tenant
5253
from onyx.db.engine.sql_engine import get_session_with_tenant
@@ -58,7 +59,9 @@
5859
from onyx.db.sync_record import insert_sync_record
5960
from onyx.db.sync_record import update_sync_record_status
6061
from onyx.db.users import batch_add_ext_perm_user_if_not_exists
62+
from onyx.db.utils import DocumentRow
6163
from onyx.db.utils import is_retryable_sqlalchemy_error
64+
from onyx.db.utils import SortOrder
6265
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
6366
from onyx.redis.redis_connector import RedisConnector
6467
from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
@@ -498,16 +501,31 @@ def connector_permission_sync_generator_task(
498501
# this is can be used to determine documents that are "missing" and thus
499502
# should no longer be accessible. The decision as to whether we should find
500503
# every document during the doc sync process is connector-specific.
501-
def fetch_all_existing_docs_fn() -> list[str]:
502-
return get_document_ids_for_connector_credential_pair(
504+
def fetch_all_existing_docs_fn(
505+
sort_order: SortOrder | None = None,
506+
) -> list[DocumentRow]:
507+
result = get_documents_for_connector_credential_pair_limited_columns(
503508
db_session=db_session,
504509
connector_id=cc_pair.connector.id,
505510
credential_id=cc_pair.credential.id,
511+
sort_order=sort_order,
506512
)
513+
return list(result)
514+
515+
def fetch_all_existing_docs_ids_fn() -> list[str]:
516+
result = get_document_ids_for_connector_credential_pair(
517+
db_session=db_session,
518+
connector_id=cc_pair.connector.id,
519+
credential_id=cc_pair.credential.id,
520+
)
521+
return result
507522

508523
doc_sync_func = sync_config.doc_sync_config.doc_sync_func
509524
document_external_accesses = doc_sync_func(
510-
cc_pair, fetch_all_existing_docs_fn, callback
525+
cc_pair,
526+
fetch_all_existing_docs_fn,
527+
fetch_all_existing_docs_ids_fn,
528+
callback,
511529
)
512530

513531
task_logger.info(

backend/ee/onyx/configs/app_configs.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,19 @@
7171
)
7272

7373

74+
#####
75+
# GitHub
76+
#####
77+
# In seconds, default is 5 minutes
78+
GITHUB_PERMISSION_DOC_SYNC_FREQUENCY = int(
79+
os.environ.get("GITHUB_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
80+
)
81+
# In seconds, default is 5 minutes
82+
GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY = int(
83+
os.environ.get("GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
84+
)
85+
86+
7487
#####
7588
# Slack
7689
#####

backend/ee/onyx/external_permissions/confluence/doc_sync.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from collections.abc import Generator
77

88
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
9+
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
910
from ee.onyx.external_permissions.utils import generic_doc_sync
1011
from onyx.access.models import DocExternalAccess
1112
from onyx.configs.constants import DocumentSource
@@ -25,6 +26,7 @@
2526
def confluence_doc_sync(
2627
cc_pair: ConnectorCredentialPair,
2728
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
29+
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
2830
callback: IndexingHeartbeatInterface | None,
2931
) -> Generator[DocExternalAccess, None, None]:
3032
"""
@@ -43,7 +45,7 @@ def confluence_doc_sync(
4345

4446
yield from generic_doc_sync(
4547
cc_pair=cc_pair,
46-
fetch_all_existing_docs_fn=fetch_all_existing_docs_fn,
48+
fetch_all_existing_docs_ids_fn=fetch_all_existing_docs_ids_fn,
4749
callback=callback,
4850
doc_source=DocumentSource.CONFLUENCE,
4951
slim_connector=confluence_connector,

0 commit comments

Comments
 (0)