Commit 212a640

feat(fai): add chunk_count to content hash table (#5819)
1 parent: fa07320

10 files changed (+83 additions, -23 deletions)

fern/apis/fai/openapi.json

Lines changed: 17 additions & 2 deletions
@@ -4701,6 +4701,10 @@
     },
     "ContentHashEntry": {
       "properties": {
+        "domain": {
+          "type": "string",
+          "title": "Domain"
+        },
         "parent_id": {
           "type": "string",
           "title": "Parent Id"
@@ -4709,6 +4713,10 @@
           "type": "string",
           "title": "Content Hash"
         },
+        "chunk_count": {
+          "type": "integer",
+          "title": "Chunk Count"
+        },
         "indexed_at": {
           "anyOf": [
             {
@@ -4723,8 +4731,10 @@
       },
       "type": "object",
       "required": [
+        "domain",
         "parent_id",
-        "content_hash"
+        "content_hash",
+        "chunk_count"
       ],
       "title": "ContentHashEntry",
       "description": "A single content hash entry."
@@ -4738,12 +4748,17 @@
         "content_hash": {
           "type": "string",
           "title": "Content Hash"
+        },
+        "chunk_count": {
+          "type": "integer",
+          "title": "Chunk Count"
         }
       },
       "type": "object",
       "required": [
         "parent_id",
-        "content_hash"
+        "content_hash",
+        "chunk_count"
       ],
       "title": "ContentHashUpsertEntry",
       "description": "Entry for upserting a content hash (without timestamps)."

packages/fai-sdk/src/api/resources/contentHash/client/Client.ts

Lines changed: 2 additions & 1 deletion
@@ -140,7 +140,8 @@ export class ContentHash {
  *     await client.contentHash.batchUpsertContentHashes("domain", {
  *         entries: [{
  *                 parent_id: "parent_id",
- *                 content_hash: "content_hash"
+ *                 content_hash: "content_hash",
+ *                 chunk_count: 1
  *             }]
  *     })
  */

packages/fai-sdk/src/api/resources/contentHash/client/requests/BatchUpsertContentHashesRequest.ts

Lines changed: 2 additions & 1 deletion
@@ -9,7 +9,8 @@ import * as FernAI from "../../../../index.js";
  * {
  *     entries: [{
  *             parent_id: "parent_id",
- *             content_hash: "content_hash"
+ *             content_hash: "content_hash",
+ *             chunk_count: 1
  *         }]
  * }
  */

packages/fai-sdk/src/api/types/ContentHashEntry.ts

Lines changed: 2 additions & 0 deletions
@@ -6,7 +6,9 @@
  * A single content hash entry.
  */
 export interface ContentHashEntry {
+    domain: string;
     parent_id: string;
     content_hash: string;
+    chunk_count: number;
     indexed_at?: string;
 }

packages/fai-sdk/src/api/types/ContentHashUpsertEntry.ts

Lines changed: 1 addition & 0 deletions
@@ -8,4 +8,5 @@
 export interface ContentHashUpsertEntry {
     parent_id: string;
     content_hash: string;
+    chunk_count: number;
 }
servers/fai: new Alembic migration (revision bef764295d1b, add_chunk_count_to_content_hashes)

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+"""add_chunk_count_to_content_hashes
+
+Revision ID: bef764295d1b
+Revises: create_content_hashes_table
+Create Date: 2025-12-10 16:52:03.487012
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'bef764295d1b'
+down_revision: Union[str, Sequence[str], None] = 'create_content_hashes_table'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    op.add_column('content_hashes', sa.Column('chunk_count', sa.Integer(), nullable=False, server_default='0', comment='Number of chunks created during indexing'))
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    op.drop_column('content_hashes', 'chunk_count')
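Because the new column is NOT NULL, the server_default='0' backfills any rows already present in content_hashes when the migration runs. A minimal post-upgrade check, sketched with SQLAlchemy's inspector (the database URL is a placeholder assumption, not from the commit):

# Sketch only: after running `alembic upgrade head`, confirm the column exists.
from sqlalchemy import create_engine, inspect

engine = create_engine("postgresql://localhost/fai")  # hypothetical URL
columns = {c["name"]: c for c in inspect(engine).get_columns("content_hashes")}

assert "chunk_count" in columns          # column was added
assert columns["chunk_count"]["nullable"] is False  # NOT NULL as declared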

servers/fai/src/fai/models/api/content_hash_api.py

Lines changed: 3 additions & 0 deletions
@@ -6,8 +6,10 @@
 class ContentHashEntry(BaseModel):
     """A single content hash entry."""
 
+    domain: str
     parent_id: str
     content_hash: str
+    chunk_count: int
     indexed_at: str | None = None
 
 
@@ -16,6 +18,7 @@ class ContentHashUpsertEntry(BaseModel):
 
     parent_id: str
     content_hash: str
+    chunk_count: int
 
 
 class BatchGetContentHashesRequest(BaseModel):
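Since chunk_count is declared without a default, omitting it now fails validation. A short sketch against the updated Pydantic models (illustrative values only):

from pydantic import ValidationError

from fai.models.api.content_hash_api import ContentHashUpsertEntry

# Valid: all three required fields are present.
entry = ContentHashUpsertEntry(parent_id="page-1", content_hash="abc123", chunk_count=4)

# Invalid: chunk_count is now required, so this raises.
try:
    ContentHashUpsertEntry(parent_id="page-1", content_hash="abc123")
except ValidationError as exc:
    print(exc)  # reports the missing chunk_count field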

servers/fai/src/fai/models/db/content_hash_db.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,6 @@
 from datetime import datetime
 
-from sqlalchemy import Column, DateTime, Index, String
+from sqlalchemy import Column, DateTime, Index, Integer, String
 
 from fai.models.base import Base
 
@@ -17,6 +17,7 @@ class ContentHashDb(Base):
     parent_id = Column(String, primary_key=True, nullable=False, comment="FDR PageId or EndpointId")
 
     content_hash = Column(String, nullable=False, comment="SHA-256 hash of page markdown or endpoint document content")
+    chunk_count = Column(Integer, nullable=False, comment="Number of chunks created during indexing")
 
     # Metadata
     indexed_at = Column(DateTime, nullable=False, default=datetime.utcnow)
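At the ORM layer, every row now carries a chunk count next to its hash. A construction sketch mirroring the updated tests (placeholder values):

from fai.models.db.content_hash_db import ContentHashDb

row = ContentHashDb(
    domain="docs.example.com",   # placeholder domain
    parent_id="page-1",          # FDR PageId or EndpointId
    content_hash="3b8a...e9f2",  # SHA-256 digest, truncated placeholder
    chunk_count=12,              # chunks produced when this content was indexed
)
# Persist inside an async session, as the tests do:
#     session.add(row)
#     await session.commit()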

servers/fai/src/fai/routes/content_hash.py

Lines changed: 9 additions & 1 deletion
@@ -57,8 +57,10 @@ async def batch_get_content_hashes(
 
     entries = [
         ContentHashEntry(
+            domain=h.domain,
             parent_id=h.parent_id,
             content_hash=h.content_hash,
+            chunk_count=h.chunk_count,
             indexed_at=h.indexed_at.isoformat(),
         )
         for h in hashes
@@ -99,9 +101,15 @@ async def batch_upsert_content_hashes(
 
         if existing:
             existing.content_hash = entry.content_hash
+            existing.chunk_count = entry.chunk_count
             existing.updated_at = datetime.utcnow()
         else:
-            new_hash = ContentHashDb(domain=domain, parent_id=entry.parent_id, content_hash=entry.content_hash)
+            new_hash = ContentHashDb(
+                domain=domain,
+                parent_id=entry.parent_id,
+                content_hash=entry.content_hash,
+                chunk_count=entry.chunk_count,
+            )
             db.add(new_hash)
 
         upserted_count += 1
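End to end, a caller now supplies chunk_count on every upsert entry and gets it back on reads. A sketch of hitting the upsert route directly with httpx (the host and port are assumptions; the path and body shape come from the route and models above):

import httpx

# Hypothetical base URL; only the /content-hash/{domain}/batch-upsert path
# is taken from the route itself.
response = httpx.post(
    "http://localhost:8000/content-hash/docs.example.com/batch-upsert",
    json={
        "entries": [
            {"parent_id": "page-1", "content_hash": "hash1", "chunk_count": 5},
        ]
    },
)
response.raise_for_status()
print(response.json())  # response body shape is not shown in this diff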

servers/fai/tests/routes/test_content_hash.py

Lines changed: 17 additions & 17 deletions
@@ -29,9 +29,9 @@ async def test_get_all_hashes_for_domain(self, test_client: TestClient, test_ses
         domain = "test-domain"
 
         # Insert test data
-        hash1 = ContentHashDb(domain=domain, parent_id="page-1", content_hash="hash1")
-        hash2 = ContentHashDb(domain=domain, parent_id="page-2", content_hash="hash2")
-        hash3 = ContentHashDb(domain="other-domain", parent_id="page-3", content_hash="hash3")
+        hash1 = ContentHashDb(domain=domain, parent_id="page-1", content_hash="hash1", chunk_count=5)
+        hash2 = ContentHashDb(domain=domain, parent_id="page-2", content_hash="hash2", chunk_count=10)
+        hash3 = ContentHashDb(domain="other-domain", parent_id="page-3", content_hash="hash3", chunk_count=15)
         test_session.add_all([hash1, hash2, hash3])
         await test_session.commit()
 
@@ -52,9 +52,9 @@ async def test_get_specific_hashes(self, test_client: TestClient, test_session:
         domain = "test-domain"
 
         # Insert test data
-        hash1 = ContentHashDb(domain=domain, parent_id="page-1", content_hash="hash1")
-        hash2 = ContentHashDb(domain=domain, parent_id="page-2", content_hash="hash2")
-        hash3 = ContentHashDb(domain=domain, parent_id="page-3", content_hash="hash3")
+        hash1 = ContentHashDb(domain=domain, parent_id="page-1", content_hash="hash1", chunk_count=5)
+        hash2 = ContentHashDb(domain=domain, parent_id="page-2", content_hash="hash2", chunk_count=10)
+        hash3 = ContentHashDb(domain=domain, parent_id="page-3", content_hash="hash3", chunk_count=15)
         test_session.add_all([hash1, hash2, hash3])
         await test_session.commit()
 
@@ -94,8 +94,8 @@ async def test_upsert_new_hashes(self, test_client: TestClient, test_session: As
             f"/content-hash/{domain}/batch-upsert",
             json={
                 "entries": [
-                    {"parent_id": "page-1", "content_hash": "hash1"},
-                    {"parent_id": "page-2", "content_hash": "hash2"},
+                    {"parent_id": "page-1", "content_hash": "hash1", "chunk_count": 5},
+                    {"parent_id": "page-2", "content_hash": "hash2", "chunk_count": 10},
                 ]
             },
         )
@@ -119,14 +119,14 @@ async def test_upsert_update_existing_hashes(self, test_client: TestClient, test
         domain = "test-domain"
 
         # Insert initial hash
-        initial = ContentHashDb(domain=domain, parent_id="page-1", content_hash="old-hash")
+        initial = ContentHashDb(domain=domain, parent_id="page-1", content_hash="old-hash", chunk_count=5)
         test_session.add(initial)
         await test_session.commit()
 
         # Update with new hash
         response = test_client.post(
             f"/content-hash/{domain}/batch-upsert",
-            json={"entries": [{"parent_id": "page-1", "content_hash": "new-hash"}]},
+            json={"entries": [{"parent_id": "page-1", "content_hash": "new-hash", "chunk_count": 10}]},
         )
 
         assert response.status_code == 200
@@ -146,7 +146,7 @@ async def test_upsert_mixed(self, test_client: TestClient, test_session: AsyncSe
         domain = "test-domain"
 
         # Insert existing hash
-        existing = ContentHashDb(domain=domain, parent_id="page-1", content_hash="old-hash")
+        existing = ContentHashDb(domain=domain, parent_id="page-1", content_hash="old-hash", chunk_count=5)
         test_session.add(existing)
         await test_session.commit()
 
@@ -155,8 +155,8 @@ async def test_upsert_mixed(self, test_client: TestClient, test_session: AsyncSe
             f"/content-hash/{domain}/batch-upsert",
             json={
                 "entries": [
-                    {"parent_id": "page-1", "content_hash": "updated-hash"},
-                    {"parent_id": "page-2", "content_hash": "new-hash"},
+                    {"parent_id": "page-1", "content_hash": "updated-hash", "chunk_count": 10},
+                    {"parent_id": "page-2", "content_hash": "new-hash", "chunk_count": 15},
                 ]
             },
         )
@@ -184,9 +184,9 @@ async def test_delete_hashes(self, test_client: TestClient, test_session: AsyncS
         domain = "test-domain"
 
         # Insert test hashes
-        hash1 = ContentHashDb(domain=domain, parent_id="page-1", content_hash="hash1")
-        hash2 = ContentHashDb(domain=domain, parent_id="page-2", content_hash="hash2")
-        hash3 = ContentHashDb(domain=domain, parent_id="page-3", content_hash="hash3")
+        hash1 = ContentHashDb(domain=domain, parent_id="page-1", content_hash="hash1", chunk_count=5)
+        hash2 = ContentHashDb(domain=domain, parent_id="page-2", content_hash="hash2", chunk_count=10)
+        hash3 = ContentHashDb(domain=domain, parent_id="page-3", content_hash="hash3", chunk_count=15)
         test_session.add_all([hash1, hash2, hash3])
         await test_session.commit()
 
@@ -215,7 +215,7 @@ async def test_delete_empty_list(self, test_client: TestClient, test_session: As
         domain = "test-domain"
 
         # Insert test hash
-        hash1 = ContentHashDb(domain=domain, parent_id="page-1", content_hash="hash1")
+        hash1 = ContentHashDb(domain=domain, parent_id="page-1", content_hash="hash1", chunk_count=5)
         test_session.add(hash1)
         await test_session.commit()
 