Skip to content

Commit 71a1a77

Browse files
committed
feat: return data_source_info and data_source_detail_dict
1 parent 5067e4f commit 71a1a77

File tree

4 files changed

+158
-4
lines changed

4 files changed

+158
-4
lines changed

api/controllers/console/datasets/datasets_document.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -751,12 +751,12 @@ def get(self, dataset_id, document_id):
751751
elif metadata == "without":
752752
dataset_process_rules = DatasetService.get_process_rules(dataset_id)
753753
document_process_rules = document.dataset_process_rule.to_dict() if document.dataset_process_rule else {}
754-
data_source_info = document.data_source_detail_dict
755754
response = {
756755
"id": document.id,
757756
"position": document.position,
758757
"data_source_type": document.data_source_type,
759-
"data_source_info": data_source_info,
758+
"data_source_info": document.data_source_info_dict,
759+
"data_source_detail_dict": document.data_source_detail_dict,
760760
"dataset_process_rule_id": document.dataset_process_rule_id,
761761
"dataset_process_rule": dataset_process_rules,
762762
"document_process_rule": document_process_rules,
@@ -784,12 +784,12 @@ def get(self, dataset_id, document_id):
784784
else:
785785
dataset_process_rules = DatasetService.get_process_rules(dataset_id)
786786
document_process_rules = document.dataset_process_rule.to_dict() if document.dataset_process_rule else {}
787-
data_source_info = document.data_source_detail_dict
788787
response = {
789788
"id": document.id,
790789
"position": document.position,
791790
"data_source_type": document.data_source_type,
792-
"data_source_info": data_source_info,
791+
"data_source_info": document.data_source_info_dict,
792+
"data_source_detail_dict": document.data_source_detail_dict,
793793
"dataset_process_rule_id": document.dataset_process_rule_id,
794794
"dataset_process_rule": dataset_process_rules,
795795
"document_process_rule": document_process_rules,

api/tests/unit_tests/controllers/__init__.py

Whitespace-only changes.

api/tests/unit_tests/controllers/console/__init__.py

Whitespace-only changes.
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
"""
2+
Test for document detail API data_source_info serialization fix.
3+
4+
These tests exercise the Document model properties behind the document detail API,
5+
data_source_info_dict and data_source_detail_dict, for the "local_file", "notion_import" and "website_crawl" data_source_type values.
6+
"""
7+
8+
import json
9+
from typing import Generic, Literal, NotRequired, TypedDict, TypeVar, Union
10+
11+
from models.dataset import Document
12+
13+
14+
class LocalFileInfo(TypedDict):
15+
file_path: str
16+
size: int
17+
created_at: NotRequired[str]
18+
19+
20+
class UploadFileInfo(TypedDict):
    """Raw ``data_source_info`` payload shape used by the ``upload_file`` case type."""

    upload_file_id: str
22+
23+
24+
class NotionImportInfo(TypedDict):
    """Raw ``data_source_info`` payload shape used by the ``notion_import`` fixtures."""

    notion_page_id: str
    workspace_id: str
27+
28+
29+
class WebsiteCrawlInfo(TypedDict):
    """Raw ``data_source_info`` payload shape used by the ``website_crawl`` fixtures."""

    url: str
    job_id: str
32+
33+
34+
# Union of every raw ``data_source_info`` payload shape modelled in this module.
RawInfo = Union[LocalFileInfo, UploadFileInfo, NotionImportInfo, WebsiteCrawlInfo]

# Type parameters for ``Case``: the data_source_type string and its payload type.
T_type = TypeVar("T_type", bound=str)
# Bound to RawInfo rather than re-spelling the same union a second time.
T_info = TypeVar("T_info", bound=RawInfo)
37+
38+
39+
class Case(TypedDict, Generic[T_type, T_info]):
    """One fixture pairing a data_source_type with its stored and expected payloads."""

    data_source_type: T_type
    # String handed to ``Document(data_source_info=...)`` by the tests.
    data_source_info: str
    # Dict the tests compare against ``Document.data_source_info_dict``.
    expected_raw: T_info
43+
44+
45+
# Case specialisations: each ties one data_source_type literal to its payload type.
LocalFileCase = Case[Literal["local_file"], LocalFileInfo]
UploadFileCase = Case[Literal["upload_file"], UploadFileInfo]
NotionImportCase = Case[Literal["notion_import"], NotionImportInfo]
WebsiteCrawlCase = Case[Literal["website_crawl"], WebsiteCrawlInfo]

# Any of the specialisations above; used as the element type of ``cases``.
AnyCase = Union[LocalFileCase, UploadFileCase, NotionImportCase, WebsiteCrawlCase]
51+
52+
53+
case_1: LocalFileCase = {
    "data_source_type": "local_file",
    "data_source_info": json.dumps({"file_path": "/tmp/test.txt", "size": 1024}),
    "expected_raw": {"file_path": "/tmp/test.txt", "size": 1024},
}


# NOTE: scratch fixture that is NOT included in ``cases`` and is therefore never
# exercised. Its payload has the LocalFileInfo keys (file_path/size), but its
# ``data_source_info`` is the placeholder "..." rather than JSON, so it must be
# given a real ``json.dumps(...)`` payload before being added to ``cases``.
case_2: LocalFileCase = {
    "data_source_type": "local_file",
    "data_source_info": "...",
    "expected_raw": {"file_path": "https://google.com", "size": 123},
}

# Fixtures iterated by test_data_source_info_dict_returns_raw_data.
cases: list[AnyCase] = [case_1]
77+
78+
79+
class TestDocumentDetailDataSourceInfo:
    """Test cases for the Document properties returned by the document detail API."""

    def test_data_source_info_dict_returns_raw_data(self):
        """data_source_info_dict returns the decoded payload for every entry in ``cases``."""
        for case in cases:
            document = Document(
                data_source_type=case["data_source_type"],
                data_source_info=case["data_source_info"],
            )

            raw_result = document.data_source_info_dict

            # Check the type first so a non-dict result fails with a clear reason.
            assert isinstance(raw_result, dict)
            assert raw_result == case["expected_raw"], f"Failed for {case['data_source_type']}"

    def test_local_file_data_source_info_without_db_context(self):
        """local_file data_source_info_dict round-trips the JSON payload, optional key included."""
        test_data: LocalFileInfo = {
            "file_path": "/local/path/document.txt",
            "size": 512,
            "created_at": "2024-01-01T00:00:00Z",
        }

        document = Document(
            data_source_type="local_file",
            data_source_info=json.dumps(test_data),
        )

        # The Document is built in memory only; no session or query is set up here.
        raw_data = document.data_source_info_dict
        assert isinstance(raw_data, dict)
        assert raw_data == test_data

        # Verify the data contains expected keys for pipeline mode
        assert "file_path" in raw_data
        assert "size" in raw_data

    def test_notion_and_website_crawl_data_source_detail(self):
        """notion_import and website_crawl return the raw payload from data_source_detail_dict."""
        notion_data: NotionImportInfo = {"notion_page_id": "page-123", "workspace_id": "ws-456"}
        document = Document(
            data_source_type="notion_import",
            data_source_info=json.dumps(notion_data),
        )
        assert document.data_source_detail_dict == notion_data

        website_data: WebsiteCrawlInfo = {"url": "https://example.com", "job_id": "job-789"}
        document = Document(
            data_source_type="website_crawl",
            data_source_info=json.dumps(website_data),
        )
        assert document.data_source_detail_dict == website_data

    def test_local_file_data_source_detail_dict_without_db(self):
        """local_file yields an empty data_source_detail_dict."""
        document = Document(
            data_source_type="local_file",
            data_source_info=json.dumps({"file_path": "/tmp/test.txt"}),
        )

        # Expected to be {} for local_file; the model-side branch is not visible in this file.
        detail_result = document.data_source_detail_dict
        assert detail_result == {}

0 commit comments

Comments
 (0)