Skip to content

Commit 1905ac0

Browse files
committed
feat: return data_source_info and data_source_detail_dict
1 parent 5067e4f commit 1905ac0

File tree

4 files changed

+115
-4
lines changed

4 files changed

+115
-4
lines changed

api/controllers/console/datasets/datasets_document.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -751,12 +751,12 @@ def get(self, dataset_id, document_id):
751751
elif metadata == "without":
752752
dataset_process_rules = DatasetService.get_process_rules(dataset_id)
753753
document_process_rules = document.dataset_process_rule.to_dict() if document.dataset_process_rule else {}
754-
data_source_info = document.data_source_detail_dict
755754
response = {
756755
"id": document.id,
757756
"position": document.position,
758757
"data_source_type": document.data_source_type,
759-
"data_source_info": data_source_info,
758+
"data_source_info": document.data_source_info_dict,
759+
"data_source_detail_dict": document.data_source_detail_dict,
760760
"dataset_process_rule_id": document.dataset_process_rule_id,
761761
"dataset_process_rule": dataset_process_rules,
762762
"document_process_rule": document_process_rules,
@@ -784,12 +784,12 @@ def get(self, dataset_id, document_id):
784784
else:
785785
dataset_process_rules = DatasetService.get_process_rules(dataset_id)
786786
document_process_rules = document.dataset_process_rule.to_dict() if document.dataset_process_rule else {}
787-
data_source_info = document.data_source_detail_dict
788787
response = {
789788
"id": document.id,
790789
"position": document.position,
791790
"data_source_type": document.data_source_type,
792-
"data_source_info": data_source_info,
791+
"data_source_info": document.data_source_info_dict,
792+
"data_source_detail_dict": document.data_source_detail_dict,
793793
"dataset_process_rule_id": document.dataset_process_rule_id,
794794
"dataset_process_rule": dataset_process_rules,
795795
"document_process_rule": document_process_rules,

api/tests/unit_tests/controllers/__init__.py

Whitespace-only changes.

api/tests/unit_tests/controllers/console/__init__.py

Whitespace-only changes.
Lines changed: 111 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,111 @@
1+
"""
2+
Test for document detail API data_source_info serialization fix.
3+
4+
This test verifies that the document detail API returns both data_source_info
5+
and data_source_detail_dict for all data_source_type values, including "local_file".
6+
"""
7+
8+
import json
9+
10+
from models.dataset import Document
11+
12+
13+
class TestDocumentDetailDataSourceInfo:
    """Checks of the ``Document`` data-source properties used by the document detail API.

    Every test builds a ``Document`` in memory from a JSON-encoded
    ``data_source_info`` string; no database session is set up in this class.
    """

    def test_data_source_info_dict_returns_raw_data(self):
        """``data_source_info_dict`` yields the decoded JSON payload for each source type."""
        # Source type -> payload that is stored JSON-encoded in ``data_source_info``.
        payload_by_type = {
            "local_file": {"file_path": "/tmp/test.txt", "size": 1024},
            "upload_file": {"upload_file_id": "test-file-id"},
            "notion_import": {"notion_page_id": "page-123", "workspace_id": "ws-456"},
            "website_crawl": {"url": "https://example.com", "job_id": "job-789"},
        }

        for source_type, payload in payload_by_type.items():
            doc = Document(
                data_source_type=source_type,
                data_source_info=json.dumps(payload),
            )

            decoded = doc.data_source_info_dict

            # The property must hand back exactly what was serialized ...
            assert decoded == payload, f"Failed for {source_type}"
            # ... and it must always be a dict.
            assert isinstance(decoded, dict)

    def test_local_file_data_source_info_without_db_context(self):
        """``data_source_info_dict`` decodes a ``local_file`` payload built purely in memory."""
        payload = {
            "file_path": "/local/path/document.txt",
            "size": 512,
            "created_at": "2024-01-01T00:00:00Z",
        }
        doc = Document(
            data_source_type="local_file",
            data_source_info=json.dumps(payload),
        )

        decoded = doc.data_source_info_dict

        assert decoded == payload
        assert isinstance(decoded, dict)

        # Keys the original test singles out as needed for pipeline mode.
        assert "file_path" in decoded
        assert "size" in decoded

    def test_notion_and_website_crawl_data_source_detail(self):
        """``data_source_detail_dict`` echoes the payload for notion_import and website_crawl."""
        # Checked in the same order as before: notion_import first, then website_crawl.
        scenarios = (
            ("notion_import", {"notion_page_id": "page-123", "workspace_id": "ws-456"}),
            ("website_crawl", {"url": "https://example.com", "job_id": "job-789"}),
        )

        for source_type, payload in scenarios:
            doc = Document(
                data_source_type=source_type,
                data_source_info=json.dumps(payload),
            )

            # For these two source types the detail dict is expected to equal the stored payload.
            detail = doc.data_source_detail_dict
            assert detail == payload

    def test_local_file_data_source_detail_dict_without_db(self):
        """``data_source_detail_dict`` is expected to be empty for a ``local_file`` document."""
        serialized = json.dumps({"file_path": "/tmp/test.txt"})
        doc = Document(data_source_type="local_file", data_source_info=serialized)

        # NOTE(review): the empty result is decided by the model property, which is
        # not visible here - confirm against ``Document.data_source_detail_dict``.
        detail = doc.data_source_detail_dict
        assert detail == {}

0 commit comments

Comments
 (0)