✨ Users can now name a knowledge base freely, without having to follow Elasticsearch's index naming rules

Jasonxia007 · Jasonxia007 · commit ca7a88f6f5b5 · 2025-12-11T09:07:30.000+08:00
diff --git a/backend/services/vectordatabase_service.py b/backend/services/vectordatabase_service.py
@@ -118,10 +118,12 @@ def _rethrow_or_plain(exc: Exception) -> None:
     msg = str(exc)
     try:
         parsed = json.loads(msg)
-        if isinstance(parsed, dict) and parsed.get("error_code"):
-            raise Exception(json.dumps(parsed, ensure_ascii=False))
     except Exception:
-        pass
+        raise Exception(msg)
+
+    if isinstance(parsed, dict) and parsed.get("error_code"):
+        raise Exception(json.dumps(parsed, ensure_ascii=False))
+
     raise Exception(msg)
 
 
diff --git a/test/backend/app/test_vectordatabase_app.py b/test/backend/app/test_vectordatabase_app.py
@@ -6,7 +6,7 @@
 import os
 import sys
 import pytest
-from unittest.mock import patch, MagicMock, ANY
+from unittest.mock import patch, MagicMock, ANY, AsyncMock
 from fastapi.testclient import TestClient
 from fastapi import FastAPI
 
@@ -1368,6 +1368,108 @@ async def test_health_check_exception(vdb_core_mock):
         mock_health.assert_called_once_with(ANY)
 
 
+@pytest.mark.asyncio
+async def test_get_document_error_info_not_found(vdb_core_mock, auth_data):
+    """
+    Test document error info when document is not found.
+    """
+    with patch("backend.apps.vectordatabase_app.get_all_files_status", new=AsyncMock(return_value={})):
+        response = client.get(
+            f"/indices/{auth_data['index_name']}/documents/missing_doc/error-info",
+            headers=auth_data["auth_header"],
+        )
+
+    assert response.status_code == 404
+    assert "not found" in response.json()["detail"]
+
+
+@pytest.mark.asyncio
+async def test_get_document_error_info_no_task_id(auth_data):
+    """
+    Test document error info when task id is empty.
+    """
+    with patch(
+        "backend.apps.vectordatabase_app.get_all_files_status",
+        new=AsyncMock(
+            return_value={
+                "doc-1": {
+                    "latest_task_id": ""
+                }
+            }
+        ),
+    ), patch("backend.apps.vectordatabase_app.get_redis_service") as mock_redis:
+        response = client.get(
+            "/indices/test_index/documents/doc-1/error-info",
+            headers=auth_data["auth_header"],
+        )
+
+    assert response.status_code == 200
+    assert response.json() == {"status": "success", "error_code": None}
+    mock_redis.assert_not_called()
+
+
+@pytest.mark.asyncio
+async def test_get_document_error_info_json_error_code(auth_data):
+    """
+    Test document error info JSON parsing for error_code.
+    """
+    redis_mock = MagicMock()
+    redis_mock.get_error_info.return_value = '{"error_code": "INVALID_FORMAT"}'
+
+    with patch(
+        "backend.apps.vectordatabase_app.get_all_files_status",
+        new=AsyncMock(
+            return_value={
+                "doc-1": {
+                    "latest_task_id": "task-123"
+                }
+            }
+        ),
+    ), patch(
+        "backend.apps.vectordatabase_app.get_redis_service",
+        return_value=redis_mock,
+    ):
+        response = client.get(
+            "/indices/test_index/documents/doc-1/error-info",
+            headers=auth_data["auth_header"],
+        )
+
+    assert response.status_code == 200
+    assert response.json() == {"status": "success", "error_code": "INVALID_FORMAT"}
+    redis_mock.get_error_info.assert_called_once_with("task-123")
+
+
+@pytest.mark.asyncio
+async def test_get_document_error_info_regex_error_code(auth_data):
+    """
+    Test document error info regex extraction when JSON parsing fails.
+    """
+    redis_mock = MagicMock()
+    redis_mock.get_error_info.return_value = "oops {'error_code': 'TIMEOUT_ERROR'}"
+
+    with patch(
+        "backend.apps.vectordatabase_app.get_all_files_status",
+        new=AsyncMock(
+            return_value={
+                "doc-1": {
+                    "latest_task_id": "task-999"
+                }
+            }
+        ),
+    ), patch(
+        "backend.apps.vectordatabase_app.get_redis_service",
+        return_value=redis_mock,
+    ):
+        response = client.get(
+            "/indices/test_index/documents/doc-1/error-info",
+            headers=auth_data["auth_header"],
+        )
+
+    assert response.status_code == 200
+    assert response.json() == {"status": "success", "error_code": "TIMEOUT_ERROR"}
+    redis_mock.get_error_info.assert_called_once_with("task-999")
+
+
 @pytest.mark.asyncio
 async def test_health_check_timeout_exception(vdb_core_mock):
     """
@@ -1562,6 +1664,59 @@ async def test_hybrid_search_value_error(vdb_core_mock, auth_data):
         assert response.json() == {"detail": "Query text is required"}
 
 
+@pytest.mark.asyncio
+async def test_get_index_chunks_value_error(vdb_core_mock):
+    """
+    Test get_index_chunks maps ValueError to 404.
+    """
+    index_name = "test_index"
+    with patch("backend.apps.vectordatabase_app.get_vector_db_core", return_value=vdb_core_mock), \
+        patch("backend.apps.vectordatabase_app.get_index_name_by_knowledge_name", return_value="resolved_index"), \
+        patch("backend.apps.vectordatabase_app.ElasticSearchService.get_index_chunks") as mock_get_chunks:
+
+        mock_get_chunks.side_effect = ValueError("Unknown index")
+
+        response = client.post(f"/indices/{index_name}/chunks")
+
+    assert response.status_code == 404
+    assert response.json() == {"detail": "Unknown index"}
+    mock_get_chunks.assert_called_once_with(
+        index_name="resolved_index",
+        page=None,
+        page_size=None,
+        path_or_url=None,
+        vdb_core=ANY,
+    )
+
+
+@pytest.mark.asyncio
+async def test_create_chunk_value_error(vdb_core_mock, auth_data):
+    """
+    Test create_chunk maps ValueError to 404.
+    """
+    with patch("backend.apps.vectordatabase_app.get_vector_db_core", return_value=vdb_core_mock), \
+        patch("backend.apps.vectordatabase_app.get_current_user_id", return_value=(auth_data["user_id"], auth_data["tenant_id"])), \
+        patch("backend.apps.vectordatabase_app.get_index_name_by_knowledge_name", return_value=auth_data["index_name"]), \
+        patch("backend.apps.vectordatabase_app.ElasticSearchService.create_chunk") as mock_create:
+
+        mock_create.side_effect = ValueError("Invalid chunk payload")
+
+        payload = {
+            "content": "Hello world",
+            "path_or_url": "doc-1",
+        }
+
+        response = client.post(
+            f"/indices/{auth_data['index_name']}/chunk",
+            json=payload,
+            headers=auth_data["auth_header"],
+        )
+
+    assert response.status_code == 404
+    assert response.json() == {"detail": "Invalid chunk payload"}
+    mock_create.assert_called_once()
+
+
 @pytest.mark.asyncio
 async def test_hybrid_search_exception(vdb_core_mock, auth_data):
     """
diff --git a/test/backend/data_process/test_tasks.py b/test/backend/data_process/test_tasks.py
@@ -1012,6 +1012,13 @@ def test_extract_error_code_parses_detail_and_regex_and_unknown():
     assert extract_error_code("no code here") == "unknown_error"
 
 
+def test_extract_error_code_top_level_key():
+    from backend.data_process.tasks import extract_error_code
+
+    payload = json.dumps({"error_code": "top_level"})
+    assert extract_error_code(payload) == "top_level"
+
+
 def test_save_error_to_redis_branches(monkeypatch):
     from backend.data_process.tasks import save_error_to_redis
 
@@ -1112,6 +1119,58 @@ def test_process_error_fallback_when_save_error_raises(monkeypatch, tmp_path):
     ) or self.states == []
 
 
+def test_process_error_truncates_reason_when_no_error_code(monkeypatch, tmp_path):
+    """process should truncate long messages when extract_error_code is falsy"""
+    tasks, fake_ray = import_tasks_with_fake_ray(monkeypatch, initialized=True)
+
+    long_msg = "x" * 250
+    error_json = json.dumps({"message": long_msg})
+
+    # Provide actor but make ray.get raise inside the try block
+    class FakeActor:
+        def __init__(self):
+            self.process_file = types.SimpleNamespace(remote=lambda *a, **k: "ref_err")
+            self.store_chunks_in_redis = types.SimpleNamespace(
+                remote=lambda *a, **k: None)
+
+    monkeypatch.setattr(tasks, "get_ray_actor", lambda: FakeActor())
+    fake_ray.get = lambda *_: (_ for _ in ()).throw(Exception(error_json))
+    # Force extract_error_code to return None so truncation path executes
+    monkeypatch.setattr(tasks, "extract_error_code", lambda *a, **k: None)
+
+    calls: list[str] = []
+
+    def save_and_capture(task_id, reason, start_time):
+        calls.append(reason)
+
+    monkeypatch.setattr(tasks, "save_error_to_redis", save_and_capture)
+
+    # Ensure source file exists so FileNotFound is not raised before ray.get
+    f = tmp_path / "exists.txt"
+    f.write_text("data")
+
+    self = FakeSelf("trunc-proc")
+    with pytest.raises(Exception):
+        tasks.process(
+            self,
+            source=str(f),
+            source_type="local",
+            chunking_strategy="basic",
+            index_name="idx",
+            original_filename="f.txt",
+        )
+
+    # Captured reason should be truncated because error_code is falsy
+    assert len(calls) >= 1
+    truncated_reason = calls[-1]
+    assert truncated_reason.endswith("...")
+    assert len(truncated_reason) <= 203
+    assert any(
+        s.get("meta", {}).get("stage") == "text_extraction_failed"
+        for s in self.states
+    )
+
+
 def test_forward_cancel_check_warning_then_continue(monkeypatch):
     tasks, _ = import_tasks_with_fake_ray(monkeypatch)
     monkeypatch.setattr(tasks, "ELASTICSEARCH_SERVICE", "http://api")
@@ -1197,6 +1256,58 @@ def post(self, *a, **k):
     assert "detail_err" in str(exc.value)
 
 
+def test_forward_index_documents_regex_error_code(monkeypatch):
+    tasks, _ = import_tasks_with_fake_ray(monkeypatch)
+    monkeypatch.setattr(tasks, "ELASTICSEARCH_SERVICE", "http://api")
+    monkeypatch.setattr(tasks, "get_file_size", lambda *a, **k: 0)
+
+    class FakeResponse:
+        status = 500
+
+        async def text(self):
+            # Non-JSON body: the double-quoted "error_code":"..." pair is what the regex fallback should pick up
+            return 'oops "error_code":"regex_branch"'
+
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, *a):
+            return False
+
+    class FakeSession:
+        def __init__(self, *a, **k):
+            pass
+
+        async def __aenter__(self):
+            return self
+
+        async def __aexit__(self, *a):
+            return False
+
+        def post(self, *a, **k):
+            return FakeResponse()
+
+    fake_aiohttp = types.SimpleNamespace(
+        TCPConnector=lambda verify_ssl=False: None,
+        ClientTimeout=lambda total=None: None,
+        ClientSession=FakeSession,
+        ClientConnectorError=Exception,
+        ClientResponseError=Exception,
+    )
+    monkeypatch.setattr(tasks, "aiohttp", fake_aiohttp)
+    monkeypatch.setattr(tasks, "run_async", _run_coro)
+
+    self = FakeSelf("regex-err")
+    with pytest.raises(Exception) as exc:
+        tasks.forward(
+            self,
+            processed_data={"chunks": [{"content": "x", "metadata": {}}]},
+            index_name="idx",
+            source="/a.txt",
+        )
+    assert "regex_branch" in str(exc.value)
+
+
 def test_forward_index_documents_client_connector_error(monkeypatch):
     tasks, _ = import_tasks_with_fake_ray(monkeypatch)
     monkeypatch.setattr(tasks, "ELASTICSEARCH_SERVICE", "http://api")
@@ -1273,6 +1384,69 @@ def post(self, *a, **k):
     assert "Failed to connect to API" in str(exc.value) or "timeout" in str(exc.value).lower()
 
 
+def test_forward_truncates_reason_when_no_error_code(monkeypatch):
+    tasks, _ = import_tasks_with_fake_ray(monkeypatch)
+    monkeypatch.setattr(tasks, "ELASTICSEARCH_SERVICE", "http://api")
+    monkeypatch.setattr(tasks, "get_file_size", lambda *a, **k: 0)
+    monkeypatch.setattr(tasks, "extract_error_code", lambda *a, **k: None)
+
+    long_msg = json.dumps({"message": "m" * 250})
+    monkeypatch.setattr(
+        tasks, "run_async", lambda coro: (_ for _ in ()).throw(Exception(long_msg))
+    )
+
+    reasons: list[str] = []
+    monkeypatch.setattr(
+        tasks, "save_error_to_redis", lambda tid, reason, st: reasons.append(reason)
+    )
+
+    self = FakeSelf("f-trunc")
+    with pytest.raises(Exception):
+        tasks.forward(
+            self,
+            processed_data={"chunks": [{"content": "x", "metadata": {}}]},
+            index_name="idx",
+            source="/a.txt",
+        )
+
+    assert reasons and reasons[0].endswith("...")
+    assert len(reasons[0]) <= 203
+    assert any(
+        s.get("meta", {}).get("stage") == "forward_task_failed" for s in self.states
+    )
+
+
+def test_forward_fallback_truncates_on_non_json_error(monkeypatch):
+    tasks, _ = import_tasks_with_fake_ray(monkeypatch)
+    monkeypatch.setattr(tasks, "ELASTICSEARCH_SERVICE", "http://api")
+    monkeypatch.setattr(tasks, "get_file_size", lambda *a, **k: 0)
+    monkeypatch.setattr(tasks, "extract_error_code", lambda *a, **k: None)
+
+    monkeypatch.setattr(
+        tasks, "run_async", lambda coro: (_ for _ in ()).throw(Exception("n" * 250))
+    )
+
+    reasons: list[str] = []
+    monkeypatch.setattr(
+        tasks, "save_error_to_redis", lambda tid, reason, st: reasons.append(reason)
+    )
+
+    self = FakeSelf("f-fallback")
+    with pytest.raises(Exception):
+        tasks.forward(
+            self,
+            processed_data={"chunks": [{"content": "x", "metadata": {}}]},
+            index_name="idx",
+            source="/a.txt",
+        )
+
+    assert reasons and reasons[0].endswith("...")
+    assert len(reasons[0]) <= 203
+    assert any(
+        s.get("meta", {}).get("stage") == "forward_task_failed" for s in self.states
+    )
+
+
 def test_forward_error_truncates_reason_and_uses_save(monkeypatch):
     tasks, _ = import_tasks_with_fake_ray(monkeypatch)
     long_message = "m" * 250
diff --git a/test/backend/services/test_vectordatabase_service.py b/test/backend/services/test_vectordatabase_service.py