fix: knowledgebase add method

zakahan · zakahan · commit 76e99ce6549e · 2025-09-10T12:43:26.000+08:00
diff --git a/veadk/database/viking/viking_database.py b/veadk/database/viking/viking_database.py
@@ -136,11 +136,25 @@ def _upload_to_tos(
         self,
         data: str | list[str] | TextIO | BinaryIO | bytes,
         **kwargs: Any,
-    ):
-        file_ext = kwargs.get(
-            "file_ext", ".pdf"
-        )  # when bytes data, file_ext is required
+    ) -> tuple[int, str]:
+        """
+        Upload data to TOS (Tinder Object Storage).
 
+        Args:
+            data: The data to be uploaded. Can be one of the following types:
+                - str: File path or string data
+                - list[str]: List of strings
+                - TextIO: File object (text)
+                - BinaryIO: File object (binary)
+                - bytes: Binary data
+            **kwargs: Additional keyword arguments.
+                - file_name (str): The file name (including suffix).
+
+        Returns:
+            tuple: A tuple containing the status code and TOS URL.
+                - status_code (int): HTTP status code
+                - tos_url (str): The URL of the uploaded file in TOS
+        """
         ak = self.config.volcengine_ak
         sk = self.config.volcengine_sk
 
@@ -151,21 +165,31 @@ def _upload_to_tos(
 
         client = tos.TosClientV2(ak, sk, tos_endpoint, tos_region, max_connections=1024)
 
+        # Read file_name from kwargs; callers are expected to pass it (including the extension), but it is not validated here
+        file_names = kwargs.get("file_name")
+
         if isinstance(data, str) and os.path.isfile(data):  # Process file path
-            file_ext = os.path.splitext(data)[1]
-            new_key = f"{tos_key}/{str(uuid.uuid4())}{file_ext}"
+            # Use provided file_name which includes the extension
+            new_key = f"{tos_key}/{file_names}"
             with open(data, "rb") as f:
                 upload_data = f.read()
 
+        elif (
+            isinstance(data, list)
+            and all(isinstance(item, str) for item in data)
+            and all(os.path.isfile(item) for item in data)
+        ):
+            # Process list of file paths - this should be handled at a higher level
+            raise ValueError(
+                "Uploading multiple files through a list of file paths is not supported in _upload_to_tos directly. Please call this function for each file individually."
+            )
+
         elif isinstance(
             data,
             (io.TextIOWrapper, io.BufferedReader),  # file type: TextIO | BinaryIO
         ):  # Process file stream
-            # Try to get the file extension from the file name, and use the default value if there is none
-            file_ext = ".unknown"
-            if hasattr(data, "name"):
-                _, file_ext = os.path.splitext(data.name)
-            new_key = f"{tos_key}/{str(uuid.uuid4())}{file_ext}"
+            # Use provided file_name which includes the extension
+            new_key = f"{tos_key}/{file_names}"
             if isinstance(data, TextIO):
                 # Encode the text stream content into bytes
                 upload_data = data.read().encode("utf-8")
@@ -174,16 +198,19 @@ def _upload_to_tos(
                 upload_data = data.read()
 
         elif isinstance(data, str):  # Process ordinary strings
-            new_key = f"{tos_key}/{str(uuid.uuid4())}.txt"
+            # Use provided file_name which includes the extension
+            new_key = f"{tos_key}/{file_names}"
             upload_data = data.encode("utf-8")  # Encode as byte type
 
         elif isinstance(data, list):  # Process list of strings
-            new_key = f"{tos_key}/{str(uuid.uuid4())}.txt"
+            # Use provided file_name which includes the extension
+            new_key = f"{tos_key}/{file_names}"
             # Join the strings in the list with newlines and encode as byte type
             upload_data = "\n".join(data).encode("utf-8")
 
         elif isinstance(data, bytes):  # Process bytes data
-            new_key = f"{tos_key}/{str(uuid.uuid4())}{file_ext}"
+            # Use provided file_name which includes the extension
+            new_key = f"{tos_key}/{file_names}"
             upload_data = data
 
         else:
@@ -231,28 +258,136 @@ def add(
         **kwargs,
     ):
         """
+        Add documents to the Viking database.
         Args:
-            data: str, file path or file stream:  Both file or file.read() are acceptable.
-            **kwargs: collection_name(required)
+            data: The data to be added. Can be one of the following types:
+                - str: File path or string data
+                - list[str]: List of file paths or list of strings
+                - TextIO: File object (text)
+                - BinaryIO: File object (binary)
+                - bytes: Binary data
+            collection_name: The name of the collection to add documents to.
+            **kwargs: Additional keyword arguments.
+                - file_name (str | list[str]): The file name or a list of file names (including suffix).
+                - doc_id (str): The document ID. If not provided, a UUID will be generated. Note: for list inputs, a provided doc_id is reused for every item.
         Returns:
-            {
+            dict or list: A dictionary containing the TOS URL and document ID, or a list of such dictionaries for multiple file uploads.
+            Format: {
                 "tos_url": "tos://<bucket>/<key>",
                 "doc_id": "<doc_id>",
             }
         """
-
-        status, tos_url = self._upload_to_tos(data=data, **kwargs)
-        if status != 200:
-            raise ValueError(f"Error in upload_to_tos: {status}")
-        doc_id = self._add_doc(
-            collection_name=collection_name,
-            tos_url=tos_url,
-            doc_id=str(uuid.uuid4()),
-        )
-        return {
-            "tos_url": f"tos://{tos_url}",
-            "doc_id": doc_id,
-        }
+        # Handle list of file paths (multiple file upload)
+        if (
+            isinstance(data, list)
+            and all(isinstance(item, str) for item in data)
+            and all(os.path.isfile(item) for item in data)
+        ):
+            # Handle multiple file upload
+            file_names = kwargs.get("file_name")
+            if (
+                not file_names
+                or not isinstance(file_names, list)
+                or len(file_names) != len(data)
+            ):
+                raise ValueError(
+                    "For multiple file upload, file_name must be provided as a list with the same length as data"
+                )
+
+            results = []
+            for i, file_path in enumerate(data):
+                # Create kwargs for this specific file
+                single_kwargs = kwargs.copy()
+                single_kwargs["file_name"] = file_names[i]
+
+                # Generate or use provided doc_id for this file
+                doc_id = single_kwargs.get("doc_id")
+                if not doc_id:
+                    doc_id = str(uuid.uuid4())
+                    single_kwargs["doc_id"] = doc_id
+
+                status, tos_url = self._upload_to_tos(data=file_path, **single_kwargs)
+                if status != 200:
+                    raise ValueError(
+                        f"Error in upload_to_tos for file {file_path}: {status}"
+                    )
+
+                doc_id = self._add_doc(
+                    collection_name=collection_name,
+                    tos_url=tos_url,
+                    doc_id=doc_id,
+                )
+
+                results.append(
+                    {
+                        "tos_url": f"tos://{tos_url}",
+                        "doc_id": doc_id,
+                    }
+                )
+
+            return results
+
+        # Handle list of strings (multiple string upload)
+        elif isinstance(data, list) and all(isinstance(item, str) for item in data):
+            # Handle multiple string upload
+            file_names = kwargs.get("file_name")
+            if (
+                not file_names
+                or not isinstance(file_names, list)
+                or len(file_names) != len(data)
+            ):
+                raise ValueError(
+                    "For multiple string upload, file_name must be provided as a list with the same length as data"
+                )
+
+            results = []
+            for i, content in enumerate(data):
+                # Create kwargs for this specific string
+                single_kwargs = kwargs.copy()
+                single_kwargs["file_name"] = file_names[i]
+
+                # Generate or use provided doc_id for this string
+                doc_id = single_kwargs.get("doc_id")
+                if not doc_id:
+                    doc_id = str(uuid.uuid4())
+                    single_kwargs["doc_id"] = doc_id
+
+                status, tos_url = self._upload_to_tos(data=content, **single_kwargs)
+                if status != 200:
+                    raise ValueError(f"Error in upload_to_tos for string {i}: {status}")
+
+                doc_id = self._add_doc(
+                    collection_name=collection_name,
+                    tos_url=tos_url,
+                    doc_id=doc_id,
+                )
+
+                results.append(
+                    {
+                        "tos_url": f"tos://{tos_url}",
+                        "doc_id": doc_id,
+                    }
+                )
+
+            return results
+
+        # Handle single file upload or other data types
+        else:
+            # Handle doc_id from kwargs or generate a new one
+            doc_id = kwargs.get("doc_id", str(uuid.uuid4()))
+
+            status, tos_url = self._upload_to_tos(data=data, **kwargs)
+            if status != 200:
+                raise ValueError(f"Error in upload_to_tos: {status}")
+            doc_id = self._add_doc(
+                collection_name=collection_name,
+                tos_url=tos_url,
+                doc_id=doc_id,
+            )
+            return {
+                "tos_url": f"tos://{tos_url}",
+                "doc_id": doc_id,
+            }
 
     def delete(self, **kwargs: Any):
         name = kwargs.get("name")
diff --git a/veadk/knowledgebase/knowledgebase.py b/veadk/knowledgebase/knowledgebase.py
@@ -56,9 +56,16 @@ def add(
     ):
         """
         Add documents to the vector database.
-        You can only upload files or file characters when the adapter type used is vikingdb.
-        In addition, if you upload data of the bytes type,
-            for example, if you read the file stream of a pdf, then you need to pass an additional parameter file_ext = '.pdf'.
+        Args:
+            data (str | list[str] | TextIO | BinaryIO | bytes): The data to be added.
+                - str: A single file path. (viking only)
+                - list[str]: A list of file paths.
+                - TextIO: An open text-mode file object. (viking only)
+                - BinaryIO: An open binary-mode file object. (viking only)
+                - bytes: Raw binary data, e.g. the result of f.read(). (viking only)
+            app_name: index name
+            **kwargs: Additional keyword arguments.
+                - file_name (str | list[str]): The file name or a list of file names (including suffix). (viking only)
         """
         if self.backend != "viking" and not (
             isinstance(data, str) or isinstance(data, list)
@@ -73,8 +80,7 @@ def add(
         if self.backend == "viking":
             # Case 1: Handling file paths or lists of file paths (str)
             if isinstance(data, str) and os.path.isfile(data):
-                # 单个文件路径，直接调用client.add
-                # 获取文件名（包括后缀名）
+                # Get the file name (including the suffix)
                 if "file_name" not in kwargs or not kwargs["file_name"]:
                     kwargs["file_name"] = os.path.basename(data)
                 return self._adapter.add(data=data, index=index, **kwargs)
@@ -125,6 +131,10 @@ def add(
             # Case6: Unsupported data type
             raise TypeError(f"Unsupported data type: {type(data)}")
 
+        if isinstance(data, list):
+            raise TypeError(
+                f"Unsupported data type: {type(data)}, Only viking support file_path and file bytes"
+            )
         # not viking
         return self._adapter.add(data=data, index=index, **kwargs)