Skip to content

Commit 37fcc50

Browse files
committed
[owl] Validate file column URI (#829)
Backend — owl (API server): add file column URI validation; log file size and URI in Docling log messages; log row data validation failures.
1 parent d96e1c4 commit 37fcc50

File tree

6 files changed

+112
-52
lines changed

6 files changed

+112
-52
lines changed

services/api/src/owl/db/gen_executor.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -520,7 +520,9 @@ async def _setup_tasks(self) -> None:
520520
# Process inputs and dependencies
521521
if self._regen_strategy is None:
522522
_body: RowAdd = self.body
523-
self._column_dict = {k: v for k, v in _body.data.items() if k in self._col_map}
523+
self._column_dict = {
524+
k: v for k, v in _body.data.items() if k in self._col_map and not k.endswith("_")
525+
}
524526
else:
525527
_body: RowRegen = self.body
526528
_row = await self.table.get_row(self._row_id)
@@ -1387,7 +1389,8 @@ async def _load_uri_as_base64(uri: str | None) -> str | AudioContent | ImageCont
13871389
f"{', '.join(DOCUMENT_FILE_EXTENSIONS + AUDIO_FILE_EXTENSIONS + IMAGE_FILE_EXTENSIONS)}"
13881390
)
13891391
)
1390-
except BadInputError:
1392+
except BadInputError as e:
1393+
logger.warning(f'Failed to parse file "{uri}" due to error: {repr(e)}')
13911394
raise
13921395
except Exception as e:
13931396
logger.warning(f'Failed to parse file "{uri}" due to error: {repr(e)}')

services/api/src/owl/db/gen_table.py

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@
108108
json_loads,
109109
open_uri_async,
110110
s3_upload,
111+
validate_url,
111112
)
112113
from owl.version import __version__ as owl_version
113114

@@ -1417,27 +1418,25 @@ def create_vector_validator(col: ColumnMetadata):
14171418
@classmethod
14181419
def vector_validator(cls, v: np.ndarray | None) -> np.ndarray | None:
14191420
if v is not None and len(v) != col.vlen:
1420-
raise ValueError(
1421-
f"Array input for column {col.column_id} must have length {col.vlen}"
1422-
)
1421+
raise ValueError(f"Array input must have length {col.vlen}")
14231422
return v
14241423

14251424
return vector_validator
14261425

14271426
validators[f"validate_{col.column_id}"] = create_vector_validator(col)
14281427
field_definitions[col.column_id] = (NumpyArray | None, Field(default=None))
14291428
else:
1430-
# if col.is_file_column:
1431-
# # Create URL validator
1432-
# def create_url_validator(col: ColumnMetadata):
1433-
# @field_validator(col.column_id, mode="after")
1434-
# @classmethod
1435-
# def url_validator(cls, v: str | None) -> str | None:
1436-
# return validate_url(v) if v else None
1429+
if col.is_file_column:
1430+
# Create URL validator
1431+
def create_url_validator(col: ColumnMetadata):
1432+
@field_validator(col.column_id, mode="after")
1433+
@classmethod
1434+
def url_validator(cls, v: str | None) -> str | None:
1435+
return validate_url(v, error_cls=ValueError) if v else None
14371436

1438-
# return url_validator
1437+
return url_validator
14391438

1440-
# validators[f"validate_{col.column_id}"] = create_url_validator(col)
1439+
validators[f"validate_{col.column_id}"] = create_url_validator(col)
14411440
# Get the Python type from ColumnDtype
14421441
py_type = col.dtype.to_python_type()
14431442
field_definitions[col.column_id] = (py_type | None, Field(default=None))
@@ -3175,14 +3174,33 @@ async def drop_columns(
31753174
def _jsonify(x: Any) -> Any:
31763175
return x.tolist() if isinstance(x, np.ndarray) else x
31773176

3177+
def validate_row_data(self, data: dict[str, Any]):
3178+
try:
3179+
self.data_table_model.model_validate(data, strict=False)
3180+
except ValidationError as e:
3181+
# Set invalid value to None, and save original value to state
3182+
msg = ""
3183+
for error in e.errors():
3184+
if len(error["loc"]) != 1:
3185+
logger.warning(
3186+
f"Cannot handle row data validation error with nested loc: {repr(e)}"
3187+
)
3188+
continue
3189+
col = error["loc"][0]
3190+
msg += f'Column "{col}": {error.get("msg", "")}. '
3191+
raise BadInputError(f"Row data contains errors. {msg}") from e
3192+
31783193
def _validate_row_data(self, data: dict[str, Any]) -> DataTableRow:
31793194
try:
31803195
row = self.data_table_model.model_validate(data, strict=False)
31813196
except ValidationError as e:
31823197
# Set invalid value to None, and save original value to state
31833198
for error in e.errors():
3184-
if len(error["loc"]) > 1:
3185-
raise BadInputError(f"Input data contains errors: {e}") from e
3199+
if len(error["loc"]) != 1:
3200+
logger.warning(
3201+
f"Cannot handle row data validation error with nested loc: {repr(e)}"
3202+
)
3203+
continue
31863204
col = error["loc"][0]
31873205
state = data.get(f"{col}_", {})
31883206
data[col], data[f"{col}_"] = (
@@ -3193,7 +3211,7 @@ def _validate_row_data(self, data: dict[str, Any]) -> DataTableRow:
31933211
try:
31943212
row = self.data_table_model.model_validate(data, strict=False)
31953213
except ValidationError as e:
3196-
raise BadInputError(f"Input data contains errors: {e}") from e
3214+
raise BadInputError(f"Row data contains errors: {e}") from e
31973215
return row
31983216

31993217
# Row Create Ops

services/api/src/owl/docparse.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,11 @@ def split_chunks(
137137
for d in text_splitter.split_documents([Document(page_content=chunk.text)])
138138
]
139139
logger.info(
140-
f"{_id} - {len(request.chunks):,d} chunks split into {len(chunks):,d} chunks.",
140+
f"{len(request.chunks):,d} chunks split into {len(chunks):,d} chunks. ({_id})",
141141
)
142142
return chunks
143143
except Exception as e:
144-
logger.exception(f"{_id} - Failed to split chunks.")
144+
logger.exception(f"Failed to split chunks. ({_id})")
145145
raise BadInputError("Failed to split chunks.") from e
146146

147147

@@ -216,7 +216,7 @@ async def load_document(
216216
ext = splitext(file_name)[1].lower()
217217
if ext in [".pdf", ".docx", ".pptx", ".xlsx", ".html"]:
218218
doc_loader = DoclingLoader(self.request_id)
219-
md = await doc_loader.convert_document_to_markdown(
219+
md = await doc_loader.document_to_markdown(
220220
file_name=file_name, content=content
221221
)
222222
elif ext in [".md", ".txt"]:
@@ -320,7 +320,7 @@ async def load_document_chunks(
320320
)
321321
else:
322322
doc_loader = DoclingLoader(self.request_id, page_break_placeholder=None)
323-
chunks = await doc_loader.convert_document_to_chunks(
323+
chunks = await doc_loader.document_to_chunks(
324324
file_name=file_name,
325325
content=content,
326326
chunk_size=chunk_size,
@@ -438,13 +438,13 @@ def __init__(
438438
)
439439
self.page_break_placeholder = page_break_placeholder
440440

441-
async def retrieve_document_content(
441+
async def _parse_document(
442442
self,
443443
file_name: str,
444444
content: bytes,
445-
) -> dict: # Expecting JSON response from docling-serve
445+
) -> dict:
446446
"""
447-
Retrieves the content of a document file using Docling-Serve API (async pattern).
447+
Parse the document using Docling-Serve API (async pattern).
448448
449449
Args:
450450
file_path (str): Path to the document file to be parsed (local temp path).
@@ -458,7 +458,10 @@ async def retrieve_document_content(
458458
Raises:
459459
HTTPException: If the document conversion fails via docling-serve.
460460
"""
461-
logger.info(f'{self.request_id} - Calling Docling-Serve for file "{file_name}".')
461+
size_mb = get_bytes_size_mb(content)
462+
logger.info(
463+
f'Calling Docling-Serve for file "{file_name}" with size {size_mb:.3f} MiB. ({self.request_id})'
464+
)
462465

463466
files = {"files": (file_name, content, "application/octet-stream")}
464467
data = {
@@ -507,7 +510,10 @@ async def retrieve_document_content(
507510
elif task_status in ("failure", "revoked"):
508511
error_info = status_data.get("task_result", {}).get("error", "Unknown error")
509512
logger.error(
510-
f'Docling-Serve task "{task_id}" for document "{file_name}" failed: {error_info}'
513+
(
514+
f'Docling-Serve task "{task_id}" for document "{file_name}" '
515+
f"with size {size_mb:.3f} MiB failed: {error_info}. ({self.request_id})"
516+
)
511517
)
512518
raise BadInputError(f'Your document "{file_name}" cannot be parsed.')
513519
# If not success, failure, or revoked, it's still processing or in another state
@@ -516,7 +522,7 @@ async def retrieve_document_content(
516522
else: # Executed if the while loop completes without a 'break'
517523
logger.error(
518524
(
519-
f'Docling-Serve task "{task_id}" for document "{file_name}" '
525+
f'Docling-Serve task "{task_id}" for document "{file_name}" with size {size_mb:.3f} MiB '
520526
f"timed out after polling for {time_slept} seconds. ({self.request_id})"
521527
)
522528
)
@@ -537,24 +543,20 @@ async def retrieve_document_content(
537543
except Exception as e:
538544
raise UnexpectedError(f"Docling-Serve API error: {e}") from e
539545

540-
async def convert_document_to_markdown(self, file_name: str, content: bytes) -> str:
546+
async def document_to_markdown(self, file_name: str, content: bytes) -> str:
541547
"""
542548
Converts a document to Markdown format using Docling-Serve.
543549
"""
544-
docling_response = await self.retrieve_document_content(file_name, content)
545-
logger.info(
546-
f"Converted `{file_name}` to Markdown in {docling_response.get('processing_time', '0'):.3f} seconds, "
547-
f"{get_bytes_size_mb(content):.3f} MB."
548-
)
550+
docling_response = await self._parse_document(file_name, content)
549551
return docling_response.get("document", {}).get("md_content", "")
550552

551-
async def convert_document_to_chunks(
553+
async def document_to_chunks(
552554
self, file_name: str, content: bytes, chunk_size: int, chunk_overlap: int
553555
) -> list[Chunk]:
554556
"""
555557
Converts a document to chunks, respecting page and table boundaries, using Docling-Serve.
556558
"""
557-
docling_response = await self.retrieve_document_content(file_name, content)
559+
docling_response = await self._parse_document(file_name, content)
558560
md_content = docling_response.get("document", {}).get("md_content", "")
559561

560562
documents = [Document(page_content=md_content, metadata={"page": 1})]

services/api/src/owl/routers/gen_table.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,19 @@ async def add_rows(
552552
billing.has_gen_table_quota(table)
553553
billing.has_db_storage_quota()
554554
billing.has_egress_quota()
555+
# Validate data
556+
try:
557+
[table.validate_row_data(d) for d in body.data]
558+
except Exception as e:
559+
logger.info(
560+
(
561+
"Row data validation failed. "
562+
f'Table={table.schema_id}."{table.table_metadata.short_id}" '
563+
f"Org={org.id} "
564+
f"User={user.id} "
565+
f"Error={repr(e)}"
566+
)
567+
)
555568
executor = MultiRowGenExecutor(
556569
request=request,
557570
table=table,
@@ -840,6 +853,19 @@ async def update_rows(
840853
billing.has_gen_table_quota(table)
841854
billing.has_db_storage_quota()
842855
billing.has_egress_quota()
856+
# Validate data
857+
try:
858+
{row_id: table.validate_row_data(d) for row_id, d in body.data.items()}
859+
except Exception as e:
860+
logger.info(
861+
(
862+
"Row data validation failed. "
863+
f'Table={table.schema_id}."{table.table_metadata.short_id}" '
864+
f"Org={org.id} "
865+
f"User={user.id} "
866+
f"Error={repr(e)}"
867+
)
868+
)
843869
await table.update_rows(body.data)
844870
return OkResponse()
845871

@@ -935,14 +961,24 @@ async def embed_file(
935961
content_type=mime,
936962
filename=file_name,
937963
)
938-
# if overwrite:
939-
# file_table.delete_file(file_name=file_name)
940964
# --- Add into Knowledge Table --- #
941965
logger.info(f'{request_id} - Parsing file "{file_name}".')
942966
doc_parser = GeneralDocLoader(request_id=request_id)
943-
chunks = await doc_parser.load_document_chunks(
944-
file_name, file_content, data.chunk_size, data.chunk_overlap
945-
)
967+
try:
968+
chunks = await doc_parser.load_document_chunks(
969+
file_name, file_content, data.chunk_size, data.chunk_overlap
970+
)
971+
except BadInputError as e:
972+
logger.warning(f'Failed to parse file "{file_uri}" due to error: {repr(e)}')
973+
raise
974+
except Exception as e:
975+
logger.warning(f'Failed to parse file "{file_uri}" due to error: {repr(e)}')
976+
raise BadInputError(
977+
(
978+
f'Sorry we encountered an issue while processing your file "{file_name}". '
979+
"Please ensure the file is not corrupted and is in a supported format."
980+
)
981+
) from e
946982
logger.info(f'{request_id} - Embedding file "{file_name}" with {len(chunks):,d} chunks.')
947983

948984
# --- Extract title --- #

services/api/src/owl/utils/io.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -106,22 +106,25 @@ def _is_private_or_local_ip(ip: str) -> bool:
106106
return addr.is_private or addr.is_loopback or addr.is_link_local
107107

108108

109-
def validate_url(url: str) -> str:
110-
parsed = urlparse(url)
109+
def validate_url(url: str, *, error_cls: type[Exception] = BadInputError) -> str:
110+
try:
111+
parsed = urlparse(url)
112+
except Exception as e:
113+
raise error_cls(f'URL "{url}" is invalid: {e}') from e
111114
if parsed.scheme == "s3":
112115
return url
113116
if parsed.scheme != "https":
114-
raise BadInputError(f"Unsupported scheme: {parsed.scheme}")
117+
raise error_cls(f'URL "{url}" is invalid: Scheme is not "https".')
115118
if not parsed.hostname:
116-
raise BadInputError("URL must contain hostname.")
119+
raise error_cls(f'URL "{url}" is invalid: Missing hostname.')
117120
try:
118121
ips = {info[4][0] for info in socket.getaddrinfo(parsed.hostname, None)}
119122
except socket.gaierror as e:
120-
raise BadInputError("Failed to resolve hostname.") from e
123+
raise error_cls(f'URL "{url}" is invalid: {e}') from e
121124
if not ips:
122-
raise BadInputError("Failed to resolve hostname.")
125+
raise error_cls(f'URL "{url}" is invalid: Failed to resolve hostname.')
123126
if any(_is_private_or_local_ip(ip) for ip in ips):
124-
raise BadInputError(f"Target '{url}' resolves to private or local IP.")
127+
raise error_cls(f'URL "{url}" is invalid: Hostname resolves to private or local IP.')
125128
return url
126129

127130

@@ -168,14 +171,14 @@ async def open_uri_async(uri: str) -> AsyncGenerator[tuple[AsyncResponse, str],
168171

169172
def get_bytes_size_mb(bytes_content: bytes, decimal_places: int = 3) -> float:
170173
"""
171-
Convert bytes to megabytes (MB).
174+
Convert bytes to Mebibyte (MiB).
172175
173176
Args:
174177
bytes_content (bytes): The content in bytes to be calculated.
175178
decimal_places (int, optional): Number of decimal places to round to. Defaults to 3.
176179
177180
Returns:
178-
float: The converted value in megabytes (MB)
181+
float: The converted value in Mebibyte (MiB)
179182
"""
180183
mb_value = len(bytes_content) / (1024 * 1024) # 1 MB = 1024 KB = 1024 * 1024 bytes
181184
return round(mb_value, decimal_places)

services/api/tests/test_docparse.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,7 @@ async def test_convert_pdf_document_to_markdown(doc_path: str):
5656
with open(doc_path, "rb") as f:
5757
doc_content_bytes = f.read()
5858

59-
api_response_data = await loader.retrieve_document_content(
60-
basename(doc_path), doc_content_bytes
61-
)
59+
api_response_data = await loader._parse_document(basename(doc_path), doc_content_bytes)
6260

6361
api_document_content = api_response_data.get("document", {})
6462

0 commit comments

Comments (0)