Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion backend/agents/create_agent_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,10 @@ async def create_tool_config_list(agent_id, tenant_id, user_id, version_no: int

# special logic for knowledge base search tool
if tool_config.class_name == "KnowledgeBaseSearchTool":
is_multimodal = tool_config.params.pop("multimodal", False)
tool_config.metadata = {
"vdb_core": get_vector_db_core(),
"embedding_model": get_embedding_model(tenant_id=tenant_id),
"embedding_model": get_embedding_model(tenant_id=tenant_id, is_multimodal=is_multimodal),
}
elif tool_config.class_name == "AnalyzeTextFileTool":
tool_config.metadata = {
Expand Down
3 changes: 2 additions & 1 deletion backend/apps/model_managment_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@
@router.post("/healthcheck")
async def check_model_health(
display_name: str = Query(..., description="Display name to check"),
modelType: str = Query(..., description="..."),

Check warning on line 301 in backend/apps/model_managment_app.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Rename this parameter "modelType" to match the regular expression ^[_a-z][a-z0-9_]*$.

See more on https://sonarcloud.io/project/issues?id=ModelEngine-Group_nexent&issues=AZ01Ev938MGqxOY_YCvt&open=AZ01Ev938MGqxOY_YCvt&pullRequest=2720
authorization: Optional[str] = Header(None)
):
"""Check and update model connectivity, returning the latest status.
Expand All @@ -308,7 +309,7 @@
"""
try:
_, tenant_id = get_current_user_id(authorization)
result = await check_model_connectivity(display_name, tenant_id)
result = await check_model_connectivity(display_name, tenant_id, modelType)
return JSONResponse(status_code=HTTPStatus.OK, content={
"message": "Successfully checked model connectivity",
"data": result
Expand Down
23 changes: 12 additions & 11 deletions backend/apps/vectordatabase_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,15 @@ def create_new_index(
# Extract optional fields from request body
ingroup_permission = None
group_ids = None
embedding_model_name = None
is_multimodal = False
embedding_model_name: Optional[str] = None
if request:
ingroup_permission = request.get("ingroup_permission")
group_ids = request.get("group_ids")
embedding_model_name = request.get("embedding_model_name")
is_multimodal = request.get("is_multimodal")
embedding_model_name = request.get("embeddingModel") or request.get("embedding_model")
if isinstance(embedding_model_name, str):
embedding_model_name = embedding_model_name.strip() or None

# Treat path parameter as user-facing knowledge base name for new creations
return ElasticSearchService.create_knowledge_base(
Expand All @@ -80,7 +84,7 @@ def create_new_index(
tenant_id=tenant_id,
ingroup_permission=ingroup_permission,
group_ids=group_ids,
embedding_model_name=embedding_model_name,
is_multimodal=is_multimodal,
)
except Exception as e:
raise HTTPException(
Expand Down Expand Up @@ -124,13 +128,15 @@ async def update_index(
knowledge_name = request.get("knowledge_name")
ingroup_permission = request.get("ingroup_permission")
group_ids = request.get("group_ids")
is_multimodal = request.get("is_multimodal")

# Call service layer to update knowledge base
result = ElasticSearchService.update_knowledge_base(
index_name=index_name,
knowledge_name=knowledge_name,
ingroup_permission=ingroup_permission,
group_ids=group_ids,
is_multimodal=is_multimodal,
tenant_id=tenant_id,
user_id=user_id,
)
Expand Down Expand Up @@ -199,15 +205,10 @@ def create_index_documents(
try:
user_id, tenant_id = get_current_user_id(authorization)

# Get the knowledge base record to retrieve the saved embedding model
knowledge_record = get_knowledge_record({'index_name': index_name})
saved_embedding_model_name = None
if knowledge_record:
saved_embedding_model_name = knowledge_record.get('embedding_model_name')

# Use the saved model from knowledge base, fallback to tenant default if not set
embedding_model = get_embedding_model(tenant_id, saved_embedding_model_name)

is_multimodal = True if knowledge_record.get(
'is_multimodal') == 'Y' else False
embedding_model = get_embedding_model(tenant_id, is_multimodal)
return ElasticSearchService.index_documents(
embedding_model=embedding_model,
index_name=index_name,
Expand Down
4 changes: 4 additions & 0 deletions backend/consts/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ class VectorDatabaseType(str, Enum):
# Data Processing Service Configuration
DATA_PROCESS_SERVICE = os.getenv("DATA_PROCESS_SERVICE")
CLIP_MODEL_PATH = os.getenv("CLIP_MODEL_PATH")
TABLE_TRANSFORMER_MODEL_PATH = os.getenv("TABLE_TRANSFORMER_MODEL_PATH")
UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH = os.getenv(
"UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH"
)


# Upload Configuration
Expand Down
62 changes: 59 additions & 3 deletions backend/data_process/ray_actors.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
from io import BytesIO
import logging
import json
from typing import Any, Dict, List, Optional

import ray

from consts.const import RAY_ACTOR_NUM_CPUS, REDIS_BACKEND_URL, DEFAULT_EXPECTED_CHUNK_SIZE, DEFAULT_MAXIMUM_CHUNK_SIZE
from database.attachment_db import get_file_stream
from consts.const import (
RAY_ACTOR_NUM_CPUS,
REDIS_BACKEND_URL,
DEFAULT_EXPECTED_CHUNK_SIZE,
DEFAULT_MAXIMUM_CHUNK_SIZE,
TABLE_TRANSFORMER_MODEL_PATH,
UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH,
)
from database.attachment_db import build_s3_url, get_file_stream, upload_fileobj
from database.model_management_db import get_model_by_model_id
from nexent.data_process import DataProcessCore

Expand All @@ -27,7 +35,7 @@
f"Ray actor initialized using {RAY_ACTOR_NUM_CPUS} CPU cores...")
self._processor = DataProcessCore()

def process_file(

Check failure on line 38 in backend/data_process/ray_actors.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this function to reduce its Cognitive Complexity from 25 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=ModelEngine-Group_nexent&issues=AZ01Ev9M8MGqxOY_YCvs&open=AZ01Ev9M8MGqxOY_YCvs&pullRequest=2720
self,
source: str,
chunking_strategy: str,
Expand Down Expand Up @@ -58,6 +66,11 @@
if task_id:
params['task_id'] = task_id

params["table_transformer_model_path"] = TABLE_TRANSFORMER_MODEL_PATH
params[
"unstructured_default_model_initialize_params_json_path"
] = UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH

# Get chunk size parameters from embedding model if model_id is provided
if model_id and tenant_id:
try:
Expand Down Expand Up @@ -95,12 +108,55 @@
logger.error(f"Failed to fetch file from {source}: {e}")
raise

chunks = self._processor.file_process(
result = self._processor.file_process(
file_data=file_data,
filename=source,
chunking_strategy=chunking_strategy,
**params
)
if isinstance(result, tuple) and len(result) == 2:
chunks, images_info = result
else:
chunks = result
images_info = []

if len(images_info) > 0:
folder = "images_in_attachments"
for index, image_data in enumerate(images_info):
if not isinstance(image_data, dict):
logger.warning(
f"[RayActor] Skipping image entry at index {index}: unexpected type {type(image_data)}"
)
continue
if "image_bytes" not in image_data:
logger.warning(
f"[RayActor] Skipping image entry at index {index}: missing image_bytes"
)
continue

img_obj = BytesIO(image_data["image_bytes"])
result = upload_fileobj(
file_obj=img_obj,
file_name=f"{index}.{image_data['image_format']}",
prefix=folder)

image_data["source_file"] = source
image_data["image_url"] = build_s3_url(result.get("object_name", ""))


chunks.append({
"content": json.dumps({
"source_file": source,
"position": image_data["position"],
"image_url": build_s3_url(result.get("object_name", ""))
}),
"filename": source,
"metadata": {
"chunk_index": len(chunks) + index,
"process_source": "UniversalImageExtractor",
"image_url": build_s3_url(result.get("object_name", ""))
}
})

if chunks is None:
logger.warning(
Expand Down
60 changes: 59 additions & 1 deletion backend/database/attachment_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,63 @@
import os
import uuid
from datetime import datetime
from typing import Any, BinaryIO, Dict, List, Optional
from typing import Any, BinaryIO, Dict, List, Optional, Tuple

from .client import minio_client


def _normalize_object_and_bucket(object_name: str, bucket: Optional[str] = None) -> Tuple[str, Optional[str]]:
"""
Normalize object_name + bucket from supported URL styles.

Supports:
- s3://bucket/key
- /bucket/key
- key (uses provided bucket or default bucket)
"""
if not object_name:
return object_name, bucket

if object_name.startswith("s3://"):

Check failure on line 22 in backend/database/attachment_db.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Define a constant instead of duplicating this literal "s3://" 3 times.

See more on https://sonarcloud.io/project/issues?id=ModelEngine-Group_nexent&issues=AZ01Ev8z8MGqxOY_YCvr&open=AZ01Ev8z8MGqxOY_YCvr&pullRequest=2720
s3_path = object_name[len("s3://") :]
parts = s3_path.split("/", 1)
parsed_bucket = parts[0] if parts[0] else None
parsed_key = parts[1] if len(parts) > 1 else ""
return parsed_key, parsed_bucket or bucket

if object_name.startswith("/"):
path = object_name.lstrip("/")
parts = path.split("/", 1)
parsed_bucket = parts[0] if parts[0] else None
parsed_key = parts[1] if len(parts) > 1 else ""
return parsed_key, parsed_bucket or bucket

return object_name, bucket


def build_s3_url(object_name: str, bucket: Optional[str] = None) -> str:
    """
    Build an ``s3://bucket/key`` style URL from an object name.

    Already-qualified ``s3://`` inputs are returned untouched; ``/bucket/key``
    paths are converted in place; bare keys are prefixed with the given bucket
    or, when absent, the MinIO client's default bucket.
    """
    if not object_name:
        return ""

    if object_name.startswith("s3://"):
        return object_name

    if object_name.startswith("/"):
        bucket_part, separator, key_part = object_name.lstrip("/").partition("/")
        if separator:
            return f"s3://{bucket_part}/{key_part}"
        # Bucket-only path: emit a trailing slash with an empty key.
        return f"s3://{bucket_part}/"

    target_bucket = bucket or minio_client.storage_config.default_bucket
    if target_bucket:
        return f"s3://{target_bucket}/{object_name}"
    return f"s3://{object_name}"


def generate_object_name(file_name: str, prefix: str = "attachments") -> str:
"""
Generate a unique object name
Expand Down Expand Up @@ -165,6 +217,7 @@
"""
Get file size by object name
"""
object_name, bucket = _normalize_object_and_bucket(object_name, bucket)
bucket = bucket or minio_client.storage_config.default_bucket
return minio_client.get_file_size(object_name, bucket)

Expand All @@ -181,6 +234,7 @@
bool: True if file exists, False otherwise
"""
try:
object_name, bucket = _normalize_object_and_bucket(object_name, bucket)
return minio_client.file_exists(object_name, bucket)
except Exception:
return False
Expand All @@ -198,6 +252,8 @@
Returns:
Dict[str, Any]: Result containing success flag and error message (if any)
"""
source_object, bucket = _normalize_object_and_bucket(source_object, bucket)
dest_object, bucket = _normalize_object_and_bucket(dest_object, bucket)
success, result = minio_client.copy_file(source_object, dest_object, bucket)
if success:
return {"success": True, "object_name": result}
Expand Down Expand Up @@ -242,6 +298,7 @@
Returns:
Dict[str, Any]: Delete result, containing success flag and error message (if any)
"""
object_name, bucket = _normalize_object_and_bucket(object_name, bucket)
if not bucket:
bucket = minio_client.storage_config.default_bucket
success, result = minio_client.delete_file(object_name, bucket)
Expand All @@ -265,6 +322,7 @@
Returns:
Optional[BinaryIO]: Standard BinaryIO stream object, or None if failed
"""
object_name, bucket = _normalize_object_and_bucket(object_name, bucket)
success, result = minio_client.get_file_stream(object_name, bucket)
if not success:
return None
Expand Down
1 change: 1 addition & 0 deletions backend/database/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ class KnowledgeRecord(TableBase):
group_ids = Column(String, doc="Knowledge base group IDs list")
ingroup_permission = Column(
String(30), doc="In-group permission: EDIT, READ_ONLY, PRIVATE")
is_multimodal = Column(String(1), default="N", doc="Whether it is multimodal. Optional values: Y/N")


class TenantConfig(TableBase):
Expand Down
9 changes: 9 additions & 0 deletions backend/database/knowledge_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def create_knowledge_record(query: Dict[str, Any]) -> Dict[str, Any]:
"knowledge_name": knowledge_name,
"group_ids": convert_list_to_string(group_ids) if isinstance(group_ids, list) else group_ids,
"ingroup_permission": query.get("ingroup_permission"),
"is_multimodal": 'Y' if query.get("is_multimodal") else 'N'
}

# For backward compatibility: if caller explicitly provides index_name,
Expand Down Expand Up @@ -178,6 +179,9 @@ def update_knowledge_record(query: Dict[str, Any]) -> bool:
if query.get("group_ids") is not None:
record.group_ids = query["group_ids"]

if query.get("is_multimodal"):
record.is_multimodal = 'Y' if query["is_multimodal"] else 'N'

# Update timestamp and user
if query.get("user_id"):
record.updated_by = query["user_id"]
Expand Down Expand Up @@ -254,6 +258,11 @@ def get_knowledge_record(query: Optional[Dict[str, Any]] = None) -> Dict[str, An
db_query = db_query.filter(
KnowledgeRecord.tenant_id == query['tenant_id'])

if 'is_multimodal' in query:
db_query = db_query.filter(
KnowledgeRecord.is_multimodal == query['is_multimodal']
)

result = db_query.first()

if result:
Expand Down
11 changes: 8 additions & 3 deletions backend/database/model_management_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def get_model_records(filters: Optional[Dict[str, Any]], tenant_id: str) -> List
return result_list


def get_model_by_display_name(display_name: str, tenant_id: str) -> Optional[Dict[str, Any]]:
def get_model_by_display_name(display_name: str, tenant_id: str, model_type: str = None) -> Optional[Dict[str, Any]]:
"""
Get a model record by display name

Expand All @@ -179,6 +179,11 @@ def get_model_by_display_name(display_name: str, tenant_id: str) -> Optional[Dic
tenant_id:
"""
filters = {'display_name': display_name}

if model_type in ["multiEmbedding", "multi_embedding"]:
filters['model_type'] = "multi_embedding"
elif model_type == "embedding":
filters['model_type'] = "embedding"

records = get_model_records(filters, tenant_id)
if not records:
Expand All @@ -203,7 +208,7 @@ def get_models_by_display_name(display_name: str, tenant_id: str) -> List[Dict[s
return get_model_records(filters, tenant_id)


def get_model_id_by_display_name(display_name: str, tenant_id: str, model_type: Optional[str] = None) -> Optional[int]:
    """
    Get a model ID by display name.

    Args:
        display_name: Display name of the model to look up.
        tenant_id: Tenant the model belongs to.
        model_type: Optional model type used to disambiguate models that share
            a display name (e.g. "embedding" vs "multi_embedding").

    Returns:
        Optional[int]: Model ID, or None if no matching model exists.
    """
    model = get_model_by_display_name(display_name, tenant_id, model_type)
    return model["model_id"] if model else None


Expand Down
2 changes: 1 addition & 1 deletion backend/services/config_sync_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ async def save_config_impl(config, tenant_id, user_id):

config_key = get_env_key(model_type) + "_ID"
model_id = get_model_id_by_display_name(
model_display_name, tenant_id)
model_display_name, tenant_id, model_type=model_type)

handle_model_config(tenant_id, user_id, config_key,
model_id, tenant_config_dict)
Expand Down
Loading