
Commit 53314d9

Merge branch 'main' into main

2 parents: 2c1d6df + a8c5bbc

File tree: 16 files changed, +423 -201 lines


data_celery/datasource/hive/tasks.py

Lines changed: 58 additions & 12 deletions
```diff
@@ -12,6 +12,7 @@
                                      ensure_directory_exists_remove, get_datasource_csg_hub_server_dir)
 from data_celery.mongo_tools.tools import insert_datasource_run_task_log_info,insert_datasource_run_task_log_error
 from data_server.datasource.services.datasource import get_datasource_connector
+from data_server.datasource.schemas import DataSourceCreate
 from data_engine.exporter.load import load_exporter
 from pathlib import Path
 import pandas as pd
@@ -103,13 +104,32 @@ def collection_hive_task(task_uid: str,user_name: str,user_token: str):
         max_line = extra_config["max_line_json"]
         if use_type == "sql":
             if use_sql:
-                connector = get_datasource_connector(collection_task.datasource)
-                if not connector.test_connection():
+                try:
+                    # Convert the database model object into a DataSourceCreate object
+                    datasource_create = DataSourceCreate(
+                        name=collection_task.datasource.name,
+                        des=collection_task.datasource.des,
+                        source_type=collection_task.datasource.source_type,
+                        host=collection_task.datasource.host,
+                        port=collection_task.datasource.port,
+                        username=collection_task.datasource.username,
+                        password=collection_task.datasource.password,
+                        database=collection_task.datasource.database,
+                        auth_type=collection_task.datasource.auth_type
+                    )
+                    connector = get_datasource_connector(datasource_create)
+                    test_result = connector.test_connection()
+                    if not test_result or not test_result.get("success", False):
+                        collection_task.task_status = DataSourceTaskStatusEnum.ERROR.value
+                        error_msg = test_result.get("message", "Connection failed") if test_result else "Connection test returned None"
+                        insert_datasource_run_task_log_error(task_uid, f"Task with UID {task_uid} failed to connect to the database: {error_msg}")
+                        return False
+                    get_table_dataset_by_sql(connector, task_uid, use_sql, db_session, collection_task,
+                                             datasource_temp_parquet_dir, max_line=max_line)
+                except Exception as e:
                     collection_task.task_status = DataSourceTaskStatusEnum.ERROR.value
-                    insert_datasource_run_task_log_error(task_uid, f"Task with UID {task_uid} failed to connect to the database.")
+                    insert_datasource_run_task_log_error(task_uid, f"Error occurred while executing the task: {str(e)}")
                     return False
-                get_table_dataset_by_sql(connector, task_uid, use_sql, db_session, collection_task,
-                                         datasource_temp_parquet_dir, max_line=max_line)
                 upload_path = datasource_temp_parquet_dir.join('run_sql')
                 upload_to_csg_hub_server(csg_hub_dataset_id,
                                          csg_hub_dataset_name,
@@ -125,14 +145,34 @@ def collection_hive_task(task_uid: str,user_name: str,user_token: str):
             source = hive_config["source"]
             total_count = 0
             records_count = 0
-            connector = get_datasource_connector(collection_task.datasource)
-            if not connector.test_connection():
+            try:
+                # Convert the database model object into a DataSourceCreate object
+                datasource_create = DataSourceCreate(
+                    name=collection_task.datasource.name,
+                    des=collection_task.datasource.des,
+                    source_type=collection_task.datasource.source_type,
+                    host=collection_task.datasource.host,
+                    port=collection_task.datasource.port,
+                    username=collection_task.datasource.username,
+                    password=collection_task.datasource.password,
+                    database=collection_task.datasource.database,
+                    auth_type=collection_task.datasource.auth_type
+                )
+                connector = get_datasource_connector(datasource_create)
+                test_result = connector.test_connection()
+                if not test_result or not test_result.get("success", False):
+                    collection_task.task_status = DataSourceTaskStatusEnum.ERROR.value
+                    error_msg = test_result.get("message", "Connection failed") if test_result else "Connection test returned None"
+                    insert_datasource_run_task_log_error(task_uid, f"Task with UID {task_uid} failed to connect to the database: {error_msg}")
+                    return False
+                for table_name in source.keys():
+                    table_total = connector.get_table_total_count_hive(table_name)
+                    total_count += table_total
+            except Exception as e:
                 collection_task.task_status = DataSourceTaskStatusEnum.ERROR.value
-                insert_datasource_run_task_log_error(task_uid, f"Task with UID {task_uid} failed to connect to the database.")
+                insert_datasource_run_task_log_error(task_uid, f"Error occurred while executing the task: {str(e)}")
                 return False
-            for table_name in source.keys():
-                table_total = connector.get_table_total_count_hive(table_name)
-                total_count += table_total
+
             collection_task.total_count = total_count
             collection_task.records_count = records_count
             db_session.commit()
@@ -165,8 +205,14 @@ def collection_hive_task(task_uid: str,user_name: str,user_token: str):
     except Exception as e:
         if collection_task:
             collection_task.task_status = DataSourceTaskStatusEnum.ERROR.value
+        error_type = type(e).__name__
+        error_msg = str(e)
+        error_traceback = traceback.format_exc()
+        logger.error(f"Task {task_uid} error: {error_type}: {error_msg}")
+        logger.error(f"Full traceback:\n{error_traceback}")
        traceback.print_exc()
-        insert_datasource_run_task_log_error(task_uid, f"Error occurred while executing the task: {e}")
+        insert_datasource_run_task_log_error(task_uid, f"Error occurred while executing the task: {error_type}: {error_msg}")
+        insert_datasource_run_task_log_error(task_uid, f"Traceback: {error_traceback}")
         return False
     finally:
         if collection_task:
```
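
The schema-conversion and connection-test block is duplicated verbatim in the SQL and table branches above. As a refactoring sketch only: the helper names `build_datasource_create` and `connect_or_fail` are hypothetical, and it assumes `DataSourceCreate` accepts these fields as keyword arguments and that `test_connection()` returns a dict with `success`/`message` keys, as the diff implies.

```python
from data_server.datasource.schemas import DataSourceCreate
from data_server.datasource.services.datasource import get_datasource_connector

# The fields copied one-by-one in both branches of the diff above.
_DATASOURCE_FIELDS = ("name", "des", "source_type", "host", "port",
                      "username", "password", "database", "auth_type")

def build_datasource_create(datasource) -> DataSourceCreate:
    """Copy the connection fields from the ORM datasource row into the schema."""
    return DataSourceCreate(**{f: getattr(datasource, f) for f in _DATASOURCE_FIELDS})

def connect_or_fail(datasource):
    """Open a connector and normalize the dict returned by test_connection()
    into (connector, error_message); error_message is None on success."""
    connector = get_datasource_connector(build_datasource_create(datasource))
    result = connector.test_connection()
    if not result or not result.get("success", False):
        msg = result.get("message", "Connection failed") if result else "Connection test returned None"
        return None, msg
    return connector, None
```

Both call sites could then reduce to `connector, error_msg = connect_or_fail(collection_task.datasource)` followed by the existing error logging.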

data_celery/datasource/mongo/tasks.py

Lines changed: 8 additions & 34 deletions
```diff
@@ -11,7 +11,6 @@
 from data_server.datasource.services.datasource import get_datasource_connector
 from data_celery.mongo_tools.tools import insert_datasource_run_task_log_info, insert_datasource_run_task_log_error
 from data_engine.exporter.load import load_exporter
-from pathlib import Path
 import pandas as pd
 from loguru import logger
 
@@ -256,46 +255,21 @@ def upload_to_csg_hub_server(csg_hub_dataset_id: str,
             branch=csg_hub_dataset_default_branch,
             user_name=user_name,
             user_token=user_token,
-            work_dir=datasource_csg_hub_server_dir
+            work_dir=datasource_csg_hub_server_dir,
+            path_is_dir=True
         )
-        upload_path: Path = Path(datasource_temp_json_dir)
-        # Check whether the uploaded directory exists and is not empty
-        if not os.path.exists(upload_path):
-            insert_datasource_run_task_log_error(collection_task.task_uid,
-                                                 f"the task[{collection_task.task_uid}] upload csg hub-server fail: upload path {upload_path} does not exist")
-            return False
-
-        # List all files in the upload directory for debugging
-        file_list = []
-        for root, dirs, files in os.walk(upload_path):
-            for file in files:
-                file_list.append(os.path.join(root, file))
-        insert_datasource_run_task_log_info(collection_task.task_uid,
-                                            f"Files to upload: {len(file_list)} files found in {upload_path}")
-        if len(file_list) == 0:
-            insert_datasource_run_task_log_error(collection_task.task_uid,
-                                                 f"the task[{collection_task.task_uid}] upload csg hub-server fail: upload path {upload_path} is empty")
-            return False
-
-        output_branch_name = exporter.export_from_files(upload_path)
-
-        if output_branch_name:
-            collection_task.csg_hub_branch = output_branch_name
+        exporter.export_large_folder()
+        if csg_hub_dataset_default_branch:
+            collection_task.csg_hub_branch = csg_hub_dataset_default_branch
             db_session.commit()
             insert_datasource_run_task_log_info(collection_task.task_uid,
                                                 f"the task[{collection_task.task_uid}] upload csg hub-server success...")
         else:
             insert_datasource_run_task_log_error(collection_task.task_uid,
-                                                 f"the task[{collection_task.task_uid}] upload csg hub-server fail: export_from_files returned None")
+                                                 f"the task[{collection_task.task_uid}] upload csg hub-server fail...")
     except Exception as e:
         logger.error(e)
-        error_msg = str(e)
-        # Check if this is a "nothing to commit" error
-        if "nothing to commit" in error_msg.lower() or "working tree clean" in error_msg.lower():
-            insert_datasource_run_task_log_error(collection_task.task_uid,
-                                                 f"the task[{collection_task.task_uid}] upload csg hub-server fail: No files to commit. This may happen if: 1) Files are already committed in the branch, 2) Files are ignored by .gitignore, 3) File paths are incorrect. Error: {error_msg}")
-        else:
-            insert_datasource_run_task_log_error(collection_task.task_uid,
-                                                 f"Task UID {collection_task.task_uid} Error occurred while uploading to CSG Hub server: {error_msg}")
+        insert_datasource_run_task_log_error(collection_task.task_uid,
+                                             f"Task UID {collection_task.task_uid} Error occurred while uploading to CSG Hub server: {e}")
         return False
     return True
```

data_celery/datasource/mysql/tasks.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -210,7 +210,7 @@ def upload_to_csg_hub_server(csg_hub_dataset_id: str,
     try:
         # Upload to CSG Hub server
         ensure_directory_exists_remove(datasource_csg_hub_server_dir)
-        insert_datasource_run_task_log_error(collection_task.task_uid,
+        insert_datasource_run_task_log_info(collection_task.task_uid,
                                             f"Starting upload csg hub-server the task[{collection_task.task_uid}]...")
         exporter = load_exporter(
             export_path=datasource_temp_json_dir,
```

data_celery/formatify/tasks.py

Lines changed: 44 additions & 12 deletions
```diff
@@ -26,9 +26,10 @@
     get_endpoint,
     REPO_TYPE_DATASET)
 from data_engine.utils.env import GetHubEndpoint
+
+
 @celery_app.task
 def format_task(task_id: int, user_name: str, user_token: str):
-
     tmp_path: str = None
     db_session: Session = None
     format_task: DataFormatTask = None
@@ -49,9 +50,10 @@ def format_task(task_id: int, user_name: str, user_token: str):
             user_token=user_token,
         )
         ingester_result = ingesterCSGHUB.ingest()
-        insert_formatity_task_log_info(format_task.task_uid, f"Download directory completed... Directory address:{ingester_result}")
+        insert_formatity_task_log_info(format_task.task_uid,
+                                       f"Download directory completed... Directory address:{ingester_result}")
         work_dir = Path(tmp_path).joinpath('work')
-        file_bool = search_files(tmp_path,[format_task.from_data_type])
+        file_bool = search_files(tmp_path, [format_task.from_data_type])
 
         if not file_bool:
             insert_formatity_task_log_info(format_task.task_uid, f"file not found. task ended....")
@@ -78,10 +80,44 @@ def format_task(task_id: int, user_name: str, user_token: str):
             path_is_dir=True,
             work_dir=str(work_dir)
         )
-        exporter.export_large_folder()
-        insert_formatity_task_log_info(format_task.task_uid, 'Upload completed...')
-        format_task.task_status = DataFormatTaskStatusEnum.COMPLETED.value
-        db_session.commit()
+
+        # Upload retry logic: at most 3 attempts
+        max_retry_count = 3
+        upload_success = False
+        retry_count = 0
+
+        for attempt in range(max_retry_count):
+            try:
+                if attempt == 0:
+                    insert_formatity_task_log_info(format_task.task_uid, f'Starting file upload (attempt 1)...')
+                else:
+                    retry_count += 1
+                    insert_formatity_task_log_info(format_task.task_uid,
+                                                   f'Retrying file upload (attempt {retry_count + 1} of {max_retry_count})...')
+
+                exporter.export_large_folder()
+                upload_success = True
+                insert_formatity_task_log_info(format_task.task_uid, 'Upload completed...')
+                break
+            except Exception as e:
+                error_msg = f'File upload failed (attempt {attempt + 1}): {str(e)}'
+                insert_formatity_task_log_error(format_task.task_uid, error_msg)
+                logger.error(f"Task {format_task.task_uid} upload attempt {attempt + 1} failed: {error_msg}")
+
+                # If this was the last attempt, abort the task
+                if attempt == max_retry_count - 1:
+                    final_error_msg = f'File upload failed after {max_retry_count} attempts; aborting task. Error: {str(e)}'
+                    insert_formatity_task_log_error(format_task.task_uid, final_error_msg)
+                    logger.error(
+                        f"Task {format_task.task_uid} upload failed after {max_retry_count} attempts: {final_error_msg}")
+                    format_task.task_status = DataFormatTaskStatusEnum.ERROR.value
+                    db_session.commit()
+                    raise RuntimeError(final_error_msg)
+
+        # If the upload succeeded, mark the task as completed
+        if upload_success:
+            format_task.task_status = DataFormatTaskStatusEnum.COMPLETED.value
+            db_session.commit()
         pass
     except Exception as e:
         traceback.print_exc()
@@ -245,22 +281,20 @@ def convert_ppt_to_markdown(file_path: str, task_uid):
 
 from typing import List, Dict, Tuple
 
-def search_files(folder_path: str, types: List[int]) -> Tuple[bool, List[str]]:
 
+def search_files(folder_path: str, types: List[int]) -> Tuple[bool, List[str]]:
     type_map: Dict[int, List[str]] = {
         0: ['.ppt', '.pptx'],  # PPT
         1: ['.doc', '.docx'],  # Word
         3: ['.xls', '.xlsx']   # Excel
     }
 
-
     target_extensions = set()
     for file_type in types:
         if file_type in type_map:
             for ext in type_map[file_type]:
                 target_extensions.add(ext.lower())
 
-
     found_files: List[str] = []
 
     def traverse(current_path: str) -> None:
@@ -286,8 +320,6 @@ def traverse(current_path: str) -> None:
         except Exception as e:
             print(f"Processing path {current_path} error: {str(e)}")
 
-
     traverse(folder_path)
 
-
     return bool(len(found_files) > 0)
```
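
The inline retry loop above tracks both `attempt` and a redundant `retry_count`. (Note also that `search_files` is annotated as returning `Tuple[bool, List[str]]` but returns only a bool; callers here use it as a truthy flag.) A minimal generic sketch of the same bounded-retry pattern follows; the name `retry_call` and the optional backoff are assumptions, not part of this repo.

```python
import time
from typing import Callable, Optional, TypeVar

T = TypeVar("T")

def retry_call(fn: Callable[[], T],
               max_attempts: int = 3,
               backoff_seconds: float = 0.0,
               on_error: Optional[Callable[[int, Exception], None]] = None) -> T:
    """Run fn up to max_attempts times; re-raise the last exception on failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception as exc:
            if on_error:
                on_error(attempt, exc)                  # log each failed attempt
            if attempt == max_attempts:
                raise                                   # out of attempts: propagate
            if backoff_seconds:
                time.sleep(backoff_seconds * attempt)   # optional linear backoff
```

With such a helper the task body would collapse to a single `retry_call(exporter.export_large_folder, max_attempts=3, on_error=...)` call, with the ERROR status and final log message set in the surrounding `except` block.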

data_engine/core/executor.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -74,7 +74,7 @@ def __init__(
 
         # Check if this is the specific output_only tool by tool name
         tool_name = getattr(self.cfg, 'tool_name', '')
-        is_specific_output_only = (tool_name == 'template_executor_06_common_internal')
+        is_specific_output_only = (tool_name == 'smoltalk_chinese_common_internal')
 
         # normal_logic
         if not is_specific_output_only:
@@ -127,7 +127,8 @@
             branch = self.cfg.branch,
             user_name=self.user_name,
             user_token=self.user_token,
-            work_dir=self.work_dir
+            work_dir=self.work_dir,
+            auto_version=True  # Pipeline jobs use auto versioning
         )
 
         # setup tracer
```

data_engine/core/ray_executor.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -101,7 +101,8 @@ def __init__(self, cfg=None):
             branch=self.cfg.branch,
             user_name=self.user_name,
             user_token=self.user_token,
-            work_dir=self.work_dir
+            work_dir=self.work_dir,
+            auto_version=True  # Pipeline jobs use auto versioning
        )
 
         # setup tracer
```
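
Both executors now pass the same flag to `load_exporter`, and the mongo task above likewise gained `path_is_dir=True`. For reference, a sketch of the call shape these diffs converge on; the argument names are taken verbatim from the diffs, while the surrounding variables are placeholders.

```python
# Call shape shared by executor.py and ray_executor.py after this commit.
# Argument names come from the diffs; everything else here is a placeholder.
exporter = load_exporter(
    export_path=export_path,    # directory holding the processed output
    branch=cfg.branch,          # target CSG Hub branch
    user_name=user_name,
    user_token=user_token,
    work_dir=work_dir,
    auto_version=True,          # pipeline jobs opt in to automatic versioning
)
exporter.export_large_folder()  # upload, as in the mongo/formatify tasks
```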
