Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
752ab1a
Add python profiling utilities
Eden-D-Zhang Oct 14, 2025
29436b6
Add profiling to query functions
Eden-D-Zhang Oct 14, 2025
4d19c81
Remove `CLP_ENABLE_PROFILING` environment variable check
Eden-D-Zhang Oct 14, 2025
b5bdcf1
Lint
Eden-D-Zhang Oct 14, 2025
02b29fb
Merge branch 'main' of https://github.com/y-scope/clp into pyinstrume…
Eden-D-Zhang Oct 14, 2025
6f65035
Clean up docstrings
Eden-D-Zhang Oct 14, 2025
c2be1f6
Remove __all__
Eden-D-Zhang Oct 15, 2025
a677177
Merge branch 'main' of https://github.com/Eden-D-Zhang/clp into pyins…
Eden-D-Zhang Oct 15, 2025
cc85f93
Remove unnecessary function
Eden-D-Zhang Oct 15, 2025
9810d31
Address review
Eden-D-Zhang Oct 16, 2025
b013429
Merge branch 'main' into pyinstrument_profile
Eden-D-Zhang Oct 16, 2025
a33a9c1
Address review
Eden-D-Zhang Oct 16, 2025
99d134b
Lint
Eden-D-Zhang Oct 16, 2025
61e8332
Merge branch 'main' of https://github.com/y-scope/clp into pyinstrume…
Eden-D-Zhang Oct 21, 2025
ab6ee4f
Update lock file
Eden-D-Zhang Oct 21, 2025
cbb32de
Update dependencies and lock file
Eden-D-Zhang Oct 21, 2025
8447f12
Merge branch 'main' of https://github.com/y-scope/clp into pyinstrume…
Eden-D-Zhang Oct 21, 2025
159e645
Address review
Eden-D-Zhang Oct 21, 2025
4080df6
Delete file
Eden-D-Zhang Oct 21, 2025
7df8e09
Merge branch 'main' into pyinstrument_profile
Eden-D-Zhang Oct 21, 2025
4089eb0
Change constant name
Eden-D-Zhang Oct 21, 2025
11641e2
Merge branch 'pyinstrument_profile' of https://github.com/Eden-D-Zhan…
Eden-D-Zhang Oct 21, 2025
9437246
Change typing imports according to coderabbit
Eden-D-Zhang Oct 21, 2025
ada4073
Lint
Eden-D-Zhang Oct 21, 2025
e94d2cc
Add clp_logging logger, type annotations
Eden-D-Zhang Oct 23, 2025
8beeda2
Merge branch 'main' of https://github.com/y-scope/clp into pyinstrume…
Eden-D-Zhang Oct 23, 2025
e6dfbea
Lint
Eden-D-Zhang Oct 23, 2025
022a7ab
Merge branch 'main' into pyinstrument_profile
Eden-D-Zhang Oct 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 263 additions & 0 deletions components/clp-py-utils/clp_py_utils/profiling_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
"""
Profiling utilities for CLP query execution performance analysis.
This module provides lightweight profiling decorators using pyinstrument.
Profile outputs include:
- HTML files with interactive flame graphs and call trees
- Text summaries showing call hierarchy and timing
"""

import datetime
import functools
import inspect
import logging
import os
from pathlib import Path
from typing import Any, Callable, Optional, Tuple, TypeVar

from pyinstrument import Profiler

logger = logging.getLogger(__name__)

F = TypeVar("F", bound=Callable[..., Any])

PROFILING_INTERVAL = 0.001


def profile(
    section_name: Optional[str] = None,
    job_id_param: Optional[str] = None,
    task_id_param: Optional[str] = None,
) -> Callable[[F], F]:
    """
    Decorator for profiling function execution with automatic context extraction.
    Output files are written to $CLP_LOGS_DIR/profiles/ (e.g., clp-package/var/log/query_worker/
    profiles/).
    :param section_name: Override for profile section name. If None, uses function name.
    :param job_id_param: Parameter name to extract job_id from (default: "job_id").
    Can use dot notation for attributes, e.g., "job.id"
    :param task_id_param: Parameter name to extract task_id from (default: "task_id").
    Can use dot notation for attributes, e.g., "task.id"
    :return: Decorated function with profiling capabilities
    """

    def decorator(func: F) -> F:
        name = section_name or func.__name__
        is_async = inspect.iscoroutinefunction(func)

        if is_async:

            @functools.wraps(func)
            async def async_wrapper(*args, **kwargs):
                if not _is_profiling_enabled():
                    return await func(*args, **kwargs)

                # Profiling enabled: extract context and profile execution
                job_id, task_id = _extract_context_from_args(
                    func, args, kwargs, job_id_param, task_id_param
                )

                profiler = Profiler(interval=PROFILING_INTERVAL)
                try:
                    profiler.start()
                except RuntimeError as e:
                    # pyinstrument allows only one active profiler per thread; skip
                    # profiling this call rather than failing the wrapped function.
                    if "already a profiler running" in str(e):
                        # Parametric logging (not an f-string) so formatting is deferred
                        # until the record is actually emitted (Ruff G004).
                        logger.debug(
                            "Skipping nested profiling for %s (parent profiler already active)",
                            name,
                        )
                        return await func(*args, **kwargs)
                    raise

                try:
                    return await func(*args, **kwargs)
                finally:
                    # Always stop and persist the profile, even if the call raised.
                    profiler.stop()
                    _save_profile(profiler, name, job_id, task_id)

            return async_wrapper  # type: ignore

        else:

            @functools.wraps(func)
            def sync_wrapper(*args, **kwargs):
                if not _is_profiling_enabled():
                    return func(*args, **kwargs)

                # Profiling enabled: extract context and profile execution
                job_id, task_id = _extract_context_from_args(
                    func, args, kwargs, job_id_param, task_id_param
                )

                profiler = Profiler(interval=PROFILING_INTERVAL)
                try:
                    profiler.start()
                except RuntimeError as e:
                    # Skip profiling this function to avoid nested-profiler conflicts.
                    if "already a profiler running" in str(e):
                        logger.debug(
                            "Skipping nested profiling for %s (parent profiler already active)",
                            name,
                        )
                        return func(*args, **kwargs)
                    raise

                try:
                    return func(*args, **kwargs)
                finally:
                    # Always stop and persist the profile, even if the call raised.
                    profiler.stop()
                    _save_profile(profiler, name, job_id, task_id)

            return sync_wrapper  # type: ignore

    return decorator


def _extract_context_from_args(
func: Callable,
args: tuple,
kwargs: dict,
Comment on lines +123 to +124
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Use more precise type annotations for args and kwargs.

The args and kwargs parameters could use more specific generic types for better type safety.

Apply this diff:

 def _extract_context_from_args(
     func: Callable,
-    args: tuple,
-    kwargs: dict,
+    args: tuple[Any, ...],
+    kwargs: dict[str, Any],
     job_id_param: str = "job_id",
     task_id_param: str = "task_id",
 ) -> tuple[str, str]:
🤖 Prompt for AI Agents
In components/clp-py-utils/clp_py_utils/profiling_utils.py around lines 122 to
123, the params args: tuple and kwargs: dict are too generic; change them to
precise generic typing (e.g., args: Tuple[Any, ...] and kwargs: Dict[str, Any]
or Mapping[str, Any]) and add the necessary typing imports (Any, Tuple, Dict or
Mapping) at the top of the file; update any related type hints or docstrings to
match the new annotations.

job_id_param: Optional[str] = None,
task_id_param: Optional[str] = None,
) -> Tuple[str, str]:
"""
Extract job_id and task_id from function arguments.
:param func: The function being profiled
:param args: Positional arguments passed to the function
:param kwargs: Keyword arguments passed to the function
:param job_id_param: Name/path of the parameter containing job_id (default: "job_id").
:param task_id_param: Name/path of the parameter containing task_id (default: "task_id").
:return: Tuple of (job_id, task_id) as strings. Empty strings if not found.
"""
job_id = ""
task_id = ""

try:
# Get function signature
sig = inspect.signature(func)
param_names = list(sig.parameters.keys())

def extract_value(param_spec: str) -> str:
"""Extract value from parameter, supporting dot notation for attributes."""
if not param_spec:
return ""

# Split on '.' to handle attribute access
parts = param_spec.split(".")
param_name = parts[0]

# Find the parameter value
value = None
if param_name in kwargs:
value = kwargs[param_name]
elif param_name in param_names:
idx = param_names.index(param_name)
if idx < len(args):
value = args[idx]

if value is None:
return ""

# Navigate through attributes if dot notation was used
for attr_name in parts[1:]:
if hasattr(value, attr_name):
value = getattr(value, attr_name)
else:
return ""

return str(value)

Comment on lines +146 to +175
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Optional: support dict-style context extraction

If callers pass dict-like objects, walk keys as well as attributes.

Apply this diff:

-            for attr_name in parts[1:]:
-                if hasattr(value, attr_name):
-                    value = getattr(value, attr_name)
-                else:
-                    return ""
+            for attr_name in parts[1:]:
+                if isinstance(value, dict) and attr_name in value:
+                    value = value[attr_name]
+                elif hasattr(value, attr_name):
+                    value = getattr(value, attr_name)
+                else:
+                    return ""
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def extract_value(param_spec: str) -> str:
"""Extract value from parameter, supporting dot notation for attributes."""
if not param_spec:
return ""
# Split on '.' to handle attribute access
parts = param_spec.split(".")
param_name = parts[0]
# Find the parameter value
value = None
if param_name in kwargs:
value = kwargs[param_name]
elif param_name in param_names:
idx = param_names.index(param_name)
if idx < len(args):
value = args[idx]
if value is None:
return ""
# Navigate through attributes if dot notation was used
for attr_name in parts[1:]:
if hasattr(value, attr_name):
value = getattr(value, attr_name)
else:
return ""
return str(value)
def extract_value(param_spec: str) -> str:
"""Extract value from parameter, supporting dot notation for attributes."""
if not param_spec:
return ""
# Split on '.' to handle attribute access
parts = param_spec.split(".")
param_name = parts[0]
# Find the parameter value
value = None
if param_name in kwargs:
value = kwargs[param_name]
elif param_name in param_names:
idx = param_names.index(param_name)
if idx < len(args):
value = args[idx]
if value is None:
return ""
# Navigate through attributes if dot notation was used
- for attr_name in parts[1:]:
- if hasattr(value, attr_name):
- value = getattr(value, attr_name)
- else:
for attr_name in parts[1:]:
if isinstance(value, dict) and attr_name in value:
value = value[attr_name]
elif hasattr(value, attr_name):
value = getattr(value, attr_name)
else:
return ""
return str(value)

# Extract job_id
job_id_key = job_id_param or "job_id"
job_id = extract_value(job_id_key)

# Extract task_id
task_id_key = task_id_param or "task_id"
task_id = extract_value(task_id_key)

except Exception as e:
logger.debug(f"Failed to extract context from {func.__name__}: {e}")

return job_id, task_id
Comment on lines +180 to +183
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Avoid blind except Exception; narrow to expected errors and fix logging style.

-    except Exception as e:
-        logger.debug(f"Failed to extract context from {func.__name__}: {e}")
+    except (AttributeError, KeyError, IndexError, ValueError, TypeError) as e:
+        logger.debug("Failed to extract context from %s: %s", func.__name__, e)
🧰 Tools
🪛 Ruff (0.14.1)

178-178: Do not catch blind exception: Exception

(BLE001)


179-179: Logging statement uses f-string

(G004)

🤖 Prompt for AI Agents
In components/clp-py-utils/clp_py_utils/profiling_utils.py around lines 178 to
181, replace the blind "except Exception" with a narrow exception handler for
the expected extraction errors (e.g., except (AttributeError, KeyError,
IndexError, ValueError) as e) and update the logging call to use structured
logging parameters and include exception info (for example logger.debug("Failed
to extract context from %s: %s", func.__name__, e, exc_info=True)) so only
relevant errors are caught and the log contains proper exception details.



def _is_profiling_enabled() -> bool:
"""
Check if profiling is enabled.
TODO: Add `CLPConfig` mechanism to enable/disable profiling for each component.
:return: False
"""
return False

Comment on lines 186 to 194
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major

Enable profiling via env and make interval tunable

Avoid code edits to toggle profiling. Read from env and allow interval override.

Apply this diff:

-PROFILING_INTERVAL = 0.001
+PROFILING_INTERVAL = float(os.getenv("CLP_PROFILING_INTERVAL", "0.005"))
@@
-def _is_profiling_enabled() -> bool:
+def _is_profiling_enabled() -> bool:
@@
-    :return: False
+    :return: True if CLP_ENABLE_PROFILING is truthy, else False
@@
-    return False
+    return os.getenv("CLP_ENABLE_PROFILING", "").lower() in {"1", "true", "yes", "on"}

Also applies to: 25-25

🤖 Prompt for AI Agents
In components/clp-py-utils/clp_py_utils/profiling_utils.py around lines 191-199
(and also apply same change at line 25), replace the hardcoded return False with
logic that reads an environment variable to enable profiling (e.g.
CLP_PROFILING_ENABLED or CLP_PROFILING) and returns True for common truthy
values ("1","true","yes") and False otherwise; also make the profiling interval
tunable by reading an environment variable (e.g. CLP_PROFILING_INTERVAL_SECONDS)
elsewhere where the interval is used (or add a helper _get_profiling_interval()
that reads the env and falls back to the current default), parsing ints safely
and using a sensible default on parse failure.


def _save_profile(
    profiler: Profiler, section_name: str, job_id: str = "", task_id: str = ""
) -> None:
    """
    Save profiler output to HTML and text formats. Generates .html and .txt files.
    :param profiler: The pyinstrument Profiler object containing profiling data
    :param section_name: Name identifying this profiling section
    :param job_id: Optional job identifier for filename
    :param task_id: Optional task identifier for filename
    """
    try:
        # Get the session for logging
        session = profiler.last_session
        if not session:
            logger.debug("No profiling session for %s", section_name)
            return

        duration = session.duration
        sample_count = session.sample_count

        # Timezone-aware timestamp (Ruff DTZ005) so filenames sort consistently
        # across hosts regardless of local timezone.
        timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S_%f")
        filename_parts = [section_name]

        if job_id:
            filename_parts.append(f"job{job_id}")
        if task_id:
            filename_parts.append(f"task{task_id}")

        filename_parts.append(timestamp)
        base_filename = "_".join(filename_parts)

        # Path(None) raises TypeError, so fall back safely when CLP_LOGS_DIR is unset
        # instead of crashing the profiled call.
        logs_dir = os.getenv("CLP_LOGS_DIR")
        if not logs_dir:
            logger.warning("CLP_LOGS_DIR is not set; writing profiles to CWD ./profiles")
            output_dir = Path.cwd() / "profiles"
        else:
            output_dir = Path(logs_dir) / "profiles"
        output_dir.mkdir(exist_ok=True, parents=True)

        # Save HTML with interactive visualization
        html_path = output_dir / f"{base_filename}.html"
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(profiler.output_html())

        # Save human-readable text summary with call hierarchy
        txt_path = output_dir / f"{base_filename}.txt"
        with open(txt_path, "w", encoding="utf-8") as f:
            # Header
            f.write("=" * 80 + "\n")
            f.write("CLP Query Profiling Report (pyinstrument)\n")
            f.write(f"Section: {section_name}\n")
            if job_id:
                f.write(f"Job ID: {job_id}\n")
            if task_id:
                f.write(f"Task ID: {task_id}\n")
            f.write(f"Timestamp: {timestamp}\n")
            f.write("=" * 80 + "\n\n")
            f.write(profiler.output_text(unicode=True, color=False))

        # Parametric logging (Ruff G004) instead of f-strings.
        logger.info(
            "Profile saved: %s (duration=%.6fs, samples=%s) HTML=%s, TXT=%s",
            section_name,
            duration,
            sample_count,
            html_path,
            txt_path,
        )

    except Exception:
        # logger.exception records the traceback (replaces error(..., exc_info=True),
        # Ruff G201). Broad catch is deliberate: saving a profile must never crash
        # the profiled workload.
        logger.exception("Failed to save profile for %s", section_name)
1 change: 1 addition & 0 deletions components/clp-py-utils/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ boto3 = "^1.35.81"
mariadb = "~1.0.11"
mysql-connector-python = "^8.2.0"
pydantic = "^2.11.9"
pyinstrument = "^5.0.0"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we plan to include the profiling feature in the formal releases? if not, maybe we should put this into a

[dependency-groups]
dev = [
    ...
]

that said, the pyinstrument package is only ~270KB (https://pypi.org/project/pyinstrument/#files) so it shouldn't hurt to be bundled into the release.

python-dotenv = "^1.0.1"
python-Levenshtein = "~0.22"
sqlalchemy = "~2.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
WorkerConfig,
)
from clp_py_utils.clp_logging import set_logging_level
from clp_py_utils.profiling_utils import profile
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Decorator usage looks correct; add explicit context mapping and env check

  • Order is right (@profile wraps the function that Celery registers).
  • Consider passing job_id_param="job_id", task_id_param="task_id" for clarity.
  • Ensure CLP_LOGS_DIR is set in worker env; otherwise saving profiles will fail once enabled (see _save_profile).

Also applies to: 183-184

🤖 Prompt for AI Agents
In
components/job-orchestration/job_orchestration/executor/query/extract_stream_task.py
around lines 17 and 183-184, update the @profile decorator usage to pass
explicit context mapping by adding job_id_param="job_id" and
task_id_param="task_id" so the profiler knows which function args map to
job/task IDs, and add an environment check (e.g., verify CLP_LOGS_DIR is present
and non-empty in os.environ) before attempting to save profiles in _save_profile
to avoid failures when the worker env is missing CLP_LOGS_DIR.

from clp_py_utils.s3_utils import (
generate_s3_virtual_hosted_style_url,
get_credential_env_vars,
Expand Down Expand Up @@ -179,6 +180,7 @@ def _make_command_and_env_vars(


@app.task(bind=True)
@profile(section_name="extract_stream_task")
def extract_stream(
self: Task,
job_id: str,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
WorkerConfig,
)
from clp_py_utils.clp_logging import set_logging_level
from clp_py_utils.profiling_utils import profile
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Good instrumentation; consider explicit job/task mapping and env readiness

  • Order of decorators is correct.
  • Optionally pass job_id_param="job_id", task_id_param="task_id" to be explicit.
  • Ensure CLP_LOGS_DIR exists in the search worker environment when profiling is enabled.

Also applies to: 169-170

🤖 Prompt for AI Agents
In
components/job-orchestration/job_orchestration/executor/query/fs_search_task.py
around line 15 (and also at lines 169-170), the profile decorator is used but
not explicit about which args map to job and task and the code assumes
CLP_LOGS_DIR exists; update the profile decorator calls to include
job_id_param="job_id", task_id_param="task_id" for clarity, and add an
environment-readiness check that ensures CLP_LOGS_DIR exists (create the
directory if missing or raise a clear error) before profiling is enabled so
profiling can write logs reliably.

from clp_py_utils.s3_utils import generate_s3_virtual_hosted_style_url, get_credential_env_vars
from clp_py_utils.sql_adapter import SQL_Adapter
from job_orchestration.executor.query.celery import app
Expand Down Expand Up @@ -165,6 +166,7 @@ def _make_command_and_env_vars(


@app.task(bind=True)
@profile(section_name="search_task")
def search(
self: Task,
job_id: str,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
)
from clp_py_utils.core import read_yaml_config_file
from clp_py_utils.decorators import exception_default_value
from clp_py_utils.profiling_utils import profile
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Scheduler profiling: sane targets; add guardrails for async sections and interval

  • Targets are sensible; job_id_param="job.id" is correct.
  • For long‑lived async functions (acquire_reducer_for_job, handle_finished_search_job), consider:
    • Using a coarser interval (e.g., 5–10ms) to reduce overhead/trace size.
    • Making interval and enablement configurable via env to avoid code changes when toggling.

Also applies to: 547-555, 569-611, 898-902

🤖 Prompt for AI Agents
In
components/job-orchestration/job_orchestration/scheduler/query/query_scheduler.py
around lines 49 (and also apply similar changes at 547-555, 569-611, 898-902),
the profiler is applied with defaults that are too fine-grained for long‑lived
async sections; update profiling to use a coarser sampling interval (e.g.,
5–10ms) and add guardrails: make both profiling enablement and interval
configurable via environment variables (e.g., JOB_SCHEDULER_PROFILING_ENABLED,
JOB_SCHEDULER_PROFILE_INTERVAL_MS) and wrap or conditionally apply the profiler
only when enabled so async functions like acquire_reducer_for_job and
handle_finished_search_job use the higher interval (or are skipped) to reduce
overhead and trace size.

from clp_py_utils.sql_adapter import SQL_Adapter
from job_orchestration.executor.query.extract_stream_task import extract_stream
from job_orchestration.executor.query.fs_search_task import search
Expand Down Expand Up @@ -543,6 +544,7 @@ def get_task_group_for_job(
raise NotImplementedError(error_msg)


@profile(section_name="scheduler_dispatch_job", job_id_param="job.id")
def dispatch_query_job(
db_conn,
job: QueryJob,
Expand All @@ -564,6 +566,7 @@ def dispatch_query_job(
job.state = InternalJobState.RUNNING


@profile(section_name="scheduler_acquire_reducer", job_id_param="job.id")
async def acquire_reducer_for_job(job: SearchJob):
reducer_host: Optional[str] = None
reducer_port: Optional[int] = None
Expand Down Expand Up @@ -892,6 +895,7 @@ def found_max_num_latest_results(
return max_timestamp_in_remaining_archives <= min_timestamp_in_top_results


@profile(section_name="scheduler_handle_finished_search", job_id_param="job.id")
async def handle_finished_search_job(
db_conn, job: SearchJob, task_results: Optional[Any], results_cache_uri: str
) -> None:
Expand Down
Loading