
Commit 6458c76

feat: add Inconsistent Description heuristic

Signed-off-by: Amine <[email protected]>
1 parent e07fff6

File tree

15 files changed (+292, −123 lines)

src/macaron/ai/README.md

Lines changed: 7 additions & 7 deletions

@@ -5,13 +5,13 @@ This module provides the foundation for interacting with Large Language Models (
 ## Module Components
 
 - **ai_client.py**
-  Defines the abstract [`AIClient`](./ai_client.py) class. This class handles the initialization of LLM configuration from the defaults and serves as the base for all specific AI client implementations.
+  Defines the abstract [`AIClient`](./clients/base.py) class. This class handles the initialization of LLM configuration from the defaults and serves as the base for all specific AI client implementations.
 
 - **openai_client.py**
-  Implements the [`OpenAiClient`](./openai_client.py) class, a concrete subclass of [`AIClient`](./ai_client.py). This client interacts with OpenAI-like APIs by sending requests using HTTP and processing the responses. It also validates and structures responses using the tools provided.
+  Implements the [`OpenAiClient`](./clients/openai_client.py) class, a concrete subclass of [`AIClient`](./ai_client.py). This client interacts with OpenAI-like APIs by sending requests using HTTP and processing the responses. It also validates and structures responses using the tools provided.
 
 - **ai_factory.py**
-  Contains the [`AIClientFactory`](./ai_factory.py) class, which is responsible for reading provider configuration from the defaults and creating the correct AI client instance.
+  Contains the [`AIClientFactory`](./clients/base.py) class, which is responsible for reading provider configuration from the defaults and creating the correct AI client instance.
 
 - **ai_tools.py**
   Offers utility functions such as `structure_response` to assist with parsing and validating the JSON response returned by an LLM. These functions ensure that responses conform to a given Pydantic model for easier downstream processing.
@@ -22,11 +22,11 @@ This module provides the foundation for interacting with Large Language Models (
    The module reads the LLM configuration from the application defaults (using the `defaults` module). Make sure that the `llm` section in your configuration includes valid settings such as `enabled`, `api_key`, `api_endpoint`, `model`, and `context_window`.
 
 2. **Creating a Client:**
-   Use the [`AIClientFactory`](./ai_factory.py) to create an AI client instance. The factory checks the configured provider and returns a client (e.g., an instance of [`OpenAiClient`](./openai_client.py)) that can be used to invoke the LLM.
+   Use the [`AIClientFactory`](./clients/ai_factory.py) to create an AI client instance. The factory checks the configured provider and returns a client (e.g., an instance of [`OpenAiClient`](./clients/openai_client.py)) that can be used to invoke the LLM.
 
    Example:
    ```py
-   from macaron.ai.ai_factory import AIClientFactory
+   from macaron.ai.clients.ai_factory import AIClientFactory
 
    factory = AIClientFactory()
    client = factory.create_client(system_prompt="You are a helpful assistant.")
@@ -45,6 +45,6 @@ This module provides the foundation for interacting with Large Language Models (
 ## Extensibility
 
 The design of the AI module is provider-agnostic. To add support for additional LLM providers:
-- Implement a new client by subclassing [`AIClient`](./ai_client.py).
-- Add the new client to the [`PROVIDER_MAPPING`](./ai_factory.py).
+- Implement a new client by subclassing [`AIClient`](./clients/base.py).
+- Add the new client to the [`PROVIDER_MAPPING`](./clients/ai_factory.py).
 - Update the configuration defaults accordingly.

src/macaron/ai/ai_tools.py

Lines changed: 6 additions & 16 deletions

@@ -5,32 +5,26 @@
 import json
 import logging
 import re
-from typing import TypeVar
-
-from pydantic import BaseModel, ValidationError
-
-T = TypeVar("T", bound=BaseModel)
+from typing import Any
 
 logger: logging.Logger = logging.getLogger(__name__)
 
 
-def structure_response(response_text: str, response_model: type[T]) -> T | None:
+def extract_json(response_text: str) -> Any:
     """
-    Structure and parse the response from the LLM.
+    Parse the response from the LLM.
 
     If raw JSON parsing fails, attempts to extract a JSON object from text.
 
     Parameters
     ----------
     response_text: str
         The response text from the LLM.
-    response_model: Type[T]
-        The Pydantic model to structure the response against.
 
     Returns
     -------
-    T | None
-        The structured Pydantic model instance.
+    dict[str, Any] | None
+        The structured JSON object.
     """
     try:
         data = json.loads(response_text)
@@ -46,8 +40,4 @@ def structure_response(response_text: str, response_model: type[T]) -> T | None:
             logger.debug("Failed to parse extracted JSON: %s", e)
             return None
 
-    try:
-        return response_model.model_validate(data)
-    except ValidationError as e:
-        logger.debug("Validation failed against response model: %s", e)
-        return None
+    return data
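The replacement helper drops Pydantic validation entirely and returns a plain parsed JSON object. A minimal standalone sketch of the behavior `extract_json` implements; the fallback regex is not visible in this hunk, so the pattern used below is an assumption:

```python
import json
import re
from typing import Any


def extract_json(response_text: str) -> Any:
    """Parse JSON from an LLM response, falling back to an embedded {...} span."""
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        # Assumed fallback: take the outermost brace-delimited span in the text.
        match = re.search(r"\{.*\}", response_text, re.DOTALL)
        if match is None:
            return None
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            return None


print(extract_json('Sure! Here is the result: {"consistent": true}'))  # → {'consistent': True}
```

Callers that previously received a validated model instance now get a `dict` (or `None`) and must validate the schema themselves, e.g. via the `response_format` parameter added to the clients.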

src/macaron/ai/clients/__init__.py

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module provides a mapping of AI client providers to their respective client classes."""
+
+from macaron.ai.clients.base import AIClient
+from macaron.ai.clients.openai_client import OpenAiClient
+
+PROVIDER_MAPPING: dict[str, type[AIClient]] = {"openai": OpenAiClient}

src/macaron/ai/ai_factory.py renamed to src/macaron/ai/clients/ai_factory.py

Lines changed: 13 additions & 21 deletions

@@ -5,8 +5,8 @@
 
 import logging
 
-from macaron.ai.ai_client import AIClient
-from macaron.ai.openai_client import OpenAiClient
+from macaron.ai.clients import PROVIDER_MAPPING
+from macaron.ai.clients.base import AIClient
 from macaron.config.defaults import defaults
 from macaron.errors import ConfigurationError
 
@@ -16,37 +16,30 @@
 class AIClientFactory:
     """Factory to create AI clients based on provider configuration."""
 
-    PROVIDER_MAPPING: dict[str, type[AIClient]] = {"openai": OpenAiClient}
-
     def __init__(self) -> None:
         """
         Initialize the AI client.
 
         The LLM configuration is read from defaults.
         """
-        self.defaults = self._load_defaults()
+        self.params = self._load_defaults()
 
-    def _load_defaults(self) -> dict:
+    def _load_defaults(self) -> dict | None:
         section_name = "llm"
         default_values = {
             "enabled": False,
             "provider": "",
             "api_key": "",
             "api_endpoint": "",
             "model": "",
-            "context_window": 10000,
         }
 
         if defaults.has_section(section_name):
             section = defaults[section_name]
             default_values["enabled"] = section.getboolean("enabled", default_values["enabled"])
-            default_values["api_key"] = str(section.get("api_key", default_values["api_key"])).strip().lower()
-            default_values["api_endpoint"] = (
-                str(section.get("api_endpoint", default_values["api_endpoint"])).strip().lower()
-            )
-            default_values["model"] = str(section.get("model", default_values["model"])).strip().lower()
-            default_values["provider"] = str(section.get("provider", default_values["provider"])).strip().lower()
-            default_values["context_window"] = section.getint("context_window", 10000)
+            for key, default_value in default_values.items():
+                if isinstance(default_value, str):
+                    default_values[key] = str(section.get(key, default_value)).strip().lower()
 
         if default_values["enabled"]:
             for key, value in default_values.items():
@@ -59,12 +52,11 @@ def _load_defaults(self) -> dict:
 
     def create_client(self, system_prompt: str) -> AIClient | None:
         """Create an AI client based on the configured provider."""
-        client_class = self.PROVIDER_MAPPING.get(self.defaults["provider"])
-        if client_class is None:
-            logger.error("Provider '%s' is not supported.", self.defaults["provider"])
+        if not self.params or not self.params["enabled"]:
             return None
-        return client_class(system_prompt, self.defaults)
 
-    def list_available_providers(self) -> list[str]:
-        """List all registered providers."""
-        return list(self.PROVIDER_MAPPING.keys())
+        client_class = PROVIDER_MAPPING.get(self.params["provider"])
+        if client_class is None:
+            logger.error("Provider '%s' is not supported.", self.params["provider"])
+            return None
+        return client_class(system_prompt, self.params)

src/macaron/ai/ai_client.py renamed to src/macaron/ai/clients/base.py

Lines changed: 8 additions & 16 deletions

@@ -3,36 +3,28 @@
 
 """This module defines the abstract AIClient class for implementing AI clients."""
 
-import logging
 from abc import ABC, abstractmethod
-from typing import Any, TypeVar
-
-from pydantic import BaseModel
-
-T = TypeVar("T", bound=BaseModel)
-
-logger: logging.Logger = logging.getLogger(__name__)
 
 
 class AIClient(ABC):
     """This abstract class is used to implement ai clients."""
 
-    def __init__(self, system_prompt: str, defaults: dict) -> None:
+    def __init__(self, system_prompt: str, params: dict) -> None:
         """
         Initialize the AI client.
 
         The LLM configuration is read from defaults.
         """
         self.system_prompt = system_prompt
-        self.defaults = defaults
+        self.params = params
 
     @abstractmethod
     def invoke(
         self,
         user_prompt: str,
         temperature: float = 0.2,
-        structured_output: type[T] | None = None,
-    ) -> Any:
+        response_format: dict | None = None,
+    ) -> dict:
         """
         Invoke the LLM and optionally validate its response.
 
@@ -42,12 +34,12 @@ def invoke(
             The user prompt to send to the LLM.
         temperature: float
             The temperature for the LLM response.
-        structured_output: Optional[Type[T]]
-            The Pydantic model to validate the response against. If provided, the response will be parsed and validated.
+        response_format: dict | None
            The json schema to validate the response against.
 
         Returns
         -------
-        Optional[T | str]
-            The validated Pydantic model instance if `structured_output` is provided,
+        dict
+            The validated schema if `response_format` is provided,
             or the raw string response if not.
         """

src/macaron/ai/openai_client.py renamed to src/macaron/ai/clients/openai_client.py

Lines changed: 11 additions & 22 deletions

@@ -8,8 +8,8 @@
 
 from pydantic import BaseModel
 
-from macaron.ai.ai_client import AIClient
-from macaron.ai.ai_tools import structure_response
+from macaron.ai.ai_tools import extract_json
+from macaron.ai.clients.base import AIClient
 from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError
 from macaron.util import send_post_http_raw
 
@@ -25,7 +25,7 @@ def invoke(
         self,
         user_prompt: str,
         temperature: float = 0.2,
-        structured_output: type[T] | None = None,
+        response_format: dict | None = None,
         max_tokens: int = 4000,
         timeout: int = 30,
     ) -> Any:
@@ -38,8 +38,8 @@ def invoke(
             The user prompt to send to the LLM.
         temperature: float
             The temperature for the LLM response.
-        structured_output: Optional[Type[T]]
-            The Pydantic model to validate the response against. If provided, the response will be parsed and validated.
+        response_format: dict
+            The json schema to validate the response against. If provided, the response will be parsed and validated.
         max_tokens: int
             The maximum number of tokens for the LLM response.
         timeout: int
@@ -56,28 +56,21 @@
         HeuristicAnalyzerValueError
             If there is an error in parsing or validating the response.
         """
-        if not self.defaults["enabled"]:
+        if not self.params["enabled"]:
             raise ConfigurationError("AI client is not enabled. Please check your configuration.")
 
-        if len(user_prompt.split()) > self.defaults["context_window"]:
-            logger.warning(
-                "User prompt exceeds context window (%s words). "
-                "Truncating the prompt to fit within the context window.",
-                self.defaults["context_window"],
-            )
-            user_prompt = " ".join(user_prompt.split()[: self.defaults["context_window"]])
-
-        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.defaults["api_key"]}"}
+        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.params['api_key']}"}
         payload = {
-            "model": self.defaults["model"],
+            "model": self.params["model"],
             "messages": [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}],
+            "response_format": response_format,
             "temperature": temperature,
            "max_tokens": max_tokens,
        }
 
        try:
            response = send_post_http_raw(
-                url=self.defaults["api_endpoint"], json_data=payload, headers=headers, timeout=timeout
+                url=self.params["api_endpoint"], json_data=payload, headers=headers, timeout=timeout
            )
            if not response:
                raise HeuristicAnalyzerValueError("No response received from the LLM.")
@@ -89,11 +82,7 @@
             logger.info("LLM call token usage: %s", usage_str)
 
             message_content = response_json["choices"][0]["message"]["content"]
-
-            if not structured_output:
-                logger.debug("Returning raw message content (no structured output requested).")
-                return message_content
-            return structure_response(message_content, structured_output)
+            return extract_json(message_content)
 
         except Exception as e:
             logger.error("Error during LLM invocation: %s", e)
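The request body now forwards `response_format` straight through to the OpenAI-compatible endpoint. A sketch of how such a JSON-schema response format is commonly shaped for these APIs; the field names follow the OpenAI structured-outputs convention and the schema content is a hypothetical example for this heuristic, neither is taken from the diff itself:

```python
# Hypothetical schema asking the model for a single boolean verdict.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "inconsistent_description",
        "schema": {
            "type": "object",
            "properties": {"inconsistent": {"type": "boolean"}},
            "required": ["inconsistent"],
        },
    },
}

# Shape of the payload the client builds (model name is a placeholder).
payload = {
    "model": "some-model",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Is this package description consistent with its code?"},
    ],
    "response_format": response_format,
    "temperature": 0.2,
    "max_tokens": 4000,
}
```

Note that passing `"response_format": None` when no schema is requested relies on the endpoint ignoring a null field; endpoints that reject nulls would need the key omitted instead.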

src/macaron/ai/prompts/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

src/macaron/ai/schemas/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

src/macaron/config/defaults.ini

Lines changed: 0 additions & 3 deletions

@@ -647,6 +647,3 @@ api_key =
 api_endpoint =
 # The model to use for the LLM service.
 model =
-# The context window size for the LLM service.
-# This is the maximum number of tokens that the LLM can process in a single request.
-context_window = 10000

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

Lines changed: 3 additions & 0 deletions

@@ -52,6 +52,9 @@ class Heuristics(str, Enum):
     #: Indicates that the package contains some code that doesn't match the docstrings.
     MATCHING_DOCSTRINGS = "matching_docstrings"
 
+    #: Indicates that the package description is inconsistent.
+    INCONSISTENT_DESCRIPTION = "inconsistent_description"
+
 
 class HeuristicResult(str, Enum):
     """Result type indicating the outcome of a heuristic."""
