feat: add accuracy patch to behave scenarios

rgonalo · rgonalo · commit 38d2a9735898 · 2025-10-28T14:01:24.000+01:00
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,13 +6,16 @@ v3.6.0
 
 *Release date: In development*
 
-- Removed xcuitest deprecated get_window_size() method and replaced it with get_window_rect() in all mobile actions
+- Remove deprecated xcuitest `get_window_size` method and replace it with `get_window_rect` in all mobile actions
 - Add text comparison methods based on AI libraries. To use them, install the `ai` extra dependency:
 
 .. code:: console
 
     $ pip install toolium[ai]
 
+- Add accuracy tag to behave scenarios using `@accuracy_<percent>_<retries>`, e.g. `@accuracy_80_10` for 80%
+  accuracy with 10 retries
+
 v3.5.0
 ------
 
diff --git a/docs/ai_utils.rst b/docs/ai_utils.rst
@@ -88,3 +88,31 @@ you have (direct OpenAI access or Azure OpenAI):
     AZURE_OPENAI_API_KEY=<your_api_key>
     AZURE_OPENAI_ENDPOINT=<your_endpoint>
     OPENAI_API_VERSION=<your_api_version>
+
+
+Accuracy tag for Behave scenarios
+---------------------------------
+
+You can use accuracy tags in your Behave scenarios to specify the desired accuracy level and number of retries for
+scenarios that involve AI-generated content. The accuracy tag follows the format `@accuracy_<percent>_<retries>`,
+where `<percent>` is the desired accuracy percentage (0-100) and `<retries>` is the number of retries to achieve that
+accuracy. For example, `@accuracy_80_10` indicates that the scenario is executed 10 times and at least 80% of those
+executions must pass.
+
+.. code-block:: gherkin
+
+    @accuracy_80_10
+    Scenario: Validate AI-generated response accuracy
+      Given the AI model generates a response
+      When the user sends a message
+      Then the AI response should be accurate
+
+When a scenario is tagged with an accuracy tag, Toolium will automatically execute the scenario multiple times. If the
+scenario does not meet the specified accuracy after the given number of retries, it will be marked as failed.
+
+Other examples of accuracy tags:
+
+- `@accuracy_percent_85_retries_10`: 85% accuracy, 10 retries
+- `@accuracy_percent_75`: 75% accuracy, default 10 retries
+- `@accuracy_90_5`: 90% accuracy, 5 retries
+- `@accuracy_80`: 80% accuracy, default 10 retries
+- `@accuracy`: default 90% accuracy, 10 retries
diff --git a/docs/bdd_integration.rst b/docs/bdd_integration.rst
@@ -70,14 +70,18 @@ Toolium defines three tags to configure driver:
 * :code:`@reset_driver`: identifies a scenario that should not reuse the driver. The browser will be closed and reopen before this test.
 * :code:`@no_driver`: identifies a scenario or feature that should not start the driver, typically in API tests.
 
-And other scenario tags to configure Appium tests:
+It also supports other scenario tags to configure Appium tests:
 
 * :code:`@no_reset_app`: mobile app will not be reset before test (i.e. no-reset Appium capability is set to true)
 * :code:`@reset_app`: mobile app will be reset before test (i.e. no-reset and full-reset Appium capabilities are set to false)
 * :code:`@full_reset_app`: mobile app will be full reset before test (i.e. full-reset Appium capability is set to true)
 * :code:`@android_only`: identifies a scenario that should only be executed in Android
 * :code:`@ios_only`: identifies a scenario that should only be executed in iOS
 
+It also supports an accuracy tag for AI-related scenarios:
+
+* :code:`@accuracy_<percent>_<retries>`: identifies a scenario that is executed `<retries>` times and should pass in at least `<percent>`% of the executions (see :ref:`Accuracy tag for Behave scenarios <ai_utils>` for more details)
+
 Behave - Dynamic Environment
 ----------------------------
 
diff --git a/toolium/behave/environment.py b/toolium/behave/environment.py
@@ -23,13 +23,14 @@
 
 from behave.api.async_step import use_or_create_async_context
 
-from toolium.utils import dataset
+from toolium.behave.env_utils import DynamicEnvironment
 from toolium.config_files import ConfigFiles
 from toolium.driver_wrapper import DriverWrappersPool
 from toolium.jira import add_jira_status, change_all_jira_status, save_jira_conf
-from toolium.visual_test import VisualTest
 from toolium.pageelements import PageElement
-from toolium.behave.env_utils import DynamicEnvironment
+from toolium.utils import dataset
+from toolium.utils.ai_utils.accuracy import patch_feature_scenarios_with_accuracy
+from toolium.visual_test import VisualTest
 
 
 def before_all(context):
@@ -86,6 +87,9 @@ def before_feature(context, feature):
     context.feature_storage = dict()
     context.storage = collections.ChainMap(context.feature_storage, context.run_storage)
 
+    # Patch scenarios when accuracy tags are present
+    patch_feature_scenarios_with_accuracy(context, feature)
+
     # Behave dynamic environment
     context.dyn_env.get_steps_from_feature_description(feature.description)
     context.dyn_env.execute_before_feature_steps(context)
diff --git a/toolium/test/utils/ai_utils/test_accuracy.py b/toolium/test/utils/ai_utils/test_accuracy.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+"""
+Copyright 2025 Telefónica Innovación Digital, S.L.
+This file is part of Toolium.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import pytest
+
+from toolium.utils.ai_utils.accuracy import get_accuracy_and_retries_from_tags
+
+
+accuracy_tags_examples = (
+    (['accuracy'], {'accuracy': 0.9, 'retries': 10}),  # default accuracy and retries
+    (['accuracy_85'], {'accuracy': 0.85, 'retries': 10}),
+    (['accuracy_percent_80'], {'accuracy': 0.8, 'retries': 10}),
+    (['accuracy_75_5'], {'accuracy': 0.75, 'retries': 5}),
+    (['accuracy_percent_70_retries_3'], {'accuracy': 0.7, 'retries': 3}),
+    (['other_tag', 'accuracy_95_15'], {'accuracy': 0.95, 'retries': 15}),
+    (['no_accuracy_tag'], None),
+    (['accuracy_85', 'accuracy_95_15'], {'accuracy': 0.85, 'retries': 10}),  # first accuracy tag wins
+    ([], None),
+)
+
+
+@pytest.mark.parametrize('tags, expected_accuracy_data', accuracy_tags_examples)
+def test_get_accuracy_and_retries_from_tags(tags, expected_accuracy_data):
+    """Check the accuracy data parsed from each list of behave tags (None when no accuracy tag is present)."""
+    assert get_accuracy_and_retries_from_tags(tags) == expected_accuracy_data
diff --git a/toolium/utils/ai_utils/accuracy.py b/toolium/utils/ai_utils/accuracy.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+"""
+Copyright 2025 Telefónica Innovación Digital, S.L.
+This file is part of Toolium.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import functools
+import re
+from behave.model import ScenarioOutline
+from behave.model_core import Status
+
+
+def get_accuracy_and_retries_from_tags(tags):
+    """
+    Extract accuracy and retries values from the first tag that is a complete accuracy tag.
+    The whole tag must match (case-insensitive); tags that merely start with 'accuracy' are ignored.
+    Examples of valid tags:
+    - accuracy
+    - accuracy_90
+    - accuracy_percent_90
+    - accuracy_90_10
+    - accuracy_percent_90_retries_10
+
+    :param tags: behave tags
+    :return: dict with 'accuracy' (percent / 100) and 'retries' keys if a tag matches, None otherwise
+    """
+    accuracy_regex = re.compile(r'^accuracy(?:_(?:percent_)?(\d+)(?:_retries_(\d+)|_(\d+))?)?', re.IGNORECASE)
+    for tag in tags:
+        match = accuracy_regex.fullmatch(tag)
+        if match:
+            percent, named_retries, short_retries = match.groups()
+            # Retries come from '_retries_<n>' or the short '_<n>' form; defaults: 90% accuracy, 10 retries
+            retries = named_retries or short_retries
+            return {'accuracy': int(percent) / 100.0 if percent else 0.9, 'retries': int(retries) if retries else 10}
+    return None
+
+
+def patch_scenario_with_accuracy(context, scenario, accuracy=0.9, retries=10):
+    """Monkey-patches :func:`~behave.model.Scenario.run()` to execute multiple times and calculate the accuracy of the
+    results.
+
+    This is helpful when the test is flaky due to unreliable test infrastructure or when the application under test is
+    AI based and its responses may vary slightly.
+
+    :param context: behave context
+    :param scenario: Scenario or ScenarioOutline to patch
+    :param accuracy: Minimum ratio (0-1) of passed executions required to consider the scenario as passed
+    :param retries: Number of times the scenario will be executed (values lower than 1 are executed once)
+    """
+    def scenario_run_with_accuracy(context, scenario_run, scenario, *args, **kwargs):
+        # Execute at least once so the accuracy ratio never divides by zero (e.g. tag accuracy_80_0)
+        executions = max(retries, 1)
+        passed_executions = 0
+        for retry in range(1, executions + 1):
+            failed = scenario_run(*args, **kwargs)  # Original run method returns true when failed
+            passed_executions += 0 if failed else 1
+            status = "FAILED" if failed else "PASSED"
+            print(f"ACCURACY SCENARIO {status}: retry {retry}/{executions}")
+            context.logger.info(f"Accuracy scenario {status} (retry {retry}/{executions})")
+
+        # Calculate scenario accuracy
+        scenario_accuracy = passed_executions / executions
+        has_passed = scenario_accuracy >= accuracy
+        final_status = 'PASSED' if has_passed else 'FAILED'
+        # Report the comparison that actually holds: a failed scenario is below the required accuracy
+        comparison = f"{scenario_accuracy} {'>=' if has_passed else '<'} {accuracy}"
+        print(f"\nACCURACY SCENARIO {final_status}: {executions} retries, accuracy {comparison}")
+        final_message = f"Accuracy scenario {final_status} after {executions} retries with accuracy {comparison}"
+
+        # Set final scenario status
+        if has_passed:
+            context.logger.info(final_message)
+            scenario.set_status(Status.passed)
+        else:
+            context.logger.error(final_message)
+            scenario.set_status(Status.failed)
+        return not has_passed  # Run method returns true when failed
+
+    scenario_run = scenario.run
+    scenario.run = functools.partial(scenario_run_with_accuracy, context, scenario_run, scenario)
+
+
+def patch_scenario_from_tags(context, scenario):
+    """Apply the accuracy patch to a scenario whose effective tags include an accuracy tag.
+
+    :param context: behave context
+    :param scenario: behave scenario to inspect and, when tagged, patch
+    """
+    parsed_tag = get_accuracy_and_retries_from_tags(scenario.effective_tags)
+    if not parsed_tag:
+        return
+    patch_scenario_with_accuracy(context, scenario, accuracy=parsed_tag['accuracy'], retries=parsed_tag['retries'])
+
+
+def patch_feature_scenarios_with_accuracy(context, feature):
+    """Apply the accuracy patch to every scenario of a feature that carries an accuracy tag. Any error raised while
+    patching is logged instead of propagated.
+
+    :param context: behave context
+    :param feature: behave feature
+    """
+    try:
+        for feature_item in feature.scenarios:
+            # Scenario outlines are expanded so each generated scenario is patched on its own
+            is_outline = isinstance(feature_item, ScenarioOutline)
+            for scenario in (feature_item.scenarios if is_outline else [feature_item]):
+                patch_scenario_from_tags(context, scenario)
+    except Exception as e:
+        # Log error but do not fail the execution to avoid errors in before feature method
+        context.logger.error(f"Error applying accuracy policy: {e}")
diff --git a/toolium/utils/ai_utils/text_similarity.py b/toolium/utils/ai_utils/text_similarity.py
@@ -106,8 +106,8 @@ def get_text_similarity_with_openai(text, expected_text, azure=False):
         explanation = response['explanation']
     except (KeyError, ValueError, TypeError) as e:
         raise ValueError(f"Unexpected response format from OpenAI: {response}") from e
-    logger.info(f"OpenAI LLM similarity: {similarity} between '{text}' and '{expected_text}'."
-                f" LLM explanation: {explanation}")
+    logger.info(f"OpenAI LLM similarity: {similarity} between '{text}' and '{expected_text}'")
+    logger.info(f"OpenAI LLM explanation: {explanation}")
     return similarity