feat: accuracy tag uses stored data in retries

rgonalo · rgonalo · commit e1d86fd3efa9 · 2025-11-11T16:12:51.000+01:00
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -15,6 +15,8 @@ v3.6.0
 
 - Add accuracy tag to behave scenarios using `@accuracy_<percent>_<retries>`, e.g. `@accuracy_80_10` for 80%
   accuracy with 10 retries
+- Add accuracy data tag to behave scenarios using `@accuracy_data_<suffix>`, e.g. `@accuracy_data_greetings` to
+  run each retry with the data stored in the context storage under the `accuracy_data_greetings` key
 
 v3.5.0
 ------
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-3.6.0.dev1
+3.6.0.dev2
diff --git a/docs/ai_utils.rst b/docs/ai_utils.rst
@@ -90,8 +90,11 @@ you have (direct OpenAI access or Azure OpenAI):
     OPENAI_API_VERSION=<your_api_version>
 
 
-Accuracy tag for Behave scenarios
----------------------------------
+Accuracy tags for Behave scenarios
+----------------------------------
+
+@accuracy
+~~~~~~~~~
 
 You can use accuracy tags in your Behave scenarios to specify the desired accuracy level and number of retries for
 scenarios that involve AI-generated content. The accuracy tag follows the format `@accuracy_<percent>_<retries>`,
@@ -116,3 +119,41 @@ Other examples of accuracy tags:
 - `@accuracy_90_5`: 90% accuracy, 5 retries
 - `@accuracy_80`: 80% accuracy, default 10 retries
 - `@accuracy`: default 90% accuracy, 10 retries
+
+@accuracy_data
+~~~~~~~~~~~~~~
+
+You can also use accuracy data tags in your Behave scenarios to run each retry with a different entry of a stored
+accuracy data set. The accuracy data tag follows the format `@accuracy_data_<suffix>`, where `<suffix>` is a custom
+suffix to identify the accuracy data set. For example, `@accuracy_data_greetings` indicates that the scenario should
+use the accuracy data set with the suffix "greetings".
+
+.. code-block:: bash
+
+    @accuracy_80
+    @accuracy_data_greetings
+    Scenario: Validate AI-generated greeting responses
+      Given the AI model generates a greeting response
+      When the user sends "[CONTEXT:accuracy_retry_data.question]" message
+      Then the AI greeting response should be similar to "[CONTEXT:accuracy_retry_data.answer]"
+
+When a scenario is tagged with an accuracy data tag, Toolium will automatically use the specified accuracy data set for
+each retry. This allows you to run the same scenario with a different data input on every retry. Accuracy data should
+be stored beforehand in the context storage under the key `accuracy_data_<suffix>`, where `<suffix>` matches the one
+used in the tag. For example, for the tag `@accuracy_data_greetings`, the accuracy data should be stored under the key
+`accuracy_data_greetings`. The accuracy data must be a list of dictionaries, where each dictionary contains the data
+for a specific retry.
+
+For example, to store accuracy data for greetings, you can do the following in a step definition:
+
+.. code-block:: python
+
+    accuracy_data_greetings = [
+        {"question": "Hello", "answer": "Hi, how can I help you?"},
+        {"question": "Good morning", "answer": "Good morning! What can I do for you today?"},
+        {"question": "Hey there", "answer": "Hey! How can I assist you?"}
+    ]
+    context.storage["accuracy_data_greetings"] = accuracy_data_greetings
+
+This way, on retry N Toolium uses item N of the list, starting again from the first item when there are more retries
+than items. If the accuracy tag does not set the number of retries, the number of stored items is used by default.
diff --git a/toolium/behave/environment.py b/toolium/behave/environment.py
@@ -87,13 +87,13 @@ def before_feature(context, feature):
     context.feature_storage = dict()
     context.storage = collections.ChainMap(context.feature_storage, context.run_storage)
 
-    # Patch scenarios when accuracy tags are present
-    patch_feature_scenarios_with_accuracy(context, feature)
-
     # Behave dynamic environment
     context.dyn_env.get_steps_from_feature_description(feature.description)
     context.dyn_env.execute_before_feature_steps(context)
 
+    # Patch scenarios when accuracy tags are present
+    patch_feature_scenarios_with_accuracy(context, feature)
+
 
 def before_scenario(context, scenario):
     """Scenario initialization
diff --git a/toolium/test/utils/ai_utils/test_accuracy.py b/toolium/test/utils/ai_utils/test_accuracy.py
@@ -16,9 +16,11 @@
 limitations under the License.
 """
 
+import mock
 import pytest
 
-from toolium.utils.ai_utils.accuracy import get_accuracy_and_retries_from_tags
+from toolium.utils.ai_utils.accuracy import (get_accuracy_and_retries_from_tags, get_accuracy_data_suffix_from_tags,
+                                             get_accuracy_data, store_retry_data)
 
 
 accuracy_tags_examples = (
@@ -31,10 +33,116 @@
     (['no_accuracy_tag'], None),
     (['accuracy_85', 'accuracy_95_15'], {'accuracy': 0.85, 'retries': 10}),
     ([], None),
+    (['accuracy_data', 'accuracy_data_50'], None),
+    (['accuracy_75_5', 'accuracy_data'], {'accuracy': 0.75, 'retries': 5})
 )
 
 
 @pytest.mark.parametrize('tags, expected_accuracy_data', accuracy_tags_examples)
 def test_get_accuracy_and_retries_from_tags(tags, expected_accuracy_data):
     accuracy_data = get_accuracy_and_retries_from_tags(tags)
     assert accuracy_data == expected_accuracy_data
+
+
+accuracy_tags_examples = (
+    (['accuracy'], 8, {'accuracy': 0.9, 'retries': 8}),
+    (['accuracy_85'], 8, {'accuracy': 0.85, 'retries': 8}),
+    (['accuracy_percent_80'], 8, {'accuracy': 0.8, 'retries': 8}),
+    (['accuracy_75_5'], 8, {'accuracy': 0.75, 'retries': 5}),
+    (['accuracy_percent_70_retries_3'], 8, {'accuracy': 0.7, 'retries': 3}),
+    (['other_tag', 'accuracy_95_15'], 8, {'accuracy': 0.95, 'retries': 15}),
+    (['no_accuracy_tag'], 8, None),
+    (['accuracy_85', 'accuracy_95_15'], 8, {'accuracy': 0.85, 'retries': 8}),
+    ([], 8, None),
+    (['accuracy_data', 'accuracy_data_50'], 8, None),
+    (['accuracy_75_5', 'accuracy_data'], 8, {'accuracy': 0.75, 'retries': 5})
+)
+
+
+@pytest.mark.parametrize('tags, data_length, expected_accuracy_data', accuracy_tags_examples)
+def test_get_accuracy_and_retries_from_tags_with_data_length(tags, data_length, expected_accuracy_data):
+    accuracy_data = get_accuracy_and_retries_from_tags(tags, accuracy_data_len=data_length)
+    assert accuracy_data == expected_accuracy_data
+
+
+accuracy_data_suffix_examples = (
+    (['accuracy_data'], ''),
+    (['accuracy_data_balance'], '_balance'),
+    (['accuracy_data_balance_50'], '_balance_50'),
+    (['other_tag', 'accuracy_data_transactions'], '_transactions'),
+    (['no_accuracy_data_tag'], ''),
+    (['accuracy', 'accuracy_85', 'accuracy_percent_70_retries_3'], ''),
+    ([], '')
+)
+
+
+@pytest.mark.parametrize('tags, expected_data_suffix', accuracy_data_suffix_examples)
+def test_get_accuracy_data_suffix_from_tags(tags, expected_data_suffix):
+    data_suffix = get_accuracy_data_suffix_from_tags(tags)
+    assert data_suffix == expected_data_suffix
+
+
+@pytest.fixture
+def context():
+    context = mock.MagicMock()
+    context.storage = {
+        'accuracy_data': [{'question': 'Q1', 'answer': 'A1'},
+                              {'question': 'Q2', 'answer': 'A2'}],
+        'accuracy_data_balance': [{'question': 'Q1 balance', 'answer': 'A1'},
+                                      {'question': 'Q2 balance', 'answer': 'A2'}],
+        'accuracy_data_wrong': "This is not a list"
+    }
+    return context
+
+
+def test_get_accuracy_data_default_suffix(context):
+    data = get_accuracy_data(context, data_key_suffix='')
+    assert data == [{'question': 'Q1', 'answer': 'A1'},
+                    {'question': 'Q2', 'answer': 'A2'}]
+
+
+def test_get_accuracy_data_with_suffix(context):
+    data = get_accuracy_data(context, data_key_suffix='_balance')
+    assert data == [{'question': 'Q1 balance', 'answer': 'A1'},
+                    {'question': 'Q2 balance', 'answer': 'A2'}]
+
+
+def test_get_accuracy_data_with_nonexistent_suffix(context):
+    data = get_accuracy_data(context, data_key_suffix='_nonexistent')
+    assert data == []
+
+
+def test_get_accuracy_data_with_wrong_type(context):
+    with pytest.raises(AssertionError) as exc:
+        get_accuracy_data(context, data_key_suffix='_wrong')
+    assert str(exc.value) == 'Expected accuracy_data_wrong must be a list: This is not a list'
+
+
+def test_store_retry_data_default_suffix(context):
+    store_retry_data(context, retry=1, data_key_suffix='')
+    assert context.storage['accuracy_retry_data'] == {'question': 'Q1', 'answer': 'A1'}
+    assert context.storage['accuracy_retry_index'] == 1
+    store_retry_data(context, retry=2, data_key_suffix='')
+    assert context.storage['accuracy_retry_data'] == {'question': 'Q2', 'answer': 'A2'}
+    assert context.storage['accuracy_retry_index'] == 2
+    store_retry_data(context, retry=3, data_key_suffix='')
+    assert context.storage['accuracy_retry_data'] == {'question': 'Q1', 'answer': 'A1'}
+    assert context.storage['accuracy_retry_index'] == 3
+
+
+def test_store_retry_data_with_suffix(context):
+    store_retry_data(context, retry=1, data_key_suffix='_balance')
+    assert context.storage['accuracy_retry_data'] == {'question': 'Q1 balance', 'answer': 'A1'}
+    assert context.storage['accuracy_retry_index'] == 1
+    store_retry_data(context, retry=2, data_key_suffix='_balance')
+    assert context.storage['accuracy_retry_data'] == {'question': 'Q2 balance', 'answer': 'A2'}
+    assert context.storage['accuracy_retry_index'] == 2
+    store_retry_data(context, retry=3, data_key_suffix='_balance')
+    assert context.storage['accuracy_retry_data'] == {'question': 'Q1 balance', 'answer': 'A1'}
+    assert context.storage['accuracy_retry_index'] == 3
+
+
+def test_store_retry_data_with_nonexistent_suffix(context):
+    store_retry_data(context, retry=1, data_key_suffix='_nonexistent')
+    assert context.storage['accuracy_retry_data'] is None
+    assert context.storage['accuracy_retry_index'] == 1
diff --git a/toolium/utils/ai_utils/accuracy.py b/toolium/utils/ai_utils/accuracy.py
@@ -22,32 +22,56 @@
 from behave.model_core import Status
 
 
-def get_accuracy_and_retries_from_tags(tags):
+def get_accuracy_and_retries_from_tags(tags, accuracy_data_len=None):
     """
     Extract accuracy and retries values from accuracy tag using regex.
     Examples of valid tags:
+
     - accuracy
     - accuracy_90
     - accuracy_percent_90
     - accuracy_90_10
     - accuracy_percent_90_retries_10
 
     :param tags: behave tags
+    :param accuracy_data_len: length of accuracy data if available
     :return: dict with 'accuracy' and 'retries' keys if tag matches, None otherwise
     """
-    accuracy_regex = re.compile(r'^accuracy(?:_(?:percent_)?(\d+)(?:_retries_(\d+)|_(\d+))?)?', re.IGNORECASE)
+    # Default values: 90% accuracy, 10 retries
+    default_accuracy = 0.9
+    default_retries = accuracy_data_len if accuracy_data_len is not None else 10
+    accuracy_regex = re.compile(r'^accuracy(?!_data)(?:_(?:percent_)?(\d+)(?:_retries_(\d+)|_(\d+))?)?$', re.IGNORECASE)
     for tag in tags:
         match = accuracy_regex.search(tag)
         if match:
-            # Default values: 90% accuracy, 10 retries
-            accuracy_percent = (int(match.group(1)) / 100.0) if match.group(1) else 0.9
+            accuracy_percent = (int(match.group(1)) / 100.0) if match.group(1) else default_accuracy
             # Check if retries is in group 2 (accuracy_percent_90_retries_10) or group 3 (accuracy_90_10)
-            retries = int(match.group(2)) if match.group(2) else (int(match.group(3)) if match.group(3) else 10)
+            retries = int(match.group(2)) if match.group(2) else (int(match.group(3)) if match.group(3)
+                                                                  else default_retries)
             return {'accuracy': accuracy_percent, 'retries': retries}
     return None
 
 
-def patch_scenario_with_accuracy(context, scenario, accuracy=0.9, retries=10):
+def get_accuracy_data_suffix_from_tags(tags):
+    """
+    Extract accuracy data suffix from behave tags
+
+    Examples of valid tags:
+    - accuracy_data
+    - accuracy_data_suffix
+
+    :param tags: behave tags
+    :return: data_key_suffix if tag matches, empty string otherwise
+    """
+    accuracy_data_regex = re.compile(r'^accuracy_data(?:_(\w+))?', re.IGNORECASE)
+    for tag in tags:
+        match = accuracy_data_regex.search(tag)
+        if match:
+            return f'_{match.group(1)}' if match.group(1) else ''
+    return ''
+
+
+def patch_scenario_with_accuracy(context, scenario, data_key_suffix, accuracy=0.9, retries=10):
     """Monkey-patches :func:`~behave.model.Scenario.run()` to execute multiple times and calculate the accuracy of the
     results.
 
@@ -56,20 +80,23 @@ def patch_scenario_with_accuracy(context, scenario, accuracy=0.9, retries=10):
 
     :param context: behave context
     :param scenario: Scenario or ScenarioOutline to patch
+    :param data_key_suffix: Suffix to identify accuracy data in context storage
     :param accuracy: Minimum accuracy required to consider the scenario as passed
     :param retries: Number of times the scenario will be executed
     """
     def scenario_run_with_accuracy(context, scenario_run, scenario, *args, **kwargs):
         # Execute the scenario multiple times and count passed executions
         passed_executions = 0
         for retry in range(1, retries+1):
+            context.logger.info(f"Running accuracy scenario retry ({retry}/{retries})")
+            store_retry_data(context, retry, data_key_suffix)
             if not scenario_run(*args, **kwargs):
                 passed_executions += 1
                 status = "PASSED"
             else:
                 status = "FAILED"
             print(f"ACCURACY SCENARIO {status}: retry {retry}/{retries}")
-            context.logger.info(f"Accuracy scenario {status} (retry {retry}/{retries})")
+            context.logger.info(f"Accuracy scenario retry {status} ({retry}/{retries})")
 
         # Calculate scenario accuracy
         scenario_accuracy = passed_executions / retries
@@ -98,10 +125,13 @@ def patch_scenario_from_tags(context, scenario):
     :param context: behave context
     :param scenario: behave scenario
     """
-    accuracy_data = get_accuracy_and_retries_from_tags(scenario.effective_tags)
-    if accuracy_data:
-        patch_scenario_with_accuracy(context, scenario, accuracy=accuracy_data['accuracy'],
-                                     retries=accuracy_data['retries'])
+    data_key_suffix = get_accuracy_data_suffix_from_tags(scenario.effective_tags)
+    accuracy_data = get_accuracy_data(context, data_key_suffix)
+    accuracy_data_len = len(accuracy_data) if accuracy_data else None
+    accuracy_settings = get_accuracy_and_retries_from_tags(scenario.effective_tags, accuracy_data_len)
+    if accuracy_settings:
+        patch_scenario_with_accuracy(context, scenario, data_key_suffix,
+                                     accuracy=accuracy_settings['accuracy'], retries=accuracy_settings['retries'])
 
 
 def patch_feature_scenarios_with_accuracy(context, feature):
@@ -120,3 +150,33 @@ def patch_feature_scenarios_with_accuracy(context, feature):
     except Exception as e:
         # Log error but do not fail the execution to avoid errors in before feature method
         context.logger.error(f"Error applying accuracy policy: {e}")
+
+
+def get_accuracy_data(context, data_key_suffix):
+    """
+    Retrieve accuracy data stored in context.
+
+    :param context: behave context
+    :param data_key_suffix: Suffix to identify accuracy data in context storage
+    :return: accuracy data list
+    """
+    accuracy_data_key = f"accuracy_data{data_key_suffix}"
+    accuracy_data = context.storage.get(accuracy_data_key, [])
+    assert isinstance(accuracy_data, list), f"Expected {accuracy_data_key} must be a list: {accuracy_data}"
+    return accuracy_data
+
+
+def store_retry_data(context, retry, data_key_suffix):
+    """Extract data to be used in current retry execution and store it in accuracy_retry_data key in context.
+    context.storage['accuracy_data{data_key_suffix}'] is expected to be a list of dicts with data for each retry.
+    Retry data is selected using modulo to avoid index errors, so retries can be greater than available data.
+
+    :param context: behave context
+    :param retry: current retry index (starting from 1)
+    :param data_key_suffix: Suffix to identify accuracy data in context storage
+    """
+    accuracy_data = get_accuracy_data(context, data_key_suffix)
+    retry_data = accuracy_data[(retry - 1) % len(accuracy_data)] if accuracy_data else None
+    context.storage["accuracy_retry_data"] = retry_data
+    context.storage["accuracy_retry_index"] = retry
+    context.logger.info(f"Stored accuracy data for retry {retry} in accuracy_retry_data: {retry_data}")