Skip to content

Commit e1d86fd

Browse files
committed
feat: accuracy tag uses stored data in retries
1 parent 1bcea5e commit e1d86fd

File tree

6 files changed

+229
-18
lines changed

6 files changed

+229
-18
lines changed

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ v3.6.0
1515
1616
- Add accuracy tag to behave scenarios using `@accuracy_<percent>_<retries>`, e.g. `@accuracy_80_10` for 80%
1717
accuracy with 10 retries
18+
- Add accuracy data tag to behave scenarios using `@accuracy_data_<suffix>`, e.g. `@accuracy_data_greetings` for
19+
accuracy data with greetings suffix
1820

1921
v3.5.0
2022
------

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.6.0.dev1
1+
3.6.0.dev2

docs/ai_utils.rst

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,11 @@ you have (direct OpenAI access or Azure OpenAI):
9090
OPENAI_API_VERSION=<your_api_version>
9191
9292
93-
Accuracy tag for Behave scenarios
94-
---------------------------------
93+
Accuracy tags for Behave scenarios
94+
----------------------------------
95+
96+
@accuracy
97+
~~~~~~~~~
9598

9699
You can use accuracy tags in your Behave scenarios to specify the desired accuracy level and number of retries for
97100
scenarios that involve AI-generated content. The accuracy tag follows the format `@accuracy_<percent>_<retries>`,
@@ -116,3 +119,41 @@ Other examples of accuracy tags:
116119
- `@accuracy_90_5`: 90% accuracy, 5 retries
117120
- `@accuracy_80`: 80% accuracy, default 10 retries
118121
- `@accuracy`: default 90% accuracy, 10 retries
122+
123+
@accuracy_data
124+
~~~~~~~~~~~~~~
125+
126+
You can also use accuracy data tags in your Behave scenarios to provide a different item of accuracy data to each retry.
127+
The accuracy data tag follows the format `@accuracy_data_<suffix>`, where `<suffix>` is a custom suffix to identify the
128+
accuracy data set. For example, `@accuracy_data_greetings` indicates that the scenario should use the accuracy data set
129+
with the suffix "greetings".
130+
131+
.. code-block:: bash
132+
133+
@accuracy_80
134+
@accuracy_data_greetings
135+
Scenario: Validate AI-generated greeting responses
136+
Given the AI model generates a greeting response
137+
When the user sends "[CONTEXT:accuracy_retry_data.question]" message
138+
Then the AI greeting response should be similar to "[CONTEXT:accuracy_retry_data.answer]"
139+
140+
When a scenario is tagged with an accuracy data tag, Toolium will automatically use the specified accuracy data set for
141+
each retry. This allows you to run the same scenario with varying data inputs. Accuracy data must be stored
142+
beforehand in the context storage under the key `accuracy_data_<suffix>`, where `<suffix>` matches the one used in the
143+
tag. For example, for the tag `@accuracy_data_greetings`, the accuracy data should be stored under the key
144+
`accuracy_data_greetings`. The accuracy data should be a list of dictionaries, where each dictionary contains the data
145+
for a specific retry.
146+
147+
For example, to store accuracy data for greetings, you can do the following in a step definition that runs before the scenario (for example, a before-feature step):
148+
149+
.. code-block:: python
150+
151+
accuracy_data_greetings = [
152+
{"question": "Hello", "answer": "Hi, how can I help you?"},
153+
{"question": "Good morning", "answer": "Good morning! What can I do for you today?"},
154+
{"question": "Hey there", "answer": "Hey! How can I assist you?"}
155+
]
156+
context.storage["accuracy_data_greetings"] = accuracy_data_greetings
157+
158+
This way, during each retry of the scenario, Toolium will use the corresponding data from the accuracy data set based on
159+
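the retry index. If there are more retries than items in the data set, the data set is reused from the beginning. If the accuracy tag does not specify the number of retries, the number of items in the data set is used as the number of retries.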

toolium/behave/environment.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,13 @@ def before_feature(context, feature):
8787
context.feature_storage = dict()
8888
context.storage = collections.ChainMap(context.feature_storage, context.run_storage)
8989

90-
# Patch scenarios when accuracy tags are present
91-
patch_feature_scenarios_with_accuracy(context, feature)
92-
9390
# Behave dynamic environment
9491
context.dyn_env.get_steps_from_feature_description(feature.description)
9592
context.dyn_env.execute_before_feature_steps(context)
9693

94+
# Patch scenarios when accuracy tags are present
95+
patch_feature_scenarios_with_accuracy(context, feature)
96+
9797

9898
def before_scenario(context, scenario):
9999
"""Scenario initialization

toolium/test/utils/ai_utils/test_accuracy.py

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616
limitations under the License.
1717
"""
1818

19+
import mock
1920
import pytest
2021

21-
from toolium.utils.ai_utils.accuracy import get_accuracy_and_retries_from_tags
22+
from toolium.utils.ai_utils.accuracy import (get_accuracy_and_retries_from_tags, get_accuracy_data_suffix_from_tags,
23+
get_accuracy_data, store_retry_data)
2224

2325

2426
accuracy_tags_examples = (
@@ -31,10 +33,116 @@
3133
(['no_accuracy_tag'], None),
3234
(['accuracy_85', 'accuracy_95_15'], {'accuracy': 0.85, 'retries': 10}),
3335
([], None),
36+
(['accuracy_data', 'accuracy_data_50'], None),
37+
(['accuracy_75_5', 'accuracy_data'], {'accuracy': 0.75, 'retries': 5})
3438
)
3539

3640

3741
@pytest.mark.parametrize('tags, expected_accuracy_data', accuracy_tags_examples)
3842
def test_get_accuracy_and_retries_from_tags(tags, expected_accuracy_data):
3943
accuracy_data = get_accuracy_and_retries_from_tags(tags)
4044
assert accuracy_data == expected_accuracy_data
45+
46+
47+
accuracy_tags_examples = (
48+
(['accuracy'], 8, {'accuracy': 0.9, 'retries': 8}),
49+
(['accuracy_85'], 8, {'accuracy': 0.85, 'retries': 8}),
50+
(['accuracy_percent_80'], 8, {'accuracy': 0.8, 'retries': 8}),
51+
(['accuracy_75_5'], 8, {'accuracy': 0.75, 'retries': 5}),
52+
(['accuracy_percent_70_retries_3'], 8, {'accuracy': 0.7, 'retries': 3}),
53+
(['other_tag', 'accuracy_95_15'], 8, {'accuracy': 0.95, 'retries': 15}),
54+
(['no_accuracy_tag'], 8, None),
55+
(['accuracy_85', 'accuracy_95_15'], 8, {'accuracy': 0.85, 'retries': 8}),
56+
([], 8, None),
57+
(['accuracy_data', 'accuracy_data_50'], 8, None),
58+
(['accuracy_75_5', 'accuracy_data'], 8, {'accuracy': 0.75, 'retries': 5})
59+
)
60+
61+
62+
@pytest.mark.parametrize('tags, data_length, expected_accuracy_data', accuracy_tags_examples)
63+
def test_get_accuracy_and_retries_from_tags_with_data_length(tags, data_length, expected_accuracy_data):
64+
accuracy_data = get_accuracy_and_retries_from_tags(tags, accuracy_data_len=data_length)
65+
assert accuracy_data == expected_accuracy_data
66+
67+
68+
accuracy_data_suffix_examples = (
69+
(['accuracy_data'], ''),
70+
(['accuracy_data_balance'], '_balance'),
71+
(['accuracy_data_balance_50'], '_balance_50'),
72+
(['other_tag', 'accuracy_data_transactions'], '_transactions'),
73+
(['no_accuracy_data_tag'], ''),
74+
(['accuracy', 'accuracy_85', 'accuracy_percent_70_retries_3'], ''),
75+
([], '')
76+
)
77+
78+
79+
@pytest.mark.parametrize('tags, expected_data_suffix', accuracy_data_suffix_examples)
80+
def test_get_accuracy_data_suffix_from_tags(tags, expected_data_suffix):
81+
data_suffix = get_accuracy_data_suffix_from_tags(tags)
82+
assert data_suffix == expected_data_suffix
83+
84+
85+
@pytest.fixture
86+
def context():
87+
context = mock.MagicMock()
88+
context.storage = {
89+
'accuracy_data': [{'question': 'Q1', 'answer': 'A1'},
90+
{'question': 'Q2', 'answer': 'A2'}],
91+
'accuracy_data_balance': [{'question': 'Q1 balance', 'answer': 'A1'},
92+
{'question': 'Q2 balance', 'answer': 'A2'}],
93+
'accuracy_data_wrong': "This is not a list"
94+
}
95+
return context
96+
97+
98+
def test_get_accuracy_data_default_suffix(context):
99+
data = get_accuracy_data(context, data_key_suffix='')
100+
assert data == [{'question': 'Q1', 'answer': 'A1'},
101+
{'question': 'Q2', 'answer': 'A2'}]
102+
103+
104+
def test_get_accuracy_data_with_suffix(context):
105+
data = get_accuracy_data(context, data_key_suffix='_balance')
106+
assert data == [{'question': 'Q1 balance', 'answer': 'A1'},
107+
{'question': 'Q2 balance', 'answer': 'A2'}]
108+
109+
110+
def test_get_accuracy_data_with_nonexistent_suffix(context):
111+
data = get_accuracy_data(context, data_key_suffix='_nonexistent')
112+
assert data == []
113+
114+
115+
def test_get_accuracy_data_with_wrong_type(context):
116+
with pytest.raises(AssertionError) as exc:
117+
get_accuracy_data(context, data_key_suffix='_wrong')
118+
assert str(exc.value) == 'Expected accuracy_data_wrong must be a list: This is not a list'
119+
120+
121+
def test_store_retry_data_default_suffix(context):
122+
store_retry_data(context, retry=1, data_key_suffix='')
123+
assert context.storage['accuracy_retry_data'] == {'question': 'Q1', 'answer': 'A1'}
124+
assert context.storage['accuracy_retry_index'] == 1
125+
store_retry_data(context, retry=2, data_key_suffix='')
126+
assert context.storage['accuracy_retry_data'] == {'question': 'Q2', 'answer': 'A2'}
127+
assert context.storage['accuracy_retry_index'] == 2
128+
store_retry_data(context, retry=3, data_key_suffix='')
129+
assert context.storage['accuracy_retry_data'] == {'question': 'Q1', 'answer': 'A1'}
130+
assert context.storage['accuracy_retry_index'] == 3
131+
132+
133+
def test_store_retry_data_with_suffix(context):
134+
store_retry_data(context, retry=1, data_key_suffix='_balance')
135+
assert context.storage['accuracy_retry_data'] == {'question': 'Q1 balance', 'answer': 'A1'}
136+
assert context.storage['accuracy_retry_index'] == 1
137+
store_retry_data(context, retry=2, data_key_suffix='_balance')
138+
assert context.storage['accuracy_retry_data'] == {'question': 'Q2 balance', 'answer': 'A2'}
139+
assert context.storage['accuracy_retry_index'] == 2
140+
store_retry_data(context, retry=3, data_key_suffix='_balance')
141+
assert context.storage['accuracy_retry_data'] == {'question': 'Q1 balance', 'answer': 'A1'}
142+
assert context.storage['accuracy_retry_index'] == 3
143+
144+
145+
def test_store_retry_data_with_nonexistent_suffix(context):
146+
store_retry_data(context, retry=1, data_key_suffix='_nonexistent')
147+
assert context.storage['accuracy_retry_data'] is None
148+
assert context.storage['accuracy_retry_index'] == 1

toolium/utils/ai_utils/accuracy.py

Lines changed: 71 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,32 +22,56 @@
2222
from behave.model_core import Status
2323

2424

25-
def get_accuracy_and_retries_from_tags(tags):
25+
def get_accuracy_and_retries_from_tags(tags, accuracy_data_len=None):
2626
"""
2727
Extract accuracy and retries values from accuracy tag using regex.
2828
Examples of valid tags:
29+
2930
- accuracy
3031
- accuracy_90
3132
- accuracy_percent_90
3233
- accuracy_90_10
3334
- accuracy_percent_90_retries_10
3435
3536
:param tags: behave tags
37+
:param accuracy_data_len: length of accuracy data if available
3638
:return: dict with 'accuracy' and 'retries' keys if tag matches, None otherwise
3739
"""
38-
accuracy_regex = re.compile(r'^accuracy(?:_(?:percent_)?(\d+)(?:_retries_(\d+)|_(\d+))?)?', re.IGNORECASE)
40+
# Default values: 90% accuracy, 10 retries
41+
default_accuracy = 0.9
42+
default_retries = accuracy_data_len if accuracy_data_len is not None else 10
43+
accuracy_regex = re.compile(r'^accuracy(?!_data)(?:_(?:percent_)?(\d+)(?:_retries_(\d+)|_(\d+))?)?$', re.IGNORECASE)
3944
for tag in tags:
4045
match = accuracy_regex.search(tag)
4146
if match:
42-
# Default values: 90% accuracy, 10 retries
43-
accuracy_percent = (int(match.group(1)) / 100.0) if match.group(1) else 0.9
47+
accuracy_percent = (int(match.group(1)) / 100.0) if match.group(1) else default_accuracy
4448
# Check if retries is in group 2 (accuracy_percent_90_retries_10) or group 3 (accuracy_90_10)
45-
retries = int(match.group(2)) if match.group(2) else (int(match.group(3)) if match.group(3) else 10)
49+
retries = int(match.group(2)) if match.group(2) else (int(match.group(3)) if match.group(3)
50+
else default_retries)
4651
return {'accuracy': accuracy_percent, 'retries': retries}
4752
return None
4853

4954

50-
def patch_scenario_with_accuracy(context, scenario, accuracy=0.9, retries=10):
55+
def get_accuracy_data_suffix_from_tags(tags):
    """
    Extract accuracy data suffix from behave tags

    Examples of valid tags:

    - accuracy_data
    - accuracy_data_suffix

    The whole tag must match: tags that merely start with ``accuracy_data`` (e.g. ``accuracy_database``) are
    ignored, so they can not hide a real accuracy data tag that appears later in the list.

    :param tags: behave tags
    :return: data_key_suffix (including the leading underscore) of the first matching tag, empty string if the
             matching tag has no suffix or if no tag matches
    """
    accuracy_data_regex = re.compile(r'accuracy_data(?:_(\w+))?', re.IGNORECASE)
    for tag in tags:
        # fullmatch anchors both ends of the tag, consistently with the regex used for the accuracy tag
        match = accuracy_data_regex.fullmatch(tag)
        if match:
            return f'_{match.group(1)}' if match.group(1) else ''
    return ''
72+
73+
74+
def patch_scenario_with_accuracy(context, scenario, data_key_suffix, accuracy=0.9, retries=10):
5175
"""Monkey-patches :func:`~behave.model.Scenario.run()` to execute multiple times and calculate the accuracy of the
5276
results.
5377
@@ -56,20 +80,23 @@ def patch_scenario_with_accuracy(context, scenario, accuracy=0.9, retries=10):
5680
5781
:param context: behave context
5882
:param scenario: Scenario or ScenarioOutline to patch
83+
:param data_key_suffix: Suffix to identify accuracy data in context storage
5984
:param accuracy: Minimum accuracy required to consider the scenario as passed
6085
:param retries: Number of times the scenario will be executed
6186
"""
6287
def scenario_run_with_accuracy(context, scenario_run, scenario, *args, **kwargs):
6388
# Execute the scenario multiple times and count passed executions
6489
passed_executions = 0
6590
for retry in range(1, retries+1):
91+
context.logger.info(f"Running accuracy scenario retry ({retry}/{retries})")
92+
store_retry_data(context, retry, data_key_suffix)
6693
if not scenario_run(*args, **kwargs):
6794
passed_executions += 1
6895
status = "PASSED"
6996
else:
7097
status = "FAILED"
7198
print(f"ACCURACY SCENARIO {status}: retry {retry}/{retries}")
72-
context.logger.info(f"Accuracy scenario {status} (retry {retry}/{retries})")
99+
context.logger.info(f"Accuracy scenario retry {status} ({retry}/{retries})")
73100

74101
# Calculate scenario accuracy
75102
scenario_accuracy = passed_executions / retries
@@ -98,10 +125,13 @@ def patch_scenario_from_tags(context, scenario):
98125
:param context: behave context
99126
:param scenario: behave scenario
100127
"""
101-
accuracy_data = get_accuracy_and_retries_from_tags(scenario.effective_tags)
102-
if accuracy_data:
103-
patch_scenario_with_accuracy(context, scenario, accuracy=accuracy_data['accuracy'],
104-
retries=accuracy_data['retries'])
128+
data_key_suffix = get_accuracy_data_suffix_from_tags(scenario.effective_tags)
129+
accuracy_data = get_accuracy_data(context, data_key_suffix)
130+
accuracy_data_len = len(accuracy_data) if accuracy_data else None
131+
accuracy_settings = get_accuracy_and_retries_from_tags(scenario.effective_tags, accuracy_data_len)
132+
if accuracy_settings:
133+
patch_scenario_with_accuracy(context, scenario, data_key_suffix,
134+
accuracy=accuracy_settings['accuracy'], retries=accuracy_settings['retries'])
105135

106136

107137
def patch_feature_scenarios_with_accuracy(context, feature):
@@ -120,3 +150,33 @@ def patch_feature_scenarios_with_accuracy(context, feature):
120150
except Exception as e:
121151
# Log error but do not fail the execution to avoid errors in before feature method
122152
context.logger.error(f"Error applying accuracy policy: {e}")
153+
154+
155+
def get_accuracy_data(context, data_key_suffix):
    """
    Retrieve accuracy data stored in context.

    :param context: behave context
    :param data_key_suffix: Suffix to identify accuracy data in context storage
    :return: accuracy data list, empty list if the key is not found in context storage
    :raises AssertionError: if the value stored under the accuracy data key is not a list
    """
    accuracy_data_key = f"accuracy_data{data_key_suffix}"
    accuracy_data = context.storage.get(accuracy_data_key, [])
    # Raise explicitly instead of using an assert statement, so the validation is not removed when Python runs
    # with the -O flag
    if not isinstance(accuracy_data, list):
        raise AssertionError(f"Expected {accuracy_data_key} must be a list: {accuracy_data}")
    return accuracy_data
167+
168+
169+
def store_retry_data(context, retry, data_key_suffix):
170+
"""Extract data to be used in current retry execution and store it in accuracy_retry_data key in context.
171+
context.storage['accuracy_data{data_key_suffix}'] is expected to be a list of dicts with data for each retry.
172+
Retry data is selected using modulo to avoid index errors, so retries can be greater than available data.
173+
174+
:param context: behave context
175+
:param retry: current retry index (starting from 1)
176+
:param data_key_suffix: Suffix to identify accuracy data in context storage
177+
"""
178+
accuracy_data = get_accuracy_data(context, data_key_suffix)
179+
retry_data = accuracy_data[(retry - 1) % len(accuracy_data)] if accuracy_data else None
180+
context.storage["accuracy_retry_data"] = retry_data
181+
context.storage["accuracy_retry_index"] = retry
182+
context.logger.info(f"Stored accuracy data for retry {retry} in accuracy_retry_data: {retry_data}")

0 commit comments

Comments
 (0)