Skip to content

Commit edd7afd

Browse files
authored
Parse anomalies (#44)
* add interval parsing * change primitive name * fix lint
1 parent b850ea9 commit edd7afd

File tree

7 files changed

+245
-102
lines changed

7 files changed

+245
-102
lines changed

sigllm/pipelines/prompter/mistral_prompter_0shot.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"sigllm.primitives.transformation.format_as_string",
88

99
"sigllm.primitives.prompting.huggingface.HF",
10-
"sigllm.primitives.transformation.parse_anomaly_response",
10+
"sigllm.primitives.prompting.anomalies.parse_anomaly_response",
1111
"sigllm.primitives.transformation.format_as_integer",
1212
"sigllm.primitives.prompting.anomalies.val2idx",
1313
"sigllm.primitives.prompting.anomalies.find_anomalies_in_windows",

sigllm/pipelines/prompter/mistral_prompter_1shot.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"sigllm.primitives.transformation.format_as_string",
1313

1414
"sigllm.primitives.prompting.huggingface.HF",
15-
"sigllm.primitives.transformation.parse_anomaly_response",
15+
"sigllm.primitives.prompting.anomalies.parse_anomaly_response",
1616
"sigllm.primitives.transformation.format_as_integer",
1717
"sigllm.primitives.prompting.anomalies.val2idx",
1818
"sigllm.primitives.prompting.anomalies.find_anomalies_in_windows",

sigllm/primitives/jsons/sigllm.primitives.transformation.parse_anomaly_response.json renamed to sigllm/primitives/jsons/sigllm.primitives.prompting.anomalies.parse_anomaly_response.json

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
{
2-
"name": "sigllm.primitives.transformation.parse_anomaly_response",
2+
"name": "sigllm.primitives.prompting.anomalies.parse_anomaly_response",
33
"contributors": ["Salim Cherkaoui"],
44
"description": "Parse LLM responses to extract anomaly values from text format.",
55
"classifiers": {
66
"type": "transformer",
77
"subtype": "parser"
88
},
99
"modalities": ["text"],
10-
"primitive": "sigllm.primitives.transformation.parse_anomaly_response",
10+
"primitive": "sigllm.primitives.prompting.anomalies.parse_anomaly_response",
1111
"produce": {
1212
"args": [
1313
{
@@ -21,5 +21,13 @@
2121
"type": "ndarray"
2222
}
2323
]
24+
},
25+
"hyperparameters": {
26+
"fixed": {
27+
"interval": {
28+
"type": "bool",
29+
"default": false
30+
}
31+
}
2432
}
2533
}

sigllm/primitives/prompting/anomalies.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,84 @@
55
This module contains functions that help filter LLMs results to get the final anomalies.
66
"""
77

8+
import ast
9+
import re
10+
811
import numpy as np
912

13+
# Matches one bracketed group that contains only digits, whitespace and commas,
# e.g. "[1, 2, 3]". Group 1 is the text between the brackets.
PATTERN = r'\[([\d\s,]+)\]'


def _clean_response(text):
    """Normalize a raw LLM response before it is pattern matched.

    The text is stripped, lower-cased and runs of consecutive commas are
    collapsed into a single comma.

    Args:
        text (str):
            Raw response text.

    Returns:
        str:
            The normalized text, or an empty string if the response contains
            "no anomalies" or "no anomaly".
    """
    text = text.strip().lower()
    text = re.sub(r',+', ',', text)

    if 'no anomalies' in text or 'no anomaly' in text:
        return ''

    return text


def _parse_list_response(text):
    """Extract the first bracketed list of values from a response.

    Args:
        text (str):
            Raw response text, e.g. "Answer: [val1, val2, ..., valN]".

    Returns:
        str:
            The values of the first bracketed group joined as
            "val1,val2,...,valN", or an empty string if no group is found.
    """
    clean = _clean_response(text)

    # only the first bracketed group is used; later groups are ignored
    match = re.search(PATTERN, clean)
    if match is None:
        return ''

    # drop empty tokens left behind by trailing or repeated commas
    values = [val.strip() for val in match.group(1).split(',') if val.strip()]
    return ','.join(values)


def _parse_interval_response(text):
    """Expand every ``[start, end]`` pair found in a response into its values.

    Each bracketed group that holds exactly two integers is treated as an
    inclusive interval and expanded with ``range(start, end + 1)``. Groups
    with any other number of values, or with tokens that are not plain
    integers (e.g. "1 2"), are skipped instead of raising. An interval whose
    start is greater than its end contributes no values.

    Args:
        text (str):
            Raw response text, e.g. "Answer: [[s1, e1], [s2, e2]]".

    Returns:
        List[int]:
            Values covered by all valid intervals, in order of appearance.
            Duplicates are kept. Empty if no valid interval is found.
    """
    clean = _clean_response(text)

    values = []
    for match in re.finditer(PATTERN, clean):
        # tokenize by hand rather than evaluating the text as a Python literal:
        # PATTERN also accepts groups such as "[1 2, 3]" or "[01, 03]" that are
        # not valid literals and would abort the whole parse.
        tokens = [tok for tok in match.group(1).split(',') if tok.strip()]
        if len(tokens) != 2:
            continue

        try:
            start, end = (int(tok) for tok in tokens)
        except ValueError:
            # token with inner whitespace such as "1 2"; skip this group
            continue

        values.extend(range(start, end + 1))

    return values


def parse_anomaly_response(X, interval=False):
    """Parse a list of lists of LLM responses to extract anomaly values.

    Args:
        X (List[List[str]]):
            List of lists of response texts from the LLM in the format
            "Answer: no anomalies" or "Answer: [val1, val2, ..., valN]."
            values must be within brackets.
        interval (bool):
            Whether to parse the response as a list "Answer: [val1, val2, ..., valN]."
            or list of intervals "Answer: [[s1, e1], [s2, e2], ..., [sn, en]]."
            Defaults to ``False``.

    Returns:
        List[List[str]] or List[List[List[int]]]:
            List of lists of parsed responses. If ``interval`` is ``False``,
            each element is either "val1,val2,...,valN" if anomalies are found,
            or empty string if no anomalies are present. If ``interval`` is
            ``True``, each element is the list of integer values covered by
            the intervals found, or an empty list if there are none.
    """
    method = _parse_interval_response if interval else _parse_list_response

    return [[method(response) for response in response_list] for response_list in X]
85+
1086

1187
def val2idx(y, X):
1288
"""Convert detected anomalies values into indices.

sigllm/primitives/transformation.py

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -184,44 +184,3 @@ def transform(self, X, minimum=0, decimal=2):
184184
values = X * 10 ** (-decimal)
185185

186186
return values + minimum
187-
188-
189-
def parse_anomaly_response(X):
190-
"""Parse a list of lists of LLM responses to extract anomaly values and format them as strings.
191-
192-
Args:
193-
X (List[List[str]]):
194-
List of lists of response texts from the LLM in the format
195-
"Answer: no anomalies" or "Answer: [val1, val2, ..., valN]."
196-
values must be within brackets.
197-
198-
Returns:
199-
List[List[str]]:
200-
List of lists of parsed responses where each element is either
201-
"val1,val2,...,valN" if anomalies are found, or empty string if
202-
no anomalies are present.
203-
"""
204-
205-
def _parse_single_response(text):
206-
text = text.strip().lower()
207-
208-
if 'no anomalies' in text or 'no anomaly' in text:
209-
return ''
210-
211-
# match anything that consists of digits and commas
212-
pattern = r'\[([\d\s,]+)\]'
213-
match = re.search(pattern, text)
214-
215-
if match:
216-
values = match.group(1)
217-
values = [val.strip() for val in values.split(',') if val.strip()]
218-
return ','.join(values)
219-
220-
return ''
221-
222-
result = []
223-
for response_list in X:
224-
parsed_list = [_parse_single_response(response) for response in response_list]
225-
result.append(parsed_list)
226-
227-
return result

tests/primitives/prompting/test_anomalies.py

Lines changed: 157 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
# -*- coding: utf-8 -*-
2+
import unittest
23

34
import numpy as np
45
from pytest import fixture
56

67
from sigllm.primitives.prompting.anomalies import (
8+
_clean_response,
9+
_parse_interval_response,
10+
_parse_list_response,
711
find_anomalies_in_windows,
812
format_anomalies,
913
merge_anomalous_sequences,
14+
parse_anomaly_response,
1015
val2idx,
1116
)
1217

@@ -100,10 +105,160 @@ def test_val2idx(anomalous_val, windows):
100105

101106

102107
# timestamp2interval
103-
104-
105108
def test_format_anomalies(idx_list, timestamp):
106109
expected = [(1000, 1820, 0), (5950, 6950, 0), (7390, 8390, 0), (11530, 12840, 0)]
107110
result = format_anomalies(idx_list, timestamp)
108111

109112
assert expected == result
113+
114+
115+
def test_clean_response_no_anomalies():
116+
test_cases = [
117+
'no anomalies',
118+
'NO ANOMALIES',
119+
' no anomalies ',
120+
'There are no anomalies in this data',
121+
'No anomaly detected',
122+
' No anomaly ',
123+
]
124+
for text in test_cases:
125+
assert _clean_response(text) == ''
126+
127+
128+
def test_clean_response_with_anomalies():
129+
test_cases = [
130+
('[1, 2, 3]', '[1, 2, 3]'),
131+
(' [1, 2, 3] ', '[1, 2, 3]'),
132+
('Anomalies found at [1, 2, 3]', 'anomalies found at [1, 2, 3]'),
133+
('ANOMALIES AT [1, 2, 3]', 'anomalies at [1, 2, 3]'),
134+
]
135+
for input_text, expected in test_cases:
136+
assert _clean_response(input_text) == expected
137+
138+
139+
def test_parse_list_response_valid_cases():
140+
test_cases = [
141+
('[1, 2, 3]', '1,2,3'),
142+
(' [1, 2, 3] ', '1,2,3'),
143+
('Anomalies found at [1, 2, 3]', '1,2,3'),
144+
('[1,2,3]', '1,2,3'),
145+
('[1, 2, 3, 4, 5]', '1,2,3,4,5'),
146+
]
147+
for input_text, expected in test_cases:
148+
assert _parse_list_response(input_text) == expected
149+
150+
151+
def test_parse_list_response_invalid_cases():
152+
test_cases = [
153+
'no anomalies',
154+
'[]',
155+
'[ ]',
156+
'text with [no numbers]',
157+
'text with [letters, and, symbols]',
158+
' ',
159+
]
160+
for text in test_cases:
161+
assert _parse_list_response(text) == ''
162+
163+
164+
def test_parse_list_response_edge_cases():
165+
test_cases = [
166+
('[1,2,3,]', '1,2,3'), # trailing comma
167+
('[1,,2,3]', '1,2,3'), # double comma
168+
('[1, 2, 3], [5]', '1,2,3'), # two lists
169+
]
170+
for input_text, expected in test_cases:
171+
assert _parse_list_response(input_text) == expected
172+
173+
174+
def test_parse_interval_response_valid_cases():
175+
test_cases = [
176+
('[[1, 3]]', [1, 2, 3]),
177+
(' [[1, 3]] ', [1, 2, 3]),
178+
('Anomalies found at [[1, 3]]', [1, 2, 3]),
179+
('[[1, 3], [5, 7]]', [1, 2, 3, 5, 6, 7]),
180+
('[[1, 3], [5, 7], [8, 9]]', [1, 2, 3, 5, 6, 7, 8, 9]),
181+
('[[1, 3], [4, 6],]', [1, 2, 3, 4, 5, 6]),
182+
('[[1, 2], [3]]', [1, 2]),
183+
('[[1,,3]]', [1, 2, 3]),
184+
('[[0, 10]]', list(range(11))),
185+
]
186+
for input_text, expected in test_cases:
187+
assert _parse_interval_response(input_text) == expected
188+
189+
190+
def test_parse_interval_response_invalid_cases():
191+
test_cases = [
192+
'[]',
193+
'[[]]',
194+
'text with [no numbers]',
195+
'[[1]]', # single number instead of pair
196+
'[[1, 2, 3]]', # triple instead of pair
197+
]
198+
for text in test_cases:
199+
assert _parse_interval_response(text) == []
200+
201+
202+
def test_parse_interval_response_multiple_matches():
203+
test_cases = [
204+
('Found [[1, 3]] and [[5, 7]]', [1, 2, 3, 5, 6, 7]),
205+
('[[1, 2]] in first part and [[3, 4]] in second', [1, 2, 3, 4]),
206+
('Multiple intervals: [[1, 3]], [[4, 6]], [[7, 9]]', [1, 2, 3, 4, 5, 6, 7, 8, 9]),
207+
('[[1, 2]] and [[1, 2]] and [[1, 2]]', [1, 2, 1, 2, 1, 2]),
208+
]
209+
for input_text, expected in test_cases:
210+
assert _parse_interval_response(input_text) == expected
211+
212+
213+
class ParseAnomalyResponseTest(unittest.TestCase):
214+
def test_no_anomalies(self):
215+
data = [['Answer: no anomalies'], ['Answer: no anomaly'], ['no anomaly, with extra']]
216+
expected = [[''], [''], ['']]
217+
218+
output = parse_anomaly_response(data)
219+
self.assertEqual(output, expected)
220+
221+
def test_single_anomaly(self):
222+
data = [['Answer: [123]'], ['Answer: [456]', 'answer: [789]']]
223+
expected = [['123'], ['456', '789']]
224+
225+
output = parse_anomaly_response(data)
226+
self.assertEqual(output, expected)
227+
228+
def test_multiple_anomalies(self):
229+
data = [['Answer: [123, 456, 789]'], ['Answer: [111, 222, 333]']]
230+
expected = [['123,456,789'], ['111,222,333']]
231+
232+
output = parse_anomaly_response(data)
233+
self.assertEqual(output, expected)
234+
235+
def test_mixed_responses(self):
236+
data = [['Answer: no anomalies', 'Answer: [123, 456]'], ['Answer: [789]', 'no anomaly']]
237+
expected = [['', '123,456'], ['789', '']]
238+
239+
output = parse_anomaly_response(data)
240+
self.assertEqual(output, expected)
241+
242+
def test_different_formats(self):
243+
data = [
244+
['Answer: [123, 456]', 'Answer: [ 789 , 101 ]'],
245+
['Answer: [1,2,3]', 'Answer: [ 4 , 5 , 6 ]'],
246+
]
247+
expected = [['123,456', '789,101'], ['1,2,3', '4,5,6']]
248+
249+
output = parse_anomaly_response(data)
250+
self.assertEqual(output, expected)
251+
252+
def test_empty_responses(self):
253+
data = [[''], ['Answer: no anomalies'], ['answer'], ['no anomly']]
254+
expected = [[''], [''], [''], ['']]
255+
256+
output = parse_anomaly_response(data)
257+
self.assertEqual(output, expected)
258+
259+
def test_invalid_format(self):
260+
data = [['Answer: invalid format'], ['Answer: [123, abc]']]
261+
expected = [[''], ['']]
262+
263+
output = parse_anomaly_response(data)
264+
self.assertEqual(output, expected)

0 commit comments

Comments
 (0)