Skip to content

Commit cfa5d27

Browse files
feat: adding RegexTextExtractor component from experimental (#9879)
* initial import of component * adding release notes * adding docs to docusaurus
1 parent fe60c76 commit cfa5d27

File tree

6 files changed

+331
-6
lines changed

6 files changed

+331
-6
lines changed

docs/pydoc/config/extractors_api.yml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
loaders:
22
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
33
search_path: [../../../haystack/components/extractors]
4-
modules: ["named_entity_extractor", "llm_metadata_extractor", "image/llm_document_content_extractor"]
4+
modules: [
5+
"named_entity_extractor",
6+
"llm_metadata_extractor",
7+
"image/llm_document_content_extractor",
8+
"regex_text_extractor",
9+
]
510
ignore_when_discovered: ["__init__"]
611
processors:
712
- type: filter
@@ -15,7 +20,7 @@ processors:
1520
- type: crossref
1621
renderer:
1722
type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
18-
excerpt: Extracts predefined entities out of a piece of text.
23+
excerpt: Components to extract specific elements from textual data.
1924
category_slug: haystack-api
2025
title: Extractors
2126
slug: extractors-api

docs/pydoc/config_docusaurus/extractors_api.yml

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
loaders:
22
- type: haystack_pydoc_tools.loaders.CustomPythonLoader
33
search_path: [../../../haystack/components/extractors]
4-
modules: ["named_entity_extractor", "llm_metadata_extractor", "image/llm_document_content_extractor"]
4+
modules: [
5+
"named_entity_extractor",
6+
"llm_metadata_extractor",
7+
"image/llm_document_content_extractor",
8+
"regex_text_extractor",
9+
]
510
ignore_when_discovered: ["__init__"]
611
processors:
712
- type: filter
@@ -14,10 +19,12 @@ processors:
1419
- type: smart
1520
- type: crossref
1621
renderer:
17-
type: haystack_pydoc_tools.renderers.DocusaurusRenderer
18-
description: Extracts predefined entities out of a piece of text.
22+
type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
23+
excerpt: Components to extract specific elements from textual data.
24+
category_slug: haystack-api
1925
title: Extractors
20-
id: extractors-api
26+
slug: extractors-api
27+
order: 65
2128
markdown:
2229
descriptive_class_title: false
2330
classdef_code_block: false

haystack/components/extractors/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
_import_structure = {
1111
"llm_metadata_extractor": ["LLMMetadataExtractor"],
12+
"regex_text_extractor": ["RegexTextExtractor"],
1213
"named_entity_extractor": ["NamedEntityAnnotation", "NamedEntityExtractor", "NamedEntityExtractorBackend"],
1314
}
1415

@@ -17,6 +18,7 @@
1718
from .named_entity_extractor import NamedEntityAnnotation as NamedEntityAnnotation
1819
from .named_entity_extractor import NamedEntityExtractor as NamedEntityExtractor
1920
from .named_entity_extractor import NamedEntityExtractorBackend as NamedEntityExtractorBackend
21+
# Explicit `as` re-export, matching the sibling imports, so type checkers treat the name as public API.
from .regex_text_extractor import RegexTextExtractor as RegexTextExtractor
2022

2123
else:
2224
sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import re
6+
from typing import Union
7+
8+
from haystack import component, logging
9+
from haystack.dataclasses import ChatMessage
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
@component
class RegexTextExtractor:
    """
    Extracts text from chat message or string input using a regex pattern.

    RegexTextExtractor searches a string, or the last ChatMessage of a list of ChatMessages, with the provided
    regular expression pattern. Only the first match is considered. If the pattern contains capture groups,
    the text captured by the first group is returned; otherwise the entire match is returned.

    ### Usage example

    ```python
    from haystack.components.extractors import RegexTextExtractor
    from haystack.dataclasses import ChatMessage

    # Using with a string
    parser = RegexTextExtractor(regex_pattern='<issue url="(.+?)">')
    result = parser.run(text_or_messages='<issue url="github.com/hahahaha">hahahah</issue>')
    # result: {"captured_text": "github.com/hahahaha"}

    # Using with ChatMessages (only the last message is searched)
    messages = [ChatMessage.from_user('<issue url="github.com/hahahaha">hahahah</issue>')]
    result = parser.run(text_or_messages=messages)
    # result: {"captured_text": "github.com/hahahaha"}
    ```
    """

    def __init__(self, regex_pattern: str):
        """
        Creates an instance of the RegexTextExtractor component.

        :param regex_pattern:
            The regular expression pattern used to extract text.
            The pattern should include a capture group to extract the desired text.
            Example: '<issue url="(.+?)">' captures 'github.com/hahahaha' from '<issue url="github.com/hahahaha">'.

        :raises re.error:
            If `regex_pattern` is not a valid regular expression.
        """
        self.regex_pattern = regex_pattern

        # Compiling here both validates the pattern early and tells us how many capture groups it has.
        num_groups = re.compile(regex_pattern).groups
        if num_groups < 1:
            logger.warning(
                "The provided regex pattern {regex_pattern} doesn't contain any capture groups. "
                "The entire match will be returned instead.",
                regex_pattern=regex_pattern,
            )

    # NOTE: `captured_texts` is kept as a declared output for backward compatibility; `run` currently only
    # ever produces `captured_text`, because `_extract_from_text` returns a single string.
    @component.output_types(captured_text=str, captured_texts=list[str])
    def run(self, text_or_messages: Union[str, list[ChatMessage]]) -> dict:
        """
        Extracts text from input using the configured regex pattern.

        :param text_or_messages:
            Either a string or a list of ChatMessage objects. For a list, only the last message is searched.

        :returns:
            - If a match is found: {"captured_text": "matched text"}
            - If nothing is captured (no match, empty captured text, empty message list,
              or the last message has no text): {}

        :raises ValueError:
            If a list is received and its last element is not a ChatMessage instance.
        """
        if isinstance(text_or_messages, str):
            return RegexTextExtractor._build_result(self._extract_from_text(text_or_messages))
        if not text_or_messages:
            logger.warning("Received empty list of messages")
            return {}
        return self._process_last_message(text_or_messages)

    @staticmethod
    def _build_result(result: Union[str, list[str]]) -> dict:
        """
        Build the output dictionary for `run` from an extraction result.

        :param result:
            The extracted string, or a list of extracted strings.

        :returns:
            An empty dictionary for an empty string or empty list. Otherwise the result is stored under
            `captured_text` (string) or `captured_texts` (list), matching the declared output types.
        """
        if not result:
            return {}
        if isinstance(result, list):
            # A list must not be emitted under `captured_text`, which is declared as `str`.
            return {"captured_texts": result}
        return {"captured_text": result}

    def _process_last_message(self, messages: list[ChatMessage]) -> dict:
        """
        Run the extraction on the last message of a non-empty list and build the result.

        :param messages:
            Non-empty list of messages; only the last element is inspected.

        :returns:
            The output dictionary, or an empty dictionary if the last message has no text.

        :raises ValueError:
            If the last element is not a ChatMessage instance.
        """
        last_message = messages[-1]
        if not isinstance(last_message, ChatMessage):
            raise ValueError(f"Expected ChatMessage object, got {type(last_message)}")
        if last_message.text is None:
            logger.warning("Last message has no text content")
            return {}
        result = self._extract_from_text(last_message.text)
        return RegexTextExtractor._build_result(result)

    def _extract_from_text(self, text: str) -> Union[str, list[str]]:
        """
        Extract text using the regex pattern.

        :param text:
            The text to search through.

        :returns:
            The text captured by the first capturing group of the first match.
            If the pattern has no capture groups, returns the entire match.
            If no match is found, or the first group did not take part in the match, returns an empty string.
        """
        match = re.search(self.regex_pattern, text)
        if not match:
            return ""
        if match.groups():
            # `group(1)` is None when the first group is optional and did not participate in the match
            # (e.g. pattern `(a)?b` on "b"); normalise to "" so a non-string value is never emitted.
            return match.group(1) or ""
        return match.group(0)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
features:
3+
- |
4+
Added a new component, `RegexTextExtractor`, which extracts text from chat messages or string input using a custom regex pattern.
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import pytest
6+
7+
from haystack import Pipeline
8+
from haystack.components.extractors.regex_text_extractor import RegexTextExtractor
9+
from haystack.dataclasses import ChatMessage
10+
11+
12+
class TestRegexTextExtractor:
    """Unit and pipeline-level tests for the RegexTextExtractor component."""

    def test_init_with_capture_group(self):
        """The pattern is stored unchanged when it contains a capture group."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        assert extractor.regex_pattern == pattern

    def test_init_without_capture_group(self):
        """A pattern without capture groups is still accepted and stored unchanged."""
        pattern = r"<issue>"
        extractor = RegexTextExtractor(regex_pattern=pattern)
        assert extractor.regex_pattern == pattern

    def test_extract_from_string_with_capture_group(self):
        """A string input yields the text of the first capture group."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        text = '<issue url="github.com/hahahaha">hahahah</issue>'
        result = extractor.run(text_or_messages=text)
        assert result == {"captured_text": "github.com/hahahaha"}

    def test_extract_from_string_without_capture_group(self):
        """Without capture groups the entire match is returned."""
        pattern = r"<issue>"
        extractor = RegexTextExtractor(regex_pattern=pattern)
        text = "This is an <issue> tag in the text"
        result = extractor.run(text_or_messages=text)
        assert result == {"captured_text": "<issue>"}

    def test_extract_from_string_no_match(self):
        """No match produces an empty output dictionary."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        text = "This text has no matching pattern"
        result = extractor.run(text_or_messages=text)
        assert result == {}

    def test_extract_from_string_empty_input(self):
        """An empty string produces an empty output dictionary."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        text = ""
        result = extractor.run(text_or_messages=text)
        assert result == {}

    def test_extract_from_chat_messages_single_message(self):
        """A single ChatMessage is searched like a plain string."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        messages = [ChatMessage.from_user('<issue url="github.com/test">test issue</issue>')]
        result = extractor.run(text_or_messages=messages)
        assert result == {"captured_text": "github.com/test"}

    def test_extract_from_chat_messages_multiple_messages(self):
        """Only the last message of the list is searched."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        messages = [
            ChatMessage.from_user('First message with <issue url="first.com">first</issue>'),
            ChatMessage.from_user('Second message with <issue url="second.com">second</issue>'),
            ChatMessage.from_user('Last message with <issue url="last.com">last</issue>'),
        ]
        result = extractor.run(text_or_messages=messages)
        assert result == {"captured_text": "last.com"}

    def test_extract_from_chat_messages_no_match_in_last(self):
        """Matches in earlier messages are ignored when the last message has none."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        messages = [
            ChatMessage.from_user('First message with <issue url="first.com">first</issue>'),
            ChatMessage.from_user("Last message with no matching pattern"),
        ]
        result = extractor.run(text_or_messages=messages)
        assert result == {}

    def test_extract_from_chat_messages_empty_list(self):
        """An empty message list produces an empty output dictionary."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        messages = []
        result = extractor.run(text_or_messages=messages)
        assert result == {}

    def test_extract_from_chat_messages_invalid_type(self):
        """A list whose last element is not a ChatMessage raises ValueError."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)
        messages = ["not a ChatMessage object"]
        with pytest.raises(ValueError, match="Expected ChatMessage object, got <class 'str'>"):
            extractor.run(text_or_messages=messages)

    def test_multiple_capture_groups(self):
        """With several capture groups only the first one is returned."""
        pattern = r"(\w+)@(\w+)\.(\w+)"
        extractor = RegexTextExtractor(regex_pattern=pattern)
        # The address must contain a literal "@" and "." for the pattern to match.
        text = "Contact us at user@example.com for support"
        result = extractor.run(text_or_messages=text)
        # return the first capture group (username)
        assert result == {"captured_text": "user"}

    def test_special_characters_in_pattern(self):
        """Test regex pattern with special characters."""
        pattern = r"\[(\w+)\]"
        extractor = RegexTextExtractor(regex_pattern=pattern)

        text = "This has [special] characters [in] brackets"
        result = extractor.run(text_or_messages=text)

        assert result == {"captured_text": "special"}

    def test_whitespace_handling(self):
        """Test regex pattern with whitespace handling."""
        pattern = r"\s+(\w+)\s+"
        extractor = RegexTextExtractor(regex_pattern=pattern)

        text = "word1 word2 word3"
        result = extractor.run(text_or_messages=text)

        assert result == {"captured_text": "word2"}

    def test_nested_capture_groups(self):
        """Test regex with two sequential capture groups (tag name and attribute value)."""
        pattern = r'<(\w+)\s+attr="([^"]+)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)

        text = '<div attr="value">content</div>'
        result = extractor.run(text_or_messages=text)

        # Should return the first capture group (tag name)
        assert result == {"captured_text": "div"}

    def test_optional_capture_group(self):
        """Test regex with optional capture group."""
        pattern = r"(\w+)(?:@(\w+))?"
        extractor = RegexTextExtractor(regex_pattern=pattern)

        text = "username@domain"
        result = extractor.run(text_or_messages=text)

        assert result == {"captured_text": "username"}

    def test_optional_capture_group_no_match(self):
        """Test regex with optional capture group when optional part is missing."""
        pattern = r"(\w+)(?:@(\w+))?"
        extractor = RegexTextExtractor(regex_pattern=pattern)

        text = "username"
        result = extractor.run(text_or_messages=text)

        assert result == {"captured_text": "username"}

    def test_pipeline_integration(self):
        """Test component integration in a Haystack pipeline."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)

        pipe = Pipeline()
        pipe.add_component("extractor", extractor)

        text = '<issue url="github.com/pipeline-test">pipeline test</issue>'
        result = pipe.run(data={"extractor": {"text_or_messages": text}})

        assert result["extractor"] == {"captured_text": "github.com/pipeline-test"}

    def test_pipeline_integration_with_chat_messages(self):
        """Test component integration in pipeline with ChatMessages."""
        pattern = r'<issue url="(.+?)">'
        extractor = RegexTextExtractor(regex_pattern=pattern)

        pipe = Pipeline()
        pipe.add_component("extractor", extractor)

        messages = [ChatMessage.from_user('<issue url="github.com/chat-test">chat test</issue>')]
        result = pipe.run(data={"extractor": {"text_or_messages": messages}})

        assert result["extractor"] == {"captured_text": "github.com/chat-test"}

    def test_very_long_text(self):
        """A match buried in a long input is still found."""
        pattern = r"(\d+)"
        extractor = RegexTextExtractor(regex_pattern=pattern)
        long_text = "a" * 10000 + "123" + "b" * 10000
        result = extractor.run(text_or_messages=long_text)
        assert result == {"captured_text": "123"}

    def test_multiple_matches_first_is_captured(self):
        """When the pattern matches several times, only the first match is returned."""
        pattern = r"(\d+)"
        extractor = RegexTextExtractor(regex_pattern=pattern)
        text = "First: 123, Second: 456, Third: 789"
        result = extractor.run(text_or_messages=text)
        assert result == {"captured_text": "123"}

0 commit comments

Comments
 (0)