Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 31 additions & 13 deletions api/core/rag/cleaner/clean_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,44 @@ def clean(cls, text: str, process_rule: dict) -> str:
pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
text = re.sub(pattern, "", text)

# Remove URL but keep Markdown image URLs
# First, temporarily replace Markdown image URLs with a placeholder
markdown_image_pattern = r"!\[.*?\]\((https?://[^\s)]+)\)"
placeholders: list[str] = []
# Remove URL but keep Markdown image URLs and link URLs
# Replace the ENTIRE markdown link/image with a single placeholder to protect
# the link text (which might also be a URL) from being removed
markdown_link_pattern = r"\[([^\]]*)\]\((https?://[^)]+)\)"
markdown_image_pattern = r"!\[.*?\]\((https?://[^)]+)\)"
placeholders: list[tuple[str, str, str]] = [] # (type, text, url)

def replace_with_placeholder(match, placeholders=placeholders):
def replace_markdown_with_placeholder(match, placeholders=placeholders):
link_type = "link"
link_text = match.group(1)
url = match.group(2)
placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__"
placeholders.append((link_type, link_text, url))
return placeholder

def replace_image_with_placeholder(match, placeholders=placeholders):
link_type = "image"
url = match.group(1)
placeholder = f"__MARKDOWN_IMAGE_URL_{len(placeholders)}__"
placeholders.append(url)
return f"![image]({placeholder})"
placeholder = f"__MARKDOWN_PLACEHOLDER_{len(placeholders)}__"
placeholders.append((link_type, "image", url))
return placeholder

text = re.sub(markdown_image_pattern, replace_with_placeholder, text)
# Protect markdown links first
text = re.sub(markdown_link_pattern, replace_markdown_with_placeholder, text)
# Then protect markdown images
text = re.sub(markdown_image_pattern, replace_image_with_placeholder, text)

# Now remove all remaining URLs
url_pattern = r"https?://[^\s)]+"
url_pattern = r"https?://\S+"
text = re.sub(url_pattern, "", text)

# Finally, restore the Markdown image URLs
for i, url in enumerate(placeholders):
text = text.replace(f"__MARKDOWN_IMAGE_URL_{i}__", url)
# Restore the Markdown links and images
for i, (link_type, text_or_alt, url) in enumerate(placeholders):
placeholder = f"__MARKDOWN_PLACEHOLDER_{i}__"
if link_type == "link":
text = text.replace(placeholder, f"[{text_or_alt}]({url})")
else: # image
text = text.replace(placeholder, f"![{text_or_alt}]({url})")
return text

def filter_string(self, text):
Expand Down
6 changes: 6 additions & 0 deletions api/core/tools/utils/text_processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,19 @@
def remove_leading_symbols(text: str) -> str:
"""
Remove leading punctuation or symbols from the given text.
Preserves markdown links like [text](url) at the start.

Args:
text (str): The input text to process.

Returns:
str: The text with leading punctuation or symbols removed.
"""
# Check if text starts with a markdown link - preserve it
markdown_link_pattern = r"^\[([^\]]+)\]\((https?://[^)]+)\)"
if re.match(markdown_link_pattern, text):
return text

# Match Unicode ranges for punctuation and symbols
# FIXME this pattern is confused quick fix for #11868 maybe refactor it later
pattern = r'^[\[\]\u2000-\u2025\u2027-\u206F\u2E00-\u2E7F\u3000-\u300F\u3011-\u303F"#$%&\'()*+,./:;<=>?@^_`~]+'
Expand Down
Empty file.
213 changes: 213 additions & 0 deletions api/tests/unit_tests/core/rag/cleaner/test_clean_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
from core.rag.cleaner.clean_processor import CleanProcessor


class TestCleanProcessor:
    """Unit tests for CleanProcessor.clean covering default cleaning and pre-processing rules."""

    @staticmethod
    def _rules(*pairs):
        """Build a process_rule dict from (rule_id, enabled) pairs."""
        return {"rules": {"pre_processing_rules": [{"id": rid, "enabled": on} for rid, on in pairs]}}

    def test_clean_default_removal_of_invalid_symbols(self):
        """Default cleaning strips invalid symbols even without a process rule."""
        cases = [
            # <| collapses to <
            ("text<|with<|invalid", "text<with<invalid"),
            # |> collapses to >
            ("text|>with|>invalid", "text>with>invalid"),
            # control characters are dropped entirely
            ("normal\x00text\x1fwith\x07control\x7fchars", "normaltextwithcontrolchars"),
            # U+FFFE is dropped as well
            ("normal\ufffepadding", "normalpadding"),
        ]
        for raw, cleaned in cases:
            assert CleanProcessor.clean(raw, None) == cleaned

    def test_clean_with_none_process_rule(self):
        """Passing None as process_rule applies only the default cleaning."""
        assert CleanProcessor.clean("Hello<|World\x00", None) == "Hello<World"

    def test_clean_with_empty_process_rule(self):
        """An empty process_rule dict triggers only the default cleaning."""
        assert CleanProcessor.clean("Hello<|World\x00", {}) == "Hello<World"

    def test_clean_with_empty_rules(self):
        """An empty 'rules' entry triggers only the default cleaning."""
        assert CleanProcessor.clean("Hello<|World\x00", {"rules": {}}) == "Hello<World"

    def test_clean_remove_extra_spaces_enabled(self):
        """remove_extra_spaces collapses runs of newlines and whitespace."""
        rule = self._rules(("remove_extra_spaces", True))
        cases = [
            # three or more newlines shrink to exactly two
            ("Line1\n\n\n\n\nLine2", "Line1\n\nLine2"),
            # runs of assorted whitespace shrink to a single space
            ("word1\u2000\u2001\t\t \u3000word2", "word1 word2"),
            # newline collapsing and space collapsing compose
            ("Line1\n\n\n\n \t Line2", "Line1\n\n Line2"),
        ]
        for raw, cleaned in cases:
            assert CleanProcessor.clean(raw, rule) == cleaned

    def test_clean_remove_extra_spaces_disabled(self):
        """A disabled remove_extra_spaces rule leaves whitespace untouched."""
        rule = self._rules(("remove_extra_spaces", False))
        raw = "Line1\n\n\n\n\nLine2 with spaces"
        # default cleaning finds nothing to strip here, so text passes through
        assert CleanProcessor.clean(raw, rule) == raw

    def test_clean_remove_urls_emails_enabled(self):
        """remove_urls_emails strips bare e-mail addresses and URLs."""
        rule = self._rules(("remove_urls_emails", True))
        cases = [
            ("Contact us at test@example.com for more info", "Contact us at for more info"),
            ("Visit https://example.com or http://test.org", "Visit or "),
            ("Email me@test.com and visit https://site.com", "Email and visit "),
        ]
        for raw, cleaned in cases:
            assert CleanProcessor.clean(raw, rule) == cleaned

    def test_clean_preserve_markdown_links_and_images(self):
        """Markdown links and images survive URL removal; bare URLs do not."""
        rule = self._rules(("remove_urls_emails", True))
        untouched = [
            "Check [Google](https://google.com) for info",
            "Image: ![alt](https://example.com/image.png)",
            "[Link](https://link.com) and ![Image](https://image.com/img.jpg)",
        ]
        for raw in untouched:
            assert CleanProcessor.clean(raw, rule) == raw
        # a plain URL sitting next to a markdown link is still removed
        assert (
            CleanProcessor.clean("Check [Link](https://keep.com) but remove https://remove.com", rule)
            == "Check [Link](https://keep.com) but remove "
        )
        # e-mail removal still applies alongside markdown preservation
        assert (
            CleanProcessor.clean("Email: test@test.com, link: [Click](https://site.com)", rule)
            == "Email: , link: [Click](https://site.com)"
        )

    def test_clean_remove_urls_emails_disabled(self):
        """A disabled remove_urls_emails rule keeps URLs and e-mails intact."""
        rule = self._rules(("remove_urls_emails", False))
        raw = "Email test@example.com visit https://example.com"
        # only default cleaning runs; nothing here matches it
        assert CleanProcessor.clean(raw, rule) == raw

    def test_clean_both_rules_enabled(self):
        """Both pre-processing rules can be applied together in one call."""
        rule = self._rules(("remove_extra_spaces", True), ("remove_urls_emails", True))
        raw = "Hello\n\n\n\n World test@example.com \n\n\nhttps://example.com"
        assert CleanProcessor.clean(raw, rule) == "Hello\n\n World \n\n"

    def test_clean_with_markdown_link_and_extra_spaces(self):
        """Markdown links are kept even when whitespace collapsing runs too."""
        rule = self._rules(("remove_extra_spaces", True), ("remove_urls_emails", True))
        raw = "[Link](https://example.com)\n\n\n\n Text https://remove.com"
        assert CleanProcessor.clean(raw, rule) == "[Link](https://example.com)\n\n Text "

    def test_clean_unknown_rule_id_ignored(self):
        """Rule ids the processor does not recognize are ignored silently."""
        rule = self._rules(("unknown_rule", True))
        # only the default cleaning should be applied
        assert CleanProcessor.clean("Hello<|World\x00", rule) == "Hello<World"

    def test_clean_empty_text(self):
        """Empty input stays empty for every process_rule shape."""
        for process_rule in (None, {}, {"rules": {}}):
            assert CleanProcessor.clean("", process_rule) == ""

    def test_clean_text_with_only_invalid_symbols(self):
        """Input made entirely of invalid symbols reduces to the delimiters."""
        # <| becomes <, |> becomes >, control chars and U+FFFE are removed
        assert CleanProcessor.clean("<|<|\x00\x01\x02\ufffe|>|>", None) == "<<>>"

    def test_clean_multiple_markdown_links_preserved(self):
        """Every markdown link in the text is preserved, not just the first."""
        rule = self._rules(("remove_urls_emails", True))
        raw = "[One](https://one.com) [Two](http://two.org) [Three](https://three.net)"
        assert CleanProcessor.clean(raw, rule) == raw

    def test_clean_markdown_link_text_as_url(self):
        """A URL used as markdown link text is protected; bare URLs are not."""
        rule = self._rules(("remove_urls_emails", True))
        protected = "[https://text-url.com](https://actual-url.com)"
        assert CleanProcessor.clean(protected, rule) == protected
        # the same URLs outside markdown syntax are both stripped
        assert CleanProcessor.clean("https://text-url.com https://actual-url.com", rule) == " "

    def test_clean_complex_markdown_link_content(self):
        """Brackets inside link text defeat the regex — a known limitation."""
        rule = self._rules(("remove_urls_emails", True))
        # The [^\]]* pattern stops at the first ], so the URL loses protection
        raw = "[Text with [brackets] and (parens)](https://example.com)"
        assert CleanProcessor.clean(raw, rule) == "[Text with [brackets] and (parens)]("
        # parentheses inside link text are handled correctly, though
        ok = "[Text with (parens) and symbols](https://example.com)"
        assert CleanProcessor.clean(ok, rule) == ok
5 changes: 5 additions & 0 deletions api/tests/unit_tests/utils/test_text_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
("", ""),
(" ", " "),
("【测试】", "【测试】"),
# Markdown link preservation - should be preserved if text starts with a markdown link
("[Google](https://google.com) is a search engine", "[Google](https://google.com) is a search engine"),
("[Example](http://example.com) some text", "[Example](http://example.com) some text"),
# Leading symbols before markdown link are removed, including the opening bracket [
("@[Test](https://example.com)", "Test](https://example.com)"),
],
)
def test_remove_leading_symbols(input_text, expected_output):
Expand Down
Loading