|
def split_continuous_references(text: str) -> str:
    """
    Split a combined reference tag into individual reference tags.

    Converts patterns like [1:92ff35fb, 4:bfe6f044] to [1:92ff35fb][4:bfe6f044]
    (the comma, and a single space following it if present, becomes "][").

    Only processes text if:
    1. '[' appears exactly once
    2. ']' appears exactly once
    3. '[' comes before ']'
    4. There is at least one comma between '[' and ']'

    Only the span between the brackets is rewritten; text outside the
    brackets is returned untouched even if it happens to contain the same
    characters as the bracket content.

    Args:
        text (str): Text containing reference tags

    Returns:
        str: Text with split reference tags, or the original text if the
            conditions above are not met
    """
    # Nothing to do for empty input (returned as-is).
    if not text:
        return text

    # Require exactly one '[' and exactly one ']'.
    if text.count("[") != 1 or text.count("]") != 1:
        return text

    open_bracket_pos = text.find("[")
    close_bracket_pos = text.find("]")

    # Brackets must be in the correct order.
    if open_bracket_pos >= close_bracket_pos:
        return text

    content_between_brackets = text[open_bracket_pos + 1 : close_bracket_pos]
    if "," not in content_between_brackets:
        return text

    # Handle ", " first so the space is consumed, then any remaining bare ",".
    # Both replacements run on the bracket content itself, so mixed separators
    # such as "1:a, 2:b,3:c" are fully split.
    split_content = content_between_brackets.replace(", ", "][").replace(",", "][")

    # Rebuild by position rather than str.replace on the whole text, so an
    # identical substring outside the brackets is never modified.
    return text[: open_bracket_pos + 1] + split_content + text[close_bracket_pos:]
| 43 | + |
| 44 | + |
def process_streaming_references_complete(text_buffer: str) -> tuple[str, str]:
    """
    Split a streaming buffer so that a reference tag is never emitted half-way.

    The buffer is divided into a prefix that can be emitted now and a tail
    that is held back because it may be the start of a reference tag of the
    form [refid:memoriesID] that has not fully arrived yet. The emitted prefix
    is passed through split_continuous_references before being returned.

    Args:
        text_buffer (str): The accumulated text buffer.

    Returns:
        tuple[str, str]: (processed_text, remaining_buffer). remaining_buffer
            is "" when nothing needs to be held back.
    """
    import re

    # Case 1: the buffer contains at least one complete tag "[<digits>:<id>]".
    complete_matches = list(re.finditer(r"\[\d+:[^\]]+\]", text_buffer))
    if complete_matches:
        end_pos = complete_matches[-1].end()

        # After the last complete tag, any '[' that is not followed by a ']'
        # before the end of the buffer is treated as a tag still in flight.
        incomplete_match = re.search(r"\[\d*:?[^\]]*$", text_buffer[end_pos:])
        if incomplete_match:
            incomplete_start = end_pos + incomplete_match.start()
            processed_text = split_continuous_references(text_buffer[:incomplete_start])
            return processed_text, text_buffer[incomplete_start:]

        # Nothing pending after the last complete tag: emit everything.
        return split_continuous_references(text_buffer), ""

    # Case 2: no complete tag, but something that opens like one ("[<digits>:").
    opening_matches = list(re.finditer(r"\[\d+:", text_buffer))
    if opening_matches:
        last_opening = opening_matches[-1]

        # A ']' after the last opening means the bracket is already closed
        # (it just did not match the complete-tag pattern), so emit everything.
        if "]" in text_buffer[last_opening.end() :]:
            return split_continuous_references(text_buffer), ""

        # Otherwise hold back from the last opening onwards.
        opening_start = last_opening.start()
        processed_text = split_continuous_references(text_buffer[:opening_start])
        return processed_text, text_buffer[opening_start:]

    # Case 3: the buffer ends with a possible tag prefix: "[", "[1", "[12:", ...
    # Only a prefix at the very end is held back, so bracketed text such as
    # "[ \Delta U = Q - W ]" is not cut off. This pattern also covers a lone
    # trailing "[", so no separate endswith("[") check is needed.
    potential_ref_match = re.search(r"\[\d*:?$", text_buffer)
    if potential_ref_match:
        ref_start = potential_ref_match.start()
        processed_text = split_continuous_references(text_buffer[:ref_start])
        return processed_text, text_buffer[ref_start:]

    # No reference-like patterns found: emit all text.
    return split_continuous_references(text_buffer), ""
0 commit comments