55import re
66
77
def get_sections(cleaned_text: str) -> list:
    """Return the section headings found in Markdown-style text.

    Two heading forms are recognised, each anchored to the start of a line:

    * Setext: a line immediately followed by a line that starts with
      ``===``; the title line is captured.
    * ATX: a line that starts with one or more ``#``; the remainder of the
      line is captured.

    Args:
        cleaned_text: The input text.

    Returns:
        list: The heading strings in document order, after being passed
        through ``clean_sections``.
    """
    # re.MULTILINE with "^"/"$" anchors replaces the earlier re.DOTALL form:
    #  - "." no longer crosses line breaks, so a setext match captures only
    #    the title line instead of everything preceding the "===" underline;
    #  - headings on the first line, on the last line without a trailing
    #    newline, and on consecutive lines are all found, because no
    #    surrounding "\n" has to be consumed by the match;
    #  - "[ \t]*" (rather than "\s*") keeps an empty "#" line from swallowing
    #    the following line as its title.
    heading_pattern = r"^(.+)\n===|^#+[ \t]*(\S.*)$"
    matches = re.findall(heading_pattern, cleaned_text, re.MULTILINE)

    # findall yields one (setext_title, atx_title) tuple per match; the
    # alternative that did not take part in the match is an empty string.
    headings = [title for groups in matches for title in groups if title]
    return clean_sections(headings)
23-
24-
def clean_sections(sections: list) -> list:
    """Strip heading markup and surrounding whitespace from section titles.

    Every ``=`` and ``#`` character is removed from each entry (wherever it
    occurs in the string, not only at the edges), then leading and trailing
    whitespace is trimmed.

    Args:
        sections: Raw section strings.

    Returns:
        list: A new list with one cleaned string per input entry, in the
        same order. Entries that consist only of markup become ``""``.
    """
    return [re.sub(r"[=#]", "", section).strip() for section in sections]
30-
31-
328def remove_markdown_tags (text : str , tag_patterns : dict ) -> str :
339 """
3410 Remove specified Markdown tags from the text, keeping the contents of the tags.
@@ -52,7 +28,7 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
5228 return text
5329
5430
55- def clean_text_with_section_extraction (src_text : str ) -> tuple [ str , str ] :
31+ def clean_text (src_text : str ) -> str :
5632 """This function performs following cleanup activities on the text, remove all unicode characters
5733 remove line spacing,remove stop words, normalize characters
5834
@@ -77,8 +53,6 @@ def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]:
7753 }
7854 cleaned_text = remove_markdown_tags (src_text , tag_patterns )
7955
80- sections = get_sections (cleaned_text )
81-
8256 # Updated regex to keep Unicode letters, punctuation, whitespace, currency symbols, and percentage signs,
8357 # while also removing non-printable characters
8458 cleaned_text = re .sub (r"[^\p{L}\p{P}\s\p{Sc}%\x20-\x7E]" , "" , cleaned_text )
@@ -90,7 +64,7 @@ def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]:
9064 except Exception as e :
9165 logging .error (f"An error occurred in clean_text: { e } " )
9266 return ""
93- return cleaned_text , sections
67+ return cleaned_text
9468
9569
9670async def process_pre_embedding_cleaner (record : dict ) -> dict :
@@ -114,19 +88,7 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
11488 "warnings" : None ,
11589 }
11690
117- # scenarios when page by chunking is enabled
118- if isinstance (record ["data" ]["chunk" ], dict ):
119- (
120- cleaned_record ["data" ]["cleanedChunk" ],
121- cleaned_record ["data" ]["sections" ],
122- ) = clean_text_with_section_extraction (record ["data" ]["chunk" ]["content" ])
123- cleaned_record ["data" ]["chunk" ] = record ["data" ]["chunk" ]["content" ]
124- else :
125- (
126- cleaned_record ["data" ]["cleanedChunk" ],
127- cleaned_record ["data" ]["sections" ],
128- ) = clean_text_with_section_extraction (record ["data" ]["chunk" ])
129- cleaned_record ["data" ]["chunk" ] = record ["data" ]["chunk" ]
91+ cleaned_record ["data" ]["cleaned_chunk" ] = clean_text (record ["data" ]["content" ])
13092
13193 except Exception as e :
13294 logging .error ("string cleanup Error: %s" , e )
0 commit comments