55import re
66
77
def get_sections(text: str) -> list:
    """
    Returns the section titles found in markdown-style headers of the content.

    A header is any line that begins with one or more ``#`` characters
    (for example ``### Header``). The text following the ``#`` run is
    captured and then passed through ``clean_sections``.

    Args:
        text: The input text

    Returns:
        list: The cleaned header titles, in the order they appear in text
    """
    # The previous pattern used the look-behind ``(?<=\n|^)``, whose branches
    # have different widths; Python's ``re`` rejects that at compile time
    # ("look-behind requires fixed-width pattern"). Anchoring with ``^`` under
    # re.MULTILINE expresses the same "start of a line" intent.
    #
    # ``[ \t]*`` (rather than ``\s*``) keeps an empty header such as "#\n"
    # from consuming the newline and capturing the following line, and ``$``
    # also matches a header on the last line when there is no trailing newline.
    header_pattern = r"^#+[ \t]*(.*?)[ \t]*$"
    doc_metadata = re.findall(header_pattern, text, re.MULTILINE)
    return clean_sections(doc_metadata)
22+
23+
def clean_sections(sections: list) -> list:
    """
    Strips ``=`` and ``#`` characters from each section title and trims
    leading/trailing whitespace.

    Args:
        sections: The raw section strings

    Returns:
        list: One cleaned string per input entry, in the same order
    """
    tidy_titles = []
    for raw_title in sections:
        without_markers = re.sub(r"[=#]", "", raw_title)
        tidy_titles.append(without_markers.strip())
    return tidy_titles
30+
31+
832def remove_markdown_tags (text : str , tag_patterns : dict ) -> str :
933 """
1034 Remove specified Markdown tags from the text, keeping the contents of the tags.
@@ -28,7 +52,7 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
2852 return text
2953
3054
31- def clean_text (src_text : str ) -> str :
55+ def clean_text_and_extract_metadata (src_text : str ) -> tuple [ str , str ] :
3256 """This function performs following cleanup activities on the text, remove all unicode characters
3357 remove line spacing,remove stop words, normalize characters
3458
@@ -38,16 +62,21 @@ def clean_text(src_text: str) -> str:
3862 Returns:
3963 str: The clean text."""
4064
65+ return_record = {}
66+
4167 try :
4268 logging .info (f"Input text: { src_text } " )
4369 if len (src_text ) == 0 :
4470 logging .error ("Input text is empty" )
4571 raise ValueError ("Input text is empty" )
4672
73+ return_record ["marked_up_chunk" ] = src_text
74+ return_record ["sections" ] = get_sections (src_text )
75+
4776 # Define specific patterns for each tag
4877 tag_patterns = {
4978 "figurecontent" : r"<!-- FigureContent=(.*?)-->" ,
50- "figure" : r"<figure>(.*?)</figure>" ,
79+ "figure" : r"<figure(?:\s+FigureId=\"[^\"]*\")? >(.*?)</figure>" ,
5180 "figures" : r"\(figures/\d+\)(.*?)\(figures/\d+\)" ,
5281 "figcaption" : r"<figcaption>(.*?)</figcaption>" ,
5382 }
@@ -61,13 +90,15 @@ def clean_text(src_text: str) -> str:
6190 if len (cleaned_text ) == 0 :
6291 logging .error ("Cleaned text is empty" )
6392 raise ValueError ("Cleaned text is empty" )
93+ else :
94+ return_record ["cleaned_chunk" ] = cleaned_text
6495 except Exception as e :
65- logging .error (f"An error occurred in clean_text : { e } " )
96+ logging .error (f"An error occurred in clean_text_and_extract_metadata : { e } " )
6697 return ""
67- return cleaned_text
98+ return return_record
6899
69100
70- async def process_pre_embedding_cleaner (record : dict ) -> dict :
101+ async def process_mark_up_cleaner (record : dict ) -> dict :
71102 """Cleanup the data using standard python libraries.
72103
73104 Args:
@@ -88,7 +119,9 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
88119 "warnings" : None ,
89120 }
90121
91- cleaned_record ["data" ]["cleaned_chunk" ] = clean_text (record ["data" ]["content" ])
122+ cleaned_record ["data" ] = clean_text_and_extract_metadata (
123+ record ["data" ]["content" ]
124+ )
92125
93126 except Exception as e :
94127 logging .error ("string cleanup Error: %s" , e )
0 commit comments