File tree Expand file tree Collapse file tree 2 files changed +26
-3
lines changed
Expand file tree Collapse file tree 2 files changed +26
-3
lines changed Original file line number Diff line number Diff line change 1+ {
2+ "IsEncrypted" : false ,
3+ "Values" : {
4+ "AIService__DocumentIntelligence__Endpoint" : " <documentIntelligenceEndpoint>" ,
5+ "AIService__DocumentIntelligence__Key" : " <documentIntelligenceKey if not using identity>" ,
6+ "AIService__Language__Endpoint" : " <languageEndpoint>" ,
7+ "AIService__Language__Key" : " <languageKey if not using identity>" ,
8+ "FunctionApp__ClientId" : " <clientId of the function app if using user assigned managed identity>" ,
9+ "IdentityType" : " <identityType: system_assigned, user_assigned, or key>" ,
10+ "OpenAI__ApiKey" : " <openAIKey if not using managed identity>" ,
11+ "OpenAI__ApiVersion" : " <openAIApiVersion>" ,
12+ "OpenAI__Endpoint" : " <openAIEndpoint>" ,
13+ "OpenAI__MultiModalDeployment" : " <openAIMultiModalDeploymentId>" ,
14+ "StorageAccount__ConnectionString" : " <connectionString if not using managed identity>" ,
15+ "StorageAccount__Endpoint" : " <Endpoint if using identity based connections>"
16+ }
17+ }
Original file line number Diff line number Diff line change @@ -71,15 +71,21 @@ def clean_text(src_text: str) -> str:
7171 try :
7272 # Define specific patterns for each tag
7373 tag_patterns = {
74- "figurecontent" : r"<!-- FigureContent=(.*?)-->" ,
74+ "figurecontent" : r"<!--.*? FigureContent=(.*?)-->" ,
7575 "figure" : r"<figure>(.*?)</figure>" ,
7676 "figures" : r"\(figures/\d+\)(.*?)\(figures/\d+\)" ,
7777 "figcaption" : r"<figcaption>(.*?)</figcaption>" ,
7878 }
7979 cleaned_text = remove_markdown_tags (src_text , tag_patterns )
8080
81- # remove line breaks
82- cleaned_text = re .sub (r"\n" , "" , cleaned_text )
81+ # remove html tags
82+ cleaned_text = re .sub (r"<.*?>" , "" , cleaned_text )
83+
84+ # Replace newline characters with spaces
85+ cleaned_text = re .sub (r"\n" , " " , cleaned_text )
86+
87+ # Replace multiple whitespace characters with a single space
88+ cleaned_text = re .sub (r"\s+" , " " , cleaned_text )
8389
8490 # remove stopwords
8591 tokens = word_tokenize (cleaned_text , "english" )
You can’t perform that action at this time.
0 commit comments