33from xml .dom import minidom
44from tqdm import tqdm
55import re
6+ import urllib .parse
67
78# --------------------------------------------------------------------
89# 1) Definitions & Constants
@@ -57,28 +58,20 @@ def parse_paths_from_summary(summary_text):
5758
5859 Returns a list of unique paths (without duplicates).
5960 """
60- # Regex to find standard Markdown links: [some text](some/path)
61- # Capture everything inside parentheses after the bracket, ignoring any leading/trailing spaces.
6261 pattern = r"\[[^\]]+\]\(\s*([^)]+?)\s*\)"
6362 matches = re .findall (pattern , summary_text )
6463
6564 cleaned_paths = []
6665 for path in matches :
67- # Trim whitespace just in case
6866 path = path .strip ()
6967
7068 # 1) Handle /README.md -> /index.html
71- # (anywhere in the path, not just the very end, but typically it should be at the end)
7269 if path .endswith ("README.md" ):
7370 path = path [:- 9 ] + "index.html"
74-
7571 # 2) Else if it ends with .md -> .html
7672 elif path .endswith (".md" ):
7773 path = path [:- 3 ] + ".html"
7874
79- # You asked NOT to remove /index or trailing slashes
80- # so we won't do any extra trimming beyond that.
81-
8275 # Avoid duplicates
8376 if path not in cleaned_paths :
8477 cleaned_paths .append (path )
@@ -113,22 +106,25 @@ def prettify_xml(element):
def add_translated_urls(url_element, base_domain, path):
    """
    Add translated URLs with language codes to a sitemap <url> element.

    One <xhtml:link rel="alternate"> child is appended per language code,
    each pointing at:
        https://<base_domain>/<lang_code>/<path>
    Also sets x-default to English by default.

    Args:
        url_element: Element that receives the <xhtml:link> children.
        base_domain: Host name placed after "https://" in every link.
        path: Page path appended after the language code; it is
            percent-encoded before being written into the links.
    """
    xhtml_link_tag = '{http://www.w3.org/1999/xhtml}link'

    # Encode the path for safety. '%' stays in the safe set so sequences
    # that are already percent-encoded are not encoded a second time.
    encoded_path = urllib.parse.quote(path, safe="/:?=&%")

    # We'll set x-default to the English version
    xdefault_link = ET.SubElement(url_element, xhtml_link_tag)
    xdefault_link.set('rel', 'alternate')
    xdefault_link.set('hreflang', 'x-default')
    xdefault_link.set('href', f"https://{base_domain}/en/{encoded_path}")

    # Add one <xhtml:link> for each language.
    # NOTE(review): `languages` is a module-level mapping defined outside
    # this block; its values are used as the language codes.
    for lang_code in languages.values():
        alt_link = ET.SubElement(url_element, xhtml_link_tag)
        alt_link.set('rel', 'alternate')
        alt_link.set('hreflang', lang_code)
        alt_link.set('href', f"https://{base_domain}/{lang_code}/{encoded_path}")
132128
133129# --------------------------------------------------------------------
134130# 3) Main logic
@@ -147,17 +143,16 @@ def main():
147143 ET .register_namespace ('xhtml' , "http://www.w3.org/1999/xhtml" )
148144 root = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset' )
149145
150- # ----------------------------------------------------------------
151- # 3.1) Process Book paths
152- # ----------------------------------------------------------------
153146 print ("**Processing Book paths**..." )
154147 for p in tqdm (book_paths , desc = "Book paths" ):
155- # Create <url> element
156148 url_element = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
157149
158- # Our base location for English is domain/en/path
150+ # Encode path to handle special chars like '+'
151+ encoded_path = urllib .parse .quote (p , safe = "/:?=&%" )
152+
153+ # Base location: domain/en/encoded_path
159154 loc_el = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
160- full_en_url = f"https://{ BOOK_DOMAIN } /en/{ p } "
155+ full_en_url = f"https://{ BOOK_DOMAIN } /en/{ encoded_path } "
161156 loc_el .text = full_en_url
162157
163158 # Priority calculation
@@ -168,17 +163,15 @@ def main():
168163 add_translated_urls (url_element , BOOK_DOMAIN , p )
169164 root .append (url_element )
170165
171- # ----------------------------------------------------------------
172- # 3.2) Process Cloud paths
173- # ----------------------------------------------------------------
174166 print ("**Processing Cloud paths**..." )
175167 for p in tqdm (cloud_paths , desc = "Cloud paths" ):
176- # Create <url> element
177168 url_element = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
178169
179- # Our base location for English is domain/en/path
170+ encoded_path = urllib .parse .quote (p , safe = "/:?=&%" )
171+
172+ # Base location: domain/en/encoded_path
180173 loc_el = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
181- full_en_url = f"https://{ CLOUD_DOMAIN } /en/{ p } "
174+ full_en_url = f"https://{ CLOUD_DOMAIN } /en/{ encoded_path } "
182175 loc_el .text = full_en_url
183176
184177 # Priority calculation
@@ -189,9 +182,6 @@ def main():
189182 add_translated_urls (url_element , CLOUD_DOMAIN , p )
190183 root .append (url_element )
191184
192- # ----------------------------------------------------------------
193- # 3.3) Write the final sitemap
194- # ----------------------------------------------------------------
195185 print ("**Generating final sitemap**..." )
196186 sitemap_xml = prettify_xml (root )
197187 with open ("sitemap.xml" , "w" , encoding = "utf-8" ) as f :
0 commit comments