HackTricks-wiki
diff --git a/generate_sitemap.py b/generate_sitemap.py
Lines changed: 17 additions & 27 deletions
@@ -3,6 +3,7 @@
 from xml.dom import minidom
 from tqdm import tqdm
 import re
+import urllib.parse
 
 # --------------------------------------------------------------------
 # 1) Definitions & Constants
@@ -57,28 +58,20 @@ def parse_paths_from_summary(summary_text):
     
     Returns a list of unique paths (without duplicates).
     """
-    # Regex to find standard Markdown links: [some text](some/path)
-    # Capture everything inside parentheses after the bracket, ignoring any leading/trailing spaces.
     pattern = r"\[[^\]]+\]\(\s*([^)]+?)\s*\)"
     matches = re.findall(pattern, summary_text)
 
     cleaned_paths = []
     for path in matches:
-        # Trim whitespace just in case
         path = path.strip()
 
         # 1) Handle /README.md -> /index.html
-        #    (anywhere in the path, not just the very end, but typically it should be at the end)
         if path.endswith("README.md"):
             path = path[:-9] + "index.html"
-
         # 2) Else if it ends with .md -> .html
         elif path.endswith(".md"):
             path = path[:-3] + ".html"
 
-        # You asked NOT to remove /index or trailing slashes
-        # so we won't do any extra trimming beyond that.
-
         # Avoid duplicates
         if path not in cleaned_paths:
             cleaned_paths.append(path)
@@ -113,22 +106,25 @@ def prettify_xml(element):
 def add_translated_urls(url_element, base_domain, path):
     """
     Add translated URLs with language codes, e.g.:
-       https://<base_domain>/<lang_code><path>
-
+       https://<base_domain>/<lang_code>/<path>
     Also sets x-default to English by default.
     """
+
+    # Percent-encode the path ('%' stays in the safe set so existing escapes are not double-encoded)
+    encoded_path = urllib.parse.quote(path, safe="/:?=&%")
+
     # We'll set x-default to the English version
     xdefault_link = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
     xdefault_link.set('rel', 'alternate')
     xdefault_link.set('hreflang', 'x-default')
-    xdefault_link.set('href', f"https://{base_domain}/en/{path}")
+    xdefault_link.set('href', f"https://{base_domain}/en/{encoded_path}")
 
     # Add one <xhtml:link> for each language
     for lang_code in languages.values():
         alt_link = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
         alt_link.set('rel', 'alternate')
         alt_link.set('hreflang', lang_code)
-        alt_link.set('href', f"https://{base_domain}/{lang_code}/{path}")
+        alt_link.set('href', f"https://{base_domain}/{lang_code}/{encoded_path}")
 
 # --------------------------------------------------------------------
 # 3) Main logic
@@ -147,17 +143,16 @@ def main():
     ET.register_namespace('xhtml', "http://www.w3.org/1999/xhtml")
     root = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset')
 
-    # ----------------------------------------------------------------
-    # 3.1) Process Book paths
-    # ----------------------------------------------------------------
     print("**Processing Book paths**...")
     for p in tqdm(book_paths, desc="Book paths"):
-        # Create <url> element
         url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
 
-        # Our base location for English is domain/en/path
+        # Encode path to handle special chars like '+'
+        encoded_path = urllib.parse.quote(p, safe="/:?=&%")
+
+        # Base location: domain/en/encoded_path
         loc_el = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
-        full_en_url = f"https://{BOOK_DOMAIN}/en/{p}"
+        full_en_url = f"https://{BOOK_DOMAIN}/en/{encoded_path}"
         loc_el.text = full_en_url
 
         # Priority calculation
@@ -168,17 +163,15 @@ def main():
         add_translated_urls(url_element, BOOK_DOMAIN, p)
         root.append(url_element)
 
-    # ----------------------------------------------------------------
-    # 3.2) Process Cloud paths
-    # ----------------------------------------------------------------
     print("**Processing Cloud paths**...")
     for p in tqdm(cloud_paths, desc="Cloud paths"):
-        # Create <url> element
         url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
 
-        # Our base location for English is domain/en/path
+        encoded_path = urllib.parse.quote(p, safe="/:?=&%")
+
+        # Base location: domain/en/encoded_path
         loc_el = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
-        full_en_url = f"https://{CLOUD_DOMAIN}/en/{p}"
+        full_en_url = f"https://{CLOUD_DOMAIN}/en/{encoded_path}"
         loc_el.text = full_en_url
 
         # Priority calculation
@@ -189,9 +182,6 @@ def main():
         add_translated_urls(url_element, CLOUD_DOMAIN, p)
         root.append(url_element)
 
-    # ----------------------------------------------------------------
-    # 3.3) Write the final sitemap
-    # ----------------------------------------------------------------
     print("**Generating final sitemap**...")
     sitemap_xml = prettify_xml(root)
     with open("sitemap.xml", "w", encoding="utf-8") as f: