11import requests
22import xml .etree .ElementTree as ET
33from xml .dom import minidom
4- from concurrent .futures import ThreadPoolExecutor , as_completed
54from tqdm import tqdm
5+ from urllib .parse import quote
66
77# Correct hreflang values for languages
88languages = {
@@ -35,20 +35,24 @@ def fetch_sitemap(url):
3535 response .raise_for_status ()
3636 return response .text
3737
38- def check_url_exists (url ):
39- """Check if a URL exists using a HEAD request."""
40- try :
41- r = requests .head (url , headers = HEADERS , allow_redirects = True , timeout = 30 )
42- return r .status_code == 200
43- except Exception :
44- return False
38+ # def check_url_exists(url):
39+ # """Check if a URL exists using a HEAD request."""
40+ # try:
41+ # r = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=30)
42+ # return r.status_code == 200
43+ # except Exception:
44+ # return False
4545
def prettify_xml(element):
    """Serialize an ElementTree element to an indented, human-readable XML string.

    The element is first dumped to UTF-8 bytes, then re-parsed with minidom so
    that its pretty-printer can lay the document out one node per line.
    """
    return minidom.parseString(
        ET.tostring(element, encoding='utf-8')
    ).toprettyxml(indent=" ")
5151
def encode_url(url):
    """Percent-encode a URL so that it only contains RFC 3986-legal characters.

    Structural URL characters (``:``, ``/``, ``?``, ``&``, ``=``) are left
    untouched, and so is ``%``: existing percent-escapes such as ``%20`` are
    preserved instead of being re-encoded to ``%2520``. This makes the function
    idempotent, so it is safe to call on a URL that was already encoded (or
    that was built from an already-encoded URL).

    Note that this does not perform XML escaping (``&`` is kept as is); that is
    left to the XML serializer that writes the value out.

    Args:
        url: The URL string to encode.

    Returns:
        The percent-encoded URL string.
    """
    # '%' must be in the safe set, otherwise encoding twice corrupts the URL.
    return quote(url, safe=":/?&=%")
55+
5256def main ():
5357 # URLs of the sitemaps
5458 book_sitemap_url = "https://book.hacktricks.xyz/sitemap.xml"
@@ -76,7 +80,7 @@ def main():
7680 # Add static entry for https://www.hacktricks.xyz/
7781 static_url = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
7882 loc = ET .SubElement (static_url , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
79- loc .text = "https://www.hacktricks.xyz/"
83+ loc .text = encode_url ( "https://www.hacktricks.xyz/" )
8084 new_root .append (static_url )
8185
8286 # Process main URLs
@@ -93,6 +97,9 @@ def main():
9397 priority = url_element .find ('ns:priority' , ns )
9498 lastmod = url_element .find ('ns:lastmod' , ns )
9599
100+ # Encode the base loc_text
101+ loc_text = encode_url (loc_text )
102+
96103 # Determine base domain and path
97104 parts = loc_text .split ("/" )
98105 if len (parts ) > 3 :
@@ -112,7 +119,7 @@ def main():
112119 else :
113120 # If original was just the root, translated is also root + /lang
114121 translated_url = f"{ base_domain } /{ lang_code } "
115- translation_urls [hreflang ] = translated_url
122+ translation_urls [hreflang ] = encode_url ( translated_url )
116123
117124 url_entries .append ((
118125 loc_text ,
@@ -121,18 +128,16 @@ def main():
121128 translation_urls
122129 ))
123130
124- # Parallel check all translation URLs with progress bar
125- all_translation_checks = {}
126- with ThreadPoolExecutor (max_workers = 30 ) as executor :
127- # Submit all tasks to executor
128- future_to_url = {executor .submit (check_url_exists , t_url ): (hreflang , t_url )
129- for _ , _ , _ , t_urls in url_entries for hreflang , t_url in t_urls .items ()}
130-
131- # Use tqdm to show progress
132- for future in tqdm (as_completed (future_to_url ), total = len (future_to_url ), desc = "Checking Translation URLs" ):
133- hreflang , t_url = future_to_url [future ]
134- result = future .result ()
135- all_translation_checks [t_url ] = result
131+ # Commented-out URL checks, assuming all translations exist for now
132+ # all_translation_checks = {}
133+ # with ThreadPoolExecutor(max_workers=10) as executor:
134+ # future_to_url = {executor.submit(check_url_exists, t_url): (hreflang, t_url)
135+ # for _, _, _, t_urls in url_entries for hreflang, t_url in t_urls.items()}
136+ #
137+ # for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc="Checking Translation URLs"):
138+ # hreflang, t_url = future_to_url[future]
139+ # result = future.result()
140+ # all_translation_checks[t_url] = result
136141
137142 # Build the final sitemap
138143 for (loc_text , priority_val , lastmod_val , translation_urls ) in url_entries :
@@ -149,16 +154,12 @@ def main():
149154 lastmod_el = ET .SubElement (new_url , '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod' )
150155 lastmod_el .text = lastmod_val
151156
152- # Add existing translations (excluding English, which is default )
157+ # Add all translations (assume all exist for now )
153158 for hreflang , t_url in translation_urls .items ():
154- if all_translation_checks .get (t_url , False ):
155- alt_link = ET .SubElement (new_url , '{http://www.w3.org/1999/xhtml}link' )
156- alt_link .set ('rel' , 'alternate' )
157- alt_link .set ('hreflang' , hreflang )
158- alt_link .set ('href' , t_url )
159- else :
160- # Print in red if not found
161- print ("\033 [31m" + f"{ t_url } NOT FOUND" + "\033 [0m" )
159+ alt_link = ET .SubElement (new_url , '{http://www.w3.org/1999/xhtml}link' )
160+ alt_link .set ('rel' , 'alternate' )
161+ alt_link .set ('hreflang' , hreflang )
162+ alt_link .set ('href' , t_url )
162163
163164 new_root .append (new_url )
164165
0 commit comments