2424 "uk" : "uk" , # Ukrainian
2525}
2626
# Extra HTTP headers sent with every sitemap request.
# The Googlebot User-Agent below is deliberately left disabled because
# GitBook answers it with HTTP 403, so the mapping is currently empty.
HEADERS = {
    # "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
}
31-
def fetch_sitemap(url: str) -> str:
    """Download a sitemap and return its body as text.

    Args:
        url: Address of the sitemap to request.

    Returns:
        The response body as decoded text (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # An explicit timeout keeps a stalled server from hanging the script.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return response.text
37-
38- # def check_url_exists(url):
39- # """Check if a URL exists using a HEAD request."""
40- # try:
41- # r = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=30)
42- # return r.status_code == 200
43- # except Exception:
44- # return False
45-
4627def prettify_xml (element ):
4728 """Prettify and return a string representation of the XML."""
4829 rough_string = ET .tostring (element , encoding = 'utf-8' )
@@ -51,7 +32,7 @@ def prettify_xml(element):
5132
def encode_url(url: str) -> str:
    """Percent-encode *url* so it can be embedded in the sitemap.

    The URL delimiters ``:``, ``/``, ``?``, ``&`` and ``=`` are left
    untouched (in addition to the characters ``quote`` never escapes);
    everything else, including spaces and non-ASCII text, is escaped.

    Note that ``%`` is not in the safe set, so a URL that is already
    percent-encoded will be encoded a second time.

    Args:
        url: The raw URL to encode.

    Returns:
        The percent-encoded URL.
    """
    return quote(url, safe=":/?&=")
5536
5637def add_static_urls_without_translations (root , urls ):
5738 """Add static URLs without translations to the sitemap."""
@@ -64,37 +45,14 @@ def add_static_urls_without_translations(root, urls):
6445 root .append (url_element )
6546
6647def main ():
67- # URLs of the sitemaps
68- book_sitemap_url = "https://book.hacktricks.xyz/sitemap.xml"
69- cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml"
70-
71- # Fetch both sitemaps
72- book_sitemap_data = fetch_sitemap (book_sitemap_url )
73- cloud_sitemap_data = fetch_sitemap (cloud_sitemap_url )
74-
75- # Parse XML
76- ns = {'ns' : 'http://www.sitemaps.org/schemas/sitemap/0.9' }
77- book_root = ET .fromstring (book_sitemap_data )
78- cloud_root = ET .fromstring (cloud_sitemap_data )
79-
80- all_urls = book_root .findall ('ns:url' , ns ) + cloud_root .findall ('ns:url' , ns )
81-
8248 # Prepare the output sitemap
8349 ET .register_namespace ('' , "http://www.sitemaps.org/schemas/sitemap/0.9" )
8450 ET .register_namespace ('xhtml' , "http://www.w3.org/1999/xhtml" )
8551 new_root = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset' )
8652
87- seen_locs = set ()
88- url_entries = [] # Store info for each main URL
89-
90- # Add static entry for https://www.hacktricks.xyz/
91- static_url = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
92- loc = ET .SubElement (static_url , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
93- loc .text = encode_url ("https://www.hacktricks.xyz/" )
94- new_root .append (static_url )
95-
96- # Add static URLs for training.hacktricks.xyz without translations
97- static_training_urls = [
53+ # Add static URLs without translations
54+ static_urls = [
55+ "https://www.hacktricks.xyz/" ,
9856 "https://training.hacktricks.xyz/" ,
9957 "https://training.hacktricks.xyz/courses/arte" ,
10058 "https://training.hacktricks.xyz/courses/arta" ,
@@ -108,76 +66,25 @@ def main():
10866 "https://training.hacktricks.xyz/terms" ,
10967 "https://training.hacktricks.xyz/privacy" ,
11068 ]
111- add_static_urls_without_translations (new_root , static_training_urls )
112-
113- # Process main URLs
114- for url_element in tqdm (all_urls , desc = "Processing URLs" ):
115- loc = url_element .find ('ns:loc' , ns )
116- if loc is None :
117- continue
118- loc_text = loc .text .strip ()
119-
120- if loc_text in seen_locs :
121- continue
122- seen_locs .add (loc_text )
123-
124- priority = url_element .find ('ns:priority' , ns )
125- lastmod = url_element .find ('ns:lastmod' , ns )
126-
127- # Encode the base loc_text
128- loc_text = encode_url (loc_text )
129-
130- # Determine base domain and path
131- parts = loc_text .split ("/" )
132- if len (parts ) > 3 :
133- base_domain_parts = parts [:3 ]
134- page_path = "/" .join (parts [3 :])
135- else :
136- base_domain_parts = parts [:3 ]
137- page_path = ""
138-
139- base_domain = "/" .join (base_domain_parts )
140-
141- # Construct all translation URLs for this loc
142- translation_urls = {}
143- for lang_code , hreflang in languages .items ():
144- if page_path :
145- translated_url = f"{ base_domain } /{ lang_code } /{ page_path } "
146- else :
147- # If original was just the root, translated is also root + /lang
148- translated_url = f"{ base_domain } /{ lang_code } "
149- translation_urls [hreflang ] = encode_url (translated_url )
150-
151- url_entries .append ((
152- loc_text ,
153- priority .text if priority is not None else None ,
154- lastmod .text if lastmod is not None else None ,
155- translation_urls
156- ))
157-
158- # Build the final sitemap
159- for (loc_text , priority_val , lastmod_val , translation_urls ) in url_entries :
160- new_url = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
161-
162- loc_el = ET .SubElement (new_url , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
163- loc_el .text = loc_text
164-
165- if priority_val :
166- priority_el = ET .SubElement (new_url , '{http://www.sitemaps.org/schemas/sitemap/0.9}priority' )
167- priority_el .text = priority_val
168-
169- if lastmod_val :
170- lastmod_el = ET .SubElement (new_url , '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod' )
171- lastmod_el .text = lastmod_val
172-
173- # Add all translations (assume all exist for now)
174- for hreflang , t_url in translation_urls .items ():
175- alt_link = ET .SubElement (new_url , '{http://www.sitemaps.org/schemas/sitemap/0.9}link' )
176- alt_link .set ('rel' , 'alternate' )
177- alt_link .set ('hreflang' , hreflang )
178- alt_link .set ('href' , t_url )
179-
180- new_root .append (new_url )
69+ add_static_urls_without_translations (new_root , static_urls )
70+
71+ # Add URLs with translations (Example: book.hacktricks.xyz)
72+ url_element = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
73+ loc = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
74+ loc .text = encode_url ("https://book.hacktricks.xyz/" )
75+ priority = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}priority' )
76+ priority .text = "0.84"
77+ lastmod = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod' )
78+ lastmod .text = "2024-12-14"
79+
80+ # Add translations
81+ for hreflang , lang_path in languages .items ():
82+ alt_link = ET .SubElement (url_element , '{http://www.w3.org/1999/xhtml}link' )
83+ alt_link .set ('rel' , 'alternate' )
84+ alt_link .set ('hreflang' , hreflang )
85+ alt_link .set ('href' , encode_url (f"https://book.hacktricks.xyz/{ lang_path } " ))
86+
87+ new_root .append (url_element )
18188
18289 # Save prettified XML to file
18390 beautified_xml = prettify_xml (new_root )
0 commit comments