2424 "uk" : "uk" , # Ukrainian
2525}
2626
# Extra HTTP headers sent with every sitemap request.
# Left empty on purpose: the Googlebot User-Agent kept below for reference
# makes GitBook answer with 403.
HEADERS = {
    # "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
}

def fetch_sitemap(url):
    """Fetch a sitemap over HTTP and return its body as text.

    Args:
        url: Absolute URL of the sitemap to download.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the
            30-second timeout expires.
    """
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    # Without an explicit charset, requests decodes text/* bodies as
    # ISO-8859-1, which corrupts non-ASCII URLs. The sitemap protocol
    # mandates UTF-8, so use that unless the server declared something else.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = "utf-8"
    return response.text
37+
2738def prettify_xml (element ):
2839 """Prettify and return a string representation of the XML."""
2940 rough_string = ET .tostring (element , encoding = 'utf-8' )
@@ -32,27 +43,44 @@ def prettify_xml(element):
3243
def encode_url(url):
    """Percent-encode *url* so it is RFC-compliant inside a sitemap.

    The characters ``:/?&=`` are left untouched so the URL structure is
    preserved. ``%`` is left untouched as well, so a URL that is already
    percent-encoded (for example one read from an existing sitemap) is not
    encoded a second time (``%20`` must not become ``%2520``).

    Args:
        url: The URL to encode, raw or already percent-encoded.

    Returns:
        The encoded URL as a string.
    """
    return quote(url, safe=":/?&=%")
3647
def add_static_urls_without_translations(root, urls):
    """Append a bare ``<url><loc>...</loc></url>`` entry to *root* for each URL.

    Each entry carries only the encoded location; no priority, lastmod or
    alternate-language links are added.

    Args:
        root: The sitemap element that receives the new ``<url>`` children.
        urls: Iterable of URL strings to add.
    """
    sitemap_ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    for address in urls:
        entry = ET.Element(sitemap_ns + 'url')
        ET.SubElement(entry, sitemap_ns + 'loc').text = encode_url(address)
        root.append(entry)
4655
4756def main ():
57+ # URLs of the sitemaps
58+ book_sitemap_url = "https://book.hacktricks.xyz/sitemap.xml"
59+ cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml"
60+
61+ # Fetch both sitemaps
62+ book_sitemap_data = fetch_sitemap (book_sitemap_url )
63+ cloud_sitemap_data = fetch_sitemap (cloud_sitemap_url )
64+
65+ # Parse XML
66+ ns = {'ns' : 'http://www.sitemaps.org/schemas/sitemap/0.9' }
67+ book_root = ET .fromstring (book_sitemap_data )
68+ cloud_root = ET .fromstring (cloud_sitemap_data )
69+
70+ all_urls = book_root .findall ('ns:url' , ns ) + cloud_root .findall ('ns:url' , ns )
71+
4872 # Prepare the output sitemap
4973 ET .register_namespace ('' , "http://www.sitemaps.org/schemas/sitemap/0.9" )
5074 ET .register_namespace ('xhtml' , "http://www.w3.org/1999/xhtml" )
5175 new_root = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset' )
5276
53- # Add static URLs without translations
54- static_urls = [
55- "https://www.hacktricks.xyz/" ,
77+ # Add static entry for https://www.hacktricks.xyz/
78+ add_static_urls_without_translations (new_root , [
79+ "https://www.hacktricks.xyz/"
80+ ])
81+
82+ # Add static URLs for training.hacktricks.xyz without translations
83+ static_training_urls = [
5684 "https://training.hacktricks.xyz/" ,
5785 "https://training.hacktricks.xyz/courses/arte" ,
5886 "https://training.hacktricks.xyz/courses/arta" ,
@@ -66,25 +94,45 @@ def main():
6694 "https://training.hacktricks.xyz/terms" ,
6795 "https://training.hacktricks.xyz/privacy" ,
6896 ]
69- add_static_urls_without_translations (new_root , static_urls )
70-
71- # Add URLs with translations (Example: book.hacktricks.xyz)
72- url_element = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
73- loc = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
74- loc .text = encode_url ("https://book.hacktricks.xyz/" )
75- priority = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}priority' )
76- priority .text = "0.84"
77- lastmod = ET .SubElement (url_element , '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod' )
78- lastmod .text = "2024-12-14"
79-
80- # Add translations
81- for hreflang , lang_path in languages .items ():
82- alt_link = ET .SubElement (url_element , '{http://www.w3.org/1999/xhtml}link' )
83- alt_link .set ('rel' , 'alternate' )
84- alt_link .set ('hreflang' , hreflang )
85- alt_link .set ('href' , encode_url (f"https://book.hacktricks.xyz/{ lang_path } " ))
86-
87- new_root .append (url_element )
97+ add_static_urls_without_translations (new_root , static_training_urls )
98+
99+ # Process main URLs from book and cloud hacktricks sitemaps
100+ for url_element in tqdm (all_urls , desc = "Processing URLs" ):
101+ loc = url_element .find ('ns:loc' , ns )
102+ if loc is None :
103+ continue
104+
105+ loc_text = loc .text .strip ()
106+ priority = url_element .find ('ns:priority' , ns )
107+ lastmod = url_element .find ('ns:lastmod' , ns )
108+
109+ # Create a new <url> element
110+ url_entry = ET .Element ('{http://www.sitemaps.org/schemas/sitemap/0.9}url' )
111+
112+ # Add <loc>
113+ loc_el = ET .SubElement (url_entry , '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' )
114+ loc_el .text = encode_url (loc_text )
115+
116+ # Add <priority> if available
117+ if priority is not None :
118+ priority_el = ET .SubElement (url_entry , '{http://www.sitemaps.org/schemas/sitemap/0.9}priority' )
119+ priority_el .text = priority .text
120+
121+ # Add <lastmod> if available
122+ if lastmod is not None :
123+ lastmod_el = ET .SubElement (url_entry , '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod' )
124+ lastmod_el .text = lastmod .text
125+
126+ # Add alternate links for translations
127+ base_domain = loc_text .split ('/' )[0 :3 ]
128+ base_domain = '/' .join (base_domain )
129+ for hreflang , lang_path in languages .items ():
130+ alt_link = ET .SubElement (url_entry , '{http://www.w3.org/1999/xhtml}link' )
131+ alt_link .set ('rel' , 'alternate' )
132+ alt_link .set ('hreflang' , hreflang )
133+ alt_link .set ('href' , encode_url (f"{ base_domain } /{ lang_path } " ))
134+
135+ new_root .append (url_entry )
88136
89137 # Save prettified XML to file
90138 beautified_xml = prettify_xml (new_root )
0 commit comments