Skip to content

Commit 7a9a3de

Browse files
authored
Update generate_sitemap.py
1 parent 936e7b7 commit 7a9a3de

File tree

1 file changed

+73
-25
lines changed

1 file changed

+73
-25
lines changed

generate_sitemap.py

Lines changed: 73 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,17 @@
2424
"uk": "uk", # Ukrainian
2525
}
2626

27+
# Extra HTTP headers for sitemap requests (Googlebot User-Agent is disabled below: gitbook returns 403)
28+
HEADERS = {
29+
#"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" # gitbook returns 403
30+
}
31+
32+
def fetch_sitemap(url):
    """Download the sitemap at *url* and return the response body as text.

    The request is sent with the module-level ``HEADERS`` and a 30-second
    timeout. Anything raised by ``raise_for_status()`` for an unsuccessful
    HTTP response is propagated to the caller.
    """
    resp = requests.get(url, timeout=30, headers=HEADERS)
    # Fail loudly on an error status instead of returning an error page.
    resp.raise_for_status()
    return resp.text
37+
2738
def prettify_xml(element):
2839
"""Prettify and return a string representation of the XML."""
2940
rough_string = ET.tostring(element, encoding='utf-8')
@@ -32,27 +43,44 @@ def prettify_xml(element):
3243

3344
def encode_url(url):
    """Percent-encode *url* without double-encoding existing escapes.

    The characters ``:/?&=`` are left untouched. Valid ``%XX`` escape
    sequences already present in *url* are passed through unchanged, so a URL
    that is already encoded (for example one read from another sitemap) is
    not turned from ``%20`` into ``%2520``. A stray ``%`` that does not start
    a valid escape is still encoded as ``%25``. As a result the function is
    idempotent: ``encode_url(encode_url(u)) == encode_url(u)``.

    Only percent-encoding is performed here; ``&`` is kept as-is.

    Args:
        url: The URL string to encode.

    Returns:
        The percent-encoded URL string.
    """
    import re  # local import: only this helper needs it

    # With a capturing group, re.split places the matched %XX escapes at the
    # odd indices and the text between them at the even indices.
    pieces = re.split(r'(%[0-9A-Fa-f]{2})', url)
    return ''.join(
        piece if index % 2 else quote(piece, safe=":/?&=")
        for index, piece in enumerate(pieces)
    )
3647

3748
def add_static_urls_without_translations(root, urls):
    """Append a ``<url>`` entry containing only ``<loc>`` to *root* for each URL.

    Each URL is passed through ``encode_url`` before being stored. No
    alternate-language links are added to these entries.
    """
    sitemap_ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    for static_url in urls:
        entry = ET.Element(sitemap_ns + 'url')
        ET.SubElement(entry, sitemap_ns + 'loc').text = encode_url(static_url)
        # Attach the entry only once it is fully built.
        root.append(entry)
4655

4756
def main():
57+
# URLs of the sitemaps
58+
book_sitemap_url = "https://book.hacktricks.xyz/sitemap.xml"
59+
cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml"
60+
61+
# Fetch both sitemaps
62+
book_sitemap_data = fetch_sitemap(book_sitemap_url)
63+
cloud_sitemap_data = fetch_sitemap(cloud_sitemap_url)
64+
65+
# Parse XML
66+
ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
67+
book_root = ET.fromstring(book_sitemap_data)
68+
cloud_root = ET.fromstring(cloud_sitemap_data)
69+
70+
all_urls = book_root.findall('ns:url', ns) + cloud_root.findall('ns:url', ns)
71+
4872
# Prepare the output sitemap
4973
ET.register_namespace('', "http://www.sitemaps.org/schemas/sitemap/0.9")
5074
ET.register_namespace('xhtml', "http://www.w3.org/1999/xhtml")
5175
new_root = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset')
5276

53-
# Add static URLs without translations
54-
static_urls = [
55-
"https://www.hacktricks.xyz/",
77+
# Add static entry for https://www.hacktricks.xyz/
78+
add_static_urls_without_translations(new_root, [
79+
"https://www.hacktricks.xyz/"
80+
])
81+
82+
# Add static URLs for training.hacktricks.xyz without translations
83+
static_training_urls = [
5684
"https://training.hacktricks.xyz/",
5785
"https://training.hacktricks.xyz/courses/arte",
5886
"https://training.hacktricks.xyz/courses/arta",
@@ -66,25 +94,45 @@ def main():
6694
"https://training.hacktricks.xyz/terms",
6795
"https://training.hacktricks.xyz/privacy",
6896
]
69-
add_static_urls_without_translations(new_root, static_urls)
70-
71-
# Add URLs with translations (Example: book.hacktricks.xyz)
72-
url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
73-
loc = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
74-
loc.text = encode_url("https://book.hacktricks.xyz/")
75-
priority = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority')
76-
priority.text = "0.84"
77-
lastmod = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')
78-
lastmod.text = "2024-12-14"
79-
80-
# Add translations
81-
for hreflang, lang_path in languages.items():
82-
alt_link = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
83-
alt_link.set('rel', 'alternate')
84-
alt_link.set('hreflang', hreflang)
85-
alt_link.set('href', encode_url(f"https://book.hacktricks.xyz/{lang_path}"))
86-
87-
new_root.append(url_element)
97+
add_static_urls_without_translations(new_root, static_training_urls)
98+
99+
# Process main URLs from book and cloud hacktricks sitemaps
100+
for url_element in tqdm(all_urls, desc="Processing URLs"):
101+
loc = url_element.find('ns:loc', ns)
102+
if loc is None:
103+
continue
104+
105+
loc_text = loc.text.strip()
106+
priority = url_element.find('ns:priority', ns)
107+
lastmod = url_element.find('ns:lastmod', ns)
108+
109+
# Create a new <url> element
110+
url_entry = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
111+
112+
# Add <loc>
113+
loc_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
114+
loc_el.text = encode_url(loc_text)
115+
116+
# Add <priority> if available
117+
if priority is not None:
118+
priority_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority')
119+
priority_el.text = priority.text
120+
121+
# Add <lastmod> if available
122+
if lastmod is not None:
123+
lastmod_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')
124+
lastmod_el.text = lastmod.text
125+
126+
# Add alternate links for translations
127+
base_domain = loc_text.split('/')[0:3]
128+
base_domain = '/'.join(base_domain)
129+
for hreflang, lang_path in languages.items():
130+
alt_link = ET.SubElement(url_entry, '{http://www.w3.org/1999/xhtml}link')
131+
alt_link.set('rel', 'alternate')
132+
alt_link.set('hreflang', hreflang)
133+
alt_link.set('href', encode_url(f"{base_domain}/{lang_path}"))
134+
135+
new_root.append(url_entry)
88136

89137
# Save prettified XML to file
90138
beautified_xml = prettify_xml(new_root)

0 commit comments

Comments
 (0)