Skip to content

Commit 0c73cd4

Browse files
authored
Update generate_sitemap.py
1 parent c187934 commit 0c73cd4

File tree

1 file changed

+23
-116
lines changed

1 file changed

+23
-116
lines changed

generate_sitemap.py

Lines changed: 23 additions & 116 deletions
Original file line number | Diff line number | Diff line change
@@ -24,25 +24,6 @@
2424
"uk": "uk", # Ukrainian
2525
}
2626

27-
# User agent for Googlebot
28-
HEADERS = {
29-
#"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" # gitbook returns 403
30-
}
31-
32-
def fetch_sitemap(url):
33-
"""Fetch and return the contents of a sitemap."""
34-
response = requests.get(url, headers=HEADERS, timeout=30)
35-
response.raise_for_status()
36-
return response.text
37-
38-
# def check_url_exists(url):
39-
# """Check if a URL exists using a HEAD request."""
40-
# try:
41-
# r = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=30)
42-
# return r.status_code == 200
43-
# except Exception:
44-
# return False
45-
4627
def prettify_xml(element):
4728
"""Prettify and return a string representation of the XML."""
4829
rough_string = ET.tostring(element, encoding='utf-8')
@@ -51,7 +32,7 @@ def prettify_xml(element):
5132

5233
def encode_url(url):
5334
"""Encode the URL to make it XML-safe and RFC-compliant."""
54-
return quote(url, safe=":/?&=") # Leave common URL-safe characters untouched
35+
return quote(url, safe=":/?&=")
5536

5637
def add_static_urls_without_translations(root, urls):
5738
"""Add static URLs without translations to the sitemap."""
@@ -64,37 +45,14 @@ def add_static_urls_without_translations(root, urls):
6445
root.append(url_element)
6546

6647
def main():
67-
# URLs of the sitemaps
68-
book_sitemap_url = "https://book.hacktricks.xyz/sitemap.xml"
69-
cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml"
70-
71-
# Fetch both sitemaps
72-
book_sitemap_data = fetch_sitemap(book_sitemap_url)
73-
cloud_sitemap_data = fetch_sitemap(cloud_sitemap_url)
74-
75-
# Parse XML
76-
ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
77-
book_root = ET.fromstring(book_sitemap_data)
78-
cloud_root = ET.fromstring(cloud_sitemap_data)
79-
80-
all_urls = book_root.findall('ns:url', ns) + cloud_root.findall('ns:url', ns)
81-
8248
# Prepare the output sitemap
8349
ET.register_namespace('', "http://www.sitemaps.org/schemas/sitemap/0.9")
8450
ET.register_namespace('xhtml', "http://www.w3.org/1999/xhtml")
8551
new_root = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset')
8652

87-
seen_locs = set()
88-
url_entries = [] # Store info for each main URL
89-
90-
# Add static entry for https://www.hacktricks.xyz/
91-
static_url = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
92-
loc = ET.SubElement(static_url, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
93-
loc.text = encode_url("https://www.hacktricks.xyz/")
94-
new_root.append(static_url)
95-
96-
# Add static URLs for training.hacktricks.xyz without translations
97-
static_training_urls = [
53+
# Add static URLs without translations
54+
static_urls = [
55+
"https://www.hacktricks.xyz/",
9856
"https://training.hacktricks.xyz/",
9957
"https://training.hacktricks.xyz/courses/arte",
10058
"https://training.hacktricks.xyz/courses/arta",
@@ -108,76 +66,25 @@ def main():
10866
"https://training.hacktricks.xyz/terms",
10967
"https://training.hacktricks.xyz/privacy",
11068
]
111-
add_static_urls_without_translations(new_root, static_training_urls)
112-
113-
# Process main URLs
114-
for url_element in tqdm(all_urls, desc="Processing URLs"):
115-
loc = url_element.find('ns:loc', ns)
116-
if loc is None:
117-
continue
118-
loc_text = loc.text.strip()
119-
120-
if loc_text in seen_locs:
121-
continue
122-
seen_locs.add(loc_text)
123-
124-
priority = url_element.find('ns:priority', ns)
125-
lastmod = url_element.find('ns:lastmod', ns)
126-
127-
# Encode the base loc_text
128-
loc_text = encode_url(loc_text)
129-
130-
# Determine base domain and path
131-
parts = loc_text.split("/")
132-
if len(parts) > 3:
133-
base_domain_parts = parts[:3]
134-
page_path = "/".join(parts[3:])
135-
else:
136-
base_domain_parts = parts[:3]
137-
page_path = ""
138-
139-
base_domain = "/".join(base_domain_parts)
140-
141-
# Construct all translation URLs for this loc
142-
translation_urls = {}
143-
for lang_code, hreflang in languages.items():
144-
if page_path:
145-
translated_url = f"{base_domain}/{lang_code}/{page_path}"
146-
else:
147-
# If original was just the root, translated is also root + /lang
148-
translated_url = f"{base_domain}/{lang_code}"
149-
translation_urls[hreflang] = encode_url(translated_url)
150-
151-
url_entries.append((
152-
loc_text,
153-
priority.text if priority is not None else None,
154-
lastmod.text if lastmod is not None else None,
155-
translation_urls
156-
))
157-
158-
# Build the final sitemap
159-
for (loc_text, priority_val, lastmod_val, translation_urls) in url_entries:
160-
new_url = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
161-
162-
loc_el = ET.SubElement(new_url, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
163-
loc_el.text = loc_text
164-
165-
if priority_val:
166-
priority_el = ET.SubElement(new_url, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority')
167-
priority_el.text = priority_val
168-
169-
if lastmod_val:
170-
lastmod_el = ET.SubElement(new_url, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')
171-
lastmod_el.text = lastmod_val
172-
173-
# Add all translations (assume all exist for now)
174-
for hreflang, t_url in translation_urls.items():
175-
alt_link = ET.SubElement(new_url, '{http://www.sitemaps.org/schemas/sitemap/0.9}link')
176-
alt_link.set('rel', 'alternate')
177-
alt_link.set('hreflang', hreflang)
178-
alt_link.set('href', t_url)
179-
180-
new_root.append(new_url)
69+
add_static_urls_without_translations(new_root, static_urls)
70+
71+
# Add URLs with translations (Example: book.hacktricks.xyz)
72+
url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
73+
loc = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
74+
loc.text = encode_url("https://book.hacktricks.xyz/")
75+
priority = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority')
76+
priority.text = "0.84"
77+
lastmod = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')
78+
lastmod.text = "2024-12-14"
79+
80+
# Add translations
81+
for hreflang, lang_path in languages.items():
82+
alt_link = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
83+
alt_link.set('rel', 'alternate')
84+
alt_link.set('hreflang', hreflang)
85+
alt_link.set('href', encode_url(f"https://book.hacktricks.xyz/{lang_path}"))
86+
87+
new_root.append(url_element)
18188

18289
# Save prettified XML to file
18390
beautified_xml = prettify_xml(new_root)

0 commit comments

Comments
 (0)