Skip to content

Commit dd47103

Browse files
authored
Update generate_sitemap.py
1 parent a7bd000 commit dd47103

File tree

1 file changed

+32
-31
lines changed

1 file changed

+32
-31
lines changed

generate_sitemap.py

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import requests
22
import xml.etree.ElementTree as ET
33
from xml.dom import minidom
4-
from concurrent.futures import ThreadPoolExecutor, as_completed
54
from tqdm import tqdm
5+
from urllib.parse import quote
66

77
# Correct hreflang values for languages
88
languages = {
@@ -35,20 +35,24 @@ def fetch_sitemap(url):
3535
response.raise_for_status()
3636
return response.text
3737

38-
def check_url_exists(url):
39-
"""Check if a URL exists using a HEAD request."""
40-
try:
41-
r = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=30)
42-
return r.status_code == 200
43-
except Exception:
44-
return False
38+
# def check_url_exists(url):
39+
# """Check if a URL exists using a HEAD request."""
40+
# try:
41+
# r = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=30)
42+
# return r.status_code == 200
43+
# except Exception:
44+
# return False
4545

4646
def prettify_xml(element):
    """Return an indented, human-readable XML string for *element*.

    The element tree is first serialised to UTF-8 bytes, then re-parsed
    with minidom, whose pretty-printer adds newlines and indentation.
    """
    serialized = ET.tostring(element, encoding='utf-8')
    return minidom.parseString(serialized).toprettyxml(indent=" ")
5151

52+
def encode_url(url):
    """Percent-encode *url* so it is RFC-compliant, without double-encoding.

    Characters outside the URL-safe set (spaces, non-ASCII, ...) are
    percent-encoded as UTF-8. The structural delimiters ``:/?&=`` are left
    untouched, and so is ``%``: this makes the function idempotent, so a URL
    that is already encoded (or one built from already-encoded parts) can be
    passed through again without ``%20`` turning into ``%2520``.

    Note: because ``%`` is preserved, a literal percent sign that is not part
    of an escape sequence is passed through unchanged.
    """
    # '%' must be in the safe set, otherwise re-encoding an encoded URL
    # corrupts every existing escape sequence.
    return quote(url, safe=":/?&=%")
55+
5256
def main():
5357
# URLs of the sitemaps
5458
book_sitemap_url = "https://book.hacktricks.xyz/sitemap.xml"
@@ -76,7 +80,7 @@ def main():
7680
# Add static entry for https://www.hacktricks.xyz/
7781
static_url = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
7882
loc = ET.SubElement(static_url, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
79-
loc.text = "https://www.hacktricks.xyz/"
83+
loc.text = encode_url("https://www.hacktricks.xyz/")
8084
new_root.append(static_url)
8185

8286
# Process main URLs
@@ -93,6 +97,9 @@ def main():
9397
priority = url_element.find('ns:priority', ns)
9498
lastmod = url_element.find('ns:lastmod', ns)
9599

100+
# Encode the base loc_text
101+
loc_text = encode_url(loc_text)
102+
96103
# Determine base domain and path
97104
parts = loc_text.split("/")
98105
if len(parts) > 3:
@@ -112,7 +119,7 @@ def main():
112119
else:
113120
# If original was just the root, translated is also root + /lang
114121
translated_url = f"{base_domain}/{lang_code}"
115-
translation_urls[hreflang] = translated_url
122+
translation_urls[hreflang] = encode_url(translated_url)
116123

117124
url_entries.append((
118125
loc_text,
@@ -121,18 +128,16 @@ def main():
121128
translation_urls
122129
))
123130

124-
# Parallel check all translation URLs with progress bar
125-
all_translation_checks = {}
126-
with ThreadPoolExecutor(max_workers=30) as executor:
127-
# Submit all tasks to executor
128-
future_to_url = {executor.submit(check_url_exists, t_url): (hreflang, t_url)
129-
for _, _, _, t_urls in url_entries for hreflang, t_url in t_urls.items()}
130-
131-
# Use tqdm to show progress
132-
for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc="Checking Translation URLs"):
133-
hreflang, t_url = future_to_url[future]
134-
result = future.result()
135-
all_translation_checks[t_url] = result
131+
# Commented-out URL checks, assuming all translations exist for now
132+
# all_translation_checks = {}
133+
# with ThreadPoolExecutor(max_workers=10) as executor:
134+
# future_to_url = {executor.submit(check_url_exists, t_url): (hreflang, t_url)
135+
# for _, _, _, t_urls in url_entries for hreflang, t_url in t_urls.items()}
136+
#
137+
# for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc="Checking Translation URLs"):
138+
# hreflang, t_url = future_to_url[future]
139+
# result = future.result()
140+
# all_translation_checks[t_url] = result
136141

137142
# Build the final sitemap
138143
for (loc_text, priority_val, lastmod_val, translation_urls) in url_entries:
@@ -149,16 +154,12 @@ def main():
149154
lastmod_el = ET.SubElement(new_url, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')
150155
lastmod_el.text = lastmod_val
151156

152-
# Add existing translations (excluding English, which is default)
157+
# Add all translations (assume all exist for now)
153158
for hreflang, t_url in translation_urls.items():
154-
if all_translation_checks.get(t_url, False):
155-
alt_link = ET.SubElement(new_url, '{http://www.w3.org/1999/xhtml}link')
156-
alt_link.set('rel', 'alternate')
157-
alt_link.set('hreflang', hreflang)
158-
alt_link.set('href', t_url)
159-
else:
160-
# Print in red if not found
161-
print("\033[31m" + f"{t_url} NOT FOUND" + "\033[0m")
159+
alt_link = ET.SubElement(new_url, '{http://www.w3.org/1999/xhtml}link')
160+
alt_link.set('rel', 'alternate')
161+
alt_link.set('hreflang', hreflang)
162+
alt_link.set('href', t_url)
162163

163164
new_root.append(new_url)
164165

0 commit comments

Comments
 (0)