-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsitemapindex-parser.py
More file actions
36 lines (27 loc) · 1.23 KB
/
sitemapindex-parser.py
File metadata and controls
36 lines (27 loc) · 1.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import xml.etree.ElementTree as ET
import json
import requests
# Read main.xml, which contains the sitemap index from
# https://docs.aws.amazon.com/sitemap_index.xml with the SDK sitemaps removed.
# The encoding is given explicitly because the sitemap protocol requires
# UTF-8; without it, open() falls back to the platform's locale encoding.
with open("main.xml", "r", encoding="utf-8") as xml_file:
    sitemap_index_data = xml_file.read()
def extract_urls_from_xml(xml_content):
    """Return the URL held by every sitemap ``<loc>`` element in *xml_content*.

    Works for both sitemap index documents and regular sitemaps, since both
    store their URLs in ``<loc>`` elements of the sitemaps.org 0.9 namespace.

    Args:
        xml_content: The XML document as ``str`` or ``bytes``.

    Returns:
        A list of URL strings in document order. Surrounding whitespace is
        stripped and ``<loc>`` elements with no text are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_content* is not well-formed.
    """
    root = ET.fromstring(xml_content)
    loc_elements = root.findall(
        ".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    )

    urls = []
    for element in loc_elements:
        # element.text is None for an empty <loc/>, and pretty-printed
        # sitemaps put newlines/indentation around the URL.
        url = (element.text or "").strip()
        if url:
            urls.append(url)
    return urls
# Sitemap URLs listed in the sitemap index that was read from main.xml.
sitemap_index_urls = extract_urls_from_xml(sitemap_index_data)

# Page URLs collected from every sitemap that could be fetched and parsed.
all_urls = []
for url in sitemap_index_urls:
    try:
        # Without a timeout a single unresponsive host blocks the run forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Check for HTTP errors
    except requests.exceptions.RequestException as e:
        print(f"Error fetching XML from {url}: {e}")
        continue

    try:
        # Extract URLs from each sitemap in sitemap index. The raw bytes are
        # passed so the XML parser chooses the encoding itself; response.text
        # can be mis-decoded when the server omits a charset for text/xml.
        extracted_urls = extract_urls_from_xml(response.content)
    except ET.ParseError as e:
        # One malformed sitemap should not discard everything gathered so far.
        print(f"Error parsing XML from {url}: {e}")
        continue

    # Add to list of all URLs
    all_urls.extend(extracted_urls)
    # Print the extracted URLs
    for extracted_url in extracted_urls:
        print(extracted_url)
# Save every collected URL to html_urls.json as a JSON array indented by two
# spaces.
with open("html_urls.json", "w") as output_file:
    output_file.write(json.dumps(all_urls, indent=2))