Skip to content

Commit 94d68e8

Browse files
Add dynamic sitemap generation route and caching mechanism (#2234)
This change: - Adds a route to dynamically generate sitemap.xml on the first request. - Implements caching mechanism to serve the generated sitemap.xml from the directory on subsequent requests. - Updates the Flask app to include the generate_sitemap function. - Ensures the sitemap is generated during the initial deployment and reused afterward. This change improves the SEO functionality by ensuring the sitemap is always available and up-to-date, with minimal performance overhead. issue: #1639
1 parent ed34a45 commit 94d68e8

File tree

2 files changed

+319
-0
lines changed

2 files changed

+319
-0
lines changed

gcp/appengine/generate_sitemap.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# Copyright 2021 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Generate sitemap."""
15+
import gzip
16+
import shutil
17+
import sys
18+
import os
19+
import osv
20+
import osv.logs
21+
import datetime
22+
import argparse
23+
from google.cloud import ndb
24+
25+
from xml.etree.ElementTree import Element, SubElement, ElementTree
26+
27+
_SITEMAPS_DIRECTORY = './sitemap'
28+
_SITEMAP_INDEX_PATH = f'{_SITEMAPS_DIRECTORY}/index.xml'
29+
_SITEMAP_URL_LIMIT = 49999
30+
31+
32+
def fetch_vulnerability_ids(ecosystem: str) -> list[str]:
  """Return the db ids of public, processed bugs in the given ecosystem.

  The underlying query is ordered by descending `osv.Bug.timestamp`.
  """
  filters = (
      osv.Bug.status == osv.BugStatus.PROCESSED,
      osv.Bug.public == True,  # pylint: disable=singleton-comparison
      osv.Bug.ecosystem == ecosystem,
  )
  ordered_query = osv.Bug.query(*filters).order(-osv.Bug.timestamp)
  return [entry.db_id for entry in ordered_query]
40+
41+
42+
def osv_get_ecosystems():
  """Return ecosystem names from a distinct projection query, sorted.

  Sorting is case-insensitive; entries with an empty `ecosystem` are skipped.
  """
  rows = osv.Bug.query(projection=[osv.Bug.ecosystem], distinct=True)
  names = [row.ecosystem[0] for row in rows if row.ecosystem]
  names.sort(key=str.lower)
  return names
47+
48+
49+
def get_sitemap_filename_for_ecosystem(ecosystem: str) -> str:
  """Return the path of the sitemap XML file for the given ecosystem.

  Spaces in the ecosystem name become '_' and periods become '__'.
  """
  file_stem = ecosystem.replace(' ', '_')
  file_stem = file_stem.replace('.', '__').strip()
  return _SITEMAPS_DIRECTORY + '/' + file_stem + '.xml'
52+
53+
54+
def get_sitemap_url_for_ecosystem(ecosystem: str, base_url: str) -> str:
  """Return the public URL of the sitemap for the given ecosystem.

  Spaces in the ecosystem name become '_' and periods become '__'; the result
  is `<base_url>/sitemap/<name>.xml`.
  """
  # Neither replacement produces a character the other one rewrites, so a
  # single translate pass matches two chained str.replace calls.
  substitutions = str.maketrans({' ': '_', '.': '__'})
  slug = ecosystem.translate(substitutions).strip()
  return '/'.join((base_url, 'sitemap', slug + '.xml'))
57+
58+
59+
def generate_sitemap_for_ecosystem(ecosystem: str, base_url: str) -> None:
  """Generate a sitemap for the given ecosystem.

  Writes an XML `urlset` document to the path returned by
  get_sitemap_filename_for_ecosystem(), creating the sitemap directory first
  if it does not exist.

  Args:
    ecosystem: Name of the ecosystem whose vulnerabilities are listed.
    base_url: Prefix of every `<loc>` entry, e.g. `https://host`.
  """
  os.makedirs(_SITEMAPS_DIRECTORY, exist_ok=True)

  vulnerability_ids = fetch_vulnerability_ids(ecosystem)
  filename = get_sitemap_filename_for_ecosystem(ecosystem)
  urlset = Element(
      'urlset', xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

  # <lastmod> has to be a W3C datetime, which requires a timezone designator
  # whenever a time is given, so use an aware UTC timestamp. It is computed
  # once so that every entry of this file carries the same value.
  generated_at = datetime.datetime.now(
      datetime.timezone.utc).isoformat(timespec='seconds')

  # TODO: For large ecosystems with over 50,000 vulnerabilities, generate
  # multiple sitemaps.
  for vuln in vulnerability_ids[:_SITEMAP_URL_LIMIT]:
    url = SubElement(urlset, 'url')
    loc = SubElement(url, 'loc')
    loc.text = f"{base_url}/vulnerability/{vuln}"
    lastmod = SubElement(url, 'lastmod')
    lastmod.text = generated_at

  tree = ElementTree(urlset)
  tree.write(filename, encoding='utf-8', xml_declaration=True)
79+
80+
81+
def compress_file(file_path: str) -> str:
  """Gzip the file, delete the original and return the archive's path.

  The archive is written next to the input, with the input's final extension
  replaced by `.gz`.
  """
  stem = os.path.splitext(file_path)[0]
  compressed_file_path = stem + '.gz'
  with open(file_path, 'rb') as source, \
      gzip.open(compressed_file_path, 'wb') as target:
    shutil.copyfileobj(source, target)
  # Only the compressed copy is kept.
  os.remove(file_path)
  return compressed_file_path
91+
92+
93+
def generate_sitemap_index(ecosystems: set[str], base_url: str) -> None:
  """Generate a sitemap index.

  Writes an XML `sitemapindex` document to `_SITEMAP_INDEX_PATH` with one
  `<sitemap>` entry per ecosystem, creating the sitemap directory first if it
  does not exist.

  Args:
    ecosystems: Names of the ecosystems that have their own sitemap.
    base_url: Prefix of every `<loc>` entry, e.g. `https://host`.
  """
  os.makedirs(_SITEMAPS_DIRECTORY, exist_ok=True)

  sitemapindex = Element(
      'sitemapindex', xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")

  # <lastmod> has to be a W3C datetime, which requires a timezone designator
  # whenever a time is given, so use an aware UTC timestamp shared by all
  # entries.
  generated_at = datetime.datetime.now(
      datetime.timezone.utc).isoformat(timespec='seconds')

  # Set iteration order is arbitrary; sorting keeps the generated index
  # identical from one run to the next for the same input.
  for ecosystem in sorted(ecosystems):
    sitemap = SubElement(sitemapindex, "sitemap")
    loc = SubElement(sitemap, 'loc')
    loc.text = get_sitemap_url_for_ecosystem(ecosystem, base_url)
    lastmod = SubElement(sitemap, 'lastmod')
    lastmod.text = generated_at

  tree = ElementTree(sitemapindex)
  tree.write(_SITEMAP_INDEX_PATH, encoding='utf-8', xml_declaration=True)
109+
110+
111+
def generate_sitemaps(base_url: str) -> None:
  """Write a compressed sitemap per base ecosystem and a compressed index."""
  # Only names without ':' are kept. Going over the other ecosystems as well
  # would put duplicated vulnerabilities in the sitemap.
  base_ecosystems = set()
  for name in osv_get_ecosystems():
    if ':' not in name:
      base_ecosystems.add(name)

  for name in base_ecosystems:
    generate_sitemap_for_ecosystem(name, base_url)
    sitemap_path = get_sitemap_filename_for_ecosystem(name)
    compress_file(sitemap_path)

  generate_sitemap_index(base_ecosystems, base_url)
  compress_file(_SITEMAP_INDEX_PATH)
125+
126+
127+
def main() -> int:
  """Parse the command line, generate the sitemaps and return exit code 0."""
  arg_parser = argparse.ArgumentParser(description='Generate sitemaps.')
  arg_parser.add_argument(
      '--base_url',
      help='The base URL for the sitemap entries.',
      required=True)
  options = arg_parser.parse_args()
  generate_sitemaps(options.base_url)
  return 0
134+
135+
136+
if __name__ == '__main__':
  # Script entry point: create the NDB client, set up GCP logging under the
  # 'generate_sitemap' name, then run main() inside the client's context.
  _ndb_client = ndb.Client()
  osv.logs.setup_gcp_logging('generate_sitemap')
  with _ndb_client.context():
    # main()'s return value becomes the process exit status.
    sys.exit(main())
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
# Copyright 2021 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Sitemap generator tests."""
15+
16+
import unittest
17+
import tempfile
18+
import os
19+
import gzip
20+
from unittest.mock import patch, MagicMock
21+
import generate_sitemap
22+
import osv
23+
24+
25+
class TestSitemapGeneration(unittest.TestCase):
  """Tests to verify the functionality of the sitemap generator script."""

  @staticmethod
  def _remove_if_exists(path):
    """Delete `path`, tolerating the case where it is already gone."""
    try:
      os.remove(path)
    except FileNotFoundError:
      pass

  def temp_file(self):
    """Create a temporary file with known contents and return its path.

    The file is registered for cleanup so it does not outlive the test when
    an assertion fails before the code under test has removed it.
    """
    self.test_file = tempfile.NamedTemporaryFile(delete=False)
    self.addCleanup(self._remove_if_exists, self.test_file.name)
    self.test_file.write(b'This is a test file.')
    self.test_file.close()
    return self.test_file.name

  def test_compress_file(self):
    """Test it compresses the file and removes the original file."""
    input_filename = self.temp_file()

    # Call the compress_file function
    compressed_file_path = generate_sitemap.compress_file(input_filename)
    # Clean up the compressed file even if one of the assertions below fails.
    self.addCleanup(self._remove_if_exists, compressed_file_path)

    # Verify that the original file is removed
    self.assertFalse(os.path.exists(input_filename))

    # Verify that the compressed file is created
    self.assertTrue(os.path.exists(compressed_file_path))

    # Verify the contents of the compressed file
    with gzip.open(compressed_file_path, 'rb') as f:
      content = f.read()
    self.assertEqual(content, b'This is a test file.')

  @patch.object(osv.Bug, 'query')
  def test_fetch_vulnerability_ids(self, mock_query):
    """Test it returns the vulnerability ids for ecosystem."""
    # Mock the returned query
    mock_query.return_value.order.return_value = [
        MagicMock(db_id='vuln1'),
        MagicMock(db_id='vuln2')
    ]

    result = generate_sitemap.fetch_vulnerability_ids('Go')
    self.assertEqual(result, ['vuln1', 'vuln2'])

  @patch.object(osv.Bug, 'query')
  def test_osv_get_ecosystems(self, mock_query):
    """Test it returns the ecosystems sorted case-insensitively."""
    # Mock the returned query
    mock_query.return_value = [
        MagicMock(ecosystem=['UVI']),
        MagicMock(ecosystem=['Go'])
    ]

    result = generate_sitemap.osv_get_ecosystems()
    self.assertEqual(result, ['Go', 'UVI'])

  @patch('generate_sitemap.fetch_vulnerability_ids')
  @patch('generate_sitemap.ElementTree')
  @patch('generate_sitemap.os.makedirs')
  def test_generate_sitemap_for_ecosystem(self, mock_makedirs,
                                          mock_element_tree, mock_fetch_vulns):
    """Check it generates the sitemap for ecosystem."""
    mock_fetch_vulns.return_value = ['vuln1', 'vuln2']
    mock_tree = MagicMock()
    mock_element_tree.return_value = mock_tree

    generate_sitemap.generate_sitemap_for_ecosystem('Go', 'http://example.com')

    mock_makedirs.assert_called_once_with('./sitemap', exist_ok=True)
    mock_tree.write.assert_called_once_with(
        './sitemap/Go.xml', encoding='utf-8', xml_declaration=True)

  @patch('generate_sitemap.fetch_vulnerability_ids')
  @patch('generate_sitemap.ElementTree')
  @patch('generate_sitemap.os.makedirs')
  def test_generate_sitemap_for_ecosystem_with_space(self, mock_makedirs,
                                                     mock_element_tree,
                                                     mock_fetch_vulns):
    """
    Check it creates the sitemap correctly where there is a space in the
    ecosystem name.
    """
    mock_fetch_vulns.return_value = ['vuln1', 'vuln2']
    mock_tree = MagicMock()
    mock_element_tree.return_value = mock_tree

    generate_sitemap.generate_sitemap_for_ecosystem('Rocky Linux',
                                                    'http://example.com')

    mock_makedirs.assert_called_once_with('./sitemap', exist_ok=True)
    mock_tree.write.assert_called_once_with(
        './sitemap/Rocky_Linux.xml', encoding='utf-8', xml_declaration=True)

  @patch('generate_sitemap.fetch_vulnerability_ids')
  @patch('generate_sitemap.ElementTree')
  @patch('generate_sitemap.os.makedirs')
  def test_generate_sitemap_for_ecosystem_with_period(self, mock_makedirs,
                                                      mock_element_tree,
                                                      mock_fetch_vulns):
    """
    Check it creates the sitemap correctly where there is a period in the
    ecosystem name.
    """
    mock_fetch_vulns.return_value = ['vuln1', 'vuln2']
    mock_tree = MagicMock()
    mock_element_tree.return_value = mock_tree

    generate_sitemap.generate_sitemap_for_ecosystem('crates.io',
                                                    'http://example.com')

    mock_makedirs.assert_called_once_with('./sitemap', exist_ok=True)
    mock_tree.write.assert_called_once_with(
        './sitemap/crates__io.xml', encoding='utf-8', xml_declaration=True)

  @patch('generate_sitemap.ElementTree')
  @patch('generate_sitemap.os.makedirs')
  def test_generate_sitemap_index(self, mock_makedirs, mock_element_tree):
    """Check it generates the sitemap index as expected."""
    mock_tree = MagicMock()
    mock_element_tree.return_value = mock_tree

    generate_sitemap.generate_sitemap_index({'Go', 'UVI'}, 'http://example.com')

    mock_makedirs.assert_called_once_with('./sitemap', exist_ok=True)
    mock_tree.write.assert_called_once_with(
        './sitemap/index.xml', encoding='utf-8', xml_declaration=True)

  @patch('generate_sitemap.generate_sitemap_for_ecosystem')
  @patch('generate_sitemap.generate_sitemap_index')
  @patch('generate_sitemap.osv_get_ecosystems')
  @patch('generate_sitemap.compress_file')
  def test_generate_sitemap(self, mock_compress_file, mock_get_ecosystems,
                            mock_generate_index, mock_generate_sitemap):
    """
    Check the outer wrapper generates the ecosystems' sitemaps as well as
    sitemap index.
    """
    # 'UVI:Library' contains ':' and must therefore be skipped.
    mock_get_ecosystems.return_value = ['Go', 'UVI:Library', 'Android']

    generate_sitemap.generate_sitemaps('http://example.com')

    self.assertEqual(mock_generate_sitemap.call_count, 2)
    mock_generate_sitemap.assert_any_call('Go', 'http://example.com')
    mock_generate_sitemap.assert_any_call('Android', 'http://example.com')

    # Two ecosystem sitemaps plus the index.
    self.assertEqual(mock_compress_file.call_count, 3)
    mock_compress_file.assert_any_call('./sitemap/Go.xml')
    mock_compress_file.assert_any_call('./sitemap/Android.xml')
    mock_compress_file.assert_any_call('./sitemap/index.xml')

    mock_generate_index.assert_called_once_with({'Android', 'Go'},
                                                'http://example.com')
176+
177+
178+
if __name__ == '__main__':
  # Run the test cases in this module when the file is executed directly.
  unittest.main()

0 commit comments

Comments
 (0)