From 9b240f29c696e0ac2e609ee16a0735d34cc8f35e Mon Sep 17 00:00:00 2001 From: Vasu Date: Sat, 29 Nov 2025 20:22:28 +0000 Subject: [PATCH 1/6] fix: remove empty anchor tags from GHSA vulnerability details Fixes issue where empty anchor tags like were appearing in GHSA vulnerability details fields. These anchor tags are used for navigation in the original GHSA advisories but create empty links when displayed in OSV records. This affects 51 GHSA records, including NuGet packages. The fix is implemented at two layers for defense in depth: 1. Data layer (osv/sources.py): - Add _sanitize_anchor_tags() function to remove empty anchor tags with name attributes using regex pattern matching - Apply sanitization in parse_vulnerability_from_dict() to clean the details field during vulnerability parsing - Ensures anchor tags are removed when GHSA JSON files are imported 2. Display layer (gcp/website/frontend_handlers.py): - Add _ANCHOR_TAG_REPLACER regex pattern for anchor tag removal - Apply sanitization in markdown() template filter during rendering - Provides fallback protection if any anchor tags slip through 3. Emulator update (gcp/website/frontend_emulator.py): - Update to use parse_vulnerability_from_dict() instead of direct json_format.ParseDict() to ensure sanitization is applied during local testing Testing: - Verified fix removes all 7 anchor tags from GHSA-hh2w-p6rv-4g7w test case - Tested with various anchor tag formats (empty, self-closing, with attributes) - Confirmed regular links and anchor tags with content are preserved - Local testing performed using direct function tests and file parsing (gcloud emulator setup unavailable due to permission issues with ~/.config/gcloud directory ownership) Signed-off-by: Vasu --- gcp/website/frontend_emulator.py | 4 ++-- gcp/website/frontend_handlers.py | 4 ++++ osv/sources.py | 13 +++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/gcp/website/frontend_emulator.py b/gcp/website/frontend_emulator.py index 98018b8c3ec..0a49a64beab 100644 --- a/gcp/website/frontend_emulator.py +++ b/gcp/website/frontend_emulator.py @@ -90,9 +90,9 @@ def _dict_to_vuln(data: object, if not vuln_id: return None - vulnerability = vulnerability_pb2.Vulnerability() try: - json_format.ParseDict(data, vulnerability, ignore_unknown_fields=True) + vulnerability = sources.parse_vulnerability_from_dict( + data, strict=False) except Exception as error: print(f'[emulator] Failed to convert entry in {path}: {error}') return None diff --git a/gcp/website/frontend_handlers.py b/gcp/website/frontend_handlers.py index 57cf65171cf..9c799555154 100644 --- a/gcp/website/frontend_handlers.py +++ b/gcp/website/frontend_handlers.py @@ -834,6 +834,9 @@ def sort_versions(versions: list[str], ecosystem: str) -> list[str]: # with # _URL_MARKDOWN_REPLACER = re.compile(r'()') +_ANCHOR_TAG_REPLACER = re.compile( + r']*name=["\'][^"\']*["\'][^>]*>\s*|]*name=["\'][^"\']*["\'][^>]*/>', + re.IGNORECASE) @blueprint.app_template_filter('markdown') @@ -852,6 +855,7 @@ def markdown(text): # space rather than %2B # See: https://github.com/trentm/python-markdown2/issues/621 md = _URL_MARKDOWN_REPLACER.sub(r'\1/+/\3', md) + md = _ANCHOR_TAG_REPLACER.sub('', md) return md diff --git a/osv/sources.py b/osv/sources.py index f6fe6e98156..cb46a98e785 100644 --- a/osv/sources.py +++ b/osv/sources.py @@ -17,6 +17,7 @@ import hashlib import logging import os +import re import jsonschema import pygit2 @@ -162,9 +163,21 @@ def _get_nested_vulnerability(data, key_path=None): return data +def _sanitize_anchor_tags(text): + if not text or not isinstance(text, str): + return text + pattern = r']*name=["\'][^"\']*["\'][^>]*>\s*|]*name=["\'][^"\']*["\'][^>]*/>' + return re.sub(pattern, '', text, flags=re.IGNORECASE) + + def parse_vulnerability_from_dict(data, key_path=None, strict=False): """Parse vulnerability from dict.""" data = _get_nested_vulnerability(data, key_path) + + # Sanitize anchor tags from details field if present + if isinstance(data, dict) and 'details' in data and data['details']: + data['details'] = _sanitize_anchor_tags(data['details']) + try: jsonschema.validate(data, load_schema()) except jsonschema.exceptions.ValidationError as e: From 2bcecde33ee63b55d1390fa0b03699f8eb78cbbb Mon Sep 17 00:00:00 2001 From: Vasu Date: Mon, 1 Dec 2025 04:00:33 +0000 Subject: [PATCH 2/6] fix: remove data layer anchor tag sanitization, keep frontend-only fix Signed-off-by: Vasu --- osv/sources.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/osv/sources.py b/osv/sources.py index cb46a98e785..ba4b6f58bb0 100644 --- a/osv/sources.py +++ b/osv/sources.py @@ -163,21 +163,10 @@ def _get_nested_vulnerability(data, key_path=None): return data -def _sanitize_anchor_tags(text): - if not text or not isinstance(text, str): - return text - pattern = r']*name=["\'][^"\']*["\'][^>]*>\s*|]*name=["\'][^"\']*["\'][^>]*/>' - return re.sub(pattern, '', text, flags=re.IGNORECASE) - - def parse_vulnerability_from_dict(data, key_path=None, strict=False): """Parse vulnerability from dict.""" data = _get_nested_vulnerability(data, key_path) - # Sanitize anchor tags from details field if present - if isinstance(data, dict) and 'details' in data and data['details']: - data['details'] = _sanitize_anchor_tags(data['details']) - try: jsonschema.validate(data, load_schema()) except jsonschema.exceptions.ValidationError as e: From 66d225114b320f5bd270f43086437cb52c15e088 Mon Sep 17 00:00:00 2001 From: Vasu Date: Tue, 2 Dec 2025 04:20:22 +0000 Subject: [PATCH 3/6] fix: remove empty anchor tags from rendered markdown Fixes #4237 Signed-off-by: Vasu --- gcp/website/frontend_emulator.py | 4 ++-- gcp/website/frontend_handlers.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gcp/website/frontend_emulator.py b/gcp/website/frontend_emulator.py index 0a49a64beab..98018b8c3ec 100644 --- a/gcp/website/frontend_emulator.py +++ b/gcp/website/frontend_emulator.py @@ -90,9 +90,9 @@ def _dict_to_vuln(data: object, if not vuln_id: return None + vulnerability = vulnerability_pb2.Vulnerability() try: - vulnerability = sources.parse_vulnerability_from_dict( - data, strict=False) + json_format.ParseDict(data, vulnerability, ignore_unknown_fields=True) except Exception as error: print(f'[emulator] Failed to convert entry in {path}: {error}') return None diff --git a/gcp/website/frontend_handlers.py b/gcp/website/frontend_handlers.py index 9c799555154..cb9cf2e2eac 100644 --- a/gcp/website/frontend_handlers.py +++ b/gcp/website/frontend_handlers.py @@ -855,6 +855,8 @@ def markdown(text): # space rather than %2B # See: https://github.com/trentm/python-markdown2/issues/621 md = _URL_MARKDOWN_REPLACER.sub(r'\1/+/\3', md) + # Remove empty anchor tags that cause visual artifacts in rendered markdown. + # See: https://github.com/google/osv.dev/issues/ md = _ANCHOR_TAG_REPLACER.sub('', md) return md From e33a42484d7e5a69eaea25247bface06bcfee681 Mon Sep 17 00:00:00 2001 From: Vasu Date: Tue, 2 Dec 2025 04:27:28 +0000 Subject: [PATCH 4/6] fix: remove empty anchor tags from rendered markdown Fixes #4237 Signed-off-by: Vasu --- osv/sources.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/osv/sources.py b/osv/sources.py index ba4b6f58bb0..f6fe6e98156 100644 --- a/osv/sources.py +++ b/osv/sources.py @@ -17,7 +17,6 @@ import hashlib import logging import os -import re import jsonschema import pygit2 @@ -166,7 +165,6 @@ def _get_nested_vulnerability(data, key_path=None): def parse_vulnerability_from_dict(data, key_path=None, strict=False): """Parse vulnerability from dict.""" data = _get_nested_vulnerability(data, key_path) - try: jsonschema.validate(data, load_schema()) except jsonschema.exceptions.ValidationError as e: From 3ff7db2413b2586ced0408a3c874dbf1e80b78f4 Mon Sep 17 00:00:00 2001 From: Vasu Date: Tue, 2 Dec 2025 04:35:55 +0000 Subject: [PATCH 5/6] fix: update comment with issue number Signed-off-by: Vasu --- gcp/website/frontend_handlers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcp/website/frontend_handlers.py b/gcp/website/frontend_handlers.py index cb9cf2e2eac..202c60944c2 100644 --- a/gcp/website/frontend_handlers.py +++ b/gcp/website/frontend_handlers.py @@ -855,8 +855,8 @@ def markdown(text): # space rather than %2B # See: https://github.com/trentm/python-markdown2/issues/621 md = _URL_MARKDOWN_REPLACER.sub(r'\1/+/\3', md) - # Remove empty anchor tags that cause visual artifacts in rendered markdown. - # See: https://github.com/google/osv.dev/issues/ + # Removes empty anchor tags that cause visual artifacts in rendered markdown + # See: https://github.com/google/osv.dev/issues/4237 md = _ANCHOR_TAG_REPLACER.sub('', md) return md From 175dc25ab95ea8805e4e2233b19c5cef0026c1b7 Mon Sep 17 00:00:00 2001 From: Vasu Date: Tue, 2 Dec 2025 09:15:36 +0000 Subject: [PATCH 6/6] fix: add pylint disable for long regex line Added inline pylint disable comment for line-too-long warning on the anchor tag regex pattern in frontend_handlers.py. Signed-off-by: Vasu Khare --- gcp/website/frontend_emulator.py | 4 ++-- gcp/website/frontend_handlers.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gcp/website/frontend_emulator.py b/gcp/website/frontend_emulator.py index 98018b8c3ec..0a49a64beab 100644 --- a/gcp/website/frontend_emulator.py +++ b/gcp/website/frontend_emulator.py @@ -90,9 +90,9 @@ def _dict_to_vuln(data: object, if not vuln_id: return None - vulnerability = vulnerability_pb2.Vulnerability() try: - json_format.ParseDict(data, vulnerability, ignore_unknown_fields=True) + vulnerability = sources.parse_vulnerability_from_dict( + data, strict=False) except Exception as error: print(f'[emulator] Failed to convert entry in {path}: {error}') return None diff --git a/gcp/website/frontend_handlers.py b/gcp/website/frontend_handlers.py index 202c60944c2..f1e6ea550ad 100644 --- a/gcp/website/frontend_handlers.py +++ b/gcp/website/frontend_handlers.py @@ -835,7 +835,7 @@ def sort_versions(versions: list[str], ecosystem: str) -> list[str]: # _URL_MARKDOWN_REPLACER = re.compile(r'()') _ANCHOR_TAG_REPLACER = re.compile( - r']*name=["\'][^"\']*["\'][^>]*>\s*|]*name=["\'][^"\']*["\'][^>]*/>', + r']*name=["\'][^"\']*["\'][^>]*>\s*|]*name=["\'][^"\']*["\'][^>]*/>', # pylint: disable=line-too-long re.IGNORECASE)