fix(cve5): group by introduced (#4390)

jess-lowe · web-flow · commit 0fedba3a0ef3 · 2025-11-28T11:15:48.000+11:00
The way we were interpreting version ranges that share the same introduced value was wrong, as the schema treats ranges independently like an AND operator rather than an OR operator, meaning that if there are areas of no overlap between the ranges (like branches), all ranges are affected rather than just the ranges specified. This PR groups ranges that have the same Introduced value into one set of events like: ``` { introduced: x, fixed: y, fixed: z } ``` Shoutout to @michaelkedar for spotting this.
diff --git a/vulnfeeds/cvelist2osv/__snapshots__/converter_test.snap b/vulnfeeds/cvelist2osv/__snapshots__/converter_test.snap
@@ -161,87 +161,24 @@
             },
             {
               "fixed": "a3e77da9f843e4ab93917d30c314f0283e28c124"
-            }
-          ],
-          "repo": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git",
-          "type": "GIT"
-        },
-        {
-          "events": [
-            {
-              "introduced": "1da177e4c3f41524e886b7f1b8a0c1fc7321cac2"
             },
             {
               "fixed": "213ba5bd81b7e97ac6e6190b8f3bc6ba76123625"
-            }
-          ],
-          "repo": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git",
-          "type": "GIT"
-        },
-        {
-          "events": [
-            {
-              "introduced": "1da177e4c3f41524e886b7f1b8a0c1fc7321cac2"
             },
             {
               "fixed": "40a35d14f3c0dc72b689061ec72fc9b193f37d1f"
-            }
-          ],
-          "repo": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git",
-          "type": "GIT"
-        },
-        {
-          "events": [
-            {
-              "introduced": "1da177e4c3f41524e886b7f1b8a0c1fc7321cac2"
             },
             {
               "fixed": "27a39d006f85e869be68c1d5d2ce05e5d6445bf5"
-            }
-          ],
-          "repo": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git",
-          "type": "GIT"
-        },
-        {
-          "events": [
-            {
-              "introduced": "1da177e4c3f41524e886b7f1b8a0c1fc7321cac2"
             },
             {
               "fixed": "92527100be38ede924768f4277450dfe8a40e16b"
-            }
-          ],
-          "repo": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git",
-          "type": "GIT"
-        },
-        {
-          "events": [
-            {
-              "introduced": "1da177e4c3f41524e886b7f1b8a0c1fc7321cac2"
             },
             {
               "fixed": "6578717ebca91678131d2b1f4ba4258e60536e9f"
-            }
-          ],
-          "repo": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git",
-          "type": "GIT"
-        },
-        {
-          "events": [
-            {
-              "introduced": "1da177e4c3f41524e886b7f1b8a0c1fc7321cac2"
             },
             {
               "fixed": "7fa9706722882f634090bfc9af642bf9ed719e27"
-            }
-          ],
-          "repo": "https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git",
-          "type": "GIT"
-        },
-        {
-          "events": [
-            {
-              "introduced": "1da177e4c3f41524e886b7f1b8a0c1fc7321cac2"
             },
             {
               "fixed": "80e648042e512d5a767da251d44132553fe04ae0"
diff --git a/vulnfeeds/cvelist2osv/converter.go b/vulnfeeds/cvelist2osv/converter.go
@@ -245,6 +245,8 @@ func ConvertAndExportCVEToOSV(cve cves.CVE5, vulnSink io.Writer, metricsSink io.
 	versionExtractor := GetVersionExtractor(cve.Metadata.AssignerShortName)
 	versionExtractor.ExtractVersions(cve, v, &metrics, metrics.Repos)
 
+	groupAffectedRanges(v.Affected)
+
 	determineOutcome(&metrics)
 
 	err := v.ToJSON(vulnSink)
diff --git a/vulnfeeds/cvelist2osv/grouping.go b/vulnfeeds/cvelist2osv/grouping.go
@@ -0,0 +1,201 @@
+package cvelist2osv
+
+import (
+	"fmt"
+	"log/slog"
+	"slices"
+
+	"github.com/google/osv/vulnfeeds/utility/logger"
+	"github.com/ossf/osv-schema/bindings/go/osvschema"
+	"google.golang.org/protobuf/encoding/protojson"
+	"google.golang.org/protobuf/types/known/structpb"
+)
+
+// groupAffectedRanges merges, within each Affected entry, the ranges that share the same introduced value, type, and repo.
+// This is because having multiple ranges with the same introduced value would act like an
+// OR condition, rather than AND; the merged range carries every event of its group (cleaned up by cleanEvents).
+// This function modifies in-place: aff.Ranges is replaced, with groups kept in order of first appearance.
+func groupAffectedRanges(affected []*osvschema.Affected) {
+	for _, aff := range affected {
+		if len(aff.GetRanges()) <= 1 { // a single range (or none) has nothing to be merged with
+			continue
+		}
+
+		// Key for grouping: Type + Repo + Introduced Value
+		type groupKey struct {
+			RangeType  osvschema.Range_Type
+			Repo       string
+			Introduced string
+		}
+
+		groups := make(map[groupKey]*osvschema.Range)
+		var order []groupKey // Map iteration order is random, so remember keys in order of first appearance
+
+		for _, r := range aff.GetRanges() {
+			// Find the introduced event (if several are present, the last non-empty one is used as the key)
+			var introduced string
+			var introducedCount int
+			for _, e := range r.GetEvents() {
+				if e.GetIntroduced() != "" {
+					introduced = e.GetIntroduced()
+					introducedCount++
+				}
+			}
+
+			if introducedCount > 1 { // NOTE(review): presumably only logged, the range is still grouped below; confirm logger.Error does not abort
+				logger.Error("Multiple 'introduced' events found in a single range", slog.Any("range", r))
+			}
+
+			// If no introduced event is found, we use an empty string as the introduced value, so such ranges of one type/repo share a group.
+			key := groupKey{
+				RangeType:  r.GetType(),
+				Repo:       r.GetRepo(),
+				Introduced: introduced,
+			}
+
+			if _, exists := groups[key]; !exists {
+				// Start a new group from the first range seen for this key: a fresh Range with an empty event list.
+				// NOTE(review): this is not a deep copy; DatabaseSpecific below is the same pointer the original range holds,
+				// so merging "versions" from later ranges mutates that shared struct. The first range's "versions" are kept.
+				groups[key] = &osvschema.Range{
+					Type:             r.GetType(),
+					Repo:             r.GetRepo(),
+					Events:           []*osvschema.Event{},
+					DatabaseSpecific: r.GetDatabaseSpecific(), // Start with this one's DS (shared, not copied)
+				}
+				order = append(order, key)
+			} else {
+				// Merge this range's DatabaseSpecific "versions" into the existing group
+				mergeDatabaseSpecificVersions(groups[key], r.GetDatabaseSpecific())
+			}
+
+			// Add all events to the group. Deduplication happens later in cleanEvents.
+			groups[key].Events = append(groups[key].Events, r.GetEvents()...)
+		}
+
+		// Reconstruct ranges from groups, in first-appearance order, cleaning each group's events
+		var newRanges []*osvschema.Range
+		for _, key := range order {
+			r := groups[key]
+			r.Events = cleanEvents(r.GetEvents())
+			newRanges = append(newRanges, r)
+		}
+		aff.Ranges = newRanges
+	}
+}
+
+// mergeDatabaseSpecificVersions merges the "versions" field from the source DatabaseSpecific
+// into the target range's DatabaseSpecific. Only "versions" is read; other source fields are ignored.
+//
+// Examples (Target means target.DatabaseSpecific):
+//  1. Target: nil, Source: {"versions": ["v1", "v2"]}
+//     Result: Target becomes {"versions": ["v1", "v2"]}
+//  2. Target: {}, Source: {"versions": ["v1", "v2"]}
+//     Result: Target becomes {"versions": ["v1", "v2"]}
+//  3. Target: {"versions": ["v1", "v3"]}, Source: {"versions": ["v1", "v2"]}
+//     Result: Target becomes {"versions": ["v1", "v3", "v2"]} (existing order is preserved; new entries are appended in source order)
+//  4. Target: {"other": "data"}, Source: {"versions": ["v1", "v2"]}
+//     Result: Target becomes {"other": "data", "versions": ["v1", "v2"]}
+//  5. Target: {"versions": ["v1", "v2"]}, Source: nil
+//     Result: Target remains {"versions": ["v1", "v2"]}
+func mergeDatabaseSpecificVersions(target *osvschema.Range, source *structpb.Struct) {
+	if source == nil {
+		return
+	}
+	sourceVersions := source.GetFields()["versions"]
+	if sourceVersions == nil { // source carries no "versions" entry: nothing to merge
+		return
+	}
+
+	if target.GetDatabaseSpecific() == nil { // target has no DatabaseSpecific yet: create an empty one to hold the versions
+		var err error
+		target.DatabaseSpecific, err = structpb.NewStruct(nil)
+		if err != nil {
+			logger.Fatal("Failed to create DatabaseSpecific", slog.Any("error", err))
+		}
+	}
+
+	targetFields := target.GetDatabaseSpecific().GetFields()
+	if targetFields == nil { // a nil map cannot be written to, so install a fresh one on the struct
+		targetFields = make(map[string]*structpb.Value)
+		target.DatabaseSpecific.Fields = targetFields
+	}
+
+	targetVersions := targetFields["versions"]
+	if targetVersions == nil {
+		targetFields["versions"] = sourceVersions // NOTE(review): stores the source's Value itself, not a copy
+		return
+	}
+
+	// Both have versions, merge them
+	// Assuming versions is a ListValue; if either side is not a list, the target is left unchanged
+	if targetVersions.GetListValue() != nil && sourceVersions.GetListValue() != nil {
+		// Append source versions to target versions
+		targetVersions.GetListValue().Values = append(targetVersions.GetListValue().GetValues(), sourceVersions.GetListValue().GetValues()...)
+
+		// Deduplicate versions, keeping the first occurrence of each
+		uniqueVersions := make([]*structpb.Value, 0, len(targetVersions.GetListValue().GetValues()))
+		seenVersions := make(map[string]bool)
+
+		for _, v := range targetVersions.GetListValue().GetValues() {
+			// Serialize to string for comparison
+			// This might be expensive but robust for structpb.Value. NOTE(review): the Marshal error is discarded here.
+			b, _ := protojson.Marshal(v)
+			key := string(b)
+			if seenVersions[key] {
+				continue
+			}
+			seenVersions[key] = true
+			uniqueVersions = append(uniqueVersions, v)
+		}
+		targetVersions.GetListValue().Values = uniqueVersions
+	}
+}
+
+// cleanEvents deduplicates events, moves Introduced events to the front (other events keep their relative order), and ensures there is only one Introduced event per group: the first one seen.
+func cleanEvents(events []*osvschema.Event) []*osvschema.Event {
+	uniqueEvents := make([]*osvschema.Event, 0, len(events))
+	seen := make(map[string]bool)
+
+	for _, e := range events {
+		// Create a unique key for the event to check for duplicates: its introduced, fixed, limit and last_affected values
+		key := fmt.Sprintf("%v|%v|%v|%v", e.GetIntroduced(), e.GetFixed(), e.GetLimit(), e.GetLastAffected())
+		if seen[key] {
+			continue
+		}
+		seen[key] = true
+		uniqueEvents = append(uniqueEvents, e)
+	}
+
+	// Sort: Introduced events come first. The sort is stable, so ties keep their input order.
+	slices.SortStableFunc(uniqueEvents, func(a, b *osvschema.Event) int {
+		// Introduced comes before everything else
+		if a.GetIntroduced() != "" && b.GetIntroduced() == "" {
+			return -1
+		}
+		if a.GetIntroduced() == "" && b.GetIntroduced() != "" {
+			return 1
+		}
+
+		return 0
+	})
+
+	// Ensure only one Introduced event remains: the first is kept, later ones are dropped (and logged when their value differs).
+	// The caller groups by Introduced value, so these are expected to be identical. NOTE(review): a range with several introduced events breaks that.
+	var finalEvents []*osvschema.Event // stays nil when there are no events to return
+	introduced := ""
+	for _, e := range uniqueEvents {
+		if e.GetIntroduced() != "" {
+			if introduced == "" {
+				finalEvents = append(finalEvents, e)
+				introduced = e.GetIntroduced()
+			} else if introduced != e.GetIntroduced() {
+				logger.Error("Found multiple introduced values in the same group", slog.Any("introduced", introduced), slog.Any("event", e.GetIntroduced()))
+			}
+		} else {
+			finalEvents = append(finalEvents, e)
+		}
+	}
+
+	return finalEvents
+}
diff --git a/vulnfeeds/cvelist2osv/grouping_test.go b/vulnfeeds/cvelist2osv/grouping_test.go