Skip to content

Commit 3f53c0f

Browse files
committed
fix(cache): include artifact URIs in cache key to prevent incorrect reuse
Previously only artifact names were used in cache key generation, causing incorrect cache hits when different data sources had same artifact names. Now uses 'name@uri' format to ensure different URIs produce different keys. Signed-off-by: Aman-Cool <aman017102007@gmail.com>
1 parent a26b3d8 commit 3f53c0f

File tree

3 files changed

+145
-4
lines changed

3 files changed

+145
-4
lines changed

api/v2alpha1/cache_key.proto

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ import "google/protobuf/struct.proto";
2121
import "pipeline_spec.proto";
2222

2323
message CacheKey {
24+
// Input artifact identifiers. Each string in ArtifactNameList now uses the format
25+
// "name@uri" to include both the artifact name and URI, ensuring different data
26+
// sources produce different cache keys. Previously only names were used, which
27+
// caused incorrect cache reuse when artifacts had the same name but different URIs.
2428
map<string, ArtifactNameList> inputArtifactNames = 1;
2529
map<string, Value> inputParameters = 2 [deprecated = true];
2630
map<string, RuntimeArtifact> outputArtifactsSpec = 3;
@@ -36,5 +40,8 @@ message ContainerSpec {
3640
}
3741

3842
message ArtifactNameList {
43+
// Artifact identifiers. Format: "name@uri" when URI is present, or just "name"
44+
// when URI is empty. This format ensures cache keys differ for artifacts with
45+
// the same name but different data sources.
3946
repeated string artifactNames = 1;
4047
}

backend/src/v2/cacheutils/cache.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,15 @@ func (c *client) GenerateCacheKey(
187187
for inputArtifactName, inputArtifactList := range inputs.GetArtifacts() {
188188
inputArtifactNameList := cachekey.ArtifactNameList{ArtifactNames: make([]string, 0)}
189189
for _, artifact := range inputArtifactList.Artifacts {
190-
inputArtifactNameList.ArtifactNames = append(inputArtifactNameList.ArtifactNames, artifact.GetName())
190+
// CRITICAL FIX: Include both name AND URI in the cache key identifier.
191+
// Previously only the name was used, which caused incorrect cache reuse
192+
// when different data sources had artifacts with the same logical name.
193+
// The format "name@uri" ensures different URIs produce different cache keys.
194+
artifactIdentifier := artifact.GetName()
195+
if uri := artifact.GetUri(); uri != "" {
196+
artifactIdentifier = artifact.GetName() + "@" + uri
197+
}
198+
inputArtifactNameList.ArtifactNames = append(inputArtifactNameList.ArtifactNames, artifactIdentifier)
191199
}
192200
cacheKey.InputArtifactNames[inputArtifactName] = &inputArtifactNameList
193201
}

backend/src/v2/cacheutils/cache_test.go

Lines changed: 129 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,11 @@ func TestGenerateCacheKey(t *testing.T) {
9898
cmdArgs: []string{"sh", "ec", "test"},
9999
image: "python:3.11",
100100
want: &cachekey.CacheKey{
101+
// CRITICAL: Artifact identifiers now include URIs in "name@uri" format
102+
// to ensure different data sources produce different cache keys.
101103
InputArtifactNames: map[string]*cachekey.ArtifactNameList{
102-
"dataset_one": {ArtifactNames: []string{"1"}},
103-
"dataset_two": {ArtifactNames: []string{"2"}},
104+
"dataset_one": {ArtifactNames: []string{"1@gs://some-bucket/dataset-one"}},
105+
"dataset_two": {ArtifactNames: []string{"2@gs://some-bucket/dataset-two"}},
104106
},
105107
InputParameterValues: map[string]*structpb.Value{
106108
"message": {Kind: &structpb.Value_StringValue{StringValue: "Some string value"}},
@@ -187,8 +189,9 @@ func TestGenerateCacheKey(t *testing.T) {
187189
image: "python:3.11",
188190
pvcNames: []string{"workspace-pvc", "data-pvc"},
189191
want: &cachekey.CacheKey{
192+
// CRITICAL: Artifact identifiers now include URIs in "name@uri" format
190193
InputArtifactNames: map[string]*cachekey.ArtifactNameList{
191-
"dataset_one": {ArtifactNames: []string{"1"}},
194+
"dataset_one": {ArtifactNames: []string{"1@gs://some-bucket/dataset-one"}},
192195
},
193196
InputParameterValues: map[string]*structpb.Value{
194197
"message": {Kind: &structpb.Value_StringValue{StringValue: "Some string value"}},
@@ -353,6 +356,129 @@ func TestGenerateFingerPrint(t *testing.T) {
353356
}
354357
}
355358

359+
// TestGenerateFingerPrint_ConsidersArtifactURIs verifies that different artifact URIs
360+
// produce different fingerprints, even when artifact names are identical.
361+
// This test validates the fix for the critical cache key bug where only artifact names
362+
// (not URIs) were included in the cache key, causing incorrect cache reuse across
363+
// different data sources.
364+
func TestGenerateFingerPrint_ConsidersArtifactURIs(t *testing.T) {
365+
cacheClient, err := NewClient("ml-pipeline.kubeflow", "8887", false, &tls.Config{})
366+
require.NoError(t, err)
367+
368+
// Create two inputs with same artifact name but different URIs
369+
inputsWithURIA := &pipelinespec.ExecutorInput_Inputs{
370+
Artifacts: map[string]*pipelinespec.ArtifactList{
371+
"dataset": {
372+
Artifacts: []*pipelinespec.RuntimeArtifact{
373+
{
374+
Name: "data",
375+
Uri: "gs://bucket-a/dataset/v1",
376+
},
377+
},
378+
},
379+
},
380+
}
381+
382+
inputsWithURIB := &pipelinespec.ExecutorInput_Inputs{
383+
Artifacts: map[string]*pipelinespec.ArtifactList{
384+
"dataset": {
385+
Artifacts: []*pipelinespec.RuntimeArtifact{
386+
{
387+
Name: "data", // Same name as above
388+
Uri: "gs://bucket-b/dataset/v2", // Different URI
389+
},
390+
},
391+
},
392+
},
393+
}
394+
395+
inputsWithSameURI := &pipelinespec.ExecutorInput_Inputs{
396+
Artifacts: map[string]*pipelinespec.ArtifactList{
397+
"dataset": {
398+
Artifacts: []*pipelinespec.RuntimeArtifact{
399+
{
400+
Name: "data",
401+
Uri: "gs://bucket-a/dataset/v1", // Same URI as inputsWithURIA
402+
},
403+
},
404+
},
405+
},
406+
}
407+
408+
outputs := &pipelinespec.ExecutorInput_Outputs{
409+
Parameters: map[string]*pipelinespec.ExecutorInput_OutputParameter{
410+
"output": {OutputFile: "/tmp/output"},
411+
},
412+
}
413+
outputParamTypes := map[string]string{"output": "STRING"}
414+
415+
// Generate cache keys
416+
cacheKeyA, err := cacheClient.GenerateCacheKey(inputsWithURIA, outputs, outputParamTypes, []string{"echo"}, "python:3.11", nil)
417+
require.NoError(t, err)
418+
419+
cacheKeyB, err := cacheClient.GenerateCacheKey(inputsWithURIB, outputs, outputParamTypes, []string{"echo"}, "python:3.11", nil)
420+
require.NoError(t, err)
421+
422+
cacheKeySameAsA, err := cacheClient.GenerateCacheKey(inputsWithSameURI, outputs, outputParamTypes, []string{"echo"}, "python:3.11", nil)
423+
require.NoError(t, err)
424+
425+
// Generate fingerprints
426+
fingerprintA, err := cacheClient.GenerateFingerPrint(cacheKeyA)
427+
require.NoError(t, err)
428+
429+
fingerprintB, err := cacheClient.GenerateFingerPrint(cacheKeyB)
430+
require.NoError(t, err)
431+
432+
fingerprintSameAsA, err := cacheClient.GenerateFingerPrint(cacheKeySameAsA)
433+
require.NoError(t, err)
434+
435+
// CRITICAL ASSERTIONS:
436+
// Different URIs MUST produce different fingerprints to prevent incorrect cache reuse
437+
assert.NotEqual(t, fingerprintA, fingerprintB,
438+
"CRITICAL BUG: Same artifact name with different URIs produced same fingerprint! "+
439+
"This would cause incorrect cache reuse across different data sources.")
440+
441+
// Same URIs should produce same fingerprints (cache hit is correct)
442+
assert.Equal(t, fingerprintA, fingerprintSameAsA,
443+
"Same artifact name and URI should produce same fingerprint for correct cache hits.")
444+
445+
// Verify the cache key contains the URI in the artifact identifier
446+
assert.Contains(t, cacheKeyA.InputArtifactNames["dataset"].ArtifactNames[0], "gs://bucket-a/dataset/v1",
447+
"Cache key should contain the artifact URI")
448+
assert.Contains(t, cacheKeyB.InputArtifactNames["dataset"].ArtifactNames[0], "gs://bucket-b/dataset/v2",
449+
"Cache key should contain the artifact URI")
450+
}
451+
452+
// TestGenerateCacheKey_ArtifactWithoutURI verifies that artifacts without URIs
453+
// still work correctly (backward compatibility).
454+
func TestGenerateCacheKey_ArtifactWithoutURI(t *testing.T) {
455+
cacheClient, err := NewClient("ml-pipeline.kubeflow", "8887", false, &tls.Config{})
456+
require.NoError(t, err)
457+
458+
inputsNoURI := &pipelinespec.ExecutorInput_Inputs{
459+
Artifacts: map[string]*pipelinespec.ArtifactList{
460+
"dataset": {
461+
Artifacts: []*pipelinespec.RuntimeArtifact{
462+
{
463+
Name: "data",
464+
Uri: "", // No URI
465+
},
466+
},
467+
},
468+
},
469+
}
470+
471+
outputs := &pipelinespec.ExecutorInput_Outputs{}
472+
outputParamTypes := map[string]string{}
473+
474+
cacheKey, err := cacheClient.GenerateCacheKey(inputsNoURI, outputs, outputParamTypes, []string{"echo"}, "python:3.11", nil)
475+
require.NoError(t, err)
476+
477+
// When URI is empty, the artifact identifier should just be the name
478+
assert.Equal(t, "data", cacheKey.InputArtifactNames["dataset"].ArtifactNames[0],
479+
"Artifact without URI should use just the name as identifier")
480+
}
481+
356482
func TestGenerateFingerPrint_ConsidersPVCNames(t *testing.T) {
357483
base := &cachekey.CacheKey{
358484
InputArtifactNames: map[string]*cachekey.ArtifactNameList{

0 commit comments

Comments
 (0)