Skip to content

Commit 52d3a5e

Browse files
pietern authored and hectorcast-db committed
Library to validate and normalize cloud specific tags (#819)
## Changes

Prompted by the proposed fix for a tagging-related problem in #810, I investigated how tag validation works. This turned out to be quite a bit more complex than anticipated. Tags at the job level (or cluster level) are passed through to the underlying compute infrastructure and as such are tested against cloud-specific validation rules. GCP appears to be the most restrictive. It would be disappointing to always restrict to `\w+`, so this package implements validation and normalization rules for each cloud. It picks the right cloud based on a Go SDK configuration.

## Tests

Exhaustive unit tests. The regular expressions were taken from #814.
1 parent f7170dd commit 52d3a5e

File tree

13 files changed

+532
-0
lines changed

13 files changed

+532
-0
lines changed

libs/tags/aws.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
package tags
2+
3+
import (
4+
"regexp"
5+
"unicode"
6+
7+
"golang.org/x/text/unicode/rangetable"
8+
)
9+
10+
// The union of all characters allowed in AWS tags.
11+
// This must be used only after filtering out non-Latin1 characters,
12+
// because the [unicode] classes include non-Latin1 characters.
13+
var awsChars = rangetable.Merge(
14+
unicode.Digit,
15+
unicode.Space,
16+
unicode.Letter,
17+
rangetable.New('+', '-', '=', '.', ':', '/', '@'),
18+
)
19+
20+
var awsTag = &tag{
21+
keyLength: 127,
22+
keyPattern: regexp.MustCompile(`^[\d \w\+\-=\.:\/@]*$`),
23+
keyNormalize: chain(
24+
normalizeMarks(),
25+
replaceNotIn(latin1, '_'),
26+
replaceNotIn(awsChars, '_'),
27+
),
28+
29+
valueLength: 255,
30+
valuePattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`),
31+
valueNormalize: chain(
32+
normalizeMarks(),
33+
replaceNotIn(latin1, '_'),
34+
replaceNotIn(awsChars, '_'),
35+
),
36+
}

libs/tags/aws_test.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package tags
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestAwsNormalizeKey(t *testing.T) {
11+
assert.Equal(t, "1 a b c", awsTag.NormalizeKey("1 a b c"))
12+
assert.Equal(t, "+-=.:/@__", awsTag.NormalizeKey("+-=.:/@?)"))
13+
assert.Equal(t, "test", awsTag.NormalizeKey("test"))
14+
15+
// Remove marks; unicode becomes underscore.
16+
assert.Equal(t, "cafe _", awsTag.NormalizeKey("café 🍎"))
17+
18+
// Replace forbidden characters with underscore.
19+
assert.Equal(t, "cafe __", awsTag.NormalizeKey("café 🍎?"))
20+
}
21+
22+
func TestAwsNormalizeValue(t *testing.T) {
23+
assert.Equal(t, "1 a b c", awsTag.NormalizeValue("1 a b c"))
24+
assert.Equal(t, "+-=.:/@__", awsTag.NormalizeValue("+-=.:/@?)"))
25+
assert.Equal(t, "test", awsTag.NormalizeValue("test"))
26+
27+
// Remove marks; unicode becomes underscore.
28+
assert.Equal(t, "cafe _", awsTag.NormalizeValue("café 🍎"))
29+
30+
// Replace forbidden characters with underscore.
31+
assert.Equal(t, "cafe __", awsTag.NormalizeValue("café 🍎?"))
32+
}
33+
34+
func TestAwsValidateKey(t *testing.T) {
35+
assert.ErrorContains(t, awsTag.ValidateKey(""), "not be empty")
36+
assert.ErrorContains(t, awsTag.ValidateKey(strings.Repeat("a", 512)), "length")
37+
assert.ErrorContains(t, awsTag.ValidateKey("café 🍎"), "latin")
38+
assert.ErrorContains(t, awsTag.ValidateKey("????"), "pattern")
39+
assert.NoError(t, awsTag.ValidateKey(strings.Repeat("a", 127)))
40+
assert.NoError(t, awsTag.ValidateKey(awsTag.NormalizeKey("café 🍎")))
41+
}
42+
43+
func TestAwsValidateValue(t *testing.T) {
44+
assert.ErrorContains(t, awsTag.ValidateValue(strings.Repeat("a", 512)), "length")
45+
assert.ErrorContains(t, awsTag.ValidateValue("café 🍎"), "latin1")
46+
assert.ErrorContains(t, awsTag.ValidateValue("????"), "pattern")
47+
assert.NoError(t, awsTag.ValidateValue(strings.Repeat("a", 127)))
48+
assert.NoError(t, awsTag.ValidateValue(awsTag.NormalizeValue("café 🍎")))
49+
}

libs/tags/azure.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package tags
2+
3+
import (
4+
"regexp"
5+
6+
"golang.org/x/text/unicode/rangetable"
7+
)
8+
9+
// All characters that may not be used in Azure tag keys.
10+
var azureForbiddenChars = rangetable.New('<', '>', '*', '&', '%', ';', '\\', '/', '+', '?')
11+
12+
var azureTag = &tag{
13+
keyLength: 512,
14+
keyPattern: regexp.MustCompile(`^[^<>\*&%;\\\/\+\?]*$`),
15+
keyNormalize: chain(
16+
replaceNotIn(latin1, '_'),
17+
replaceIn(azureForbiddenChars, '_'),
18+
),
19+
20+
valueLength: 256,
21+
valuePattern: regexp.MustCompile(`^.*$`),
22+
valueNormalize: chain(
23+
replaceNotIn(latin1, '_'),
24+
),
25+
}

libs/tags/azure_test.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package tags
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestAzureNormalizeKey(t *testing.T) {
11+
assert.Equal(t, "test", azureTag.NormalizeKey("test"))
12+
assert.Equal(t, "café __", azureTag.NormalizeKey("café 🍎?"))
13+
}
14+
15+
func TestAzureNormalizeValue(t *testing.T) {
16+
assert.Equal(t, "test", azureTag.NormalizeValue("test"))
17+
assert.Equal(t, "café _?", azureTag.NormalizeValue("café 🍎?"))
18+
}
19+
20+
func TestAzureValidateKey(t *testing.T) {
21+
assert.ErrorContains(t, azureTag.ValidateKey(""), "not be empty")
22+
assert.ErrorContains(t, azureTag.ValidateKey(strings.Repeat("a", 513)), "length")
23+
assert.ErrorContains(t, azureTag.ValidateKey("café 🍎"), "latin")
24+
assert.ErrorContains(t, azureTag.ValidateKey("????"), "pattern")
25+
assert.NoError(t, azureTag.ValidateKey(strings.Repeat("a", 127)))
26+
assert.NoError(t, azureTag.ValidateKey(azureTag.NormalizeKey("café 🍎")))
27+
}
28+
29+
func TestAzureValidateValue(t *testing.T) {
30+
assert.ErrorContains(t, azureTag.ValidateValue(strings.Repeat("a", 513)), "length")
31+
assert.ErrorContains(t, azureTag.ValidateValue("café 🍎"), "latin")
32+
assert.NoError(t, azureTag.ValidateValue(strings.Repeat("a", 127)))
33+
assert.NoError(t, azureTag.ValidateValue(azureTag.NormalizeValue("café 🍎")))
34+
}

libs/tags/cloud.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package tags
2+
3+
import "github.com/databricks/databricks-sdk-go/config"
4+
5+
// Cloud describes the tag rules of a single cloud provider. It offers
// validation and normalization for both tag keys and tag values.
type Cloud interface {
	// ValidateKey reports an error if the given tag key cannot be used
	// with the cloud provider, and nil otherwise.
	ValidateKey(key string) error

	// ValidateValue reports an error if the given tag value cannot be
	// used with the cloud provider, and nil otherwise.
	ValidateValue(value string) error

	// NormalizeKey returns the tag key rewritten according to the
	// normalization rules of the cloud provider.
	NormalizeKey(key string) string

	// NormalizeValue returns the tag value rewritten according to the
	// normalization rules of the cloud provider.
	NormalizeValue(value string) string
}
18+
19+
func ForCloud(cfg *config.Config) Cloud {
20+
var t *tag
21+
switch {
22+
case cfg.IsAws():
23+
t = awsTag
24+
case cfg.IsAzure():
25+
t = azureTag
26+
case cfg.IsGcp():
27+
t = gcpTag
28+
default:
29+
panic("unknown cloud provider")
30+
}
31+
return t
32+
}

libs/tags/cloud_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package tags
2+
3+
import (
4+
"testing"
5+
6+
"github.com/databricks/databricks-sdk-go/config"
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestForCloudAws(t *testing.T) {
11+
c := &config.Config{
12+
Host: "https://dbc-XXXXXXXX-YYYY.cloud.databricks.com/",
13+
}
14+
15+
assert.Equal(t, awsTag, ForCloud(c))
16+
}
17+
18+
func TestForCloudAzure(t *testing.T) {
19+
c := &config.Config{
20+
Host: "https://adb-xxx.y.azuredatabricks.net/",
21+
}
22+
23+
assert.Equal(t, azureTag, ForCloud(c))
24+
}
25+
26+
func TestForCloudGcp(t *testing.T) {
27+
c := &config.Config{
28+
Host: "https://123.4.gcp.databricks.com/",
29+
}
30+
31+
assert.Equal(t, gcpTag, ForCloud(c))
32+
}

libs/tags/gcp.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package tags
2+
3+
import (
4+
"regexp"
5+
"unicode"
6+
)
7+
8+
// Tag keys and values on GCP are limited to 63 characters and must match the
9+
// regular expression `^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`.
10+
// For normalization, we define one table for the outer characters and
11+
// one table for the inner characters. The outer table is used to trim
12+
// leading and trailing characters, and the inner table is used to
13+
// replace invalid characters with underscores.
14+
15+
// gcpOuter is the table of ASCII digits and letters. The GCP normalizers
// use it to trim leading and trailing characters.
var gcpOuter = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: '0', Hi: '9', Stride: 1},
		{Lo: 'A', Hi: 'Z', Stride: 1},
		{Lo: 'a', Hi: 'z', Stride: 1},
	},
	// All three ranges lie within Latin1.
	LatinOffset: 3,
}
26+
27+
// gcpInner is the table of ASCII digits and letters plus the hyphen-minus,
// full stop, and underscore. The GCP normalizers replace every character
// outside this table.
var gcpInner = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: '-', Hi: '-', Stride: 1},
		{Lo: '.', Hi: '.', Stride: 1},
		{Lo: '0', Hi: '9', Stride: 1},
		{Lo: 'A', Hi: 'Z', Stride: 1},
		{Lo: '_', Hi: '_', Stride: 1},
		{Lo: 'a', Hi: 'z', Stride: 1},
	},
	// All six ranges lie within Latin1.
	LatinOffset: 6,
}
44+
45+
var gcpTag = &tag{
46+
keyLength: 63,
47+
keyPattern: regexp.MustCompile(`^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`),
48+
keyNormalize: chain(
49+
normalizeMarks(),
50+
replaceNotIn(latin1, '_'),
51+
replaceNotIn(gcpInner, '_'),
52+
trimIfNotIn(gcpOuter),
53+
),
54+
55+
valueLength: 63,
56+
valuePattern: regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`),
57+
valueNormalize: chain(
58+
normalizeMarks(),
59+
replaceNotIn(latin1, '_'),
60+
replaceNotIn(gcpInner, '_'),
61+
trimIfNotIn(gcpOuter),
62+
),
63+
}

libs/tags/gcp_test.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package tags
2+
3+
import (
4+
"strings"
5+
"testing"
6+
"unicode"
7+
8+
"github.com/stretchr/testify/assert"
9+
)
10+
11+
func TestGcpOuter(t *testing.T) {
12+
assert.True(t, unicode.In('A', gcpOuter))
13+
assert.True(t, unicode.In('Z', gcpOuter))
14+
assert.True(t, unicode.In('a', gcpOuter))
15+
assert.True(t, unicode.In('z', gcpOuter))
16+
assert.True(t, unicode.In('0', gcpOuter))
17+
assert.True(t, unicode.In('9', gcpOuter))
18+
assert.False(t, unicode.In('-', gcpOuter))
19+
assert.False(t, unicode.In('.', gcpOuter))
20+
assert.False(t, unicode.In('_', gcpOuter))
21+
assert.False(t, unicode.In('!', gcpOuter))
22+
}
23+
24+
func TestGcpInner(t *testing.T) {
25+
assert.True(t, unicode.In('A', gcpInner))
26+
assert.True(t, unicode.In('Z', gcpInner))
27+
assert.True(t, unicode.In('a', gcpInner))
28+
assert.True(t, unicode.In('z', gcpInner))
29+
assert.True(t, unicode.In('0', gcpInner))
30+
assert.True(t, unicode.In('9', gcpInner))
31+
assert.True(t, unicode.In('-', gcpInner))
32+
assert.True(t, unicode.In('.', gcpInner))
33+
assert.True(t, unicode.In('_', gcpInner))
34+
assert.False(t, unicode.In('!', gcpInner))
35+
}
36+
37+
func TestGcpNormalizeKey(t *testing.T) {
38+
assert.Equal(t, "test", gcpTag.NormalizeKey("test"))
39+
assert.Equal(t, "cafe", gcpTag.NormalizeKey("café 🍎?"))
40+
assert.Equal(t, "cafe_foo", gcpTag.NormalizeKey("__café_foo__"))
41+
42+
}
43+
44+
func TestGcpNormalizeValue(t *testing.T) {
45+
assert.Equal(t, "test", gcpTag.NormalizeValue("test"))
46+
assert.Equal(t, "cafe", gcpTag.NormalizeValue("café 🍎?"))
47+
assert.Equal(t, "cafe_foo", gcpTag.NormalizeValue("__café_foo__"))
48+
}
49+
50+
func TestGcpValidateKey(t *testing.T) {
51+
assert.ErrorContains(t, gcpTag.ValidateKey(""), "not be empty")
52+
assert.ErrorContains(t, gcpTag.ValidateKey(strings.Repeat("a", 64)), "length")
53+
assert.ErrorContains(t, gcpTag.ValidateKey("café 🍎"), "latin")
54+
assert.ErrorContains(t, gcpTag.ValidateKey("????"), "pattern")
55+
assert.NoError(t, gcpTag.ValidateKey(strings.Repeat("a", 32)))
56+
assert.NoError(t, gcpTag.ValidateKey(gcpTag.NormalizeKey("café 🍎")))
57+
}
58+
59+
func TestGcpValidateValue(t *testing.T) {
60+
assert.ErrorContains(t, gcpTag.ValidateValue(strings.Repeat("a", 64)), "length")
61+
assert.ErrorContains(t, gcpTag.ValidateValue("café 🍎"), "latin")
62+
assert.ErrorContains(t, gcpTag.ValidateValue("????"), "pattern")
63+
assert.NoError(t, gcpTag.ValidateValue(strings.Repeat("a", 32)))
64+
assert.NoError(t, gcpTag.ValidateValue(gcpTag.NormalizeValue("café 🍎")))
65+
}

libs/tags/latin.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package tags
2+
3+
import "unicode"
4+
5+
// latin1 is the range table covering every code point of the Latin1
// character set, U+0000 through U+00FF.
var latin1 = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x0000, Hi: unicode.MaxLatin1, Stride: 1},
	},
	// The single range lies entirely within Latin1.
	LatinOffset: 1,
}

libs/tags/latin_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package tags
2+
3+
import (
4+
"testing"
5+
"unicode"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestLatinTable(t *testing.T) {
11+
assert.True(t, unicode.In('\u0000', latin1))
12+
assert.True(t, unicode.In('A', latin1))
13+
assert.True(t, unicode.In('Z', latin1))
14+
assert.True(t, unicode.In('\u00ff', latin1))
15+
assert.False(t, unicode.In('\u0100', latin1))
16+
}

0 commit comments

Comments
 (0)