Commit a2026c9

try faster appends
1 parent 6a16a1f commit a2026c9

File tree

1 file changed: +84 -60 lines changed

datadog/serializer.go

Lines changed: 84 additions & 60 deletions
@@ -8,6 +8,7 @@ import (
 	"strconv"
 	"strings"
 	"time"
+	"unicode/utf8"

 	stats "github.com/segmentio/stats/v5"
 )
@@ -27,8 +28,12 @@ func (s *serializer) Write(b []byte) (int, error) {
 		return 0, io.ErrClosedPipe
 	}

-	// Ensure the serialized metric payload has valid UTF-8 encoded bytes
-	b = bytes.ToValidUTF8(b, []byte("\uFFFD"))
+	// Ensure the serialized metric payload has valid UTF-8 encoded bytes.
+	// Because ToValidUTF8 makes a copy, make one pass through to ensure we
+	// actually need to change anything.
+	if !utf8.Valid(b) {
+		b = bytes.ToValidUTF8(b, []byte("\uFFFD"))
+	}
 	if len(b) <= s.bufferSize {
 		return s.conn.Write(b)
 	}
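The intent of this hunk, per the commit's own comment, is to keep the common case allocation-free: utf8.Valid only scans the payload, while bytes.ToValidUTF8 builds a fresh slice even when nothing needs replacing. A minimal standalone sketch of the same pattern (the helper name sanitizeUTF8 is illustrative, not part of the package):

package main

import (
	"bytes"
	"fmt"
	"unicode/utf8"
)

// sanitizeUTF8 applies the same fast-path check as the hunk above:
// utf8.Valid is a single allocation-free scan, and the copying
// bytes.ToValidUTF8 call only runs when invalid bytes are present.
func sanitizeUTF8(b []byte) []byte {
	if utf8.Valid(b) {
		return b // already valid: return the original slice untouched
	}
	return bytes.ToValidUTF8(b, []byte("\uFFFD"))
}

func main() {
	fmt.Printf("%q\n", sanitizeUTF8([]byte("requests.count"))) // valid, no copy
	fmt.Printf("%q\n", sanitizeUTF8([]byte("caf\xe9.count")))  // invalid byte replaced with U+FFFD
}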
@@ -207,86 +212,105 @@ const (
 	maxLen = 250 // guard for the StatsD UDP packet size
 )

-// isTrim returns true if the byte is to be trimmed at the ends.
-func isTrim(b byte) bool { return b == '.' || b == '_' || b == '-' }
+var shouldTrim [256]bool = [256]bool{
+	'.': true,
+	'_': true,
+	'-': true,
+}

 // appendSanitizedMetricName converts *any* string into something that StatsD / Graphite
 // accepts without complaints.
 func appendSanitizedMetricName(dst []byte, raw string) []byte {
-	nameLen := 0
-	orig := len(dst)
 	if raw == "" {
 		if len(dst) == 0 {
 			return append(dst, "_unnamed_"...)
 		}
 		return dst
 	}
-	// ── 1. accent folding (creates one temporary ↴)
-	// tmp := stripUnicodeAccents([]byte(raw))
-
-	// ── 2. run the same ASCII sanitizer, but write into dst
-	lastWasRepl := false
-	for i := 0; i < len(raw); i++ {
-		c := byte(raw[i])
-
-		if c < 128 && valid[c] {
-			// ASCII valid chars
-			dst = append(dst, c)
-			nameLen++
-			lastWasRepl = false
-		} else if c >= 0xC2 && c <= 0xC3 && i+1 < len(raw) {
-			// Check for 2-byte UTF-8 sequences that are common accented letters
-			c2 := byte(raw[i+1])
-			if c2 >= 0x80 && c2 <= 0xBF { // Valid second byte
-				// Decode the 2-byte sequence
-				codepoint := uint16(c&0x1F)<<6 | uint16(c2&0x3F)
-
-				// Map common accented characters (U+00C0-U+00FF range)
-				if codepoint >= 0xC0 && codepoint <= 0xFF {
-					mapped := accentMap[codepoint]
-					if valid[mapped] {
+	orig := len(dst)
+
+	// Pre-grow
+	need := len(raw)
+	if need > maxLen {
+		need = maxLen
+	}
+	if cap(dst)-len(dst) < need {
+		nd := make([]byte, len(dst), len(dst)+need)
+		copy(nd, dst)
+		dst = nd
+	}
+
+	n := len(raw)
+	i := 0
+	lastWasReplacement := false
+
+	// Skip leading trim while building
+	for i < n {
+		c := raw[i]
+		if !shouldTrim[c] {
+			break
+		}
+		i++
+	}
+
+	for i < n && (len(dst)-orig) < maxLen {
+		// Batch ASCII-valid run
+		remaining := maxLen - (len(dst) - orig)
+		j := i
+		limit := i + remaining
+		if limit > n {
+			limit = n
+		}
+		for j < limit {
+			c := raw[j]
+			if c >= 128 || !valid[c] {
+				break
+			}
+			j++
+		}
+		if j > i {
+			dst = append(dst, raw[i:j]...)
+			lastWasReplacement = false
+			i = j
+			continue
+		}
+
+		// 2-byte common accent folding
+		c0 := raw[i]
+		if c0 >= 0xC2 && c0 <= 0xC3 && i+1 < n {
+			c1 := raw[i+1]
+			if c1 >= 0x80 && c1 <= 0xBF {
+				code := uint16(c0&0x1F)<<6 | uint16(c1&0x3F)
+				if code >= 0xC0 && code <= 0xFF {
+					mapped := accentMap[code]
+					if valid[mapped] && (len(dst)-orig) < maxLen {
 						dst = append(dst, mapped)
-						nameLen++
-						lastWasRepl = false
-						i++ // Skip the second byte
+						lastWasReplacement = false
+						i += 2
 						continue
 					}
 				}
 			}
-			// If we get here, treat as invalid
-			if !lastWasRepl {
-				dst = append(dst, replacement)
-				nameLen++
-				lastWasRepl = true
-			}
-		} else if !lastWasRepl {
-			// Everything else (3-byte, 4-byte sequences, invalid chars)
-			dst = append(dst, replacement)
-			nameLen++
-			lastWasRepl = true
 		}

-		if nameLen >= maxLen {
-			break
+		// Replacement for everything else
+		if !lastWasReplacement && len(dst) > orig && (len(dst)-orig) < maxLen {
+			dst = append(dst, replacement)
+			lastWasReplacement = true
 		}
+		i++
 	}

-	// 3. trim leading / trailing '.', '_' or '-'
-	start, end := orig, len(dst)
-	for start < end && isTrim(dst[start]) {
-		start++
-	}
-	for end > start && isTrim(dst[end-1]) {
-		end--
-	}
-
-	// 4. compact if we trimmed something
-	if start > orig || end < len(dst) {
-		copy(dst[orig:], dst[start:end])
-		dst = dst[:orig+(end-start)]
+	// Trim trailing '.' '_' '-'
+	for l := len(dst); l > orig; {
+		c := dst[l-1]
+		if !shouldTrim[c] {
+			break
+		}
+		l--
+		dst = dst[:l]
 	}

-	// 5. fallback if everything vanished
 	if len(dst) == orig {
 		return append(dst, "_truncated_"...)
 	}
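Whether the batched run-append and the lookup-table trim actually pay off is easiest to confirm with a benchmark in the same package. A hypothetical sketch (the input strings are illustrative; it assumes it sits next to appendSanitizedMetricName in the datadog package):

package datadog

import "testing"

// Illustrative inputs: plain ASCII, 2-byte accented characters, and
// bytes likely to hit the replacement path.
var sanitizeInputs = []string{
	"http.request.count",
	"café.requête.durée",
	"metric with spaces/and:symbols",
}

func BenchmarkAppendSanitizedMetricName(b *testing.B) {
	buf := make([]byte, 0, 512)
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		for _, name := range sanitizeInputs {
			buf = appendSanitizedMetricName(buf[:0], name)
		}
	}
	_ = buf
}

Running it before and after this commit (go test -bench AppendSanitizedMetricName -benchmem ./datadog) would show the change in ns/op and allocations.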

0 commit comments
