8
8
"strconv"
9
9
"strings"
10
10
"time"
11
+ "unicode/utf8"
11
12
12
13
stats "github.com/segmentio/stats/v5"
13
14
)
@@ -27,8 +28,12 @@ func (s *serializer) Write(b []byte) (int, error) {
27
28
return 0 , io .ErrClosedPipe
28
29
}
29
30
30
- // Ensure the serialized metric payload has valid UTF-8 encoded bytes
31
- b = bytes .ToValidUTF8 (b , []byte ("\uFFFD " ))
31
+ // Ensure the serialized metric payload has valid UTF-8 encoded bytes.
32
+ // ToValidUTF8 always makes a copy, so make one validation pass first and
33
+ // only rewrite the payload when it actually contains invalid bytes.
34
+ if ! utf8 .Valid (b ) {
35
+ b = bytes .ToValidUTF8 (b , []byte ("\uFFFD " ))
36
+ }
32
37
if len (b ) <= s .bufferSize {
33
38
return s .conn .Write (b )
34
39
}
@@ -207,86 +212,105 @@ const (
207
212
maxLen = 250 // guard for the StatsD UDP packet size
208
213
)
209
214
210
- // isTrim returns true if the byte is to be trimmed at the ends.
211
- func isTrim (b byte ) bool { return b == '.' || b == '_' || b == '-' }
215
// shouldTrim is a byte-indexed lookup table: shouldTrim[b] is true when b is
// '.', '_' or '-', and false for every other byte value. The table has one
// entry per possible byte, so it can be indexed directly with any byte
// without a range check.
var shouldTrim = [256]bool{
	'.': true,
	'_': true,
	'-': true,
}
212
220
213
221
// appendSanitizedMetricName converts *any* string into something that StatsD / Graphite
214
222
// accepts without complaints.
215
223
func appendSanitizedMetricName (dst []byte , raw string ) []byte {
216
- nameLen := 0
217
- orig := len (dst )
218
224
if raw == "" {
219
225
if len (dst ) == 0 {
220
226
return append (dst , "_unnamed_" ... )
221
227
}
222
228
return dst
223
229
}
224
- // ── 1. accent folding (creates one temporary ↴)
225
- // tmp := stripUnicodeAccents([]byte(raw))
226
-
227
- // ── 2. run the same ASCII sanitizer, but write into dst
228
- lastWasRepl := false
229
- for i := 0 ; i < len (raw ); i ++ {
230
- c := byte (raw [i ])
231
-
232
- if c < 128 && valid [c ] {
233
- // ASCII valid chars
234
- dst = append (dst , c )
235
- nameLen ++
236
- lastWasRepl = false
237
- } else if c >= 0xC2 && c <= 0xC3 && i + 1 < len (raw ) {
238
- // Check for 2-byte UTF-8 sequences that are common accented letters
239
- c2 := byte (raw [i + 1 ])
240
- if c2 >= 0x80 && c2 <= 0xBF { // Valid second byte
241
- // Decode the 2-byte sequence
242
- codepoint := uint16 (c & 0x1F )<< 6 | uint16 (c2 & 0x3F )
243
-
244
- // Map common accented characters (U+00C0-U+00FF range)
245
- if codepoint >= 0xC0 && codepoint <= 0xFF {
246
- mapped := accentMap [codepoint ]
247
- if valid [mapped ] {
230
+ orig := len (dst )
231
+
232
+ // Pre-grow
233
+ need := len (raw )
234
+ if need > maxLen {
235
+ need = maxLen
236
+ }
237
+ if cap (dst )- len (dst ) < need {
238
+ nd := make ([]byte , len (dst ), len (dst )+ need )
239
+ copy (nd , dst )
240
+ dst = nd
241
+ }
242
+
243
+ n := len (raw )
244
+ i := 0
245
+ lastWasReplacement := false
246
+
247
+ // Skip leading trim while building
248
+ for i < n {
249
+ c := raw [i ]
250
+ if ! shouldTrim [c ] {
251
+ break
252
+ }
253
+ i ++
254
+ }
255
+
256
+ for i < n && (len (dst )- orig ) < maxLen {
257
+ // Batch ASCII-valid run
258
+ remaining := maxLen - (len (dst ) - orig )
259
+ j := i
260
+ limit := i + remaining
261
+ if limit > n {
262
+ limit = n
263
+ }
264
+ for j < limit {
265
+ c := raw [j ]
266
+ if c >= 128 || ! valid [c ] {
267
+ break
268
+ }
269
+ j ++
270
+ }
271
+ if j > i {
272
+ dst = append (dst , raw [i :j ]... )
273
+ lastWasReplacement = false
274
+ i = j
275
+ continue
276
+ }
277
+
278
+ // 2-byte common accent folding
279
+ c0 := raw [i ]
280
+ if c0 >= 0xC2 && c0 <= 0xC3 && i + 1 < n {
281
+ c1 := raw [i + 1 ]
282
+ if c1 >= 0x80 && c1 <= 0xBF {
283
+ code := uint16 (c0 & 0x1F )<< 6 | uint16 (c1 & 0x3F )
284
+ if code >= 0xC0 && code <= 0xFF {
285
+ mapped := accentMap [code ]
286
+ if valid [mapped ] && (len (dst )- orig ) < maxLen {
248
287
dst = append (dst , mapped )
249
- nameLen ++
250
- lastWasRepl = false
251
- i ++ // Skip the second byte
288
+ lastWasReplacement = false
289
+ i += 2
252
290
continue
253
291
}
254
292
}
255
293
}
256
- // If we get here, treat as invalid
257
- if ! lastWasRepl {
258
- dst = append (dst , replacement )
259
- nameLen ++
260
- lastWasRepl = true
261
- }
262
- } else if ! lastWasRepl {
263
- // Everything else (3-byte, 4-byte sequences, invalid chars)
264
- dst = append (dst , replacement )
265
- nameLen ++
266
- lastWasRepl = true
267
294
}
268
295
269
- if nameLen >= maxLen {
270
- break
296
+ // Replacement for everything else
297
+ if ! lastWasReplacement && len (dst ) > orig && (len (dst )- orig ) < maxLen {
298
+ dst = append (dst , replacement )
299
+ lastWasReplacement = true
271
300
}
301
+ i ++
272
302
}
273
303
274
- // 3. trim leading / trailing '.', '_' or '-'
275
- start , end := orig , len (dst )
276
- for start < end && isTrim (dst [start ]) {
277
- start ++
278
- }
279
- for end > start && isTrim (dst [end - 1 ]) {
280
- end --
281
- }
282
-
283
- // 4. compact if we trimmed something
284
- if start > orig || end < len (dst ) {
285
- copy (dst [orig :], dst [start :end ])
286
- dst = dst [:orig + (end - start )]
304
+ // Trim trailing '.' '_' '-'
305
+ for l := len (dst ); l > orig ; {
306
+ c := dst [l - 1 ]
307
+ if ! shouldTrim [c ] {
308
+ break
309
+ }
310
+ l --
311
+ dst = dst [:l ]
287
312
}
288
313
289
- // 5. fallback if everything vanished
290
314
if len (dst ) == orig {
291
315
return append (dst , "_truncated_" ... )
292
316
}
0 commit comments