Skip to content

Commit 770ce2d

Browse files
committed
Better handling of YAML that tastes like JSON
For the most part, JSON is a subset of YAML. This might lead one to think that we should ALWAYS use YAML processing. Unfortunately a JSON "stream" (as defined by Go's encoding/json and many other places, though not the JSON spec) is a series of JSON objects. E.g. This: ``` {}{}{} ``` ...is a valid JSON stream. YAML does NOT accept that, insisting on `---` on a new line between YAML documents. Before this commit, YAMLOrJSONDecoder tries to detect if the input is JSON by looking at the first few characters for "{". Unfortunately, some perfectly valid YAML also tastes like that. After this commit, YAMLOrJSONDecoder will detect a failure to parse as JSON and instead flip to YAML parsing. This should handle the ambiguous YAML. Once we flip to YAML we never flip back, and once we detect a JSON stream (as defined above) we lose the ability to flip to YAML. A multi-document is either all JSON or all YAML, even if we use the JSON parser to decode the first object (because JSON is YAML for a single object).
1 parent 0eaee48 commit 770ce2d

File tree

4 files changed

+684
-49
lines changed

4 files changed

+684
-49
lines changed

staging/src/k8s.io/apimachinery/pkg/util/yaml/decoder.go

Lines changed: 122 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@ import (
2020
"bufio"
2121
"bytes"
2222
"encoding/json"
23+
"errors"
2324
"fmt"
2425
"io"
2526
"strings"
2627
"unicode"
28+
"unicode/utf8"
2729

2830
jsonutil "k8s.io/apimachinery/pkg/util/json"
2931

@@ -92,7 +94,7 @@ func UnmarshalStrict(data []byte, v interface{}) error {
9294
// YAML decoding path is not used (so that error messages are
9395
// JSON specific).
9496
func ToJSON(data []byte) ([]byte, error) {
95-
if hasJSONPrefix(data) {
97+
if IsJSONBuffer(data) {
9698
return data, nil
9799
}
98100
return yaml.YAMLToJSON(data)
@@ -102,7 +104,8 @@ func ToJSON(data []byte) ([]byte, error) {
102104
// separating individual documents. It first converts the YAML
103105
// body to JSON, then unmarshals the JSON.
104106
type YAMLToJSONDecoder struct {
105-
reader Reader
107+
reader Reader
108+
inputOffset int
106109
}
107110

108111
// NewYAMLToJSONDecoder decodes YAML documents from the provided
@@ -121,7 +124,7 @@ func NewYAMLToJSONDecoder(r io.Reader) *YAMLToJSONDecoder {
121124
// yaml.Unmarshal.
122125
func (d *YAMLToJSONDecoder) Decode(into interface{}) error {
123126
bytes, err := d.reader.Read()
124-
if err != nil && err != io.EOF {
127+
if err != nil && err != io.EOF { //nolint:errorlint
125128
return err
126129
}
127130

@@ -131,9 +134,14 @@ func (d *YAMLToJSONDecoder) Decode(into interface{}) error {
131134
return YAMLSyntaxError{err}
132135
}
133136
}
137+
d.inputOffset += len(bytes)
134138
return err
135139
}
136140

141+
func (d *YAMLToJSONDecoder) InputOffset() int {
142+
return d.inputOffset
143+
}
144+
137145
// YAMLDecoder reads chunks of objects and returns ErrShortBuffer if
138146
// the data is not sufficient.
139147
type YAMLDecoder struct {
@@ -229,18 +237,20 @@ func splitYAMLDocument(data []byte, atEOF bool) (advance int, token []byte, err
229237
return 0, nil, nil
230238
}
231239

232-
// decoder is a convenience interface for Decode.
233-
type decoder interface {
234-
Decode(into interface{}) error
235-
}
236-
237-
// YAMLOrJSONDecoder attempts to decode a stream of JSON documents or
238-
// YAML documents by sniffing for a leading { character.
240+
// YAMLOrJSONDecoder attempts to decode a stream of JSON or YAML documents.
241+
// While JSON is YAML, the way Go's JSON decode defines a multi-document stream
242+
// is a series of JSON objects (e.g. {}{}), but YAML defines a multi-document
243+
// stream as a series of documents separated by "---".
244+
//
245+
// This decoder will attempt to decode the stream as JSON first, and if that
246+
// fails, it will switch to YAML. Once it determines the stream is JSON (by
247+
// finding a non-YAML-delimited series of objects), it will not switch to YAML.
248+
// Once it switches to YAML it will not switch back to JSON.
239249
type YAMLOrJSONDecoder struct {
240-
r io.Reader
241-
bufferSize int
242-
243-
decoder decoder
250+
json *json.Decoder
251+
yaml *YAMLToJSONDecoder
252+
stream *StreamReader
253+
count int // how many objects have been decoded
244254
}
245255

246256
type JSONSyntaxError struct {
@@ -265,31 +275,108 @@ func (e YAMLSyntaxError) Error() string {
265275
// how far into the stream the decoder will look to figure out whether this
266276
// is a JSON stream (has whitespace followed by an open brace).
267277
func NewYAMLOrJSONDecoder(r io.Reader, bufferSize int) *YAMLOrJSONDecoder {
268-
return &YAMLOrJSONDecoder{
269-
r: r,
270-
bufferSize: bufferSize,
278+
d := &YAMLOrJSONDecoder{}
279+
280+
reader, _, mightBeJSON := GuessJSONStream(r, bufferSize)
281+
d.stream = reader
282+
if mightBeJSON {
283+
d.json = json.NewDecoder(reader)
284+
} else {
285+
d.yaml = NewYAMLToJSONDecoder(reader)
271286
}
287+
return d
272288
}
273289

274290
// Decode unmarshals the next object from the underlying stream into the
275291
// provide object, or returns an error.
276292
func (d *YAMLOrJSONDecoder) Decode(into interface{}) error {
277-
if d.decoder == nil {
278-
buffer, _, isJSON := GuessJSONStream(d.r, d.bufferSize)
279-
if isJSON {
280-
d.decoder = json.NewDecoder(buffer)
293+
// Because we don't know if this is a JSON or YAML stream, a failure from
294+
// both decoders is ambiguous. When in doubt, it will return the error from
295+
// the JSON decoder. Unfortunately, this means that if the first document
296+
// is invalid YAML, the error won't be awesome.
297+
// TODO: the errors from YAML are not great, we could improve them a lot.
298+
var firstErr error
299+
if d.json != nil {
300+
err := d.json.Decode(into)
301+
if err == nil {
302+
d.stream.Consume(int(d.json.InputOffset()) - d.stream.Consumed())
303+
d.count++
304+
return nil
305+
}
306+
if err == io.EOF { //nolint:errorlint
307+
return err
308+
}
309+
var syntax *json.SyntaxError
310+
if ok := errors.As(err, &syntax); ok {
311+
firstErr = JSONSyntaxError{
312+
Offset: syntax.Offset,
313+
Err: syntax,
314+
}
281315
} else {
282-
d.decoder = NewYAMLToJSONDecoder(buffer)
316+
firstErr = err
317+
}
318+
if d.count > 1 {
319+
// If we found 0 or 1 JSON object(s), this stream is still
320+
// ambiguous. But if we found more than 1 JSON object, then this
321+
// is an unambiguous JSON stream, and we should not switch to YAML.
322+
return err
323+
}
324+
// If JSON decoding hits the end of one object and then fails on the
325+
// next, it leaves any leading whitespace in the buffer, which can
326+
// confuse the YAML decoder. We just eat any whitespace we find, up to
327+
// and including the first newline.
328+
d.stream.Rewind()
329+
if err := d.consumeWhitespace(); err == nil {
330+
d.yaml = NewYAMLToJSONDecoder(d.stream)
331+
}
332+
d.json = nil
333+
}
334+
if d.yaml != nil {
335+
err := d.yaml.Decode(into)
336+
if err == nil {
337+
d.stream.Consume(d.yaml.InputOffset() - d.stream.Consumed())
338+
d.count++
339+
return nil
340+
}
341+
if err == io.EOF { //nolint:errorlint
342+
return err
343+
}
344+
if firstErr == nil {
345+
firstErr = err
283346
}
284347
}
285-
err := d.decoder.Decode(into)
286-
if syntax, ok := err.(*json.SyntaxError); ok {
287-
return JSONSyntaxError{
288-
Offset: syntax.Offset,
289-
Err: syntax,
348+
if firstErr != nil {
349+
return firstErr
350+
}
351+
return fmt.Errorf("decoding failed as both JSON and YAML")
352+
}
353+
354+
func (d *YAMLOrJSONDecoder) consumeWhitespace() error {
355+
consumed := 0
356+
for {
357+
buf, err := d.stream.ReadN(4)
358+
if err != nil && err == io.EOF { //nolint:errorlint
359+
return err
360+
}
361+
r, sz := utf8.DecodeRune(buf)
362+
if r == utf8.RuneError || sz == 0 {
363+
return fmt.Errorf("invalid utf8 rune")
364+
}
365+
d.stream.RewindN(len(buf) - sz)
366+
if !unicode.IsSpace(r) {
367+
d.stream.RewindN(sz)
368+
d.stream.Consume(consumed)
369+
return nil
370+
}
371+
if r == '\n' {
372+
d.stream.Consume(consumed)
373+
return nil
374+
}
375+
if err == io.EOF { //nolint:errorlint
376+
break
290377
}
291378
}
292-
return err
379+
return io.EOF
293380
}
294381

295382
type Reader interface {
@@ -311,7 +398,7 @@ func (r *YAMLReader) Read() ([]byte, error) {
311398
var buffer bytes.Buffer
312399
for {
313400
line, err := r.reader.Read()
314-
if err != nil && err != io.EOF {
401+
if err != nil && err != io.EOF { //nolint:errorlint
315402
return nil, err
316403
}
317404

@@ -329,11 +416,11 @@ func (r *YAMLReader) Read() ([]byte, error) {
329416
if buffer.Len() != 0 {
330417
return buffer.Bytes(), nil
331418
}
332-
if err == io.EOF {
419+
if err == io.EOF { //nolint:errorlint
333420
return nil, err
334421
}
335422
}
336-
if err == io.EOF {
423+
if err == io.EOF { //nolint:errorlint
337424
if buffer.Len() != 0 {
338425
// If we're at EOF, we have a final, non-terminated line. Return it.
339426
return buffer.Bytes(), nil
@@ -369,26 +456,20 @@ func (r *LineReader) Read() ([]byte, error) {
369456
// GuessJSONStream scans the provided reader up to size, looking
370457
// for an open brace indicating this is JSON. It will return the
371458
// bufio.Reader it creates for the consumer.
372-
func GuessJSONStream(r io.Reader, size int) (io.Reader, []byte, bool) {
373-
buffer := bufio.NewReaderSize(r, size)
459+
func GuessJSONStream(r io.Reader, size int) (*StreamReader, []byte, bool) {
460+
buffer := NewStreamReader(r, size)
374461
b, _ := buffer.Peek(size)
375-
return buffer, b, hasJSONPrefix(b)
462+
return buffer, b, IsJSONBuffer(b)
376463
}
377464

378465
// IsJSONBuffer scans the provided buffer, looking
379466
// for an open brace indicating this is JSON.
380467
func IsJSONBuffer(buf []byte) bool {
381-
return hasJSONPrefix(buf)
468+
return hasPrefix(buf, jsonPrefix)
382469
}
383470

384471
var jsonPrefix = []byte("{")
385472

386-
// hasJSONPrefix returns true if the provided buffer appears to start with
387-
// a JSON open brace.
388-
func hasJSONPrefix(buf []byte) bool {
389-
return hasPrefix(buf, jsonPrefix)
390-
}
391-
392473
// Return true if the first non-whitespace bytes in buf is
393474
// prefix.
394475
func hasPrefix(buf []byte, prefix []byte) bool {

staging/src/k8s.io/apimachinery/pkg/util/yaml/decoder_test.go

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package yaml
1919
import (
2020
"bufio"
2121
"bytes"
22-
"encoding/json"
2322
"fmt"
2423
"io"
2524
"math/rand"
@@ -105,7 +104,7 @@ stuff: 1
105104
}
106105
b = make([]byte, 15)
107106
n, err = r.Read(b)
108-
if err != io.EOF || n != 0 {
107+
if err != io.EOF || n != 0 { //nolint:errorlint
109108
t.Fatalf("expected EOF: %d / %v", n, err)
110109
}
111110
}
@@ -205,7 +204,7 @@ stuff: 1
205204
t.Fatalf("unexpected object: %#v", obj)
206205
}
207206
obj = generic{}
208-
if err := s.Decode(&obj); err != io.EOF {
207+
if err := s.Decode(&obj); err != io.EOF { //nolint:errorlint
209208
t.Fatalf("unexpected error: %v", err)
210209
}
211210
}
@@ -319,6 +318,11 @@ func TestYAMLOrJSONDecoder(t *testing.T) {
319318
{"foo": "bar"},
320319
{"baz": "biz"},
321320
}},
321+
// Spaces for indent, tabs are not allowed in YAML.
322+
{"foo:\n field: bar\n---\nbaz:\n field: biz", 100, false, false, []generic{
323+
{"foo": map[string]any{"field": "bar"}},
324+
{"baz": map[string]any{"field": "biz"}},
325+
}},
322326
{"foo: bar\n---\n", 100, false, false, []generic{
323327
{"foo": "bar"},
324328
}},
@@ -334,6 +338,38 @@ func TestYAMLOrJSONDecoder(t *testing.T) {
334338
{"foo: bar\n", 100, false, false, []generic{
335339
{"foo": "bar"},
336340
}},
341+
// First document is JSON, second is YAML
342+
{"{\"foo\": \"bar\"}\n---\n{baz: biz}", 100, false, false, []generic{
343+
{"foo": "bar"},
344+
{"baz": "biz"},
345+
}},
346+
// First document is JSON, second is YAML, longer than the buffer
347+
{"{\"foo\": \"bar\"}\n---\n{baz: biz0123456780123456780123456780123456780123456789}", 20, false, false, []generic{
348+
{"foo": "bar"},
349+
{"baz": "biz0123456780123456780123456780123456780123456789"},
350+
}},
351+
// First document is JSON, then whitespace, then YAML
352+
{"{\"foo\": \"bar\"} \n---\n{baz: biz}", 100, false, false, []generic{
353+
{"foo": "bar"},
354+
{"baz": "biz"},
355+
}},
356+
// First document is YAML, second is JSON
357+
{"{foo: bar}\n---\n{\"baz\": \"biz\"}", 100, false, false, []generic{
358+
{"foo": "bar"},
359+
{"baz": "biz"},
360+
}},
361+
// First document is JSON, second is YAML, using spaces
362+
{"{\n \"foo\": \"bar\"\n}\n---\n{\n baz: biz\n}", 100, false, false, []generic{
363+
{"foo": "bar"},
364+
{"baz": "biz"},
365+
}},
366+
// First document is JSON, second is YAML, using tabs
367+
{"{\n\t\"foo\": \"bar\"\n}\n---\n{\n\tbaz: biz\n}", 100, false, false, []generic{
368+
{"foo": "bar"},
369+
{"baz": "biz"},
370+
}},
371+
// First 2 documents are JSON, third is YAML (stream is JSON)
372+
{"{\"foo\": \"bar\"}\n{\"baz\": \"biz\"}\n---\n{qux: zrb}", 100, true, true, nil},
337373
}
338374
for i, testCase := range testCases {
339375
decoder := NewYAMLOrJSONDecoder(bytes.NewReader([]byte(testCase.input)), testCase.buffer)
@@ -348,7 +384,7 @@ func TestYAMLOrJSONDecoder(t *testing.T) {
348384
}
349385
objs = append(objs, out)
350386
}
351-
if err != io.EOF {
387+
if err != io.EOF { //nolint:errorlint
352388
switch {
353389
case testCase.err && err == nil:
354390
t.Errorf("%d: unexpected non-error", i)
@@ -360,12 +396,12 @@ func TestYAMLOrJSONDecoder(t *testing.T) {
360396
continue
361397
}
362398
}
363-
switch decoder.decoder.(type) {
364-
case *YAMLToJSONDecoder:
399+
switch {
400+
case decoder.yaml != nil:
365401
if testCase.isJSON {
366402
t.Errorf("%d: expected JSON decoder, got YAML", i)
367403
}
368-
case *json.Decoder:
404+
case decoder.json != nil:
369405
if !testCase.isJSON {
370406
t.Errorf("%d: expected YAML decoder, got JSON", i)
371407
}
@@ -419,7 +455,7 @@ func testReadLines(t *testing.T, lineLengths []int) {
419455
var readLines [][]byte
420456
for range lines {
421457
bytes, err := lineReader.Read()
422-
if err != nil && err != io.EOF {
458+
if err != nil && err != io.EOF { //nolint:errorlint
423459
t.Fatalf("failed to read lines: %v", err)
424460
}
425461
readLines = append(readLines, bytes)

0 commit comments

Comments
 (0)