Skip to content

Commit 04a360a

Browse files
committed
test(xml): add test cases regarding XML encoding
1 parent fac18d5 commit 04a360a

File tree

4 files changed

+93
-6
lines changed

4 files changed

+93
-6
lines changed

internal/reader/xml/decoder.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,23 @@ func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
2121
buffer := &bytes.Buffer{}
2222
io.Copy(buffer, data)
2323

24-
enc := getEncoding(buffer.Bytes())
25-
if enc == nil || bytes.EqualFold(enc, []byte("utf-8")) {
26-
// filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
24+
if hasUTF8XMLDeclaration(buffer.Bytes()) {
25+
// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
26+
// For now we just expect the invalid characters to be stripped out.
27+
28+
// Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
2729
filteredBytes := filterValidXMLChars(buffer.Bytes())
30+
2831
decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
2932
} else {
3033
data.Seek(0, io.SeekStart)
31-
// invalid characters will be filtered later via decoder.CharsetReader
3234
decoder = xml.NewDecoder(data)
35+
36+
// The XML document will be converted to UTF-8 by encoding.CharsetReader
37+
// Invalid characters will be filtered later via decoder.CharsetReader
38+
decoder.CharsetReader = charsetReaderFilterInvalidUtf8
3339
}
3440

35-
decoder.CharsetReader = charsetReaderFilterInvalidUtf8
3641
decoder.Entity = xml.HTMLEntity
3742
decoder.Strict = false
3843

@@ -46,7 +51,7 @@ func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader,
4651
}
4752
rawData, err := io.ReadAll(utf8Reader)
4853
if err != nil {
49-
return nil, fmt.Errorf("encoding: unable to read data: %w", err)
54+
return nil, fmt.Errorf("xml: unable to read data: %w", err)
5055
}
5156
filteredBytes := filterValidXMLChars(rawData)
5257
return bytes.NewReader(filteredBytes), nil
@@ -110,3 +115,8 @@ func getEncoding(b []byte) []byte {
110115
}
111116
return v[1 : idx+1]
112117
}
118+
119+
func hasUTF8XMLDeclaration(data []byte) bool {
120+
enc := getEncoding(data)
121+
return enc == nil || bytes.EqualFold(enc, []byte("utf-8"))
122+
}

internal/reader/xml/decoder_test.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,78 @@ package xml // import "miniflux.app/v2/internal/reader/xml"
66
import (
77
"encoding/xml"
88
"fmt"
9+
"os"
910
"strings"
1011
"testing"
1112
"unicode/utf8"
1213
)
1314

15+
func TestXMLDocumentWithISO88591Encoding(t *testing.T) {
16+
fp, err := os.Open("testdata/iso88591.xml")
17+
if err != nil {
18+
t.Fatal(err)
19+
}
20+
defer fp.Close()
21+
22+
type myXMLDocument struct {
23+
XMLName xml.Name `xml:"note"`
24+
To string `xml:"to"`
25+
From string `xml:"from"`
26+
}
27+
28+
var doc myXMLDocument
29+
30+
decoder := NewXMLDecoder(fp)
31+
err = decoder.Decode(&doc)
32+
if err != nil {
33+
t.Fatal(err)
34+
}
35+
36+
expectedTo := "Anaïs"
37+
expectedFrom := "Jürgen"
38+
39+
if doc.To != expectedTo {
40+
t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To)
41+
}
42+
if doc.From != expectedFrom {
43+
t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From)
44+
}
45+
}
46+
47+
func TestXMLDocumentWithISO88591FileEncodingButUTF8Prolog(t *testing.T) {
48+
fp, err := os.Open("testdata/iso88591_utf8_mismatch.xml")
49+
if err != nil {
50+
t.Fatal(err)
51+
}
52+
defer fp.Close()
53+
54+
type myXMLDocument struct {
55+
XMLName xml.Name `xml:"note"`
56+
To string `xml:"to"`
57+
From string `xml:"from"`
58+
}
59+
60+
var doc myXMLDocument
61+
62+
decoder := NewXMLDecoder(fp)
63+
err = decoder.Decode(&doc)
64+
if err != nil {
65+
t.Fatal(err)
66+
}
67+
68+
// TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
69+
// For now we just expect the invalid characters to be stripped out.
70+
expectedTo := "Anas"
71+
expectedFrom := "Jrgen"
72+
73+
if doc.To != expectedTo {
74+
t.Errorf(`Incorrect "to" field, expected: %q, got: %q`, expectedTo, doc.To)
75+
}
76+
if doc.From != expectedFrom {
77+
t.Errorf(`Incorrect "from" field, expected: %q, got: %q`, expectedFrom, doc.From)
78+
}
79+
}
80+
1481
func TestXMLDocumentWithIllegalUnicodeCharacters(t *testing.T) {
1582
type myxml struct {
1683
XMLName xml.Name `xml:"rss"`
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<?xml version="1.0" encoding="iso8859-1"?>
2+
<note>
3+
<to>Anaïs</to>
4+
<from>Jürgen</from>
5+
</note>
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<note>
3+
<to>Anaïs</to>
4+
<from>Jürgen</from>
5+
</note>

0 commit comments

Comments
 (0)