1+ #include < cstdint>
12#include < iostream>
23
34#include " stream.h"
67#define YAML_PREFETCH_SIZE 2048
78#endif
89
9- #define S_ARRAY_SIZE (A ) (sizeof (A) / sizeof (*(A)))
10- #define S_ARRAY_END (A ) ((A) + S_ARRAY_SIZE(A))
11-
1210#define CP_REPLACEMENT_CHARACTER (0xFFFD )
1311
1412namespace YAML {
// States of the table-driven state machine that the Stream constructor steps
// through while reading the first bytes of the input to pick a character set.
// The enumerator values are used directly as row indices into
// s_introFinalState, s_introTransitions and s_introUngetCount, so the
// declaration order here must stay in sync with the row order of those tables.
15- enum UtfIntroState {
16- uis_start,
17- uis_utfbe_b1,
18- uis_utf32be_b2,
19- uis_utf32be_bom3,
20- uis_utf32be,
21- uis_utf16be,
22- uis_utf16be_bom1,
23- uis_utfle_bom1,
24- uis_utf16le_bom2,
25- uis_utf32le_bom3,
26- uis_utf16le,
27- uis_utf32le,
28- uis_utf8_imp,
29- uis_utf16le_imp,
30- uis_utf32le_imp3,
31- uis_utf8_bom1,
32- uis_utf8_bom2,
33- uis_utf8,
34- uis_error
35- };
36-
// Category of a single leading input byte, as returned by IntroCharTypeOf.
// The values index the columns of s_introTransitions and s_introUngetCount.
// uictMax is not a category: it is the number of categories and sizes the
// second dimension of those tables.
37- enum UtfIntroCharType {
38- uict00,
39- uictBB,
40- uictBF,
41- uictEF,
42- uictFE,
43- uictFF,
44- uictAscii,
45- uictOther,
46- uictMax
47- };
48-
// Indexed by UtfIntroState: true when reaching that state ends character-set
// detection. The detection loop in the Stream constructor keeps reading bytes
// only while the entry for the current state is false.
49- static bool s_introFinalState[] = {
50- false , // uis_start
51- false , // uis_utfbe_b1
52- false , // uis_utf32be_b2
53- false , // uis_utf32be_bom3
54- true , // uis_utf32be
55- true , // uis_utf16be
56- false , // uis_utf16be_bom1
57- false , // uis_utfle_bom1
58- false , // uis_utf16le_bom2
59- false , // uis_utf32le_bom3
60- true , // uis_utf16le
61- true , // uis_utf32le
62- false , // uis_utf8_imp
63- false , // uis_utf16le_imp
64- false , // uis_utf32le_imp3
65- false , // uis_utf8_bom1
66- false , // uis_utf8_bom2
67- true , // uis_utf8
68- true , // uis_error
69- };
70-
// Transition table of the detection state machine:
// s_introTransitions[state][charType] is the state entered after reading a
// byte of category charType while in `state`. Rows follow the UtfIntroState
// declaration order and columns follow the UtfIntroCharType order (see the
// column legend below). The table has 18 rows, uis_start through uis_utf8;
// there is no row for uis_error.
71- static UtfIntroState s_introTransitions[][uictMax] = {
72- // uict00, uictBB, uictBF, uictEF,
73- // uictFE, uictFF, uictAscii, uictOther
74- {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
75- uis_utfle_bom1, uis_utf8_imp, uis_utf8},
76- {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
77- uis_utf16be, uis_utf8},
78- {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
79- uis_utf8, uis_utf8},
80- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
81- uis_utf8},
82- {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
83- uis_utf32be, uis_utf32be, uis_utf32be},
84- {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
85- uis_utf16be, uis_utf16be, uis_utf16be},
86- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
87- uis_utf8},
88- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
89- uis_utf8, uis_utf8},
90- {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
91- uis_utf16le, uis_utf16le, uis_utf16le},
92- {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
93- uis_utf16le, uis_utf16le, uis_utf16le},
94- {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
95- uis_utf16le, uis_utf16le, uis_utf16le},
96- {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
97- uis_utf32le, uis_utf32le, uis_utf32le},
98- {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
99- uis_utf8, uis_utf8},
100- {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
101- uis_utf16le, uis_utf16le, uis_utf16le},
102- {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
103- uis_utf16le, uis_utf16le, uis_utf16le},
104- {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
105- uis_utf8},
106- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
107- uis_utf8},
108- {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
109- uis_utf8},
110- };
111-
// s_introUngetCount[state][charType] is how many of the bytes read so far the
// Stream constructor puts back into the input for that (state, charType)
// pair; a count of 0 means the bytes are consumed. Rows follow the
// UtfIntroState declaration order (two rows per source line below) and
// columns follow the UtfIntroCharType order.
112- static char s_introUngetCount[][uictMax] = {
113- // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
114- {0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 }, {0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 },
115- {3 , 3 , 3 , 3 , 0 , 3 , 3 , 3 }, {4 , 4 , 4 , 4 , 4 , 0 , 4 , 4 },
116- {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 }, {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 },
117- {2 , 2 , 2 , 2 , 2 , 0 , 2 , 2 }, {2 , 2 , 2 , 2 , 0 , 2 , 2 , 2 },
118- {0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 }, {0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 },
119- {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 }, {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 },
120- {0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 }, {0 , 3 , 3 , 3 , 3 , 3 , 3 , 3 },
121- {4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 }, {2 , 0 , 2 , 2 , 2 , 2 , 2 , 2 },
122- {3 , 3 , 0 , 3 , 3 , 3 , 3 , 3 }, {1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 },
123- };
124-
// Maps one value obtained from the input stream to the UtfIntroCharType
// category used to index the detection tables.
//
// ch: a value as returned by std::istream::get(), i.e. a byte value or
//     traits_type::eof().
// Returns uictOther for end-of-file, a dedicated category for the bytes 0x00,
// 0xBB, 0xBF, 0xEF, 0xFE and 0xFF, uictAscii for every other value strictly
// between 0 and 0xFF, and uictOther for anything else.
125- inline UtfIntroCharType IntroCharTypeOf (std::istream::int_type ch) {
// End-of-file is tested first so it cannot be confused with a byte value.
126- if (std::istream::traits_type::eof () == ch) {
127- return uictOther;
128- }
129-
// Bytes that can be part of a byte-order mark, plus the zero byte, each get
// a category of their own.
130- switch (ch) {
131- case 0 :
132- return uict00;
133- case 0xBB :
134- return uictBB;
135- case 0xBF :
136- return uictBF;
137- case 0xEF :
138- return uictEF;
139- case 0xFE :
140- return uictFE;
141- case 0xFF :
142- return uictFF;
143- }
144-
// Despite its name, the uictAscii bucket is not limited to 7-bit values: the
// range test below also accepts 0x80..0xFE (minus the bytes handled above).
145- if ((ch > 0 ) && (ch < 0xFF )) {
146- return uictAscii;
147- }
148-
149- return uictOther;
150- }
15113
15214inline char Utf8Adjust (unsigned long ch, unsigned char lead_bits,
15315 unsigned char rshift) {
@@ -182,6 +44,58 @@ inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
18244 }
18345}
18446
// Determines the character set from the first bytes of the input and stores
// it in m_charSet.
//
// bom:  the leading bytes of the input.
// size: how many entries of `bom` are valid; each group of checks below is
//       guarded so that it only reads indices smaller than `size`.
// Returns the number of leading bytes that form a byte-order mark (4, 2 or 3),
// or 0 when the character set was guessed from the byte pattern or defaulted
// to utf8. The Stream constructor skips that many bytes before buffering the
// remainder.
47+ uint8_t Stream::CheckBOM (const unsigned char * bom, uint8_t size) {
// The four-byte patterns are tested before the two-byte ones, so FF FE 00 00
// is reported as utf32le rather than matching the FF FE utf16le mark.
48+ if (size >= 4 ) {
// 00 00 FE FF: explicit UTF-32 big-endian mark.
49+ if (bom[0 ] == 0x00 && bom[1 ] == 0x00 && bom[2 ] == 0xFE && bom[3 ] == 0xFF ) {
50+ m_charSet = utf32be;
51+ return 4 ;
52+ }
// 00 00 00 xx: no mark, guessed as UTF-32 big-endian; nothing is skipped.
53+ if (bom[0 ] == 0x00 && bom[1 ] == 0x00 && bom[2 ] == 0x00 ) {
54+ m_charSet = utf32be;
55+ return 0 ;
56+ }
57+
// FF FE 00 00: explicit UTF-32 little-endian mark.
58+ if (bom[0 ] == 0xFF && bom[1 ] == 0xFE && bom[2 ] == 0x00 && bom[3 ] == 0x00 ) {
59+ m_charSet = utf32le;
60+ return 4 ;
61+ }
// xx 00 00 00: no mark, guessed as UTF-32 little-endian.
62+ if (bom[1 ] == 0x00 && bom[2 ] == 0x00 && bom[3 ] == 0x00 ) {
63+ m_charSet = utf32le;
64+ return 0 ;
65+ }
66+ }
67+
68+ if (size >= 2 ) {
// FE FF: explicit UTF-16 big-endian mark.
69+ if (bom[0 ] == 0xFE && bom[1 ] == 0xFF ) {
70+ m_charSet = utf16be;
71+ return 2 ;
72+ }
// 00 xx: no mark, guessed as UTF-16 big-endian.
73+ if (bom[0 ] == 0x00 ) {
74+ m_charSet = utf16be;
75+ return 0 ;
76+ }
77+
// FF FE: explicit UTF-16 little-endian mark.
78+ if (bom[0 ] == 0xFF && bom[1 ] == 0xFE ) {
79+ m_charSet = utf16le;
80+ return 2 ;
81+ }
// xx 00: no mark, guessed as UTF-16 little-endian.
82+ if (bom[1 ] == 0x00 ) {
83+ m_charSet = utf16le;
84+ return 0 ;
85+ }
86+ }
87+
88+ if (size >= 3 ) {
// EF BB BF: explicit UTF-8 mark.
89+ if (bom[0 ] == 0xEF && bom[1 ] == 0xBB && bom[2 ] == 0xBF ) {
90+ m_charSet = utf8;
91+ return 3 ;
92+ }
93+ }
94+
// Nothing matched (or fewer than two bytes were available): default to utf8
// and skip nothing.
95+ m_charSet = utf8;
96+ return 0 ;
97+ }
98+
18599Stream::Stream (std::istream& input)
186100 : m_input(input),
187101 m_mark{},
@@ -190,52 +104,28 @@ Stream::Stream(std::istream& input)
190104 m_pPrefetched (new unsigned char [YAML_PREFETCH_SIZE]),
191105 m_nPrefetchedAvailable (0 ),
192106 m_nPrefetchedUsed (0 ) {
193- using char_traits = std::istream::traits_type;
194107
195108 if (!input)
196109 return ;
197110
198111 // Determine (or guess) the character-set by reading the BOM, if any. See
199112 // the YAML specification for the determination algorithm.
200- char_traits::int_type intro[4 ]{};
201- int nIntroUsed = 0 ;
202- UtfIntroState state = uis_start;
203- for (; !s_introFinalState[state];) {
204- std::istream::int_type ch = input.get ();
205- intro[nIntroUsed++] = ch;
206- UtfIntroCharType charType = IntroCharTypeOf (ch);
207- UtfIntroState newState = s_introTransitions[state][charType];
208- int nUngets = s_introUngetCount[state][charType];
209- if (nUngets > 0 ) {
113+ unsigned char buffer[4 ];
114+ uint8_t size = 4 ;
115+ for (uint8_t i = 0 ; i < 4 ; i++) {
116+ buffer[i] = input.get ();
117+ if (!input.good ()) {
210118 input.clear ();
211- for (; nUngets > 0 ; --nUngets) {
212- if (char_traits::eof () != intro[--nIntroUsed])
213- input.putback (char_traits::to_char_type (intro[nIntroUsed]));
214- }
119+ size = i;
120+ break ;
215121 }
216- state = newState;
217122 }
218-
219- switch (state) {
220- case uis_utf8:
221- m_charSet = utf8;
222- break ;
223- case uis_utf16le:
224- m_charSet = utf16le;
225- break ;
226- case uis_utf16be:
227- m_charSet = utf16be;
228- break ;
229- case uis_utf32le:
230- m_charSet = utf32le;
231- break ;
232- case uis_utf32be:
233- m_charSet = utf32be;
234- break ;
235- default :
236- m_charSet = utf8;
237- break ;
123+ auto bom_size = CheckBOM (buffer, size);
124+ size -= bom_size;
125+ for (uint8_t i = 0 ; i < size; i++) {
126+ m_pPrefetched[i] = buffer[bom_size + i];
238127 }
128+ m_nPrefetchedAvailable = size;
239129
240130 ReadAheadTo (0 );
241131}
@@ -409,11 +299,8 @@ unsigned char Stream::GetNextByte() const {
409299 m_nPrefetchedAvailable = static_cast <std::size_t >(
410300 pBuf->sgetn (ReadBuffer (m_pPrefetched), YAML_PREFETCH_SIZE));
411301 m_nPrefetchedUsed = 0 ;
412- if (! m_nPrefetchedAvailable) {
302+ if (m_nPrefetchedAvailable == 0 ) {
413303 m_input.setstate (std::ios_base::eofbit);
414- }
415-
416- if (0 == m_nPrefetchedAvailable) {
417304 return 0 ;
418305 }
419306 }
0 commit comments