@@ -56,6 +56,55 @@ enum TagState {
5656 Exit ,
5757}
5858
/// A reference to an encoding together with information about how it was retrieved.
///
/// Each variant wraps the same kind of payload (a `&'static Encoding`); what
/// differs is the provenance, which decides whether the value may still be
/// replaced later on.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be
    /// refined using a BOM or by the XML declaration event
    /// (`<?xml encoding=... ?>`).
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It cannot be changed
    /// either by a BOM or by parsing the XML declaration
    /// (`<?xml encoding=... ?>`).
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or from the first
    /// bytes of the content. It can be refined by the XML declaration event
    /// (`<?xml encoding=... ?>`).
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event
    /// (`<?xml encoding=... ?>`). It can no longer change.
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        // The payload is a `Copy` reference, so matching on `*self` binds it
        // by value and a single arm can serve every variant.
        match *self {
            Self::Implicit(enc)
            | Self::Explicit(enc)
            | Self::BomDetected(enc)
            | Self::XmlDetected(enc) => enc,
        }
    }

    /// Returns `true` while the encoding is still provisional (`Implicit` or
    /// `BomDetected`) and `false` once it is final (`Explicit` or
    /// `XmlDetected`).
    #[inline]
    fn can_be_refined(&self) -> bool {
        // Kept as an exhaustive match so that adding a variant forces a
        // decision here instead of silently falling into a default.
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
105+
106+ ////////////////////////////////////////////////////////////////////////////////////////////////////
107+
59108/// A low level encoding-agnostic XML event reader.
60109///
61110/// Consumes a `BufRead` and streams XML `Event`s.
@@ -144,11 +193,8 @@ pub struct Reader<R: BufRead> {
144193 pending_pop : bool ,
145194
146195 #[ cfg( feature = "encoding" ) ]
147- /// the encoding specified in the xml, defaults to utf8
148- encoding : & ' static Encoding ,
149- #[ cfg( feature = "encoding" ) ]
150- /// check if quick-rs could find out the encoding
151- is_encoding_set : bool ,
196+ /// Reference to the encoding used to read an XML
197+ encoding : EncodingRef ,
152198}
153199
154200/// Builder methods
@@ -172,9 +218,7 @@ impl<R: BufRead> Reader<R> {
172218 pending_pop : false ,
173219
174220 #[ cfg( feature = "encoding" ) ]
175- encoding : :: encoding_rs:: UTF_8 ,
176- #[ cfg( feature = "encoding" ) ]
177- is_encoding_set : false ,
221+ encoding : EncodingRef :: Implicit ( UTF_8 ) ,
178222 }
179223 }
180224
@@ -412,7 +456,7 @@ impl<R: BufRead> Reader<R> {
412456 pub fn decoder ( & self ) -> Decoder {
413457 Decoder {
414458 #[ cfg( feature = "encoding" ) ]
415- encoding : self . encoding ,
459+ encoding : self . encoding . encoding ( ) ,
416460 }
417461 }
418462}
@@ -683,10 +727,9 @@ impl<R: BufRead> Reader<R> {
683727 {
684728 Ok ( Some ( bytes) ) => {
685729 #[ cfg( feature = "encoding" ) ]
686- if first {
730+ if first && self . encoding . can_be_refined ( ) {
687731 if let Some ( encoding) = detect_encoding ( bytes) {
688- self . encoding = encoding;
689- self . is_encoding_set = true ;
732+ self . encoding = EncodingRef :: BomDetected ( encoding) ;
690733 }
691734 }
692735
@@ -843,9 +886,10 @@ impl<R: BufRead> Reader<R> {
843886
844887 // Try getting encoding from the declaration event
845888 #[ cfg( feature = "encoding" ) ]
846- if let Some ( enc) = event. encoder ( ) {
847- self . encoding = enc;
848- self . is_encoding_set = true ;
889+ if self . encoding . can_be_refined ( ) {
890+ if let Some ( encoding) = event. encoder ( ) {
891+ self . encoding = EncodingRef :: XmlDetected ( encoding) ;
892+ }
849893 }
850894
851895 Ok ( Event :: Decl ( event) )
@@ -905,6 +949,15 @@ impl Reader<BufReader<File>> {
905949impl < ' a > Reader < & ' a [ u8 ] > {
906950 /// Creates an XML reader from a string slice.
907951 pub fn from_str ( s : & ' a str ) -> Self {
952+ // Rust strings are guaranteed to be UTF-8, so lock the encoding
953+ #[ cfg( feature = "encoding" ) ]
954+ {
955+ let mut reader = Self :: from_reader ( s. as_bytes ( ) ) ;
956+ reader. encoding = EncodingRef :: Explicit ( UTF_8 ) ;
957+ reader
958+ }
959+
960+ #[ cfg( not( feature = "encoding" ) ) ]
908961 Self :: from_reader ( s. as_bytes ( ) )
909962 }
910963
@@ -1533,8 +1586,6 @@ impl Decoder {
15331586 /// Copied from [`Encoding::decode_with_bom_removal`]
15341587 #[ inline]
15351588 fn remove_bom < ' b > ( & self , bytes : & ' b [ u8 ] ) -> & ' b [ u8 ] {
1536- use encoding_rs:: * ;
1537-
15381589 if self . encoding == UTF_8 && bytes. starts_with ( b"\xEF \xBB \xBF " ) {
15391590 return & bytes[ 3 ..] ;
15401591 }
@@ -1556,15 +1607,13 @@ impl Decoder {
15561607 pub ( crate ) fn utf8 ( ) -> Self {
15571608 Decoder {
15581609 #[ cfg( feature = "encoding" ) ]
1559- encoding : encoding_rs :: UTF_8 ,
1610+ encoding : UTF_8 ,
15601611 }
15611612 }
15621613
15631614 #[ cfg( feature = "encoding" ) ]
15641615 pub ( crate ) fn utf16 ( ) -> Self {
1565- Decoder {
1566- encoding : encoding_rs:: UTF_16LE ,
1567- }
1616+ Decoder { encoding : UTF_16LE }
15681617 }
15691618}
15701619
@@ -2480,6 +2529,62 @@ mod test {
24802529 ) ;
24812530 }
24822531 }
2532+
2533+ #[ cfg( feature = "encoding" ) ]
2534+ mod encoding {
2535+ use crate :: events:: Event ;
2536+ use crate :: reader:: Reader ;
2537+ use encoding_rs:: { UTF_8 , UTF_16LE , WINDOWS_1251 } ;
2538+ use pretty_assertions:: assert_eq;
2539+
2540+ mod bytes {
2541+ use super :: * ;
2542+ use pretty_assertions:: assert_eq;
2543+
2544+ /// Checks that encoding is detected by BOM and changed after XML declaration
2545+ #[ test]
2546+ fn bom_detected( ) {
2547+ let mut reader = Reader :: from_bytes( b"\xFF \xFE <?xml encoding='windows-1251'?>" ) ;
2548+
2549+ assert_eq!( reader. decoder( ) . encoding( ) , UTF_8 ) ;
2550+ reader. read_event_buffered( $buf) . unwrap( ) ;
2551+ assert_eq!( reader. decoder( ) . encoding( ) , UTF_16LE ) ;
2552+
2553+ reader. read_event_buffered( $buf) . unwrap( ) ;
2554+ assert_eq!( reader. decoder( ) . encoding( ) , WINDOWS_1251 ) ;
2555+
2556+ assert_eq!( reader. read_event_buffered( $buf) . unwrap( ) , Event :: Eof ) ;
2557+ }
2558+
2559+ /// Checks that encoding is changed by XML declaration, but only once
2560+ #[ test]
2561+ fn xml_declaration( ) {
2562+ let mut reader = Reader :: from_bytes( b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>" ) ;
2563+
2564+ assert_eq!( reader. decoder( ) . encoding( ) , UTF_8 ) ;
2565+ reader. read_event_buffered( $buf) . unwrap( ) ;
2566+ assert_eq!( reader. decoder( ) . encoding( ) , UTF_16LE ) ;
2567+
2568+ reader. read_event_buffered( $buf) . unwrap( ) ;
2569+ assert_eq!( reader. decoder( ) . encoding( ) , UTF_16LE ) ;
2570+
2571+ assert_eq!( reader. read_event_buffered( $buf) . unwrap( ) , Event :: Eof ) ;
2572+ }
2573+ }
2574+
2575+ /// Checks that XML declaration cannot change the encoding from UTF-8 if
2576+ /// a `Reader` was created using `from_str` method
2577+ #[ test]
2578+ fn str_always_has_utf8( ) {
2579+ let mut reader = Reader :: from_str( "<?xml encoding='UTF-16'?>" ) ;
2580+
2581+ assert_eq!( reader. decoder( ) . encoding( ) , UTF_8 ) ;
2582+ reader. read_event_buffered( $buf) . unwrap( ) ;
2583+ assert_eq!( reader. decoder( ) . encoding( ) , UTF_8 ) ;
2584+
2585+ assert_eq!( reader. read_event_buffered( $buf) . unwrap( ) , Event :: Eof ) ;
2586+ }
2587+ }
24832588 } ;
24842589 }
24852590
0 commit comments