1- // Copyright (c) 2014-2015 Dr. Colin Hirsch and Daniel Frey
1+ // Copyright (c) 2014-2016 Dr. Colin Hirsch and Daniel Frey
22// Please see LICENSE for license or visit https://github.com/ColinH/PEGTL/
33
44#ifndef TAOCPP_JSON_EMBEDDED_PEGTL_CONTRIB_UNESCAPE_HH
@@ -27,20 +27,20 @@ namespace tao_json_pegtl
2727 string += char ( utf32 & 0xff );
2828 return true ;
2929 }
30- else if ( utf32 <= 0x7ff ) {
30+ if ( utf32 <= 0x7ff ) {
3131 char tmp[] = { char ( ( ( utf32 & 0x7c0 ) >> 6 ) | 0xc0 ),
3232 char ( ( ( utf32 & 0x03f ) ) | 0x80 ) };
3333 string.append ( tmp, sizeof ( tmp ) );
3434 return true ;
3535 }
36- else if ( utf32 <= 0xffff ) {
36+ if ( utf32 <= 0xffff ) {
3737 char tmp[] = { char ( ( ( utf32 & 0xf000 ) >> 12 ) | 0xe0 ),
3838 char ( ( ( utf32 & 0x0fc0 ) >> 6 ) | 0x80 ),
3939 char ( ( ( utf32 & 0x003f ) ) | 0x80 ) };
4040 string.append ( tmp, sizeof ( tmp ) );
4141 return true ;
4242 }
43- else if ( utf32 <= 0x10ffff ) {
43+ if ( utf32 <= 0x10ffff ) {
4444 char tmp[] = { char ( ( ( utf32 & 0x1c0000 ) >> 18 ) | 0xf0 ),
4545 char ( ( ( utf32 & 0x03f000 ) >> 12 ) | 0x80 ),
4646 char ( ( ( utf32 & 0x000fc0 ) >> 6 ) | 0x80 ),
@@ -63,7 +63,7 @@ namespace tao_json_pegtl
6363 case ' A' : case ' B' : case ' C' : case ' D' : case ' E' : case ' F' :
6464 return I ( c - ' A' + 10 );
6565 }
66- assert ( false ); // LCOV_EXCL_LINE
66+ throw std::runtime_error ( " invalid character in unhex " ); // LCOV_EXCL_LINE
6767 }
6868
6969 template < typename I >
@@ -88,7 +88,7 @@ namespace tao_json_pegtl
8888 }
8989 };
9090
91- // This function MUST be called for a character matching T which must be tao_json_pegtl::one< ... >.
91+ // This action MUST be called for a character matching T which MUST be tao_json_pegtl::one< ... >.
9292 template < typename T, char ... Rs >
9393 struct unescape_c
9494 {
@@ -113,12 +113,13 @@ namespace tao_json_pegtl
113113 return * ( r.begin () + i );
114114 }
115115 }
116- assert ( false ); // LCOV_EXCL_LINE
116+ throw std::runtime_error ( " invalid character in unescape " ); // LCOV_EXCL_LINE
117117 }
118118 };
119119
120- // See examples/unescape.cc to see why the following two actions
121- // have the convenience of skipping the first input character...
120+ // See examples/unescape.cc for why the following two actions
121+ // skip the first input character. They also MUST be called
122+ // with non-empty matched inputs!
122123
123124 struct unescape_u
124125 {
@@ -142,17 +143,20 @@ namespace tao_json_pegtl
142143 }
143144 };
144145
145- // Like unescape_u, but (a) assumes 4 hexdigits per code point,
146- // and (b) accepts multiple consecutive escaped 16-bit values.
147- // It encodes UTF-16 surrogate pairs as single UTF-8 sequence
148- // as required for JSON by RFC 7159.
146+ // The unescape_j action is similar to unescape_u; however, unlike
147+ // unescape_u, it
148+ // (a) assumes exactly 4 hex digits per escape sequence, and
149+ // (b) accepts multiple consecutive escaped 16-bit values.
150+ // When applied to more than one escape sequence, unescape_j
151+ // translates each UTF-16 surrogate pair in the input into a single
152+ // UTF-8 sequence in st.unescaped, as required for JSON by RFC 7159.
149153
150154 struct unescape_j
151155 {
152156 template < typename Input, typename State >
153157 static void apply ( const Input & in, State & st )
154158 {
155- assert ( ( ( in.size () + 1 ) % 6 ) == 0 ); // Expects multiple "\\u1234" with the first backslash already skipped .
159+ assert ( ( ( in.size () + 1 ) % 6 ) == 0 ); // Expects multiple "\\u1234", starting with the first "u" .
156160 for ( const char * b = in.begin () + 1 ; b < in.end (); b += 6 ) {
157161 const auto c = unhex_string< unsigned >( b, b + 4 );
158162 if ( ( 0xd800 <= c ) && ( c <= 0xdbff ) && ( b + 6 < in.end () ) ) {
0 commit comments