@@ -53,54 +53,81 @@ namespace Rcpp {
5353 typedef internal::const_string_proxy<STRSXP> const_StringProxy;
5454
5555 /* * default constructor */
56- String ( ): data( Rf_mkChar(" " ) ), buffer(), valid(true ), buffer_ready(true ) {
56+ String ( ): data( Rf_mkChar(" " ) ), buffer(), valid(true ), buffer_ready(true ), enc(CE_NATIVE) {
5757 RCPP_STRING_DEBUG ( " String()" ) ;
5858 }
5959
6060 /* * copy constructor */
61- String ( const String& other) : data( other.get_sexp()), valid(true ), buffer_ready(false ) {
61+ String ( const String& other) : data( other.get_sexp()), valid(true ), buffer_ready(false ),
62+ enc (Rf_getCharCE(other.get_sexp())) {
63+ RCPP_STRING_DEBUG ( " String(const String&)" ) ;
64+ }
65+
66+ String ( const String& other, const char * enc) : data( other.get_sexp()), valid(true ), buffer_ready(false ) {
67+ set_encoding (enc);
6268 RCPP_STRING_DEBUG ( " String(const String&)" ) ;
6369 }
6470
6571 /* * construct a string from a single CHARSXP SEXP */
66- String (SEXP charsxp) : data(charsxp), valid(true ), buffer_ready(false ) {
72+ String (SEXP charsxp) : data(charsxp), valid(true ), buffer_ready(false ),
73+ enc(Rf_getCharCE(charsxp)) {
74+ RCPP_STRING_DEBUG ( " String(SEXP)" ) ;
75+ }
76+
77+ String (SEXP charsxp, const char * enc) : data(charsxp), valid(true ), buffer_ready(false ) {
78+ set_encoding (enc);
6779 RCPP_STRING_DEBUG ( " String(SEXP)" ) ;
6880 }
6981
7082 /* * from string proxy */
71- String ( const StringProxy& proxy ): data( proxy.get() ), valid(true ), buffer_ready(false ){
83+ String ( const StringProxy& proxy ): data( proxy.get() ), valid(true ), buffer_ready(false ),
84+ enc(Rf_getCharCE(proxy.get())){
7285 RCPP_STRING_DEBUG ( " String( const StringProxy&)" ) ;
7386 }
87+
88+ String ( const StringProxy& proxy, const char * enc ): data( proxy.get() ), valid(true ), buffer_ready(false ) {
89+ set_encoding (enc);
90+ RCPP_STRING_DEBUG ( " String( const StringProxy&)" ) ;
91+ }
92+
7493 /* * from string proxy */
75- String ( const const_StringProxy& proxy ): data( proxy.get() ), valid(true ), buffer_ready(false ){
94+ String ( const const_StringProxy& proxy ): data( proxy.get() ), valid(true ), buffer_ready(false ),
95+ enc(Rf_getCharCE(proxy.get())){
96+ RCPP_STRING_DEBUG ( " String( const const_StringProxy&)" ) ;
97+ }
98+
99+ String ( const const_StringProxy& proxy, const char * enc ): data( proxy.get() ), valid(true ), buffer_ready(false ) {
100+ set_encoding (enc);
76101 RCPP_STRING_DEBUG ( " String( const const_StringProxy&)" ) ;
77102 }
78103
79104 /* * from a std::string */
80- String ( const std::string& s) : buffer(s), valid(false ), buffer_ready(true ) {
105+ String ( const std::string& s) : buffer(s), valid(false ), buffer_ready(true ),
106+ enc(CE_NATIVE) {
81107 RCPP_STRING_DEBUG ( " String(const std::string& )" ) ;
82108 }
83109
84- String ( const std::wstring& s) : data(internal::make_charsexp(s)), valid(true ), buffer_ready(false ) {
110+ String ( const std::wstring& s) : data(internal::make_charsexp(s)), valid(true ), buffer_ready(false ),
111+ enc(CE_NATIVE) {
85112 RCPP_STRING_DEBUG ( " String(const std::wstring& )" ) ;
86113 }
87114
88115 /* * from a const char* */
89- String ( const char * s) : buffer(s), valid(false ), buffer_ready(true ){
116+ String ( const char * s) : buffer(s), valid(false ), buffer_ready(true ), enc(CE_NATIVE) {
90117 RCPP_STRING_DEBUG ( " String(const char*)" ) ;
91118 }
92119
93- String ( const wchar_t * s) : data(internal::make_charsexp(s)), valid(true ), buffer_ready(false ) {
120+ String ( const wchar_t * s) : data(internal::make_charsexp(s)), valid(true ), buffer_ready(false ),
121+ enc(CE_NATIVE) {
94122 RCPP_STRING_DEBUG ( " String(const wchar_t* s)" ) ;
95123 }
96124
97-
98125 /* * constructors from R primitives */
99- String ( int x ) : data( internal::r_coerce<INTSXP,STRSXP>(x) ), valid(true ), buffer_ready(false ) {}
100- String ( double x ) : data( internal::r_coerce<REALSXP,STRSXP>(x) ), valid(true ), buffer_ready(false ){}
101- String ( bool x ) : data( internal::r_coerce<LGLSXP,STRSXP>(x) ), valid( true ) , buffer_ready(false ){}
102- String ( Rcomplex x ) : data( internal::r_coerce<CPLXSXP,STRSXP>(x) ), valid( true ), buffer_ready(false ){}
103- String ( Rbyte x ) : data( internal::r_coerce<RAWSXP,STRSXP>(x) ), valid(true ), buffer_ready(false ){}
126+ String ( int x ) : data( internal::r_coerce<INTSXP,STRSXP>(x) ), valid(true ), buffer_ready(false ), enc(CE_NATIVE) {}
127+ String ( double x ) : data( internal::r_coerce<REALSXP,STRSXP>(x) ), valid(true ), buffer_ready(false ), enc(CE_NATIVE) {}
128+ String ( bool x ) : data( internal::r_coerce<LGLSXP,STRSXP>(x) ), valid( true ) , buffer_ready(false ), enc(CE_NATIVE) {}
129+ String ( Rcomplex x ) : data( internal::r_coerce<CPLXSXP,STRSXP>(x) ), valid( true ), buffer_ready(false ), enc(CE_NATIVE) {}
130+ String ( Rbyte x ) : data( internal::r_coerce<RAWSXP,STRSXP>(x) ), valid(true ), buffer_ready(false ), enc(CE_NATIVE) {}
104131
105132
106133 inline String& operator =( int x ){ data = internal::r_coerce<INTSXP ,STRSXP>( x ) ; valid = true ; buffer_ready = false ; return *this ; }
@@ -127,13 +154,13 @@ namespace Rcpp {
127154 inline String& operator =( const std::wstring& s){ return assign_wide_string (s) ; }
128155 inline String& operator =( const wchar_t * s){ return assign_wide_string (s) ; }
129156
130-
131157 inline String& operator +=( const std::string& s){
132158 RCPP_STRING_DEBUG ( " String::operator+=( std::string )" ) ;
133159 if ( is_na () ) return *this ;
134160 setBuffer () ; buffer += s ; valid = false ;
135161 return *this ;
136162 }
163+
137164 inline String& operator +=( const char * s){
138165 RCPP_STRING_DEBUG ( " String::operator+=( const char*)" ) ;
139166 if ( is_na () ) return *this ;
@@ -214,8 +241,6 @@ namespace Rcpp {
214241 return replace_first ( s.get_cstring (), news.get_cstring () ) ;
215242 }
216243
217-
218-
219244 inline String& replace_last ( const char * s, const char * news ){
220245 RCPP_STRING_DEBUG_2 ( " String::replace_last( const char* = '%s' , const char* = '%s')" , s, news ) ;
221246 if ( is_na () ) return *this ;
@@ -312,7 +337,7 @@ namespace Rcpp {
312337
313338 inline SEXP get_sexp () const {
314339 RCPP_STRING_DEBUG_1 ( " String::get_sexp const ( valid = %d) " , valid ) ;
315- return valid ? data : Rf_mkChar ( buffer.c_str () ) ;
340+ return valid ? data : Rf_mkCharCE ( buffer.c_str (), enc ) ;
316341 }
317342
318343 inline SEXP get_sexp () {
@@ -333,6 +358,39 @@ namespace Rcpp {
333358 inline const char * get_cstring () const {
334359 return buffer_ready ? buffer.c_str () : CHAR (data) ;
335360 }
361+
362+ inline const char * get_encoding () const {
363+ switch (enc) {
364+ case CE_BYTES:
365+ return " bytes" ;
366+ case CE_LATIN1:
367+ return " latin1" ;
368+ case CE_UTF8:
369+ return " UTF-8" ;
370+ default :
371+ return " unknown" ;
372+ }
373+ }
374+
375+ inline void set_encoding ( cetype_t encoding ) {
376+ enc = encoding;
377+ if (data != NULL )
378+ data = Rf_mkCharCE (Rf_translateCharUTF8 (data), enc);
379+ }
380+
381+ inline void set_encoding (const char * encoding) {
382+ if ( encoding == " bytes" ) {
383+ enc = CE_BYTES;
384+ } else if ( encoding == " latin1" ) {
385+ enc = CE_LATIN1;
386+ } else if ( encoding == " UTF-8" ) {
387+ enc = CE_UTF8;
388+ } else {
389+ enc = CE_ANY;
390+ Rcout << " Unknown encoding" << std::endl;
391+ }
392+ set_encoding (enc);
393+ }
336394
337395 bool operator <( const Rcpp::String& other ) const {
338396 return strcmp ( get_cstring (), other.get_cstring () ) < 0 ;
@@ -353,6 +411,9 @@ namespace Rcpp {
353411
354412 /* * the CHARSXP this String encapsulates */
355413 SEXP data ;
414+
415+ /* * the encoding of encapsulated CHARSXP */
416+ cetype_t enc;
356417
357418 /* * a buffer used to do string operations without going back to the SEXP */
358419 std::string buffer ;
@@ -373,7 +434,7 @@ namespace Rcpp {
373434 inline void setData (){
374435 RCPP_STRING_DEBUG ( " setData" ) ;
375436 if (!valid) {
376- data = Rf_mkChar (buffer.c_str ()) ;
437+ data = Rf_mkCharCE (buffer.c_str (), enc ) ;
377438 valid = true ;
378439 }
379440 }
0 commit comments