Skip to content

Commit 5d04d4f

Browse files
author
thirdwing
committed
set encoding explicitly in String ctor
1 parent 837ad76 commit 5d04d4f

File tree

1 file changed

+81
-20
lines changed

1 file changed

+81
-20
lines changed

inst/include/Rcpp/String.h

Lines changed: 81 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -53,54 +53,81 @@ namespace Rcpp {
5353
typedef internal::const_string_proxy<STRSXP> const_StringProxy;
5454

5555
/** default constructor */
56-
String( ): data( Rf_mkChar("") ), buffer(), valid(true), buffer_ready(true) {
56+
String( ): data( Rf_mkChar("") ), buffer(), valid(true), buffer_ready(true), enc(CE_NATIVE) {
5757
RCPP_STRING_DEBUG( "String()" ) ;
5858
}
5959

6060
/** copy constructor */
61-
String( const String& other) : data( other.get_sexp()), valid(true), buffer_ready(false) {
61+
String( const String& other) : data( other.get_sexp()), valid(true), buffer_ready(false),
62+
enc(Rf_getCharCE(other.get_sexp())) {
63+
RCPP_STRING_DEBUG( "String(const String&)" ) ;
64+
}
65+
66+
String( const String& other, const char * enc) : data( other.get_sexp()), valid(true), buffer_ready(false) {
67+
set_encoding(enc);
6268
RCPP_STRING_DEBUG( "String(const String&)" ) ;
6369
}
6470

6571
/** construct a string from a single CHARSXP SEXP */
66-
String(SEXP charsxp) : data(charsxp), valid(true), buffer_ready(false) {
72+
String(SEXP charsxp) : data(charsxp), valid(true), buffer_ready(false),
73+
enc(Rf_getCharCE(charsxp)) {
74+
RCPP_STRING_DEBUG( "String(SEXP)" ) ;
75+
}
76+
77+
String(SEXP charsxp, const char * enc) : data(charsxp), valid(true), buffer_ready(false) {
78+
set_encoding(enc);
6779
RCPP_STRING_DEBUG( "String(SEXP)" ) ;
6880
}
6981

7082
/** from string proxy */
71-
String( const StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false){
83+
String( const StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false),
84+
enc(Rf_getCharCE(proxy.get())){
7285
RCPP_STRING_DEBUG( "String( const StringProxy&)" ) ;
7386
}
87+
88+
String( const StringProxy& proxy, const char * enc ): data( proxy.get() ), valid(true), buffer_ready(false) {
89+
set_encoding(enc);
90+
RCPP_STRING_DEBUG( "String( const StringProxy&)" ) ;
91+
}
92+
7493
/** from string proxy */
75-
String( const const_StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false){
94+
String( const const_StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false),
95+
enc(Rf_getCharCE(proxy.get())){
96+
RCPP_STRING_DEBUG( "String( const const_StringProxy&)" ) ;
97+
}
98+
99+
String( const const_StringProxy& proxy, const char * enc ): data( proxy.get() ), valid(true), buffer_ready(false) {
100+
set_encoding(enc);
76101
RCPP_STRING_DEBUG( "String( const const_StringProxy&)" ) ;
77102
}
78103

79104
/** from a std::string */
80-
String( const std::string& s) : buffer(s), valid(false), buffer_ready(true) {
105+
String( const std::string& s) : buffer(s), valid(false), buffer_ready(true),
106+
enc(CE_NATIVE) {
81107
RCPP_STRING_DEBUG( "String(const std::string& )" ) ;
82108
}
83109

84-
String( const std::wstring& s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false) {
110+
String( const std::wstring& s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false),
111+
enc(CE_NATIVE) {
85112
RCPP_STRING_DEBUG( "String(const std::wstring& )" ) ;
86113
}
87114

88115
/** from a const char* */
89-
String( const char* s) : buffer(s), valid(false), buffer_ready(true){
116+
String( const char* s) : buffer(s), valid(false), buffer_ready(true), enc(CE_NATIVE){
90117
RCPP_STRING_DEBUG( "String(const char*)" ) ;
91118
}
92119

93-
String( const wchar_t* s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false) {
120+
String( const wchar_t* s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false),
121+
enc(CE_NATIVE) {
94122
RCPP_STRING_DEBUG( "String(const wchar_t* s)" ) ;
95123
}
96124

97-
98125
/** constructors from R primitives */
99-
String( int x ) : data( internal::r_coerce<INTSXP,STRSXP>(x) ), valid(true), buffer_ready(false) {}
100-
String( double x ) : data( internal::r_coerce<REALSXP,STRSXP>(x) ), valid(true), buffer_ready(false){}
101-
String( bool x ) : data( internal::r_coerce<LGLSXP,STRSXP>(x) ), valid( true ) , buffer_ready(false){}
102-
String( Rcomplex x ) : data( internal::r_coerce<CPLXSXP,STRSXP>(x) ), valid( true ), buffer_ready(false){}
103-
String( Rbyte x ) : data( internal::r_coerce<RAWSXP,STRSXP>(x) ), valid(true), buffer_ready(false){}
126+
String( int x ) : data( internal::r_coerce<INTSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}
127+
String( double x ) : data( internal::r_coerce<REALSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}
128+
String( bool x ) : data( internal::r_coerce<LGLSXP,STRSXP>(x) ), valid( true ) , buffer_ready(false), enc(CE_NATIVE){}
129+
String( Rcomplex x ) : data( internal::r_coerce<CPLXSXP,STRSXP>(x) ), valid( true ), buffer_ready(false), enc(CE_NATIVE){}
130+
String( Rbyte x ) : data( internal::r_coerce<RAWSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}
104131

105132

106133
/** Assign from an int: `data` becomes the value coerced through r_coerce<INTSXP,STRSXP>. */
inline String& operator=( int x ){
    data = internal::r_coerce<INTSXP ,STRSXP>( x ) ;
    // data now holds the current value; the buffer is flagged as not ready
    valid = true ;
    buffer_ready = false ;
    return *this ;
}
@@ -127,13 +154,13 @@ namespace Rcpp {
127154
/** Assign from a std::wstring; forwards to assign_wide_string and returns its result. */
inline String& operator=( const std::wstring& s){ return assign_wide_string(s) ; }
128155
/** Assign from a wide C string; forwards to assign_wide_string and returns its result. */
inline String& operator=( const wchar_t* s){ return assign_wide_string(s) ; }
129156

130-
131157
/**
 * Append a std::string to this String.
 *
 * When is_na() reports true the object is returned unchanged. Otherwise
 * setBuffer() is called, `s` is appended to `buffer`, and `valid` is cleared.
 */
inline String& operator+=( const std::string& s){
    RCPP_STRING_DEBUG( "String::operator+=( std::string )" ) ;
    if( !is_na() ) {
        setBuffer() ;
        buffer.append( s ) ;
        valid = false ;
    }
    return *this ;
}
163+
137164
inline String& operator+=( const char* s){
138165
RCPP_STRING_DEBUG( "String::operator+=( const char*)" ) ;
139166
if( is_na() ) return *this ;
@@ -214,8 +241,6 @@ namespace Rcpp {
214241
return replace_first( s.get_cstring(), news.get_cstring() ) ;
215242
}
216243

217-
218-
219244
inline String& replace_last( const char* s, const char* news ){
220245
RCPP_STRING_DEBUG_2( "String::replace_last( const char* = '%s' , const char* = '%s')", s, news ) ;
221246
if( is_na() ) return *this ;
@@ -312,7 +337,7 @@ namespace Rcpp {
312337

313338
inline SEXP get_sexp() const {
314339
RCPP_STRING_DEBUG_1( "String::get_sexp const ( valid = %d) ", valid ) ;
315-
return valid ? data : Rf_mkChar( buffer.c_str() ) ;
340+
return valid ? data : Rf_mkCharCE( buffer.c_str(), enc ) ;
316341
}
317342

318343
inline SEXP get_sexp() {
@@ -333,6 +358,39 @@ namespace Rcpp {
333358
inline const char* get_cstring() const {
334359
return buffer_ready ? buffer.c_str() : CHAR(data) ;
335360
}
361+
362+
inline const char* get_encoding() const {
363+
switch (enc) {
364+
case CE_BYTES:
365+
return "bytes";
366+
case CE_LATIN1:
367+
return "latin1";
368+
case CE_UTF8:
369+
return "UTF-8";
370+
default:
371+
return "unknown";
372+
}
373+
}
374+
375+
/**
 * Store `encoding` in `enc` and, when a CHARSXP is currently held, rebuild
 * it with Rf_mkCharCE from its UTF-8 translation.
 *
 * `data` is only inspected while `valid` is true. The constructors taking a
 * std::string or a const char* fill `buffer` and leave `data` out of their
 * initialiser lists, so testing `data != NULL` alone would read an
 * indeterminate pointer and could hand garbage to Rf_translateCharUTF8.
 * While the contents live in `buffer`, recording `enc` is all that is done
 * here.
 *
 * @param encoding encoding to record for this String
 */
inline void set_encoding( cetype_t encoding ) {
    enc = encoding;
    // short-circuit on `valid` so an uninitialised `data` is never read
    if (valid && data != NULL) {
        data = Rf_mkCharCE(Rf_translateCharUTF8(data), enc);
    }
}
380+
381+
inline void set_encoding(const char* encoding) {
382+
if ( encoding == "bytes" ) {
383+
enc = CE_BYTES;
384+
} else if ( encoding == "latin1" ) {
385+
enc = CE_LATIN1;
386+
} else if ( encoding == "UTF-8" ) {
387+
enc = CE_UTF8;
388+
} else {
389+
enc = CE_ANY;
390+
Rcout << "Unknown encoding" << std::endl;
391+
}
392+
set_encoding(enc);
393+
}
336394

337395
bool operator<( const Rcpp::String& other ) const {
338396
return strcmp( get_cstring(), other.get_cstring() ) < 0;
@@ -353,6 +411,9 @@ namespace Rcpp {
353411

354412
/** the CHARSXP this String encapsulates */
355413
SEXP data ;
414+
415+
/** the encoding of encapsulated CHARSXP */
416+
cetype_t enc;
356417

357418
/** a buffer used to do string operations without going back to the SEXP */
358419
std::string buffer ;
@@ -373,7 +434,7 @@ namespace Rcpp {
373434
inline void setData(){
374435
RCPP_STRING_DEBUG( "setData" ) ;
375436
if(!valid) {
376-
data = Rf_mkChar(buffer.c_str()) ;
437+
data = Rf_mkCharCE(buffer.c_str(), enc) ;
377438
valid = true ;
378439
}
379440
}

0 commit comments

Comments
 (0)