13
13
#include "unicode.h"
14
14
#include "smb_common.h"
15
15
16
- /*
17
- * smb_utf16_bytes() - how long will a string be after conversion?
18
- * @from: pointer to input string
19
- * @maxbytes: don't go past this many bytes of input string
20
- * @codepage: destination codepage
21
- *
22
- * Walk a utf16le string and return the number of bytes that the string will
23
- * be after being converted to the given charset, not including any null
24
- * termination required. Don't walk past maxbytes in the source buffer.
25
- *
26
- * Return: string length after conversion
27
- */
28
- static int smb_utf16_bytes (const __le16 * from , int maxbytes ,
29
- const struct nls_table * codepage )
30
- {
31
- int i ;
32
- int charlen , outlen = 0 ;
33
- int maxwords = maxbytes / 2 ;
34
- char tmp [NLS_MAX_CHARSET_SIZE ];
35
- __u16 ftmp ;
36
-
37
- for (i = 0 ; i < maxwords ; i ++ ) {
38
- ftmp = get_unaligned_le16 (& from [i ]);
39
- if (ftmp == 0 )
40
- break ;
41
-
42
- charlen = codepage -> uni2char (ftmp , tmp , NLS_MAX_CHARSET_SIZE );
43
- if (charlen > 0 )
44
- outlen += charlen ;
45
- else
46
- outlen ++ ;
47
- }
48
-
49
- return outlen ;
50
- }
51
-
52
16
/*
53
17
* cifs_mapchar() - convert a host-endian char to proper char in codepage
54
18
* @target: where converted character should be copied
55
- * @src_char: 2 byte host-endian source character
19
+ * @from: host-endian source string
56
20
* @cp: codepage to which character should be converted
57
21
* @mapchar: should character be mapped according to mapchars mount option?
58
22
*
@@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
63
27
* Return: string length after conversion
64
28
*/
65
29
static int
66
- cifs_mapchar (char * target , const __u16 src_char , const struct nls_table * cp ,
30
+ cifs_mapchar (char * target , const __u16 * from , const struct nls_table * cp ,
67
31
bool mapchar )
68
32
{
69
33
int len = 1 ;
34
+ __u16 src_char ;
35
+
36
+ src_char = * from ;
70
37
71
38
if (!mapchar )
72
39
goto cp_convert ;
@@ -104,12 +71,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
104
71
105
72
cp_convert :
106
73
len = cp -> uni2char (src_char , target , NLS_MAX_CHARSET_SIZE );
107
- if (len <= 0 ) {
108
- * target = '?' ;
109
- len = 1 ;
110
- }
74
+ if (len <= 0 )
75
+ goto surrogate_pair ;
111
76
112
77
goto out ;
78
+
79
+ surrogate_pair :
80
+ /* convert SURROGATE_PAIR and IVS */
81
+ if (strcmp (cp -> charset , "utf8" ))
82
+ goto unknown ;
83
+ len = utf16s_to_utf8s (from , 3 , UTF16_LITTLE_ENDIAN , target , 6 );
84
+ if (len <= 0 )
85
+ goto unknown ;
86
+ return len ;
87
+
88
+ unknown :
89
+ * target = '?' ;
90
+ len = 1 ;
91
+ goto out ;
92
+ }
93
+
94
+ /*
95
+ * smb_utf16_bytes() - compute converted string length
96
+ * @from: pointer to input string
97
+ * @maxbytes: input string length
98
+ * @codepage: destination codepage
99
+ *
100
+ * Walk a utf16le string and return the number of bytes that the string will
101
+ * be after being converted to the given charset, not including any null
102
+ * termination required. Don't walk past maxbytes in the source buffer.
103
+ *
104
+ * Return: string length after conversion
105
+ */
106
+ static int smb_utf16_bytes (const __le16 * from , int maxbytes ,
107
+ const struct nls_table * codepage )
108
+ {
109
+ int i , j ;
110
+ int charlen , outlen = 0 ;
111
+ int maxwords = maxbytes / 2 ;
112
+ char tmp [NLS_MAX_CHARSET_SIZE ];
113
+ __u16 ftmp [3 ];
114
+
115
+ for (i = 0 ; i < maxwords ; i ++ ) {
116
+ ftmp [0 ] = get_unaligned_le16 (& from [i ]);
117
+ if (ftmp [0 ] == 0 )
118
+ break ;
119
+ for (j = 1 ; j <= 2 ; j ++ ) {
120
+ if (i + j < maxwords )
121
+ ftmp [j ] = get_unaligned_le16 (& from [i + j ]);
122
+ else
123
+ ftmp [j ] = 0 ;
124
+ }
125
+
126
+ charlen = cifs_mapchar (tmp , ftmp , codepage , 0 );
127
+ if (charlen > 0 )
128
+ outlen += charlen ;
129
+ else
130
+ outlen ++ ;
131
+ }
132
+
133
+ return outlen ;
113
134
}
114
135
115
136
/*
@@ -139,12 +160,12 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
139
160
static int smb_from_utf16 (char * to , const __le16 * from , int tolen , int fromlen ,
140
161
const struct nls_table * codepage , bool mapchar )
141
162
{
142
- int i , charlen , safelen ;
163
+ int i , j , charlen , safelen ;
143
164
int outlen = 0 ;
144
165
int nullsize = nls_nullsize (codepage );
145
166
int fromwords = fromlen / 2 ;
146
167
char tmp [NLS_MAX_CHARSET_SIZE ];
147
- __u16 ftmp ;
168
+ __u16 ftmp [ 3 ]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
148
169
149
170
/*
150
171
* because the chars can be of varying widths, we need to take care
@@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
155
176
safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize );
156
177
157
178
for (i = 0 ; i < fromwords ; i ++ ) {
158
- ftmp = get_unaligned_le16 (& from [i ]);
159
- if (ftmp == 0 )
179
+ ftmp [ 0 ] = get_unaligned_le16 (& from [i ]);
180
+ if (ftmp [ 0 ] == 0 )
160
181
break ;
182
+ for (j = 1 ; j <= 2 ; j ++ ) {
183
+ if (i + j < fromwords )
184
+ ftmp [j ] = get_unaligned_le16 (& from [i + j ]);
185
+ else
186
+ ftmp [j ] = 0 ;
187
+ }
161
188
162
189
/*
163
190
* check to see if converting this character might make the
@@ -172,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
172
199
/* put converted char into 'to' buffer */
173
200
charlen = cifs_mapchar (& to [outlen ], ftmp , codepage , mapchar );
174
201
outlen += charlen ;
202
+
203
+ /*
204
+ * charlen (=bytes of UTF-8 for 1 character)
205
+ * 4bytes UTF-8(surrogate pair) is charlen=4
206
+ * (4bytes UTF-16 code)
207
+ * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
208
+ * (2 UTF-8 pairs divided to 2 UTF-16 pairs)
209
+ */
210
+ if (charlen == 4 )
211
+ i ++ ;
212
+ else if (charlen >= 5 )
213
+ /* 5-6bytes UTF-8 */
214
+ i += 2 ;
175
215
}
176
216
177
217
/* properly null-terminate string */
@@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
306
346
char src_char ;
307
347
__le16 dst_char ;
308
348
wchar_t tmp ;
349
+ wchar_t wchar_to [6 ]; /* UTF-16 */
350
+ int ret ;
351
+ unicode_t u ;
309
352
310
353
if (!mapchars )
311
354
return smb_strtoUTF16 (target , source , srclen , cp );
@@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
348
391
* if no match, use question mark, which at least in
349
392
* some cases serves as wild card
350
393
*/
351
- if (charlen < 1 ) {
352
- dst_char = cpu_to_le16 (0x003f );
353
- charlen = 1 ;
394
+ if (charlen > 0 )
395
+ goto ctoUTF16 ;
396
+
397
+ /* convert SURROGATE_PAIR */
398
+ if (strcmp (cp -> charset , "utf8" ))
399
+ goto unknown ;
400
+ if (* (source + i ) & 0x80 ) {
401
+ charlen = utf8_to_utf32 (source + i , 6 , & u );
402
+ if (charlen < 0 )
403
+ goto unknown ;
404
+ } else
405
+ goto unknown ;
406
+ ret = utf8s_to_utf16s (source + i , charlen ,
407
+ UTF16_LITTLE_ENDIAN ,
408
+ wchar_to , 6 );
409
+ if (ret < 0 )
410
+ goto unknown ;
411
+
412
+ i += charlen ;
413
+ dst_char = cpu_to_le16 (* wchar_to );
414
+ if (charlen <= 3 )
415
+ /* 1-3bytes UTF-8 to 2bytes UTF-16 */
416
+ put_unaligned (dst_char , & target [j ]);
417
+ else if (charlen == 4 ) {
418
+ /*
419
+ * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
420
+ * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
421
+ * (charlen=3+4 or 4+4)
422
+ */
423
+ put_unaligned (dst_char , & target [j ]);
424
+ dst_char = cpu_to_le16 (* (wchar_to + 1 ));
425
+ j ++ ;
426
+ put_unaligned (dst_char , & target [j ]);
427
+ } else if (charlen >= 5 ) {
428
+ /* 5-6bytes UTF-8 to 6bytes UTF-16 */
429
+ put_unaligned (dst_char , & target [j ]);
430
+ dst_char = cpu_to_le16 (* (wchar_to + 1 ));
431
+ j ++ ;
432
+ put_unaligned (dst_char , & target [j ]);
433
+ dst_char = cpu_to_le16 (* (wchar_to + 2 ));
434
+ j ++ ;
435
+ put_unaligned (dst_char , & target [j ]);
354
436
}
437
+ continue ;
438
+
439
+ unknown :
440
+ dst_char = cpu_to_le16 (0x003f );
441
+ charlen = 1 ;
355
442
}
443
+
444
+ ctoUTF16 :
356
445
/*
357
446
* character may take more than one byte in the source string,
358
447
* but will take exactly two bytes in the target string
0 commit comments