Skip to content

Commit 0c18031

Browse files
namjaejeon authored and smfrench committed
ksmbd: add support for surrogate pair conversion
ksmbd lacks support for converting filenames that contain surrogate pair characters. This triggers a "file or folder does not exist" error in the Windows client. [Steps to reproduce the bug] 1. Create files whose names contain a surrogate pair character: touch $(echo -e '\xf0\x9d\x9f\xa3') touch $(echo -e '\xf0\x9d\x9f\xa4') 2. Try to open these files in a ksmbd share through a Windows client. This patch updates the unicode functions, which previously did not take surrogate pairs (and IVS) into account, so that they now convert them correctly. Reviewed-by: Marios Makassikis <[email protected]> Tested-by: Marios Makassikis <[email protected]> Signed-off-by: Namjae Jeon <[email protected]> Signed-off-by: Steve French <[email protected]>
1 parent ecce70c commit 0c18031

File tree

1 file changed

+138
-49
lines changed

1 file changed

+138
-49
lines changed

fs/smb/server/unicode.c

Lines changed: 138 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -13,46 +13,10 @@
1313
#include "unicode.h"
1414
#include "smb_common.h"
1515

16-
/*
17-
* smb_utf16_bytes() - how long will a string be after conversion?
18-
* @from: pointer to input string
19-
* @maxbytes: don't go past this many bytes of input string
20-
* @codepage: destination codepage
21-
*
22-
* Walk a utf16le string and return the number of bytes that the string will
23-
* be after being converted to the given charset, not including any null
24-
* termination required. Don't walk past maxbytes in the source buffer.
25-
*
26-
* Return: string length after conversion
27-
*/
28-
static int smb_utf16_bytes(const __le16 *from, int maxbytes,
29-
const struct nls_table *codepage)
30-
{
31-
int i;
32-
int charlen, outlen = 0;
33-
int maxwords = maxbytes / 2;
34-
char tmp[NLS_MAX_CHARSET_SIZE];
35-
__u16 ftmp;
36-
37-
for (i = 0; i < maxwords; i++) {
38-
ftmp = get_unaligned_le16(&from[i]);
39-
if (ftmp == 0)
40-
break;
41-
42-
charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
43-
if (charlen > 0)
44-
outlen += charlen;
45-
else
46-
outlen++;
47-
}
48-
49-
return outlen;
50-
}
51-
5216
/*
5317
* cifs_mapchar() - convert a host-endian char to proper char in codepage
5418
* @target: where converted character should be copied
55-
* @src_char: 2 byte host-endian source character
19+
* @from: host-endian source string
5620
* @cp: codepage to which character should be converted
5721
* @mapchar: should character be mapped according to mapchars mount option?
5822
*
@@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
6327
* Return: string length after conversion
6428
*/
6529
static int
66-
cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
30+
cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
6731
bool mapchar)
6832
{
6933
int len = 1;
34+
__u16 src_char;
35+
36+
src_char = *from;
7037

7138
if (!mapchar)
7239
goto cp_convert;
@@ -104,12 +71,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
10471

10572
cp_convert:
10673
len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
107-
if (len <= 0) {
108-
*target = '?';
109-
len = 1;
110-
}
74+
if (len <= 0)
75+
goto surrogate_pair;
11176

11277
goto out;
78+
79+
surrogate_pair:
80+
/* convert SURROGATE_PAIR and IVS */
81+
if (strcmp(cp->charset, "utf8"))
82+
goto unknown;
83+
len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
84+
if (len <= 0)
85+
goto unknown;
86+
return len;
87+
88+
unknown:
89+
*target = '?';
90+
len = 1;
91+
goto out;
92+
}
93+
94+
/*
95+
* smb_utf16_bytes() - compute converted string length
96+
* @from: pointer to input string
97+
* @maxbytes: input string length
98+
* @codepage: destination codepage
99+
*
100+
* Walk a utf16le string and return the number of bytes that the string will
101+
* be after being converted to the given charset, not including any null
102+
* termination required. Don't walk past maxbytes in the source buffer.
103+
*
104+
* Return: string length after conversion
105+
*/
106+
static int smb_utf16_bytes(const __le16 *from, int maxbytes,
107+
const struct nls_table *codepage)
108+
{
109+
int i, j;
110+
int charlen, outlen = 0;
111+
int maxwords = maxbytes / 2;
112+
char tmp[NLS_MAX_CHARSET_SIZE];
113+
__u16 ftmp[3];
114+
115+
for (i = 0; i < maxwords; i++) {
116+
ftmp[0] = get_unaligned_le16(&from[i]);
117+
if (ftmp[0] == 0)
118+
break;
119+
for (j = 1; j <= 2; j++) {
120+
if (i + j < maxwords)
121+
ftmp[j] = get_unaligned_le16(&from[i + j]);
122+
else
123+
ftmp[j] = 0;
124+
}
125+
126+
charlen = cifs_mapchar(tmp, ftmp, codepage, 0);
127+
if (charlen > 0)
128+
outlen += charlen;
129+
else
130+
outlen++;
131+
}
132+
133+
return outlen;
113134
}
114135

115136
/*
@@ -139,12 +160,12 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
139160
static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
140161
const struct nls_table *codepage, bool mapchar)
141162
{
142-
int i, charlen, safelen;
163+
int i, j, charlen, safelen;
143164
int outlen = 0;
144165
int nullsize = nls_nullsize(codepage);
145166
int fromwords = fromlen / 2;
146167
char tmp[NLS_MAX_CHARSET_SIZE];
147-
__u16 ftmp;
168+
__u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
148169

149170
/*
150171
* because the chars can be of varying widths, we need to take care
@@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
155176
safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
156177

157178
for (i = 0; i < fromwords; i++) {
158-
ftmp = get_unaligned_le16(&from[i]);
159-
if (ftmp == 0)
179+
ftmp[0] = get_unaligned_le16(&from[i]);
180+
if (ftmp[0] == 0)
160181
break;
182+
for (j = 1; j <= 2; j++) {
183+
if (i + j < fromwords)
184+
ftmp[j] = get_unaligned_le16(&from[i + j]);
185+
else
186+
ftmp[j] = 0;
187+
}
161188

162189
/*
163190
* check to see if converting this character might make the
@@ -172,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
172199
/* put converted char into 'to' buffer */
173200
charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
174201
outlen += charlen;
202+
203+
/*
204+
* charlen (=bytes of UTF-8 for 1 character)
205+
* 4bytes UTF-8(surrogate pair) is charlen=4
206+
* (4bytes UTF-16 code)
207+
* 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
208+
* (2 UTF-8 pairs divided to 2 UTF-16 pairs)
209+
*/
210+
if (charlen == 4)
211+
i++;
212+
else if (charlen >= 5)
213+
/* 5-6bytes UTF-8 */
214+
i += 2;
175215
}
176216

177217
/* properly null-terminate string */
@@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
306346
char src_char;
307347
__le16 dst_char;
308348
wchar_t tmp;
349+
wchar_t wchar_to[6]; /* UTF-16 */
350+
int ret;
351+
unicode_t u;
309352

310353
if (!mapchars)
311354
return smb_strtoUTF16(target, source, srclen, cp);
@@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
348391
* if no match, use question mark, which at least in
349392
* some cases serves as wild card
350393
*/
351-
if (charlen < 1) {
352-
dst_char = cpu_to_le16(0x003f);
353-
charlen = 1;
394+
if (charlen > 0)
395+
goto ctoUTF16;
396+
397+
/* convert SURROGATE_PAIR */
398+
if (strcmp(cp->charset, "utf8"))
399+
goto unknown;
400+
if (*(source + i) & 0x80) {
401+
charlen = utf8_to_utf32(source + i, 6, &u);
402+
if (charlen < 0)
403+
goto unknown;
404+
} else
405+
goto unknown;
406+
ret = utf8s_to_utf16s(source + i, charlen,
407+
UTF16_LITTLE_ENDIAN,
408+
wchar_to, 6);
409+
if (ret < 0)
410+
goto unknown;
411+
412+
i += charlen;
413+
dst_char = cpu_to_le16(*wchar_to);
414+
if (charlen <= 3)
415+
/* 1-3bytes UTF-8 to 2bytes UTF-16 */
416+
put_unaligned(dst_char, &target[j]);
417+
else if (charlen == 4) {
418+
/*
419+
* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
420+
* 7-8bytes UTF-8(IVS) divided to 2 UTF-16
421+
* (charlen=3+4 or 4+4)
422+
*/
423+
put_unaligned(dst_char, &target[j]);
424+
dst_char = cpu_to_le16(*(wchar_to + 1));
425+
j++;
426+
put_unaligned(dst_char, &target[j]);
427+
} else if (charlen >= 5) {
428+
/* 5-6bytes UTF-8 to 6bytes UTF-16 */
429+
put_unaligned(dst_char, &target[j]);
430+
dst_char = cpu_to_le16(*(wchar_to + 1));
431+
j++;
432+
put_unaligned(dst_char, &target[j]);
433+
dst_char = cpu_to_le16(*(wchar_to + 2));
434+
j++;
435+
put_unaligned(dst_char, &target[j]);
354436
}
437+
continue;
438+
439+
unknown:
440+
dst_char = cpu_to_le16(0x003f);
441+
charlen = 1;
355442
}
443+
444+
ctoUTF16:
356445
/*
357446
* character may take more than one byte in the source string,
358447
* but will take exactly two bytes in the target string

0 commit comments

Comments
 (0)