From e27ecdd94d81e5bc3d1f68591701db5adb342f0d Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Fri, 24 Apr 2009 10:11:40 +0200 Subject: nls: utf8_wcstombs: use correct buffer size in error case When utf8_wcstombs encounters a character that cannot be encoded, we must not decrease the remaining output buffer size because nothing has been written to the output buffer. Signed-off-by: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- fs/nls/nls_base.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 9b0efdad891..000736d89c9 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -144,7 +144,6 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) size = utf8_wctomb(op, *ip, maxlen); if (size == -1) { /* Ignore character and move on */ - maxlen--; } else { op += size; maxlen -= size; -- cgit v1.2.3 From 905c02acbd89f427c87a6d0a50fed757f6b3001c Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Fri, 24 Apr 2009 10:11:56 +0200 Subject: nls: utf8_wcstombs: fix buffer overflow utf8_wcstombs forgot to include one-byte UTF-8 characters when calculating the output buffer size, i.e., theoretically, it was possible to overflow the output buffer with an input string that contains enough ASCII characters. In practice, this was no problem because the only user so far (VFAT) always uses a big enough output buffer. Signed-off-by: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- fs/nls/nls_base.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 000736d89c9..750abf211e2 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -150,6 +150,7 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) } } else { *op++ = (__u8) *ip; + maxlen--; } ip++; } -- cgit v1.2.3 From 74675a58507e769beee7d949dbed788af3c4139d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 30 Apr 2009 10:08:18 -0400 Subject: NLS: update handling of Unicode This patch (as1239) updates the kernel's treatment of Unicode. The character-set conversion routines are well behind the current state of the Unicode specification: They don't recognize the existence of code points beyond plane 0 or of surrogate pairs in the UTF-16 encoding. The old wchar_t 16-bit type is retained because it's still used in lots of places. This shouldn't cause any new problems; if a conversion now results in an invalid 16-bit code then before it must have yielded an undefined code. Difficult-to-read names like "utf_mbstowcs" are replaced with more transparent names like "utf8s_to_utf16s" and the ordering of the parameters is rationalized (buffer lengths come immediate after the pointers they refer to, and the inputs precede the outputs). Fortunately the low-level conversion routines are used in only a few places; the interfaces to the higher-level uni2char and char2uni methods have been left unchanged. Signed-off-by: Alan Stern Acked-by: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- fs/befs/linuxvfs.c | 20 +++--- fs/fat/dir.c | 29 +++++---- fs/fat/namei_vfat.c | 4 +- fs/isofs/joliet.c | 36 +---------- fs/ncpfs/ncplib_kernel.c | 8 ++- fs/nls/nls_base.c | 164 ++++++++++++++++++++++++++++++----------------- fs/nls/nls_utf8.c | 13 +++- 7 files changed, 150 insertions(+), 124 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 9367b6297d8..89cd2deeb4a 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in, { struct nls_table *nls = BEFS_SB(sb)->nls; int i, o; - wchar_t uni; + unicode_t uni; int unilen, utflen; char *result; /* The utf8->nls conversion won't make the final nls string bigger @@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in, for (i = o = 0; i < in_len; i += utflen, o += unilen) { /* convert from UTF-8 to Unicode */ - utflen = utf8_mbtowc(&uni, &in[i], in_len - i); - if (utflen < 0) { + utflen = utf8_to_utf32(&in[i], in_len - i, &uni); + if (utflen < 0) goto conv_err; - } /* convert from Unicode to nls */ + if (uni > MAX_WCHAR_T) + goto conv_err; unilen = nls->uni2char(uni, &result[o], in_len - o); - if (unilen < 0) { + if (unilen < 0) goto conv_err; - } } result[o] = '\0'; *out_len = o; @@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in, /* convert from nls to unicode */ unilen = nls->char2uni(&in[i], in_len - i, &uni); - if (unilen < 0) { + if (unilen < 0) goto conv_err; - } /* convert from unicode to UTF-8 */ - utflen = utf8_wctomb(&result[o], uni, 3); - if (utflen <= 0) { + utflen = utf32_to_utf8(uni, &result[o], 3); + if (utflen <= 0) goto conv_err; - } } result[o] = '\0'; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index f3500294eec..7c14c8cbbab 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -22,6 +22,19 @@ #include #include "fat.h" +/* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + static inline loff_t fat_make_i_pos(struct super_block *sb, struct buffer_head *bh, struct msdos_dir_entry *de) @@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, unsigned char *buf, int size) { if (sbi->options.utf8) - return utf8_wcstombs(buf, uni, size); + return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, + UTF16_HOST_ENDIAN, buf, size); else return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, sbi->nls_io); @@ -324,19 +338,6 @@ parse_long: return 0; } -/* - * Maximum buffer size of short name. - * [(MSDOS_NAME + '.') * max one char + nul] - * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] - */ -#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) -/* - * Maximum buffer size of unicode chars from slots. - * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] - */ -#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) -#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) - /* * Return values: negative -> error, 0 -> not found, positive -> found, * value is the total amount of slots, including the shortname entry. diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index b50ecbe97f8..f92ad999535 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, if (utf8) { int name_len = strlen(name); - *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX); + *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); /* * We stripped '.'s before and set len appropriately, - * but utf8_mbstowcs doesn't care about len + * but utf8s_to_utf16s doesn't care about len */ *outlen -= (name_len - len); diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index 92c14b850e9..a048de81c09 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) return (op - ascii); } -/* Convert big endian wide character string to utf8 */ -static int -wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen) -{ - const __u8 *ip; - __u8 *op; - int size; - __u16 c; - - op = s; - ip = pwcs; - while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) { - c = (*ip << 8) | ip[1]; - if (c > 0x7f) { - size = utf8_wctomb(op, c, maxlen); - if (size == -1) { - /* Ignore character and move on */ - maxlen--; - } else { - op += size; - maxlen -= size; - } - } else { - *op++ = (__u8) c; - } - ip += 2; - inlen--; - } - return (op - s); -} - int get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) { @@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; if (utf8) { - len = wcsntombs_be(outname, de->name, - de->name_len[0] >> 1, PAGE_SIZE); + len = utf16s_to_utf8s((const wchar_t *) de->name, + de->name_len[0] >> 1, UTF16_BIG_ENDIAN, + outname, PAGE_SIZE); } else { len = uni16_to_x8(outname, (__be16 *) de->name, de->name_len[0] >> 1, nls); diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 97645f11211..0ec6237a597 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen, if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { int k; + unicode_t u; - k = utf8_mbtowc(&ec, iname, iname_end - iname); - if (k < 0) + k = utf8_to_utf32(iname, iname_end - iname, &u); + if (k < 0 || u > MAX_WCHAR_T) return -EINVAL; iname += k; + ec = u; } else { if (*iname == NCP_ESC) { int k; @@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen, if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { int k; - k = utf8_wctomb(iname, ec, iname_end - iname); + k = utf32_to_utf8(ec, iname, iname_end - iname); if (k < 0) { err = -ENAMETOOLONG; goto quit; diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 750abf211e2..477d37d83b3 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -15,6 +15,7 @@ #include #include #include +#include static struct nls_table default_table; static struct nls_table *tables = &default_table; @@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] = {0, /* end of table */} }; -int -utf8_mbtowc(wchar_t *p, const __u8 *s, int n) +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) { - long l; + unsigned long l; int c0, c, nc; const struct utf8_table *t; @@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) nc++; if ((c0 & t->cmask) == t->cval) { l &= t->lmask; - if (l < t->lval) + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) return -1; - *p = l; + *pu = (unicode_t) l; return nc; } - if (n <= nc) + if (len <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; @@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) } return -1; } +EXPORT_SYMBOL(utf8_to_utf32); -int -utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n) +int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) { - __u16 *op; - const __u8 *ip; - int size; - - op = pwcs; - ip = s; - while (*ip && n > 0) { - if (*ip & 0x80) { - size = utf8_mbtowc(op, ip, n); - if (size == -1) { - /* Ignore character and move on */ - ip++; - n--; - } else { - op++; - ip += size; - n -= size; - } - } else { - *op++ = *ip++; - n--; - } - } - return (op - pwcs); -} - -int -utf8_wctomb(__u8 *s, wchar_t wc, int maxlen) -{ - long l; + unsigned long l; int c, nc; const struct utf8_table *t; - + if (!s) return 0; - - l = wc; + + l = u; + if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + nc = 0; for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { nc++; if (l <= t->lmask) { c = t->shift; - *s = t->cval | (l >> c); + *s = (u8) (t->cval | (l >> c)); while (c > 0) { c -= 6; s++; - *s = 0x80 | ((l >> c) & 0x3F); + *s = (u8) (0x80 | ((l >> c) & 0x3F)); } return nc; } } return -1; } +EXPORT_SYMBOL(utf32_to_utf8); -int -utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) +int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) { - const __u16 *ip; - __u8 *op; + u16 *op; int size; + unicode_t u; + + op = pwcs; + while (*s && len > 0) { + if (*s & 0x80) { + size = utf8_to_utf32(s, len, &u); + if (size < 0) { + /* Ignore character and move on */ + size = 1; + } else if (u >= PLANE_SIZE) { + u -= PLANE_SIZE; + *op++ = (wchar_t) (SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS)); + *op++ = (wchar_t) (SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS)); + } else { + *op++ = (wchar_t) u; + } + s += size; + len -= size; + } else { + *op++ = *s++; + len--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); + +static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + return c; + case UTF16_LITTLE_ENDIAN: + return __le16_to_cpu(c); + case UTF16_BIG_ENDIAN: + return __be16_to_cpu(c); + } +} + +int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, + u8 *s, int maxlen) +{ + u8 *op; + int size; + unsigned long u, v; op = s; - ip = pwcs; - while (*ip && maxlen > 0) { - if (*ip > 0x7f) { - size = utf8_wctomb(op, *ip, maxlen); + while (len > 0 && maxlen > 0) { + u = get_utf16(*pwcs, endian); + if (!u) + break; + pwcs++; + len--; + if (u > 0x7f) { + if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { + if (u & SURROGATE_LOW) { + /* Ignore character and move on */ + continue; + } + if (len <= 0) + break; + v = get_utf16(*pwcs, endian); + if ((v & SURROGATE_MASK) != SURROGATE_PAIR || + !(v & SURROGATE_LOW)) { + /* Ignore character and move on */ + continue; + } + u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + + (v & SURROGATE_BITS); + pwcs++; + len--; + } + size = utf32_to_utf8(u, op, maxlen); if (size == -1) { /* Ignore character and move on */ } else { @@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) maxlen -= size; } } else { - *op++ = (__u8) *ip; + *op++ = (u8) u; maxlen--; } - ip++; } - return (op - s); + return op - s; } +EXPORT_SYMBOL(utf16s_to_utf8s); int register_nls(struct nls_table * nls) { @@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); -EXPORT_SYMBOL(utf8_mbtowc); -EXPORT_SYMBOL(utf8_mbstowcs); -EXPORT_SYMBOL(utf8_wctomb); -EXPORT_SYMBOL(utf8_wcstombs); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index aa2c42fdd97..0d60a44acac 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { int n; - if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { + if (boundlen <= 0) + return -ENAMETOOLONG; + + n = utf32_to_utf8(uni, out, boundlen); + if (n < 0) { *out = '?'; return -EINVAL; } @@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { int n; + unicode_t u; - if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { + n = utf8_to_utf32(rawstring, boundlen, &u); + if (n < 0 || u > MAX_WCHAR_T) { *uni = 0x003f; /* ? */ - n = -EINVAL; + return -EINVAL; } + *uni = (wchar_t) u; return n; } -- cgit v1.2.3