From 32032d42b51bf673bdc17b2248a6a32b4de30676 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev <teodor@sigaev.ru> Date: Mon, 2 Mar 2009 15:10:09 +0000 Subject: [PATCH] Fix usage of char2wchar/wchar2char. Changes: - pg_wchar and wchar_t could have different sizes, so char2wchar no longer calls pg_mb2wchar_with_len, to prevent an out-of-bounds memory bug - make char2wchar/wchar2char symmetric: now they should not be called with C-locale, because mbstowcs/wcstombs often don't work correctly with C-locale. - Text parser uses pg_mb2wchar_with_len directly in case of C-locale and multibyte encoding Per bug report by Hiroshi Inoue <inoue@tpf.co.jp> and following discussion. Backpatch to 8.2, where multibyte support was implemented in tsearch. --- src/backend/tsearch/wparser_def.c | 51 ++++++++++++++++++++----------- src/backend/utils/mb/mbutils.c | 25 ++++++--------- 2 files changed, 42 insertions(+), 34 deletions(-) diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index a4143549756..8d7cc1b8d50 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.20 2009/01/15 16:33:59 teodor Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.21 2009/03/02 15:10:09 teodor Exp $ * *------------------------------------------------------------------------- */ @@ -240,12 +240,12 @@ typedef struct TParser int lenstr; /* length of mbstring */ #ifdef USE_WIDE_UPPER_LOWER wchar_t *wstr; /* wide character string */ - int lenwstr; /* length of wsting */ + pg_wchar *pgwstr; /* wide character string for C-locale */ + bool usewide; #endif /* State of parse */ int charmaxlen; - bool usewide; TParserPosition *state; bool ignore; bool wanthost; @@ -299,13 +299,24 @@ TParserInit(char *str, int len) if (prs->charmaxlen > 1) { prs->usewide = true; - prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1)); - prs->lenwstr = 
char2wchar(prs->wstr, prs->lenstr + 1, - prs->str, prs->lenstr); + if ( lc_ctype_is_c() ) + { + /* + * char2wchar doesn't work for C-locale and + * sizeof(pg_wchar) could be not equal to sizeof(wchar_t) + */ + prs->pgwstr = (pg_wchar*) palloc(sizeof(pg_wchar) * (prs->lenstr + 1)); + pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr); + } + else + { + prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1)); + char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr); + } } else -#endif prs->usewide = false; +#endif prs->state = newTParserPosition(NULL); prs->state->state = TPS_Base; @@ -331,6 +342,8 @@ TParserClose(TParser *prs) #ifdef USE_WIDE_UPPER_LOWER if (prs->wstr) pfree(prs->wstr); + if (prs->pgwstr) + pfree(prs->pgwstr); #endif pfree(prs); @@ -338,10 +351,12 @@ TParserClose(TParser *prs) /* * Character-type support functions, equivalent to is* macros, but - * working with any possible encodings and locales. Note, - * that with multibyte encoding and C-locale isw* function may fail - * or give wrong result. Note 2: multibyte encoding and C-locale - * often are used for Asian languages + * working with any possible encodings and locales. Notes: + * - with multibyte encoding and C-locale isw* function may fail + * or give wrong result. + * - multibyte encoding and C-locale often are used for + * Asian languages. 
+ * - if locale is C then we use pgwstr instead of wstr */ #ifdef USE_WIDE_UPPER_LOWER @@ -352,14 +367,14 @@ p_is##type(TParser *prs) { \ Assert( prs->state ); \ if ( prs->usewide ) \ { \ - if ( lc_ctype_is_c() ) \ - return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \ + if ( prs->pgwstr ) \ + return is##type( 0xff & *( prs->pgwstr + prs->state->poschar) );\ \ return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \ } \ \ return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \ -} \ +} \ \ static int \ p_isnot##type(TParser *prs) { \ @@ -373,9 +388,9 @@ p_isalnum(TParser *prs) if (prs->usewide) { - if (lc_ctype_is_c()) + if (prs->pgwstr) { - unsigned int c = *(prs->wstr + prs->state->poschar); + unsigned int c = *(prs->pgwstr + prs->state->poschar); /* * any non-ascii symbol with multibyte encoding with C-locale is @@ -405,9 +420,9 @@ p_isalpha(TParser *prs) if (prs->usewide) { - if (lc_ctype_is_c()) + if (prs->pgwstr) { - unsigned int c = *(prs->wstr + prs->state->poschar); + unsigned int c = *(prs->pgwstr + prs->state->poschar); /* * any non-ascii symbol with multibyte encoding with C-locale is diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index c3cf7f5db69..f5ba80d101d 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -4,7 +4,7 @@ * (currently mule internal code (mic) is used) * Tatsuo Ishii * - * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.78 2009/01/22 10:09:48 mha Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.79 2009/03/02 15:10:09 teodor Exp $ */ #include "postgres.h" @@ -601,7 +601,10 @@ wchar2char(char *to, const wchar_t *from, size_t tolen) } else #endif /* WIN32 */ + { + Assert( !lc_ctype_is_c() ); result = wcstombs(to, from, tolen); + } return result; } @@ -647,22 +650,12 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen) else #endif /* WIN32 */ { - if (lc_ctype_is_c()) - { - /* - * 
pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be - * allocated with sufficient space - */ - result = pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen); - } - else - { - /* mbstowcs requires ending '\0' */ - char *str = pnstrdup(from, fromlen); + /* mbstowcs requires ending '\0' */ + char *str = pnstrdup(from, fromlen); - result = mbstowcs(to, str, tolen); - pfree(str); - } + Assert( !lc_ctype_is_c() ); + result = mbstowcs(to, str, tolen); + pfree(str); } if (result == -1) -- GitLab