Skip to content
Snippets Groups Projects
Commit 4ea4f8bd authored by Bruce Momjian
Browse files

Fix for Unicode characters above 0x10000.

John Hansen
parent 917c8bb4
No related branches found
No related tags found
No related merge requests found
/* /*
* conversion functions between pg_wchar and multibyte streams. * conversion functions between pg_wchar and multibyte streams.
* Tatsuo Ishii * Tatsuo Ishii
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.38 2004/09/17 21:59:57 petere Exp $ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.39 2004/12/02 22:37:13 momjian Exp $
* *
* WIN1250 client encoding updated by Pavel Behal * WIN1250 client encoding updated by Pavel Behal
* *
...@@ -343,6 +343,31 @@ pg_johab_dsplen(const unsigned char *s) ...@@ -343,6 +343,31 @@ pg_johab_dsplen(const unsigned char *s)
return (pg_euc_dsplen(s)); return (pg_euc_dsplen(s));
} }
/*
 * isLegalUTF8
 *
 * Check whether the "len" bytes starting at "source" form a single
 * structurally valid UTF-8 sequence.
 *
 *	source	pointer to the lead byte of the candidate sequence; a NULL
 *			pointer is rejected.
 *	len		number of bytes the caller believes the sequence occupies.  It
 *			must equal pg_utf_mblen(source) and be in the range 1..6.
 *
 * Returns true when every check passes, false otherwise.
 *
 * The checks are:
 *	- every trailing byte is a continuation byte (0x80..0xBF);
 *	- the second byte is restricted further for lead bytes whose smallest
 *	  continuation values would give an overlong encoding (0xE0, 0xF0,
 *	  0xF8, 0xFC), and for 0xF4 where values above 0x8F are refused;
 *	- the lead byte is not in 0x80..0xC1 and not above 0xFD.
 */
bool
isLegalUTF8(const UTF8 *source, int len)
{
	UTF8		a;
	const UTF8 *srcptr;

	if (!source || (pg_utf_mblen(source) != len))
		return false;

	/*
	 * Only form the end pointer once "source" is known to be non-NULL;
	 * pointer arithmetic on a null pointer is undefined behaviour.
	 */
	srcptr = source + len;

	switch (len)
	{
		default:
			return false;

			/*
			 * Walk backwards from the last byte.  Each case validates one
			 * trailing byte and deliberately falls through to the next.
			 */
		case 6:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 5:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 4:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 3:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 2:
			/* "a" is now the second byte of the sequence */
			if ((a = (*--srcptr)) > 0xBF)
				return false;

			/* no fall-through inside this inner switch */
			switch (*source)
			{
				case 0xE0:		/* 3-byte overlong */
					if (a < 0xA0)
						return false;
					break;
				case 0xF0:		/* 4-byte overlong */
					if (a < 0x90)
						return false;
					break;
				case 0xF4:
					if (a > 0x8F)
						return false;
					break;
				case 0xF8:		/* 5-byte overlong */
					if (a < 0x88)
						return false;
					break;
				case 0xFC:		/* 6-byte overlong */
					if (a < 0x84)
						return false;
					break;
				default:
					if (a < 0x80)
						return false;
					break;
			}
			/* FALLTHROUGH */
		case 1:
			/* reject stray continuation bytes and the overlong leads C0/C1 */
			if (*source >= 0x80 && *source < 0xC2)
				return false;
			if (*source > 0xFD)
				return false;
	}

	return true;
}
/* /*
* convert UTF-8 string to pg_wchar (UCS-2) * convert UTF-8 string to pg_wchar (UCS-2)
* caller should allocate enough space for "to" * caller should allocate enough space for "to"
...@@ -398,7 +423,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ...@@ -398,7 +423,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
* returns the byte length of a UTF-8 word pointed to by s * returns the byte length of a UTF-8 word pointed to by s
*/ */
int int
pg_utf_mblen(const unsigned char *s) pg_utf_mblen(const UTF8 *s)
{ {
int len = 1; int len = 1;
...@@ -406,13 +431,19 @@ pg_utf_mblen(const unsigned char *s) ...@@ -406,13 +431,19 @@ pg_utf_mblen(const unsigned char *s)
len = 1; len = 1;
else if ((*s & 0xe0) == 0xc0) else if ((*s & 0xe0) == 0xc0)
len = 2; len = 2;
else if ((*s & 0xe0) == 0xe0) else if ((*s & 0xf0) == 0xe0)
len = 3; len = 3;
else if ((*s & 0xf8) == 0xf0)
len = 4;
else if ((*s & 0xfc) == 0xf8)
len = 5;
else if ((*s & 0xfe) == 0xfc)
len = 6;
return (len); return (len);
} }
static int static int
pg_utf_dsplen(const unsigned char *s) pg_utf_dsplen(const UTF8 *s)
{ {
return 1; /* XXX fix me! */ return 1; /* XXX fix me! */
} }
...@@ -721,7 +752,7 @@ pg_wchar_tbl pg_wchar_table[] = { ...@@ -721,7 +752,7 @@ pg_wchar_tbl pg_wchar_table[] = {
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */ {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6}, /* 6; PG_UNICODE */
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
...@@ -822,18 +853,15 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError) ...@@ -822,18 +853,15 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
while (len > 0 && *mbstr) while (len > 0 && *mbstr)
{ {
/* special UTF-8 check */
if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
{
if (noError)
return false;
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
}
l = pg_mblen(mbstr); l = pg_mblen(mbstr);
/* special UTF-8 check */
if (encoding == PG_UTF8) {
if(!isLegalUTF8(mbstr,l)) {
if (noError) return false;
ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr)));
}
} else {
for (i = 1; i < l; i++) for (i = 1; i < l; i++)
{ {
/* /*
...@@ -863,10 +891,10 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError) ...@@ -863,10 +891,10 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
} }
} }
}
len -= l; len -= l;
mbstr += l; mbstr += l;
} }
return true; return true;
} }
......
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.53 2004/12/02 22:14:38 momjian Exp $ */ /* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.54 2004/12/02 22:37:14 momjian Exp $ */
#ifndef PG_WCHAR_H #ifndef PG_WCHAR_H
#define PG_WCHAR_H #define PG_WCHAR_H
...@@ -17,6 +17,14 @@ ...@@ -17,6 +17,14 @@
*/ */
typedef unsigned int pg_wchar; typedef unsigned int pg_wchar;
/*
 * Code-unit types for the UTF encodings.  UTF8 is the byte type taken by
 * isLegalUTF8(); UTF16 and UTF32 are declared alongside it but are not
 * referenced in the code visible here.
 */
typedef unsigned int UTF32; /* meant to hold at least 32 bits; NOTE(review): ISO C only guarantees 16 for unsigned int - confirm for target platforms */
typedef unsigned short UTF16; /* at least 16 bits */
typedef unsigned char UTF8; /* one byte; typically 8 bits */
/* /*
* various definitions for EUC * various definitions for EUC
*/ */
...@@ -340,4 +348,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc); ...@@ -340,4 +348,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
extern bool isLegalUTF8(const UTF8 *source, int len);
#endif /* PG_WCHAR_H */ #endif /* PG_WCHAR_H */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment