Skip to content
Snippets Groups Projects
Commit 51e78ab4 authored by Tom Lane
Browse files

Avoid use of sscanf() to parse ispell dictionary files.

It turns out that on FreeBSD-derived platforms (including OS X), the
*scanf() family of functions is pretty much brain-dead about multibyte
characters.  In particular it will apply isspace() to individual bytes
of input even when those bytes are part of a multibyte character, thus
allowing false recognition of a field-terminating space.

We appear to have little alternative other than instituting a coding
rule that *scanf() is not to be used if the input string might contain
multibyte characters.  (There was some discussion of relying on "%ls",
but that probably just moves the portability problem somewhere else,
and besides it doesn't fully prevent BSD *scanf() from using isspace().)

This patch is a down payment on that: it gets rid of use of sscanf()
to parse ispell dictionary files, which are certainly at great risk
of having a problem.  The code is cleaner this way anyway, though
a bit longer.

In passing, improve a few comments.

Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me.
Back-patch to all supported branches.
parent c5e9b771
No related branches found
No related tags found
No related merge requests found
......@@ -457,13 +457,149 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
Conf->naffixes++;
}
/*
 * Parsing states for parse_affentry() and friends.
 *
 * get_nextfield() uses PAE_WAIT_MASK / PAE_INMASK as its generic
 * "no field character seen yet" / "inside a field" states, and
 * parse_ooaffentry() uses the PAE_WAIT_* values to track which field it
 * expects next.  PAE_INFIND and PAE_INREPL are not referenced by those two
 * functions; presumably parse_affentry() uses them (confirm there).
 *
 * Each state is defined exactly once.
 */
#define PAE_WAIT_MASK	0
#define PAE_INMASK		1
#define PAE_WAIT_FIND	2
#define PAE_INFIND		3
#define PAE_WAIT_REPL	4
#define PAE_INREPL		5
#define PAE_WAIT_TYPE	6
#define PAE_WAIT_FLAG	7
/*
 * Parse next space-separated field of an .affix file line.
 *
 * *str is the input pointer (will be advanced past field)
 * next is where to copy the field value to, with null termination
 *
 * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
 * Truncation always keeps a prefix of the field: once one character does not
 * fit, no later character of the same field is copied either.
 *
 * Leading whitespace is skipped.  If a '#' is seen before any field
 * character, the rest of the line is treated as a comment: "next" is set to
 * an empty string and we return FALSE.  A '#' inside a field is ordinary
 * data.
 *
 * On success *str is left pointing at the whitespace character that ended
 * the field, or at the terminating null if the field ran to end of line.
 *
 * Returns TRUE if we found a field, FALSE if not.
 */
static bool
get_nextfield(char **str, char *next)
{
	int			state = PAE_WAIT_MASK;
	int			avail = BUFSIZ;

	while (**str)
	{
		/* Compute the character length once; it is needed for copy and advance */
		int			clen = pg_mblen(*str);

		if (t_isspace(*str))
		{
			/* Whitespace ends a field in progress; before a field, skip it */
			if (state == PAE_INMASK)
				break;
		}
		else if (state == PAE_WAIT_MASK && t_iseq(*str, '#'))
		{
			/* Comment starts before any field: report "no field", empty result */
			*next = '\0';
			return false;
		}
		else
		{
			/* Need room for this character plus the trailing null */
			if (clen < avail)
			{
				COPYCHAR(next, *str);
				next += clen;
				avail -= clen;
			}
			else
			{
				/*
				 * Out of room.  Stop copying for the rest of this field so
				 * that a narrower character later on cannot sneak in after a
				 * wider one was dropped.
				 */
				avail = 0;
			}
			state = PAE_INMASK;
		}

		*str += clen;
	}

	*next = '\0';

	return (state == PAE_INMASK);	/* OK if we got a nonempty field */
}
/*
 * Parses entry of an .affix file of MySpell or Hunspell format.
 *
 * An .affix file entry has the following format:
 * - header
 *	 <type> <flag> <cross_flag> <flag_count>
 * - fields after header:
 *	 <type> <flag> <find> <replace> <mask>
 *
 * str is the input line
 * field values are returned to type etc, which must be buffers of size BUFSIZ.
 *
 * Fields are extracted left to right with get_nextfield(); extraction stops
 * at the first field that cannot be read or once all five have been read.
 *
 * Returns number of fields found; any omitted fields are set to empty strings.
 */
static int
parse_ooaffentry(char *str, char *type, char *flag, char *find,
				 char *repl, char *mask)
{
	int			state = PAE_WAIT_TYPE;
	int			nfields = 0;

	/* Start with every output empty, so omitted fields read as "" */
	*type = *flag = *find = *repl = *mask = '\0';

	while (*str)
	{
		char	   *dest = NULL;	/* buffer receiving the current field */
		int			next_state = -1;	/* -1 means "no more fields" */

		/* Map the current state to its output buffer and successor state */
		switch (state)
		{
			case PAE_WAIT_TYPE:
				dest = type;
				next_state = PAE_WAIT_FLAG;
				break;
			case PAE_WAIT_FLAG:
				dest = flag;
				next_state = PAE_WAIT_FIND;
				break;
			case PAE_WAIT_FIND:
				dest = find;
				next_state = PAE_WAIT_REPL;
				break;
			case PAE_WAIT_REPL:
				dest = repl;
				next_state = PAE_WAIT_MASK;
				break;
			case PAE_WAIT_MASK:
				dest = mask;
				break;			/* last field: next_state stays -1 */
			default:
				elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
					 state);
				break;
		}

		if (!get_nextfield(&str, dest))
			break;				/* early EOL */

		nfields++;
		state = next_state;

		if (state < 0)
			break;				/* got all fields */
	}

	return nfields;
}
/*
* Parses entry of an .affix file of Ispell format
*
* An .affix file entry has the following format:
* <mask> > [-<find>,]<replace>
*/
static bool
parse_affentry(char *str, char *mask, char *find, char *repl)
{
......@@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
int flag = 0;
char flagflags = 0;
tsearch_readline_state trst;
int scanread = 0;
char scanbuf[BUFSIZ];
char *recoded;
/* read file to find any flag */
......@@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
}
tsearch_readline_end(&trst);
sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
if (!tsearch_readline_begin(&trst, filename))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
......@@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
while ((recoded = tsearch_readline(&trst)) != NULL)
{
int fields_read;
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
goto nextline;
scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
if (ptype)
pfree(ptype);
ptype = lowerstr_ctx(Conf, type);
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
if (fields_read < 4 ||
(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
goto nextline;
if (scanread == 4)
if (fields_read == 4)
{
if (strlen(sflag) != 1)
goto nextline;
......@@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
goto nextline;
prepl = lowerstr_ctx(Conf, repl);
/* affix flag */
/* Find position of '/' in lowercased string "prepl" */
if ((ptr = strchr(prepl, '/')) != NULL)
{
/*
* Here we use non-lowercased string "repl". We need position
* of '/' in "repl".
*/
*ptr = '\0';
ptr = repl + (ptr - prepl) + 1;
while (*ptr)
......@@ -800,11 +939,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
if (STRNCMP(pstr, "compoundwords") == 0)
{
/* Find position in lowercased string "pstr" */
s = findchar(pstr, 'l');
if (s)
{
s = recoded + (s - pstr); /* we need non-lowercased
* string */
/* Here we use non-lowercased string "recoded" */
s = recoded + (s - pstr);
while (*s && !t_isspace(s))
s += pg_mblen(s);
while (*s && t_isspace(s))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment