diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 8b60774f50349222ee04fef13613e3592120973b..a127627334e80704edeed38d3522e4000a695abb 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -379,6 +379,7 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ GISTENTRY entry, identry[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + int keep_current_best; Assert(!GistPageIsLeaf(p)); @@ -401,6 +402,31 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ */ best_penalty[0] = -1; + /* + * If we find a tuple that's exactly as good as the currently best one, we + * could use either one. When inserting a lot of tuples with the same or + * similar keys, it's preferable to descend down the same path when + * possible, as that's more cache-friendly. On the other hand, if all + * inserts land on the same leaf page after a split, we're never going to + * insert anything to the other half of the split, and will end up using + * only 50% of the available space. Distributing the inserts evenly would + * lead to better space usage, but that hurts cache-locality during + * insertion. To get the best of both worlds, when we find a tuple that's + * exactly as good as the previous best, choose randomly whether to stick + * to the old best, or use the new one. Once we decide to stick to the + * old best, we keep sticking to it for any subsequent equally good tuples + * we might find. This favors tuples with low offsets, but still allows + * some inserts to go to other equally-good subtrees. + * + * keep_current_best is -1 if we haven't yet had to make a random choice + * whether to keep the current best tuple. If we have done so, and + * decided to keep it, keep_current_best is 1; if we've decided to + * replace, keep_current_best is 0. (This state will be reset to -1 as + * soon as we've made the replacement, but sometimes we make the choice in + * advance of actually finding a replacement best tuple.) + */ + keep_current_best = -1; + /* * Loop over tuples on page. */ @@ -446,6 +472,9 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ if (j < r->rd_att->natts - 1) best_penalty[j + 1] = -1; + + /* we have new best, so reset keep-it decision */ + keep_current_best = -1; } else if (best_penalty[j] == usize) { @@ -468,12 +497,41 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ } /* - * If we find a tuple with zero penalty for all columns, there's no - * need to examine remaining tuples; just break out of the loop and - * return it. + * If we looped past the last column, and did not update "result", + * then this tuple is exactly as good as the prior best tuple. + */ + if (j == r->rd_att->natts && result != i) + { + if (keep_current_best == -1) + { + /* we didn't make the random choice yet for this old best */ + keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; + } + if (keep_current_best == 0) + { + /* we choose to use the new tuple */ + result = i; + /* choose again if there are even more exactly-as-good ones */ + keep_current_best = -1; + } + } + + /* + * If we find a tuple with zero penalty for all columns, and we've + * decided we don't want to search for another tuple with equal + * penalty, there's no need to examine remaining tuples; just break + * out of the loop and return it. */ if (zero_penalty) - break; + { + if (keep_current_best == -1) + { + /* we didn't make the random choice yet for this old best */ + keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; + } + if (keep_current_best == 1) + break; + } } return result;