From 47c8105e26cd634ec8da9706e2462ecd4faf2c8a Mon Sep 17 00:00:00 2001 From: "brendan%mozilla.org" Date: Wed, 14 Mar 2001 07:48:07 +0000 Subject: [PATCH] Fix bug 69271, r=waterson, sr=shaver: - Don't ape java.lang.String's bogo-sampling hash function for "long" (>=16 char) strings. - Theory and practice comment in pldhash.h helps analyze when to use double hashing (most of the time) vs. when to use chaining. - Subroutine ChangeTable from PL_DHashTableOperate so it can be called from PL_DHashTableEnumerate, if the latter finds that enough entries have been removed to be worth a shrink or compress cycle. git-svn-id: svn://10.0.0.236/trunk@89639 18797224-902f-48f8-a5cc-f745e15eee43 --- mozilla/xpcom/ds/pldhash.c | 179 ++++++++++++++++++++++++------------- mozilla/xpcom/ds/pldhash.h | 71 ++++++++++++++- 2 files changed, 185 insertions(+), 65 deletions(-) diff --git a/mozilla/xpcom/ds/pldhash.c b/mozilla/xpcom/ds/pldhash.c index 3cacb067d04..a36b14e386c 100644 --- a/mozilla/xpcom/ds/pldhash.c +++ b/mozilla/xpcom/ds/pldhash.c @@ -17,6 +17,9 @@ * Copyright (C) 1999,2000 Netscape Communications Corporation. * All Rights Reserved. * + * Original Contributor: + * Brendan Eich + * * Contributor(s): * * Alternatively, the contents of this file may be used under the @@ -35,6 +38,7 @@ * Double hashing implementation. * GENERATED BY js/src/plify_jsdhash.sed -- DO NOT EDIT!!! */ +#include <stdio.h> #include <stdlib.h> #include <string.h> #include "prbit.h" @@ -62,22 +66,12 @@ PL_DHashFreeTable(PLDHashTable *table, void *ptr) PR_IMPLEMENT(PLDHashNumber) PL_DHashStringKey(PLDHashTable *table, const void *key) { - const char *s; - size_t n, m; PLDHashNumber h; + const unsigned char *s; - s = key; - n = strlen(s); h = 0; - if (n < 16) { - /* Hash every char in a short string. */ - for (; n; s++, n--) - h = (h >> 28) ^ (h << 4) ^ *s; - } else { - /* Sample a la java.lang.String.hash(). 
*/ - for (m = n / 8; n >= m; s += m, n -= m) - h = (h >> 28) ^ (h << 4) ^ *s; - } + for (s = key; *s != '\0'; s++) + h = (h >> (PL_DHASH_BITS - 4)) ^ (h << 4) ^ *s; return h; } @@ -171,6 +165,17 @@ PL_DHashTableInit(PLDHashTable *table, PLDHashTableOps *ops, void *data, int log2; PRUint32 nbytes; +#ifdef DEBUG + if (entrySize > 6 * sizeof(void *)) { + fprintf(stderr, + "pldhash: for the table at address 0x%p, the given entrySize" + " of %lu %s favors chaining over double hashing.\n", + table, + (unsigned long) entrySize, + (entrySize > 16 * sizeof(void*)) ? "definitely" : "probably"); + } +#endif + table->ops = ops; table->data = data; if (capacity < PL_DHASH_MIN_SIZE) @@ -179,10 +184,11 @@ PL_DHashTableInit(PLDHashTable *table, PLDHashTableOps *ops, void *data, capacity = PR_BIT(log2); table->hashShift = PL_DHASH_BITS - log2; table->sizeLog2 = log2; - table->sizeMask = PR_BITMASK(table->sizeLog2); + table->sizeMask = PR_BITMASK(log2); table->entrySize = entrySize; table->entryCount = table->removedCount = 0; nbytes = capacity * entrySize; + table->entryStore = ops->allocTable(table, nbytes); if (!table->entryStore) return PR_FALSE; @@ -260,21 +266,75 @@ SearchTable(PLDHashTable *table, const void *key, PLDHashNumber keyHash) return entry; } -PR_IMPLEMENT(PLDHashEntryHdr *) -PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op) +static PRBool +ChangeTable(PLDHashTable *table, int deltaLog2, PLDHashEntryHdr *skipEntry) { - int change; - PLDHashNumber keyHash; - PRUint32 i, size, capacity, nbytes, entrySize; - PLDHashEntryHdr *entry, *oldEntry, *newEntry; - char *entryStore, *newEntryStore, *entryAddr; + int oldLog2, newLog2; + PRUint32 oldCapacity, newCapacity; + char *newEntryStore, *oldEntryStore, *oldEntryAddr; + PRUint32 entrySize, i, nbytes; + PLDHashEntryHdr *oldEntry, *newEntry; PLDHashGetKey getKey; PLDHashMoveEntry moveEntry; - /* Usually we don't grow or shrink the table. 
*/ - change = 0; + /* Look, but don't touch, until we succeed in getting new entry store. */ + oldLog2 = table->sizeLog2; + newLog2 = oldLog2 + deltaLog2; + oldCapacity = PR_BIT(oldLog2); + newCapacity = PR_BIT(newLog2); + entrySize = table->entrySize; + nbytes = newCapacity * entrySize; - /* Avoid 0 and 1 hash codes, they indicate free and deleted entries. */ + newEntryStore = table->ops->allocTable(table, nbytes); + if (!newEntryStore) + return PR_FALSE; + + table->hashShift = PL_DHASH_BITS - newLog2; + table->sizeLog2 = newLog2; + table->sizeMask = PR_BITMASK(newLog2); + table->removedCount = 0; + + memset(newEntryStore, 0, nbytes); + oldEntryAddr = oldEntryStore = table->entryStore; + table->entryStore = newEntryStore; + getKey = table->ops->getKey; + moveEntry = table->ops->moveEntry; + + /* Copy only live entries, leaving removed ones (and skipEntry) behind. */ + for (i = 0; i < oldCapacity; i++) { + oldEntry = (PLDHashEntryHdr *)oldEntryAddr; + if (oldEntry != skipEntry && ENTRY_IS_LIVE(oldEntry)) { + newEntry = SearchTable(table, getKey(table, oldEntry), + oldEntry->keyHash); + PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(newEntry)); + moveEntry(table, oldEntry, newEntry); + newEntry->keyHash = oldEntry->keyHash; + } + oldEntryAddr += entrySize; + } + + table->ops->freeTable(table, oldEntryStore); + return PR_TRUE; +} + +PR_IMPLEMENT(PLDHashEntryHdr *) +PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op) +{ + int biasedDeltaLog2; + PLDHashNumber keyHash; + PLDHashEntryHdr *entry; + PRUint32 size; + +/* + * Usually we don't grow or shrink the table, so optimize for test-not-zero + * by biasing the deltaLog2 of -1 (shrink), 0 (compress), or 1 (grow) so that + * the biased no-change value is 0. + */ +#define DELTA_LOG2_BIAS 2 + + biasedDeltaLog2 = 0; + + /* Avoid 0 and 1 hash codes, they indicate free and removed entries. 
*/ keyHash = table->ops->hashKey(table, key); ENSURE_LIVE_KEYHASH(keyHash); keyHash *= PL_DHASH_GOLDEN_RATIO; @@ -292,12 +352,16 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op) entry->keyHash = keyHash; table->entryCount++; - /* If alpha is >= .75, set change to trigger table growth below. */ + /* If alpha is >= .75, set biasedDeltaLog2 to trigger growth. */ size = PR_BIT(table->sizeLog2); if (table->entryCount + table->removedCount >= size - (size >> 2)) { - METER(table->stats.grows++); - change = 1; - capacity = size << 1; + if (table->removedCount >= size >> 2) { + METER(table->stats.compresses++); + biasedDeltaLog2 = 0 + DELTA_LOG2_BIAS; + } else { + METER(table->stats.grows++); + biasedDeltaLog2 = 1 + DELTA_LOG2_BIAS; + } } } METER(else table->stats.addHits++); @@ -313,8 +377,7 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op) size = PR_BIT(table->sizeLog2); if (size > PL_DHASH_MIN_SIZE && table->entryCount <= size >> 2) { METER(table->stats.shrinks++); - change = -1; - capacity = size >> 1; + biasedDeltaLog2 = -1 + DELTA_LOG2_BIAS; } } METER(else table->stats.removeMisses++); @@ -325,11 +388,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op) PR_ASSERT(0); } - if (change) { - entrySize = table->entrySize; - nbytes = capacity * entrySize; - newEntryStore = table->ops->allocTable(table, nbytes); - if (!newEntryStore) { + if (biasedDeltaLog2) { + if (!ChangeTable(table, biasedDeltaLog2 - DELTA_LOG2_BIAS, entry)) { /* If we just grabbed the last free entry, undo and fail hard. 
*/ if (op == PL_DHASH_ADD && table->entryCount + table->removedCount == size) { @@ -339,32 +399,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op) entry = NULL; } } else { - memset(newEntryStore, 0, nbytes); - entryStore = table->entryStore; - table->entryStore = newEntryStore; - - table->sizeLog2 += change; - table->sizeMask = PR_BITMASK(table->sizeLog2); - table->hashShift = PL_DHASH_BITS - table->sizeLog2; - table->removedCount = 0; - - getKey = table->ops->getKey; - moveEntry = table->ops->moveEntry; - entryAddr = entryStore; - for (i = 0; i < size; i++) { - oldEntry = (PLDHashEntryHdr *)entryAddr; - if (oldEntry != entry && ENTRY_IS_LIVE(oldEntry)) { - newEntry = SearchTable(table, getKey(table,oldEntry), - oldEntry->keyHash); - PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(newEntry)); - moveEntry(table, oldEntry, newEntry); - newEntry->keyHash = oldEntry->keyHash; - } - entryAddr += entrySize; - } - table->ops->freeTable(table, entryStore); - if (op == PL_DHASH_ADD) { + /* If the table grew, add the new (skipped) entry. 
*/ entry = SearchTable(table, key, keyHash); PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(entry)); entry->keyHash = keyHash; @@ -372,6 +408,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op) } } +#undef DELTA_LOG2_BIAS + return entry; } @@ -388,14 +426,14 @@ PR_IMPLEMENT(PRUint32) PL_DHashTableEnumerate(PLDHashTable *table, PLDHashEnumerator etor, void *arg) { char *entryAddr; - PRUint32 i, j, n, entrySize; + PRUint32 i, j, capacity, entrySize; PLDHashEntryHdr *entry; PLDHashOperator op; entryAddr = table->entryStore; entrySize = table->entrySize; - n = PR_BIT(table->sizeLog2); - for (i = j = 0; i < n; i++) { + capacity = PR_BIT(table->sizeLog2); + for (i = j = 0; i < capacity; i++) { entry = (PLDHashEntryHdr *)entryAddr; if (ENTRY_IS_LIVE(entry)) { op = etor(table, entry, j++, arg); @@ -408,12 +446,23 @@ PL_DHashTableEnumerate(PLDHashTable *table, PLDHashEnumerator etor, void *arg) } entryAddr += entrySize; } + + /* Shrink or compress if enough entries were removed that alpha < .5. 
*/ + if (table->removedCount >= capacity >> 2) { + METER(table->stats.enumShrinks++); + capacity = table->entryCount; + capacity += capacity >> 1; + if (capacity < PL_DHASH_MIN_SIZE) + capacity = PL_DHASH_MIN_SIZE; + (void) ChangeTable(table, + PR_CeilingLog2(capacity) - table->sizeLog2, + NULL); + } return j; } #ifdef PL_DHASHMETER #include <math.h> -#include <stdio.h> PR_IMPLEMENT(void) PL_DHashTableDumpMeter(PLDHashTable *table, PLDHashEnumerator dump, FILE *fp) @@ -490,6 +539,8 @@ PL_DHashTableDumpMeter(PLDHashTable *table, PLDHashEnumerator dump, FILE *fp) fprintf(fp, " removes while enumerating: %u\n", table->stats.removeEnums); fprintf(fp, " number of grows: %u\n", table->stats.grows); fprintf(fp, " number of shrinks: %u\n", table->stats.shrinks); + fprintf(fp, " number of compresses: %u\n", table->stats.compresses); + fprintf(fp, "number of enumerate shrinks: %u\n", table->stats.enumShrinks); if (maxChainLen && hash2) { fputs("Maximum hash chain:\n", fp); diff --git a/mozilla/xpcom/ds/pldhash.h b/mozilla/xpcom/ds/pldhash.h index ea32b3fa929..bf19a9a71b5 100644 --- a/mozilla/xpcom/ds/pldhash.h +++ b/mozilla/xpcom/ds/pldhash.h @@ -17,7 +17,10 @@ * Copyright (C) 1999,2000 Netscape Communications Corporation. * All Rights Reserved. * - * Contributor(s): + * Original Contributor: + * Brendan Eich + * + * Contributor(s): * * Alternatively, the contents of this file may be used under the * terms of the GNU Public License (the "GPL"), in which case the @@ -88,6 +91,70 @@ struct PLDHashEntryHdr { * A PLDHashTable is currently 8 words (without the PL_DHASHMETER overhead) * on most architectures, and may be allocated on the stack or within another * structure or class (see below for the Init and Finish functions to use). + * + * To decide whether to use double hashing vs. chaining, we need to develop a + * trade-off relation, as follows: + * + * Let alpha be the load factor, esize the entry size in words, count the + * entry count, and pow2 the power-of-two table size in entries. 
+ * + * (PLDHashTable overhead) > (PLHashTable overhead) + * (unused table entry space) > (malloc and .next overhead per entry) + + * (buckets overhead) + * (1 - alpha) * esize * pow2 > 2 * count + pow2 + * + * Notice that alpha is by definition (count / pow2): + * + * (1 - alpha) * esize * pow2 > 2 * alpha * pow2 + pow2 + * (1 - alpha) * esize > 2 * alpha + 1 + * + * esize > (1 + 2 * alpha) / (1 - alpha) + * + * This assumes both tables must keep keyHash, key, and value for each entry, + * where key and value point to separately allocated strings or structures. + * If key and value can be combined into one pointer, then the trade-off is: + * + * esize > (1 + 3 * alpha) / (1 - alpha) + * + * If the entry value can be a subtype of PLDHashEntryHdr, rather than a type + * that must be allocated separately and referenced by an entry.value pointer + * member, and provided key's allocation can be fused with its entry's, then + * k (the words wasted per entry with chaining) is 4. + * + * To see these curves, feed gnuplot input like so: + * + * gnuplot> f(x,k) = (1 + k * x) / (1 - x) + * gnuplot> plot [0:.75] f(x,2), f(x,3), f(x,4) + * + * For k of 2 and a well-loaded table (alpha > .5), esize must be more than 4 + * words for chaining to be more space-efficient than double hashing. + * + * Solving for alpha helps us decide when to shrink an underloaded table: + * + * esize > (1 + k * alpha) / (1 - alpha) + * esize - alpha * esize > 1 + k * alpha + * esize - 1 > (k + esize) * alpha + * (esize - 1) / (k + esize) > alpha + * + * alpha < (esize - 1) / (esize + k) + * + * Therefore double hashing should keep alpha >= (esize - 1) / (esize + k), + * assuming esize is not too large (in which case, chaining should probably be + * used for any alpha). For esize=2 and k=3, we want alpha >= .2; for esize=3 + * and k=2, we want alpha >= .4. For k=4, esize could be 6, and alpha >= .5 + * would still obtain. 
+ * + * The current implementation uses a constant .25 as alpha's lower bound when + * deciding to shrink the table (while respecting PL_DHASH_MIN_SIZE). + * + * Note a qualitative difference between chaining and double hashing: under + * chaining, entry addresses are stable across table shrinks and grows. With + * double hashing, you can't safely hold an entry pointer and use it after an + * ADD or REMOVE operation. + * + * The moral of this story: there is no one-size-fits-all hash table scheme, + * but for small table entry size, and assuming entry address stability is not + * required, double hashing wins. */ struct PLDHashTable { PLDHashTableOps *ops; /* virtual operations, see below */ @@ -114,6 +181,8 @@ struct PLDHashTable { PRUint32 removeEnums; /* removes done by Enumerate */ PRUint32 grows; /* table expansions */ PRUint32 shrinks; /* table contractions */ + PRUint32 compresses; /* table compressions */ + PRUint32 enumShrinks; /* contractions after Enumerate */ } stats; #endif };