Ticket #8915: hashmap-py.patch
File hashmap-py.patch, 10.7 KB (added by , 16 years ago) |
---|
-
common/hashmap.cpp
24 24 */ 25 25 26 26 // The hash map (associative array) implementation in this file is 27 // based on code by Andrew Y. Ng, 1996: 27 // based on the PyDict implementation of CPython. The erase() method 28 // is based on example code in the Wikipedia article on Hash tables. 28 29 29 /*30 * Copyright (c) 1998-2003 Massachusetts Institute of Technology.31 * This code was developed as part of the Haystack research project32 * (http://haystack.lcs.mit.edu/). Permission is hereby granted,33 * free of charge, to any person obtaining a copy of this software34 * and associated documentation files (the "Software"), to deal in35 * the Software without restriction, including without limitation36 * the rights to use, copy, modify, merge, publish, distribute,37 * sublicense, and/or sell copies of the Software, and to permit38 * persons to whom the Software is furnished to do so, subject to39 * the following conditions:40 *41 * The above copyright notice and this permission notice shall be42 * included in all copies or substantial portions of the Software.43 *44 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,45 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES46 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND47 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT48 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,49 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING50 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR51 * OTHER DEALINGS IN THE SOFTWARE.52 */53 54 30 #include "common/hashmap.h" 55 31 56 32 namespace Common { 57 33 58 // const char *:34 // Hash function for strings, taken from CPython. 59 35 uint hashit(const char *p) { 60 uint hash = 0;36 uint hash = *p << 7; 61 37 byte c; 62 while ((c = *p++)) 63 hash = (hash * 31 + c); 64 return hash; 38 int size = 0; 39 while ((c = *p++)) { 40 hash = (1000003 * hash) ^ c; 41 size++; 42 } 43 return hash ^ size; 65 44 } 66 45 46 // Like hashit, but converts every char to lowercase before hashing. 67 47 uint hashit_lower(const char *p) { 68 uint hash = 0;48 uint hash = tolower(*p) << 7; 69 49 byte c; 70 while ((c = *p++)) 71 hash = (hash * 31 + tolower(c)); 72 return hash; 50 int size = 0; 51 while ((c = *p++)) { 52 hash = (1000003 * hash) ^ tolower(c); 53 size++; 54 } 55 return hash ^ size; 73 56 } 74 57 75 // The following table is taken from the GNU ISO C++ Library's hashtable.h file.76 static const uint primes[] = {77 53ul, 97ul, 193ul, 389ul, 769ul,78 1543ul, 3079ul, 6151ul, 12289ul, 24593ul,79 49157ul, 98317ul, 196613ul, 393241ul, 786433ul,80 1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,81 50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,82 1610612741ul, 3221225473ul, 4294967291ul83 };84 85 uint nextTableSize(uint x) {86 int i = 0;87 while (x >= primes[i])88 i++;89 return primes[i];90 }91 92 58 #ifdef DEBUG_HASH_COLLISIONS 93 59 static double 94 60 g_collisions = 0, … … 98 64 g_size = 0; 99 65 static int g_max_capacity = 0, g_max_size = 0; 100 66 static int g_totalHashmaps = 0; 67 static int g_stats[4] = {0,0,0,0}; 101 68 102 69 void updateHashCollisionStats(int collisions, int lookups, int arrsize, int nele) { 103 70 g_collisions += collisions; … … 108 75 g_size += nele; 109 76 g_totalHashmaps++; 110 77 78 if (3*nele <= 2*8) 79 g_stats[0]++; 80 if (3*nele <= 2*16) 81 g_stats[1]++; 82 if (3*nele <= 2*32) 83 g_stats[2]++; 84 if (3*nele <= 2*64) 85 g_stats[3]++; 86 111 87 g_max_capacity = MAX(g_max_capacity, arrsize); 112 88 g_max_size = MAX(g_max_size, nele); 113 89 … … 118 94 100 * g_collPerLook / g_totalHashmaps, 119 95 g_size / g_totalHashmaps, g_max_size, 120 96 g_capacity / g_totalHashmaps, g_max_capacity); 97 fprintf(stdout, " %d less than %d; %d less than %d; %d less than %d; %d less than %d\n", 98 g_stats[0], 2*8/3, 99 g_stats[1],2*16/3, 100 g_stats[2],2*32/3, 101 g_stats[3],2*64/3); 102 103 // TODO: 104 // * Should record the maximal size of the map during its lifetime, not that at its death 105 // * Should do some statistics: how many maps are less than 2/3*8, 2/3*16, 2/3*32, ... 121 106 } 122 107 #endif 123 108 -
common/hashmap.h
24 24 */ 25 25 26 26 // The hash map (associative array) implementation in this file is 27 // based on code by Andrew Y. Ng, 1996: 27 // based on the PyDict implementation of CPython. The erase() method 28 // is based on example code in the Wikipedia article on Hash tables. 28 29 29 /*30 * Copyright (c) 1998-2003 Massachusetts Institute of Technology.31 * This code was developed as part of the Haystack research project32 * (http://haystack.lcs.mit.edu/). Permission is hereby granted,33 * free of charge, to any person obtaining a copy of this software34 * and associated documentation files (the "Software"), to deal in35 * the Software without restriction, including without limitation36 * the rights to use, copy, modify, merge, publish, distribute,37 * sublicense, and/or sell copies of the Software, and to permit38 * persons to whom the Software is furnished to do so, subject to39 * the following conditions:40 *41 * The above copyright notice and this permission notice shall be42 * included in all copies or substantial portions of the Software.43 *44 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,45 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES46 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND47 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT48 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,49 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING50 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR51 * OTHER DEALINGS IN THE SOFTWARE.52 */53 54 30 #ifndef COMMON_HASHMAP_H 55 31 #define COMMON_HASHMAP_H 56 32 … … 74 50 75 51 namespace Common { 76 52 77 // The table sizes ideally are primes. We use a helper function to find78 // suitable table sizes.79 uint nextTableSize(uint x);80 81 82 53 // Enable the following #define if you want to check how many collisions the 83 54 // code produces (many collisions indicate either a bad hash function, or a 84 55 // hash table that is too small). 85 //#define DEBUG_HASH_COLLISIONS56 #define DEBUG_HASH_COLLISIONS 86 57 87 58 88 59 /** … … 136 107 } 137 108 #endif 138 109 110 enum { 111 HASHMAP_PERTURB_SHIFT = 5, 112 HASHMAP_MIN_CAPACITY = 8 113 }; 114 139 115 Node **_storage; // hashtable of size arrsize. 140 uint _capacity; 116 uint _capacity; /**< Current capacity of the HashMap; must be a power of two */ 141 117 uint _size; 142 118 143 119 HashFunc _hash; … … 302 278 _nodePool(sizeof(Node)), 303 279 #endif 304 280 _defaultVal() { 305 _capacity = nextTableSize(0);281 _capacity = HASHMAP_MIN_CAPACITY; 306 282 _storage = new Node *[_capacity]; 307 283 assert(_storage != NULL); 308 284 memset(_storage, 0, _capacity * sizeof(Node *)); … … 382 358 } 383 359 } 384 360 385 if (shrinkArray && _capacity > nextTableSize(0)) {361 if (shrinkArray && _capacity > HASHMAP_MIN_CAPACITY) { 386 362 delete[] _storage; 387 363 388 _capacity = nextTableSize(0);364 _capacity = HASHMAP_MIN_CAPACITY; 389 365 _storage = new Node *[_capacity]; 390 366 assert(_storage != NULL); 391 367 memset(_storage, 0, _capacity * sizeof(Node *)); … … 397 373 template<class Key, class Val, class HashFunc, class EqualFunc> 398 374 void HashMap<Key, Val, HashFunc, EqualFunc>::expand_array(uint newsize) { 399 375 assert(newsize > _capacity); 400 uint ctr, dex;401 376 402 377 const uint old_size = _size; 403 378 const uint old_capacity = _capacity; … … 411 386 memset(_storage, 0, _capacity * sizeof(Node *)); 412 387 413 388 // rehash all the old elements 414 for ( ctr = 0; ctr < old_capacity; ++ctr) {389 for (uint ctr = 0; ctr < old_capacity; ++ctr) { 415 390 if (old_storage[ctr] == NULL) 416 391 continue; 417 392 … … 419 394 // Since we know that no key exists twice in the old table, we 420 395 // can do this slightly better than by calling lookup, since we 421 396 // don't have to call _equal(). 422 dex = _hash(old_storage[ctr]->_key) % _capacity; 423 while (_storage[dex] != NULL) { 424 dex = (dex + 1) % _capacity; 397 const uint hash = _hash(old_storage[ctr]->_key); 398 uint idx = hash & (_capacity - 1); 399 for (uint perturb = hash; _storage[idx] != NULL; perturb >>= HASHMAP_PERTURB_SHIFT) { 400 idx = (5 * idx + perturb + 1) & (_capacity - 1); 425 401 } 426 402 427 _storage[ dex] = old_storage[ctr];403 _storage[idx] = old_storage[ctr]; 428 404 _size++; 429 405 } 430 406 … … 439 415 440 416 template<class Key, class Val, class HashFunc, class EqualFunc> 441 417 int HashMap<Key, Val, HashFunc, EqualFunc>::lookup(const Key &key) const { 442 uint ctr = _hash(key) % _capacity; 418 const uint hash = _hash(key); 419 uint ctr = hash & (_capacity - 1); 420 for (uint perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) { 421 if (_storage[ctr] == NULL || _equal(_storage[ctr]->_key, key)) 422 break; 443 423 444 while (_storage[ctr] != NULL && !_equal(_storage[ctr]->_key, key)) { 445 ctr = (ctr + 1) % _capacity; 424 ctr = (5 * ctr + perturb + 1) & (_capacity - 1); 446 425 447 426 #ifdef DEBUG_HASH_COLLISIONS 448 427 _collisions++; … … 467 446 _storage[ctr] = allocNode(key); 468 447 _size++; 469 448 470 // Keep the load factor below 75%.471 if ( _size > _capacity * 75 / 100) {472 expand_array( nextTableSize(_capacity));449 // Keep the load factor below 2/3. 450 if (3 * _size > _capacity * 2) { 451 expand_array(_capacity * 2); 473 452 ctr = lookup(key); 474 453 } 475 454 } … … 520 499 template<class Key, class Val, class HashFunc, class EqualFunc> 521 500 void HashMap<Key, Val, HashFunc, EqualFunc>::erase(const Key &key) { 522 501 // This is based on code in the Wikipedia article on Hash tables. 523 uint i = lookup(key); 502 503 const uint hash = _hash(key); 504 uint i = hash & (_capacity - 1); 505 uint perturb; 506 507 for (perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) { 508 if (_storage[i] == NULL || _equal(_storage[i]->_key, key)) 509 break; 510 511 i = (5 * i + perturb + 1) & (_capacity - 1); 512 } 513 524 514 if (_storage[i] == NULL) 525 515 return; // key wasn't present, so no work has to be done 516 526 517 // If we remove a key, we must check all subsequent keys and possibly 527 518 // reinsert them. 528 519 uint j = i; 529 520 freeNode(_storage[i]); 530 521 _storage[i] = NULL; 531 while (true) {522 for (perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) { 532 523 // Look at the next table slot 533 j = ( j + 1) % _capacity;524 j = (5 * j + perturb + 1) & (_capacity - 1); 534 525 // If the next slot is empty, we are done 535 526 if (_storage[j] == NULL) 536 527 break; 537 528 // Compute the slot where the content of the next slot should normally be, 538 529 // assuming an empty table, and check whether we have to move it. 539 uint k = _hash(_storage[j]->_key) % _capacity;530 uint k = _hash(_storage[j]->_key) & (_capacity - 1); 540 531 if ((j > i && (k <= i || k > j)) || 541 532 (j < i && (k <= i && k > j)) ) { 542 533 _storage[i] = _storage[j];