librsync  2.3.4
hashtable.h
Go to the documentation of this file.
1 /*= -*- c-basic-offset: 4; indent-tabs-mode: nil; -*-
2  *
3  * hashtable.h -- a generic open addressing hashtable.
4  *
5  * Copyright (C) 2003 by Donovan Baarda <abo@minkirri.apana.org.au>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation; either version 2.1 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
20 
21 /** \file hashtable.h
22  * A generic open addressing hashtable.
23  *
24  * This is a minimal hashtable containing pointers to arbitrary entries with
25  * configurable hashtable size and support for custom hash() and cmp() methods.
26  * The cmp() method can either be a simple comparison between two keys, or can
27  * be against a special match object containing additional mutable state. This
28  * allows for things like deferred and cached evaluation of costly comparison
29  * data. The hash() function doesn't need to avoid clustering behaviour.
30  *
31  * It uses open addressing with quadratic probing for collisions. The
32  * MurmurHash3 finalization function is optionally used on the hash() output to
33  * avoid clustering and can be disabled by setting HASHTABLE_NMIX32. There is
34  * no support for removing entries, only adding them. Multiple entries with the
35  * same key can be added, and you can use a fancy cmp() function to find
36  * particular entries by more than just their key. There is an iterator for
37  * iterating through all entries in the hashtable. There are optional
38  * NAME_find() find/match/hashcmp/entrycmp stats counters that can be disabled
39  * by defining HASHTABLE_NSTATS. There is an optional simple k=1 bloom filter
40  * for speed that can be disabled by defining HASHTABLE_NBLOOM.
41  *
42  * The types and methods of the hashtable and its contents are specified by
43  * using \#define parameters set to their basenames (the prefixes for the *_t
44  * type and *_func() methods) before doing \#include "hashtable.h". This
45  * produces static inline type-safe methods that are either application
46  * optimized for speed or wrappers around void* implementation methods for
47  * compactness.
48  *
49  * \param ENTRY - the entry type basename.
50  *
51  * \param KEY - optional key type basename (default: ENTRY).
52  *
53  * \param MATCH - optional match type basename (default: KEY).
54  *
55  * \param NAME - optional hashtable type basename (default: ENTRY_hashtable).
56  *
57  * Example: \code
58  * typedef ... mykey_t;
59  * int mykey_hash(mykey_t const *e);
60  * int mykey_cmp(mykey_t *e, mykey_t const *o);
61  *
62  * typedef struct myentry {
63  * mykey_t key; // Inherit from mykey_t.
64  * ...extra entry value data...
65  * } myentry_t;
66  * void myentry_init(myentry_t *e, ...);
67  *
68  * #define ENTRY myentry
69  * #define KEY mykey
70  * #include "hashtable.h"
71  *
72  * hashtable_t *t;
73  * myentry_t entries[300];
74  * mykey_t k;
75  * myentry_t *e;
76  *
77  * t = myentry_hashtable_new(300);
78  * myentry_init(&entries[5], ...);
79  * myentry_hashtable_add(t, &entries[5]);
80  * k = ...;
81  * e = myentry_hashtable_find(t, &k);
82  *
83  * int i;
84  * for (e = myentry_hashtable_iter(t, &i); e != NULL;
85  * e = myentry_hashtable_next(t, &i))
86  * ...
87  *
88  * myentry_hashtable_free(t);
89  * \endcode
90  *
91  * The mykey_hash() and mykey_cmp() fuctions will typically take pointers to
92  * mykey/myentry instances the same as the pointers stored in the hashtable.
93  * However it is also possible for them to take "match objects" that are a
94  * "subclass" of the entry type that contain additional state for complicated
95  * comparision operations.
96  *
97  * Example: \code
98  * typedef struct mymatch {
99  * mykey_t key; // Inherit from mykey_t;
100  * ...extra match criteria and state data...
101  * } mymatch_t;
102  * int mymatch_cmp(mymatch_t *m, myentry_t const *e);
103  *
104  * #define ENTRY myentry
105  * #define KEY mykey
106  * #define MATCH mymatch
107  * #include "hashtable.h"
108  *
109  * ...
110  * mymatch_t m;
111  *
112  * t = myentry_hashtable_new(300);
113  * ...
114  * m = ...;
115  * e = myentry_hashtable_find(t, &m);
116  * \endcode
117  *
118  * The mymatch_cmp() function is only called for finding hashtable entries and
119  * can mutate the mymatch_t object for doing things like deferred and cached
120  * evaluation of expensive match data. It can also access the whole myentry_t
121  * object to match against more than just the key. */
122 #ifndef HASHTABLE_H
123 # define HASHTABLE_H
124 
125 # include <stdbool.h>
126 
127 /** The hashtable type. */
128 typedef struct hashtable {
129  int size; /**< Size of allocated hashtable. */
130  int count; /**< Number of entries in hashtable. */
131  unsigned tmask; /**< Mask to get the hashtable index. */
132 # ifndef HASHTABLE_NBLOOM
133  unsigned bshift; /**< Shift to get the bloomfilter index. */
134 # endif
135 # ifndef HASHTABLE_NSTATS
136  /* The following are for accumulating NAME_find() stats. */
137  long find_count; /**< The count of finds tried. */
138  long match_count; /**< The count of matches found. */
139  long hashcmp_count; /**< The count of hash compares done. */
140  long entrycmp_count; /**< The count of entry compares done. */
141 # endif
142 # ifndef HASHTABLE_NBLOOM
143  unsigned char *kbloom; /**< Bloom filter of hash keys with k=1. */
144 # endif
145  void **etable; /**< Table of pointers to entries. */
146  unsigned ktable[]; /**< Table of hash keys. */
148 
149 /* void* implementations for the type-safe static inline wrappers below. */
150 hashtable_t *_hashtable_new(int size);
151 void _hashtable_free(hashtable_t *t);
152 
153 # ifndef HASHTABLE_NBLOOM
154 static inline void hashtable_setbloom(hashtable_t *t, unsigned const h)
155 {
156  /* Use upper bits for a "different hash". */
157  unsigned const i = h >> t->bshift;
158  t->kbloom[i / 8] |= (unsigned char)(1 << (i % 8));
159 }
160 
161 static inline bool hashtable_getbloom(hashtable_t *t, unsigned const h)
162 {
163  /* Use upper bits for a "different hash". */
164  unsigned const i = h >> t->bshift;
165  return (t->kbloom[i / 8] >> (i % 8)) & 1;
166 }
167 # endif
168 
169 /** MurmurHash3 finalization mix function. */
170 static inline unsigned mix32(unsigned h)
171 {
172  h ^= h >> 16;
173  h *= 0x85ebca6b;
174  h ^= h >> 13;
175  h *= 0xc2b2ae35;
176  h ^= h >> 16;
177  return h;
178 }
179 
180 /** Ensure hash's are never zero. */
181 static inline unsigned nozero(unsigned h)
182 {
183  return h ? h : (unsigned)-1;
184 }
185 
186 #endif /* !HASHTABLE_H */
187 
188 /* If ENTRY is defined, define type-dependent static inline methods. */
189 #ifdef ENTRY
190 
191 # include <assert.h>
192 # include <stddef.h>
193 
194 # define _JOIN2(x, y) x##y
195 # define _JOIN(x, y) _JOIN2(x, y)
196 
197 # ifndef KEY
198 # define KEY ENTRY
199 # endif
200 
201 # ifndef MATCH
202 # define MATCH KEY
203 # endif
204 
205 # ifndef NAME
206 # define NAME _JOIN(ENTRY, _hashtable)
207 # endif
208 
209 # define ENTRY_t _JOIN(ENTRY, _t) /**< The entry type. */
210 # define KEY_t _JOIN(KEY, _t) /**< The key type. */
211 # define MATCH_t _JOIN(MATCH, _t) /**< The match type. */
212 # define KEY_hash _JOIN(KEY, _hash) /**< The key hash(k) method. */
213 # define MATCH_cmp _JOIN(MATCH, _cmp) /**< The match cmp(m, e) method. */
214 /* The names for all the hashtable methods. */
215 # define NAME_new _JOIN(NAME, _new)
216 # define NAME_free _JOIN(NAME, _free)
217 # define NAME_stats_init _JOIN(NAME, _stats_init)
218 # define NAME_add _JOIN(NAME, _add)
219 # define NAME_find _JOIN(NAME, _find)
220 # define NAME_iter _JOIN(NAME, _iter)
221 # define NAME_next _JOIN(NAME, _next)
222 
223 /* Modified hash() with/without mix32() and reserving zero for empty buckets. */
224 # ifdef HASHTABLE_NMIX32
225 # define _KEY_HASH(k) nozero(KEY_hash((KEY_t *)k))
226 # else
227 # define _KEY_HASH(k) nozero(mix32(KEY_hash((KEY_t *)k)))
228 # endif
229 
230 /* Loop macro for probing table t for key hash hk, iterating with index i and
231  entry hash h, terminating at an empty bucket. */
232 # define _for_probe(t, hk, i, h) \
233  unsigned const *const ktable = t->ktable;\
234  unsigned const tmask = t->tmask;\
235  unsigned i, s, h;\
236  for (i = hk & tmask, s = 0; (h = ktable[i]); i = (i + ++s) & tmask)
237 
238 /* Conditional macro for incrementing stats counters. */
239 # ifndef HASHTABLE_NSTATS
240 # define _stats_inc(c) (c++)
241 # else
242 # define _stats_inc(c)
243 # endif
244 
245 /** Allocate and initialize a hashtable instance.
246  *
247  * The provided size is used as an indication of the number of entries you wish
248  * to add, but the allocated size will probably be larger depending on the
249  * implementation to enable optimisations or avoid degraded performance. It may
250  * be possible to fill the table beyond the requested size, but performance can
251  * start to degrade badly if it is over filled.
252  *
253  * \param size - The desired minimum size of the hash table.
254  *
255  * \return The initialized hashtable instance or NULL if it failed. */
256 static inline hashtable_t *NAME_new(int size)
257 {
258  return _hashtable_new(size);
259 }
260 
261 /** Destroy and free a hashtable instance.
262  *
263  * This will free the hashtable, but will not free the entries in the
264  * hashtable. If you want to free the entries too, use a hashtable iterator to
265  * free the the entries first.
266  *
267  * \param *t - The hashtable to destroy and free. */
268 static inline void NAME_free(hashtable_t *t)
269 {
270  _hashtable_free(t);
271 }
272 
273 /** Initialize hashtable stats counters.
274  *
275  * This will reset all the stats counters for the hashtable,
276  *
277  * \param *t - The hashtable to initializ stats for. */
278 static inline void NAME_stats_init(hashtable_t *t)
279 {
280 # ifndef HASHTABLE_NSTATS
281  t->find_count = t->match_count = t->hashcmp_count = t->entrycmp_count = 0;
282 # endif
283 }
284 
285 /** Add an entry to a hashtable.
286  *
287  * This doesn't use MATCH_cmp() or do any checks for existing copies or
288  * instances, so it will add duplicates. If you want to avoid adding
289  * duplicates, use NAME_find() to check for existing entries first.
290  *
291  * \param *t - The hashtable to add to.
292  *
293  * \param *e - The entry object to add.
294  *
295  * \return The added entry, or NULL if the table is full. */
296 static inline ENTRY_t *NAME_add(hashtable_t *t, ENTRY_t *e)
297 {
298  unsigned he = _KEY_HASH(e);
299 
300  assert(e != NULL);
301  if (t->count + 1 == t->size)
302  return NULL;
303 # ifndef HASHTABLE_NBLOOM
304  hashtable_setbloom(t, he);
305 # endif
306  _for_probe(t, he, i, h);
307  t->count++;
308  t->ktable[i] = he;
309  return t->etable[i] = e;
310 }
311 
312 /** Find an entry in a hashtable.
313  *
314  * Uses MATCH_cmp() to find the first matching entry in the table in the same
315  * hash() bucket.
316  *
317  * \param *t - The hashtable to search.
318  *
319  * \param *m - The key or match object to search for.
320  *
321  * \return The first found entry, or NULL if nothing was found. */
322 static inline ENTRY_t *NAME_find(hashtable_t *t, MATCH_t *m)
323 {
324  assert(m != NULL);
325  unsigned hm = _KEY_HASH(m);
326  ENTRY_t *e;
327 
328  _stats_inc(t->find_count);
329 # ifndef HASHTABLE_NBLOOM
330  if (!hashtable_getbloom(t, hm))
331  return NULL;
332 # endif
333  _for_probe(t, hm, i, he) {
334  _stats_inc(t->hashcmp_count);
335  if (hm == he) {
336  _stats_inc(t->entrycmp_count);
337  if (!MATCH_cmp(m, e = t->etable[i])) {
338  _stats_inc(t->match_count);
339  return e;
340  }
341  }
342  }
343  /* Also count the compare for the empty bucket. */
344  _stats_inc(t->hashcmp_count);
345  return NULL;
346 }
347 
348 static inline ENTRY_t *NAME_next(hashtable_t *t, int *i);
349 
350 /** Initialize a iteration and return the first entry.
351  *
352  * This works together with NAME_next() for iterating through all entries in a
353  * hashtable.
354  *
355  * Example: \code
356  * for (e = NAME_iter(t, &i); e != NULL; e = NAME_next(t, &i))
357  * ...
358  * \endcode
359  *
360  * \param *t - the hashtable to iterate over.
361  *
362  * \param *i - the int iterator index to initialize.
363  *
364  * \return The first entry or NULL if the hashtable is empty. */
365 static inline ENTRY_t *NAME_iter(hashtable_t *t, int *i)
366 {
367  assert(t != NULL);
368  assert(i != NULL);
369  *i = 0;
370  return NAME_next(t, i);
371 }
372 
373 /** Get the next entry from a hashtable iterator or NULL when finished.
374  *
375  * This works together with NAME_iter() for iterating through all entries in a
376  * hashtable.
377  *
378  * \param *t - the hashtable to iterate over.
379  *
380  * \param *i - the int iterator index to use.
381  *
382  * \return The next entry or NULL if the iterator is finished. */
383 static inline ENTRY_t *NAME_next(hashtable_t *t, int *i)
384 {
385  assert(t != NULL);
386  assert(i != NULL);
387  ENTRY_t *e = NULL;
388 
389  while ((*i < t->size) && !(e = t->etable[(*i)++])) ;
390  return e;
391 }
392 
393 # undef ENTRY
394 # undef KEY
395 # undef MATCH
396 # undef NAME
397 # undef ENTRY_t
398 # undef KEY_t
399 # undef MATCH_t
400 # undef KEY_hash
401 # undef MATCH_cmp
402 # undef NAME_new
403 # undef NAME_free
404 # undef NAME_stats_init
405 # undef NAME_add
406 # undef NAME_find
407 # undef NAME_iter
408 # undef NAME_next
409 # undef _KEY_HASH
410 #endif /* ENTRY */
#define ENTRY_t
The entry type.
Definition: hashtable.h:209
static void NAME_stats_init(hashtable_t *t)
Initialize hashtable stats counters.
Definition: hashtable.h:278
static ENTRY_t * NAME_next(hashtable_t *t, int *i)
Get the next entry from a hashtable iterator or NULL when finished.
Definition: hashtable.h:383
static ENTRY_t * NAME_add(hashtable_t *t, ENTRY_t *e)
Add an entry to a hashtable.
Definition: hashtable.h:296
#define MATCH_cmp
The match cmp(m, e) method.
Definition: hashtable.h:213
struct hashtable hashtable_t
The hashtable type.
#define MATCH_t
The match type.
Definition: hashtable.h:211
static void NAME_free(hashtable_t *t)
Destroy and free a hashtable instance.
Definition: hashtable.h:268
static ENTRY_t * NAME_iter(hashtable_t *t, int *i)
Initialize a iteration and return the first entry.
Definition: hashtable.h:365
static ENTRY_t * NAME_find(hashtable_t *t, MATCH_t *m)
Find an entry in a hashtable.
Definition: hashtable.h:322
static unsigned mix32(unsigned h)
MurmurHash3 finalization mix function.
Definition: hashtable.h:170
static unsigned nozero(unsigned h)
Ensure hash's are never zero.
Definition: hashtable.h:181
static hashtable_t * NAME_new(int size)
Allocate and initialize a hashtable instance.
Definition: hashtable.h:256
The hashtable type.
Definition: hashtable.h:128
long find_count
The count of finds tried.
Definition: hashtable.h:137
unsigned ktable[]
Table of hash keys.
Definition: hashtable.h:146
long match_count
The count of matches found.
Definition: hashtable.h:138
int count
Number of entries in hashtable.
Definition: hashtable.h:130
unsigned char * kbloom
Bloom filter of hash keys with k=1.
Definition: hashtable.h:143
long entrycmp_count
The count of entry compares done.
Definition: hashtable.h:140
int size
Size of allocated hashtable.
Definition: hashtable.h:129
unsigned tmask
Mask to get the hashtable index.
Definition: hashtable.h:131
void ** etable
Table of pointers to entries.
Definition: hashtable.h:145
long hashcmp_count
The count of hash compares done.
Definition: hashtable.h:139
unsigned bshift
Shift to get the bloomfilter index.
Definition: hashtable.h:133