Skip to content

Commit 9b1c2a3

Browse files
committed
Merge branch 'kb/hashmap-updates'
* kb/hashmap-updates:
  hashmap: add string interning API
  hashmap: add simplified hashmap_get_from_hash() API
  hashmap: improve struct hashmap member documentation
  hashmap: factor out getting a hash code from a SHA1
2 parents 0ac7443 + 7b64d42 commit 9b1c2a3

File tree

12 files changed

+159
-61
lines changed

12 files changed

+159
-61
lines changed

Documentation/technical/api-hashmap.txt

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,19 @@ Data Structures
88

99
`struct hashmap`::
1010

11-
The hash table structure.
11+
The hash table structure. Members can be used as follows, but should
12+
not be modified directly (except for `cmpfn` in the advanced scenarios noted below):
1213
+
13-
The `size` member keeps track of the total number of entries. The `cmpfn`
14-
member is a function used to compare two entries for equality. The `table` and
15-
`tablesize` members store the hash table and its size, respectively.
14+
The `size` member keeps track of the total number of entries (0 means the
15+
hashmap is empty).
16+
+
17+
`tablesize` is the allocated size of the hash table. A non-0 value indicates
18+
that the hashmap is initialized. It may also be useful for statistical purposes
19+
(i.e. `size / tablesize` is the current load factor).
20+
+
21+
`cmpfn` stores the comparison function specified in `hashmap_init()`. In
22+
advanced scenarios, it may be useful to change this, e.g. to switch between
23+
case-sensitive and case-insensitive lookup.
1624

1725
`struct hashmap_entry`::
1826

@@ -58,6 +66,15 @@ Functions
5866
+
5967
`strihash` and `memihash` are case insensitive versions.
6068

69+
`unsigned int sha1hash(const unsigned char *sha1)`::
70+
71+
Converts a cryptographic hash (e.g. SHA-1) into an int-sized hash code
72+
for use in hash tables. Cryptographic hashes are supposed to have
73+
uniform distribution, so in contrast to `memhash()`, this just copies
74+
the first `sizeof(int)` bytes without shuffling any bits. Note that
75+
the results will be different on big-endian and little-endian
76+
platforms, so they should not be stored or transferred over the net.
77+
6178
`void hashmap_init(struct hashmap *map, hashmap_cmp_fn equals_function, size_t initial_size)`::
6279

6380
Initializes a hashmap structure.
@@ -101,6 +118,20 @@ hashmap_entry) that has at least been initialized with the proper hash code
101118
If an entry with matching hash code is found, `key` and `keydata` are passed
102119
to `hashmap_cmp_fn` to decide whether the entry matches the key.
103120

121+
`void *hashmap_get_from_hash(const struct hashmap *map, unsigned int hash, const void *keydata)`::
122+
123+
Returns the hashmap entry for the specified hash code and key data,
124+
or NULL if not found.
125+
+
126+
`map` is the hashmap structure.
127+
+
128+
`hash` is the hash code of the entry to look up.
129+
+
130+
If an entry with matching hash code is found, `keydata` is passed to
131+
`hashmap_cmp_fn` to decide whether the entry matches the key. The function's
132+
`entry_or_key` parameter points to a dummy `hashmap_entry` that is initialized
133+
only with the hash code; it is not a real entry and should not be used in the comparison.
134+
104135
`void *hashmap_get_next(const struct hashmap *map, const void *entry)`::
105136

106137
Returns the next equal hashmap entry, or NULL if not found. This can be
@@ -162,6 +193,21 @@ more entries.
162193
`hashmap_iter_first` is a combination of both (i.e. initializes the iterator
163194
and returns the first entry, if any).
164195

196+
`const char *strintern(const char *string)`::
197+
`const void *memintern(const void *data, size_t len)`::
198+
199+
Returns the unique, interned version of the specified string or data,
200+
similar to the `String.intern` API in Java and the `String.Intern` API in .NET.
201+
Interned strings remain valid for the entire lifetime of the process.
202+
+
203+
Can be used as a `[x]strdup()` or `xmemdupz()` replacement, except that interned
204+
strings / data must not be modified or freed.
205+
+
206+
Interned strings are best used for short strings with high probability of
207+
duplicates.
208+
+
209+
Uses a hashmap to store the pool of interned strings.
210+
165211
Usage example
166212
-------------
167213

builtin/describe.c

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -56,18 +56,9 @@ static int commit_name_cmp(const struct commit_name *cn1,
5656
return hashcmp(cn1->peeled, peeled ? peeled : cn2->peeled);
5757
}
5858

59-
static inline unsigned int hash_sha1(const unsigned char *sha1)
60-
{
61-
unsigned int hash;
62-
memcpy(&hash, sha1, sizeof(hash));
63-
return hash;
64-
}
65-
6659
static inline struct commit_name *find_commit_name(const unsigned char *peeled)
6760
{
68-
struct commit_name key;
69-
hashmap_entry_init(&key, hash_sha1(peeled));
70-
return hashmap_get(&names, &key, peeled);
61+
return hashmap_get_from_hash(&names, sha1hash(peeled), peeled);
7162
}
7263

7364
static int replace_name(struct commit_name *e,
@@ -114,7 +105,7 @@ static void add_to_known_names(const char *path,
114105
if (!e) {
115106
e = xmalloc(sizeof(struct commit_name));
116107
hashcpy(e->peeled, peeled);
117-
hashmap_entry_init(e, hash_sha1(peeled));
108+
hashmap_entry_init(e, sha1hash(peeled));
118109
hashmap_add(&names, e);
119110
e->path = NULL;
120111
}

decorate.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,7 @@
88

99
static unsigned int hash_obj(const struct object *obj, unsigned int n)
1010
{
11-
unsigned int hash;
12-
13-
memcpy(&hash, obj->sha1, sizeof(unsigned int));
14-
return hash % n;
11+
return sha1hash(obj->sha1) % n;
1512
}
1613

1714
static void *insert_decoration(struct decoration *n, const struct object *base, void *decoration)

diffcore-rename.c

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -242,14 +242,12 @@ struct file_similarity {
242242

243243
static unsigned int hash_filespec(struct diff_filespec *filespec)
244244
{
245-
unsigned int hash;
246245
if (!filespec->sha1_valid) {
247246
if (diff_populate_filespec(filespec, 0))
248247
return 0;
249248
hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1);
250249
}
251-
memcpy(&hash, filespec->sha1, sizeof(hash));
252-
return hash;
250+
return sha1hash(filespec->sha1);
253251
}
254252

255253
static int find_identical_files(struct hashmap *srcs,
@@ -259,15 +257,14 @@ static int find_identical_files(struct hashmap *srcs,
259257
int renames = 0;
260258

261259
struct diff_filespec *target = rename_dst[dst_index].two;
262-
struct file_similarity *p, *best, dst;
260+
struct file_similarity *p, *best = NULL;
263261
int i = 100, best_score = -1;
264262

265263
/*
266264
* Find the best source match for specified destination.
267265
*/
268-
best = NULL;
269-
hashmap_entry_init(&dst, hash_filespec(target));
270-
for (p = hashmap_get(srcs, &dst, NULL); p; p = hashmap_get_next(srcs, p)) {
266+
p = hashmap_get_from_hash(srcs, hash_filespec(target), NULL);
267+
for (; p; p = hashmap_get_next(srcs, p)) {
271268
int score;
272269
struct diff_filespec *source = p->filespec;
273270

hashmap.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
226226
current = iter->map->table[iter->tablepos++];
227227
}
228228
}
229+
230+
struct pool_entry {
231+
struct hashmap_entry ent;
232+
size_t len;
233+
unsigned char data[FLEX_ARRAY];
234+
};
235+
236+
static int pool_entry_cmp(const struct pool_entry *e1,
237+
const struct pool_entry *e2,
238+
const unsigned char *keydata)
239+
{
240+
return e1->data != keydata &&
241+
(e1->len != e2->len || memcmp(e1->data, keydata, e1->len));
242+
}
243+
244+
const void *memintern(const void *data, size_t len)
245+
{
246+
static struct hashmap map;
247+
struct pool_entry key, *e;
248+
249+
/* initialize string pool hashmap */
250+
if (!map.tablesize)
251+
hashmap_init(&map, (hashmap_cmp_fn) pool_entry_cmp, 0);
252+
253+
/* lookup interned string in pool */
254+
hashmap_entry_init(&key, memhash(data, len));
255+
key.len = len;
256+
e = hashmap_get(&map, &key, data);
257+
if (!e) {
258+
/* not found: create it */
259+
e = xmallocz(sizeof(struct pool_entry) + len);
260+
hashmap_entry_init(e, key.ent.hash);
261+
e->len = len;
262+
memcpy(e->data, data, len);
263+
hashmap_add(&map, e);
264+
}
265+
return e->data;
266+
}

hashmap.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,17 @@ extern unsigned int strihash(const char *buf);
1313
extern unsigned int memhash(const void *buf, size_t len);
1414
extern unsigned int memihash(const void *buf, size_t len);
1515

16+
static inline unsigned int sha1hash(const unsigned char *sha1)
17+
{
18+
/*
19+
* Equivalent to 'return *(unsigned int *)sha1;', but safe on
20+
* platforms that don't support unaligned reads.
21+
*/
22+
unsigned int hash;
23+
memcpy(&hash, sha1, sizeof(hash));
24+
return hash;
25+
}
26+
1627
/* data structures */
1728

1829
struct hashmap_entry {
@@ -57,6 +68,14 @@ extern void *hashmap_put(struct hashmap *map, void *entry);
5768
extern void *hashmap_remove(struct hashmap *map, const void *key,
5869
const void *keydata);
5970

71+
static inline void *hashmap_get_from_hash(const struct hashmap *map,
72+
unsigned int hash, const void *keydata)
73+
{
74+
struct hashmap_entry key;
75+
hashmap_entry_init(&key, hash);
76+
return hashmap_get(map, &key, keydata);
77+
}
78+
6079
/* hashmap_iter functions */
6180

6281
extern void hashmap_iter_init(struct hashmap *map, struct hashmap_iter *iter);
@@ -68,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
6887
return hashmap_iter_next(iter);
6988
}
7089

90+
/* string interning */
91+
92+
extern const void *memintern(const void *data, size_t len);
93+
static inline const char *strintern(const char *string)
94+
{
95+
return memintern(string, strlen(string));
96+
}
97+
7198
#endif

khash.h

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -320,19 +320,12 @@ static const double __ac_HASH_UPPER = 0.77;
320320
code; \
321321
} }
322322

323-
static inline khint_t __kh_oid_hash(const unsigned char *oid)
324-
{
325-
khint_t hash;
326-
memcpy(&hash, oid, sizeof(hash));
327-
return hash;
328-
}
329-
330323
#define __kh_oid_cmp(a, b) (hashcmp(a, b) == 0)
331324

332-
KHASH_INIT(sha1, const unsigned char *, void *, 1, __kh_oid_hash, __kh_oid_cmp)
325+
KHASH_INIT(sha1, const unsigned char *, void *, 1, sha1hash, __kh_oid_cmp)
333326
typedef kh_sha1_t khash_sha1;
334327

335-
KHASH_INIT(sha1_pos, const unsigned char *, int, 1, __kh_oid_hash, __kh_oid_cmp)
328+
KHASH_INIT(sha1_pos, const unsigned char *, int, 1, sha1hash, __kh_oid_cmp)
336329
typedef kh_sha1_pos_t khash_sha1_pos;
337330

338331
#endif /* __AC_KHASH_H */

name-hash.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,11 @@ struct cache_entry *index_dir_exists(struct index_state *istate, const char *nam
213213
struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int icase)
214214
{
215215
struct cache_entry *ce;
216-
struct hashmap_entry key;
217216

218217
lazy_init_name_hash(istate);
219218

220-
hashmap_entry_init(&key, memihash(name, namelen));
221-
ce = hashmap_get(&istate->name_hash, &key, NULL);
219+
ce = hashmap_get_from_hash(&istate->name_hash,
220+
memihash(name, namelen), NULL);
222221
while (ce) {
223222
if (same_name(ce, name, namelen, icase))
224223
return ce;

object.c

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -50,18 +50,7 @@ int type_from_string(const char *str)
5050
*/
5151
static unsigned int hash_obj(const unsigned char *sha1, unsigned int n)
5252
{
53-
unsigned int hash;
54-
55-
/*
56-
* Since the sha1 is essentially random, we just take the
57-
* required number of bits directly from the first
58-
* sizeof(unsigned int) bytes of sha1. First we have to copy
59-
* the bytes into a properly aligned integer. If we cared
60-
* about getting consistent results across architectures, we
61-
* would have to call ntohl() here, too.
62-
*/
63-
memcpy(&hash, sha1, sizeof(unsigned int));
64-
return hash & (n - 1);
53+
return sha1hash(sha1) & (n - 1);
6554
}
6655

6756
/*

pack-objects.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,9 @@ static uint32_t locate_object_entry_hash(struct packing_data *pdata,
77
const unsigned char *sha1,
88
int *found)
99
{
10-
uint32_t i, hash, mask = (pdata->index_size - 1);
10+
uint32_t i, mask = (pdata->index_size - 1);
1111

12-
memcpy(&hash, sha1, sizeof(uint32_t));
13-
i = hash & mask;
12+
i = sha1hash(sha1) & mask;
1413

1514
while (pdata->index[i] > 0) {
1615
uint32_t pos = pdata->index[i] - 1;

0 commit comments

Comments
 (0)