Skip to content

Commit 29c2bd5

Browse files
peff authored and gitster committed
add oidset API
This is similar to many of our uses of sha1-array, but it overcomes one limitation of a sha1-array: when you are de-duplicating a large input with relatively few unique entries, sha1-array uses 20 bytes per non-unique entry, whereas this set uses memory linear in the number of unique entries (albeit a few more than 20 bytes per entry due to hashmap overhead). Signed-off-by: Jeff King <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 41a078c commit 29c2bd5

File tree

3 files changed

+95
-0
lines changed

3 files changed

+95
-0
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,7 @@ LIB_OBJS += notes-cache.o
781781
LIB_OBJS += notes-merge.o
782782
LIB_OBJS += notes-utils.o
783783
LIB_OBJS += object.o
784+
LIB_OBJS += oidset.o
784785
LIB_OBJS += pack-bitmap.o
785786
LIB_OBJS += pack-bitmap-write.o
786787
LIB_OBJS += pack-check.o

oidset.c

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#include "cache.h"
2+
#include "oidset.h"
3+
4+
/*
 * One element stored in an oidset's hashmap.
 *
 * oidset_insert() allocates one of these per unique object id, seeds
 * `hash` from sha1hash() of the id, and stores its own copy of the id
 * in `oid`.
 *
 * NOTE(review): whole entries are handed to hashmap_add(), so `hash`
 * presumably has to stay the first member -- confirm against hashmap.h.
 */
struct oidset_entry {
5+
struct hashmap_entry hash;
6+
struct object_id oid;
7+
};
8+
9+
static int oidset_hashcmp(const void *va, const void *vb,
10+
const void *vkey)
11+
{
12+
const struct oidset_entry *a = va, *b = vb;
13+
const struct object_id *key = vkey;
14+
return oidcmp(&a->oid, key ? key : &b->oid);
15+
}
16+
17+
int oidset_contains(const struct oidset *set, const struct object_id *oid)
18+
{
19+
struct hashmap_entry key;
20+
21+
if (!set->map.cmpfn)
22+
return 0;
23+
24+
hashmap_entry_init(&key, sha1hash(oid->hash));
25+
return !!hashmap_get(&set->map, &key, oid);
26+
}
27+
28+
int oidset_insert(struct oidset *set, const struct object_id *oid)
29+
{
30+
struct oidset_entry *entry;
31+
32+
if (!set->map.cmpfn)
33+
hashmap_init(&set->map, oidset_hashcmp, 0);
34+
35+
if (oidset_contains(set, oid))
36+
return 1;
37+
38+
entry = xmalloc(sizeof(*entry));
39+
hashmap_entry_init(&entry->hash, sha1hash(oid->hash));
40+
oidcpy(&entry->oid, oid);
41+
42+
hashmap_add(&set->map, entry);
43+
return 0;
44+
}
45+
46+
/*
 * Empty `set` by handing its map to hashmap_free().
 *
 * NOTE(review): the second argument (1) presumably tells hashmap_free()
 * to also free the entries that oidset_insert() allocated -- confirm
 * against hashmap.h.
 */
void oidset_clear(struct oidset *set)
47+
{
48+
hashmap_free(&set->map, 1);
49+
}

oidset.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#ifndef OIDSET_H
2+
#define OIDSET_H
3+
4+
/**
5+
* This API is similar to sha1-array, in that it maintains a set of object ids
6+
* in a memory-efficient way. The major differences are:
7+
*
8+
* 1. It uses a hash, so we can do online duplicate removal, rather than
9+
* sort-and-uniq at the end. This can reduce memory footprint if you have
10+
* a large list of oids with many duplicates.
11+
*
12+
* 2. The per-unique-oid memory footprint is slightly higher due to hash
13+
* table overhead.
14+
*/
15+
16+
/**
17+
* A single oidset; should be zero-initialized (or use OIDSET_INIT).
18+
*/
19+
struct oidset {
20+
/*
 * Backing hash table. oidset.c treats a NULL map.cmpfn as "not yet
 * initialized": oidset_contains() then reports 0 and oidset_insert()
 * calls hashmap_init() before adding anything.
 */
struct hashmap map;
21+
};
22+
23+
#define OIDSET_INIT { { NULL } }
24+
25+
/**
26+
* Returns true iff `set` contains `oid`.
27+
*/
28+
int oidset_contains(const struct oidset *set, const struct object_id *oid);
29+
30+
/**
31+
* Insert the oid into the set; a copy is made, so "oid" does not need
32+
* to persist after this function is called.
33+
*
34+
* Returns 1 if the oid was already in the set, 0 otherwise. This can be used
35+
* to perform an efficient check-and-add.
36+
*/
37+
int oidset_insert(struct oidset *set, const struct object_id *oid);
38+
39+
/**
40+
* Remove all entries from the oidset, freeing any resources associated with
41+
* it.
42+
*/
43+
void oidset_clear(struct oidset *set);
44+
45+
#endif /* OIDSET_H */

0 commit comments

Comments
 (0)