Skip to content

Commit c5a4fc1

Browse files
Merge pull request #421 from sidd-27/opti-hashtable
Reduced QuickDecode memory consumption by 128,884x
2 parents 31b29af + a00f7c3 commit c5a4fc1

File tree

3 files changed

+278
-117
lines changed

3 files changed

+278
-117
lines changed

apriltag.c

Lines changed: 134 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -117,19 +117,13 @@ static double graymodel_interpolate(struct graymodel *gm, double x, double y)
117117
return gm->C[0]*x + gm->C[1]*y + gm->C[2];
118118
}
119119

120-
struct quick_decode_entry
120+
static inline int popcount64(uint64_t x)
121121
{
122-
uint64_t rcode; // the queried code
123-
uint16_t id; // the tag ID (a small integer)
124-
uint8_t hamming; // how many errors corrected?
125-
uint8_t rotation; // number of rotations [0, 3]
126-
};
127-
128-
struct quick_decode
129-
{
130-
int nentries;
131-
struct quick_decode_entry *entries;
132-
};
122+
x -= (x >> 1) & 0x5555555555555555ULL;
123+
x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
124+
x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
125+
return (x * 0x0101010101010101ULL) >> 56;
126+
}
133127

134128
/**
135129
* Assuming we are drawing the image one quadrant at a time, what would the rotated image look like?
@@ -169,26 +163,45 @@ static struct quad *quad_copy(struct quad *quad)
169163
return q;
170164
}
171165

172-
static void quick_decode_add(struct quick_decode *qd, uint64_t code, int id, int hamming)
166+
struct quick_decode_result
173167
{
174-
uint32_t bucket = code % qd->nentries;
168+
uint64_t rcode; // the queried code
169+
uint16_t id; // the tag ID (a small integer)
170+
uint8_t hamming; // how many errors corrected?
171+
uint8_t rotation; // number of rotations [0, 3]
172+
};
175173

176-
while (qd->entries[bucket].rcode != UINT64_MAX) {
177-
bucket = (bucket + 1) % qd->nentries;
178-
}
174+
#define NUM_CHUNKS 4
179175

180-
qd->entries[bucket].rcode = code;
181-
qd->entries[bucket].id = id;
182-
qd->entries[bucket].hamming = hamming;
183-
}
176+
struct quick_decode
177+
{
178+
int nbits;
179+
int chunk_size;
180+
int capacity;
181+
int chunk_mask;
182+
int shifts[NUM_CHUNKS];
183+
184+
// chunk_offsets is a map from chunk value to a range of locations in chunk_ids.
185+
// Together with chunk_ids, this allows a lookup of all codes matching a chunk value.
186+
uint16_t* chunk_offsets[NUM_CHUNKS];
187+
188+
// chunk_ids is an array of indices into the codes table
189+
uint16_t* chunk_ids[NUM_CHUNKS];
190+
191+
int maxhamming;
192+
int ncodes;
193+
};
184194

185195
static void quick_decode_uninit(apriltag_family_t *fam)
186196
{
187197
if (!fam->impl)
188198
return;
189199

190200
struct quick_decode *qd = (struct quick_decode*) fam->impl;
191-
free(qd->entries);
201+
for (int i = 0; i < NUM_CHUNKS; i++) {
202+
free(qd->chunk_offsets[i]);
203+
free(qd->chunk_ids[i]);
204+
}
192205
free(qd);
193206
fam->impl = NULL;
194207
}
@@ -198,126 +211,130 @@ static void quick_decode_init(apriltag_family_t *family, int maxhamming)
198211
assert(family->impl == NULL);
199212
assert(family->ncodes < 65536);
200213

201-
struct quick_decode *qd = calloc(1, sizeof(struct quick_decode));
202-
int capacity = family->ncodes;
203-
204-
int nbits = family->nbits;
205-
206-
if (maxhamming >= 1)
207-
capacity += family->ncodes * nbits;
208-
209-
if (maxhamming >= 2)
210-
capacity += family->ncodes * (nbits * (nbits-1)) / 2;
211-
212-
if (maxhamming >= 3)
213-
capacity += family->ncodes * nbits * ((nbits-1) * (nbits-2)) / 6;
214-
215-
qd->nentries = capacity * 3;
216-
217-
// debug_print("capacity %d, size: %.0f kB\n",
218-
// capacity, qd->nentries * sizeof(struct quick_decode_entry) / 1024.0);
214+
if (maxhamming > 3) {
215+
debug_print("\"maxhamming\" beyond 3 not supported\n");
216+
errno = EINVAL;
217+
return;
218+
}
219219

220-
qd->entries = calloc(qd->nentries, sizeof(struct quick_decode_entry));
221-
if (qd->entries == NULL) {
222-
debug_print("Failed to allocate hamming decode table\n");
223-
// errno already set to ENOMEM (Error No MEMory) by calloc() failure
220+
struct quick_decode *qd = calloc(1, sizeof(struct quick_decode));
221+
if (!qd) {
222+
debug_print("Memory allocation failed\n");
224223
return;
225224
}
225+
family->impl = qd;
226226

227-
for (int i = 0; i < qd->nentries; i++)
228-
qd->entries[i].rcode = UINT64_MAX;
227+
qd->maxhamming = maxhamming;
228+
qd->ncodes = family->ncodes;
229+
qd->nbits = family->nbits;
229230

230-
errno = 0;
231+
qd->chunk_size = (qd->nbits + (NUM_CHUNKS - 1)) / NUM_CHUNKS;
232+
qd->capacity = 1 << qd->chunk_size;
233+
qd->chunk_mask = qd->capacity - 1;
231234

232-
for (uint32_t i = 0; i < family->ncodes; i++) {
233-
uint64_t code = family->codes[i];
235+
for (int i = 0; i < NUM_CHUNKS; i++) {
236+
qd->shifts[i] = i * qd->chunk_size;
237+
}
234238

235-
// add exact code (hamming = 0)
236-
quick_decode_add(qd, code, i, 0);
239+
for (int i = 0; i < NUM_CHUNKS; i++) {
240+
qd->chunk_offsets[i] = calloc(qd->capacity + 1, sizeof(uint16_t));
241+
if (!qd->chunk_offsets[i]) {
242+
debug_print("Memory allocation failed\n");
243+
goto fail;
244+
}
245+
qd->chunk_ids[i] = calloc(qd->ncodes, sizeof(uint16_t));
246+
if (!qd->chunk_ids[i]) {
247+
debug_print("Memory allocation failed\n");
248+
goto fail;
249+
}
250+
}
237251

238-
if (maxhamming >= 1) {
239-
// add hamming 1
240-
for (int j = 0; j < nbits; j++)
241-
quick_decode_add(qd, code ^ (APRILTAG_U64_ONE << j), i, 1);
252+
// Count frequencies
253+
for (int i = 0; i < qd->ncodes; i++) {
254+
uint64_t code = family->codes[i];
255+
for (int j = 0; j < NUM_CHUNKS; j++) {
256+
int val = (code >> qd->shifts[j]) & qd->chunk_mask;
257+
qd->chunk_offsets[j][val + 1]++;
242258
}
259+
}
243260

244-
if (maxhamming >= 2) {
245-
// add hamming 2
246-
for (int j = 0; j < nbits; j++)
247-
for (int k = 0; k < j; k++)
248-
quick_decode_add(qd, code ^ (APRILTAG_U64_ONE << j) ^ (APRILTAG_U64_ONE << k), i, 2);
261+
// Prefix sum
262+
for (int i = 0; i < NUM_CHUNKS; i++) {
263+
for (int j = 0; j < qd->capacity; j++) {
264+
qd->chunk_offsets[i][j + 1] += qd->chunk_offsets[i][j];
249265
}
266+
}
250267

251-
if (maxhamming >= 3) {
252-
// add hamming 3
253-
for (int j = 0; j < nbits; j++)
254-
for (int k = 0; k < j; k++)
255-
for (int m = 0; m < k; m++)
256-
quick_decode_add(qd, code ^ (APRILTAG_U64_ONE << j) ^ (APRILTAG_U64_ONE << k) ^ (APRILTAG_U64_ONE << m), i, 3);
268+
// Populate ids
269+
uint16_t *cursors[NUM_CHUNKS];
270+
memset(cursors, 0, sizeof(cursors));
271+
for (int i = 0; i < NUM_CHUNKS; i++) {
272+
cursors[i] = malloc((qd->capacity + 1) * sizeof(uint16_t));
273+
if (cursors[i] == NULL) {
274+
debug_print("Memory allocation failed\n");
275+
for (int j = 0; j < NUM_CHUNKS; j++)
276+
free(cursors[j]);
277+
goto fail;
257278
}
279+
memcpy(cursors[i], qd->chunk_offsets[i], (qd->capacity + 1) * sizeof(uint16_t));
280+
}
258281

259-
if (maxhamming > 3) {
260-
debug_print("\"maxhamming\" beyond 3 not supported\n");
261-
// set errno to Error INvalid VALue
262-
errno = EINVAL;
263-
return;
282+
for (int i = 0; i < qd->ncodes; i++) {
283+
uint64_t code = family->codes[i];
284+
for (int j = 0; j < NUM_CHUNKS; j++) {
285+
int val = (code >> qd->shifts[j]) & qd->chunk_mask;
286+
int write_pos = cursors[j][val];
287+
qd->chunk_ids[j][write_pos] = i;
288+
cursors[j][val]++;
264289
}
265290
}
266291

267-
family->impl = qd;
292+
for (int i = 0; i < NUM_CHUNKS; i++) {
293+
free(cursors[i]);
294+
}
268295

269-
#if 0
270-
int longest_run = 0;
271-
int run = 0;
272-
int run_sum = 0;
273-
int run_count = 0;
274-
275-
// This accounting code doesn't check the last possible run that
276-
// occurs at the wrap-around. That's pretty insignificant.
277-
for (int i = 0; i < qd->nentries; i++) {
278-
if (qd->entries[i].rcode == UINT64_MAX) {
279-
if (run > 0) {
280-
run_sum += run;
281-
run_count ++;
282-
}
283-
run = 0;
284-
} else {
285-
run ++;
286-
longest_run = imax(longest_run, run);
287-
}
288-
}
296+
return;
289297

290-
printf("quick decode: longest run: %d, average run %.3f\n", longest_run, 1.0 * run_sum / run_count);
291-
#endif
298+
fail:
299+
quick_decode_uninit(family);
292300
}
293301

294-
// returns an entry with hamming set to 255 if no decode was found.
302+
// returns a result with hamming set to 255 if no decode was found.
295303
static void quick_decode_codeword(apriltag_family_t *tf, uint64_t rcode,
296-
struct quick_decode_entry *entry)
304+
struct quick_decode_result *res)
297305
{
298306
struct quick_decode *qd = (struct quick_decode*) tf->impl;
299307

300308
// qd might be null if detector_add_family_bits() failed
301309
for (int ridx = 0; qd != NULL && ridx < 4; ridx++) {
302310

303-
for (int bucket = rcode % qd->nentries;
304-
qd->entries[bucket].rcode != UINT64_MAX;
305-
bucket = (bucket + 1) % qd->nentries) {
306-
307-
if (qd->entries[bucket].rcode == rcode) {
308-
*entry = qd->entries[bucket];
309-
entry->rotation = ridx;
310-
return;
311+
for (int i = 0; i < NUM_CHUNKS; i++) {
312+
int val = (rcode >> qd->shifts[i]) & qd->chunk_mask;
313+
int start = qd->chunk_offsets[i][val];
314+
int end = qd->chunk_offsets[i][val + 1];
315+
316+
for (int j = start; j < end; j++) {
317+
uint16_t id = qd->chunk_ids[i][j];
318+
uint64_t correct_code = tf->codes[id];
319+
int hamming = popcount64(correct_code ^ rcode);
320+
321+
if (hamming <= qd->maxhamming) {
322+
res->rcode = rcode;
323+
res->id = id;
324+
res->hamming = hamming;
325+
res->rotation = ridx;
326+
return;
327+
}
311328
}
312329
}
313330

314331
rcode = rotate90(rcode, tf->nbits);
315332
}
316333

317-
entry->rcode = 0;
318-
entry->id = 65535;
319-
entry->hamming = 255;
320-
entry->rotation = 0;
334+
res->rcode = 0;
335+
res->id = 65535;
336+
res->hamming = 255;
337+
res->rotation = 0;
321338
}
322339

323340
static inline int detection_compare_function(const void *_a, const void *_b)
@@ -416,7 +433,7 @@ struct evaluate_quad_ret
416433
matd_t *H, *Hinv;
417434

418435
int decode_status;
419-
struct quick_decode_entry e;
436+
struct quick_decode_result res;
420437
};
421438

422439
static matd_t* homography_compute2(double c[4][4]) {
@@ -567,7 +584,7 @@ static void sharpen(apriltag_detector_t* td, double* values, int size) {
567584
}
568585

569586
// returns the decision margin. Return < 0 if the detection should be rejected.
570-
static float quad_decode(apriltag_detector_t* td, apriltag_family_t *family, image_u8_t *im, struct quad *quad, struct quick_decode_entry *entry, image_u8_t *im_samples)
587+
static float quad_decode(apriltag_detector_t* td, apriltag_family_t *family, image_u8_t *im, struct quad *quad, struct quick_decode_result *res, image_u8_t *im_samples)
571588
{
572589
// decode the tag binary contents by sampling the pixel
573590
// closest to the center of each bit cell.
@@ -736,7 +753,7 @@ static float quad_decode(apriltag_detector_t* td, apriltag_family_t *family, ima
736753
}
737754
}
738755

739-
quick_decode_codeword(family, rcode, entry);
756+
quick_decode_codeword(family, rcode, res);
740757
free(values);
741758
return fmin(white_score / white_score_count, black_score / black_score_count);
742759
}
@@ -949,19 +966,19 @@ static void quad_decode_task(void *_u)
949966
// optimization process over with the original quad.
950967
struct quad *quad = quad_copy(quad_original);
951968

952-
struct quick_decode_entry entry;
969+
struct quick_decode_result res;
953970

954-
float decision_margin = quad_decode(td, family, im, quad, &entry, task->im_samples);
971+
float decision_margin = quad_decode(td, family, im, quad, &res, task->im_samples);
955972

956-
if (decision_margin >= 0 && entry.hamming < 255) {
973+
if (decision_margin >= 0 && res.hamming < 255) {
957974
apriltag_detection_t *det = calloc(1, sizeof(apriltag_detection_t));
958975

959976
det->family = family;
960-
det->id = entry.id;
961-
det->hamming = entry.hamming;
977+
det->id = res.id;
978+
det->hamming = res.hamming;
962979
det->decision_margin = decision_margin;
963980

964-
double theta = entry.rotation * M_PI / 2.0;
981+
double theta = res.rotation * M_PI / 2.0;
965982
double c = cos(theta), s = sin(theta);
966983

967984
// Fix the rotation of our homography to properly orient the tag

test/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,21 @@ foreach(IMG IN LISTS TEST_IMAGE_NAMES)
3030
)
3131
endforeach()
3232

33+
# Quick decode test
34+
file(GLOB COMMON_SRC "${CMAKE_SOURCE_DIR}/common/*.c")
35+
file(GLOB TAG_FILES "${CMAKE_SOURCE_DIR}/tag*.c")
36+
37+
add_executable(test_quick_decode test_quick_decode.c "${CMAKE_SOURCE_DIR}/apriltag_quad_thresh.c" ${COMMON_SRC} ${TAG_FILES})
38+
target_include_directories(test_quick_decode PRIVATE "${CMAKE_SOURCE_DIR}")
39+
40+
if (UNIX)
41+
target_link_libraries(test_quick_decode m)
42+
endif()
43+
44+
if(NOT MSVC)
45+
find_package(Threads REQUIRED)
46+
target_link_libraries(test_quick_decode Threads::Threads)
47+
endif()
48+
49+
add_test(NAME test_quick_decode COMMAND test_quick_decode)
50+

0 commit comments

Comments
 (0)