@@ -117,19 +117,13 @@ static double graymodel_interpolate(struct graymodel *gm, double x, double y)
117117 return gm -> C [0 ]* x + gm -> C [1 ]* y + gm -> C [2 ];
118118}
119119
120- struct quick_decode_entry
120+ static inline int popcount64 ( uint64_t x )
121121{
122- uint64_t rcode ; // the queried code
123- uint16_t id ; // the tag ID (a small integer)
124- uint8_t hamming ; // how many errors corrected?
125- uint8_t rotation ; // number of rotations [0, 3]
126- };
127-
128- struct quick_decode
129- {
130- int nentries ;
131- struct quick_decode_entry * entries ;
132- };
122+ x -= (x >> 1 ) & 0x5555555555555555ULL ;
123+ x = (x & 0x3333333333333333ULL ) + ((x >> 2 ) & 0x3333333333333333ULL );
124+ x = (x + (x >> 4 )) & 0x0f0f0f0f0f0f0f0fULL ;
125+ return (x * 0x0101010101010101ULL ) >> 56 ;
126+ }
133127
134128/**
135129 * Assuming we are drawing the image one quadrant at a time, what would the rotated image look like?
@@ -169,26 +163,45 @@ static struct quad *quad_copy(struct quad *quad)
169163 return q ;
170164}
171165
172- static void quick_decode_add ( struct quick_decode * qd , uint64_t code , int id , int hamming )
166+ struct quick_decode_result
173167{
174- uint32_t bucket = code % qd -> nentries ;
168+ uint64_t rcode ; // the queried code
169+ uint16_t id ; // the tag ID (a small integer)
170+ uint8_t hamming ; // how many errors corrected?
171+ uint8_t rotation ; // number of rotations [0, 3]
172+ };
175173
176- while (qd -> entries [bucket ].rcode != UINT64_MAX ) {
177- bucket = (bucket + 1 ) % qd -> nentries ;
178- }
174+ #define NUM_CHUNKS 4
179175
180- qd -> entries [bucket ].rcode = code ;
181- qd -> entries [bucket ].id = id ;
182- qd -> entries [bucket ].hamming = hamming ;
183- }
176+ struct quick_decode
177+ {
178+ int nbits ;
179+ int chunk_size ;
180+ int capacity ;
181+ int chunk_mask ;
182+ int shifts [NUM_CHUNKS ];
183+
184+ // chunk_offsets is a map from chunk value to a range of locations in chunk_ids.
185+ // Together with chunk_ids, this allows a lookup of all codes matching a chunk value.
186+ uint16_t * chunk_offsets [NUM_CHUNKS ];
187+
188+ // chunk_ids is an array of indices into the codes table
189+ uint16_t * chunk_ids [NUM_CHUNKS ];
190+
191+ int maxhamming ;
192+ int ncodes ;
193+ };
184194
185195static void quick_decode_uninit (apriltag_family_t * fam )
186196{
187197 if (!fam -> impl )
188198 return ;
189199
190200 struct quick_decode * qd = (struct quick_decode * ) fam -> impl ;
191- free (qd -> entries );
201+ for (int i = 0 ; i < NUM_CHUNKS ; i ++ ) {
202+ free (qd -> chunk_offsets [i ]);
203+ free (qd -> chunk_ids [i ]);
204+ }
192205 free (qd );
193206 fam -> impl = NULL ;
194207}
@@ -198,126 +211,130 @@ static void quick_decode_init(apriltag_family_t *family, int maxhamming)
198211 assert (family -> impl == NULL );
199212 assert (family -> ncodes < 65536 );
200213
201- struct quick_decode * qd = calloc (1 , sizeof (struct quick_decode ));
202- int capacity = family -> ncodes ;
203-
204- int nbits = family -> nbits ;
205-
206- if (maxhamming >= 1 )
207- capacity += family -> ncodes * nbits ;
208-
209- if (maxhamming >= 2 )
210- capacity += family -> ncodes * (nbits * (nbits - 1 )) / 2 ;
211-
212- if (maxhamming >= 3 )
213- capacity += family -> ncodes * nbits * ((nbits - 1 ) * (nbits - 2 )) / 6 ;
214-
215- qd -> nentries = capacity * 3 ;
216-
217- // debug_print("capacity %d, size: %.0f kB\n",
218- // capacity, qd->nentries * sizeof(struct quick_decode_entry) / 1024.0);
214+ if (maxhamming > 3 ) {
215+ debug_print ("\"maxhamming\" beyond 3 not supported\n" );
216+ errno = EINVAL ;
217+ return ;
218+ }
219219
220- qd -> entries = calloc (qd -> nentries , sizeof (struct quick_decode_entry ));
221- if (qd -> entries == NULL ) {
222- debug_print ("Failed to allocate hamming decode table\n" );
223- // errno already set to ENOMEM (Error No MEMory) by calloc() failure
220+ struct quick_decode * qd = calloc (1 , sizeof (struct quick_decode ));
221+ if (!qd ) {
222+ debug_print ("Memory allocation failed\n" );
224223 return ;
225224 }
225+ family -> impl = qd ;
226226
227- for (int i = 0 ; i < qd -> nentries ; i ++ )
228- qd -> entries [i ].rcode = UINT64_MAX ;
227+ qd -> maxhamming = maxhamming ;
228+ qd -> ncodes = family -> ncodes ;
229+ qd -> nbits = family -> nbits ;
229230
230- errno = 0 ;
231+ qd -> chunk_size = (qd -> nbits + (NUM_CHUNKS - 1 )) / NUM_CHUNKS ;
232+ qd -> capacity = 1 << qd -> chunk_size ;
233+ qd -> chunk_mask = qd -> capacity - 1 ;
231234
232- for (uint32_t i = 0 ; i < family -> ncodes ; i ++ ) {
233- uint64_t code = family -> codes [i ];
235+ for (int i = 0 ; i < NUM_CHUNKS ; i ++ ) {
236+ qd -> shifts [i ] = i * qd -> chunk_size ;
237+ }
234238
235- // add exact code (hamming = 0)
236- quick_decode_add (qd , code , i , 0 );
239+ for (int i = 0 ; i < NUM_CHUNKS ; i ++ ) {
240+ qd -> chunk_offsets [i ] = calloc (qd -> capacity + 1 , sizeof (uint16_t ));
241+ if (!qd -> chunk_offsets [i ]) {
242+ debug_print ("Memory allocation failed\n" );
243+ goto fail ;
244+ }
245+ qd -> chunk_ids [i ] = calloc (qd -> ncodes , sizeof (uint16_t ));
246+ if (!qd -> chunk_ids [i ]) {
247+ debug_print ("Memory allocation failed\n" );
248+ goto fail ;
249+ }
250+ }
237251
238- if (maxhamming >= 1 ) {
239- // add hamming 1
240- for (int j = 0 ; j < nbits ; j ++ )
241- quick_decode_add (qd , code ^ (APRILTAG_U64_ONE << j ), i , 1 );
252+ // Count frequencies
253+ for (int i = 0 ; i < qd -> ncodes ; i ++ ) {
254+ uint64_t code = family -> codes [i ];
255+ for (int j = 0 ; j < NUM_CHUNKS ; j ++ ) {
256+ int val = (code >> qd -> shifts [j ]) & qd -> chunk_mask ;
257+ qd -> chunk_offsets [j ][val + 1 ]++ ;
242258 }
259+ }
243260
244- if (maxhamming >= 2 ) {
245- // add hamming 2
246- for (int j = 0 ; j < nbits ; j ++ )
247- for (int k = 0 ; k < j ; k ++ )
248- quick_decode_add (qd , code ^ (APRILTAG_U64_ONE << j ) ^ (APRILTAG_U64_ONE << k ), i , 2 );
261+ // Prefix sum
262+ for (int i = 0 ; i < NUM_CHUNKS ; i ++ ) {
263+ for (int j = 0 ; j < qd -> capacity ; j ++ ) {
264+ qd -> chunk_offsets [i ][j + 1 ] += qd -> chunk_offsets [i ][j ];
249265 }
266+ }
250267
251- if (maxhamming >= 3 ) {
252- // add hamming 3
253- for (int j = 0 ; j < nbits ; j ++ )
254- for (int k = 0 ; k < j ; k ++ )
255- for (int m = 0 ; m < k ; m ++ )
256- quick_decode_add (qd , code ^ (APRILTAG_U64_ONE << j ) ^ (APRILTAG_U64_ONE << k ) ^ (APRILTAG_U64_ONE << m ), i , 3 );
268+ // Populate ids
269+ uint16_t * cursors [NUM_CHUNKS ];
270+ memset (cursors , 0 , sizeof (cursors ));
271+ for (int i = 0 ; i < NUM_CHUNKS ; i ++ ) {
272+ cursors [i ] = malloc ((qd -> capacity + 1 ) * sizeof (uint16_t ));
273+ if (cursors [i ] == NULL ) {
274+ debug_print ("Memory allocation failed\n" );
275+ for (int j = 0 ; j < NUM_CHUNKS ; j ++ )
276+ free (cursors [j ]);
277+ goto fail ;
257278 }
279+ memcpy (cursors [i ], qd -> chunk_offsets [i ], (qd -> capacity + 1 ) * sizeof (uint16_t ));
280+ }
258281
259- if (maxhamming > 3 ) {
260- debug_print ("\"maxhamming\" beyond 3 not supported\n" );
261- // set errno to Error INvalid VALue
262- errno = EINVAL ;
263- return ;
282+ for (int i = 0 ; i < qd -> ncodes ; i ++ ) {
283+ uint64_t code = family -> codes [i ];
284+ for (int j = 0 ; j < NUM_CHUNKS ; j ++ ) {
285+ int val = (code >> qd -> shifts [j ]) & qd -> chunk_mask ;
286+ int write_pos = cursors [j ][val ];
287+ qd -> chunk_ids [j ][write_pos ] = i ;
288+ cursors [j ][val ]++ ;
264289 }
265290 }
266291
267- family -> impl = qd ;
292+ for (int i = 0 ; i < NUM_CHUNKS ; i ++ ) {
293+ free (cursors [i ]);
294+ }
268295
269- #if 0
270- int longest_run = 0 ;
271- int run = 0 ;
272- int run_sum = 0 ;
273- int run_count = 0 ;
274-
275- // This accounting code doesn't check the last possible run that
276- // occurs at the wrap-around. That's pretty insignificant.
277- for (int i = 0 ; i < qd -> nentries ; i ++ ) {
278- if (qd -> entries [i ].rcode == UINT64_MAX ) {
279- if (run > 0 ) {
280- run_sum += run ;
281- run_count ++ ;
282- }
283- run = 0 ;
284- } else {
285- run ++ ;
286- longest_run = imax (longest_run , run );
287- }
288- }
296+ return ;
289297
290- printf ( "quick decode: longest run: %d, average run %.3f\n" , longest_run , 1.0 * run_sum / run_count );
291- #endif
298+ fail :
299+ quick_decode_uninit ( family );
292300}
293301
294- // returns an entry with hamming set to 255 if no decode was found.
302+ // returns a result with hamming set to 255 if no decode was found.
295303static void quick_decode_codeword (apriltag_family_t * tf , uint64_t rcode ,
296- struct quick_decode_entry * entry )
304+ struct quick_decode_result * res )
297305{
298306 struct quick_decode * qd = (struct quick_decode * ) tf -> impl ;
299307
300308 // qd might be null if detector_add_family_bits() failed
301309 for (int ridx = 0 ; qd != NULL && ridx < 4 ; ridx ++ ) {
302310
303- for (int bucket = rcode % qd -> nentries ;
304- qd -> entries [bucket ].rcode != UINT64_MAX ;
305- bucket = (bucket + 1 ) % qd -> nentries ) {
306-
307- if (qd -> entries [bucket ].rcode == rcode ) {
308- * entry = qd -> entries [bucket ];
309- entry -> rotation = ridx ;
310- return ;
311+ for (int i = 0 ; i < NUM_CHUNKS ; i ++ ) {
312+ int val = (rcode >> qd -> shifts [i ]) & qd -> chunk_mask ;
313+ int start = qd -> chunk_offsets [i ][val ];
314+ int end = qd -> chunk_offsets [i ][val + 1 ];
315+
316+ for (int j = start ; j < end ; j ++ ) {
317+ uint16_t id = qd -> chunk_ids [i ][j ];
318+ uint64_t correct_code = tf -> codes [id ];
319+ int hamming = popcount64 (correct_code ^ rcode );
320+
321+ if (hamming <= qd -> maxhamming ) {
322+ res -> rcode = rcode ;
323+ res -> id = id ;
324+ res -> hamming = hamming ;
325+ res -> rotation = ridx ;
326+ return ;
327+ }
311328 }
312329 }
313330
314331 rcode = rotate90 (rcode , tf -> nbits );
315332 }
316333
317- entry -> rcode = 0 ;
318- entry -> id = 65535 ;
319- entry -> hamming = 255 ;
320- entry -> rotation = 0 ;
334+ res -> rcode = 0 ;
335+ res -> id = 65535 ;
336+ res -> hamming = 255 ;
337+ res -> rotation = 0 ;
321338}
322339
323340static inline int detection_compare_function (const void * _a , const void * _b )
@@ -416,7 +433,7 @@ struct evaluate_quad_ret
416433 matd_t * H , * Hinv ;
417434
418435 int decode_status ;
419- struct quick_decode_entry e ;
436+ struct quick_decode_result res ;
420437};
421438
422439static matd_t * homography_compute2 (double c [4 ][4 ]) {
@@ -567,7 +584,7 @@ static void sharpen(apriltag_detector_t* td, double* values, int size) {
567584}
568585
569586// returns the decision margin. Return < 0 if the detection should be rejected.
570- static float quad_decode (apriltag_detector_t * td , apriltag_family_t * family , image_u8_t * im , struct quad * quad , struct quick_decode_entry * entry , image_u8_t * im_samples )
587+ static float quad_decode (apriltag_detector_t * td , apriltag_family_t * family , image_u8_t * im , struct quad * quad , struct quick_decode_result * res , image_u8_t * im_samples )
571588{
572589 // decode the tag binary contents by sampling the pixel
573590 // closest to the center of each bit cell.
@@ -736,7 +753,7 @@ static float quad_decode(apriltag_detector_t* td, apriltag_family_t *family, ima
736753 }
737754 }
738755
739- quick_decode_codeword (family , rcode , entry );
756+ quick_decode_codeword (family , rcode , res );
740757 free (values );
741758 return fmin (white_score / white_score_count , black_score / black_score_count );
742759}
@@ -949,19 +966,19 @@ static void quad_decode_task(void *_u)
949966 // optimization process over with the original quad.
950967 struct quad * quad = quad_copy (quad_original );
951968
952- struct quick_decode_entry entry ;
969+ struct quick_decode_result res ;
953970
954- float decision_margin = quad_decode (td , family , im , quad , & entry , task -> im_samples );
971+ float decision_margin = quad_decode (td , family , im , quad , & res , task -> im_samples );
955972
956- if (decision_margin >= 0 && entry .hamming < 255 ) {
973+ if (decision_margin >= 0 && res .hamming < 255 ) {
957974 apriltag_detection_t * det = calloc (1 , sizeof (apriltag_detection_t ));
958975
959976 det -> family = family ;
960- det -> id = entry .id ;
961- det -> hamming = entry .hamming ;
977+ det -> id = res .id ;
978+ det -> hamming = res .hamming ;
962979 det -> decision_margin = decision_margin ;
963980
964- double theta = entry .rotation * M_PI / 2.0 ;
981+ double theta = res .rotation * M_PI / 2.0 ;
965982 double c = cos (theta ), s = sin (theta );
966983
967984 // Fix the rotation of our homography to properly orient the tag
0 commit comments