Skip to content

Commit 0daabc7

Browse files
authored
switches to Fowler-Noll-Vo hash algorithm for hashing names (#1221)
The previous algorithm had a very bad collision rate, especially for small strings. The new one is much better: it has been tested on large dictionaries (of English words and of passwords) and is guaranteed not to collide on small strings. Warning: changing the hash function breaks the knowledge base format, so run `bap --cache-clean` after the update.
1 parent 6c50124 commit 0daabc7

File tree

2 files changed

+20
-10
lines changed

2 files changed

+20
-10
lines changed

lib/knowledge/bap_knowledge.ml

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ type slot_status =
170170
type fullname = {
171171
package : string;
172172
name : string;
173-
} [@@deriving bin_io, compare, sexp]
173+
} [@@deriving bin_io, equal, compare, sexp]
174174

175175

176176
module Name : sig
@@ -218,22 +218,26 @@ end = struct
218218

219219
let registry = Hashtbl.create (module Int63)
220220

221-
let hash_name str =
221+
(* using FNV-1a algorithm *)
222+
let hash_name =
222223
let open Int63 in
223-
String.fold str ~init:(of_int 5381) ~f:(fun h c ->
224-
(h lsl 5) + h + of_int (Char.to_int c))
224+
let init = of_int64_exn 0xCBF29CE484222325L in
225+
let m = of_int64_exn 0x100000001B3L in
226+
let hash init = String.fold ~init ~f:(fun h c ->
227+
(h lxor of_int (Char.to_int c)) * m) in
228+
fun {package; name} ->
229+
hash (hash init package) name
225230

226231
let intern name =
227-
let str = full name in
228-
let id = hash_name str in
232+
let id = hash_name name in
229233
match Hashtbl.find registry id with
230234
| None -> Hashtbl.add_exn registry id name; id
231-
| Some name ->
232-
if full name = str
235+
| Some name' ->
236+
if equal_fullname name name'
233237
then id
234238
else invalid_argf "Names %S and %S have the same hash value, \
235239
Change one of them."
236-
(full name) str ()
240+
(full name) (full name') ()
237241

238242
let fullname = Hashtbl.find_exn registry
239243
include Int63
@@ -796,6 +800,8 @@ module Persistent = struct
796800
let key = of_string pk key
797801
and data = of_string pd data in
798802
Map.add_exn xs ~key ~data))
803+
804+
let name = of_binable (module Name)
799805
end
800806

801807

lib/knowledge/bap_knowledge.mli

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1278,6 +1278,11 @@ module Knowledge : sig
12781278
(** string is a persistent data type. *)
12791279
val string : string persistent
12801280

1281+
(** name is a persistent data type.
1282+
1283+
@since 2.2.0 *)
1284+
val name : name persistent
1285+
12811286
(** [list t] derives persistence for a list. *)
12821287
val list : 'a persistent -> 'a list persistent
12831288

@@ -1351,7 +1356,6 @@ module Knowledge : sig
13511356
*)
13521357
val create : ?package:string -> string -> t
13531358

1354-
13551359
(** [read ?package input] reads a full name from input.
13561360
13571361
This function will parse the [input] and return a

0 commit comments

Comments
 (0)