@@ -11,26 +11,62 @@ use crate::tokenizer::CharRef;
11
11
12
12
include ! ( concat!( env!( "OUT_DIR" ) , "/named_entities_graph.rs" ) ) ;
13
13
14
+ /// A single node in the DAFSA.
15
+ ///
16
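+ /// For memory efficiency reasons, this is packed in 32 bits. The memory representation, from the most significant bit down to the least significant bit, is as follows:
17
+ /// * 8 bits: code point
18
+ /// * 8 bits: hash value
+ /// * 1 bit: is-terminal flag
+ /// * 1 bit: is-last-child flag
+ /// * 2 bits: unused
+ /// * 12 bits: index of the first child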
14
19
#[ derive( Clone , Copy , Debug ) ]
15
- pub ( crate ) struct Node {
16
- first_child_index : usize ,
17
- code_point : u8 ,
18
- is_last_child : bool ,
19
- is_terminal : bool ,
20
- num_nodes : u8 ,
21
- }
20
+ pub ( crate ) struct Node ( u32 ) ;
22
21
23
22
impl Node {
23
+ const IS_TERMINAL : u32 = 1 << 15 ;
24
+ const IS_LAST_CHILD : u32 = 1 << 14 ;
25
+
26
/// Packs the given fields into a single `u32`-backed node.
///
/// Layout, most significant bit first: `code_point` occupies bits 24-31,
/// `hash_value` bits 16-23, the terminal flag bit 15, the last-child flag
/// bit 14, and `first_child_index` the low 12 bits. Bits 12-13 stay unset.
///
/// # Panics
///
/// Panics if `first_child_index` is greater than `0xFFF`.
+ pub ( crate ) const fn new (
27
+ code_point : u8 ,
28
+ hash_value : u8 ,
29
+ is_terminal : bool ,
30
+ is_last_child : bool ,
31
+ first_child_index : u16 ,
32
+ ) -> Self {
33
+ let mut value = 0 ;
34
+ value |= ( code_point as u32 ) << 24 ;
35
+ value |= ( hash_value as u32 ) << 16 ;
36
+
37
+ if is_terminal {
38
+ value |= Self :: IS_TERMINAL ;
39
+ }
40
+
41
+ if is_last_child {
42
+ value |= Self :: IS_LAST_CHILD ;
43
+ }
44
+
45
// Reject indices that do not fit in 12 bits, so the OR below cannot
// reach the flag bits (14 and 15).
+ assert ! ( first_child_index <= 0xFFF ) ;
46
+
47
+ value |= first_child_index as u32 ;
48
+
49
+ Self ( value)
50
+ }
51
+
24
52
/// Returns the code point stored in the top 8 bits (24-31) of the packed value.
pub ( crate ) const fn code_point ( & self ) -> u8 {
25
- self . code_point
53
+ ( self . 0 >> 24 ) as u8
26
54
}
27
55
28
- pub ( crate ) const fn num_nodes ( & self ) -> usize {
29
- self . num_nodes as usize
56
/// Returns the 8-bit hash value stored in bits 16-23 of the packed value,
/// widened to `usize`.
+ pub ( crate ) const fn hash_value ( & self ) -> usize {
57
+ ( ( self . 0 >> 16 ) & 0xFF ) as usize
30
58
}
31
59
32
60
/// Returns `true` if the `Self::IS_TERMINAL` flag bit is set in the packed value.
pub ( crate ) const fn is_terminal ( & self ) -> bool {
33
- self . is_terminal
61
+ ( self . 0 & Self :: IS_TERMINAL ) != 0
62
+ }
63
+
64
/// Returns `true` if the `Self::IS_LAST_CHILD` flag bit is set in the packed value.
///
/// The child iterator marks itself as done once it reaches a node with this flag.
+ const fn is_last_child ( & self ) -> bool {
65
+ ( self . 0 & Self :: IS_LAST_CHILD ) != 0
66
+ }
67
+
68
/// Returns the low 12 bits of the packed value: the index in `DAFSA_NODES`
/// at which iteration over this node's children starts.
///
/// `children` treats an index of 0 as "already done".
+ const fn first_child_index ( & self ) -> u16 {
69
+ ( self . 0 & 0xFFF ) as u16
34
70
}
35
71
36
72
pub ( crate ) fn children ( & self ) -> impl Iterator < Item = & ' static Node > {
@@ -49,51 +85,22 @@ impl Node {
49
85
let node = & DAFSA_NODES [ self . index ] ;
50
86
self . index += 1 ;
51
87
52
- if node. is_last_child {
88
+ if node. is_last_child ( ) {
53
89
self . done = true ;
54
90
}
55
91
56
92
Some ( node)
57
93
}
58
94
}
59
95
96
+ let first_child_index = self . first_child_index ( ) ;
60
97
ChildIterator {
61
- index : self . first_child_index ,
62
- done : self . first_child_index == 0 ,
98
+ index : first_child_index as usize ,
99
+ done : first_child_index == 0 ,
63
100
}
64
101
}
65
102
}
66
103
67
- // fn compute_unique_index(input: &str) -> Option<usize> {
68
- // debug_assert!(input.is_ascii());
69
-
70
- // let mut index = 0;
71
- // let mut current = &DAFSA_NODES[0];
72
- // for code_point in input.as_bytes() {
73
- // let mut next_node = None;
74
- // for child in current.children() {
75
- // if child.code_point == *code_point {
76
- // next_node = Some(child);
77
- // break;
78
- // } else {
79
- // index += child.num_nodes as usize;
80
- // }
81
- // }
82
-
83
- // current = next_node?;
84
-
85
- // if current.is_terminal {
86
- // index += 1;
87
- // }
88
- // }
89
-
90
- // if current.is_terminal {
91
- // Some(index)
92
- // } else {
93
- // None
94
- // }
95
- // }
96
-
97
104
pub ( crate ) fn resolve_unique_hash_value ( value : usize ) -> CharRef {
98
105
let ( first, second) = REFERENCES [ value] ;
99
106
0 commit comments