33use crate :: types:: * ;
44use bimap:: BiMap ;
55use fst:: { IntoStreamer , Map , Streamer } ;
6- use indexmap:: IndexMap ;
76use log:: error;
87use serde:: { Deserialize , Serialize } ;
98use std:: { borrow:: Cow , fmt, iter:: once} ;
@@ -182,37 +181,40 @@ struct TaggerFields {
182181 word_store_fst : Vec < u8 > ,
183182 tag_store : BiMap < String , PosIdInt > ,
184183 lang_options : TaggerLangOptions ,
184+ tags_length : usize ,
185+ groups_length : usize ,
185186}
186187
187188impl From < Tagger > for TaggerFields {
188189 fn from ( tagger : Tagger ) -> Self {
189190 let mut tag_fst_items = Vec :: new ( ) ;
190191
191192 for ( word_id, map) in tagger. tags . iter ( ) {
192- let mut i = 0u8 ;
193193 let word = tagger. str_for_word_id ( word_id) ;
194194
195- for ( inflect_id, pos_ids) in map. iter ( ) {
196- for pos_id in pos_ids {
197- assert ! ( i < 255 ) ;
198- i += 1 ;
199-
200- let key: Vec < u8 > = word. as_bytes ( ) . iter ( ) . chain ( once ( & i) ) . copied ( ) . collect ( ) ;
201- let pos_bytes = pos_id. 0 . to_be_bytes ( ) ;
202- let inflect_bytes = inflect_id. 0 . to_be_bytes ( ) ;
203-
204- let value = u64:: from_be_bytes ( [
205- inflect_bytes[ 0 ] ,
206- inflect_bytes[ 1 ] ,
207- inflect_bytes[ 2 ] ,
208- inflect_bytes[ 3 ] ,
209- 0 ,
210- 0 ,
211- pos_bytes[ 0 ] ,
212- pos_bytes[ 1 ] ,
213- ] ) ;
214- tag_fst_items. push ( ( key, value) ) ;
215- }
195+ for ( i, ( inflect_id, pos_id) ) in map. iter ( ) . enumerate ( ) {
196+ assert ! ( i < 255 ) ;
197+
198+ let key: Vec < u8 > = word
199+ . as_bytes ( )
200+ . iter ( )
201+ . chain ( once ( & ( i as u8 ) ) )
202+ . copied ( )
203+ . collect ( ) ;
204+ let pos_bytes = pos_id. 0 . to_be_bytes ( ) ;
205+ let inflect_bytes = inflect_id. 0 . to_be_bytes ( ) ;
206+
207+ let value = u64:: from_be_bytes ( [
208+ inflect_bytes[ 0 ] ,
209+ inflect_bytes[ 1 ] ,
210+ inflect_bytes[ 2 ] ,
211+ inflect_bytes[ 3 ] ,
212+ 0 ,
213+ 0 ,
214+ pos_bytes[ 0 ] ,
215+ pos_bytes[ 1 ] ,
216+ ] ) ;
217+ tag_fst_items. push ( ( key, value) ) ;
216218 }
217219 }
218220
@@ -241,6 +243,8 @@ impl From<Tagger> for TaggerFields {
241243 word_store_fst,
242244 tag_store : tagger. tag_store ,
243245 lang_options : tagger. lang_options ,
246+ tags_length : tagger. tags . len ( ) ,
247+ groups_length : tagger. groups . len ( ) ,
244248 }
245249 }
246250}
@@ -260,8 +264,8 @@ impl From<TaggerFields> for Tagger {
260264 ) ;
261265 }
262266
263- let mut tags = DefaultHashMap :: new ( ) ;
264- let mut groups = DefaultHashMap :: new ( ) ;
267+ let mut tags = DefaultHashMap :: with_capacity ( data . tags_length ) ;
268+ let mut groups = DefaultHashMap :: with_capacity ( data . groups_length ) ;
265269
266270 let tag_fst = Map :: new ( data. tag_fst ) . unwrap ( ) ;
267271 let mut stream = tag_fst. into_stream ( ) ;
@@ -271,24 +275,22 @@ impl From<TaggerFields> for Tagger {
271275 let word_id = * word_store. get_by_left ( word) . unwrap ( ) ;
272276
273277 let value_bytes = value. to_be_bytes ( ) ;
274- let inflection_id = WordIdInt ( u32:: from_be_bytes ( [
278+ let lemma_id = WordIdInt ( u32:: from_be_bytes ( [
275279 value_bytes[ 0 ] ,
276280 value_bytes[ 1 ] ,
277281 value_bytes[ 2 ] ,
278282 value_bytes[ 3 ] ,
279283 ] ) ) ;
280284 let pos_id = PosIdInt ( u16:: from_be_bytes ( [ value_bytes[ 6 ] , value_bytes[ 7 ] ] ) ) ;
281285
282- let group = groups. entry ( inflection_id ) . or_insert_with ( Vec :: new) ;
286+ let group = groups. entry ( lemma_id ) . or_insert_with ( Vec :: new) ;
283287 if !group. contains ( & word_id) {
284288 group. push ( word_id) ;
285289 }
286290
287291 tags. entry ( word_id)
288- . or_insert_with ( IndexMap :: new)
289- . entry ( inflection_id)
290292 . or_insert_with ( Vec :: new)
291- . push ( pos_id) ;
293+ . push ( ( lemma_id , pos_id) ) ;
292294 }
293295
294296 Tagger {
@@ -343,7 +345,7 @@ impl From<TaggerFields> for Tagger {
343345#[ derive( Default , Serialize , Deserialize , Clone ) ]
344346#[ serde( from = "TaggerFields" , into = "TaggerFields" ) ]
345347pub struct Tagger {
346- pub ( crate ) tags : DefaultHashMap < WordIdInt , IndexMap < WordIdInt , Vec < PosIdInt > > > ,
348+ pub ( crate ) tags : DefaultHashMap < WordIdInt , Vec < ( WordIdInt , PosIdInt ) > > ,
347349 pub ( crate ) tag_store : BiMap < String , PosIdInt > ,
348350 pub ( crate ) word_store : BiMap < String , WordIdInt > ,
349351 pub ( crate ) groups : DefaultHashMap < WordIdInt , Vec < WordIdInt > > ,
@@ -362,13 +364,11 @@ impl Tagger {
362364 {
363365 let mut output = Vec :: new ( ) ;
364366
365- for ( key, value) in map. iter ( ) {
366- for pos_id in value {
367- output. push ( WordData :: new (
368- self . id_word ( self . str_for_word_id ( key) . into ( ) ) ,
369- self . id_tag ( self . str_for_pos_id ( pos_id) ) ,
370- ) )
371- }
367+ for ( lemma_id, pos_id) in map. iter ( ) {
368+ output. push ( WordData :: new (
369+ self . id_word ( self . str_for_word_id ( lemma_id) . into ( ) ) ,
370+ self . id_tag ( self . str_for_pos_id ( pos_id) ) ,
371+ ) )
372372 }
373373
374374 output
0 commit comments