@@ -87,11 +87,11 @@ def EmbedAndKeep(anything: Any) -> Any:
8787# helper functions
8888
8989
90- def stringify_embedding (embedding : List ) -> str :
90+ def _stringify_embedding (embedding : List ) -> str :
9191 return " " .join ([f"{ i } :{ e } " for i , e in enumerate (embedding )])
9292
9393
94- def parse_lines (parser : "vw.TextFormatParser" , input_str : str ) -> List ["vw.Example" ]:
94+ def _parse_lines (parser : "vw.TextFormatParser" , input_str : str ) -> List ["vw.Example" ]:
9595 return [parser .parse_line (line ) for line in input_str .split ("\n " )]
9696
9797
@@ -116,20 +116,6 @@ def get_based_on_and_to_select_from(inputs: Dict[str, Any]) -> Tuple[Dict, Dict]
116116 return based_on , to_select_from
117117
118118
119- def prepare_inputs_for_autoembed (inputs : Dict [str , Any ]) -> Dict [str , Any ]:
120- """
121- go over all the inputs and if something is either wrapped in _ToSelectFrom or _BasedOn, and if their inner values are not already _Embed,
122- then wrap them in EmbedAndKeep while retaining their _ToSelectFrom or _BasedOn status
123- """
124-
125- next_inputs = inputs .copy ()
126- for k , v in next_inputs .items ():
127- if isinstance (v , _ToSelectFrom ) or isinstance (v , _BasedOn ):
128- if not isinstance (v .value , _Embed ):
129- next_inputs [k ].value = EmbedAndKeep (v .value )
130- return next_inputs
131-
132-
133119# end helper functions
134120
135121
@@ -195,15 +181,15 @@ def predict(self, event: TEvent) -> Any:
195181
196182 text_parser = vw .TextFormatParser (self .workspace )
197183 return self .workspace .predict_one (
198- parse_lines (text_parser , self .featurizer .format (event ))
184+ _parse_lines (text_parser , self .featurizer .format (event ))
199185 )
200186
201187 def learn (self , event : TEvent ) -> None :
202188 import vowpal_wabbit_next as vw
203189
204190 vw_ex = self .featurizer .format (event )
205191 text_parser = vw .TextFormatParser (self .workspace )
206- multi_ex = parse_lines (text_parser , vw_ex )
192+ multi_ex = _parse_lines (text_parser , vw_ex )
207193 self .workspace .learn_one (multi_ex )
208194
209195 def log (self , event : TEvent ) -> None :
@@ -489,20 +475,13 @@ def run(self, *args, **kwargs) -> Dict[str, Any]:
489475 return {"picked" : picked , "picked_metadata" : event }
490476
491477
492- def is_stringtype_instance (item : Any ) -> bool :
493- """Helper function to check if an item is a string."""
494- return isinstance (item , str ) or (
495- isinstance (item , _Embed ) and isinstance (item .value , str )
496- )
497-
498-
499- def embed_string_type (
478+ def _embed_string_type (
500479 item : Union [str , _Embed ], model : Any , namespace : Optional [str ] = None
501480) -> Dict [str , Union [str , List [str ]]]:
502481 """Helper function to embed a string or an _Embed object."""
503482 keep_str = ""
504483 if isinstance (item , _Embed ):
505- encoded = stringify_embedding (model .encode (item .value ))
484+ encoded = _stringify_embedding (model .encode (item .value ))
506485 if item .keep :
507486 keep_str = item .value .replace (" " , "_" ) + " "
508487 elif isinstance (item , str ):
@@ -518,36 +497,36 @@ def embed_string_type(
518497 return {namespace : keep_str + encoded }
519498
520499
521- def embed_dict_type (item : Dict , model : Any ) -> Dict [str , Any ]:
500+ def _embed_dict_type (item : Dict , model : Any ) -> Dict [str , Any ]:
522501 """Helper function to embed a dictionary item."""
523502 inner_dict : Dict = {}
524503 for ns , embed_item in item .items ():
525504 if isinstance (embed_item , list ):
526505 inner_dict [ns ] = []
527506 for embed_list_item in embed_item :
528- embedded = embed_string_type (embed_list_item , model , ns )
507+ embedded = _embed_string_type (embed_list_item , model , ns )
529508 inner_dict [ns ].append (embedded [ns ])
530509 else :
531- inner_dict .update (embed_string_type (embed_item , model , ns ))
510+ inner_dict .update (_embed_string_type (embed_item , model , ns ))
532511 return inner_dict
533512
534513
535- def embed_list_type (
514+ def _embed_list_type (
536515 item : list , model : Any , namespace : Optional [str ] = None
537516) -> List [Dict [str , Union [str , List [str ]]]]:
538517 ret_list : List = []
539518 for embed_item in item :
540519 if isinstance (embed_item , dict ):
541- ret_list .append (embed_dict_type (embed_item , model ))
520+ ret_list .append (_embed_dict_type (embed_item , model ))
542521 elif isinstance (embed_item , list ):
543- item_embedding = embed_list_type (embed_item , model , namespace )
522+ item_embedding = _embed_list_type (embed_item , model , namespace )
544523 # Get the first key from the first dictionary
545524 first_key = next (iter (item_embedding [0 ]))
546525 # Group the values under that key
547526 grouping = {first_key : [item [first_key ] for item in item_embedding ]}
548527 ret_list .append (grouping )
549528 else :
550- ret_list .append (embed_string_type (embed_item , model , namespace ))
529+ ret_list .append (_embed_string_type (embed_item , model , namespace ))
551530 return ret_list
552531
553532
@@ -569,10 +548,10 @@ def embed(
569548 if (isinstance (to_embed , _Embed ) and isinstance (to_embed .value , str )) or isinstance (
570549 to_embed , str
571550 ):
572- return [embed_string_type (to_embed , model , namespace )]
551+ return [_embed_string_type (to_embed , model , namespace )]
573552 elif isinstance (to_embed , dict ):
574- return [embed_dict_type (to_embed , model )]
553+ return [_embed_dict_type (to_embed , model )]
575554 elif isinstance (to_embed , list ):
576- return embed_list_type (to_embed , model , namespace )
555+ return _embed_list_type (to_embed , model , namespace )
577556 else :
578557 raise ValueError ("Invalid input format for embedding" )
0 commit comments