VowpalWabbit
diff --git a/src/learn_to_pick/__init__.py b/src/learn_to_pick/__init__.py
Lines changed: 0 additions & 1 deletion
diff --git a/src/learn_to_pick/base.py b/src/learn_to_pick/base.py
Lines changed: 16 additions & 37 deletions
diff --git a/src/learn_to_pick/pick_best.py b/src/learn_to_pick/pick_best.py
Lines changed: 55 additions & 58 deletions
@@ -12,7 +12,6 @@
     VwPolicy,
     VwLogger,
     embed,
-    stringify_embedding,
 )
 from learn_to_pick.pick_best import (
     PickBest,
 
@@ -87,11 +87,11 @@ def EmbedAndKeep(anything: Any) -> Any:
 # helper functions
 
 
-def stringify_embedding(embedding: List) -> str:
+def _stringify_embedding(embedding: List) -> str:
     return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)])
 
 
-def parse_lines(parser: "vw.TextFormatParser", input_str: str) -> List["vw.Example"]:
+def _parse_lines(parser: "vw.TextFormatParser", input_str: str) -> List["vw.Example"]:
     return [parser.parse_line(line) for line in input_str.split("\n")]
 
 
@@ -116,20 +116,6 @@ def get_based_on_and_to_select_from(inputs: Dict[str, Any]) -> Tuple[Dict, Dict]
     return based_on, to_select_from
 
 
-def prepare_inputs_for_autoembed(inputs: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    go over all the inputs and if something is either wrapped in _ToSelectFrom or _BasedOn, and if their inner values are not already _Embed,
-    then wrap them in EmbedAndKeep while retaining their _ToSelectFrom or _BasedOn status
-    """
-
-    next_inputs = inputs.copy()
-    for k, v in next_inputs.items():
-        if isinstance(v, _ToSelectFrom) or isinstance(v, _BasedOn):
-            if not isinstance(v.value, _Embed):
-                next_inputs[k].value = EmbedAndKeep(v.value)
-    return next_inputs
-
-
 # end helper functions
 
 
@@ -195,15 +181,15 @@ def predict(self, event: TEvent) -> Any:
 
         text_parser = vw.TextFormatParser(self.workspace)
         return self.workspace.predict_one(
-            parse_lines(text_parser, self.featurizer.format(event))
+            _parse_lines(text_parser, self.featurizer.format(event))
         )
 
     def learn(self, event: TEvent) -> None:
         import vowpal_wabbit_next as vw
 
         vw_ex = self.featurizer.format(event)
         text_parser = vw.TextFormatParser(self.workspace)
-        multi_ex = parse_lines(text_parser, vw_ex)
+        multi_ex = _parse_lines(text_parser, vw_ex)
         self.workspace.learn_one(multi_ex)
 
     def log(self, event: TEvent) -> None:
@@ -489,20 +475,13 @@ def run(self, *args, **kwargs) -> Dict[str, Any]:
         return {"picked": picked, "picked_metadata": event}
 
 
-def is_stringtype_instance(item: Any) -> bool:
-    """Helper function to check if an item is a string."""
-    return isinstance(item, str) or (
-        isinstance(item, _Embed) and isinstance(item.value, str)
-    )
-
-
-def embed_string_type(
+def _embed_string_type(
     item: Union[str, _Embed], model: Any, namespace: Optional[str] = None
 ) -> Dict[str, Union[str, List[str]]]:
     """Helper function to embed a string or an _Embed object."""
     keep_str = ""
     if isinstance(item, _Embed):
-        encoded = stringify_embedding(model.encode(item.value))
+        encoded = _stringify_embedding(model.encode(item.value))
         if item.keep:
             keep_str = item.value.replace(" ", "_") + " "
     elif isinstance(item, str):
@@ -518,36 +497,36 @@ def embed_string_type(
     return {namespace: keep_str + encoded}
 
 
-def embed_dict_type(item: Dict, model: Any) -> Dict[str, Any]:
+def _embed_dict_type(item: Dict, model: Any) -> Dict[str, Any]:
     """Helper function to embed a dictionary item."""
     inner_dict: Dict = {}
     for ns, embed_item in item.items():
         if isinstance(embed_item, list):
             inner_dict[ns] = []
             for embed_list_item in embed_item:
-                embedded = embed_string_type(embed_list_item, model, ns)
+                embedded = _embed_string_type(embed_list_item, model, ns)
                 inner_dict[ns].append(embedded[ns])
         else:
-            inner_dict.update(embed_string_type(embed_item, model, ns))
+            inner_dict.update(_embed_string_type(embed_item, model, ns))
     return inner_dict
 
 
-def embed_list_type(
+def _embed_list_type(
     item: list, model: Any, namespace: Optional[str] = None
 ) -> List[Dict[str, Union[str, List[str]]]]:
     ret_list: List = []
     for embed_item in item:
         if isinstance(embed_item, dict):
-            ret_list.append(embed_dict_type(embed_item, model))
+            ret_list.append(_embed_dict_type(embed_item, model))
         elif isinstance(embed_item, list):
-            item_embedding = embed_list_type(embed_item, model, namespace)
+            item_embedding = _embed_list_type(embed_item, model, namespace)
             # Get the first key from the first dictionary
             first_key = next(iter(item_embedding[0]))
             # Group the values under that key
             grouping = {first_key: [item[first_key] for item in item_embedding]}
             ret_list.append(grouping)
         else:
-            ret_list.append(embed_string_type(embed_item, model, namespace))
+            ret_list.append(_embed_string_type(embed_item, model, namespace))
     return ret_list
 
 
@@ -569,10 +548,10 @@ def embed(
     if (isinstance(to_embed, _Embed) and isinstance(to_embed.value, str)) or isinstance(
         to_embed, str
     ):
-        return [embed_string_type(to_embed, model, namespace)]
+        return [_embed_string_type(to_embed, model, namespace)]
     elif isinstance(to_embed, dict):
-        return [embed_dict_type(to_embed, model)]
+        return [_embed_dict_type(to_embed, model)]
     elif isinstance(to_embed, list):
-        return embed_list_type(to_embed, model, namespace)
+        return _embed_list_type(to_embed, model, namespace)
     else:
         raise ValueError("Invalid input format for embedding")
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union, Iterable
+from itertools import chain
 import os
 
 from learn_to_pick import base
@@ -42,6 +43,28 @@ def __init__(
         self.based_on = based_on
 
 
+class VwTxt:  # static helpers that render embeddings, features and namespaces as text
+    @staticmethod
+    def embedding(embedding: List[float]) -> str:  # "0:e0 1:e1 ..." (index:value pairs)
+        return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)])
+
+    @staticmethod
+    def features(features: Union[str, List[str]]) -> str:  # list -> space-joined; str -> as is
+        return " ".join(features) if isinstance(features, list) else features
+
+    @staticmethod
+    def _namespaces(ns: Iterable[Tuple[str, Union[str, List[str]]]]):  # "|name feats" per pair
+        return " ".join(f"|{k} {VwTxt.features(v)}" for k, v in ns)
+
+    @staticmethod
+    def ns(ns: Union[Iterable[Tuple[str, Any]], List[Dict[str, Any]], Dict[str, Any]]):
+        if isinstance(ns, List):
+            ns = chain.from_iterable(map(dict.items, ns))  # list of dicts -> flat (key, value) pairs
+        if isinstance(ns, Dict):
+            ns = ns.items()  # single dict -> its (key, value) pairs
+        return VwTxt._namespaces(ns)  # any other iterable is passed through unchanged
+
+
 class PickBestFeaturizer(base.Featurizer[PickBestEvent]):
     """
     Text Featurizer class that embeds the `BasedOn` and `ToSelectFrom` inputs into a format that can be used by the learning policy
@@ -63,10 +86,6 @@ def __init__(
         self.model = model
         self.auto_embed = auto_embed
 
-    @staticmethod
-    def _str(embedding: List[float]) -> str:
-        return " ".join([f"{i}:{e}" for i, e in enumerate(embedding)])
-
     def get_label(self, event: PickBestEvent) -> tuple:
         cost = None
         if event.selected:
@@ -148,70 +167,48 @@ def format_auto_embed_on(self, event: PickBestEvent) -> str:
         context_emb, action_embs = self.get_context_and_action_embeddings(event)
         indexed_dot_product = self.get_indexed_dot_product(context_emb, action_embs)
 
-        action_lines = []
+        nactions = len(action_embs)
+
+        def _tolist(v):
+            return v if isinstance(v, list) else [v]
+
+        labels = ["" for _ in range(nactions)]
+        if cost is not None:
+            labels[chosen_action] = f"{chosen_action}:{cost}:{prob} "
+
+        dotprods = [{} for _ in range(nactions)]
         for i, action in enumerate(action_embs):
-            line_parts = []
-            dot_prods = []
-            if cost is not None and chosen_action == i:
-                line_parts.append(f"{chosen_action}:{cost}:{prob}")
-            for ns, action in action.items():
-                line_parts.append(f"|{ns}")
-                elements = action if isinstance(action, list) else [action]
-                nsa = []
-                for elem in elements:
-                    line_parts.append(f"{elem}")
-                    ns_a = f"{ns}={elem}"
-                    nsa.append(ns_a)
-                    for k, v in indexed_dot_product.items():
-                        dot_prods.append(v[ns_a])
-                nsa_str = " ".join(nsa)
-                line_parts.append(f"|# {nsa_str}")
-
-            line_parts.append(f"|dotprod {self._str(dot_prods)}")
-            action_lines.append(" ".join(line_parts))
-
-        shared = []
+            action["#"] = [f"{k}={v}" for k, _v in action.items() for v in _tolist(_v)]
+            dotprods[i] = [
+                v[f] for v in indexed_dot_product.values() for f in action["#"]
+            ]
+
+        actions_str = [
+            f"{l}{VwTxt.ns(a)} |dotprod {VwTxt.embedding(dp)}"
+            for l, a, dp in zip(labels, action_embs, dotprods)
+        ]
+
         for item in context_emb:
-            for ns, context in item.items():
-                shared.append(f"|{ns}")
-                elements = context if isinstance(context, list) else [context]
-                nsc = []
-                for elem in elements:
-                    shared.append(f"{elem}")
-                    nsc.append(f"{ns}={elem}")
-                nsc_str = " ".join(nsc)
-                shared.append(f"|@ {nsc_str}")
-
-        return "shared " + " ".join(shared) + "\n" + "\n".join(action_lines)
+            item["@"] = [f"{k}={v}" for k, _v in item.items() for v in _tolist(_v)]
+        shared_str = f"shared {VwTxt.ns(context_emb)}"
+
+        return "\n".join([shared_str] + actions_str)
 
     def format_auto_embed_off(self, event: PickBestEvent) -> str:
         """
         Converts the `BasedOn` and `ToSelectFrom` into a format that can be used by VW
         """
         chosen_action, cost, prob = self.get_label(event)
         context_emb, action_embs = self.get_context_and_action_embeddings(event)
+        nactions = len(action_embs)  # number of action lines to emit
 
-        example_string = ""
-        example_string += "shared "
-        for context_item in context_emb:
-            for ns, based_on in context_item.items():
-                e = " ".join(based_on) if isinstance(based_on, list) else based_on
-                example_string += f"|{ns} {e} "
-        example_string += "\n"
+        context_str = f"shared {VwTxt.ns(context_emb)}"  # "shared" line built from every context namespace
 
-        for i, action in enumerate(action_embs):
-            if cost is not None and chosen_action == i:
-                example_string += f"{chosen_action}:{cost}:{prob} "
-            for ns, action_embedding in action.items():
-                e = (
-                    " ".join(action_embedding)
-                    if isinstance(action_embedding, list)
-                    else action_embedding
-                )
-                example_string += f"|{ns} {e} "
-            example_string += "\n"
-        # Strip the last newline
-        return example_string[:-1]
+        labels = ["" for _ in range(nactions)]  # per-action label prefix; stays empty when unlabeled
+        if cost is not None:
+            labels[chosen_action] = f"{chosen_action}:{cost}:{prob} "  # only the chosen action gets a label
+        actions_str = [f"{l}{VwTxt.ns(a)}" for a, l in zip(action_embs, labels)]  # label prefix + namespaces
+        return "\n".join([context_str] + actions_str)  # shared line first, then one line per action
 
     def format(self, event: PickBestEvent) -> str:
         if self.auto_embed:
Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,6 @@`
`12`	`12`	`VwPolicy,`
`13`	`13`	`VwLogger,`
`14`	`14`	`embed,`
`15`		`- stringify_embedding,`
`16`	`15`	`)`
`17`	`16`	`from learn_to_pick.pick_best import (`
`18`	`17`	`PickBest,`