diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index b5175a7f2..e8893d1e1 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -355,7 +355,9 @@ def get_embeddings_seq(self, seq_id: int):
 
     # Sampling functions - deprecated, use LlamaSampler instead
     def set_rng_seed(self, seed: int):
-        raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "set_rng_seed is deprecated, use LlamaSampler instead"
+        )
 
     def sample_repetition_penalties(
         self,
@@ -366,30 +368,44 @@ def sample_repetition_penalties(
         penalty_freq: float,
         penalty_present: float,
     ):
-        raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_repetition_penalties is deprecated, use LlamaSampler instead"
+        )
 
     def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
-        raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_softmax is deprecated, use LlamaSampler instead"
+        )
 
     def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
-        raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_top_k is deprecated, use LlamaSampler instead"
+        )
 
     def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
-        raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_top_p is deprecated, use LlamaSampler instead"
+        )
 
     def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
-        raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_min_p is deprecated, use LlamaSampler instead"
+        )
 
     def sample_typical(
         self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
     ):
-        raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_typical is deprecated, use LlamaSampler instead"
+        )
 
     def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
         raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead")
 
     def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
-        raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_grammar is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_mirostat(
         self,
@@ -399,7 +415,9 @@ def sample_token_mirostat(
         m: int,
         mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
     ) -> int:
-        raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_mirostat is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_mirostat_v2(
         self,
@@ -408,17 +426,25 @@ def sample_token_mirostat_v2(
         eta: float,
         mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
     ) -> int:
-        raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_mirostat_v2 is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
-        raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token_greedy is deprecated, use LlamaSampler instead"
+        )
 
     def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
-        raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "sample_token is deprecated, use LlamaSampler instead"
+        )
 
     # Grammar
     def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
-        raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "grammar_accept_token is deprecated, use LlamaSampler instead"
+        )
 
     def reset_timings(self):
         llama_cpp.llama_perf_context_reset(self.ctx)
@@ -529,6 +555,8 @@ def normalize_embedding(embedding):
     norm = float(np.linalg.norm(embedding))
     if norm == 0.0:
         return embedding
+    if isinstance(embedding, np.ndarray):
+        return embedding / norm
     return [v / norm for v in embedding]
 
 
@@ -602,16 +630,16 @@ def sample(
         logits_array: Optional[npt.NDArray[np.single]] = None,
     ):
         # This method is deprecated in favor of using LlamaSampler directly
-        raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead")
+        raise NotImplementedError(
+            "LlamaSamplingContext.sample is deprecated, use LlamaSampler instead"
+        )
 
     def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
         self.prev.append(id)
 
 
 class CustomSampler:
-    def __init__(
-        self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
-    ):
+    def __init__(self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]):
         self.apply_func = apply_func
 
     def apply_wrapper(
@@ -723,20 +751,20 @@ def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
     def add_grammar_lazy_patterns(
-        self, 
-        model: LlamaModel, 
+        self,
+        model: LlamaModel,
         grammar: LlamaGrammar,
         trigger_patterns: List[str],
-        trigger_tokens: List[int]
+        trigger_tokens: List[int],
     ):
         # Convert patterns to C array
         pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))()
         for i, pattern in enumerate(trigger_patterns):
             pattern_ptrs[i] = pattern.encode("utf-8")
-        
+
         # Convert tokens to C array
         token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens)
-        
+
         sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
             model.vocab,
             grammar._grammar.encode("utf-8"),
@@ -744,7 +772,7 @@
             pattern_ptrs,
             len(trigger_patterns),
             token_array,
-            len(trigger_tokens)
+            len(trigger_tokens),
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
@@ -771,13 +799,13 @@ def add_dry(
         dry_base: float,
         dry_allowed_length: int,
         dry_penalty_last_n: int,
-        seq_breakers: List[str]
+        seq_breakers: List[str],
     ):
         # Convert seq_breakers to C array
         breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))()
         for i, breaker in enumerate(seq_breakers):
             breaker_ptrs[i] = breaker.encode("utf-8")
-        
+
         sampler = llama_cpp.llama_sampler_init_dry(
             model.vocab,
             n_ctx_train,
@@ -786,25 +814,19 @@
             dry_allowed_length,
             dry_penalty_last_n,
             breaker_ptrs,
-            len(seq_breakers)
+            len(seq_breakers),
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
-    def add_logit_bias(
-        self,
-        n_vocab: int,
-        logit_bias: Dict[int, float]
-    ):
+    def add_logit_bias(self, n_vocab: int, logit_bias: Dict[int, float]):
         # Convert logit_bias dict to C array
         bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))()
         for i, (token, bias) in enumerate(logit_bias.items()):
             bias_array[i].token = token
             bias_array[i].bias = bias
-        
+
         sampler = llama_cpp.llama_sampler_init_logit_bias(
-            n_vocab,
-            len(logit_bias),
-            bias_array
+            n_vocab, len(logit_bias), bias_array
         )
         llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
 
@@ -838,15 +860,17 @@ def reset(self):
     def clone(self):
         # NOTE: Custom samplers cannot be cloned due to Python callback limitations
        if self.custom_samplers:
-            raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers")
-        
+            raise NotImplementedError(
+                "Cannot clone LlamaSampler that contains custom samplers"
+            )
+
         cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler)
         # Create a new wrapper around the cloned sampler
         new_sampler = LlamaSampler.__new__(LlamaSampler)
         new_sampler.sampler = cloned_sampler
         new_sampler.custom_samplers = []
         new_sampler._exit_stack = ExitStack()
-        
+
         def free_sampler():
             if new_sampler.sampler is not None:
                 llama_cpp.llama_sampler_free(new_sampler.sampler)
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 71d94ebd8..91a15ac81 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -25,6 +25,8 @@
     Deque,
     Callable,
     Dict,
+    Tuple,
+    overload,
 )
 from collections import deque
 from pathlib import Path
@@ -75,6 +77,7 @@ def __init__(
         n_ctx: int = 512,
         n_batch: int = 512,
         n_ubatch: int = 512,
+        n_seq_max: int = 1,
         n_threads: Optional[int] = None,
         n_threads_batch: Optional[int] = None,
         rope_scaling_type: Optional[
@@ -159,6 +162,7 @@ def __init__(
             n_ctx: Text context, 0 = from model
             n_batch: Prompt processing maximum batch size
             n_ubatch: Physical batch size
+            n_seq_max: Maximum number of sequences (i.e. distinct states for recurrent models or parallel batches)
             n_threads: Number of threads to use for generation
             n_threads_batch: Number of threads to use for batch processing
             rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -300,6 +304,8 @@ def __init__(
         self.model_params.kv_overrides = self._kv_overrides_array
 
         self.n_batch = min(n_ctx, n_batch)  # ???
+        self.n_ubatch = min(self.n_batch, n_ubatch)
+        self.n_seq_max = n_seq_max
         self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
         self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count()
 
@@ -310,7 +316,8 @@ def __init__(
         self.context_params = llama_cpp.llama_context_default_params()
         self.context_params.n_ctx = n_ctx
         self.context_params.n_batch = self.n_batch
-        self.context_params.n_ubatch = min(self.n_batch, n_ubatch)
+        self.context_params.n_ubatch = self.n_ubatch
+        self.context_params.n_seq_max = self.n_seq_max
         self.context_params.n_threads = self.n_threads
         self.context_params.n_threads_batch = self.n_threads_batch
         self.context_params.rope_scaling_type = (
@@ -934,7 +941,8 @@ def generate(
 
                 sample_idx += 1
                 if stopping_criteria is not None and stopping_criteria(
-                    self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :]
+                    self._input_ids[:sample_idx],
+                    self._scores[sample_idx - self.n_tokens, :],
                 ):
                     return
                 tokens_or_none = yield token
@@ -960,7 +968,10 @@ def generate(
                 )
 
     def create_embedding(
-        self, input: Union[str, List[str]], model: Optional[str] = None
+        self,
+        input: Union[str, List[str]],
+        model: Optional[str] = None,
+        return_numpy: bool = False,
     ) -> CreateEmbeddingResponse:
         """Embed a string.
 
@@ -975,9 +986,11 @@ def create_embedding(
         input = input if isinstance(input, list) else [input]
 
         # get numeric embeddings
-        embeds: Union[List[List[float]], List[List[List[float]]]]
+        embeds: Union[List[npt.NDArray[np.single]], List[List[List[float]]]]
         total_tokens: int
-        embeds, total_tokens = self.embed(input, return_count=True)  # type: ignore
+        embeds, total_tokens = self.embed(
+            input, return_count=True, return_numpy=return_numpy
+        )  # type: ignore
 
         # convert to CreateEmbeddingResponse
         data: List[Embedding] = [
@@ -999,13 +1012,103 @@ def create_embedding(
             },
         }
 
+    @overload
+    def embed(
+        self,
+        input: str,
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[False] = False,
+        return_numpy: Literal[True] = True,
+    ) -> npt.NDArray[np.single]: ...
+
+    @overload
+    def embed(
+        self,
+        input: str,
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[False] = False,
+        return_numpy: Literal[False] = False,
+    ) -> List[List[float]]: ...
+
+    @overload
+    def embed(
+        self,
+        input: List[str],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[False] = False,
+        return_numpy: Literal[True] = True,
+    ) -> List[npt.NDArray[np.single]]: ...
+
+    @overload
+    def embed(
+        self,
+        input: List[str],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[False] = False,
+        return_numpy: Literal[False] = False,
+    ) -> List[List[List[float]]]: ...
+
+    @overload
+    def embed(
+        self,
+        input: str,
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[True] = True,
+        return_numpy: Literal[True] = True,
+    ) -> Tuple[npt.NDArray[np.single], int]: ...
+
+    @overload
+    def embed(
+        self,
+        input: str,
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[True] = True,
+        return_numpy: Literal[False] = False,
+    ) -> Tuple[List[List[float]], int]: ...
+
+    @overload
+    def embed(
+        self,
+        input: List[str],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[True] = True,
+        return_numpy: Literal[True] = True,
+    ) -> Tuple[List[npt.NDArray[np.single]], int]: ...
+
+    @overload
+    def embed(
+        self,
+        input: List[str],
+        normalize: bool = False,
+        truncate: bool = True,
+        return_count: Literal[True] = True,
+        return_numpy: Literal[False] = False,
+    ) -> Tuple[List[List[List[float]]], int]: ...
+
     def embed(
         self,
         input: Union[str, List[str]],
         normalize: bool = False,
         truncate: bool = True,
         return_count: bool = False,
-    ):
+        return_numpy: bool = False,
+    ) -> Union[
+        npt.NDArray[np.single],
+        List[List[float]],
+        List[npt.NDArray[np.single]],
+        List[List[List[float]]],
+        Tuple[npt.NDArray[np.single], int],
+        Tuple[List[List[float]], int],
+        Tuple[List[npt.NDArray[np.single]], int],
+        Tuple[List[List[List[float]]], int],
+    ]:
         """Embed a string.
 
         Args:
@@ -1050,20 +1153,21 @@ def decode_batch(seq_sizes: List[int]):
                 pos: int = 0
                 for i, size in enumerate(seq_sizes):
                     ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx)
-                    embedding: List[List[float]] = [
-                        ptr[pos + j * n_embd : pos + (j + 1) * n_embd]
-                        for j in range(size)
-                    ]
+                    # Convert full pointer to numpy array once (zero-copy)
+                    ptr_array = np.ctypeslib.as_array(ptr, shape=(size * n_embd,))
+                    # Reshape to 2D array: (n_tokens, n_embd)
+                    embedding = ptr_array.reshape(size, n_embd)
                     if normalize:
-                        embedding = [
-                            internals.normalize_embedding(e) for e in embedding
-                        ]
+                        # Normalize each token embedding using vectorized operations
+                        norms = np.linalg.norm(embedding, axis=1, keepdims=True)
+                        norms = np.where(norms == 0, 1, norms)  # Avoid division by zero
+                        embedding = embedding / norms
                     data.append(embedding)
                     pos += size
             else:
                 for i in range(len(seq_sizes)):
                     ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i)
-                    embedding: List[float] = ptr[:n_embd]
+                    embedding = np.ctypeslib.as_array(ptr, shape=(n_embd,))
                     if normalize:
                         embedding = internals.normalize_embedding(embedding)
                     data.append(embedding)
@@ -1110,7 +1214,20 @@ def decode_batch(seq_sizes: List[int]):
         if self.verbose:
             llama_cpp.llama_perf_context_print(self._ctx.ctx)
 
-        output = data[0] if isinstance(input, str) else data
+        output: Union[
+            npt.NDArray[np.single],
+            List[List[float]],
+            List[npt.NDArray[np.single]],
+            List[List[List[float]]],
+        ] = data[0] if isinstance(input, str) else data
+
+        if not return_numpy:
+            if isinstance(output, np.ndarray):
+                output = output.tolist()
+            elif isinstance(output, list) and all(
+                isinstance(x, np.ndarray) for x in output
+            ):
+                output = [x.tolist() for x in output]
 
         llama_cpp.llama_kv_self_clear(self._ctx.ctx)
         self.reset()
@@ -1157,9 +1274,9 @@ def _create_completion(
         bos_token_id: int = self.token_bos()
         cls_token_id: int = self._model.token_cls()
         sep_token_id: int = self._model.token_sep()
-        prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix
-        middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix
-        suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix
+        prefix_token_id: int = 0  # self._model.token_prefix() # TODO: Fix
+        middle_token_id: int = 0  # self._model.token_middle() # TODO: Fix
+        suffix_token_id: int = 0  # self._model.token_suffix() # TODO: Fix
         add_space_prefix: bool = (
             self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
         )
@@ -1315,7 +1432,7 @@ def logit_bias_processor(
         if seed is not None:
             self.set_seed(seed)
         else:
-            self.set_seed(random.Random(self._seed).randint(0, 2 ** 32))
+            self.set_seed(random.Random(self._seed).randint(0, 2**32))
 
         finish_reason = "length"
         multibyte_fix = 0
@@ -2056,7 +2173,10 @@ def create_chat_completion_openai_v1(
             stream = kwargs.get("stream", False)  # type: ignore
             assert isinstance(stream, bool)
             if stream:
-                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
+                return (
+                    ChatCompletionChunk(**chunk)
+                    for chunk in self.create_chat_completion(*args, **kwargs)
+                )  # type: ignore
             else:
                 return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
         except ImportError:
@@ -2080,8 +2200,9 @@ def __getstate__(self):
             # Context Params
             seed=self._seed,
             n_ctx=self.context_params.n_ctx,
-            n_batch=self.n_batch,
+            n_batch=self.context_params.n_batch,
             n_ubatch=self.context_params.n_ubatch,
+            n_seq_max=self.context_params.n_seq_max,
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
@@ -2318,7 +2439,11 @@ def from_pretrained(
         if additional_files:
             for additonal_file_name in additional_files:
                 # find the additional shard file:
-                matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)]
+                matching_additional_files = [
+                    file
+                    for file in file_list
+                    if fnmatch.fnmatch(file, additonal_file_name)
+                ]
 
                 if len(matching_additional_files) == 0:
                     raise ValueError(
diff --git a/tests/test_llama.py b/tests/test_llama.py
index 0a1a9f5ad..53e449da8 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -66,6 +66,7 @@ def llama_cpp_model_path():
 
 def test_real_model(llama_cpp_model_path):
     import os
+
     assert os.path.exists(llama_cpp_model_path)
 
     params = llama_cpp.llama_model_default_params()
@@ -114,6 +115,7 @@ def test_real_model(llama_cpp_model_path):
     output_text = model.detokenize(output, special=True)
     assert output_text == b" over the lazy dog"
 
+
 def test_real_llama(llama_cpp_model_path):
     model = llama_cpp.Llama(
         llama_cpp_model_path,
@@ -132,11 +134,10 @@ def test_real_llama(llama_cpp_model_path):
         top_k=50,
         top_p=0.9,
         temperature=0.8,
-        seed=1337
+        seed=1337,
     )
     assert output["choices"][0]["text"] == " over the lazy dog"
 
-
     output = model.create_completion(
         "The capital of france is paris, 'true' or 'false'?:\n",
         max_tokens=4,
@@ -146,20 +147,19 @@ def test_real_llama(llama_cpp_model_path):
         seed=1337,
         grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "true" | "false"
-""")
+"""),
     )
     assert output["choices"][0]["text"] == "true"
 
     suffix = b"rot"
     tokens = model.tokenize(suffix, add_bos=True, special=True)
+
     def logit_processor_func(input_ids, logits):
         for token in tokens:
             logits[token] *= 1000
         return logits
 
-    logit_processors = llama_cpp.LogitsProcessorList(
-        [logit_processor_func]
-    )
+    logit_processors = llama_cpp.LogitsProcessorList([logit_processor_func])
 
     output = model.create_completion(
         "The capital of france is par",
@@ -168,7 +168,7 @@ def logit_processor_func(input_ids, logits):
         top_p=0.9,
         temperature=0.8,
         seed=1337,
-        logits_processor=logit_processors
+        logits_processor=logit_processors,
     )
     assert output["choices"][0]["text"].lower().startswith("rot")
 
@@ -184,7 +184,7 @@ def logit_processor_func(input_ids, logits):
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
     )
     number_1 = output["choices"][0]["text"]
 
@@ -196,7 +196,7 @@
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
    )
     number_2 = output["choices"][0]["text"]
 
@@ -210,7 +210,7 @@
         temperature=0.8,
         grammar=llama_cpp.LlamaGrammar.from_string("""
root ::= "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "10"
-""")
+"""),
     )
     number_3 = output["choices"][0]["text"]
 
@@ -228,7 +228,32 @@ def test_real_llama_embeddings(llama_cpp_model_path):
         n_threads_batch=multiprocessing.cpu_count(),
         logits_all=False,
         flash_attn=True,
-        embedding=True
+        embedding=True,
     )
     # Smoke test for now
     model.embed("Hello World")
+
+
+def test_embed_numpy(llama_cpp_model_path: str):
+    model = llama_cpp.Llama(
+        llama_cpp_model_path,
+        embedding=True,
+        verbose=False,
+        n_seq_max=16,  # Enable batch embeddings
+    )
+    # Test single input
+    embedding_numpy = model.embed("Hello, world!", return_numpy=True)
+    assert isinstance(embedding_numpy, np.ndarray)
+    embedding_list = model.embed("Hello, world!", return_numpy=False)
+    assert isinstance(embedding_list, list)
+    # Test batch input
+    embeddings_numpy = model.embed(
+        ["Hello, world!", "Goodbye, world!"], return_numpy=True
+    )
+    assert isinstance(embeddings_numpy, list)
+    assert all(isinstance(e, np.ndarray) for e in embeddings_numpy)
+    embeddings_list = model.embed(
+        ["Hello, world!", "Goodbye, world!"], return_numpy=False
+    )
+    assert isinstance(embeddings_list, list)
+    assert all(isinstance(e, list) for e in embeddings_list)