Commit 09cccec ("Rebase")

Parent: 48370c6
3 files changed: +15, -69 lines

CMakeLists.txt (0 additions, 10 deletions)
Both hunks delete leftover merge-conflict markers. The first drops a duplicated `EXECUTORCH_BUILD_EXTENSION_LLM_APPLE` block from the incoming side; the second keeps that block in its later position, after the LLM runner section.

```diff
@@ -650,13 +650,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM)
   list(APPEND _executorch_extensions tokenizers)
 endif()

-<<<<<<< HEAD
-=======
-if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
-endif()
-
->>>>>>> 13d8d946c0edc7e0f8df38194406c874f4e2fbbb
 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
   install(
@@ -907,13 +900,10 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
   list(APPEND _executorch_extensions extension_llm_runner)
 endif()

-<<<<<<< HEAD
 if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
 endif()

-=======
->>>>>>> 13d8d946c0edc7e0f8df38194406c874f4e2fbbb
 if(EXECUTORCH_BUILD_KERNELS_LLM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops)
```

extension/llm/runner/_llm_runner.pyi (0 additions, 44 deletions)
The first two hunks resolve the `GenerationConfig` conflict in favor of the HEAD side: the keyword-only `__init__` stays, and the incoming no-argument variant is dropped.

```diff
@@ -34,7 +34,6 @@ class GenerationConfig:
     num_eos: int
     """Number of EOS tokens to add to the prompt."""

-<<<<<<< HEAD
     def __init__(
         self,
         *,
@@ -47,10 +46,6 @@ class GenerationConfig:
         num_eos: int = 0,
     ) -> None:
         """Initialize GenerationConfig with optional keyword arguments for all fields."""
-=======
-    def __init__(self) -> None:
-        """Initialize GenerationConfig with default values."""
->>>>>>> 13d8d946c0edc7e0f8df38194406c874f4e2fbbb
         ...

     def resolve_max_new_tokens(
```
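Taken together with the binding further down, the surviving stub gives `GenerationConfig` a keyword-only constructor over its fields. A minimal usage sketch; the import path is an assumption, since the diff shows only the stub file, not the package layout:

```python
# Assumed import path; not part of this diff.
from executorch.extension.llm.runner import GenerationConfig

# Keyword-only constructor kept by this commit; unset fields fall back to the
# defaults declared in pybindings.cpp (echo=True, max_new_tokens=-1, ...).
cfg = GenerationConfig(
    max_new_tokens=256,
    temperature=0.7,
    echo=False,
    num_eos=0,  # "Number of EOS tokens to add to the prompt" (stub docstring)
)
```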
The remaining hunks remove the HEAD-side dict-based `generate` and `generate_text` overloads along with the conflict markers, leaving one signature per method (see the sketch after these hunks).

```diff
@@ -373,32 +368,6 @@ class MultimodalRunner:
         Raises:
             RuntimeError: If generation fails
         """
-<<<<<<< HEAD
-        ...
-
-    def generate(
-        self,
-        inputs: dict,
-        config: GenerationConfig,
-        token_callback: Optional[Callable[[str], None]] = None,
-        stats_callback: Optional[Callable[[Stats], None]] = None,
-    ) -> None:
-        """
-        Generate text directly from a HuggingFace processor dict.
-
-        Expects at least 'input_ids' (torch.Tensor). If 'pixel_values' is provided,
-        an 'image_token_id' (or 'image_token_index') must also be present to locate
-        the image position(s) in input_ids.
-
-        Args:
-            inputs: HF processor outputs (e.g., from AutoProcessor.apply_chat_template)
-            config: Generation configuration
-            token_callback: Optional per-token callback
-            stats_callback: Optional stats callback
-
-        Raises:
-            RuntimeError: If required keys are missing, shapes are invalid, or generation fails
-        """
         ...

     def prefill(self, inputs: List[MultimodalInput]) -> None:
@@ -412,8 +381,6 @@ class MultimodalRunner:
         Raises:
             RuntimeError: If prefill fails
         """
-=======
->>>>>>> 13d8d946c0edc7e0f8df38194406c874f4e2fbbb
         ...

     def generate_text(
@@ -432,17 +399,6 @@ class MultimodalRunner:
         Raises:
             RuntimeError: If generation fails
         """
-<<<<<<< HEAD
-        ...
-
-    def generate_text(self, inputs: dict, config: GenerationConfig) -> str:
-        """
-        Generate text directly from a HuggingFace processor dict and return as string.
-
-        See generate(inputs: dict, ...) for expected keys and constraints.
-        """
-=======
->>>>>>> 13d8d946c0edc7e0f8df38194406c874f4e2fbbb
         ...

     def stop(self) -> None:
```
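With the dict overloads gone, the runner's remaining surface is the `MultimodalInput`-based one visible in the kept stub lines. A hedged sketch of that flow; the constructor arguments, the import path, and the way `MultimodalInput` values are built are assumptions, since none of them appear in this diff:

```python
from typing import List

# Assumed import path; not shown in this diff.
from executorch.extension.llm.runner import (
    GenerationConfig,
    MultimodalInput,
    MultimodalRunner,
)

# Constructor arguments are assumptions; the diff shows only method stubs.
runner = MultimodalRunner("model.pte", "tokenizer.model")

inputs: List[MultimodalInput] = []  # populate with text/image inputs; construction not shown here
runner.prefill(inputs)  # stub: prefill(self, inputs: List[MultimodalInput]) -> None
# The kept generate_text signature is assumed to mirror prefill's input type.
text = runner.generate_text(inputs, GenerationConfig(max_new_tokens=64))
runner.stop()
```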

extension/llm/runner/pybindings.cpp (15 additions, 15 deletions)
In both hunks the removed and re-added lines carry identical statement text, so the change is re-indentation only.

```diff
@@ -172,15 +172,15 @@ PYBIND11_MODULE(_llm_runner, m) {
               float temperature,
               int32_t num_bos,
               int32_t num_eos) {
-            GenerationConfig cfg;
-            cfg.echo = echo;
-            cfg.max_new_tokens = max_new_tokens;
-            cfg.warming = warming;
-            cfg.seq_len = seq_len;
-            cfg.temperature = temperature;
-            cfg.num_bos = num_bos;
-            cfg.num_eos = num_eos;
-            return cfg;
+            GenerationConfig cfg;
+            cfg.echo = echo;
+            cfg.max_new_tokens = max_new_tokens;
+            cfg.warming = warming;
+            cfg.seq_len = seq_len;
+            cfg.temperature = temperature;
+            cfg.num_bos = num_bos;
+            cfg.num_eos = num_eos;
+            return cfg;
           }),
           py::arg("echo") = true,
           py::arg("max_new_tokens") = -1,
@@ -200,12 +200,12 @@ PYBIND11_MODULE(_llm_runner, m) {
           py::arg("num_prompt_tokens"),
           "Resolve the maximum number of new tokens to generate based on constraints")
       .def("__repr__", [](const GenerationConfig& config) {
-        return "<GenerationConfig max_new_tokens=" +
-            std::to_string(config.max_new_tokens) +
-            " seq_len=" + std::to_string(config.seq_len) +
-            " temperature=" + std::to_string(config.temperature) +
-            " echo=" + (config.echo ? "True" : "False") +
-            " warming=" + (config.warming ? "True" : "False") + ">";
+        return "<GenerationConfig max_new_tokens=" +
+            std::to_string(config.max_new_tokens) +
+            " seq_len=" + std::to_string(config.seq_len) +
+            " temperature=" + std::to_string(config.temperature) +
+            " echo=" + (config.echo ? "True" : "False") +
+            " warming=" + (config.warming ? "True" : "False") + ">";
       });

   // Bind Stats
```
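The binding fixes the keyword defaults and the `__repr__` format, so the behavior sketch below is grounded in the hunk itself; only the first argument of `resolve_max_new_tokens` (assumed to be a maximum context length) is inferred, since the hunk names just `num_prompt_tokens`:

```python
from executorch.extension.llm.runner import GenerationConfig  # assumed import path

cfg = GenerationConfig()  # binding defaults: echo=True, max_new_tokens=-1, ...
print(cfg)
# <GenerationConfig max_new_tokens=-1 seq_len=... temperature=... echo=True warming=False>

# Docstring from the binding: "Resolve the maximum number of new tokens to
# generate based on constraints". The first positional argument is assumed.
budget = cfg.resolve_max_new_tokens(2048, 64)
```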
