Pass a full generation config instead of just sequence length to the runners.

shoumikhin · web-flow · commit 6bfe337f22be · 2025-09-26T16:27:37.000-07:00
Differential Revision: D83382480 Pull Request resolved: #14635
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLM.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLM.h
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#import "ExecuTorchLLMConfig.h"
 #import "ExecuTorchLLMError.h"
 #import "ExecuTorchLLMMultimodalRunner.h"
 #import "ExecuTorchLLMTextRunner.h"
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMConfig.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMConfig.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#import <Foundation/Foundation.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ A configuration object for text generation. Copies are independent values.
+
+ This class wraps the underlying C++ GenerationConfig so that default
+ values and future fields remain a single source of truth in C++.
+*/
+NS_SWIFT_NAME(Config)
+__attribute__((deprecated("This API is experimental.")))
+__attribute__((objc_subclassing_restricted))
+@interface ExecuTorchLLMConfig : NSObject<NSCopying>
+
+/** Whether to echo the input prompt in the output. Wraps the native `echo` field. */
+@property(nonatomic, getter=isEchoEnabled) BOOL echoEnabled;
+
+/** Maximum number of new tokens to generate. Narrowed to int32_t when set. */
+@property(nonatomic) NSInteger maximumNewTokens;
+
+/** Whether this is a warmup run. Wraps the native `warming` field. */
+@property(nonatomic, getter=isWarming) BOOL warming;
+
+/** Maximum total sequence length. Narrowed to int32_t when set. */
+@property(nonatomic) NSInteger sequenceLength;
+
+/** Temperature for sampling. Narrowed from double to float when set. */
+@property(nonatomic) double temperature;
+
+/** Number of BOS tokens to add. Narrowed to int32_t when set. */
+@property(nonatomic) NSInteger bosCount;
+
+/** Number of EOS tokens to add. Narrowed to int32_t when set. */
+@property(nonatomic) NSInteger eosCount;
+
+/**
+ Initializes a default configuration and synchronously invokes the block on it.
+
+ @param block  A non-escaping block that receives the new configuration to mutate.
+ @return An initialized ExecuTorchLLMConfig instance.
+*/
+- (instancetype)initWithBlock:(NS_NOESCAPE void (^)(ExecuTorchLLMConfig *))block
+    NS_SWIFT_NAME(init(_:));
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMConfig.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMConfig.mm
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#import "ExecuTorchLLMConfig.h"
+
+#import <executorch/extension/llm/runner/irunner.h>
+
+using namespace executorch::extension;
+
+// Narrows an NSInteger to int32_t, saturating at the int32_t limits rather
+// than silently wrapping when a 64-bit value does not fit.
+static int32_t ExecuTorchLLMClampToInt32(NSInteger value) {
+  if (value > INT32_MAX) {
+    return INT32_MAX;
+  }
+  if (value < INT32_MIN) {
+    return INT32_MIN;
+  }
+  return (int32_t)value;
+}
+
+// Private accessor exposing the wrapped native configuration to the runners.
+@interface ExecuTorchLLMConfig ()
+
+- (const llm::GenerationConfig &)nativeConfig;
+
+@end
+
+@implementation ExecuTorchLLMConfig {
+  // Owns the native configuration; every property reads and writes through it.
+  std::unique_ptr<llm::GenerationConfig> _config;
+}
+
+// Accessors are written by hand on top of _config, so nothing is synthesized.
+@dynamic echoEnabled;
+@dynamic maximumNewTokens;
+@dynamic warming;
+@dynamic sequenceLength;
+@dynamic temperature;
+@dynamic bosCount;
+@dynamic eosCount;
+
+// Creates a configuration backed by a default-constructed GenerationConfig.
+- (instancetype)init {
+  if (self = [super init]) {
+    _config = std::make_unique<llm::GenerationConfig>();
+  }
+  return self;
+}
+
+// Creates a default configuration, then lets the caller mutate it in place.
+- (instancetype)initWithBlock:(NS_NOESCAPE void (^)(ExecuTorchLLMConfig *))block {
+  if (self = [self init]) {
+    // Guard against nil even though the header declares the block nonnull.
+    if (block) {
+      block(self);
+    }
+  }
+  return self;
+}
+
+// Returns an independent copy: the native config is copied by value.
+- (id)copyWithZone:(NSZone *)zone {
+  ExecuTorchLLMConfig *config = [[[self class] allocWithZone:zone] init];
+  *config->_config = *_config;
+  return config;
+}
+
+// Exposes the wrapped native configuration by const reference.
+- (const llm::GenerationConfig &)nativeConfig {
+  return *_config;
+}
+
+- (BOOL)echoEnabled {
+  return _config->echo;
+}
+
+- (void)setEchoEnabled:(BOOL)echoEnabled {
+  _config->echo = echoEnabled;
+}
+
+- (NSInteger)maximumNewTokens {
+  return _config->max_new_tokens;
+}
+
+- (void)setMaximumNewTokens:(NSInteger)maximumNewTokens {
+  _config->max_new_tokens = ExecuTorchLLMClampToInt32(maximumNewTokens);
+}
+
+- (BOOL)warming {
+  return _config->warming;
+}
+
+- (void)setWarming:(BOOL)warming {
+  _config->warming = warming;
+}
+
+- (NSInteger)sequenceLength {
+  return _config->seq_len;
+}
+
+- (void)setSequenceLength:(NSInteger)sequenceLength {
+  _config->seq_len = ExecuTorchLLMClampToInt32(sequenceLength);
+}
+
+- (double)temperature {
+  return _config->temperature;
+}
+
+- (void)setTemperature:(double)temperature {
+  _config->temperature = (float)temperature;
+}
+
+- (NSInteger)bosCount {
+  return _config->num_bos;
+}
+
+- (void)setBosCount:(NSInteger)bosCount {
+  _config->num_bos = ExecuTorchLLMClampToInt32(bosCount);
+}
+
+- (NSInteger)eosCount {
+  return _config->num_eos;
+}
+
+- (void)setEosCount:(NSInteger)eosCount {
+  _config->num_eos = ExecuTorchLLMClampToInt32(eosCount);
+}
+
+@end
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.h
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#import <Foundation/Foundation.h>
+#import "ExecuTorchLLMConfig.h"
 
 NS_ASSUME_NONNULL_BEGIN
 
@@ -26,6 +26,7 @@ typedef NS_ENUM(NSInteger, ExecuTorchLLMMultimodalInputType) {
 */
 NS_SWIFT_NAME(Image)
 __attribute__((deprecated("This API is experimental.")))
+__attribute__((objc_subclassing_restricted))
 @interface ExecuTorchLLMImage : NSObject<NSCopying>
 
 /**
@@ -44,8 +45,11 @@ __attribute__((deprecated("This API is experimental.")))
     NS_DESIGNATED_INITIALIZER;
 
 @property(nonatomic, readonly) NSData *data;
+
 @property(nonatomic, readonly) NSInteger width;
+
 @property(nonatomic, readonly) NSInteger height;
+
 @property(nonatomic, readonly) NSInteger channels;
 
 + (instancetype)new NS_UNAVAILABLE;
@@ -58,6 +62,7 @@ __attribute__((deprecated("This API is experimental.")))
 */
 NS_SWIFT_NAME(Audio)
 __attribute__((deprecated("This API is experimental.")))
+__attribute__((objc_subclassing_restricted))
 @interface ExecuTorchLLMAudio : NSObject<NSCopying>
 
 /**
@@ -76,8 +81,11 @@ __attribute__((deprecated("This API is experimental.")))
     NS_DESIGNATED_INITIALIZER;
 
 @property(nonatomic, readonly) NSData *data;
+
 @property(nonatomic, readonly) NSInteger batchSize;
+
 @property(nonatomic, readonly) NSInteger bins;
+
 @property(nonatomic, readonly) NSInteger frames;
 
 + (instancetype)new NS_UNAVAILABLE;
@@ -91,6 +99,7 @@ __attribute__((deprecated("This API is experimental.")))
 */
 NS_SWIFT_NAME(MultimodalInput)
 __attribute__((deprecated("This API is experimental.")))
+__attribute__((objc_subclassing_restricted))
 @interface ExecuTorchLLMMultimodalInput : NSObject<NSCopying>
 
 /**
@@ -124,8 +133,11 @@ __attribute__((deprecated("This API is experimental.")))
     NS_RETURNS_RETAINED;
 
 @property(nonatomic, readonly) ExecuTorchLLMMultimodalInputType type;
+
 @property(nonatomic, readonly, nullable) NSString *text;
+
 @property(nonatomic, readonly, nullable) ExecuTorchLLMImage *image;
+
 @property(nonatomic, readonly, nullable) ExecuTorchLLMAudio *audio;
 
 + (instancetype)new NS_UNAVAILABLE;
@@ -134,12 +146,13 @@ __attribute__((deprecated("This API is experimental.")))
 @end
 
 /**
- A wrapper class for the C++ llm::MultimodalLLMRunner that provides
+ A wrapper class for the C++ llm::MultimodalRunner that provides
  Objective-C APIs to load models, manage tokenization, accept mixed
  input modalities, generate text sequences, and stop the runner.
 */
 NS_SWIFT_NAME(MultimodalRunner)
 __attribute__((deprecated("This API is experimental.")))
+__attribute__((objc_subclassing_restricted))
 @interface ExecuTorchLLMMultimodalRunner : NSObject
 
 /**
@@ -169,29 +182,32 @@ __attribute__((deprecated("This API is experimental.")))
 - (BOOL)loadWithError:(NSError **)error;
 
 /**
- Generates text given a list of multimodal inputs, up to a specified sequence length.
- Invokes the provided callback for each generated token.
+ Generates text given a list of multimodal inputs, using the supplied
+ configuration object to control generation.
 
- @param inputs    An ordered array of multimodal inputs.
- @param seq_len   The maximum number of tokens to generate.
- @param callback  A block called with each generated token as an NSString.
- @param error     On failure, populated with an NSError explaining the issue.
+ The token callback, if provided, is invoked for each generated token.
+
+ @param inputs     An ordered array of multimodal inputs.
+ @param config     The generation settings to use for this call.
+ @param callback   A block called with each generated token as an NSString.
+ @param error      On failure, populated with an NSError explaining the issue.
  @return YES if generation completes successfully, NO if an error occurred.
 */
 - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
-   sequenceLength:(NSInteger)seq_len
+           config:(ExecuTorchLLMConfig *)config
 withTokenCallback:(nullable void (^)(NSString *))callback
-            error:(NSError **)error;
+            error:(NSError **)error
+    NS_SWIFT_NAME(generate(_:_:tokenCallback:));
 
 /**
  Stop producing new tokens and terminate the current generation process.
 */
 - (void)stop;
 
 /**
-  Remove the prefilled tokens from the KV cache and resets the start position
-  to 0. It also clears the stats for previous runs.
- */
+ Remove the prefilled tokens from the KV cache and reset the start position
+ to 0. It also clears the stats for previous runs.
+*/
 - (void)reset;
 
 + (instancetype)new NS_UNAVAILABLE;
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMMultimodalRunner.mm
@@ -15,6 +15,12 @@
 using namespace executorch::extension;
 using namespace executorch::runtime;
 
+@interface ExecuTorchLLMConfig ()
+// Redeclares the private accessor so this file can read the native config.
+- (const llm::GenerationConfig &)nativeConfig;
+
+@end
+
 @implementation ExecuTorchLLMImage
 
 - (instancetype)initWithData:(NSData *)data
@@ -157,7 +163,7 @@ - (BOOL)loadWithError:(NSError**)error {
 }
 
 - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
-   sequenceLength:(NSInteger)seq_len
+           config:(ExecuTorchLLMConfig *)config
 withTokenCallback:(nullable void (^)(NSString *))callback
             error:(NSError **)error {
   if (![self loadWithError:error]) {
@@ -192,7 +198,7 @@ - (BOOL)generate:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
   }
   auto status = _runner->generate(
     std::move(nativeInputs),
-    llm::GenerationConfig{.seq_len = static_cast<int32_t>(seq_len)},
+    config.nativeConfig,
     [callback](const std::string& token) {
       if (callback) {
         callback(@(token.c_str()));
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.h
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#import <Foundation/Foundation.h>
+#import "ExecuTorchLLMConfig.h"
 
 NS_ASSUME_NONNULL_BEGIN
 
@@ -49,29 +49,32 @@ __attribute__((deprecated("This API is experimental.")))
 - (BOOL)loadWithError:(NSError **)error;
 
 /**
- Generates text given an input prompt, up to a specified sequence length.
- Invokes the provided callback for each generated token.
+ Generates text given an input prompt, using the supplied configuration
+ object to control generation.
 
- @param prompt    The initial text prompt to generate from.
- @param seq_len   The maximum number of tokens to generate.
- @param callback  A block called with each generated token as an NSString.
- @param error     On failure, populated with an NSError explaining the issue.
+ The token callback, if provided, is invoked for each generated token.
+
+ @param prompt     The initial text prompt to generate from.
+ @param config     The generation settings to use for this call.
+ @param callback   A block called with each generated token as an NSString.
+ @param error      On failure, populated with an NSError explaining the issue.
  @return YES if generation completes successfully, NO if an error occurred.
 */
 - (BOOL)generate:(NSString *)prompt
-   sequenceLength:(NSInteger)seq_len
-withTokenCallback:(nullable void (^)(NSString *))callback
-            error:(NSError **)error;
+           config:(ExecuTorchLLMConfig *)config
+withTokenCallback:(nullable void (^)(NSString *token))callback
+            error:(NSError **)error
+    NS_SWIFT_NAME(generate(_:_:tokenCallback:));
 
 /**
  Stop producing new tokens and terminate the current generation process.
 */
 - (void)stop;
 
 /**
-  Remove the prefilled tokens from the KV cache and resets the start position
-  to 0. It also clears the stats for previous runs.
- */
+ Remove the prefilled tokens from the KV cache and reset the start position
+ to 0. It also clears the stats for previous runs.
+*/
 - (void)reset;
 
 + (instancetype)new NS_UNAVAILABLE;
diff --git a/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm b/extension/llm/apple/ExecuTorchLLM/Exported/ExecuTorchLLMTextRunner.mm
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/MultimodalRunnerTest.swift
diff --git a/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift b/extension/llm/apple/ExecuTorchLLM/__tests__/TextRunnerTest.swift