You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false')
91
92
parser.add_argument('-b', '--batch_size_for_cuda_graph', type=int, default=1, help='Max batch size for CUDA graph')
92
93
parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.')
94
+
parser.add_argument('--chunk_size', type=int, default=-1, help='Chunk size for prefix chunking during context processing (default: -1 = disabled, >0 = enabled)')
93
95
parser.add_argument('--non-interactive', action=argparse.BooleanOptionalAction, required=False, default=False, help='Non-interactive mode, mainly for CI usage')
Copy file name to clipboard. Expand all lines: src/config.h
+1 Lines changed: 1 addition & 0 deletions
Original file line number
Diff line number
Diff line change
@@ -273,6 +273,7 @@ struct Config {
273
273
float length_penalty{1.0f}; // Exponential penalty to the length that is used with beam-based generation. length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
274
274
bool past_present_share_buffer{}; // The past/present kv tensors are shared and allocated once to max_length (cuda only)
275
275
int random_seed{-1}; // -1 = Seed with random device, otherwise use value to seed RNG
276
+
int chunk_size{-1}; // Chunk size for prefix chunking during context processing. -1 = disabled, >0 = enabled with specified chunk size.
0 commit comments