@@ -241,9 +241,8 @@ def streamGenerate(self, prompt: str, **generate_kwargs) -> Generator[str, None,
 
         logger.info(f"stream generate_kwargs: {generate_kwargs}")
         logger.info(f"model inputs: {inputs}")
-
+        generate_kwargs.pop('stopping_sequences', None)
         if chat_completion:
-            generate_kwargs.pop('stopping_sequences', None)
             logger.info(f"chat generate_kwargs: {generate_kwargs}")
             output = self.model.create_chat_completion(messages=inputs[0], stream=True, **generate_kwargs)
             for chunk in output:
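The hunk above hoists generate_kwargs.pop('stopping_sequences', None) out of the if chat_completion: branch so the key is dropped before either path runs, presumably because llama-cpp-python's APIs take stop rather than stopping_sequences and would reject the unknown keyword. Below is a minimal, self-contained sketch of the chat-completion streaming path against llama-cpp-python directly, under that assumption; the model path, prompt, and kwargs are illustrative and not taken from the patch.

    # Minimal sketch (not the project's code): chat-completion streaming with llama-cpp-python.
    from llama_cpp import Llama

    llm = Llama(model_path="model.gguf")  # hypothetical GGUF path

    generate_kwargs = {
        "max_tokens": 64,
        "temperature": 0.7,
        "stopping_sequences": ["</s>"],   # pipeline-level option, unknown to llama-cpp-python
    }
    # Drop the unsupported key before forwarding, as the patch now does for both branches.
    generate_kwargs.pop("stopping_sequences", None)

    messages = [{"role": "user", "content": "Say hello."}]
    for chunk in llm.create_chat_completion(messages=messages, stream=True, **generate_kwargs):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            print(delta["content"], end="", flush=True)

Popping the key once, before the branch, keeps both code paths from forwarding an argument the library would likely reject with a TypeError.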
@@ -256,12 +255,17 @@ def streamGenerate(self, prompt: str, **generate_kwargs) -> Generator[str, None,
                     val = delta['content']
                     yield val
         else:
-            input_ids = self.model.tokenizer(inputs)
+            generate_kwargs.pop('max_tokens', None)
+            input_ids = self.tokenizer.encode(inputs[0])
+            # logger.info(f"model generate : {input_ids}")
+            logger.info(f"generate_kwargs: {generate_kwargs}")
             output = self.model.generate(tokens=input_ids, **generate_kwargs)
             for token in output:
                 val = self.model.detokenize([token])
-                logger.info(f'LlamaCppPipeline -> generate -> Yield -> "{val}" -> "{type(val)}"')
-                yield val
+                # logger.info(f'LlamaCppPipeline -> generate -> Yield -> "{val}" -> "{type(val)}"')
+                chunk = val.decode('utf-8')
+                logger.info(f'LlamaCppPipeline -> generate -> Yield -> "{chunk}"')
+                yield chunk
 
         # streaming sample for test
         # start = 0
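In the raw-generation branch, the patch now encodes the prompt with the pipeline's own tokenizer, drops max_tokens (which Llama.generate() does not take), and decodes the bytes returned by Llama.detokenize() before yielding. A minimal sketch of that flow against llama-cpp-python directly follows; the model path and prompt are illustrative, and the explicit EOS check is added here for safety rather than taken from the patch.

    # Minimal sketch (not the project's code): raw token streaming with llama-cpp-python.
    from llama_cpp import Llama

    llm = Llama(model_path="model.gguf")  # hypothetical GGUF path

    generate_kwargs = {"temp": 0.7, "top_p": 0.9, "max_tokens": 64}
    # Llama.generate() has no 'max_tokens' parameter, hence the pop in the patch.
    generate_kwargs.pop("max_tokens", None)

    tokens = llm.tokenize(b"Once upon a time")      # prompt -> token ids (expects bytes)
    for token in llm.generate(tokens=tokens, **generate_kwargs):
        if token == llm.token_eos():                # generate() streams until the caller stops it
            break
        piece = llm.detokenize([token])             # returns raw bytes
        # Per-token decode can split multi-byte UTF-8 sequences; ignore partial bytes here.
        print(piece.decode("utf-8", errors="ignore"), end="", flush=True)

One caveat on the patched line: val.decode('utf-8') can raise UnicodeDecodeError when a multi-byte character is split across tokens; buffering undecoded bytes or passing errors="ignore" is a common workaround.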