@@ -583,15 +583,14 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }
 
-static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
-    bool stopped_word        = result.count("stopped_word") != 0;
-    bool stopped_eos         = json_value(result, "stopped_eos", false);
-    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
-    std::string content      = json_value(result, "content", std::string(""));
-
+static json format_final_response_oaicompat(
+        const json & request,
+        server_task_result_cmpl_final & result,
+        const std::string & completion_id,
+        bool streaming = false,
+        bool verbose = false) {
     std::string finish_reason = "length";
-    if (stopped_word || stopped_eos) {
+    if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) {
         finish_reason = "stop";
     }
 
@@ -601,7 +600,7 @@ static json format_final_response_oaicompat(const json & request, const json & r
601600 {" delta" , json::object ()}}})
602601 : json::array ({json{{" finish_reason" , finish_reason},
603602 {" index" , 0 },
604- {" message" , json{{" content" , content},
603+ {" message" , json{{" content" , result. content },
605604 {" role" , " assistant" }}}}});
606605
607606 std::time_t t = std::time (0 );
@@ -613,48 +612,42 @@ static json format_final_response_oaicompat(const json & request, const json & r
             json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
         {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
         {"usage", json {
-            {"completion_tokens", num_tokens_predicted},
-            {"prompt_tokens",     num_prompt_tokens},
-            {"total_tokens",      num_tokens_predicted + num_prompt_tokens}
+            {"completion_tokens", result.n_decoded},
+            {"prompt_tokens",     result.n_prompt_tokens},
+            {"total_tokens",      result.n_decoded + result.n_prompt_tokens}
         }},
         {"id", completion_id}
     };
 
     // extra fields for debugging purposes
     if (verbose) {
-        res["__verbose"] = result;
+        res["__verbose"] = result.to_json();
     }
 
-    if (result.contains("completion_probabilities")) {
-        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
-    }
+    // TODO: fix this
+    // if (result.contains("completion_probabilities")) {
+    //     res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
+    // }
 
-    if (result.contains("timings")) {
-        res.push_back({"timings", json_value(result, "timings", json::object())});
+    if (result.timings.prompt_n >= 0) {
+        res.push_back({"timings", result.timings.to_json()});
     }
 
     return res;
 }
 
 // return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
-    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
-        return std::vector<json>({result});
-    }
-
-    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
-    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
-    bool stopped_word   = json_value(result, "stopped_word",  false);
-    bool stopped_eos    = json_value(result, "stopped_eos",   false);
-    bool stopped_limit  = json_value(result, "stopped_limit", false);
-    std::string content = json_value(result, "content", std::string(""));
+static std::vector<json> format_partial_response_oaicompat(
+        std::string modelname,
+        server_task_result_cmpl_partial & result,
+        const std::string & completion_id) {
+    bool first = result.n_decoded == 0;
+    std::string content = result.content;
 
     std::string finish_reason;
-    if (stopped_word || stopped_eos) {
+    if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) {
         finish_reason = "stop";
-    }
-    if (stopped_limit) {
+    } else if (result.stop == STOP_TYPE_LIMIT) {
         finish_reason = "length";
     }
 
@@ -724,17 +717,15 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
724717 {" object" , " chat.completion.chunk" }
725718 };
726719
727- if (result.contains ( " timings" ) ) {
728- ret.push_back ({" timings" , json_value ( result, " timings" , json::object () )});
720+ if (result.timings . prompt_n >= 0 ) {
721+ ret.push_back ({" timings" , result. timings . to_json ( )});
729722 }
730723
731724 if (!finish_reason.empty ()) {
732- int num_tokens_predicted = json_value (result, " tokens_predicted" , 0 );
733- int num_prompt_tokens = json_value (result, " tokens_evaluated" , 0 );
734725 ret.push_back ({" usage" , json {
735- {" completion_tokens" , num_tokens_predicted },
736- {" prompt_tokens" , num_prompt_tokens },
737- {" total_tokens" , num_tokens_predicted + num_prompt_tokens }
726+ {" completion_tokens" , result. n_decoded },
727+ {" prompt_tokens" , result. n_prompt_tokens },
728+ {" total_tokens" , result. n_decoded + result. n_prompt_tokens }
738729 }});
739730 }
740731
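
For readers following the refactor: these helpers now consume typed result structs instead of re-parsing a raw `json` blob. Below is a minimal sketch of what those structs must look like, reconstructed purely from the member accesses in this diff. The `nlohmann` alias, the `STOP_TYPE_NONE` value, the default member values, and the exact `to_json()` payloads are assumptions for illustration, not the server's real definitions.

```cpp
// Sketch only: field names are taken from the accesses in the diff above;
// everything else (defaults, extra members) is assumed for illustration.
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json; // assumption: the server aliases nlohmann's type

enum stop_type {
    STOP_TYPE_NONE,  // assumed default; the diff only names the three below
    STOP_TYPE_WORD,  // hit a user-supplied stop word   -> finish_reason "stop"
    STOP_TYPE_EOS,   // model emitted end-of-sequence   -> finish_reason "stop"
    STOP_TYPE_LIMIT, // prediction/context limit reached -> finish_reason "length"
};

struct result_timings {
    int prompt_n = -1; // < 0 means "no timings recorded", matching the guards above

    json to_json() const {
        // the real struct serializes per-phase timing counters; one field is
        // enough here to make the `prompt_n >= 0` check meaningful
        return json{{"prompt_n", prompt_n}};
    }
};

struct server_task_result_cmpl_final {
    std::string    content;             // full generated text
    int            n_decoded       = 0; // tokens generated  ("completion_tokens")
    int            n_prompt_tokens = 0; // tokens evaluated  ("prompt_tokens")
    stop_type      stop            = STOP_TYPE_NONE;
    result_timings timings;

    json to_json() const { // backs the verbose "__verbose" dump
        return json{
            {"content",         content},
            {"n_decoded",       n_decoded},
            {"n_prompt_tokens", n_prompt_tokens},
        };
    }
};

// server_task_result_cmpl_partial would carry the same fields, with `content`
// holding only the newly decoded chunk and `n_decoded` doubling as the token
// counter: `n_decoded == 0` replaces the old `"oaicompat_token_ctr" == 0`
// first-chunk test.
```

The practical win shows up in the `finish_reason` logic: a misspelled key such as `"stopped_eos"` used to fall back silently to `json_value`'s default, whereas `result.stop == STOP_TYPE_EOS` fails to compile if the field or enum name is wrong.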