import json
from typing import List

import torch
from fastchat.constants import ErrorCode, SERVER_ERROR_MSG
from loguru import logger

from gpt_server.model_worker.base.model_worker_base import ModelWorkerBase


class PhiWorker(ModelWorkerBase):
    def __init__(
        self,
        controller_addr: str,
        worker_addr: str,
        worker_id: str,
        model_path: str,
        model_names: List[str],
        limit_worker_concurrency: int,
        conv_template: str = None,  # type: ignore
    ):
        super().__init__(
            controller_addr,
            worker_addr,
            worker_id,
            model_path,
            model_names,
            limit_worker_concurrency,
            conv_template,
            model_type="AutoModelForCausalLM",
        )
        # Stop token ids taken from the model's tokenizer_config.json
        self.stop_words_ids = [
            100257,  # eos
            100265,  # eos
        ]
        # Decode the stop token ids into strings so the backend can match
        # stop sequences in either form.
        self.stop = [
            self.tokenizer.decode(stop_word_id) for stop_word_id in self.stop_words_ids
        ]
        logger.info(f"{model_names[0]} stop words: {self.stop}")

    async def generate_stream_gate(self, params):
        self.call_ct += 1
        logger.info(f"params {params}")
        logger.info(f"worker_id: {self.worker_id}")
        try:
            messages = params["messages"]
            # A message list means a chat request; a plain string means text completion.
            if isinstance(messages, list):
                task = "chat"
            elif isinstance(messages, str):
                task = "completion"
            else:
                raise ValueError(f"Unsupported messages type: {type(messages)}")
            if task == "chat":
                # Kept for now to handle special cases.
                text = self.tokenizer.apply_chat_template(
                    conversation=messages,
                    tokenize=False,
                    add_generation_prompt=True,
                )
            elif task == "completion":
                text = messages

            input_ids = self.tokenizer([text], return_tensors="pt").input_ids
            # --------------- add extra parameters ------------------------
            params["messages"] = messages
            params["prompt"] = text
            params["stop"].extend(self.stop)
            params["stop_words_ids"] = self.stop_words_ids
            params["input_ids"] = input_ids
            # --------------- add extra parameters ------------------------
            async for ret in self.backend.stream_chat(params=params):
                yield json.dumps(ret).encode() + b"\0"

        except torch.cuda.OutOfMemoryError as e:
            ret = {
                "text": f"{SERVER_ERROR_MSG}\n\n({e})",
                "error_code": ErrorCode.CUDA_OUT_OF_MEMORY,
            }
            yield json.dumps(ret).encode() + b"\0"
        except (ValueError, RuntimeError) as e:
            logger.info(e)
            ret = {
                "text": f"{SERVER_ERROR_MSG}\n\n({e})",
                "error_code": ErrorCode.INTERNAL_ERROR,
            }
            yield json.dumps(ret).encode() + b"\0"

    def get_embeddings(self, params):
        return super().get_embeddings(params)


if __name__ == "__main__":
    PhiWorker.run()
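Each result that generate_stream_gate yields is a JSON object framed by a trailing null byte, so a consumer splits the byte stream on b"\0" and parses each piece. A minimal parsing sketch (the "text" and "error_code" keys mirror the error payloads above; it is an assumption here that normal chunks from the backend carry the same keys):

import json

def iter_stream_chunks(raw: bytes):
    # Yield parsed JSON objects from a b"\0"-delimited worker stream buffer.
    for piece in raw.split(b"\0"):
        if piece:  # the trailing delimiter leaves one empty segment behind
            yield json.loads(piece)

# Hypothetical buffer holding two chunks framed the way the worker emits them.
buffer = b'{"text": "Hello", "error_code": 0}\0{"text": "Hello world", "error_code": 0}\0'
for chunk in iter_stream_chunks(buffer):
    print(chunk["text"])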