
Commit d828754

feat: LLM demo
1 parent b2e8023 commit d828754

4 files changed: +780 −0 lines changed

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
import os
import torch
import platform
import subprocess
import json
from colorama import Fore, Style
from tempfile import NamedTemporaryFile
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig


def init_model():
    print("init model ...")
    model = AutoModelForCausalLM.from_pretrained(
        r"G:\04-model-weights\Baichuan2-7B-Chat-4bits",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    model.generation_config = GenerationConfig.from_pretrained(
        r"G:\04-model-weights\Baichuan2-7B-Chat-4bits"
    )
    tokenizer = AutoTokenizer.from_pretrained(
        r"G:\04-model-weights\Baichuan2-7B-Chat-4bits",
        use_fast=False,
        trust_remote_code=True
    )
    return model, tokenizer


def clear_screen():
    if platform.system() == "Windows":
        os.system("cls")
    else:
        os.system("clear")
    print(Fore.YELLOW + Style.BRIGHT + "欢迎使用百川大模型,输入进行对话,vim 多行输入,clear 清空历史,CTRL+C 中断生成,stream 开关流式生成,exit 结束。")
    return []


def vim_input():
    # Open an empty vim buffer for multi-line input, then read the saved file back.
    with NamedTemporaryFile() as tempfile:
        tempfile.close()
        subprocess.call(['vim', '+star', tempfile.name])
        text = open(tempfile.name).read()
    return text


def main(stream=True):
    model, tokenizer = init_model()
    messages = clear_screen()

    # Log "conversation length, GPU memory used" pairs after every turn.
    path_log = r"gpu_usage_log.txt"
    f = open(path_log, "w")

    while True:
        prompt = input(Fore.GREEN + Style.BRIGHT + "\n用户:" + Style.NORMAL)
        if prompt.strip() == "exit":
            break
        if prompt.strip() == "clear":
            messages = clear_screen()
            continue
        if prompt.strip() == 'vim':
            prompt = vim_input()
            print(prompt)
        print(Fore.CYAN + Style.BRIGHT + "\nBaichuan 2:" + Style.NORMAL, end='')
        if prompt.strip() == "stream":
            stream = not stream
            print(Fore.YELLOW + "({}流式生成)\n".format("开启" if stream else "关闭"), end='')
            continue
        messages.append({"role": "user", "content": prompt})
        if stream:
            position = 0
            try:
                for response in model.chat(tokenizer, messages, stream=True):
                    print(response[position:], end='', flush=True)
                    position = len(response)
                    if torch.backends.mps.is_available():
                        torch.mps.empty_cache()
            except KeyboardInterrupt:
                pass
            print()
        else:
            response = model.chat(tokenizer, messages)
            print(response)
            if torch.backends.mps.is_available():
                torch.mps.empty_cache()
        messages.append({"role": "assistant", "content": response})

        # Total number of characters exchanged so far.
        conversation_length = sum([len(content['content']) for content in messages])
        # Query current GPU memory usage via gpustat and append it to the log.
        result = subprocess.run(['gpustat', '--json'], stdout=subprocess.PIPE)
        output = result.stdout.decode()
        data = json.loads(output)
        used_memory = data['gpus'][0]['memory.used']
        f.write("{}, {}\n".format(conversation_length, used_memory))
        f.flush()

    f.close()
    print(Style.RESET_ALL)


if __name__ == "__main__":
    main()
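The per-turn logging above shells out to gpustat for the memory.used field. If gpustat is not installed, the same number can be read from nvidia-smi directly; the following is a minimal sketch under that assumption (query_gpu_memory_mib is a hypothetical helper, not part of this commit):

import subprocess

def query_gpu_memory_mib(gpu_index=0):
    # Hypothetical alternative to gpustat: ask nvidia-smi for used memory in MiB.
    out = subprocess.run(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
        stdout=subprocess.PIPE, check=True,
    ).stdout.decode()
    # nvidia-smi prints one line per GPU; each line is an integer number of MiB.
    return int(out.splitlines()[gpu_index].strip())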
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
import os
import platform
import subprocess
import json
from transformers import AutoTokenizer, AutoModel

# MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
# TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)

MODEL_PATH = r"G:\04-model-weights\chatglm\chatglm3-6b"
TOKENIZER_PATH = r"G:\04-model-weights\chatglm\chatglm3-6b"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).quantize(bits=4, device="cuda").cuda().eval()
# model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).cuda().eval()
# add .quantize(bits=4, device="cuda").cuda() before .eval() to use int4 model
# must use cuda to load int4 model

os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
stop_stream = False

welcome_prompt = "欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序"


def build_prompt(history):
    prompt = welcome_prompt
    for query, response in history:
        prompt += f"\n\n用户:{query}"
        prompt += f"\n\nChatGLM3-6B:{response}"
    return prompt


def main():
    past_key_values, history = None, []
    global stop_stream
    print(welcome_prompt)

    # Log "conversation length, GPU memory used" pairs after every turn.
    path_log = r"gpu_usage_log.txt"
    f = open(path_log, "w")
    while True:
        query = input("\n用户:")
        if query.strip() == "stop":
            break
        if query.strip() == "clear":
            past_key_values, history = None, []
            os.system(clear_command)
            print(welcome_prompt)
            continue
        print("\nChatGLM:", end="")
        current_length = 0
        for response, history, past_key_values in model.stream_chat(tokenizer, query, history=history, top_p=1,
                                                                     temperature=0.01,
                                                                     past_key_values=past_key_values,
                                                                     return_past_key_values=True):
            if stop_stream:
                stop_stream = False
                break
            else:
                print(response[current_length:], end="", flush=True)
                current_length = len(response)

        # Total number of characters exchanged so far.
        conversation_length = sum([len(content['content']) for content in history])
        # Query current GPU memory usage via gpustat and append it to the log.
        result = subprocess.run(['gpustat', '--json'], stdout=subprocess.PIPE)
        output = result.stdout.decode()
        data = json.loads(output)
        used_memory = data['gpus'][0]['memory.used']
        f.write("{}, {}\n".format(conversation_length, used_memory))
        f.flush()
        print("")

    f.close()


if __name__ == "__main__":
    main()
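Both demos write gpu_usage_log.txt as one "conversation_length, used_memory" pair per line. A minimal sketch of how that log could be plotted afterwards (assumes matplotlib is available; not part of this commit):

import matplotlib.pyplot as plt

lengths, memory = [], []
with open("gpu_usage_log.txt") as fin:
    for line in fin:
        if not line.strip():
            continue
        # Each line looks like "1234, 5678": characters exchanged, MiB used.
        length_str, mem_str = line.split(",")
        lengths.append(int(length_str))
        memory.append(int(mem_str))

plt.plot(lengths, memory, marker="o")
plt.xlabel("conversation length (characters)")
plt.ylabel("GPU memory used (MiB)")
plt.savefig("gpu_usage.png")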
