---
title: Run the Chatbot Server
weight: 3

layout: learningpathall
---

## Script for the ONNX Runtime-based LLM Server
Now create a `phi3v.py` script using the following content. This script runs the Phi-3.5 vision model with ONNX Runtime.

```python
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License
import argparse
import os
import glob
import time
from pathlib import Path

import onnxruntime_genai as og

def _find_dir_contains_sub_dir(current_dir: Path, target_dir_name):
    # Walk up the directory tree until a directory matching target_dir_name is found.
    curr_path = Path(current_dir).absolute()
    # Note: the root_dir argument of glob.glob requires Python 3.10 or later.
    target_dir = glob.glob(target_dir_name, root_dir=curr_path)
    if target_dir:
        return Path(curr_path / target_dir[0]).absolute()
    if curr_path.parent == curr_path:
        # Reached the filesystem root without a match
        return None
    return _find_dir_contains_sub_dir(curr_path / '..', target_dir_name)

def _complete(text, state):
    # readline tab-completion helper for file system paths
    return (glob.glob(text + "*") + [None])[state]

def run(args: argparse.Namespace):
    print("Loading model...")
    config = og.Config(args.model_path)
    config.clear_providers()
    if args.execution_provider != "cpu":
        print(f"Setting model to {args.execution_provider}...")
        config.append_provider(args.execution_provider)
    model = og.Model(config)
    print("Model loaded")

    processor = model.create_multimodal_processor()
    tokenizer_stream = processor.create_stream()
    interactive = not args.non_interactive

    while True:
        if interactive:
            try:
                import readline
                readline.set_completer_delims(" \t\n;")
                readline.parse_and_bind("tab: complete")
                readline.set_completer(_complete)
            except ImportError:
                # readline is not available on some platforms. Ignore it.
                pass
            image_paths = [
                image_path.strip()
                for image_path in input(
                    "Image Path (comma separated; leave empty if no image): "
                ).split(",")
            ]
        elif args.image_paths:
            image_paths = args.image_paths
        else:
            image_paths = [
                str(
                    _find_dir_contains_sub_dir(Path(__file__).parent, "test")
                    / "test_models" / "images" / "australia.jpg"
                )
            ]
        image_paths = [image_path for image_path in image_paths if image_path]

        # Build the Phi-3.5 vision chat prompt: one <|image_n|> tag per image,
        # followed by the user's question.
        images = None
        prompt = "<|user|>\n"
        if len(image_paths) == 0:
            print("No image provided")
        else:
            for i, image_path in enumerate(image_paths):
                if not os.path.exists(image_path):
                    raise FileNotFoundError(f"Image file not found: {image_path}")
                print(f"Using image: {image_path}")
                prompt += f"<|image_{i+1}|>\n"
            images = og.Images.open(*image_paths)

        if interactive:
            text = input("Prompt: ")
        elif args.prompt:
            text = args.prompt
        else:
            text = "What is shown in this image?"
        prompt += f"{text}<|end|>\n<|assistant|>\n"

        print("Processing images and prompt...")
        inputs = processor(prompt, images=images)

        print("Generating response...")
        start_time = time.time()
        params = og.GeneratorParams(model)
        params.set_inputs(inputs)
        params.set_search_options(max_length=7680)
        generator = og.Generator(model, params)

        first_token_duration = None
        token_count = 0
        while not generator.is_done():
            generator.generate_next_token()
            new_token = generator.get_next_tokens()[0]
            decoded_token = tokenizer_stream.decode(new_token)
            token_count += 1
            if token_count == 1:
                # Record the time to first token (TTFT)
                ft_end = time.time()
                first_token_duration = ft_end - start_time
            print(decoded_token, end="", flush=True)
        end_time = time.time()
        total_run_time = end_time - start_time
        # Guard against a zero-token generation, which would leave ft_end undefined
        tokens_per_sec = token_count / (end_time - ft_end) if token_count else 0.0
        print()
        print(f"Total Time          : {total_run_time:.4f} sec")
        if first_token_duration is not None:
            print(f"Time to First Token : {first_token_duration:.4f} sec")
        print(f"Tokens per second   : {tokens_per_sec:.2f} tokens/sec")
        print("\n\n")

        # Delete the generator to free the captured graph before creating another one
        del generator
        if not interactive:
            break

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--model_path", type=str, required=True,
        help="Path to the folder containing the model"
    )
    parser.add_argument(
        "-e", "--execution_provider", type=str, required=True,
        choices=["cpu", "cuda", "dml"], help="Execution provider to run the model"
    )
    parser.add_argument(
        "--image_paths", nargs='*', type=str, required=False,
        help="Paths to the images, mainly for CI usage"
    )
    parser.add_argument(
        '-pr', '--prompt', required=False,
        help='Input prompt to generate tokens from, mainly for CI usage'
    )
    parser.add_argument(
        '--non-interactive', action=argparse.BooleanOptionalAction, required=False,
        help='Non-interactive mode, mainly for CI usage'
    )
    args = parser.parse_args()
    run(args)
```
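
The script wraps each request in the Phi-3.5 vision chat template: a `<|user|>` turn containing one `<|image_n|>` tag per attached image and the question text, terminated by `<|end|>` and followed by an `<|assistant|>` turn for the model to complete. As an illustration only, the prompt built for a single image and the script's default non-interactive question looks like this:

```python
# Illustration only: the prompt string phi3v.py assembles for one image,
# using its default question from non-interactive mode.
prompt = (
    "<|user|>\n"
    "<|image_1|>\n"
    "What is shown in this image?<|end|>\n"
    "<|assistant|>\n"
)
```

After each response, the script also prints the total generation time, the time to first token, and the tokens per second, which is useful when comparing execution providers.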

## Run the Server

You are now ready to run the server to enable the chatbot.
Use the following command in a terminal to start the server:

```bash
python3 phi3v.py -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 -e cpu
```
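
For a quick smoke test without the interactive prompt loop, you can also pass an image and a question directly on the command line using the script's CI flags. The image path below is a placeholder; substitute any image on your machine:

```bash
# ./example.jpg is a placeholder; point it at any local image file
python3 phi3v.py -m cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4 -e cpu \
    --non-interactive --image_paths ./example.jpg -pr "Describe this image."
```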

You should see output similar to the image below when the server starts successfully:
