Skip to content

Commit d733056

Browse files
atrivedi-tsavoritesi (Ashish Trivedi)
authored
@FIR-754: Added all parameter parsing for the llama-cli (#18)
* @FIR-754: Added all parameter parsing for the llama-cli The test results are as follows Model Response cd /usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/; ./run_llama_cli.sh "My cat's name" " 50 tinyllama-vo-5m-para.gguf tSavorite 1.5 1024 50 0.9 5 12288 0.0 [2018-03-09 13:03:17.788243] 271:272 [[32m info[m] :: </proj/work/mmankali/bld-setuptest/tsirel-31/tsi_yocto_workspace/tsi-apc-manager/platform/rsm_mgr/rsm_process_req.c:129> TXE resource allocation request processed successfully. My cat's name was Tim. He loved to play with his toy car. He would run and jump in the park, making loud noises. Tim was very happy with his new toy car. One day, Tim's mom said, "Tim. You llama_perf_sampler_print: sampling time = 999.96 ms / 56 runs ( 17.86 ms per token, 56.00 tokens per second)llama_perf_context_print: load time = 1713.55 ms llama_perf_context_print: prompt eval time = 603.51 ms / 6 tokens ( 100.58 ms per token, 9.94 tokens per second) llama_perf_context_print: eval time = 7069.36 ms / 49 runs ( 144.27 ms per token, 6.93 tokens per second) llama_perf_context_print: total time = 10046.17 ms / 55 tokens [2018-03-09 13:03:28.875126] 271:272 [[32m info[m] :: </proj/work/mmankali/bld-setuptest/tsirel-31/tsi_yocto_workspace/tsi-apc-manager/platform/rsm_mgr/rsm_process_req.c:145> TXE resource release request processed successfully. 
GGML Tsavorite Profiling Results: ------------------------------------------------------------------------------------------------------------------------ Calls Total(ms) T/call Self(ms) Function ------------------------------------------------------------------------------------------------------------------------ 2715 2720.000 1.002 0.000 [25%] RuntimeHostShim::awaitCommandListCompletion 1740 2635.984 1.515 2635.984 └─ [24%] [ txe_silu ] 925 1379.715 1.492 1379.715 └─ [12%] [ txe_mult ] 50 74.450 1.489 74.450 └─ [ 1%] [ txe_add ] 2715 0.448 0.000 0.448 └─ [ 0%] TXE 0 Idle 1 34.000 34.000 34.000 [ 0%] RuntimeHostShim::finalize 1 16.000 16.000 1.000 [ 0%] GGML Tsavorite 1 15.000 15.000 15.000 └─ [ 0%] RuntimeHostShim::initialize 2716 0.000 0.000 0.000 [ 0%] RuntimeHostShim::allocate 9120 0.000 0.000 0.000 [ 0%] RuntimeHostShim::getShmemManager 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::createCommandList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::loadBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::launchBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::addCommandToList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::finalizeCommandList 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::unloadBlob 2715 0.000 0.000 0.000 [ 0%] RuntimeHostShim::deallocate ======================================================================================================================== 33558 11098.000 0.331 11098.000 [100%] TOTAL ======================================================================================================================== ⟵ Back to Form The URL used is as follows http://10.50.0.124:5003/llama-cli?model=tiny-llama&backend=tSavorite&tokens=10&prompt=My+cat%27s+name&repeat-penalty=1.5&batch-size=1024&top-k=50&top-p=0.9&last-n=5&context-length=12288&temp=0.0 * @FIR-754: Addressed review comments. --------- Co-authored-by: Ashish Trivedi <[email protected]>
1 parent 6047d7a commit d733056

File tree

1 file changed

+26
-2
lines changed

1 file changed

+26
-2
lines changed

tools/flaskIfc/flaskIfc.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,19 @@
1010
app = Flask(__name__)
1111

1212
port = '/dev/ttyUSB3'
13+
#port = '/dev/ttyUSB2'
1314
baudrate = '921600'
15+
#baudrate = '115200'
1416
exe_path = "/usr/bin/tsi/v0.1.1.tsv31_06_06_2025/bin/"
1517

18+
DEFAULT_REPEAT_PENALTY = 1.5
19+
DEFAULT_BATCH_SIZE = 1024
20+
DEFAULT_TOP_K = 50
21+
DEFAULT_TOP_P = 0.9
22+
DEFAULT_LAST_N = 5
23+
DEFAULT_CONTEXT_LENGTH = 12288
24+
DEFAULT_TEMP = 0.0
25+
1626
@app.route('/')
1727
def index():
1828
return render_template('index.html')
@@ -25,6 +35,13 @@ def llama_cli_serial_command():
2535
backend = request.args.get('backend')
2636
tokens = request.args.get('tokens')
2737
prompt = request.args.get('prompt')
38+
repeat_penalty = request.args.get('repeat-penalty', DEFAULT_REPEAT_PENALTY)
39+
batch_size = request.args.get('batch-size', DEFAULT_BATCH_SIZE)
40+
top_k = request.args.get('top-k', DEFAULT_TOP_K)
41+
top_p = request.args.get('top-p', DEFAULT_TOP_P)
42+
last_n = request.args.get('last-n', DEFAULT_LAST_N)
43+
context_length = request.args.get('context-length', DEFAULT_CONTEXT_LENGTH)
44+
temp = request.args.get('temp', DEFAULT_TEMP)
2845

2946
# Define the model path (update with actual paths)
3047
model_paths = {
@@ -51,7 +68,7 @@ def llama_cli_serial_command():
5168
# URL to Test this end point is as follows
5269
# http://10.50.30.167:5001/llama-cli?model=tiny-llama&backend=tSavorite&tokens=5&prompt=Hello+How+are+you
5370
script_path = "./run_llama_cli.sh"
54-
command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}"
71+
command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend} {repeat_penalty} {batch_size} {top_k} {top_p} {last_n} {context_length} {temp}"
5572

5673
try:
5774
result = subprocess.run(['python3', 'serial_script.py', port, baudrate, command], capture_output=True, text=True, check=True)
@@ -167,6 +184,13 @@ def submit():
167184
backend = request.form.get('backend')
168185
tokens = request.form.get('tokens')
169186
prompt = request.form.get('prompt')
187+
repeat_penalty = request.form.get('repeat-penalty', DEFAULT_REPEAT_PENALTY)
188+
batch_size = request.form.get('batch-size', DEFAULT_BATCH_SIZE)
189+
top_k = request.form.get('top-k', DEFAULT_TOP_K)
190+
top_p = request.form.get('top-p', DEFAULT_TOP_P)
191+
last_n = request.form.get('last-n', DEFAULT_LAST_N)
192+
context_length = request.form.get('context-length', DEFAULT_CONTEXT_LENGTH)
193+
temp = request.form.get('temp', DEFAULT_TEMP)
170194

171195
# Define the model path (update with actual paths)
172196
model_paths = {
@@ -192,7 +216,7 @@ def submit():
192216
#]
193217

194218
script_path = "./run_llama_cli.sh"
195-
command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend}"
219+
command = f"cd {exe_path}; {script_path} \"{prompt}\" {tokens} {model_path} {backend} {repeat_penalty} {batch_size} {top_k} {top_p} {last_n} {context_length} {temp}"
196220

197221

198222
def run_script():

0 commit comments

Comments (0)