Refactor chat UI and token generation logic, and handle token generation errors

vraspar · vraspar · commit 46581dae0727 · 2024-10-09T13:56:14.000-07:00
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/ContentView.swift
@@ -1,3 +1,6 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
 import SwiftUI
 
 
@@ -12,6 +15,8 @@ struct ContentView: View {
     @State private var messages: [Message] = []  // Store chat messages locally
     @State private var isGenerating: Bool = false  // Track token generation state
     @State private var stats: String = ""  // token genetation stats
+    @State private var showAlert: Bool = false
+    @State private var errorMessage: String = ""
     
     var body: some View {
         VStack {
@@ -88,12 +93,25 @@ struct ContentView: View {
         }
         .onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationStats"))) { notification in
             if let userInfo = notification.userInfo,
-               let totalTime = userInfo["totalTime"] as? Int,
-               let firstTokenTime = userInfo["firstTokenTime"] as? Int,
-               let tokenCount = userInfo["tokenCount"] as? Int {
-                stats = "Generated \(tokenCount) tokens in \(totalTime) ms. First token in \(firstTokenTime) ms."
+               let promptProcRate = userInfo["promptProcRate"] as? Double,
+               let tokenGenRate = userInfo["tokenGenRate"] as? Double {
+                stats = String(format: "Token generation rate: %.2f tokens/s. Prompt processing rate: %.2f tokens/s", tokenGenRate, promptProcRate)
+            }
+        }
+        .onReceive(NotificationCenter.default.publisher(for: NSNotification.Name("TokenGenerationError"))) { notification in
+            if let userInfo = notification.userInfo, let error = userInfo["error"] as? String {
+                    errorMessage = error
+                    showAlert = true
             }
         }
+        .alert(isPresented: $showAlert) {
+            Alert(
+                title: Text("Error"),
+                message: Text(errorMessage),
+                dismissButton: .default(Text("OK"))
+            )
+        }
+        
     }
 }
 
@@ -117,7 +135,7 @@ struct ChatBubble: View {
                     .background(Color(.systemGray5))
                     .foregroundColor(.black)
                     .cornerRadius(25)
-                    .padding(.horizontal, 20)
+                    .padding(.horizontal, 10)
                 Spacer()
             }
         }
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/GenAIGenerator.mm
@@ -1,113 +1,143 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
 #import "GenAIGenerator.h"
 #include "LocalLLM-Swift.h"
 #include "ort_genai.h"
 #include "ort_genai_c.h"
 #include <chrono>
+#include <vector>
 
 @implementation GenAIGenerator
 
-typedef std::chrono::high_resolution_clock Clock;
+typedef std::chrono::steady_clock Clock;
 typedef std::chrono::time_point<Clock> TimePoint;
+static std::unique_ptr<OgaModel> model = nullptr;
+static std::unique_ptr<OgaTokenizer> tokenizer = nullptr;
 
 + (void)generate:(nonnull NSString*)input_user_question {
-    NSLog(@"Starting token generation...");
-    
-    NSString* llmPath = [[NSBundle mainBundle] resourcePath];
-    const char* modelPath = llmPath.cString;
-    
-    // Log model creation
-    NSLog(@"Creating model ...");
-    auto model = OgaModel::Create(modelPath);
-    if (!model) {
-        NSLog(@"Failed to create model.");
-        return;
-    }
-    
-    NSLog(@"Creating tokenizer...");
-    auto tokenizer = OgaTokenizer::Create(*model);
-    if (!tokenizer) {
-        NSLog(@"Failed to create tokenizer.");
-        return;
-    }
-    
-    auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
-    
-    // Construct the prompt
-    NSString* promptString = [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
-    const char* prompt = [promptString UTF8String];
-    
-    NSLog(@"Encoding prompt...");
-    auto sequences = OgaSequences::Create();
-    tokenizer->Encode(prompt, *sequences);
-    
-    // Log parameters
-    NSLog(@"Setting generator parameters...");
-    auto params = OgaGeneratorParams::Create(*model);
-    params->SetSearchOption("max_length", 200);
-    params->SetInputSequences(*sequences);
-    
-    NSLog(@"Creating generator...");
-    auto generator = OgaGenerator::Create(*model, *params);
-    
-    bool isFirstToken = true;
-    TimePoint startTime = Clock::now();
-    TimePoint firstTokenTime;
-    int tokenCount = 0;
-    
-    NSLog(@"Starting token generation loop...");
-    while (!generator->IsDone()) {
-        generator->ComputeLogits();
-        generator->GenerateNextToken();
-        
-        if (isFirstToken) {
-            NSLog(@"First token generated.");
-            firstTokenTime = Clock::now();
-            isFirstToken = false;
+    std::vector<long long> tokenTimes; // per-token generation times in milliseconds, one entry per generated token
+    TimePoint startTime, firstTokenTime, tokenStartTime;
+
+    @try {
+        NSLog(@"Starting token generation...");
+
+        if (!model) {
+            NSLog(@"Creating model...");
+            NSString* llmPath = [[NSBundle mainBundle] resourcePath];
+            const char* modelPath = llmPath.cString;
+            model = OgaModel::Create(modelPath); // may throw a C++ exception; NOTE(review): confirm the @catch (NSException*) below actually handles it
+
+            if (!model) {
+                @throw [NSException exceptionWithName:@"ModelCreationError" reason:@"Failed to create model." userInfo:nil];
+            }
         }
-        
-        // Get the sequence data
-        const int32_t* seq = generator->GetSequenceData(0);
-        size_t seq_len = generator->GetSequenceCount(0);
-        
-        // Decode the new token
-        const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);
-        
-        // Check for decoding failure
-        if (!decode_tokens) {
-            NSLog(@"Token decoding failed.");
-            break;
+
+        if (!tokenizer) {
+            NSLog(@"Creating tokenizer...");
+            tokenizer = OgaTokenizer::Create(*model);  // throws exception
+            if (!tokenizer) {
+                @throw [NSException exceptionWithName:@"TokenizerCreationError" reason:@"Failed to create tokenizer." userInfo:nil];
+            }
         }
+
+        auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+
+        // Construct the prompt
+        NSString* promptString = [NSString stringWithFormat:@"<|user|>\n%@<|end|>\n<|assistant|>", input_user_question];
+        const char* prompt = [promptString UTF8String];
+
+        // Encode the prompt
+        auto sequences = OgaSequences::Create();
+        tokenizer->Encode(prompt, *sequences);
+
+        size_t promptTokensCount = sequences->SequenceCount(0); 
+
+        NSLog(@"Setting generator parameters...");
+        auto params = OgaGeneratorParams::Create(*model);
+        params->SetSearchOption("max_length", 200);
+        params->SetInputSequences(*sequences);
+
+        auto generator = OgaGenerator::Create(*model, *params);
+
+        bool isFirstToken = true;
+        NSLog(@"Starting token generation loop...");
         
-        NSLog(@"Decoded token: %s", decode_tokens);
-        tokenCount++;
+        startTime = Clock::now();
+        while (!generator->IsDone()) {
+            tokenStartTime = Clock::now();
+
+            generator->ComputeLogits();
+            generator->GenerateNextToken();
+
+            if (isFirstToken) {
+                firstTokenTime = Clock::now();
+                isFirstToken = false;
+            }
+
+            // Get the sequence data and decode the token
+            const int32_t* seq = generator->GetSequenceData(0);
+            size_t seq_len = generator->GetSequenceCount(0);
+            const char* decode_tokens = tokenizer_stream->Decode(seq[seq_len - 1]);
+
+            if (!decode_tokens) {
+                @throw [NSException exceptionWithName:@"TokenDecodeError" reason:@"Token decoding failed." userInfo:nil];
+            }
+
+            // Stop the per-token timer here so the NSString conversion and addDecodedToken call below are not included
+            TimePoint tokenEndTime = Clock::now();
+            auto tokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(tokenEndTime - tokenStartTime).count();
+            tokenTimes.push_back(tokenDuration);
+            NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
+            [SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
+        }
+
+        TimePoint endTime = Clock::now();
+        // Log token times
+        NSLog(@"Per-token generation times: %@", [self formatTokenTimes:tokenTimes]);
+
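+        // Calculate metrics. NOTE(review): durations are in ms, so promptTokensCount / firstTokenDuration is tokens per ms although it is reported as tokens/s (tokenGenRate applies * 1000.0) — confirm and fix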
+        auto totalDuration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
+        auto firstTokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(firstTokenTime - startTime).count();
+
+        double promtProcTime =  (double)promptTokensCount / firstTokenDuration;
+        double tokenGenRate = (double)(tokenTimes.size() - 1) * 1000.0 / (totalDuration - firstTokenDuration);
+
+        NSLog(@"Token generation completed. Total time: %lld ms, First token time: %lld ms, Total tokens: %zu", totalDuration, firstTokenDuration, tokenTimes.size());
+        NSLog(@"Prompt tokens: %zu, Prompt Processing Time: %f tokens/s", promptTokensCount, promtProcTime);
+        NSLog(@"Generated tokens: %zu, Token Generation Rate: %f tokens/s", tokenTimes.size(), tokenGenRate);
+
         
-        // Convert token to NSString and update UI on the main thread
-        NSString* decodedTokenString = [NSString stringWithUTF8String:decode_tokens];
-        [SharedTokenUpdater.shared addDecodedToken:decodedTokenString];
-    }
+        NSDictionary *stats = @{
+            @"tokenGenRate" : @(tokenGenRate),
+            @"promptProcRate": @(promtProcTime)
+        };
+        // Post the stats and completion notifications on the main queue
+        dispatch_async(dispatch_get_main_queue(), ^{
+            [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationStats" object:nil userInfo:stats];
+            [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationCompleted" object:nil];
+        });
+
+        NSLog(@"Token generation completed.");
 
+    } @catch (NSException* e) {
+        NSString* errorMessage = e.reason;
+        NSLog(@"Error during generation: %@", errorMessage);
 
-    TimePoint endTime = Clock::now();
-    auto totalDuration = std::chrono::duration_cast<std::chrono::milliseconds>(endTime - startTime).count();
-    auto firstTokenDuration = std::chrono::duration_cast<std::chrono::milliseconds>(firstTokenTime - startTime).count();
-    
-    NSLog(@"Token generation completed. Total time: %lld ms, First token time: %lld ms, Total tokens: %d", totalDuration, firstTokenDuration, tokenCount);
-
-    NSDictionary *stats = @{
-        @"totalTime": @(totalDuration),
-        @"firstTokenTime": @(firstTokenDuration),
-        @"tokenCount": @(tokenCount)
-    };
-
-    // notify main thread that token generation is complete 
-    dispatch_async(dispatch_get_main_queue(), ^{
-        [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationCompleted" object:nil];
-        [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationStats" object:nil userInfo:stats];
-    });
-    NSLog(@"Token generation completed.");
+        // Send error to the UI
+        NSDictionary *errorInfo = @{@"error": errorMessage};
+        dispatch_async(dispatch_get_main_queue(), ^{
+            [[NSNotificationCenter defaultCenter] postNotificationName:@"TokenGenerationError" object:nil userInfo:errorInfo];
+        });
+    }
+}
+
+// Utility: joins per-token times into one string of "<time> ms, " entries for logging (every entry, including the last, ends with ", ")
++ (NSString*)formatTokenTimes:(const std::vector<long long>&)tokenTimes {
+    NSMutableString *formattedTimes = [NSMutableString string];
+    for (size_t i = 0; i < tokenTimes.size(); i++) {
+        [formattedTimes appendFormat:@"%lld ms, ", tokenTimes[i]];
+    }
+    return [formattedTimes copy];
 }
 
 @end
diff --git a/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/Simulator Screenshot - iPhone 16.png b/mobile/examples/phi-3/ios/LocalLLM/LocalLLM/Simulator Screenshot - iPhone 16.png