Android demo app update (#93)

kirklandsign · web-flow · commit baa5552aff9b · 2025-10-17T15:42:39.000-07:00
Add Gemma 3 support. This needs model-specific handling for the stop token and for the image input type. Image resizing still needs to be figured out next.

Also add a skeleton for the audio input button.
diff --git a/llm/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/llm/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java
@@ -61,6 +61,7 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlmCall
   private ImageButton mSendButton;
   private ImageButton mGalleryButton;
   private ImageButton mCameraButton;
+  private ImageButton mAudioButton;
   private ListView mMessagesView;
   private MessageAdapter mMessageAdapter;
   private LlmModule mModule = null;
@@ -81,30 +82,36 @@ public class MainActivity extends AppCompatActivity implements Runnable, LlmCall
   private Runnable memoryUpdater;
   private boolean mThinkMode = false;
   private int promptID = 0;
-  private long startPos = 0;
-  private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2;
   private Executor executor;
   private boolean sawStartHeaderId = false;
 
   @Override
   public void onResult(String result) {
     if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) {
+      // For gemma and llava, we need to call stop() explicitly
+      if (mCurrentSettingsFields.getModelType() == ModelType.GEMMA_3
+          || mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) {
+        mModule.stop();
+      }
       return;
     }
     result = PromptFormat.replaceSpecialToken(mCurrentSettingsFields.getModelType(), result);
 
-    if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_3 && result.equals("<|start_header_id|>")) {
+    if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_3
+        && result.equals("<|start_header_id|>")) {
       sawStartHeaderId = true;
     }
-    if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_3 && result.equals("<|end_header_id|>")) {
+    if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_3
+        && result.equals("<|end_header_id|>")) {
       sawStartHeaderId = false;
       return;
     }
     if (sawStartHeaderId) {
       return;
     }
 
-    boolean keepResult = !(result.equals("\n") || result.equals("\n\n")) || !mResultMessage.getText().isEmpty();
+    boolean keepResult =
+        !(result.equals("\n") || result.equals("\n\n")) || !mResultMessage.getText().isEmpty();
     if (keepResult) {
       mResultMessage.appendText(result);
       run();
@@ -466,6 +473,11 @@ private void setupMediaButton() {
                   .setMediaType(ActivityResultContracts.PickVisualMedia.ImageOnly.INSTANCE)
                   .build());
         });
+    mAudioButton = requireViewById(R.id.audioButton);
+    mAudioButton.setOnClickListener(
+        view -> {
+          mAddMediaLayout.setVisibility(View.GONE);
+        });
     mCameraButton = requireViewById(R.id.cameraButton);
     mCameraButton.setOnClickListener(
         view -> {
@@ -661,7 +673,8 @@ private void showMediaPreview(List<Uri> uris) {
 
     // For LLava, we want to call prefill_image as soon as an image is selected
     // Llava only support 1 image for now
-    if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5 || mCurrentSettingsFields.getModelType() == ModelType.GEMMA_3) {
+    if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5
+        || mCurrentSettingsFields.getModelType() == ModelType.GEMMA_3) {
       List<ETImage> processedImageList = getProcessedImagesForModel(mSelectedImageUri);
       if (!processedImageList.isEmpty()) {
         mMessageAdapter.add(
@@ -673,12 +686,19 @@ private void showMediaPreview(List<Uri> uris) {
               ETLogging.getInstance().log("Starting runnable prefill image");
               ETImage img = processedImageList.get(0);
               ETLogging.getInstance().log("Llava start prefill image");
-              startPos =
-                  mModule.prefillImages(
-                      img.getInts(),
-                      img.getWidth(),
-                      img.getHeight(),
-                      ModelUtils.VISION_MODEL_IMAGE_CHANNELS);
+              if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) {
+                mModule.prefillImages(
+                    img.getInts(),
+                    img.getWidth(),
+                    img.getHeight(),
+                    ModelUtils.VISION_MODEL_IMAGE_CHANNELS);
+              } else if (mCurrentSettingsFields.getModelType() == ModelType.GEMMA_3) {
+                mModule.prefillImages(
+                    img.getFloats(),
+                    img.getWidth(),
+                    img.getHeight(),
+                    ModelUtils.VISION_MODEL_IMAGE_CHANNELS);
+              }
             };
         executor.execute(runnable);
       }
@@ -722,7 +742,6 @@ private void onModelRunStopped() {
           String rawPrompt = mEditTextMessage.getText().toString();
           String finalPrompt =
               mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt, mThinkMode);
-          mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt, mThinkMode);
           // We store raw prompt into message adapter, because we don't want to show the extra
           // tokens from system prompt
           mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID));
@@ -769,10 +788,7 @@ public void run() {
                   } else {
                     ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt);
                     mModule.generate(
-                        finalPrompt,
-                        (int) (finalPrompt.length() * 0.75) + 64,
-                        MainActivity.this,
-                        false);
+                        finalPrompt, ModelUtils.TEXT_MODEL_SEQ_LEN, MainActivity.this, false);
                   }
 
                   long generateDuration = System.currentTimeMillis() - generateStartTime;
diff --git a/llm/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/llm/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java
@@ -15,8 +15,8 @@ public class ModelUtils {
   // XNNPACK or Vulkan
   static final int VISION_MODEL = 2;
   static final int VISION_MODEL_IMAGE_CHANNELS = 3;
-  static final int VISION_MODEL_SEQ_LEN = 768;
-  static final int TEXT_MODEL_SEQ_LEN = 256;
+  static final int VISION_MODEL_SEQ_LEN = 2048;
+  static final int TEXT_MODEL_SEQ_LEN = 768;
 
   // MediaTek
   static final int MEDIATEK_TEXT_MODEL = 3;
@@ -29,6 +29,7 @@ public static int getModelCategory(ModelType modelType, BackendType backendType)
       switch (modelType) {
         case GEMMA_3:
         case LLAVA_1_5:
+        case VOXTRAL:
           return VISION_MODEL;
         case LLAMA_3:
         case QWEN_3:
diff --git a/llm/android/LlamaDemo/app/src/main/res/drawable/baseline_audio_file_48.xml b/llm/android/LlamaDemo/app/src/main/res/drawable/baseline_audio_file_48.xml
@@ -0,0 +1,5 @@
+<vector xmlns:android="http://schemas.android.com/apk/res/android" android:height="48dp" android:tint="#000000" android:viewportHeight="24" android:viewportWidth="24" android:width="48dp">
+      
+    <path android:fillColor="@android:color/white" android:pathData="M14,2H6C4.9,2 4.01,2.9 4.01,4L4,20c0,1.1 0.89,2 1.99,2H18c1.1,0 2,-0.9 2,-2V8L14,2zM16,13h-3v3.75c0,1.24 -1.01,2.25 -2.25,2.25S8.5,17.99 8.5,16.75c0,-1.24 1.01,-2.25 2.25,-2.25c0.46,0 0.89,0.14 1.25,0.38V11h4V13zM13,9V3.5L18.5,9H13z"/>
+    
+</vector>
diff --git a/llm/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/llm/android/LlamaDemo/app/src/main/res/layout/activity_main.xml
@@ -234,6 +234,15 @@
                     android:layout_marginStart="40dp"
                     android:background="@drawable/custom_button_round"
                     android:src="@drawable/outline_image_48" />
+
+                <ImageButton
+                    android:id="@+id/audioButton"
+                    android:layout_width="80dp"
+                    android:layout_height="80dp"
+                    android:layout_marginStart="40dp"
+                    android:background="@drawable/custom_button_round"
+                    android:src="@drawable/baseline_audio_file_48" />
+
             </LinearLayout>
         </LinearLayout>