lm-sys
diff --git a/fastchat/serve/gradio_block_arena_vision.py b/fastchat/serve/gradio_block_arena_vision.py
Lines changed: 94 additions & 53 deletions
diff --git a/fastchat/serve/gradio_block_arena_vision_anony.py b/fastchat/serve/gradio_block_arena_vision_anony.py
Lines changed: 24 additions & 19 deletions
@@ -10,6 +10,7 @@
 import json
 import os
 import time
+from typing import List, Union
 
 import gradio as gr
 from gradio.data_classes import FileData
@@ -27,6 +28,7 @@
 from fastchat.model.model_adapter import (
     get_conversation_template,
 )
+from fastchat.serve.gradio_global_state import Context
 from fastchat.serve.gradio_web_server import (
     get_model_description_md,
     acknowledgment_md,
@@ -153,14 +155,14 @@ def clear_history(request: gr.Request):
     ip = get_ip(request)
     logger.info(f"clear_history. ip: {ip}")
     state = None
-    return (state, [], enable_multimodal_clear_input) + (disable_btn,) * 5
+    return (state, [], enable_multimodal_clear_input, invisible_text, invisible_btn) + (disable_btn,) * 5
 
 
 def clear_history_example(request: gr.Request):
     ip = get_ip(request)
     logger.info(f"clear_history_example. ip: {ip}")
     state = None
-    return (state, [], enable_multimodal_keep_input) + (disable_btn,) * 5
+    return (state, [], enable_multimodal_keep_input, invisible_text, invisible_btn) + (disable_btn,) * 5
 
 
 # TODO(Chris): At some point, we would like this to be a live-reporting feature.
@@ -199,11 +201,16 @@ def add_text(state, model_selector, chat_input, request: gr.Request):
     logger.info(f"add_text. ip: {ip}. len: {len(text)}")
 
     if state is None:
-        state = State(model_selector, is_vision=True)
+        if len(images) == 0:
+            state = State(model_selector, is_vision=False)
+        else:
+            state = State(model_selector, is_vision=True)
 
     if len(text) <= 0:
         state.skip_next = True
-        return (state, state.to_gradio_chatbot(), None) + (no_change_btn,) * 5
+        return (state, state.to_gradio_chatbot(), None, "", no_change_btn) + (
+            no_change_btn,
+        ) * 5
 
     all_conv_text = state.conv.get_prompt()
     all_conv_text = all_conv_text[-2000:] + "\nuser: " + text
@@ -239,19 +246,29 @@ def add_text(state, model_selector, chat_input, request: gr.Request):
     if (len(state.conv.messages) - state.conv.offset) // 2 >= CONVERSATION_TURN_LIMIT:
         logger.info(f"conversation turn limit. ip: {ip}. text: {text}")
         state.skip_next = True
-        return (state, state.to_gradio_chatbot(), {"text": CONVERSATION_LIMIT_MSG}) + (
+        return (
+            state,
+            state.to_gradio_chatbot(),
+            {"text": CONVERSATION_LIMIT_MSG},
+            "",
             no_change_btn,
-        ) * 5
+        ) + (no_change_btn,) * 5
 
     text = text[:INPUT_CHAR_LEN_LIMIT]  # Hard cut-off
     text = _prepare_text_with_image(state, text, images)
     state.conv.append_message(state.conv.roles[0], text)
     state.conv.append_message(state.conv.roles[1], None)
-    return (state, state.to_gradio_chatbot(), None) + (disable_btn,) * 5
+    return (
+        state,
+        state.to_gradio_chatbot(),
+        disable_multimodal,
+        visible_text,
+        enable_btn,
+    ) + (disable_btn,) * 5
 
 
 def build_single_vision_language_model_ui(
-    models, add_promotion_links=False, random_questions=None
+    context: Context, add_promotion_links=False, random_questions=None
 ):
     promotion = (
         f"""
@@ -273,33 +290,29 @@ def build_single_vision_language_model_ui(
 
     state = gr.State()
     gr.Markdown(notice_markdown, elem_id="notice_markdown")
+    text_and_vision_models = list(set(context.text_models + context.vision_models))
+    context_state = gr.State(context)
 
     with gr.Group():
         with gr.Row(elem_id="model_selector_row"):
             model_selector = gr.Dropdown(
-                choices=models,
-                value=models[0] if len(models) > 0 else "",
+                choices=text_and_vision_models,
+                value=text_and_vision_models[0]
+                if len(text_and_vision_models) > 0
+                else "",
                 interactive=True,
                 show_label=False,
                 container=False,
             )
 
         with gr.Accordion(
-            f"🔍 Expand to see the descriptions of {len(models)} models", open=False
+            f"🔍 Expand to see the descriptions of {len(text_and_vision_models)} models",
+            open=False,
         ):
-            model_description_md = get_model_description_md(models)
+            model_description_md = get_model_description_md(text_and_vision_models)
             gr.Markdown(model_description_md, elem_id="model_description_markdown")
 
     with gr.Row():
-        textbox = gr.MultimodalTextbox(
-            file_types=["image"],
-            show_label=False,
-            placeholder="Enter your prompt or add image here",
-            container=True,
-            render=False,
-            elem_id="input_box",
-        )
-
         with gr.Column(scale=2, visible=False) as image_column:
             imagebox = gr.Image(
                 type="pil",
@@ -312,9 +325,24 @@ def build_single_vision_language_model_ui(
             )
 
     with gr.Row():
-        textbox.render()
-        # with gr.Column(scale=1, min_width=50):
-        #     send_btn = gr.Button(value="Send", variant="primary")
+        textbox = gr.Textbox(
+            show_label=False,
+            placeholder="👉 Enter your prompt and press ENTER",
+            elem_id="input_box",
+            visible=False,
+        )
+
+        send_btn = gr.Button(
+            value="Send", variant="primary", scale=0, visible=False, interactive=False
+        )
+
+        multimodal_textbox = gr.MultimodalTextbox(
+            file_types=["image"],
+            show_label=False,
+            placeholder="Enter your prompt or add image here",
+            container=True,
+            elem_id="input_box",
+        )
 
     with gr.Row(elem_id="buttons"):
         if random_questions:
@@ -328,22 +356,6 @@ def build_single_vision_language_model_ui(
         regenerate_btn = gr.Button(value="🔄  Regenerate", interactive=False)
         clear_btn = gr.Button(value="🗑️  Clear", interactive=False)
 
-    cur_dir = os.path.dirname(os.path.abspath(__file__))
-
-    examples = gr.Examples(
-        examples=[
-            {
-                "text": "How can I prepare a delicious meal using these ingredients?",
-                "files": [f"{cur_dir}/example_images/fridge.jpg"],
-            },
-            {
-                "text": "What might the woman on the right be thinking about?",
-                "files": [f"{cur_dir}/example_images/distracted.jpg"],
-            },
-        ],
-        inputs=[textbox],
-    )
-
     with gr.Accordion("Parameters", open=False) as parameter_row:
         temperature = gr.Slider(
             minimum=0.0,
@@ -395,23 +407,50 @@ def build_single_vision_language_model_ui(
         [state, temperature, top_p, max_output_tokens],
         [state, chatbot] + btn_list,
     )
-    clear_btn.click(clear_history, None, [state, chatbot, textbox] + btn_list)
+    clear_btn.click(
+        clear_history,
+        None,
+        [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list,
+    )
 
     model_selector.change(
-        clear_history, None, [state, chatbot, textbox] + btn_list
-    ).then(set_visible_image, [textbox], [image_column])
-    examples.dataset.click(
-        clear_history_example, None, [state, chatbot, textbox] + btn_list
+        clear_history,
+        None,
+        [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list,
+    ).then(set_visible_image, [multimodal_textbox], [image_column])
+
+    multimodal_textbox.input(add_image, [multimodal_textbox], [imagebox]).then(
+        set_visible_image, [multimodal_textbox], [image_column]
+    ).then(
+        clear_history_example,
+        None,
+        [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list,
     )
 
-    textbox.input(add_image, [textbox], [imagebox]).then(
-        set_visible_image, [textbox], [image_column]
-    ).then(clear_history_example, None, [state, chatbot, textbox] + btn_list)
+    multimodal_textbox.submit(
+        add_text,
+        [state, model_selector, multimodal_textbox, context_state],
+        [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list,
+    ).then(set_invisible_image, [], [image_column]).then(
+        bot_response,
+        [state, temperature, top_p, max_output_tokens],
+        [state, chatbot] + btn_list,
+    )
 
     textbox.submit(
         add_text,
-        [state, model_selector, textbox],
-        [state, chatbot, textbox] + btn_list,
+        [state, model_selector, textbox, context_state],
+        [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list,
+    ).then(set_invisible_image, [], [image_column]).then(
+        bot_response,
+        [state, temperature, top_p, max_output_tokens],
+        [state, chatbot] + btn_list,
+    )
+
+    send_btn.click(
+        add_text,
+        [state, model_selector, textbox, context_state],
+        [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list,
     ).then(set_invisible_image, [], [image_column]).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
@@ -422,9 +461,11 @@ def build_single_vision_language_model_ui(
         random_btn.click(
             get_vqa_sample,  # First, get the VQA sample
             [],  # Pass the path to the VQA samples
-            [textbox, imagebox],  # Outputs are textbox and imagebox
-        ).then(set_visible_image, [textbox], [image_column]).then(
-            clear_history_example, None, [state, chatbot, textbox] + btn_list
+            [multimodal_textbox, imagebox],  # Outputs are multimodal_textbox and imagebox
+        ).then(set_visible_image, [multimodal_textbox], [image_column]).then(
+            clear_history_example,
+            None,
+            [state, chatbot, multimodal_textbox, textbox, send_btn] + btn_list,
         )
 
     return [state, model_selector]
@@ -8,6 +8,7 @@
 
 import gradio as gr
 import numpy as np
+from typing import Union
 
 from fastchat.constants import (
     TEXT_MODERATION_MSG,
@@ -48,7 +49,6 @@
     regenerate,
     clear_history,
     share_click,
-    add_text,
     bot_response_multi,
     set_global_vars_anony,
     load_demo_side_by_side_anony,
@@ -75,6 +75,7 @@
     BaseContentModerator,
     AzureAndOpenAIContentModerator,
 )
+from fastchat.serve.gradio_global_state import Context
 from fastchat.serve.remote_logger import get_remote_logger
 from fastchat.utils import (
     build_logger,
@@ -121,16 +122,12 @@ def get_vqa_sample():
     return (res, path)
 
 
-def load_demo_side_by_side_vision_anony(all_text_models, all_vl_models, url_params):
-    global text_models, vl_models
-    text_models = all_text_models
-    vl_models = all_vl_models
-
-    states = (None,) * num_sides
-    selector_updates = (
+def load_demo_side_by_side_vision_anony():
+    states = [None] * num_sides
+    selector_updates = [
         gr.Markdown(visible=True),
         gr.Markdown(visible=True),
-    )
+    ]
 
     return states + selector_updates
 
@@ -256,7 +253,13 @@ def clear_history(request: gr.Request):
 
 
 def add_text(
-    state0, state1, model_selector0, model_selector1, chat_input, request: gr.Request
+    state0,
+    state1,
+    model_selector0,
+    model_selector1,
+    chat_input: Union[str, dict],
+    context: Context,
+    request: gr.Request,
 ):
     if isinstance(chat_input, dict):
         text, images = chat_input["text"], chat_input["files"]
@@ -275,7 +278,7 @@ def add_text(
 
         if len(images) > 0:
             model_left, model_right = get_battle_pair(
-                vl_models,
+                context.all_vision_models,
                 VISION_BATTLE_TARGETS,
                 VISION_OUTAGE_MODELS,
                 VISION_SAMPLING_WEIGHTS,
@@ -287,7 +290,7 @@ def add_text(
             ]
         else:
             model_left, model_right = get_battle_pair(
-                text_models,
+                context.all_text_models,
                 BATTLE_TARGETS,
                 OUTAGE_MODELS,
                 SAMPLING_WEIGHTS,
@@ -408,8 +411,8 @@ def add_text(
     )
 
 
-def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=None):
-    notice_markdown = f"""
+def build_side_by_side_vision_ui_anony(context: Context, random_questions=None):
+    notice_markdown = """
 # ⚔️  LMSYS Chatbot Arena (Multimodal): Benchmarking LLMs and VLMs in the Wild
 [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2403.04132) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) | [Kaggle Competition](https://www.kaggle.com/competitions/lmsys-chatbot-arena)
 
@@ -432,7 +435,9 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=
     chatbots = [None] * num_sides
     show_vote_buttons = gr.State(True)
 
+    context_state = gr.State(context)
     gr.Markdown(notice_markdown, elem_id="notice_markdown")
+    text_and_vision_models = list(set(context.text_models + context.vision_models))
 
     with gr.Row():
         with gr.Column(scale=2, visible=False) as image_column:
@@ -445,11 +450,11 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=
         with gr.Column(scale=5):
             with gr.Group(elem_id="share-region-anony"):
                 with gr.Accordion(
-                    f"🔍 Expand to see the descriptions of {len(text_models) + len(vl_models)} models",
+                    f"🔍 Expand to see the descriptions of {len(text_and_vision_models)} models",
                     open=False,
                 ):
                     model_description_md = get_model_description_md(
-                        text_models + vl_models
+                        text_and_vision_models
                     )
                     gr.Markdown(
                         model_description_md, elem_id="model_description_markdown"
@@ -630,7 +635,7 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=
 
     multimodal_textbox.submit(
         add_text,
-        states + model_selectors + [multimodal_textbox],
+        states + model_selectors + [multimodal_textbox, context_state],
         states
         + chatbots
         + [multimodal_textbox, textbox, send_btn]
@@ -650,7 +655,7 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=
 
     textbox.submit(
         add_text,
-        states + model_selectors + [textbox],
+        states + model_selectors + [textbox, context_state],
         states
         + chatbots
         + [multimodal_textbox, textbox, send_btn]
@@ -670,7 +675,7 @@ def build_side_by_side_vision_ui_anony(text_models, vl_models, random_questions=
 
     send_btn.click(
         add_text,
-        states + model_selectors + [textbox],
+        states + model_selectors + [textbox, context_state],
         states
         + chatbots
         + [multimodal_textbox, textbox, send_btn]