
Commit 807b66f

Merge: 2 parents c90b8fc + 853168f

File tree

  fastchat/conversation.py
  fastchat/serve/api_provider.py
  fastchat/serve/monitor/elo_analysis.py
  fastchat/serve/monitor/monitor_md.py
  fastchat/serve/vision/create_vqa_examples_dir.py
  fastchat/serve/vision/create_vqa_examples_json.py

6 files changed: +123 −22 lines changed

fastchat/conversation.py

Lines changed: 25 additions & 0 deletions

@@ -576,6 +576,31 @@ def to_reka_api_messages(self):
 
         return ret
 
+    def to_metagen_api_messages(self):
+        """Convert the conversation to MetaGen (Meta) chat completion format."""
+        if self.system_message == "":
+            ret = []
+        else:
+            ret = [{"role": "system", "text": self.system_message}]
+
+        for i, (_, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    text, images = msg[0], msg[1]
+                    # Currently only support one image.
+                    attachment = {
+                        "type": "base64_image",
+                        "mime": "image/jpeg",
+                        "data": images[-1].base64_str,
+                    }
+                    ret.append({"role": "user", "text": text, "attachment": attachment})
+                else:
+                    ret.append({"role": "user", "text": msg})
+            else:
+                if msg is not None:
+                    ret.append({"role": "ai", "text": msg})
+        return ret
+
     def save_new_images(self, has_csam_images=False, use_remote_storage=False):
         import hashlib
         from fastchat.constants import LOGDIR
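
Note on the new converter: the MetaGen schema used above keys message content with "text" (not "content") and labels assistant turns "ai" (not "assistant"). A minimal sketch of what to_metagen_api_messages would emit for a hypothetical one-image conversation (sample contents are made up; roles and keys follow the diff):

    # Hypothetical output: system prompt, one user turn with an image, one reply.
    [
        {"role": "system", "text": "You are a helpful assistant."},
        {
            "role": "user",
            "text": "What is in this photo?",
            "attachment": {
                "type": "base64_image",
                "mime": "image/jpeg",
                "data": "<base64-encoded JPEG>",  # only the last image is sent
            },
        },
        {"role": "ai", "text": "A cat on a windowsill."},
    ]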

fastchat/serve/api_provider.py

Lines changed: 69 additions & 7 deletions

@@ -203,6 +203,17 @@ def get_api_provider_stream_iter(
             api_base=model_api_dict["api_base"],
             api_key=model_api_dict["api_key"],
         )
+    elif model_api_dict["api_type"] == "metagen":
+        prompt = conv.to_metagen_api_messages()
+        stream_iter = metagen_api_stream_iter(
+            model_api_dict["model_name"],
+            prompt,
+            temperature,
+            top_p,
+            max_new_tokens,
+            api_base=model_api_dict["api_base"],
+            api_key=model_api_dict["api_key"],
+        )
     else:
         raise NotImplementedError()
 
@@ -1115,11 +1126,62 @@ def reka_api_stream_iter(
         model=model_name,
     )
 
-    for chunk in response:
-        try:
-            yield {"text": chunk.responses[0].chunk.content, "error_code": 0}
-        except:
-            yield {
-                "text": f"**API REQUEST ERROR** ",
-                "error_code": 1,
+    if response.status_code != 200:
+        error_message = response.text
+        logger.error(f"==== error from reka api: {error_message} ====")
+        yield {
+            "text": f"**API REQUEST ERROR** Reason: {error_message}",
+            "error_code": 1,
+        }
+        return
+
+    for line in response.iter_lines():
+        line = line.decode("utf8")
+        if not line.startswith("data: "):
+            continue
+        gen = json.loads(line[6:])
+        yield {"text": gen["text"], "error_code": 0}
+
+
+def metagen_api_stream_iter(
+    model_name,
+    messages,
+    temperature,
+    top_p,
+    max_new_tokens,
+    api_key,
+    api_base,
+):
+    res = requests.post(
+        f"{api_base}/chat_stream_completions?access_token={api_key}",
+        stream=True,
+        headers={"Content-Type": "application/json"},
+        json={
+            "model": model_name,
+            "chunks_delimited": True,
+            "messages": messages,
+            "options": {
+                "max_tokens": max_new_tokens,
+                "generation_algorithm": "top_p",
+                "top_p": top_p,
+                "temperature": temperature,
+            },
+        },
+        timeout=40,
+    )
+
+    if res.status_code != 200:
+        logger.error(f"Unexpected response ({res.status_code}): {res.text}")
+        raise ValueError("Unexpected response: ", res.json())
+
+    text = ""
+    for line in res.iter_lines():
+        if line:
+            part = json.loads(line.decode("utf-8"))
+            if "text" in part:
+                text += part["text"]
+            data = {
+                "text": text,
+                "error_code": 0,
            }
+            yield data
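
Unlike the SSE-style "data: " lines the Reka branch parses, the MetaGen endpoint streams newline-delimited JSON, and each yielded dict carries the full text accumulated so far. A self-contained sketch of that parsing loop run against a simulated stream (the payloads are made up; only the accumulation logic mirrors the loop above, assuming each chunk's "text" is a delta):

    import json

    # Simulated res.iter_lines() output: JSON chunks plus an empty
    # keep-alive line that the `if line:` guard skips.
    raw_lines = [b'{"text": "Hello"}', b"", b'{"text": ", world"}']

    text = ""
    for line in raw_lines:
        if line:
            part = json.loads(line.decode("utf-8"))
            if "text" in part:
                text += part["text"]
            print({"text": text, "error_code": 0})
    # -> {'text': 'Hello', 'error_code': 0}
    # -> {'text': 'Hello, world', 'error_code': 0}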

fastchat/serve/monitor/elo_analysis.py

Lines changed: 19 additions & 4 deletions

@@ -495,18 +495,33 @@ def construct_style_matrices(
     return X, Y, models
 
 
-def get_bootstrap_result_style_control(X, Y, models, func_compute_elo, num_round=1000):
+def get_bootstrap_result_style_control(
+    X, Y, battles, models, func_compute_elo, num_round=1000
+):
     elos = []
     coefs = []
     assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0]
     k = int(
         X.shape[0] / 2
     )  # Since we duplicate the battles when constructing X and Y, we don't want to sample the duplicates
 
+    battles_tie_idx = (battles["winner"] == "tie") | (
+        battles["winner"] == "tie (bothbad)"
+    )
     for _ in tqdm(range(num_round), desc="bootstrap"):
         indices = np.random.choice(list(range(k)), size=(k), replace=True)
-        _X = np.concatenate([X[indices], X[indices]])
-        _Y = np.concatenate([Y[indices], Y[indices]])
+
+        index2tie = np.zeros(k, dtype=bool)
+        index2tie[battles_tie_idx] = True
+
+        nontie_indices = indices[~index2tie[indices]]
+        tie_indices = np.concatenate(
+            [indices[index2tie[indices]], indices[index2tie[indices]] + k]
+        )
+
+        _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]])
+        _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]])
+
        assert _X.shape == X.shape and _Y.shape == Y.shape
 
        states = ~_X[:, : len(models)].any(axis=0)

@@ -585,7 +600,7 @@ def report_elo_analysis_results(
     if style_control:
         X, Y, models = construct_style_matrices(battles)
         bootstrap_df, boostrap_coef = get_bootstrap_result_style_control(
-            X, Y, models, fit_mle_elo, num_round=num_bootstrap
+            X, Y, battles, models, fit_mle_elo, num_round=num_bootstrap
         )
         elo_rating_final, coef_final = fit_mle_elo(X, Y, models)
     else:
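
Why the extra index bookkeeping: construct_style_matrices stores every battle twice (row i and row i + k), and for tie battles the two copies differ, since a tie is presumably encoded as one win-labeled and one loss-labeled row. The old bootstrap stacked two copies of row i for every sampled battle, which is only right for non-ties; the new code resamples the first k rows, duplicates non-tie rows, and for ties picks up both stored rows i and i + k. A toy numpy sketch of the index arithmetic (sizes and tie mask are hypothetical):

    import numpy as np

    k = 4  # unique battles; X has 2 * k rows (each battle duplicated)
    index2tie = np.array([False, True, False, True])  # hypothetical tie mask
    indices = np.random.choice(np.arange(k), size=k, replace=True)

    nontie_indices = indices[~index2tie[indices]]
    # Ties pull in both stored orientations: row i and row i + k.
    tie_indices = np.concatenate(
        [indices[index2tie[indices]], indices[index2tie[indices]] + k]
    )

    rows = np.concatenate([nontie_indices, nontie_indices, tie_indices])
    assert rows.shape[0] == 2 * k  # the resampled matrix keeps X's shape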

fastchat/serve/monitor/monitor_md.py

Lines changed: 4 additions & 0 deletions

@@ -6,13 +6,15 @@
 
 key_to_category_name = {
     "full": "Overall",
+    "full_style_control": "Overall w/ Style Control",
     "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
     "math": "Math",
     "if": "Instruction Following",
     "multiturn": "Multi-Turn",
     "coding": "Coding",
     "hard_6": "Hard Prompts (Overall)",
     "hard_english_6": "Hard Prompts (English)",
+    "hard_6_style_control": "Hard Prompts (Overall) w/ Style Control",
     "long_user": "Longer Query",
     "english": "English",
     "chinese": "Chinese",
@@ -30,12 +32,14 @@
 }
 cat_name_to_explanation = {
     "Overall": "Overall Questions",
+    "Overall w/ Style Control": "Overall with Style Control",
     "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
     "Math": "Math",
     "Instruction Following": "Instruction Following",
     "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
     "Coding": "Coding: whether conversation contains code snippets",
     "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
+    "Hard Prompts (Overall) w/ Style Control": "Hard Prompts (Overall) with Style Control",
     "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
     "Longer Query": "Longer Query (>= 500 tokens)",
     "English": "English Prompts",

fastchat/serve/vision/create_vqa_examples_dir.py

Lines changed: 0 additions & 8 deletions

@@ -64,14 +64,6 @@ def download_images_and_create_json(
 args = parser.parse_args()
 
 datasets_info = {
-    "realworldqa": {
-        "path": "visheratin/realworldqa",
-        "image_key": "image",
-        "question_key": "question",
-        "id_key": "index",
-        "subset": False,
-        "split": "test",
-    },
     "Memes": {
         "path": "not-lain/meme-dataset",
         "image_key": "image",

fastchat/serve/vision/create_vqa_examples_json.py

Lines changed: 6 additions & 3 deletions

@@ -17,19 +17,22 @@
 args = parser.parse_args()
 
 dataset_prop = {
-    "realworldqa": 500,
     "Memes": 500,
     "Floorplan": 500,
     "Website": 500,
-    "IllusionVQA": 500,
+    "IllusionVQA": 435,
     "NewYorker": 500,
 }
 
 dataset_json = []
 for dataset_name in dataset_prop.keys():
     with open(f"{args.output_dir}/{dataset_name}/data.json") as f:
         data = json.load(f)
-    dataset_json.extend(np.random.choice(data, dataset_prop[dataset_name]))
+    dataset_json.extend(
+        np.random.choice(
+            data, min(dataset_prop[dataset_name], len(data)), replace=False
+        )
+    )
 
 with open(f"{args.output_dir}/metadata_sampled.json", "w") as f:
     json.dump(dataset_json, f, indent=4)
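
Two behavior changes hide in that one call: np.random.choice defaults to replace=True, so the old call could sample the same example twice, and replace=False alone raises once a dataset holds fewer items than requested, which the min() guard prevents (IllusionVQA's quota drops to 435 accordingly). A quick illustration with toy data (integers stand in for the example dicts):

    import numpy as np

    data = list(range(435))  # e.g. a dataset with only 435 examples
    old = np.random.choice(data, 500)  # old call: replacement, duplicates possible
    # np.random.choice(data, 500, replace=False)  # would raise ValueError
    new = np.random.choice(data, min(500, len(data)), replace=False)  # new call
    assert len(new) == len(set(new)) == 435  # unique, capped at dataset size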
