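"""Extract voting-time records from FastChat server logs.

For every pairwise vote (leftvote, rightvote, tievote, bothbad_vote), look up
the preceding "chat" records for both sides and write one JSON line combining
the vote with each side's timestamps, so voting time can be analyzed later.
"""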
import datetime
import glob
import json
import traceback
from collections import deque

import tqdm


def _serialize_json(data):
    # Serialize JSON with sorted keys and no whitespace so that identical
    # conversation states always produce the same key.
    return json.dumps(data, sort_keys=True, separators=(",", ":")).encode("utf-8")


types = {
    "share",
    "chat",
    "flag",
    "bothbad_vote",
    "downvote",
    "leftvote",
    "rightvote",
    "upvote",
    "tievote",
}
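# Only "chat" and the four pairwise vote types are used below; the remaining
# types pass the assertion in process_record but are otherwise ignored.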

# Maps serialized chat state -> metadata of the latest "chat" record, so a
# later vote can be joined back to the conversation that produced it.
chat_dict = {}
# FIFO of {"key", "timestamp"} entries used to evict the oldest chat_dict entries.
cache_queue = deque()


def process_record(r):
    ip = r.pop("ip", None)
    tstamp = r.pop("tstamp")
    mtype = r.pop("type")
    start = r.pop("start", None)
    finish = r.pop("finish", None)

    # Garbage collect the oldest cache entries to bound memory usage.
    while len(cache_queue) > 100000:
        outdated = cache_queue.popleft()
        popped_item = chat_dict.pop(outdated["key"], None)
        if popped_item is None:
            # TODO: this occasionally happens; in theory chat_dict should stay
            # in sync with the queue unless duplicate keys were enqueued.
            # Needs investigation.
            print("Error: Key to GC does not exist.")

    assert mtype in types
    if mtype == "chat":
        key = _serialize_json(r["state"])
        # TODO: also record the string length of the last reply so voting time
        # per character can be analyzed.
        chat_dict[key] = {
            "timestamp": tstamp,
            "start": start,
            "finish": finish,
            "conv_id": r["state"]["conv_id"],
        }
        cache_queue.append({"key": key, "timestamp": tstamp})
    elif mtype in ("leftvote", "rightvote", "bothbad_vote", "tievote"):
        left_key = _serialize_json(r["states"][0])
        right_key = _serialize_json(r["states"][1])
        if left_key not in chat_dict:
            # TODO: this occasionally happens: we see the vote but cannot find
            # the preceding chat record. Needs investigation.
            print(
                f'WARNING: Cannot find vote context for conversation {r["states"][0]["conv_id"]}'
            )
            return None
        if right_key not in chat_dict:
            print(
                f'WARNING: Cannot find vote context for conversation {r["states"][1]["conv_id"]}'
            )
            return None
        vote_time_data = {
            "timestamp": tstamp,
            "type": mtype,
            "left": chat_dict[left_key],
            "right": chat_dict[right_key],
            "ip": ip,
        }
        return vote_time_data

    return None


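# Each record returned by process_record (and written out below) has the form
# (values are made up for illustration):
#   {"timestamp": <vote tstamp>, "type": "leftvote", "ip": "...",
#    "left":  {"timestamp": ..., "start": ..., "finish": ..., "conv_id": "..."},
#    "right": {"timestamp": ..., "start": ..., "finish": ..., "conv_id": "..."}}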
def process_file(infile: str, outfile: str):
    with open(infile) as f:
        records = []
        for line in f:
            line = line.strip()
            if line:
                try:
                    r = json.loads(line)
                    if r.get("tstamp") is not None:
                        records.append(r)
                except Exception:
                    # Skip lines that are not valid JSON.
                    pass
    # Sort the records in case they were logged out of order.
    records.sort(key=lambda x: x["tstamp"])

    with open(outfile, "a") as out:
        for r in records:
            try:
                output = process_record(r)
                if output is not None:
                    out.write(json.dumps(output) + "\n")
            except Exception as e:
                print("Error:", e)
                traceback.print_exc()


today = datetime.date.today().isoformat()
# Sort so that each server's logs are processed in chronological order.
filelist = sorted(glob.glob("/mnt/disks/data/fastchat_logs/server*/202*-*-*-conv.json"))
# Skip today's logs because the current day may still be incomplete.
filelist = [f for f in filelist if today not in f]

# TODO: make the date range below configurable.
filelist = [f for f in filelist if "2024-03-" in f]

for f in tqdm.tqdm(filelist):
    process_file(f, "output.jsonl")
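

# A minimal sketch (not called anywhere above) of how the emitted records could
# be consumed, assuming tstamp/start/finish are Unix timestamps in seconds; the
# function name is illustrative, not part of the original script.
def vote_latency_seconds(path="output.jsonl"):
    """Yield seconds between the end of generation and the corresponding vote."""
    with open(path) as f:
        for line in f:
            rec = json.loads(line)
            finishes = [rec["left"].get("finish"), rec["right"].get("finish")]
            finishes = [t for t in finishes if t is not None]
            if finishes:
                yield rec["timestamp"] - max(finishes)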