Skip to content

Commit f21c56f

Browse files
committed
improve
1 parent 234e3b0 commit f21c56f

File tree

1 file changed

+65
-51
lines changed

1 file changed

+65
-51
lines changed

fastchat/serve/monitor/clean_chat_data.py

Lines changed: 65 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,13 @@
77
import argparse
88
import json
99
import os
10+
import hashlib
1011
from pytz import timezone
1112
from functools import partial
1213
from datetime import datetime, timedelta
1314
import time
1415
import multiprocessing as mp
1516

16-
from tqdm import tqdm
17-
1817
from fastchat.serve.monitor.basic_stats import NUM_SERVERS
1918
from fastchat.serve.monitor.clean_battle_data import (
2019
to_openai_format,
@@ -64,18 +63,15 @@ def get_action_type_data(filename, action_type):
6463
except FileNotFoundError:
6564
time.sleep(2)
6665

66+
rows = []
6767
for l in lines:
6868
row = json.loads(l)
6969
if row["type"] == action_type:
70-
return row
71-
70+
rows.append(row)
71+
return rows
7272

73-
def process_data(row, action_type, all_ips):
74-
# Initialize local counters
75-
ct_invalid_conv_id = 0
76-
ct_invalid = 0
77-
ct_network_error = 0
7873

74+
def process_data(row, action_type):
7975
try:
8076
if action_type in ["chat", "upvote", "downvote"]:
8177
state = row["state"]
@@ -88,40 +84,64 @@ def process_data(row, action_type, all_ips):
8884
model = row["states"][1]["model_name"]
8985
conversation_id = state["conv_id"]
9086
except KeyError:
91-
ct_invalid_conv_id += 1
92-
return None, ct_invalid_conv_id, ct_invalid, ct_network_error, None
87+
return {
88+
"result": None,
89+
"ct_invalid_conv_id": 1,
90+
"ct_invalid": 0,
91+
"ct_network_error": 0,
92+
"model": None,
93+
}
9394

9495
if conversation_id is None:
95-
ct_invalid_conv_id += 1
96-
return None, ct_invalid_conv_id, ct_invalid, ct_network_error, None
96+
return {
97+
"result": None,
98+
"ct_invalid_conv_id": 1,
99+
"ct_invalid": 0,
100+
"ct_network_error": 0,
101+
"model": None,
102+
}
97103

98104
conversation = to_openai_format(state["messages"][state["offset"] :])
99105
if not isinstance(model, str):
100-
ct_invalid += 1
101-
return None, ct_invalid_conv_id, ct_invalid, ct_network_error, None
106+
return {
107+
"result": None,
108+
"ct_invalid_conv_id": 0,
109+
"ct_invalid": 1,
110+
"ct_network_error": 0,
111+
"model": None,
112+
}
102113
model = replace_model_name(model, row["tstamp"])
103114

104115
try:
105116
lang_code = detect_language(state["messages"][state["offset"]][1])
106117
except IndexError:
107-
ct_invalid += 1
108-
return None, ct_invalid_conv_id, ct_invalid, ct_network_error, None
118+
return {
119+
"result": None,
120+
"ct_invalid_conv_id": 0,
121+
"ct_invalid": 1,
122+
"ct_network_error": 0,
123+
"model": None,
124+
}
109125

110126
if not all(isinstance(x["content"], str) for x in conversation):
111-
ct_invalid += 1
112-
return None, ct_invalid_conv_id, ct_invalid, ct_network_error, None
127+
return {
128+
"result": None,
129+
"ct_invalid_conv_id": 0,
130+
"ct_invalid": 1,
131+
"ct_network_error": 0,
132+
"model": None,
133+
}
113134

114135
messages = "".join([x["content"] for x in conversation]).lower()
115136
if NETWORK_ERROR_MSG in messages:
116-
ct_network_error += 1
117-
return None, ct_invalid_conv_id, ct_invalid, ct_network_error, None
118-
119-
ip = row["ip"]
120-
# Synchronize access to all_ips using the lock
121-
with LOCK:
122-
if ip not in all_ips:
123-
all_ips[ip] = len(all_ips)
124-
user_id = all_ips[ip]
137+
return {
138+
"result": None,
139+
"ct_invalid_conv_id": 0,
140+
"ct_invalid": 0,
141+
"ct_network_error": 1,
142+
"model": None,
143+
}
144+
user_id = hashlib.md5(row["ip"].encode()).hexdigest()
125145

126146
# Prepare the result data
127147
result = dict(
@@ -134,43 +154,37 @@ def process_data(row, action_type, all_ips):
134154
tstamp=row["tstamp"],
135155
)
136156

137-
return result, ct_invalid_conv_id, ct_invalid, ct_network_error, model
157+
return {
158+
"result": result,
159+
"ct_invalid_conv_id": 0,
160+
"ct_invalid": 0,
161+
"ct_network_error": 0,
162+
"model": model,
163+
}
138164

139165

140166
def clean_chat_data(log_files, action_type):
141167
with mp.Pool() as pool:
142168
# Use partial to pass action_type to get_action_type_data
143169
func = partial(get_action_type_data, action_type=action_type)
144-
raw_data = pool.map(func, log_files, chunksize=1)
145-
170+
file_data = pool.map(func, log_files, chunksize=1)
146171
# filter out Nones as some files may not contain any data belonging to action_type
172+
raw_data = []
173+
for data in file_data:
174+
raw_data.extend(data)
147175
raw_data = [r for r in raw_data if r is not None]
148-
all_ips = MANAGER.dict()
149176

150177
# Use the multiprocessing Pool
151178
with mp.Pool() as pool:
152-
func = partial(process_data, action_type=action_type, all_ips=all_ips)
179+
func = partial(process_data, action_type=action_type)
153180
results = pool.map(func, raw_data, chunksize=1)
154181

155-
# Initialize counters and collections in the parent process
156-
ct_invalid_conv_id = 0
157-
ct_invalid = 0
158-
ct_network_error = 0
159-
all_models = set()
160-
chats = []
161-
162182
# Aggregate results from child processes
163-
for res in results:
164-
if res is None:
165-
continue
166-
data, inv_conv_id, inv, net_err, model = res
167-
ct_invalid_conv_id += inv_conv_id
168-
ct_invalid += inv
169-
ct_network_error += net_err
170-
if data:
171-
chats.append(data)
172-
if model:
173-
all_models.add(model)
183+
ct_invalid_conv_id = sum([data["ct_invalid_conv_id"] for data in results])
184+
ct_invalid = sum([data["ct_invalid"] for data in results])
185+
ct_network_error = sum([data["ct_network_error"] for data in results])
186+
all_models = set([data["model"] for data in results if not (data["model"] is None)])
187+
chats = [data["result"] for data in results if not (data["result"] is None)]
174188

175189
chats.sort(key=lambda x: x["tstamp"])
176190
last_updated_tstamp = chats[-1]["tstamp"]

0 commit comments

Comments
 (0)