77import argparse
88import json
99import os
10+ import hashlib
1011from pytz import timezone
1112from functools import partial
1213from datetime import datetime , timedelta
1314import time
1415import multiprocessing as mp
1516
16- from tqdm import tqdm
17-
1817from fastchat .serve .monitor .basic_stats import NUM_SERVERS
1918from fastchat .serve .monitor .clean_battle_data import (
2019 to_openai_format ,
@@ -64,18 +63,15 @@ def get_action_type_data(filename, action_type):
6463 except FileNotFoundError :
6564 time .sleep (2 )
6665
66+ rows = []
6767 for l in lines :
6868 row = json .loads (l )
6969 if row ["type" ] == action_type :
70- return row
71-
70+ rows . append ( row )
71+ return rows
7272
73- def process_data (row , action_type , all_ips ):
74- # Initialize local counters
75- ct_invalid_conv_id = 0
76- ct_invalid = 0
77- ct_network_error = 0
7873
74+ def process_data (row , action_type ):
7975 try :
8076 if action_type in ["chat" , "upvote" , "downvote" ]:
8177 state = row ["state" ]
@@ -88,40 +84,64 @@ def process_data(row, action_type, all_ips):
8884 model = row ["states" ][1 ]["model_name" ]
8985 conversation_id = state ["conv_id" ]
9086 except KeyError :
91- ct_invalid_conv_id += 1
92- return None , ct_invalid_conv_id , ct_invalid , ct_network_error , None
87+ return {
88+ "result" : None ,
89+ "ct_invalid_conv_id" : 1 ,
90+ "ct_invalid" : 0 ,
91+ "ct_network_error" : 0 ,
92+ "model" : None ,
93+ }
9394
9495 if conversation_id is None :
95- ct_invalid_conv_id += 1
96- return None , ct_invalid_conv_id , ct_invalid , ct_network_error , None
96+ return {
97+ "result" : None ,
98+ "ct_invalid_conv_id" : 1 ,
99+ "ct_invalid" : 0 ,
100+ "ct_network_error" : 0 ,
101+ "model" : None ,
102+ }
97103
98104 conversation = to_openai_format (state ["messages" ][state ["offset" ] :])
99105 if not isinstance (model , str ):
100- ct_invalid += 1
101- return None , ct_invalid_conv_id , ct_invalid , ct_network_error , None
106+ return {
107+ "result" : None ,
108+ "ct_invalid_conv_id" : 0 ,
109+ "ct_invalid" : 1 ,
110+ "ct_network_error" : 0 ,
111+ "model" : None ,
112+ }
102113 model = replace_model_name (model , row ["tstamp" ])
103114
104115 try :
105116 lang_code = detect_language (state ["messages" ][state ["offset" ]][1 ])
106117 except IndexError :
107- ct_invalid += 1
108- return None , ct_invalid_conv_id , ct_invalid , ct_network_error , None
118+ return {
119+ "result" : None ,
120+ "ct_invalid_conv_id" : 0 ,
121+ "ct_invalid" : 1 ,
122+ "ct_network_error" : 0 ,
123+ "model" : None ,
124+ }
109125
110126 if not all (isinstance (x ["content" ], str ) for x in conversation ):
111- ct_invalid += 1
112- return None , ct_invalid_conv_id , ct_invalid , ct_network_error , None
127+ return {
128+ "result" : None ,
129+ "ct_invalid_conv_id" : 0 ,
130+ "ct_invalid" : 1 ,
131+ "ct_network_error" : 0 ,
132+ "model" : None ,
133+ }
113134
114135 messages = "" .join ([x ["content" ] for x in conversation ]).lower ()
115136 if NETWORK_ERROR_MSG in messages :
116- ct_network_error += 1
117- return None , ct_invalid_conv_id , ct_invalid , ct_network_error , None
118-
119- ip = row ["ip" ]
120- # Synchronize access to all_ips using the lock
121- with LOCK :
122- if ip not in all_ips :
123- all_ips [ip ] = len (all_ips )
124- user_id = all_ips [ip ]
137+ return {
138+ "result" : None ,
139+ "ct_invalid_conv_id" : 0 ,
140+ "ct_invalid" : 0 ,
141+ "ct_network_error" : 1 ,
142+ "model" : None ,
143+ }
144+ user_id = hashlib .md5 (row ["ip" ].encode ()).hexdigest ()
125145
126146 # Prepare the result data
127147 result = dict (
@@ -134,43 +154,37 @@ def process_data(row, action_type, all_ips):
134154 tstamp = row ["tstamp" ],
135155 )
136156
137- return result , ct_invalid_conv_id , ct_invalid , ct_network_error , model
157+ return {
158+ "result" : result ,
159+ "ct_invalid_conv_id" : 0 ,
160+ "ct_invalid" : 0 ,
161+ "ct_network_error" : 0 ,
162+ "model" : model ,
163+ }
138164
139165
140166def clean_chat_data (log_files , action_type ):
141167 with mp .Pool () as pool :
142168 # Use partial to pass action_type to get_action_type_data
143169 func = partial (get_action_type_data , action_type = action_type )
144- raw_data = pool .map (func , log_files , chunksize = 1 )
145-
170+ file_data = pool .map (func , log_files , chunksize = 1 )
146171 # filter out Nones as some files may not contain any data belonging to action_type
172+ raw_data = []
173+ for data in file_data :
174+ raw_data .extend (data )
147175 raw_data = [r for r in raw_data if r is not None ]
148- all_ips = MANAGER .dict ()
149176
150177 # Use the multiprocessing Pool
151178 with mp .Pool () as pool :
152- func = partial (process_data , action_type = action_type , all_ips = all_ips )
179+ func = partial (process_data , action_type = action_type )
153180 results = pool .map (func , raw_data , chunksize = 1 )
154181
155- # Initialize counters and collections in the parent process
156- ct_invalid_conv_id = 0
157- ct_invalid = 0
158- ct_network_error = 0
159- all_models = set ()
160- chats = []
161-
162182 # Aggregate results from child processes
163- for res in results :
164- if res is None :
165- continue
166- data , inv_conv_id , inv , net_err , model = res
167- ct_invalid_conv_id += inv_conv_id
168- ct_invalid += inv
169- ct_network_error += net_err
170- if data :
171- chats .append (data )
172- if model :
173- all_models .add (model )
183+ ct_invalid_conv_id = sum ([data ["ct_invalid_conv_id" ] for data in results ])
184+ ct_invalid = sum ([data ["ct_invalid" ] for data in results ])
185+ ct_network_error = sum ([data ["ct_network_error" ] for data in results ])
186+ all_models = set ([data ["model" ] for data in results if not (data ["model" ] is None )])
187+ chats = [data ["result" ] for data in results if not (data ["result" ] is None )]
174188
175189 chats .sort (key = lambda x : x ["tstamp" ])
176190 last_updated_tstamp = chats [- 1 ]["tstamp" ]
0 commit comments