@@ -10,6 +10,8 @@
 import os
 from pytz import timezone
 import time
+import pandas as pd
+import tiktoken
 
 from tqdm import tqdm
 
@@ -26,16 +28,20 @@
 )
 
 
-def get_log_files(max_num_files=None):
+def get_log_files(max_num_files=None, is_vision=False):
     dates = []
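+    # day runs to 32 on purpose: nonexistent dates (e.g. Feb 30) are
+    # filtered out by the os.path.exists() check below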
-    for month in range(4, 12):
-        for day in range(1, 33):
-            dates.append(f"2023-{month:02d}-{day:02d}")
+    for year in range(2023, 2025):
+        for month in range(1, 13):
+            for day in range(1, 33):
+                dates.append(f"{year}-{month:02d}-{day:02d}")
 
     filenames = []
     for d in dates:
         for i in range(NUM_SERVERS):
-            name = os.path.expanduser(f"~/fastchat_logs/server{i}/{d}-conv.json")
+            prefix = ""
+            if is_vision:
+                prefix = "vision-tmp-"
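+            # vision-arena logs are stored as vision-tmp-{date}-conv.json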
+            name = os.path.expanduser(f"~/fastchat_logs/server{i}/{prefix}{d}-conv.json")
             if os.path.exists(name):
                 filenames.append(name)
     max_num_files = max_num_files or len(filenames)
@@ -44,7 +50,8 @@ def get_log_files(max_num_files=None):
     return filenames
 
 
-def clean_chat_data(log_files, action_type):
+def clean_chat_data(log_files, action_type, remove_prompt=False):
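+    # tokenizer used below to annotate every message with its token count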
+    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
     raw_data = []
     for filename in tqdm(log_files, desc="read files"):
         for retry in range(5):
@@ -65,11 +72,15 @@ def clean_chat_data(log_files, action_type):
     ct_invalid_conv_id = 0
     ct_invalid = 0
     ct_network_error = 0
+    ct_img_chat = 0
+    ct_csam = 0
     for row in raw_data:
         try:
             if action_type in ["chat", "upvote", "downvote"]:
                 state = row["state"]
                 model = row["model"]
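+                # count conversations whose state was flagged upstream for CSAM images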
+                if state.get("has_csam_image", False):
+                    ct_csam += 1
             elif action_type == "leftvote":
                 state = row["states"][0]
                 model = row["states"][0]["model_name"]
@@ -92,17 +103,31 @@ def clean_chat_data(log_files, action_type):
             model = replace_model_name(model, row["tstamp"])
 
             try:
-                lang_code = detect_language(state["messages"][state["offset"]][1])
-            except IndexError:
+                msg = state["messages"][state["offset"]][1]
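+                # vision chats store the message as a list; the text comes first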
+                if isinstance(msg, list):
+                    ct_img_chat += 1
+                    msg = msg[0]
+                lang_code = detect_language(msg)
+                if not all(isinstance(x["content"][0], str) for x in conversation):
+                    ct_invalid += 1
+                    continue
+            except (IndexError, TypeError):
                 ct_invalid += 1
                 continue
 
-            if not all(isinstance(x["content"], str) for x in conversation):
-                ct_invalid += 1
-                continue
+            # annotate each message with its token count; also build one
+            # lowercase blob of all message text for the network-error check
+            messages_concat = ""
+            for x in conversation:
+                msg = x["content"]
+                if isinstance(x["content"], list):
+                    msg = x["content"][0]
+                x["num_tokens"] = len(
+                    encoding.encode(msg, allowed_special="all")
+                )
+                messages_concat += msg.lower()
 
-            messages = "".join([x["content"] for x in conversation]).lower()
-            if NETWORK_ERROR_MSG in messages:
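+                # optionally drop the raw text, keeping only metadata such as num_tokens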
+                if remove_prompt:
+                    x.pop("content")
+            if NETWORK_ERROR_MSG in messages_concat:
                 ct_network_error += 1
                 continue
 
@@ -141,10 +166,10 @@ def clean_chat_data(log_files, action_type):
         dedup_chats.append(chats[i])
 
     print(
-        f"#raw: {len(raw_data)}, #chat: {len(chats)}, #dedup_chat: {len(dedup_chats)}"
+        f"#raw: {len(raw_data)}, #chat: {len(chats)}, #dedup_chat: {len(dedup_chats)}, #csam: {ct_csam}"
     )
     print(
-        f"#invalid_conv_id: {ct_invalid_conv_id}, #network_error: {ct_network_error}, #invalid: {ct_invalid}"
+        f"#invalid_conv_id: {ct_invalid_conv_id}, #network_error: {ct_network_error}, #invalid: {ct_invalid}, #img-chat: {ct_img_chat}"
     )
     print(f"#models: {len(all_models)}, {all_models}")
     print(f"last-updated: {last_updated_datetime}")
@@ -156,16 +181,30 @@ def clean_chat_data(log_files, action_type):
     parser = argparse.ArgumentParser()
     parser.add_argument("--action-type", type=str, default="chat")
     parser.add_argument("--max-num-files", type=int)
+    parser.add_argument("--vision", action="store_true")
+    parser.add_argument("--start-time", type=str)  # example: 2024-08-01
+    parser.add_argument("--end-time", type=str)  # example: 2024-08-01
+    parser.add_argument("--remove-prompt", action="store_true")
     args = parser.parse_args()
 
-    log_files = get_log_files(args.max_num_files)
-    chats = clean_chat_data(log_files, args.action_type)
-    last_updated_tstamp = chats[-1]["tstamp"]
+    log_files = get_log_files(args.max_num_files, args.vision)
+    chats = clean_chat_data(log_files, args.action_type, args.remove_prompt)
+    print(f"#chats before time filter: {len(chats)}")
+    # convert to a dataframe for time filtering and JSON export
+    chats = pd.DataFrame(chats)
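+    # time bounds are inclusive of --start-time and exclusive of --end-time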
+    if args.start_time is not None:
+        chats = chats[pd.to_datetime(chats["tstamp"], unit="s") >= pd.to_datetime(args.start_time)]
+    if args.end_time is not None:
+        chats = chats[pd.to_datetime(chats["tstamp"], unit="s") < pd.to_datetime(args.end_time)]
+    print(f"#chats after time filter: {len(chats)}")
+
+    last_updated_tstamp = chats.iloc[-1]["tstamp"]
     cutoff_date = datetime.datetime.fromtimestamp(
         last_updated_tstamp, tz=timezone("US/Pacific")
     ).strftime("%Y%m%d")
 
     output = f"clean_{args.action_type}_conv_{cutoff_date}.json"
-    with open(output, "w") as fout:
-        json.dump(chats, fout, indent=2, ensure_ascii=False)
+    chats.to_json(output, orient="records", indent=2, force_ascii=False)
     print(f"Write cleaned data to {output}")
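
For reference, a minimal usage sketch of the updated functions (the import path is an assumption for illustration; only get_log_files and clean_chat_data are defined in this file):

    # hypothetical driver; assumes clean_chat_data.py is on the import path
    # and ~/fastchat_logs/server*/ holds the JSON logs
    from clean_chat_data import get_log_files, clean_chat_data

    log_files = get_log_files(max_num_files=10, is_vision=True)  # matches vision-tmp-*-conv.json
    chats = clean_chat_data(log_files, "chat", remove_prompt=True)

From the command line, the same options are exposed via the new --vision, --remove-prompt, --start-time, and --end-time flags.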