66import nltk
77from tqdm import tqdm
88from nltk .tokenize import word_tokenize
9-
9+ import pandas as pd
1010
1111def is_code_conversation (text : str ) -> tuple [bool , list [str ]]:
1212 """Check if the text is a code conversation"""
@@ -132,18 +132,26 @@ def check_code_conv(conv) -> tuple[bool, list[str]]:
132132
133133
def check_conv_row(conv_row):
    """Tag one battle row as code-related.

    Runs ``check_code_conv`` on both sides of the battle record
    ("conversation_a" and "conversation_b") and merges the results.

    Parameters
    ----------
    conv_row : dict
        A battle record; must contain "conversation_a" and
        "conversation_b" keys whose values are the conversation
        payloads consumed by ``check_code_conv``.

    Returns
    -------
    tuple[bool, list[str]]
        ``(is_code, tags)`` — ``is_code`` is True when either side is a
        code conversation; ``tags`` is the concatenation of both sides'
        tag lists (side A's tags first).
    """
    check_a, code_a = check_code_conv(conv_row["conversation_a"])
    check_b, code_b = check_code_conv(conv_row["conversation_b"])
    # A row counts as a code battle if EITHER model's side does.
    return check_a or check_b, code_a + code_b
140-
141- def process_battle_file (battle_file_path : str , n_cpus : int ):
142- with open (battle_file_path , "r" ) as f :
143- data = json .load (f )
141+ def process_battle_file (battle_file_path : str , n_cpus : int , direct_chat : bool = False ):
142+ # with open(battle_file_path, "r") as f:
143+ # data = json.load(f)
144+ # data = pd.read_json(battle_file_path).to_dict("records")
145+ data = pd .read_json (battle_file_path , lines = True ).to_dict ("records" )
146+ print (data [0 ])
144147
145148 with mp .Pool (n_cpus ) as pool :
146- tagged_data = list (tqdm (pool .imap (check_conv_row , data ), total = len (data )))
149+ tagged_data = list (
150+ tqdm (
151+ pool .imap (check_conv_row , data ),
152+ total = len (data ),
153+ )
154+ )
147155
148156 output_data = [row for row , (is_code , _ ) in zip (data , tagged_data ) if is_code ]
149157
@@ -154,6 +162,7 @@ def process_battle_file(battle_file_path: str, n_cpus: int):
154162 parser = argparse .ArgumentParser ()
155163 parser .add_argument ("--clean-battle-file" , type = str )
156164 parser .add_argument ("--output-clean-battle-file" , type = str , default = None )
165+ parser .add_argument ("--direct-chat" , action = "store_true" )
157166 parser .add_argument ("--n-cpus" , type = int , default = - 1 )
158167
159168 args = parser .parse_args ()
0 commit comments