@@ -87,33 +87,45 @@ def process_request(self, request, max_model_len=None, **kwargs):
             bool: Whether preprocessing is successful
             str: error message
         """
+        data_processor_logger.info(f"Start processing request: {request}")
         request.chat_template = kwargs.get("chat_template")
         request = self._apply_default_parameters(request)
         if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
             request.eos_token_ids = self.eos_token_ids
+
+        # processing stop_sequences
         stop_sequences = request.get("stop", [])
         if stop_sequences is not None and len(stop_sequences) != 0:
             stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
             request.set("stop_token_ids", stop_seqs)
             request.set("stop_seqs_len", stop_seqs_len)
 
+        # processing bad_words
         bad_words = request.get("bad_words")
         bad_words_token_ids = request.get("bad_words_token_ids")
         if bad_words:
             bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
             request["bad_words_token_ids"] = bad_words_token_ids
 
+        # processing prompt_token_ids
         if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
-            if request.prompt is None and request.messages is None:
-                raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
             if request.prompt is not None:
-                prompt = request.prompt if request.prompt is not None else request.messages[0]
-                prompt = prompt[0] if isinstance(prompt, list) else prompt
-                tokens = self.tokenizer.tokenize(prompt)
-                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-                request.prompt_token_ids = token_ids
-                data_processor_logger.info(f"req_id:{request.request_id}, tokens:{tokens}, token_ids: {token_ids}")
-            else:
+                # prompt = request.prompt if request.prompt is not None else request.messages[0]
+                prompt = request.prompt
+                assert isinstance(prompt, str) or (
+                    isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
+                ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
+
+                if isinstance(prompt, list):  # if prompt is a token id list
+                    request.prompt_token_ids = prompt
+                else:
+                    tokens = self.tokenizer.tokenize(prompt)
+                    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+                    request.prompt_token_ids = token_ids
+                    data_processor_logger.debug(
+                        f"request_ids: {request.request_id}, prompt: {prompt}, tokens: {tokens}, token_ids: {token_ids}"
+                    )
+            elif request.messages is not None:
                 task = request.to_dict()
                 chat_template_kwargs = kwargs.get("chat_template_kwargs")
                 if chat_template_kwargs:
@@ -124,24 +136,26 @@ def process_request(self, request, max_model_len=None, **kwargs):
                 else:
                     raise ValueError("Invalid input: chat_template_kwargs must be a dict")
                 request.prompt_token_ids = self.messages2ids(task)
+            else:
+                raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
 
         if len(request.prompt_token_ids) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
+
+        # truncate prompts that exceed the length limit
         if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
             request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
         if request.get("max_tokens") is None:
-            request.set(
-                "max_tokens",
-                max(1, max_model_len - len(request.prompt_token_ids)),
-            )
+            request.set("max_tokens", max(1, max_model_len - len(request.prompt_token_ids)))
         if request.get("temperature") < _SAMPLING_EPS:
             # zero temperature is equivalent to greedy sampling
             request.set("temperature", 1)
         if request.get("top_p") < _SAMPLING_EPS:
             request.set("top_p", _SAMPLING_EPS)
         if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
             request.enable_thinking = True
-        data_processor_logger.info(f"Processed request {request}")
+
+        data_processor_logger.info(f"Processed request: {request}")
         return request
 
     def process_request_dict(self, request, max_model_len=None):
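
Reviewer note: the two hunks above change `prompt` handling in `process_request` so that it may now be either a plain string (tokenized as before) or a pre-tokenized list of token ids (used as-is). A minimal standalone sketch of that rule, with a toy stand-in tokenizer; `ToyTokenizer` and `resolve_prompt_token_ids` are hypothetical names for illustration, not part of this codebase:

    from typing import List, Union

    class ToyTokenizer:
        # Hypothetical stand-in for the processor's real tokenizer.
        def tokenize(self, text: str) -> List[str]:
            return text.split()

        def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
            return [abs(hash(t)) % 50000 for t in tokens]

    def resolve_prompt_token_ids(prompt: Union[str, List[int]], tokenizer: ToyTokenizer) -> List[int]:
        # Same validation the diff adds: a string, or a list of ints.
        assert isinstance(prompt, str) or (
            isinstance(prompt, list) and all(isinstance(t, int) for t in prompt)
        ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
        if isinstance(prompt, list):  # already token ids: pass through untouched
            return prompt
        tokens = tokenizer.tokenize(prompt)  # plain text: tokenize first
        return tokenizer.convert_tokens_to_ids(tokens)

    # Both call styles are accepted after this change:
    assert resolve_prompt_token_ids([101, 2023, 102], ToyTokenizer()) == [101, 2023, 102]
    print(resolve_prompt_token_ids("hello world", ToyTokenizer()))
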
@@ -155,6 +169,7 @@ def process_request_dict(self, request, max_model_len=None):
             bool: Whether preprocessing is successful
             str: error message
         """
+        data_processor_logger.info(f"Start processing request dict: {request}")
         request = self._apply_default_parameters(request)
         if not request.get("eos_token_ids"):
             request["eos_token_ids"] = self.eos_token_ids
@@ -175,18 +190,21 @@ def process_request_dict(self, request, max_model_len=None):
 
         # processing prompt_token_ids
         if not request.get("prompt_token_ids"):
-            if request.get("prompt") is None and request.get("messages") is None:
-                raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
             if request.get("prompt"):
                 prompt = request.get("prompt")
-                prompt = prompt[0] if isinstance(prompt, list) else prompt
-                request["text_after_process"] = prompt
-                tokens = self.tokenizer.tokenize(prompt)
-                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-                request["prompt_token_ids"] = token_ids
-                req_id = request.get("request_id", None)
-                data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
-            else:
+                assert isinstance(prompt, str) or (
+                    isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
+                ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
+                if isinstance(prompt, list):  # if prompt is a token id list
+                    request["prompt_token_ids"] = prompt
+                else:
+                    request["text_after_process"] = prompt
+                    tokens = self.tokenizer.tokenize(prompt)
+                    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+                    request["prompt_token_ids"] = token_ids
+                    req_id = request.get("request_id", None)
+                    data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
+            elif request.get("messages"):
                 chat_template_kwargs = request.get("chat_template_kwargs")
                 if chat_template_kwargs:
                     if isinstance(chat_template_kwargs, dict):
@@ -196,6 +214,9 @@ def process_request_dict(self, request, max_model_len=None):
                 else:
                     raise ValueError("Invalid input: chat_template_kwargs must be a dict")
                 request["prompt_token_ids"] = self.messages2ids(request)
+            else:
+                raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
+
         if len(request["prompt_token_ids"]) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
 
@@ -211,8 +232,8 @@ def process_request_dict(self, request, max_model_len=None):
             request["top_p"] = _SAMPLING_EPS
         if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
             request["enable_thinking"] = True
-        data_processor_logger.info(f"Processed request {request}")
 
+        data_processor_logger.info(f"Processed request dict: {request}")
         return request
 
     def process_response(self, response_dict, **kwargs):
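
Reviewer note: beyond the prompt changes, the diff keeps the existing length and sampling guards in both methods: prompts longer than max_model_len are truncated to max_model_len - 1, a missing max_tokens defaults to the remaining budget (never below 1), and near-zero temperature/top_p values are clamped. A minimal sketch of that arithmetic; `apply_length_and_sampling_limits` is a hypothetical helper name, and the _SAMPLING_EPS value is an assumption (the real constant lives elsewhere in the module):

    _SAMPLING_EPS = 1e-5  # assumed value for illustration

    def apply_length_and_sampling_limits(prompt_token_ids, max_tokens, temperature, top_p, max_model_len):
        # Truncate prompts that exceed the model's context window.
        if max_model_len is not None and len(prompt_token_ids) > max_model_len:
            prompt_token_ids = prompt_token_ids[: max_model_len - 1]
        # Default max_tokens to the remaining budget, but never below 1
        # (like the diff, this assumes max_model_len is set when max_tokens is unset).
        if max_tokens is None:
            max_tokens = max(1, max_model_len - len(prompt_token_ids))
        # Zero temperature is equivalent to greedy sampling; clamp tiny values.
        if temperature < _SAMPLING_EPS:
            temperature = 1
        if top_p < _SAMPLING_EPS:
            top_p = _SAMPLING_EPS
        return prompt_token_ids, max_tokens, temperature, top_p

    # Example: a 10-token prompt against an 8-token context window.
    ids, mt, t, p = apply_length_and_sampling_limits(list(range(10)), None, 0.0, 0.9, 8)
    assert len(ids) == 7 and mt == 1 and t == 1 and p == 0.9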