1111# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212# See the License for the specific language governing permissions and
1313# limitations under the License.
14- from typing import Any , Dict , List , Optional , Sequence , TypeVar
14+ from typing import Any , Optional , Sequence , TypeVar
1515
1616from datasets import Dataset , DatasetDict
1717from transformers import PreTrainedTokenizer
2020DatasetType = TypeVar ("DatasetType" , Dataset , DatasetDict )
2121
2222
23- def is_conversational (example : Dict [str , Any ]) -> bool :
23+ def is_conversational (example : dict [str , Any ]) -> bool :
2424 r"""
2525 Check if the example is in a conversational format.
2626
2727 Args:
28- example (`Dict [str, Any]`):
28+ example (`dict [str, Any]`):
2929 A single data entry of a dataset. The example can have different keys depending on the
3030 dataset type.
3131
@@ -60,7 +60,7 @@ def is_conversational(example: Dict[str, Any]) -> bool:
6060 return False
6161
6262
63- def apply_chat_template (example : Dict [str , List [ Dict [str , str ]]], tokenizer : PreTrainedTokenizer ) -> Dict [str , str ]:
63+ def apply_chat_template (example : dict [str , list [ dict [str , str ]]], tokenizer : PreTrainedTokenizer ) -> dict [str , str ]:
6464 r"""
6565 Apply a chat template to a conversational example.
6666
@@ -139,13 +139,13 @@ def apply_chat_template(example: Dict[str, List[Dict[str, str]]], tokenizer: Pre
139139
140140
141141def maybe_apply_chat_template (
142- example : Dict [str , List [ Dict [str , str ]]], tokenizer : PreTrainedTokenizer
143- ) -> Dict [str , str ]:
142+ example : dict [str , list [ dict [str , str ]]], tokenizer : PreTrainedTokenizer
143+ ) -> dict [str , str ]:
144144 r"""
145145 If the example is in a conversational format, apply a chat template to it.
146146
147147 Args:
148- example (`Dict [str, List[Dict [str, str]]`):
148+ example (`dict [str, list[dict [str, str]]]`):
149149 Dictionary representing a single data entry of a conversational dataset. Each data entry can have different
150150 keys depending on the dataset type. The supported dataset types are:
151151
@@ -163,7 +163,7 @@ def maybe_apply_chat_template(
163163 The tokenizer to apply the chat template with.
164164
165165 Returns:
166- `Dict [str, str]`: The formatted example with the chat template applied.
166+ `dict [str, str]`: The formatted example with the chat template applied.
167167
168168 Note:
169169 This function does not alter the keys, except for Language modeling dataset, where `"messages"` is replaced by
@@ -188,7 +188,7 @@ def maybe_apply_chat_template(
188188 return example
189189
190190
191- def _unpair_row (examples : List [ Dict [str , List [ Dict [str , str ]]]]) -> List [ Dict [str , List [ Dict [str , str ]]]]:
191+ def _unpair_row (examples : list [ dict [str , list [ dict [str , str ]]]]) -> list [ dict [str , list [ dict [str , str ]]]]:
192192 batch_size = len (examples ["chosen" ])
193193 new_rows = {
194194 "completion" : examples ["chosen" ] + examples ["rejected" ],
@@ -288,7 +288,7 @@ def maybe_unpair_preference_dataset(
288288 return dataset
289289
290290
291- def extract_prompt (example : Dict [str , Sequence ]) -> Dict [str , Sequence ]:
291+ def extract_prompt (example : dict [str , Sequence ]) -> dict [str , Sequence ]:
292292 r"""
293293 Extracts the shared prompt from a preference data example, where the prompt is implicit within both
294294 the chosen and rejected completions.
@@ -307,7 +307,7 @@ def extract_prompt(example: Dict[str, Sequence]) -> Dict[str, Sequence]:
307307 }
308308
309309
310- def maybe_extract_prompt (example : Dict [str , List ]) -> Dict [str , List ]:
310+ def maybe_extract_prompt (example : dict [str , list ]) -> dict [str , list ]:
311311 r"""
312312 Extracts the shared prompt from a preference data example, where the prompt is implicit within both
313313 the chosen and rejected completions.
@@ -318,12 +318,12 @@ def maybe_extract_prompt(example: Dict[str, List]) -> Dict[str, List]:
318318 "rejected" completions.
319319
320320 Args:
321- example (`Dict [str, List ]`):
321+ example (`dict [str, list ]`):
322322 A dictionary representing a single data entry in the preference dataset. It must contain the keys
323323 `"chosen"` and `"rejected"`, where each value is either conversational or standard (`str`).
324324
325325 Returns:
326- `Dict [str, List ]`: A dictionary containing:
326+ `dict [str, list ]`: A dictionary containing:
327327 - `"prompt"`: The longest common prefix between the "chosen" and "rejected" completions.
328328 - `"chosen"`: The remainder of the "chosen" completion, with the prompt removed.
329329 - `"rejected"`: The remainder of the "rejected" completion, with the prompt removed.
0 commit comments