forked from PaddlePaddle/ERNIE
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_utils.py
More file actions
92 lines (72 loc) · 2.72 KB
/
data_utils.py
File metadata and controls
92 lines (72 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
utils for data processor
"""
import base64
import math
from io import BytesIO
import xxhash
from PIL import Image
from paddleformers.transformers.legacy.tokenizer_utils_base import BatchEncoding
def get_text_token_num(tokenizer, text: str):
    """Tokenize ``text`` with ``tokenizer`` and return how many tokens it yields.

    Args:
        tokenizer: Object exposing ``encode(text)``.
        text (str): The text to tokenize.

    Returns:
        int: Length of ``input_ids`` when ``encode`` returns a ``BatchEncoding``,
        otherwise the length of whatever ``encode`` returned.
    """
    encoded = tokenizer.encode(text)
    # encode() may hand back a BatchEncoding mapping instead of a flat sequence;
    # in that case the token ids live under the "input_ids" key.
    token_ids = encoded["input_ids"] if isinstance(encoded, BatchEncoding) else encoded
    return len(token_ids)
def get_uniq_id(text):
    """Return the xxHash32 integer digest of ``text``."""
    digest = xxhash.xxh32_intdigest(text)
    return digest
def image_to_json_serializable(image):
    """
    Convert an image into a JSON-serializable string.

    Parameters:
        image (PIL.Image.Image or bytes or str): The input image, which can be a
            PIL Image object, raw bytes, or a string (e.g. a file path or an
            already-encoded representation).

    Returns:
        str: For a PIL image, the Base64 text of its JPEG encoding; for bytes,
        the Base64 text of those bytes; for a string, the string unchanged.

    Raises:
        ValueError: If ``image`` is none of the supported types.
    """
    # Strings are already JSON-serializable; hand them back untouched.
    if isinstance(image, str):
        return image

    if isinstance(image, bytes):
        return base64.b64encode(image).decode("utf-8")

    if isinstance(image, Image.Image):
        # Pillow's JPEG writer only accepts a handful of modes; saving e.g. an
        # RGBA, LA or palette ("P") image raises OSError. Convert those to RGB
        # first (alpha is dropped, which JPEG could not store anyway). Modes
        # that were already writable are left untouched so output is unchanged.
        if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            image = image.convert("RGB")
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    raise ValueError(f"Unsupported image type {type(image)}.")
def merge_list(lists):
    """Merge multiple lists into one new list.

    The result is always a freshly built list: none of the input lists is
    modified, and the returned list is not an alias of ``lists[0]``.

    Args:
        lists (list[list]): [[], [], ...]; may be empty.

    Returns:
        list: All elements of the sub-lists, in order. Empty if ``lists`` is empty.
    """
    # Build a new list instead of extending lists[0] in place, which would
    # silently grow the caller's first sub-list on every call.
    return [item for sub_list in lists for item in sub_list]
def round_by_factor(number: int, factor: int):
    """Return the multiple of ``factor`` nearest to ``number``.

    The quotient is rounded with the built-in ``round``, so exact halves go
    to the even multiple.
    """
    multiples = round(number / factor)
    return multiples * factor
def ceil_by_factor(number: int, factor: int):
    """Return the smallest multiple of ``factor`` that is >= ``number``."""
    multiples = math.ceil(number / factor)
    return multiples * factor
def floor_by_factor(number: int, factor: int) -> int:
    """Return the largest multiple of ``factor`` that is <= ``number``."""
    multiples = math.floor(number / factor)
    return multiples * factor