Skip to content

Commit 05365b8

Browse files
Fix pandas codec for object type columns (#2080)
* Implemented JSONCodec
* Encode object type with JSONCodec for pandas
* Included more types for mlflow
* Implemented can_encode function for JSON codec
* Included test for JSON codec
* Fixed pandas codec to treat objects differently
* Included tests for mlflow metadata
* Included test for mlflow runtime
* Fixed can_encode for JSONCodec and a pandas test
* Fixed code registry but need to remove duplicated json encoding code from hf runtime
* Fixed linting issues
* Refactored duplicated code and simplified logic for can_encode
* Renamed util function
* Fixed support to encode numpy and reverted can_encode function
* Fixed bug in pandas encoding
* Reverted comment in pandas codec
* Moved env and cli tests to sequential.
* Included sleep before retrieving metrics
* Run metrics tests seq
* Simplified code by removing JSONCodec
* Included more tests for pandas codecs
* Moved PandasJsonContentType inside PandasCodec.
1 parent 6d7d140 commit 05365b8

File tree

9 files changed

+693
-39
lines changed

9 files changed

+693
-39
lines changed

mlserver/codecs/json.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
# separate file to sidestep the circular dependency on the decode_str function
22

3-
from typing import Any, Union
43
import json
4+
import numpy as np
5+
from typing import Any, List, Union
56

67
try:
78
import orjson
89
except ImportError:
910
orjson = None # type: ignore
1011

1112
from .string import decode_str
13+
from .lists import as_list
14+
from .utils import InputOrOutput
1215

1316

1417
# originally taken from: mlserver/rest/responses.py
@@ -57,5 +60,36 @@ def encode_to_json_bytes(v: Any) -> bytes:
5760
def decode_from_bytelike_json_to_dict(v: Union[bytes, str]) -> dict:
5861
if orjson is None:
5962
return json.loads(v)
60-
6163
return orjson.loads(v)
64+
65+
66+
class JSONEncoderWithArray(json.JSONEncoder):
    """JSON encoder that also accepts NumPy arrays and NumPy scalars.

    The standard ``json`` encoder rejects ``np.ndarray`` values as well as
    NumPy scalar types that do not subclass a built-in Python type. This
    encoder converts them to their plain Python equivalents first.
    """

    def default(self, obj):
        """Return a JSON-serialisable replacement for ``obj``.

        Args:
            obj: A value the base encoder could not serialise on its own.

        Returns:
            A nested list for ``np.ndarray``, or a ``bool`` / ``float`` /
            ``int`` for the matching NumPy scalar.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # ``np.bool_`` is neither a Python ``bool`` nor an ``np.integer``, so
        # it needs its own branch or it would end up as a TypeError below.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return super().default(obj)
76+
77+
78+
def encode_to_json(v: Any, use_bytes: bool = True) -> Union[str, bytes]:
    """Serialise ``v`` to compact JSON using ``JSONEncoderWithArray``.

    Args:
        v: The value to serialise.
        use_bytes: When true (the default) the JSON text is returned as
            UTF-8 encoded ``bytes``; otherwise it is returned as ``str``.

    Returns:
        The JSON document, without indentation and without whitespace
        after the ``,`` and ``:`` separators.

    Raises:
        ValueError: From ``json.dumps`` when ``v`` contains NaN or an
            infinity, because ``allow_nan`` is disabled.
    """
    text = json.dumps(
        v,
        cls=JSONEncoderWithArray,
        separators=(",", ":"),
        indent=None,
        allow_nan=False,
        # Keep non-ASCII characters as-is instead of \u-escaping them.
        ensure_ascii=False,
    )
    return text.encode("utf-8") if use_bytes else text
90+
91+
92+
def decode_json_input_or_output(input_or_output: InputOrOutput) -> List[Any]:
    """Parse each element of an input/output payload as a JSON document.

    Args:
        input_or_output: Object whose ``data.root`` holds the packed
            elements (presumably JSON strings or bytes; confirm against
            callers).

    Returns:
        A list with one decoded Python value per element yielded by
        ``as_list`` for the packed payload.
    """
    raw_items = as_list(input_or_output.data.root)
    return [json.loads(item) for item in raw_items]

mlserver/codecs/pandas.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from typing import Optional, Any, List, Tuple
55

66
from .base import RequestCodec, register_request_codec
7-
from .numpy import to_datatype, to_dtype, convert_nan
7+
from .numpy import to_dtype, convert_nan, to_datatype
8+
from .json import decode_json_input_or_output, encode_to_json
89
from .string import encode_str, StringCodec
910
from .utils import get_decoded_or_raw, InputOrOutput, inject_batch_dimension
1011
from .lists import ListElement
@@ -19,8 +20,12 @@
1920

2021

2122
def _to_series(input_or_output: InputOrOutput) -> pd.Series:
22-
payload = get_decoded_or_raw(input_or_output)
23+
parameters = input_or_output.parameters
24+
25+
if parameters and parameters.content_type == PandasCodec.JsonContentType:
26+
return pd.Series(decode_json_input_or_output(input_or_output))
2327

28+
payload = get_decoded_or_raw(input_or_output)
2429
if Datatype(input_or_output.datatype) == Datatype.BYTES:
2530
# Don't convert the dtype of BYTES
2631
return pd.Series(payload)
@@ -43,7 +48,13 @@ def _to_response_output(series: pd.Series, use_bytes: bool = True) -> ResponseOu
4348

4449
content_type = None
4550
if datatype == Datatype.BYTES:
46-
data, content_type = _process_bytes(data, use_bytes)
51+
processed_data, content_type = _process_bytes(data, use_bytes)
52+
53+
if content_type is None:
54+
data = [encode_to_json(elem, use_bytes) for elem in data]
55+
content_type = PandasCodec.JsonContentType
56+
else:
57+
data = processed_data
4758

4859
shape = inject_batch_dimension(list(series.shape))
4960
parameters = None
@@ -90,6 +101,7 @@ class PandasCodec(RequestCodec):
90101
"""
91102

92103
ContentType = "pd"
104+
JsonContentType = "pd_json"
93105
TypeHint = pd.DataFrame
94106

95107
@classmethod

runtimes/huggingface/mlserver_huggingface/codecs/utils.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,15 @@
55
import numpy as np
66
from PIL import Image, ImageChops
77
from transformers.pipelines import Conversation
8+
from mlserver.codecs.json import JSONEncoderWithArray
89

910
IMAGE_PREFIX = "data:image/"
1011
DEFAULT_IMAGE_FORMAT = "PNG"
1112

1213

13-
class HuggingfaceJSONEncoder(json.JSONEncoder):
14+
class HuggingfaceJSONEncoder(JSONEncoderWithArray):
1415
def default(self, obj):
15-
if isinstance(obj, np.ndarray):
16-
return obj.tolist()
17-
elif isinstance(obj, np.floating):
18-
return float(obj)
19-
elif isinstance(obj, np.integer):
20-
return int(obj)
21-
elif isinstance(obj, Image.Image):
16+
if isinstance(obj, Image.Image):
2217
buf = io.BytesIO()
2318
if not obj.format:
2419
obj.format = DEFAULT_IMAGE_FORMAT
@@ -37,7 +32,7 @@ def default(self, obj):
3732
"new_user_input": obj.new_user_input,
3833
}
3934
else:
40-
return json.JSONEncoder.default(self, obj)
35+
return super().default(obj)
4136

4237

4338
def json_encode(payload: Any, use_bytes: bool = False):

runtimes/mlflow/mlserver_mlflow/metadata.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
from typing import Union, Tuple, List, Optional
22

3-
from mlflow.types.schema import Schema, ColSpec, TensorSpec, DataType
3+
from mlflow.types.schema import (
4+
Schema,
5+
ColSpec,
6+
TensorSpec,
7+
Array,
8+
Object,
9+
Map,
10+
AnyType,
11+
DataType,
12+
)
413

514
from mlserver.types import MetadataTensor, Parameters
615
from mlserver.types import Datatype as MDatatype
@@ -35,8 +44,10 @@
3544
def _get_content_type(input_spec: InputSpec) -> Tuple[MDatatype, str]:
3645
if isinstance(input_spec, TensorSpec):
3746
datatype = to_datatype(input_spec.type)
38-
content_type = NumpyCodec.ContentType
39-
return datatype, content_type
47+
return datatype, NumpyCodec.ContentType
48+
49+
if isinstance(input_spec.type, (Array, Object, Map, AnyType)):
50+
return MDatatype.BYTES, PandasCodec.JsonContentType
4051

4152
# TODO: Check if new type, which may not exist
4253
return _MLflowToContentType[input_spec.type]

runtimes/mlflow/tests/test_metadata.py

Lines changed: 145 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,17 @@
33

44
from typing import Tuple, List
55

6-
from mlflow.types.schema import ColSpec, TensorSpec, DataType, Schema
6+
from mlflow.types.schema import (
7+
ColSpec,
8+
TensorSpec,
9+
DataType,
10+
Schema,
11+
Array,
12+
Map,
13+
Object,
14+
Property,
15+
AnyType,
16+
)
717
from mlflow.pyfunc import _enforce_schema
818
from mlserver.codecs import (
919
NumpyCodec,
@@ -46,6 +56,30 @@
4656
ColSpec(name="foo", type=DataType.binary),
4757
(MDatatype.BYTES, Base64Codec.ContentType),
4858
),
59+
(
60+
ColSpec(name="foo", type=Array(dtype=DataType.long)),
61+
(MDatatype.BYTES, PandasCodec.JsonContentType),
62+
),
63+
(
64+
ColSpec(name="foo", type=Map(Array(dtype=DataType.long))),
65+
(MDatatype.BYTES, PandasCodec.JsonContentType),
66+
),
67+
(
68+
ColSpec(
69+
name="foo",
70+
type=Object(
71+
properties=[
72+
Property("a", DataType.long),
73+
Property("b", DataType.string),
74+
]
75+
),
76+
),
77+
(MDatatype.BYTES, PandasCodec.JsonContentType),
78+
),
79+
(
80+
ColSpec(name="foo", type=AnyType()),
81+
(MDatatype.BYTES, PandasCodec.JsonContentType),
82+
),
4983
],
5084
)
5185
def test_get_content_type(input_spec: InputSpec, expected: Tuple[MDatatype, str]):
@@ -142,6 +176,49 @@ def test_get_shape(input_spec: InputSpec, expected: List[int]):
142176
),
143177
],
144178
),
179+
(
180+
Schema(
181+
inputs=[
182+
ColSpec(type=Array(dtype=DataType.long)),
183+
ColSpec(type=Map(Array(dtype=DataType.long))),
184+
ColSpec(
185+
type=Object(
186+
properties=[
187+
Property("a", DataType.long),
188+
Property("b", DataType.string),
189+
]
190+
)
191+
),
192+
ColSpec(type=AnyType()),
193+
]
194+
),
195+
[
196+
MetadataTensor(
197+
name="input-0",
198+
datatype="BYTES",
199+
shape=[-1, 1],
200+
parameters=Parameters(content_type=PandasCodec.JsonContentType),
201+
),
202+
MetadataTensor(
203+
name="input-1",
204+
datatype="BYTES",
205+
shape=[-1, 1],
206+
parameters=Parameters(content_type=PandasCodec.JsonContentType),
207+
),
208+
MetadataTensor(
209+
name="input-2",
210+
datatype="BYTES",
211+
shape=[-1, 1],
212+
parameters=Parameters(content_type=PandasCodec.JsonContentType),
213+
),
214+
MetadataTensor(
215+
name="input-3",
216+
datatype="BYTES",
217+
shape=[-1, 1],
218+
parameters=Parameters(content_type=PandasCodec.JsonContentType),
219+
),
220+
],
221+
),
145222
],
146223
)
147224
def test_to_metadata_tensors(schema: Schema, expected: List[MetadataTensor]):
@@ -193,6 +270,54 @@ def test_to_metadata_tensors(schema: Schema, expected: List[MetadataTensor]):
193270
data=[b"2021-08-24T15:01:19"],
194271
),
195272
),
273+
(
274+
ColSpec(name="foo", type=Array(dtype=DataType.long)),
275+
RequestInput(
276+
name="foo",
277+
datatype="BYTES",
278+
parameters=Parameters(content_type=PandasCodec.JsonContentType),
279+
shape=[2],
280+
data=[b"[1,2]", b"[3,4]"],
281+
),
282+
),
283+
(
284+
ColSpec(name="foo", type=Map(Array(dtype=DataType.long))),
285+
RequestInput(
286+
name="foo",
287+
datatype="BYTES",
288+
parameters=Parameters(content_type=PandasCodec.JsonContentType),
289+
shape=[3],
290+
data=[b'{"a":[1,2]}', b'{"b":[3,4]}', b'{"c":[5,6]}'],
291+
),
292+
),
293+
(
294+
ColSpec(
295+
name="foo",
296+
type=Object(
297+
properties=[
298+
Property("a", DataType.long),
299+
Property("b", DataType.string),
300+
]
301+
),
302+
),
303+
RequestInput(
304+
name="foo",
305+
datatype="BYTES",
306+
parameters=Parameters(content_type=PandasCodec.JsonContentType),
307+
shape=[2],
308+
data=[b'{"a":1,"b":"hello"}', b'{"a":2,"b":"world"}'],
309+
),
310+
),
311+
(
312+
ColSpec(name="foo", type=AnyType()),
313+
RequestInput(
314+
name="foo",
315+
datatype="BYTES",
316+
parameters=Parameters(content_type=PandasCodec.JsonContentType),
317+
shape=[3],
318+
data=[b'"a"', b"[1,2]", b'{"b":2}'],
319+
),
320+
),
196321
],
197322
)
198323
def test_content_types(tensor_spec: TensorSpec, request_input: RequestInput):
@@ -221,6 +346,25 @@ def test_content_types(tensor_spec: TensorSpec, request_input: RequestInput):
221346
),
222347
PandasCodec.ContentType,
223348
),
349+
(
350+
# Expect DataFrame for named column inputs
351+
Schema(
352+
inputs=[
353+
ColSpec(type=Array(dtype=DataType.long)),
354+
ColSpec(type=Map(Array(dtype=DataType.long))),
355+
ColSpec(
356+
type=Object(
357+
properties=[
358+
Property("a", DataType.long),
359+
Property("b", DataType.string),
360+
]
361+
),
362+
),
363+
ColSpec(type=AnyType()),
364+
],
365+
),
366+
PandasCodec.ContentType,
367+
),
224368
(
225369
# Expect tensor dictionary for named tensor inputs
226370
Schema(

0 commit comments

Comments
 (0)