Skip to content

Commit c722aaf

Browse files
authored
[AINode] More accurate exception for model management (#16895)
1 parent 94461b0 commit c722aaf

File tree

32 files changed

+285
-376
lines changed

32 files changed

+285
-376
lines changed

integration-test/src/test/java/org/apache/iotdb/ainode/it/AINodeClusterConfigIT.java

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,4 @@ private void aiNodeRegisterAndRemoveTest(Statement statement) throws SQLExceptio
107107
}
108108
Assert.fail("The target AINode is not removed successfully after all retries.");
109109
}
110-
111-
// TODO: We might need to add remove unknown test in the future, but current infrastructure is too
112-
// hard to implement it.
113110
}

integration-test/src/test/java/org/apache/iotdb/ainode/it/AINodeModelManageIT.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -131,15 +131,15 @@ private void userDefinedModelManagementTest(Statement statement)
131131
public void dropBuiltInModelErrorTestInTree() throws SQLException {
132132
try (Connection connection = EnvFactory.getEnv().getConnection(BaseEnv.TREE_SQL_DIALECT);
133133
Statement statement = connection.createStatement()) {
134-
errorTest(statement, "drop model sundial", "1510: Cannot delete built-in model: sundial");
134+
errorTest(statement, "drop model sundial", "1506: Cannot delete built-in model: sundial");
135135
}
136136
}
137137

138138
@Test
139139
public void dropBuiltInModelErrorTestInTable() throws SQLException {
140140
try (Connection connection = EnvFactory.getEnv().getConnection(BaseEnv.TABLE_SQL_DIALECT);
141141
Statement statement = connection.createStatement()) {
142-
errorTest(statement, "drop model sundial", "1510: Cannot delete built-in model: sundial");
142+
errorTest(statement, "drop model sundial", "1506: Cannot delete built-in model: sundial");
143143
}
144144
}
145145

iotdb-client/service-rpc/src/main/java/org/apache/iotdb/rpc/TSStatusCode.java

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -244,16 +244,17 @@ public enum TSStatusCode {
244244
CQ_UPDATE_LAST_EXEC_TIME_ERROR(1403),
245245

246246
// AI
247-
CREATE_MODEL_ERROR(1500),
248-
DROP_MODEL_ERROR(1501),
249-
MODEL_EXIST_ERROR(1502),
250-
GET_MODEL_INFO_ERROR(1503),
251-
NO_REGISTERED_AI_NODE_ERROR(1504),
252-
MODEL_NOT_FOUND_ERROR(1505),
253-
REGISTER_AI_NODE_ERROR(1506),
254-
UNAVAILABLE_AI_DEVICE_ERROR(1507),
255-
AI_NODE_INTERNAL_ERROR(1510),
256-
REMOVE_AI_NODE_ERROR(1511),
247+
NO_REGISTERED_AI_NODE_ERROR(1500),
248+
REGISTER_AI_NODE_ERROR(1501),
249+
REMOVE_AI_NODE_ERROR(1502),
250+
MODEL_EXISTED_ERROR(1503),
251+
MODEL_NOT_EXIST_ERROR(1504),
252+
CREATE_MODEL_ERROR(1505),
253+
DROP_BUILTIN_MODEL_ERROR(1506),
254+
DROP_MODEL_ERROR(1507),
255+
UNAVAILABLE_AI_DEVICE_ERROR(1508),
256+
257+
AINODE_INTERNAL_ERROR(1599), // Fallback in case somebody is too lazy to add a new error code
257258

258259
// Pipe Plugin
259260
CREATE_PIPE_PLUGIN_ERROR(1600),

iotdb-core/ainode/iotdb/ainode/core/config.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@
4646
AINODE_THRIFT_COMPRESSION_ENABLED,
4747
AINODE_VERSION_INFO,
4848
)
49-
from iotdb.ainode.core.exception import BadNodeUrlError
49+
from iotdb.ainode.core.exception import BadNodeUrlException
5050
from iotdb.ainode.core.log import Logger
5151
from iotdb.ainode.core.util.decorator import singleton
5252
from iotdb.thrift.common.ttypes import TEndPoint
@@ -437,7 +437,7 @@ def _load_config_from_file(self) -> None:
437437
file_configs["ain_cluster_ingress_time_zone"]
438438
)
439439

440-
except BadNodeUrlError:
440+
except BadNodeUrlException:
441441
logger.warning("Cannot load AINode conf file, use default configuration.")
442442

443443
except Exception as e:
@@ -489,12 +489,12 @@ def parse_endpoint_url(endpoint_url: str) -> TEndPoint:
489489
"""
490490
split = endpoint_url.split(":")
491491
if len(split) != 2:
492-
raise BadNodeUrlError(endpoint_url)
492+
raise BadNodeUrlException(endpoint_url)
493493

494494
ip = split[0]
495495
try:
496496
port = int(split[1])
497497
result = TEndPoint(ip, port)
498498
return result
499499
except ValueError:
500-
raise BadNodeUrlError(endpoint_url)
500+
raise BadNodeUrlException(endpoint_url)

iotdb-core/ainode/iotdb/ainode/core/constant.py

Lines changed: 9 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -81,33 +81,18 @@
8181
class TSStatusCode(Enum):
8282
SUCCESS_STATUS = 200
8383
REDIRECTION_RECOMMEND = 400
84-
MODEL_EXIST_ERROR = 1502
85-
MODEL_NOT_FOUND_ERROR = 1505
86-
UNAVAILABLE_AI_DEVICE_ERROR = 1507
87-
AINODE_INTERNAL_ERROR = 1510
84+
MODEL_EXISTED_ERROR = 1503
85+
MODEL_NOT_EXIST_ERROR = 1504
86+
CREATE_MODEL_ERROR = 1505
87+
DROP_BUILTIN_MODEL_ERROR = 1506
88+
DROP_MODEL_ERROR = 1507
89+
UNAVAILABLE_AI_DEVICE_ERROR = 1508
90+
8891
INVALID_URI_ERROR = 1511
8992
INVALID_INFERENCE_CONFIG = 1512
9093
INFERENCE_INTERNAL_ERROR = 1520
9194

92-
def get_status_code(self) -> int:
93-
return self.value
94-
95+
AINODE_INTERNAL_ERROR = 1599 # Fallback in case somebody is too lazy to add a new error code
9596

96-
class HyperparameterName(Enum):
97-
# Training hyperparameter
98-
LEARNING_RATE = "learning_rate"
99-
EPOCHS = "epochs"
100-
BATCH_SIZE = "batch_size"
101-
USE_GPU = "use_gpu"
102-
NUM_WORKERS = "num_workers"
103-
104-
# Structure hyperparameter
105-
KERNEL_SIZE = "kernel_size"
106-
INPUT_VARS = "input_vars"
107-
BLOCK_TYPE = "block_type"
108-
D_MODEL = "d_model"
109-
INNER_LAYERS = "inner_layer"
110-
OUTER_LAYERS = "outer_layer"
111-
112-
def name(self):
97+
def get_status_code(self) -> int:
11398
return self.value

iotdb-core/ainode/iotdb/ainode/core/exception.py

Lines changed: 40 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
)
2424

2525

26-
class _BaseError(Exception):
26+
class _BaseException(Exception):
2727
"""Base class for exceptions in this module."""
2828

2929
def __init__(self):
@@ -33,122 +33,94 @@ def __str__(self) -> str:
3333
return self.message
3434

3535

36-
class BadNodeUrlError(_BaseError):
36+
class BadNodeUrlException(_BaseException):
3737
def __init__(self, node_url: str):
38+
super().__init__()
3839
self.message = "Bad node url: {}".format(node_url)
3940

4041

41-
class ModelNotExistError(_BaseError):
42-
def __init__(self, msg: str):
43-
self.message = "Model is not exists: {} ".format(msg)
44-
45-
46-
class BadConfigValueError(_BaseError):
47-
def __init__(self, config_name: str, config_value, hint: str = ""):
48-
self.message = "Bad value [{0}] for config {1}. {2}".format(
49-
config_value, config_name, hint
50-
)
51-
42+
# ==================== Model Management ====================
5243

53-
class MissingConfigError(_BaseError):
54-
def __init__(self, config_name: str):
55-
self.message = "Missing config: {}".format(config_name)
5644

57-
58-
class MissingOptionError(_BaseError):
59-
def __init__(self, config_name: str):
60-
self.message = "Missing task option: {}".format(config_name)
45+
class ModelExistedException(_BaseException):
46+
def __init__(self, model_id: str):
47+
super().__init__()
48+
self.message = "Model {} already exists".format(model_id)
6149

6250

63-
class RedundantOptionError(_BaseError):
64-
def __init__(self, option_name: str):
65-
self.message = "Redundant task option: {}".format(option_name)
51+
class ModelNotExistException(_BaseException):
52+
def __init__(self, model_id: str):
53+
super().__init__()
54+
self.message = "Model {} does not exist".format(model_id)
6655

6756

68-
class WrongTypeConfigError(_BaseError):
69-
def __init__(self, config_name: str, expected_type: str):
70-
self.message = "Wrong type for config: {0}, expected: {1}".format(
71-
config_name, expected_type
57+
class InvalidModelUriException(_BaseException):
58+
def __init__(self, msg: str):
59+
super().__init__()
60+
self.message = (
61+
"Model registration failed because the specified uri is invalid: {}".format(
62+
msg
63+
)
7264
)
7365

7466

75-
class UnsupportedError(_BaseError):
76-
def __init__(self, msg: str):
77-
self.message = "{0} is not supported in current version".format(msg)
67+
class BuiltInModelDeletionException(_BaseException):
68+
def __init__(self, model_id: str):
69+
super().__init__()
70+
self.message = "Cannot delete built-in model: {}".format(model_id)
7871

7972

80-
class InvalidUriError(_BaseError):
81-
def __init__(self, uri: str):
82-
self.message = "Invalid uri: {}, there are no {} or {} under this uri.".format(
83-
uri, MODEL_WEIGHTS_FILE_IN_PT, MODEL_CONFIG_FILE_IN_YAML
73+
class BadConfigValueException(_BaseException):
74+
def __init__(self, config_name: str, config_value, hint: str = ""):
75+
super().__init__()
76+
self.message = "Bad value [{0}] for config {1}. {2}".format(
77+
config_value, config_name, hint
8478
)
8579

8680

87-
class InvalidWindowArgumentError(_BaseError):
88-
def __init__(self, window_interval, window_step, dataset_length):
89-
self.message = f"Invalid inference input: window_interval {window_interval}, window_step {window_step}, dataset_length {dataset_length}"
90-
91-
92-
class InferenceModelInternalError(_BaseError):
81+
class InferenceModelInternalException(_BaseException):
9382
def __init__(self, msg: str):
83+
super().__init__()
9484
self.message = "Inference model internal error: {0}".format(msg)
9585

9686

97-
class BuiltInModelNotSupportError(_BaseError):
87+
class BuiltInModelNotSupportException(_BaseException):
9888
def __init__(self, msg: str):
89+
super().__init__()
9990
self.message = "Built-in model not support: {0}".format(msg)
10091

10192

102-
class BuiltInModelDeletionError(_BaseError):
103-
def __init__(self, model_id: str):
104-
self.message = "Cannot delete built-in model: {0}".format(model_id)
105-
106-
107-
class WrongAttributeTypeError(_BaseError):
93+
class WrongAttributeTypeException(_BaseException):
10894
def __init__(self, attribute_name: str, expected_type: str):
95+
super().__init__()
10996
self.message = "Wrong type for attribute: {0}, expected: {1}".format(
11097
attribute_name, expected_type
11198
)
11299

113100

114-
class NumericalRangeException(_BaseError):
101+
class NumericalRangeException(_BaseException):
115102
def __init__(self, attribute_name: str, value, min_value, max_value):
103+
super().__init__()
116104
self.message = (
117105
"Attribute {0} expect value between {1} and {2}, got {3} instead.".format(
118106
attribute_name, min_value, max_value, value
119107
)
120108
)
121109

122110

123-
class StringRangeException(_BaseError):
111+
class StringRangeException(_BaseException):
124112
def __init__(self, attribute_name: str, value: str, expect_value):
113+
super().__init__()
125114
self.message = "Attribute {0} expect value in {1}, got {2} instead.".format(
126115
attribute_name, expect_value, value
127116
)
128117

129118

130-
class ListRangeException(_BaseError):
119+
class ListRangeException(_BaseException):
131120
def __init__(self, attribute_name: str, value: list, expected_type: str):
121+
super().__init__()
132122
self.message = (
133123
"Attribute {0} expect value type list[{1}], got {2} instead.".format(
134124
attribute_name, expected_type, value
135125
)
136126
)
137-
138-
139-
class AttributeNotSupportError(_BaseError):
140-
def __init__(self, model_name: str, attribute_name: str):
141-
self.message = "Attribute {0} is not supported in model {1}".format(
142-
attribute_name, model_name
143-
)
144-
145-
146-
# This is used to extract the key message in RuntimeError instead of the traceback message
147-
def runtime_error_extractor(error_message):
148-
pattern = re.compile(r"RuntimeError: (.+)")
149-
match = pattern.search(error_message)
150-
151-
if match:
152-
return match.group(1)
153-
else:
154-
return ""

iotdb-core/ainode/iotdb/ainode/core/inference/dispatcher/basic_dispatcher.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
# under the License.
1717
#
1818

19-
from iotdb.ainode.core.exception import InferenceModelInternalError
19+
from iotdb.ainode.core.exception import InferenceModelInternalException
2020
from iotdb.ainode.core.inference.dispatcher.abstract_dispatcher import (
2121
AbstractDispatcher,
2222
)
@@ -41,7 +41,7 @@ def _select_pool_by_hash(self, req, pool_ids) -> int:
4141
"""
4242
model_id = req.model_id
4343
if not pool_ids:
44-
raise InferenceModelInternalError(
44+
raise InferenceModelInternalException(
4545
f"No available pools for model {model_id}"
4646
)
4747
start_idx = hash(req.req_id) % len(pool_ids)
@@ -51,7 +51,7 @@ def _select_pool_by_hash(self, req, pool_ids) -> int:
5151
state = self.pool_states[pool_id]
5252
if state == PoolState.RUNNING:
5353
return pool_id
54-
raise InferenceModelInternalError(
54+
raise InferenceModelInternalException(
5555
f"No RUNNING pools available for model {model_id}"
5656
)
5757

iotdb-core/ainode/iotdb/ainode/core/inference/pool_controller.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424

2525
import torch.multiprocessing as mp
2626

27-
from iotdb.ainode.core.exception import InferenceModelInternalError
27+
from iotdb.ainode.core.exception import InferenceModelInternalException
2828
from iotdb.ainode.core.inference.inference_request import (
2929
InferenceRequest,
3030
InferenceRequestProxy,
@@ -374,7 +374,7 @@ def add_request(self, req: InferenceRequest, infer_proxy: InferenceRequestProxy)
374374
if not self.has_request_pools(model_id):
375375
logger.error(f"[Inference] No pools found for model {model_id}.")
376376
infer_proxy.set_result(None)
377-
raise InferenceModelInternalError(
377+
raise InferenceModelInternalException(
378378
"Dispatch request failed, because no inference pools are init."
379379
)
380380
# TODO: Implement adaptive scaling based on requests.(e.g. lazy initialization)

iotdb-core/ainode/iotdb/ainode/core/inference/pool_group.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
import torch.multiprocessing as mp
2121

22-
from iotdb.ainode.core.exception import InferenceModelInternalError
22+
from iotdb.ainode.core.exception import InferenceModelInternalException
2323
from iotdb.ainode.core.inference.dispatcher.basic_dispatcher import BasicDispatcher
2424
from iotdb.ainode.core.inference.inference_request import (
2525
InferenceRequest,
@@ -90,14 +90,14 @@ def dispatch_request(
9090

9191
def get_request_pool(self, pool_id) -> InferenceRequestPool:
9292
if pool_id not in self.pool_group:
93-
raise InferenceModelInternalError(
93+
raise InferenceModelInternalException(
9494
f"[Inference][Pool-{pool_id}] Pool not found for model {self.model_id}"
9595
)
9696
return self.pool_group[pool_id][0]
9797

9898
def get_request_queue(self, pool_id) -> mp.Queue:
9999
if pool_id not in self.pool_group:
100-
raise InferenceModelInternalError(
100+
raise InferenceModelInternalException(
101101
f"[Inference][Pool-{pool_id}] Pool not found for model {self.model_id}"
102102
)
103103
return self.pool_group[pool_id][1]

iotdb-core/ainode/iotdb/ainode/core/inference/pool_scheduler/basic_pool_scheduler.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020

2121
import torch
2222

23-
from iotdb.ainode.core.exception import InferenceModelInternalError
23+
from iotdb.ainode.core.exception import InferenceModelInternalException
2424
from iotdb.ainode.core.inference.pool_group import PoolGroup
2525
from iotdb.ainode.core.inference.pool_scheduler.abstract_pool_scheduler import (
2626
AbstractPoolScheduler,
@@ -113,7 +113,7 @@ def schedule(self, model_id: str) -> List[ScaleAction]:
113113
if model_id not in self._request_pool_map:
114114
pool_num = estimate_pool_size(self.DEFAULT_DEVICE, model_id)
115115
if pool_num <= 0:
116-
raise InferenceModelInternalError(
116+
raise InferenceModelInternalException(
117117
f"Not enough memory to run model {model_id}."
118118
)
119119
return [ScaleAction(ScaleActionType.SCALE_UP, pool_num, model_id)]

0 commit comments

Comments
 (0)