From c72ee5e818a402675a5a14494a7d521c5fdf8da8 Mon Sep 17 00:00:00 2001
From: wujiji
Date: Mon, 29 Sep 2025 10:11:30 +0800
Subject: [PATCH] [Iluvatar GPU] Adapt VL model

---
 custom_ops/setup_ops.py                       |   4 +
 docs/get_started/installation/iluvatar_gpu.md | 145 ++++++++++++++++++
 .../get_started/installation/iluvatar_gpu.md  | 145 ++++++++++++++++++
 .../layers/attention/iluvatar_attn_backend.py |  16 +-
 .../model_executor/layers/rotary_embedding.py |   3 +
 .../models/ernie4_5_vl/dfnrope/modeling.py    |   3 +-
 .../models/ernie4_5_vl/image_op.py            |   5 +
 .../models/ernie4_5_vl/modeling_resampler.py  |   3 +-
 fastdeploy/worker/gpu_model_runner.py         |   5 +-
 fastdeploy/worker/iluvatar_model_runner.py    |  15 ++
 fastdeploy/worker/iluvatar_worker.py          |   2 +
 requirements_iluvatar.txt                     |   2 +-
 12 files changed, 341 insertions(+), 7 deletions(-)

diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index ee9cb59bf8..5b090b97c3 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -535,6 +535,10 @@ def find_end_files(directory, end_str):
         "gpu_ops/token_penalty_multi_scores.cu",
         "gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
         "gpu_ops/sample_kernels/top_k_renorm_probs.cu",
+        "gpu_ops/text_image_index_out.cu",
+        "gpu_ops/text_image_gather_scatter.cu",
+        "gpu_ops/extract_text_token_output.cu",
+        "gpu_ops/set_data_ipc.cu",
         "iluvatar_ops/moe_dispatch.cu",
         "iluvatar_ops/moe_reduce.cu",
         "iluvatar_ops/paged_attn.cu",
diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md
index 393f250a1c..90ea3bdb53 100644
--- a/docs/get_started/installation/iluvatar_gpu.md
+++ b/docs/get_started/installation/iluvatar_gpu.md
@@ -409,3 +409,148 @@ Accuracy: 0.962
 Invaild: 0.000
 Latency: 17332.728 s
 ```
+
+# Run the ERNIE-4.5-VL-28B-A3B-Paddle model on an Iluvatar machine
+
+## Machine Preparation
+Running the ERNIE-4.5-VL-28B-A3B-Paddle model requires `TP=2` (tensor parallelism across two cards), so prepare a machine with the following configuration:
+
+| CPU | Memory | Card | Hard Disk |
+| :---: | :---: | :---: | :---: |
+| x86 | 1TB | 2xBI150 | 1TB |
+
+## Image Preparation
+Pull the Docker image:
+
+```bash
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+```
+
+## Container Preparation
+### Start Container
+
+```bash
+docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker exec -it paddle_infer bash
+```
+
+`/home/paddle` contains the model files, *.whl packages, and scripts.
+
+### Install Paddle
+
+```bash
+pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+```
+For the latest Paddle build for Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/).
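+
+To sanity-check the installation before moving on, you can run a short script on one of the BI150 cards. This is a minimal sketch, assuming the `paddle-iluvatar-gpu` plugin registers the custom device type `iluvatar_gpu` (the same name exported as `PADDLE_XCCL_BACKEND` in `run_demo_vl.sh` below); the exact device name may differ on other builds.
+
+```python
+# Minimal install check (sketch): confirm the Iluvatar plugin is visible and usable.
+import paddle
+
+# Custom device types registered by installed plugins; "iluvatar_gpu" should appear here.
+print(paddle.device.get_all_custom_device_type())
+
+# Run a small matmul on the first Iluvatar card.
+paddle.set_device("iluvatar_gpu:0")
+x = paddle.randn([4, 4], dtype="float32")
+print(paddle.matmul(x, x))
+```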
+
+### Install FastDeploy
+```bash
+pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+```
+
+## Prepare the inference demo script
+
+The scripts are listed below:
+
+`run_demo_vl.sh`:
+
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+export FD_DEBUG=1
+python3 run_demo_vl.py
+```
+
+`run_demo_vl.py`:
+
+```python
+import io
+import requests
+from PIL import Image
+
+from fastdeploy.entrypoints.llm import LLM
+from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
+
+
+PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle"
+tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type":"image_url", "image_url": {"url":"https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
+            {"type":"text", "text":"图中的文物属于哪个年代"}
+        ]
+    }
+]
+prompt = tokenizer.apply_chat_template(messages, tokenize=False)
+
+# Download the image/video parts referenced in the messages.
+images, videos = [], []
+for message in messages:
+    content = message["content"]
+    if not isinstance(content, list):
+        continue
+    for part in content:
+        if part["type"] == "image_url":
+            url = part["image_url"]["url"]
+            image_bytes = requests.get(url).content
+            img = Image.open(io.BytesIO(image_bytes))
+            images.append(img)
+        elif part["type"] == "video_url":
+            url = part["video_url"]["url"]
+            video_bytes = requests.get(url).content
+            videos.append({
+                "video": video_bytes,
+                "max_frames": 30
+            })
+
+sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
+# TP=2 across the two BI150 cards, with wint8 weight-only quantization.
+llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
+outputs = llm.generate(prompts={
+    "prompt": prompt,
+    "multimodal_data": {
+        "image": images,
+        "video": videos
+    }
+}, sampling_params=sampling_params)
+
+# Print the generated text.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs.text
+    reasoning_text = output.outputs.reasoning_content
+    print(f"generated_text={generated_text}")
+```
+
+## Run the demo
+
+```bash
+./run_demo_vl.sh
+```
+
+The following logs will be printed:
+
+```
+[2025-09-23 10:13:10,844] [ INFO] - Using download source: huggingface
+[2025-09-23 10:13:10,844] [ INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json
+[2025-09-23 10:13:10,845] [ INFO] - Using download source: huggingface
+[2025-09-23 10:13:10,845] [ INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or
+unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
+  warnings.warn(
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy.
However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset +`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. warnings.warn( +INFO 2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready... +Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00, 1.41s/it] +Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00, 6.65it/s] +INFO 2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds. +prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s] +generated_text= +图中的文物是**北齐释迦牟尼佛像**,属于**北齐(公元550年-577年)**的文物。 + +这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。 +``` diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md index 1ece14ea21..67b4acb804 100644 --- a/docs/zh/get_started/installation/iluvatar_gpu.md +++ b/docs/zh/get_started/installation/iluvatar_gpu.md @@ -409,3 +409,148 @@ Accuracy: 0.962 Invaild: 0.000 Latency: 17332.728 s ``` + +# 如何在天数机器上运行ERNIE-4.5-VL-28B-A3B-Paddle model + +## 准备机器 +首先运行ERNIE-4.5-VL-28B-A3B-Paddle模型需要`TP=2`, 所以您需要准备以下配置的机器:: + +| CPU | Memory | Card | Hard Disk| +| :---: | :---: | :---: | :---: | +| x86 | 1TB| 2xBI150| 1TB| + +## 准备镜像 +拉取镜像: + +```bash +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest +``` + +## 准备容器 +### 启动容器 + +```bash +docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest +docker exec -it paddle_infer bash +``` + +/home/paddle 为模型文件、whl包、脚本所在目录。 + +### Install paddle + +```bash +pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ +pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/ +``` +获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) + +### 安装FastDeploy +```bash +pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels +``` + +## 准备推理demo脚本 + +脚本列表如下所示: + +`run_demo_vl.sh`: + +```bash +#!/bin/bash +export PADDLE_XCCL_BACKEND=iluvatar_gpu +export INFERENCE_MSG_QUEUE_ID=232132 +export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 +export FD_SAMPLING_CLASS=rejection +export FD_DEBUG=1 +python3 run_demo_vl.py +``` + +`run_demo_vl.py`: + +```python +import io +import requests +from PIL import Image + +from fastdeploy.entrypoints.llm import LLM +from fastdeploy.engine.sampling_params import SamplingParams +from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer + + +PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle" +tokenizer = 
Ernie4_5Tokenizer.from_pretrained(PATH) + +messages = [ + { + "role": "user", + "content": [ + {"type":"image_url", "image_url": {"url":"https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}}, + {"type":"text", "text":"图中的文物属于哪个年代"} + ] + } +] +prompt = tokenizer.apply_chat_template(messages, tokenize=False) +images, videos = [], [] +for message in messages: + content = message["content"] + if not isinstance(content, list): + continue + for part in content: + if part["type"] == "image_url": + url = part["image_url"]["url"] + image_bytes = requests.get(url).content + img = Image.open(io.BytesIO(image_bytes)) + images.append(img) + elif part["type"] == "video_url": + url = part["video_url"]["url"] + video_bytes = requests.get(url).content + videos.append({ + "video": video_bytes, + "max_frames": 30 + }) + +sampling_params = SamplingParams(temperature=0.1, max_tokens=6400) +llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl") +outputs = llm.generate(prompts={ + "prompt": prompt, + "multimodal_data": { + "image": images, + "video": videos + } +}, sampling_params=sampling_params) +# 输出结果 +for output in outputs: + prompt = output.prompt + generated_text = output.outputs.text + reasoning_text = output.outputs.reasoning_content + print(f"generated_text={generated_text}") +``` + +## 运行demo + +```bash +./run_demo_vl.sh +``` + +打印如下log: + +``` +[2025-09-23 10:13:10,844] [ INFO] - Using download source: huggingface +[2025-09-23 10:13:10,844] [ INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json +[2025-09-23 10:13:10,845] [ INFO] - Using download source: huggingface +[2025-09-23 10:13:10,845] [ INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json +/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or +unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. + warnings.warn( +/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset +`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. warnings.warn( +INFO 2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready... 
+Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00, 1.41s/it] +Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00, 6.65it/s] +INFO 2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds. +prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s] +generated_text= +图中的文物是**北齐释迦牟尼佛像**,属于**北齐(公元550年-577年)**的文物。 + +这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。 +``` diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py index 355400a319..2db2c771e3 100644 --- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py @@ -86,11 +86,19 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_ self.scale = 1.0 / sqrt(head_dim) self.num_layers = fd_config.model_config.num_hidden_layers self.dtype = paddle.get_default_dtype() + self.enable_mm = fd_config.model_config.enable_mm def init_attention_metadata(self, forward_meta: ForwardMeta): """Initialize attntion metadata hence all layers in the forward pass can reuse it.""" - self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :] - self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :] + if self.enable_mm: + # VL: TODO: The first 0 may need to be replaced with batch_id + # of max_num_seqs when running multiple batch case later + self.rope_cos = forward_meta.rotary_embs[0, 0, 0, :, :, :] + self.rope_sin = forward_meta.rotary_embs[0, 1, 0, :, :, :] + else: + # text + self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :] + self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :] self.prefill_info_dict = {} self.decode_info_dict = {} self.prefill_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_encoder)[0] @@ -115,7 +123,9 @@ def init_attention_metadata(self, forward_meta: ForwardMeta): self.prefill_info_dict["cu_seqlens_q"][1:] = forward_meta.seq_lens_encoder[ self.prefill_info_dict["batch_ids"], 0 ] - self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(self.prefill_info_dict["cu_seqlens_q"]) + self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum( + self.prefill_info_dict["cu_seqlens_q"], dtype="int32" + ) self.tmp_buffer = paddle.zeros( [self.prefill_num_tokens + self.decode_len, self.hidden_dim], dtype=self.dtype diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py index eb1d65695a..9859ea9154 100644 --- a/fastdeploy/model_executor/layers/rotary_embedding.py +++ b/fastdeploy/model_executor/layers/rotary_embedding.py @@ -411,6 +411,9 @@ def __call__(self, position_ids): rot_emb[0] = cos_thw rot_emb[1] = sin_thw + if current_platform.is_iluvatar(): + rot_emb = paddle.stack([rot_emb, rot_emb], axis=-1).reshape([2, 1, self.max_position, 1, self.rotary_dim]) + return rot_emb diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py 
b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py index 1933db2b0f..de5366e67b 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py @@ -35,6 +35,7 @@ from fastdeploy.model_executor.layers.utils import divide, get_tensor from fastdeploy.model_executor.utils import set_weight_attrs +from fastdeploy.platforms import current_platform from .activation import ACT2FN from .configuration import DFNRopeVisionTransformerConfig @@ -174,7 +175,7 @@ def __init__( mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), weight_attr=None, has_bias=True, - fuse_matmul_bias=True, + fuse_matmul_bias=False if current_platform.is_iluvatar() else True, gather_output=False, ) self.proj = RowParallelLinear( diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/image_op.py b/fastdeploy/model_executor/models/ernie4_5_vl/image_op.py index 4324d921f3..e5fbb3be3f 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/image_op.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/image_op.py @@ -26,6 +26,11 @@ text_image_gather_scatter, text_image_index_out, ) +elif current_platform.is_iluvatar(): + from fastdeploy.model_executor.ops.iluvatar import ( + text_image_gather_scatter, + text_image_index_out, + ) else: raise ImportError("Unsupported platform, only support CUDA and XPU") diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py b/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py index 90ab12c712..8a54b0d2cf 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py @@ -31,6 +31,7 @@ scatter_axis, ) from fastdeploy.model_executor.utils import set_weight_attrs +from fastdeploy.platforms import current_platform class ScatterOp(PyLayer): @@ -172,7 +173,7 @@ def __init__( self.spatial_dim, input_is_parallel=True, has_bias=True, - fuse_matmul_bias=True, + fuse_matmul_bias=False if current_platform.is_iluvatar() else True, ) if self.tensor_parallel_degree > 1 else nn.Linear(self.spatial_dim, self.spatial_dim) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 4711684bb7..4917e9cadc 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -44,7 +44,10 @@ from fastdeploy.platforms import current_platform if current_platform.is_iluvatar(): - from fastdeploy.model_executor.ops.iluvatar import set_value_by_flags_and_idx + from fastdeploy.model_executor.ops.iluvatar import ( + set_data_ipc, + set_value_by_flags_and_idx, + ) recover_decode_task = None share_external_data = None diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index 5ea6408be7..ca9adae0e1 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -14,6 +14,8 @@ # limitations under the License. 
""" +import paddle + from fastdeploy import envs from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention import IluvatarAttnBackend @@ -36,6 +38,19 @@ def __init__( assert self.guided_backend is None, "Iluvatar does not support guided decoding" assert not envs.ENABLE_V1_KVCACHE_SCHEDULER, "Iluvatar does not support v1 kvcache scheduler" assert not self.cache_config.enable_prefix_caching, "Iluvatar does not support prefix caching" + if self.enable_mm: + assert ( + not self.cache_config.enable_chunked_prefill + ), "Iluvatar does not support chunked prefill for VL model" + # VL neox style = True + if self.enable_mm: + emb_shape = self.share_inputs["rope_emb"].shape + emb_shape[-1] *= 2 + self.share_inputs["rope_emb"] = paddle.full( + shape=emb_shape, + fill_value=0, + dtype="float32", + ) def initialize_attn_backend(self) -> None: """ diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py index c1b0605887..f8501124db 100644 --- a/fastdeploy/worker/iluvatar_worker.py +++ b/fastdeploy/worker/iluvatar_worker.py @@ -40,6 +40,8 @@ def __init__( local_rank: int, rank: int, ): + if fd_config.model_config.enable_mm: + paddle.set_flags({"FLAGS_enable_ixattnbkd": True, "FLAGS_enable_ixdnn_attn": False}) super(IluvatarWorker, self).__init__( fd_config=fd_config, local_rank=local_rank, diff --git a/requirements_iluvatar.txt b/requirements_iluvatar.txt index d481e3febb..858c0d51ea 100644 --- a/requirements_iluvatar.txt +++ b/requirements_iluvatar.txt @@ -10,7 +10,7 @@ tqdm pynvml uvicorn==0.29.0 fastapi -paddleformers +paddleformers==0.3.0 redis etcd3 httpx