diff --git a/paddlemix/examples/ppdocbee2/ppdocbee2_infer.py b/paddlemix/examples/ppdocbee2/ppdocbee2_infer.py
new file mode 100644
index 000000000..ae5463319
--- /dev/null
+++ b/paddlemix/examples/ppdocbee2/ppdocbee2_infer.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import paddle
+
+from paddlemix.models.qwen2_vl import MIXQwen2Tokenizer
+from paddlemix.models.ppdocbee2 import PPDocBee2ForConditionalGeneration
+from paddlemix.processors.qwen2_vl_processing import (
+    Qwen2VLImageProcessor,
+    Qwen2VLProcessor,
+    process_vision_info,
+)
+from paddlemix.utils.log import logger
+
+
+def main(args):
+    paddle.seed(seed=0)
+    compute_dtype = "float16" if args.fp16 else "bfloat16"
+    if "npu" in paddle.get_device():
+        is_bfloat16_supported = True
+    else:
+        is_bfloat16_supported = paddle.amp.is_bfloat16_supported()
+    if compute_dtype == "bfloat16" and not is_bfloat16_supported:
+        logger.warning("bfloat16 is not supported on your device, falling back to float32")
+        compute_dtype = "float32"
+
+    model = PPDocBee2ForConditionalGeneration.from_pretrained(args.model_path, dtype=compute_dtype)
+
+    image_processor = Qwen2VLImageProcessor()
+    tokenizer = MIXQwen2Tokenizer.from_pretrained(args.model_path)
+    processor = Qwen2VLProcessor(image_processor, tokenizer)
+
+    # min_pixels = 256 * 28 * 28  # 200704
+    # max_pixels = 1280 * 28 * 28  # 1003520
+    # processor = Qwen2VLProcessor(image_processor, tokenizer, min_pixels=min_pixels, max_pixels=max_pixels)
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": f"{args.image_file}",
+                },
+                {"type": "text", "text": f"{args.question}"},
+            ],
+        }
+    ]
+
+    # Preparation for inference
+    image_inputs, video_inputs = process_vision_info(messages)
+
+    question = messages[0]["content"][1]["text"]
+    image_pad_token = "<|vision_start|><|image_pad|><|vision_end|>"
+    text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_token}{question}<|im_end|>\n<|im_start|>assistant\n"
+    text = [text]
+
+    inputs = processor(
+        text=text,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pd",
+    )
+
+    if args.benchmark:
+        import time
+
+        start = 0.0
+        total = 0.0
+        for i in range(20):
+            if i >= 10:  # the first 10 iterations are warm-up and are not timed
+                start = time.time()
+            with paddle.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=args.max_new_tokens,
+                    temperature=args.temperature,
+                    top_p=0.001,
+                    top_k=1,
+                )  # the prompt tokens are already trimmed from the output in Paddle
+            output_text = processor.batch_decode(
+                generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+            )
+            if i >= 10:
+                total += time.time() - start
+        print("s/it: ", total / 10)
+        print(f"\nGPU memory_allocated: {paddle.device.cuda.memory_allocated() / 1024 ** 3:.2f} GB")
+        print(f"\nGPU max_memory_allocated: {paddle.device.cuda.max_memory_allocated() / 1024 ** 3:.2f} GB")
+        print(f"\nGPU memory_reserved: {paddle.device.cuda.memory_reserved() / 1024 ** 3:.2f} GB")
+        print(f"\nGPU max_memory_reserved: {paddle.device.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB")
+        print("output_text:\n", output_text[0])
+
+    else:
+        with paddle.no_grad():
+            # Inference: generation of the output
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=args.max_new_tokens,
+                temperature=args.temperature,
+                top_p=0.001,
+                top_k=1,
+            )  # the prompt tokens are already trimmed from the output in Paddle
+        output_text = processor.batch_decode(
+            generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        print("output_text:\n", output_text[0])
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, default="PaddleMIX/PPDocBee-2B-1129")
+    parser.add_argument("--question", type=str, default="识别这份表格的内容")  # i.e. "Recognize the content of this table"
+    parser.add_argument("--image_file", type=str, default="paddlemix/demo_images/medal_table.png")
+    parser.add_argument("--temperature", type=float, default=0.1)
+    parser.add_argument("--max_new_tokens", type=int, default=2048)
+    parser.add_argument("--fp16", action="store_true")
+    parser.add_argument("--benchmark", action="store_true")
+    args = parser.parse_args()
+    main(args)
diff --git a/paddlemix/examples/ppdocbee2/readme.md b/paddlemix/examples/ppdocbee2/readme.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/paddlemix/models/ppdocbee2/__init__.py b/paddlemix/models/ppdocbee2/__init__.py
new file mode 100644
index 000000000..76b76a883
--- /dev/null
+++ b/paddlemix/models/ppdocbee2/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .modeling_ppdocbee2 import PPDocBee2ForConditionalGeneration, PPDocBee2TransformerPretrainedModel
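
Since `readme.md` is added empty here, the following sketch condenses what the example does when used programmatically. Everything in it (checkpoint name, demo image, question, chat template, sampling settings) is taken from the defaults in `ppdocbee2_infer.py` above; it is an illustration for reviewers, not an additional file in this PR:

```python
import paddle

from paddlemix.models.ppdocbee2 import PPDocBee2ForConditionalGeneration
from paddlemix.models.qwen2_vl import MIXQwen2Tokenizer
from paddlemix.processors.qwen2_vl_processing import (
    Qwen2VLImageProcessor,
    Qwen2VLProcessor,
    process_vision_info,
)

MODEL = "PaddleMIX/PPDocBee-2B-1129"  # the script's default --model_path
# The script falls back to float16/float32 depending on device support.
model = PPDocBee2ForConditionalGeneration.from_pretrained(MODEL, dtype="bfloat16")
processor = Qwen2VLProcessor(Qwen2VLImageProcessor(), MIXQwen2Tokenizer.from_pretrained(MODEL))

# One image plus one question, mirroring the script's message structure.
question = "识别这份表格的内容"  # "Recognize the content of this table"
messages = [{"role": "user", "content": [
    {"type": "image", "image": "paddlemix/demo_images/medal_table.png"},
    {"type": "text", "text": question},
]}]
image_inputs, video_inputs = process_vision_info(messages)

# The script builds the Qwen2.5-VL chat template by hand.
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    f"{question}<|im_end|>\n<|im_start|>assistant\n"
)
inputs = processor(text=[prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pd")

with paddle.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=2048, temperature=0.1, top_p=0.001, top_k=1)
print(processor.batch_decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
```

Running the script directly with no flags (`python paddlemix/examples/ppdocbee2/ppdocbee2_infer.py`) uses exactly these defaults.
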
diff --git a/paddlemix/models/ppdocbee2/modeling_ppdocbee2.py b/paddlemix/models/ppdocbee2/modeling_ppdocbee2.py
new file mode 100644
index 000000000..e5d6fed1b
--- /dev/null
+++ b/paddlemix/models/ppdocbee2/modeling_ppdocbee2.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import paddle
+import paddle.nn.functional as F
+
+from ..qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLModel, Qwen2LMHead
+from ..qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+from ..qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
+
+
+class PPDocBee2TransformerPretrainedModel(Qwen2_5_VisionTransformerPretrainedModel):
+    layer_idx = 15  # intermediate block(s) whose output is fused into the final hidden states
+
+    def forward(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tensor) -> paddle.Tensor:
+        """
+        Args:
+            hidden_states (`paddle.Tensor` of shape `(seq_len, patch_dim)`):
+                The flattened image patches, prior to patch embedding.
+            grid_thw (`paddle.Tensor` of shape `(num_images_or_videos, 3)`):
+                The temporal, height, and width extents of each image's feature grid.
+
+        Returns:
+            `paddle.Tensor`: The merged visual features, restored to the original patch order.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+        cu_window_seqlens = paddle.to_tensor(data=cu_window_seqlens, dtype="int32", place=hidden_states.place)
+        cu_window_seqlens = paddle.unique_consecutive(x=cu_window_seqlens)
+        seq_len, _ = tuple(hidden_states.shape)
+        hidden_states = hidden_states.reshape([seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1])
+        hidden_states = hidden_states[window_index, :, :]
+        hidden_states = hidden_states.reshape([seq_len, -1])
+        rotary_pos_emb = rotary_pos_emb.reshape([seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1])
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]
+        rotary_pos_emb = rotary_pos_emb.reshape([seq_len, -1])
+
+        cu_seqlens = paddle.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            axis=0, dtype="int32"
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+
+        multi_vit = []
+        for layer_num, blk in enumerate(self.blocks):
+            if layer_num in self.fullatt_block_indexes:
+                cu_seqlens_now = cu_seqlens
+            else:
+                cu_seqlens_now = cu_window_seqlens
+            if self.enable_recompute and self.training:
+                hidden_states = self.recompute_training_full(blk, hidden_states, cu_seqlens_now, rotary_pos_emb)
+            else:
+                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens_now, rotary_pos_emb=rotary_pos_emb)
+
+            multi_vit.append(hidden_states.clone())  # cache every block's output for the fusion below
+
+        if isinstance(self.layer_idx, int):
+            hidden_states = self.merger(hidden_states + multi_vit[self.layer_idx])
+        elif isinstance(self.layer_idx, list):
+            hidden_states = self.merger(hidden_states + sum(multi_vit[i] for i in self.layer_idx) / len(self.layer_idx))
+        else:
+            raise TypeError(f"layer_idx must be an int or a list of ints, got {type(self.layer_idx)}: {self.layer_idx}")
+
+        reverse_indices = paddle.argsort(x=window_index)
+        hidden_states = hidden_states[reverse_indices, :]
+
+        return hidden_states
+
+
+class PPDocBee2ForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
+    def __init__(self, config, attn_implementation="flash_attention_2"):
+        # Call the grandparent (pretrained-model) __init__ directly so that the parent
+        # class does not build the stock Qwen2.5-VL visual tower; the submodules are
+        # rebuilt below with PPDocBee2TransformerPretrainedModel swapped in as the
+        # vision encoder (with multi-layer feature fusion).
+        super(Qwen2_5_VLForConditionalGeneration, self).__init__(config)
+
+        config._attn_implementation = attn_implementation
+        config.vision_config._attn_implementation = attn_implementation
+
+        self.visual = PPDocBee2TransformerPretrainedModel._from_config(config.vision_config)
+        self.model = Qwen2_5_VLModel(config)
+        self.vocab_size = config.vocab_size
+        if config.tie_word_embeddings:
+            self.lm_head = Qwen2LMHead(config, embedding_weights=self.model.embed_tokens.weight, transpose_y=True)
+            self.tie_weights()
+        else:
+            self.lm_head = Qwen2LMHead(config)
+        self.padding_side = "left"  # defaults to "left"; use the setter to change the padding side
+
+        self.enable_recompute = False
+
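
The substantive change over the stock Qwen2.5-VL vision tower is the fusion step in `PPDocBee2TransformerPretrainedModel.forward`: every block's output is cached in `multi_vit`, and before the patch merger the final hidden states are summed with either one intermediate block's output (`layer_idx` as an int, default 15) or the mean of several (`layer_idx` as a list). A minimal self-contained sketch of just that selection rule on toy tensors (the helper name `fuse_intermediate` is illustrative, not part of this PR):

```python
import paddle


def fuse_intermediate(final, multi_vit, layer_idx):
    # Mirrors the branch in PPDocBee2TransformerPretrainedModel.forward:
    # add one cached block output (int) or the mean of several (list)
    # to the final hidden states before the patch merger.
    if isinstance(layer_idx, int):
        return final + multi_vit[layer_idx]
    if isinstance(layer_idx, list):
        return final + sum(multi_vit[i] for i in layer_idx) / len(layer_idx)
    raise TypeError(f"layer_idx must be an int or a list of ints, got {type(layer_idx)}")


# Toy stand-ins for the cached block outputs, each of shape (seq_len=4, hidden=8).
multi_vit = [paddle.full([4, 8], float(i)) for i in range(32)]
final = multi_vit[-1]

print(fuse_intermediate(final, multi_vit, 15).shape)           # final + block 15
print(fuse_intermediate(final, multi_vit, [7, 15, 23]).shape)  # final + mean of blocks 7, 15, 23
```

Because the fusion is a residual-style sum, the merger's input shape is unchanged, so the fused tower can reuse the original Qwen2.5-VL merger weights as-is.
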