diff --git a/src/models.js b/src/models.js
index f05a58665..e870a74a0 100644
--- a/src/models.js
+++ b/src/models.js
@@ -7842,6 +7842,7 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
     ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
     ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]],
     ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
+    ['qwen2_vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
 ]);
 
 const MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
diff --git a/src/pipelines.js b/src/pipelines.js
index 19272706d..6d6b33d29 100644
--- a/src/pipelines.js
+++ b/src/pipelines.js
@@ -42,6 +42,7 @@ import {
     AutoModelForDepthEstimation,
     AutoModelForImageFeatureExtraction,
     PreTrainedModel,
+    AutoModelForImageTextToText,
 } from './models.js';
 import {
     AutoProcessor,
@@ -1990,6 +1991,85 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe
     }
 }
 
+/**
+ * @callback ImageTextToTextPipelineCallback Generate text based on the image(s) and text prompt(s) passed as inputs.
+ * @param {ImagePipelineInputs} images The images to be processed.
+ * TODO: support chat inputs (Chat|Chat[])
+ * @param {string|string[]} texts The text to be combined with the image. If a list of strings is passed, the length of the list should be the same as the number of images.
+ * @param {Partial<import('./generation/configuration_utils.js').GenerationConfig>} [generate_kwargs] Additional keyword arguments to pass along to the generate method of the model.
+ * @returns {Promise} An array of objects containing the generated text(s).
+ *
+ * @typedef {TextImagePipelineConstructorArgs & ImageTextToTextPipelineCallback & Disposable} ImageTextToTextPipelineType
+ */
+
+/**
+ * Image Text To Text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text from a combination of an image and a text prompt.
+ *
+ * **Example:** TODO
+ */
+export class ImageTextToTextPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ImageTextToTextPipelineType} */ (Pipeline)) {
+
+    /**
+     * Create a new ImageTextToTextPipeline.
+     * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
+     */
+    constructor(options) {
+        super(options);
+    }
+
+    /** @type {ImageTextToTextPipelineCallback} */
+    async _call(images, texts, generate_kwargs = {}) {
+        const isBatchedImages = Array.isArray(images);
+        const isBatchedTexts = Array.isArray(texts);
+        const isBatched = isBatchedImages && isBatchedTexts;
+
+        if (isBatchedImages !== isBatchedTexts) {
+            throw Error("ImageTextToTextPipeline: If images are batched, texts must also be batched and vice versa.");
+        }
+
+        if (isBatched && images.length !== texts.length) {
+            throw Error("ImageTextToTextPipeline: If the images and texts are batched, they must have the same length.");
+        }
+
+        if (isBatched) {
+            // TODO: support batches
+            throw Error("ImageTextToTextPipeline: Batching is not supported yet.");
+        }
+
+        // if (isChat(texts) || isChat(images)) {
+        //     // TODO: support chat
+        //     throw Error("ImageTextToTextPipeline: Chat is not supported yet.");
+        // }
+        texts = !Array.isArray(texts) ? [texts] : texts;
+        const preparedTexts = texts.map((text) => {
+            const conversation = [
+                {
+                    role: "user",
+                    content: "<|image_pad|>"
+                },
+                {
+                    role: "user",
+                    content: text
+                }
+            ];
+            return this.processor.apply_chat_template(conversation, { add_generation_prompt: true, tokenize: false });
+        });
+        const preparedImages = await prepareImages(images);
+
+        // const image_inputs = await this.processor(preparedImages);
+        // const text_inputs = await this.tokenizer(preparedTexts);
+        const inputs = await this.processor(preparedTexts, preparedImages);
+
+        const outputs = await this.model.generate({ max_new_tokens: 128, ...inputs, ...generate_kwargs }); // max_new_tokens is a default; caller-provided generate_kwargs take precedence
+
+        const decoded = this.tokenizer.batch_decode(/** @type {Tensor} */(outputs), {
+            skip_special_tokens: true,
+        });
+
+        return decoded.map(x => ({ generated_text: x.trim() }));
+    }
+}
+
 /**
  * @typedef {Object} ImageClassificationSingle
  * @property {string} label The label identified by the model.
@@ -3203,7 +3283,16 @@ const SUPPORTED_TASKS = Object.freeze({
         },
         "type": "multimodal",
     },
-
+    "image-text-to-text": {
+        "tokenizer": AutoTokenizer,
+        "pipeline": ImageTextToTextPipeline,
+        "model": AutoModelForImageTextToText,
+        "processor": AutoProcessor,
+        "default": {
+            "model": "onnx-community/Qwen2-VL-2B-Instruct"
+        },
+        "type": "multimodal",
+    },
     "image-classification": {
         // no tokenizer
         "pipeline": ImageClassificationPipeline,
@@ -3375,6 +3464,7 @@ const TASK_ALIASES = Object.freeze({
  * - `"image-classification"`: will return a `ImageClassificationPipeline`.
  * - `"image-segmentation"`: will return a `ImageSegmentationPipeline`.
  * - `"image-to-text"`: will return a `ImageToTextPipeline`.
+ * - `"image-text-to-text"`: will return a `ImageTextToTextPipeline`.
  * - `"object-detection"`: will return a `ObjectDetectionPipeline`.
  * - `"question-answering"`: will return a `QuestionAnsweringPipeline`.
  * - `"summarization"`: will return a `SummarizationPipeline`.
diff --git a/tests/pipelines/test_pipelines_image_text_to_text.js b/tests/pipelines/test_pipelines_image_text_to_text.js
new file mode 100644
index 000000000..45f8b0abc
--- /dev/null
+++ b/tests/pipelines/test_pipelines_image_text_to_text.js
@@ -0,0 +1,38 @@
+import { pipeline, ImageTextToTextPipeline } from "../../src/transformers.js";
+
+import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js";
+import { load_cached_image } from "../asset_cache.js";
+
+const PIPELINE_ID = "image-text-to-text";
+
+export default () => {
+  describe("Image Text to Text", () => {
+    const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
+    // TODO: Looks like this model is too big and is triggering a timeout. Use a smaller model.
+    /** @type {ImageTextToTextPipeline} */
+    let pipe;
+    let images;
+    let texts;
+    beforeAll(async () => {
+      pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS);
+      images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]);
+      texts = ["What is the color of the image?", "What is the color of the image?"];
+    }, MAX_MODEL_LOAD_TIME);
+
+    it("should be an instance of ImageTextToTextPipeline", () => {
+      expect(pipe).toBeInstanceOf(ImageTextToTextPipeline);
+    });
+
+    describe("batch_size=1", () => {
+      it("default", async () => {
+        const output = await pipe(images[0], texts[0]);
+        const target = [{ generated_text: "" }]; // TODO: What should I put here? Will depend on the model...
+        expect(output).toEqual(target);
+      }, MAX_TEST_EXECUTION_TIME);
+    });
+
+    afterAll(async () => {
+      await pipe?.dispose();
+    }, MAX_MODEL_DISPOSE_TIME);
+  });
+}
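A minimal usage sketch of the new `image-text-to-text` task added above (outside the diff itself), assuming the default onnx-community/Qwen2-VL-2B-Instruct checkpoint loads; the example image URL and the printed output are illustrative only and will vary by model:

import { pipeline } from "@huggingface/transformers";

// Build the pipeline for the new task (downloads the default model on first use).
const generator = await pipeline("image-text-to-text", "onnx-community/Qwen2-VL-2B-Instruct");

// Single image + single prompt (batching is not supported yet, per the TODO above).
const image = "http://images.cocodataset.org/val2017/000000039769.jpg";
const output = await generator(image, "Describe this image.", { max_new_tokens: 64 });

console.log(output);
// e.g. [{ generated_text: "Two cats are lying on a pink blanket ..." }] (actual text depends on the model)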