docs: Add docstrings and usage examples to Moondream models

BharathC0 · BharathC0 · commit 95e3804acb39 · 2026-01-30T21:49:56.000+05:30
diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py
@@ -458,6 +458,15 @@
 from keras_hub.src.models.mobilenetv5.mobilenetv5_image_classifier_preprocessor import (
     MobileNetV5ImageClassifierPreprocessor as MobileNetV5ImageClassifierPreprocessor,
 )
+from keras_hub.src.models.moondream.moondream_backbone import (
+    MoondreamBackbone as MoondreamBackbone,
+)
+from keras_hub.src.models.moondream.moondream_causal_lm import (
+    MoondreamCausalLM as MoondreamCausalLM,
+)
+from keras_hub.src.models.moondream.moondream_preprocessor import (
+    MoondreamPreprocessor as MoondreamPreprocessor,
+)
 from keras_hub.src.models.moonshine.moonshine_audio_to_text import (
     MoonshineAudioToText as MoonshineAudioToText,
 )
diff --git a/keras_hub/src/models/moondream/__init__.py b/keras_hub/src/models/moondream/__init__.py
@@ -1,3 +1,4 @@
 from keras_hub.src.models.moondream.moondream_backbone import MoondreamBackbone
-from keras_hub.src.models.moondream.moondream_preprocessor import \
-    MoondreamPreprocessor
+from keras_hub.src.models.moondream.moondream_preprocessor import (
+    MoondreamPreprocessor,
+)
diff --git a/keras_hub/src/models/moondream/moondream_backbone.py b/keras_hub/src/models/moondream/moondream_backbone.py
@@ -7,52 +7,117 @@
 
 @keras_hub_export("keras_hub.models.MoondreamBackbone")
 class MoondreamBackbone(Backbone):
-    def __init__(self, vision_encoder, text_decoder, projection_dim=2048, **kwargs):
-        super().__init__(**kwargs)
+    """
+    The Moondream Backbone model.
+
+    This model connects a vision encoder (SigLIP) and a text decoder (Phi-1.5)
+    using a projection layer. It is designed for vision-language tasks where
+    image features are projected into the text embedding space.
+
+    Args:
+        vision_encoder: A Keras model (e.g., SigLIP). The vision encoder
+            responsible for processing input images.
+        text_decoder: A Keras model (e.g., Phi-1.5). The text decoder
+            responsible for generating text tokens.
+        projection_dim: int. The dimension to project image features into.
+            Defaults to `2048`.
+        **kwargs: Standard Keras keyword arguments.
+
+    Example:
+    ```python
+    import keras
+    import numpy as np
+    from keras_hub.src.models.moondream.moondream_backbone import (
+        MoondreamBackbone
+    )
+
+    # 1. Create Mock Encoders
+    # Vision Encoder: Maps (378, 378, 3) -> (729, 1152)
+    image_input = keras.Input(shape=(378, 378, 3))
+    vision_output = keras.layers.Lambda(
+        lambda x: keras.ops.ones((keras.ops.shape(x)[0], 729, 1152))
+    )(image_input)
+    vision_encoder = keras.Model(inputs=image_input, outputs=vision_output)
+
+    # Text Decoder: Maps (Seq,) -> (Seq, 2048)
+    text_input = keras.Input(shape=(None,), dtype="int32")
+    text_output = keras.layers.Lambda(
+        lambda x: keras.ops.ones(
+            (keras.ops.shape(x)[0], keras.ops.shape(x)[1], 2048)
+        )
+    )(text_input)
+    text_decoder = keras.Model(inputs=text_input, outputs=text_output)
+
+    # Helper for embeddings
+    text_decoder.get_input_embeddings = lambda x: keras.layers.Embedding(
+        50000, 2048
+    )(x)
 
+    # 2. Instantiate Backbone
+    backbone = MoondreamBackbone(
+        vision_encoder=vision_encoder,
+        text_decoder=text_decoder,
+        projection_dim=2048
+    )
+
+    # 3. Run Forward Pass
+    inputs = {
+        "images": np.random.rand(2, 378, 378, 3),
+        "token_ids": np.random.randint(0, 50000, (2, 10)),
+        "padding_mask": np.ones((2, 10))
+    }
+    outputs = backbone(inputs)
+    ```
+    """
+
+    def __init__(
+        self, vision_encoder, text_decoder, projection_dim=2048, **kwargs
+    ):
+        super().__init__(**kwargs)
         self.vision_encoder = vision_encoder
         self.text_decoder = text_decoder
+        self.projection_dim = projection_dim
 
-        # The Connector
         self.vision_projection = keras.layers.Dense(
             projection_dim, name="vision_projection"
         )
 
-    def call(self, inputs):
-        images = inputs["images"]
-        token_ids = inputs["token_ids"]
-        padding_mask = inputs["padding_mask"]
+        images = keras.Input(shape=(None, None, 3), name="images")
+        token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids")
+        padding_mask = keras.Input(
+            shape=(None,), dtype="int32", name="padding_mask"
+        )
+
+        inputs = {
+            "images": images,
+            "token_ids": token_ids,
+            "padding_mask": padding_mask,
+        }
 
-        # 1. Image Features
         image_features = self.vision_encoder(images)
-
-        # 2. Project
         projected_images = self.vision_projection(image_features)
 
-        # 3. Text Embeddings
         text_embeddings = self.text_decoder.get_input_embeddings(token_ids)
 
-        # 4. Concatenate
         combined_embeddings = ops.concatenate(
             [projected_images, text_embeddings], axis=1
         )
 
-        # 5. Masking
         batch_size = ops.shape(images)[0]
         num_patches = ops.shape(projected_images)[1]
 
-        image_mask = ops.ones((batch_size, num_patches), dtype="bool")
+        image_mask = ops.ones((batch_size, num_patches), dtype="int32")
         combined_mask = ops.concatenate([image_mask, padding_mask], axis=1)
 
-        # 6. Decoder Pass
-        # Now compatible with our Subclass Mock Decoder
         outputs = self.text_decoder(
             inputs=None,
             decoder_inputs_embeds=combined_embeddings,
             padding_mask=combined_mask,
         )
 
-        return outputs
+        super(MoondreamBackbone, self).__init__(
+            inputs=inputs, outputs=outputs, **kwargs
+        )
 
     def get_config(self):
         config = super().get_config()
@@ -61,8 +126,10 @@ def get_config(self):
                 "vision_encoder": keras.saving.serialize_keras_object(
                     self.vision_encoder
                 ),
-                "text_decoder": keras.saving.serialize_keras_object(self.text_decoder),
-                "projection_dim": self.vision_projection.units,
+                "text_decoder": keras.saving.serialize_keras_object(
+                    self.text_decoder
+                ),
+                "projection_dim": self.projection_dim,
             }
         )
         return config
diff --git a/keras_hub/src/models/moondream/moondream_causal_lm.py b/keras_hub/src/models/moondream/moondream_causal_lm.py
@@ -1,14 +1,69 @@
-import keras
-
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.causal_lm import CausalLM
 from keras_hub.src.models.moondream.moondream_backbone import MoondreamBackbone
-from keras_hub.src.models.moondream.moondream_preprocessor import \
-    MoondreamPreprocessor
+from keras_hub.src.models.moondream.moondream_preprocessor import (
+    MoondreamPreprocessor,
+)
 
 
 @keras_hub_export("keras_hub.models.MoondreamCausalLM")
 class MoondreamCausalLM(CausalLM):
+    """
+    An end-to-end Moondream model for causal language modeling.
+
+    This model wraps `MoondreamBackbone` and handles the complete flow from
+    raw inputs (images + text) to generated text output. It provides a
+    high-level interface for image captioning and visual question answering.
+
+    Args:
+        backbone: A `MoondreamBackbone` instance. The backbone model that
+            connects the vision encoder and text decoder.
+        preprocessor: A `MoondreamPreprocessor` instance or `None`. Handles
+            data preprocessing (tokenization and image resizing).
+        **kwargs: Standard Keras keyword arguments.
+
+    Example:
+    ```python
+    import keras
+    import numpy as np
+    from keras_hub.src.models.moondream.moondream_backbone import (
+        MoondreamBackbone
+    )
+    from keras_hub.src.models.moondream.moondream_causal_lm import (
+        MoondreamCausalLM
+    )
+
+    # 1. Setup Mock Backbone
+    images = keras.Input(shape=(None, None, 3), name="images")
+    token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids")
+    padding_mask = keras.Input(
+        shape=(None,), dtype="int32", name="padding_mask"
+    )
+
+    outputs = keras.layers.Embedding(100, 2048)(token_ids)
+
+    backbone = keras.Model(
+        inputs={
+            "images": images,
+            "token_ids": token_ids,
+            "padding_mask": padding_mask
+        },
+        outputs=outputs
+    )
+
+    # 2. Instantiate CausalLM
+    model = MoondreamCausalLM(backbone=backbone)
+
+    # 3. Run Forward Pass
+    inputs = {
+        "images": np.random.rand(2, 378, 378, 3),
+        "token_ids": np.random.randint(0, 100, (2, 10)),
+        "padding_mask": np.ones((2, 10))
+    }
+    outputs = model(inputs)
+    ```
+    """
+
     backbone_cls = MoondreamBackbone
     preprocessor_cls = MoondreamPreprocessor
 
@@ -18,20 +73,14 @@ def __init__(
         preprocessor=None,
         **kwargs,
     ):
-        inputs = getattr(backbone, "input", None)
+        inputs = backbone.input
+        outputs = backbone(inputs)
 
-        super().__init__(**kwargs)
+        super().__init__(
+            inputs=inputs,
+            outputs=outputs,
+            **kwargs,
+        )
 
-        # Manually set the attributes
         self.backbone = backbone
         self.preprocessor = preprocessor
-
-        # Set tensor spec if available
-        if inputs is not None:
-            self.input_tensor_spec = inputs
-
-    def call(self, inputs, training=False):
-        if self.backbone is None:
-            raise ValueError("Backbone not initialized")
-        x = self.backbone(inputs)
-        return x
diff --git a/keras_hub/src/models/moondream/moondream_preprocessor.py b/keras_hub/src/models/moondream/moondream_preprocessor.py
@@ -6,6 +6,63 @@
 
 @keras_hub_export("keras_hub.models.MoondreamPreprocessor")
 class MoondreamPreprocessor(CausalLMPreprocessor):
+    """
+    Moondream Causal LM Preprocessor.
+
+    This class handles the preprocessing of images and text for the Moondream
+    model. It combines image resizing/rescaling logic with text tokenization
+    to prepare inputs for the model.
+
+    Args:
+        tokenizer: The tokenizer to be used for text inputs.
+        image_converter: An optional layer or callable for image preprocessing
+            (e.g., resizing, normalization).
+        sequence_length: int. The context length for tokenization.
+            Defaults to `1024`.
+        add_start_token: bool. Whether to add the start token.
+            Defaults to `True`.
+        add_end_token: bool. Whether to add the end token.
+            Defaults to `True`.
+        **kwargs: Standard Keras keyword arguments.
+
+    Example:
+    ```python
+    import keras
+    import numpy as np
+    from keras_hub.src.models.moondream.moondream_preprocessor import (
+        MoondreamPreprocessor
+    )
+
+    # 1. Create a Mock Tokenizer
+    class MockTokenizer:
+        def __call__(self, x):
+            return keras.ops.convert_to_tensor([[1, 2, 3]] * len(x))
+
+        def detokenize(self, x):
+            return x
+
+    tokenizer = MockTokenizer()
+
+    # 2. Create an Image Converter
+    image_converter = keras.layers.Resizing(height=378, width=378)
+
+    # 3. Instantiate Preprocessor
+    preprocessor = MoondreamPreprocessor(
+        tokenizer=tokenizer,
+        image_converter=image_converter,
+        sequence_length=128
+    )
+
+    # 4. Preprocess Data
+    inputs = {
+        "images": np.random.randint(0, 255, (2, 500, 500, 3)),
+        "text": ["Describe this image.", "What is in the photo?"]
+    }
+
+    outputs = preprocessor(inputs)
+    ```
+    """
+
     def __init__(
         self,
         tokenizer,
@@ -25,23 +82,45 @@ def __init__(
         self.image_converter = image_converter
 
     def call(self, x, y=None, sample_weight=None):
-        output = super().call(x, y, sample_weight)
+        if isinstance(x, dict):
+            text_input = x.get("text", "")
+            images = x.get("images", None)
+        else:
+            text_input = x
+            images = None
+
+        output = super().call(text_input, y=y, sample_weight=sample_weight)
 
-        # 1. Identify the input dictionary from the output
-        # If output is a tuple (x, y, sw), the first element is the input dict.
         if isinstance(output, tuple):
             x_out = output[0]
         else:
             x_out = output
 
-        # 2. Type Guard for Pylance
-        # We explicitly check if x_out IS a dictionary.
-        # This stops Pylance from thinking it might be a Tuple/List.
-        if isinstance(x_out, dict) and isinstance(x, dict) and "images" in x:
-            images = x["images"]
+        if images is not None:
+            if self.image_converter:
+                images = self.image_converter(images)
+
+            if isinstance(x_out, dict):
+                x_out["images"] = images
+
+        return output
+
+    def generate_preprocess(self, x, sequence_length=None):
+        if isinstance(x, dict):
+            text_input = x.get("text", "")
+            images = x.get("images", None)
+        else:
+            text_input = x
+            images = None
+
+        output = super().generate_preprocess(
+            text_input, sequence_length=sequence_length
+        )
+
+        if images is not None:
             if self.image_converter:
                 images = self.image_converter(images)
-            x_out["images"] = images
+            output["images"] = images
 
         return output