Skip to content

Commit 269891c

Browse files
committed
ocr page
1 parent a1fadd2 commit 269891c

File tree

3 files changed

+102
-6
lines changed

3 files changed

+102
-6
lines changed

xinference/model/image/core.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,12 +159,14 @@ def create_ocr_model_instance(
159159
model_spec: ImageModelFamilyV2,
160160
model_path: Optional[str] = None,
161161
**kwargs,
162-
) -> GotOCR2Model:
162+
):
163163
from .cache_manager import ImageCacheManager
164164

165165
if not model_path:
166166
cache_manager = ImageCacheManager(model_spec)
167167
model_path = cache_manager.cache()
168+
169+
# Use GOT-OCR2 for all OCR models
168170
model = GotOCR2Model(
169171
model_uid,
170172
model_path,

xinference/ui/gradio/media_interface.py

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,7 @@ def __init__(
6363
)
6464

6565
def build(self) -> gr.Blocks:
66-
if self.model_type == "image":
67-
assert "stable_diffusion" in self.model_family
68-
66+
# Remove the stable_diffusion restriction to support OCR models
6967
interface = self.build_main_interface()
7068
interface.queue()
7169
# Gradio initiates the queue during a startup event, but since the app has already been
@@ -1233,9 +1231,98 @@ def tts_generate(
12331231

12341232
return tts_ui
12351233

1234+
def ocr_interface(self) -> "gr.Blocks":
1235+
def extract_text_from_image(
1236+
image: "PIL.Image.Image",
1237+
ocr_type: str = "ocr",
1238+
progress=gr.Progress(),
1239+
) -> str:
1240+
from ...client import RESTfulClient
1241+
1242+
client = RESTfulClient(self.endpoint)
1243+
client._set_token(self.access_token)
1244+
model = client.get_model(self.model_uid)
1245+
assert hasattr(model, "ocr")
1246+
1247+
# Convert PIL image to bytes
1248+
import io
1249+
1250+
buffered = io.BytesIO()
1251+
if image.mode == "RGBA" or image.mode == "CMYK":
1252+
image = image.convert("RGB")
1253+
image.save(buffered, format="PNG")
1254+
image_bytes = buffered.getvalue()
1255+
1256+
progress(0.1, desc="Processing image for OCR")
1257+
1258+
# Call the OCR method with bytes instead of PIL Image
1259+
response = model.ocr(
1260+
image=image_bytes,
1261+
ocr_type=ocr_type,
1262+
)
1263+
1264+
progress(0.8, desc="Extracting text")
1265+
progress(1.0, desc="OCR complete")
1266+
1267+
return response if response else "No text extracted from the image."
1268+
1269+
with gr.Blocks() as ocr_interface:
1270+
gr.Markdown(f"### OCR Text Extraction with {self.model_name}")
1271+
1272+
with gr.Row():
1273+
with gr.Column(scale=1):
1274+
image_input = gr.Image(
1275+
type="pil",
1276+
label="Upload Image for OCR",
1277+
interactive=True,
1278+
height=400,
1279+
)
1280+
1281+
gr.Markdown(f"**Current OCR Model:** {self.model_name}")
1282+
1283+
ocr_type = gr.Dropdown(
1284+
choices=["ocr", "format"],
1285+
value="ocr",
1286+
label="OCR Type",
1287+
info="Choose OCR processing type",
1288+
)
1289+
1290+
extract_btn = gr.Button("Extract Text", variant="primary")
1291+
1292+
with gr.Column(scale=1):
1293+
text_output = gr.Textbox(
1294+
label="Extracted Text",
1295+
lines=20,
1296+
placeholder="Extracted text will appear here...",
1297+
interactive=True,
1298+
show_copy_button=True,
1299+
)
1300+
1301+
# Examples section
1302+
gr.Markdown("### Examples")
1303+
gr.Examples(
1304+
examples=[
1305+
# You can add example image paths here if needed
1306+
],
1307+
inputs=[image_input],
1308+
label="Example Images",
1309+
)
1310+
1311+
# Extract button click event
1312+
extract_btn.click(
1313+
fn=extract_text_from_image,
1314+
inputs=[image_input, ocr_type],
1315+
outputs=[text_output],
1316+
)
1317+
1318+
return ocr_interface
1319+
12361320
def build_main_interface(self) -> "gr.Blocks":
12371321
if self.model_type == "image":
1238-
title = f"🎨 Xinference Stable Diffusion: {self.model_name} 🎨"
1322+
if "ocr" in self.model_ability:
1323+
title = f"🔍 Xinference OCR: {self.model_name} 🔍"
1324+
else:
1325+
title = f"🎨 Xinference Stable Diffusion: {self.model_name} 🎨"
12391326
elif self.model_type == "video":
12401327
title = f"🎨 Xinference Video Generation: {self.model_name} 🎨"
12411328
else:
@@ -1266,6 +1353,9 @@ def build_main_interface(self) -> "gr.Blocks":
12661353
</div>
12671354
"""
12681355
)
1356+
if "ocr" in self.model_ability:
1357+
with gr.Tab("OCR"):
1358+
self.ocr_interface()
12691359
if "text2image" in self.model_ability:
12701360
with gr.Tab("Text to Image"):
12711361
self.text2image_interface()

xinference/ui/web/ui/src/scenes/register_model/registerModel.js

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ const model_ability_options = [
5858
'Hybrid',
5959
],
6060
},
61+
{
62+
type: 'image',
63+
options: ['ocr'],
64+
},
6165
{
6266
type: 'audio',
6367
options: ['text2audio', 'audio2text'],
@@ -76,7 +80,7 @@ const messages = [
7680
const model_family_options = [
7781
{
7882
type: 'image',
79-
options: ['stable_diffusion'],
83+
options: ['stable_diffusion', 'ocr'],
8084
},
8185
{
8286
type: 'audio',

0 commit comments

Comments (0)