Commit 4d3aa23

Merge pull request #1718 from oracle-devrel/ocr-llm-demo
Added ocr-llm-demo
2 parents 5b0839e + c8f7ecc commit 4d3aa23

File tree: 9 files changed (+666, -0 lines changed)

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
Copyright (c) 2021, 2023 Oracle and/or its affiliates.

The Universal Permissive License (UPL), Version 1.0

Subject to the condition set forth below, permission is hereby granted to any
person obtaining a copy of this software, associated documentation and/or data
(collectively the "Software"), free of charge and under any and all copyright
rights in the Software, and any and all patent rights owned or freely
licensable by each licensor hereunder covering either (i) the unmodified
Software as contributed to or provided by such licensor, or (ii) the Larger
Works (as defined below), to deal in both

(a) the Software, and
(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
one is included with the Software (each a "Larger Work" to which the Software
is contributed by such licensors),

without restriction, including without limitation the rights to copy, create
derivative works of, display, perform, and distribute the Software and make,
use, sell, offer for sale, import, export, have made, and have sold the
Software and the Larger Work(s), and to sublicense the foregoing rights on
either these or other terms.

This license is subject to the following condition:

The above copyright notice and either this complete permission notice or at
a minimum a reference to the UPL must be included in all copies or
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
# ocr-llm-demo

How to quickly set up a demo that showcases the OCR capabilities of multimodal LLMs.

The new generation of open multimodal LLMs is a very good fit for complex OCR workloads.
Many of the functionalities that would require fine-tuning with traditional OCR models can now be achieved with prompt engineering (the demo's default query, for instance, is simply "Extract text from picture precisely as JSON").
Multilingual support and the ability to recognize handwriting are some of the features that can be used to improve OCR workloads.

## Prerequisites of the demo

To download the model weights you will need a Hugging Face access token.
The demo was run on Ubuntu 24.04, but it should be possible to run it on other Ubuntu versions.
You need to install:

- the CUDA toolkit and NVIDIA driver
- Anaconda or Miniconda
- poppler, needed by `pdf2image` for PDF conversion: `sudo apt-get install poppler-utils`

### Install and activate

```
conda env create -f ocr-llm.yaml
conda activate ocr-llm
```

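After activating the environment, you can quickly confirm that the NVIDIA driver and CUDA toolkit are visible from Python. A minimal sketch, assuming PyTorch is pulled into the environment as a vLLM dependency:

```
import torch

# Expect True and a device count of at least 1 on a GPU shape.
print(torch.cuda.is_available(), torch.cuda.device_count())
```
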
## LLM Models

In this demo we use vLLM to serve multimodal models through the OpenAI API. We tested Pixtral-12B on a VM with 2 A10 GPUs, and Qwen2-VL on a single A10.
Llama-3.2-11B-Vision-Instruct is also an option.

You first need to log in to Hugging Face to download the weights:

```
huggingface-cli login
```

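If you prefer to authenticate from a provisioning script rather than the interactive CLI, the `huggingface_hub` package exposes the same login programmatically. A minimal sketch; the token value below is a placeholder for your own access token:

```
from huggingface_hub import login

login(token="hf_xxx")  # placeholder: use your own Hugging Face access token
```
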
Then serve one of the vLLM-supported vision models. Depending on the number of GPUs in your shape, you might be able to execute one or more models concurrently. For Llama-3.2, access in Europe is currently restricted.

```
vllm serve mistralai/Pixtral-12B-2409 --dtype auto --tokenizer-mode mistral -tp 2 --port 8001 --max-model-len 32768

vllm serve Qwen/Qwen2-VL-7B-Instruct --dtype auto --max-model-len 8192 --enforce-eager --port 8000

vllm serve meta-llama/Llama-3.2-11B-Vision-Instruct --dtype auto --port 8002 --max-model-len 32768
```

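Once a server is up, you can smoke-test the endpoint before launching the GUI. A minimal sketch against the Qwen2-VL server above on port 8000, mirroring the request that `gui.py` sends; the image URL is a placeholder, and vLLM ignores the API key by default:

```
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Extract text from picture precisely as JSON"},
            {"type": "image_url", "image_url": {"url": "https://example.com/receipt.png"}},
        ],
    }],
)
print(response.choices[0].message.content)
```
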
## Sample images

The folder `pictures` includes some example pictures that can be used in the demo. You can add additional images to improve the demo.
The formats supported by the LLMs are PNG, JPG, WEBP, and non-animated GIF. I also added automated conversion for PDF files, but for a multipage PDF only the first page will be considered (see the sketch below).

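The PDF handling follows what `gui.py` does with `pdf2image`: render the pages to PIL images, keep the first one, and pass it on as a base64-encoded JPEG. A minimal sketch; the file name is a placeholder:

```
import base64
from io import BytesIO

from pdf2image import convert_from_path  # requires the poppler binaries

pages = convert_from_path("sample.pdf")   # one PIL image per page
buffer = BytesIO()
pages[0].save(buffer, format="JPEG")      # only the first page is kept
base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
```
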
## Running the GUI

A Gradio-based GUI is available:

```
python gui.py
```

Gradio is configured to proxy to a public connection, similar to the following one:

![Alt text](files/gui.png?raw=true "GUI")

## Executing Qwen-2.5-VL as a backend API

Qwen-2.5-VL models are now supported by vLLM, but you might still need to install transformers from the GitHub repo.

You can execute the 72B model on the 8 A100 GPUs of a BM.GPU.4.8 shape with:

```
vllm serve Qwen/Qwen2.5-VL-72B-Instruct --dtype auto -tp 8 --port 9193
```

Inference time is 40-50 seconds.

It can also be executed on a BM.GPU.L40s.4 by limiting the context length:

```
vllm serve Qwen/Qwen2.5-VL-72B-Instruct --dtype auto -tp 4 --port 9193 --max-model-len 16000 --enforce-eager
```

Inference time is about 60 seconds.

The 7B model can be executed on 2 GPUs:

```
vllm serve Qwen/Qwen2.5-VL-7B-Instruct --dtype auto -tp 2 --port 9192
```

Binary file (604 KB) not shown.
Lines changed: 212 additions & 0 deletions
@@ -0,0 +1,212 @@
import base64
import os
import shutil
from io import BytesIO

import gradio as gr
import magic
import validators
from gradio_pdf import PDF
from openai import OpenAI
from pdf2image import convert_from_path
from PIL import Image as Pil

# Map each GUI model label to the served model name and its local vLLM port.
MODEL_PORTS = {
    "Pixtral-12B": ("mistralai/Pixtral-12B-2409", "8001"),
    "Qwen2-VL": ("Qwen/Qwen2-VL-7B-Instruct", "8000"),
    "Qwen2.5-VL": ("Qwen/Qwen2.5-VL-7B-Instruct", "9192"),
    "Qwen2.5-VL-72B": ("Qwen/Qwen2.5-VL-72B-Instruct", "9193"),
    "Llama-3.2-Vision": ("meta-llama/Llama-3.2-11B-Vision-Instruct", "8002"),
}

uploaded_files = set()

def upload_file(file):
    global uploaded_files
    if file.name in uploaded_files:
        return
    UPLOAD_FOLDER = "./pictures"
    shutil.copy(file, UPLOAD_FOLDER)
    gr.Info("File uploaded", duration=2)
    uploaded_files.add(file.name)

def update_file_explorer_2():
    return gr.FileExplorer(root_dir="./pictures")

def upload_file2(file):
    UPLOAD_FOLDER = "./pictures"
    shutil.copy(file, UPLOAD_FOLDER)
    gr.Info("File uploaded", duration=2)
    return gr.FileExplorer(root_dir="./")

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Path to your image
image_path = "./test.png"

def is_pdf(file_path):
    mime = magic.Magic(mime=True)
    file_type = mime.from_file(file_path)
    return file_type == 'application/pdf'

def contact_llm(model_label, query, image_path):
    # if upload_button:
    #     upload_file(upload_button)

    if not image_path:
        return None, None, None

    # Getting the base64 string
    if is_pdf(image_path):
        # Render the PDF with poppler; only the first page is used.
        pages = convert_from_path(image_path)
        image = pages[0]
        im_file = BytesIO()
        image.save(im_file, format="JPEG")
        im_bytes = im_file.getvalue()
        base64_image = base64.b64encode(im_bytes).decode('utf-8')
    else:
        base64_image = encode_image(image_path)
        image = Pil.open(image_path)

    model, port = MODEL_PORTS[model_label]

    if query != "":
        text_query = query
    else:
        gr.Info("Using default Query", duration=1)
        text_query = "Extract text from picture precisely as JSON"

    client = OpenAI(
        base_url="http://localhost:" + port + "/v1",
        api_key="EMPTY"  # vLLM doesn't require an API key by default
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": text_query,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
    )

    #print(response.choices[0])
    return text_query, response.choices[0], image

def get_from_url(url_input, model_label, query):
    # if upload_button:
    #     upload_file(upload_button)

    valid = validators.url(url_input)
    print(valid, url_input)
    if not valid:
        # Don't query the model until the textbox holds a well-formed URL.
        return None, None, None

    model, port = MODEL_PORTS[model_label]

    if query != "":
        text_query = query
    else:
        gr.Info("Using default Query", duration=1)
        text_query = "Extract text from picture precisely as JSON"

    client = OpenAI(
        base_url="http://localhost:" + port + "/v1",
        api_key="EMPTY"  # vLLM doesn't require an API key by default
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": text_query,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": url_input  # the model server fetches the image itself
                        },
                    },
                ],
            }
        ],
    )

    #print(response.choices[0])
    return text_query, response.choices[0], None

if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown("# VLM based OCR")
        gr.Markdown("Provide an image and ask questions based on the context generated from it.")

        with gr.Row():
            with gr.Column(scale=1):
                model = gr.Dropdown(
                    # Labels must match the MODEL_PORTS keys above.
                    ["Qwen2.5-VL", "Qwen2.5-VL-72B", "Pixtral-12B", "Qwen2-VL", "Llama-3.2-Vision"],
                    label="Model",
                    info="Pick the model to use"
                )
                query_input = gr.Textbox(label="Enter your query", placeholder="Ask a question about the content")
                url_input = gr.Textbox(label="Enter image URL", placeholder="Paste image URL")
                file_explorer = gr.FileExplorer(glob="**/**", root_dir="./pictures", ignore_glob="**/__init__.py", file_count="single")
                file_upload = gr.File(file_count="single")
                submit_btn = gr.Button("Submit")

            with gr.Column(scale=1):
                query_output = gr.Textbox(label="Query")
                response_output = gr.Textbox(label="Response")
                image_output = gr.Image(type="pil")

        submit_btn.click(
            fn=contact_llm,
            inputs=[model, query_input, file_explorer],
            outputs=[query_output, response_output, image_output]
        )
        file_upload.upload(fn=upload_file2, inputs=file_upload, outputs=file_explorer).then(update_file_explorer_2, outputs=file_explorer)
        url_input.input(fn=get_from_url, inputs=[url_input, model, query_input], outputs=[query_output, response_output, image_output])
        # Launch the interface
        url = demo.launch(share=True, auth=("opc", "H789lf4z"))
