diff --git a/autogpt_platform/backend/Dockerfile b/autogpt_platform/backend/Dockerfile
index 7f51bad3a132..2ef675c22752 100644
--- a/autogpt_platform/backend/Dockerfile
+++ b/autogpt_platform/backend/Dockerfile
@@ -29,6 +29,9 @@ RUN apt-get update \
     libssl-dev \
     postgresql-client \
     nodejs \
+    tesseract-ocr \
+    libtesseract-dev \
+    tesseract-ocr-eng \
     && rm -rf /var/lib/apt/lists/*
 
 ENV POETRY_HOME=/opt/poetry
@@ -45,6 +48,10 @@ COPY autogpt_platform/backend/poetry.lock autogpt_platform/backend/pyproject.toml ./
 WORKDIR /app/autogpt_platform/backend
 RUN poetry install --no-ansi --no-root
 
+
+# pytesseract (ad-hoc install; ideally declared in pyproject.toml instead)
+RUN poetry add pytesseract --no-ansi || true
+
 # Generate Prisma client
 COPY autogpt_platform/backend/schema.prisma ./
 COPY autogpt_platform/backend/backend/data/partial_types.py ./backend/data/partial_types.py
@@ -65,6 +72,9 @@ ENV PATH=/opt/poetry/bin:$PATH
 RUN apt-get update && apt-get install -y \
     python3.13 \
     python3-pip \
+    tesseract-ocr \
+    libtesseract-dev \
+    tesseract-ocr-eng \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy only necessary files from builder
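Both build stages install the `tesseract-ocr` engine and the `eng` language pack, since pytesseract only wraps the system binary. A quick sanity check that the binary and language data are visible to pytesseract (a sketch to run inside the built container, not part of the patch):

```python
import pytesseract

# Both calls shell out to the tesseract binary installed above via apt.
print(pytesseract.get_tesseract_version())   # e.g. 5.x
print(pytesseract.get_languages(config=""))  # should include "eng"
```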
diff --git a/autogpt_platform/backend/backend/blocks/llm.py b/autogpt_platform/backend/backend/blocks/llm.py
index 74f53d4bf143..0d4634cacddc 100644
--- a/autogpt_platform/backend/backend/blocks/llm.py
+++ b/autogpt_platform/backend/backend/blocks/llm.py
@@ -9,6 +9,14 @@
 from json import JSONDecodeError
 from typing import Any, Iterable, List, Literal, NamedTuple, Optional
+
+import base64
+import io
+
+import pytesseract
+import requests
+from PIL import Image
+
 
 import anthropic
 import ollama
 import openai
@@ -35,6 +43,7 @@
 from backend.util.logging import TruncatedLogger
 from backend.util.prompt import compress_prompt, estimate_token_count
 from backend.util.text import TextFormatter
+from backend.util.type import MediaFileType
 
 logger = TruncatedLogger(logging.getLogger(__name__), "[LLM-Block]")
 fmt = TextFormatter(autoescape=False)
@@ -838,6 +847,13 @@ class Input(BlockSchemaInput):
             default="localhost:11434",
             description="Ollama host for local models",
         )
+        image: MediaFileType | None = SchemaField(
+            title="Media Input",
+            default=None,
+            description=(
+                "Optional media input file (URL, local path, or base64 data URI)."
+            ),
+        )
 
     class Output(BlockSchemaOutput):
         response: dict[str, Any] | list[dict[str, Any]] = SchemaField(
@@ -918,6 +934,41 @@ async def run(
         logger.debug(f"Calling LLM with input data: {input_data}")
         prompt = [json.to_dict(p) for p in input_data.conversation_history]
 
+        # Extract text from the image with OCR, if one was provided
+        if input_data.image:
+            try:
+                # Handle the different image input formats
+                if input_data.image.startswith("http"):
+                    # Remote URL
+                    response = requests.get(input_data.image, timeout=30)
+                    image = Image.open(io.BytesIO(response.content))
+                elif input_data.image.startswith("data:image"):
+                    # Base64 data URI: strip the "data:image/...;base64," prefix
+                    base64_data = input_data.image.split(",", 1)[1]
+                    image_data = base64.b64decode(base64_data)
+                    image = Image.open(io.BytesIO(image_data))
+                else:
+                    # Local file path
+                    image = Image.open(input_data.image)
+
+                # Perform OCR
+                ocr_text = pytesseract.image_to_string(image)
+                logger.debug(f"OCR extracted text: {ocr_text}")
+
+                # Append the OCR text to the prompt if any text was extracted
+                if ocr_text.strip():
+                    if input_data.prompt:
+                        input_data.prompt += f"\n\nExtracted text from image:\n{ocr_text}"
+                    else:
+                        input_data.prompt = f"Extracted text from image:\n{ocr_text}"
+
+            except Exception as e:
+                logger.error(f"Error processing image with OCR: {e}")
+                if input_data.prompt:
+                    input_data.prompt += f"\n\nError processing image: {e}"
+                else:
+                    input_data.prompt = f"Error processing image: {e}"
+
         values = input_data.prompt_values
         if values:
             input_data.prompt = fmt.format_string(input_data.prompt, values)
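The image-resolution-plus-OCR path above can be exercised in isolation before wiring it through the block. A minimal standalone sketch (the `load_image` helper and the sample file name are placeholders of ours, not part of the patch):

```python
import base64
import io

import pytesseract
import requests
from PIL import Image


def load_image(source: str) -> Image.Image:
    """Resolve a URL, base64 data URI, or local path to a PIL image."""
    if source.startswith("http"):
        resp = requests.get(source, timeout=30)
        resp.raise_for_status()
        return Image.open(io.BytesIO(resp.content))
    if source.startswith("data:image"):
        # Strip the "data:image/...;base64," header and decode the payload.
        return Image.open(io.BytesIO(base64.b64decode(source.split(",", 1)[1])))
    return Image.open(source)


if __name__ == "__main__":
    print(pytesseract.image_to_string(load_image("sample.png")))  # placeholder file
```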
@@ -1214,52 +1265,60 @@ def trim_prompt(s: str) -> str:
 class AITextGeneratorBlock(AIBlockBase):
     class Input(BlockSchemaInput):
         prompt: str = SchemaField(
-            description="The prompt to send to the language model. You can use any of the {keys} from Prompt Values to fill in the prompt with values from the prompt values dictionary by putting them in curly braces.",
+            description="The prompt to send to the language model.",
             placeholder="Enter your prompt here...",
         )
         model: LlmModel = SchemaField(
             title="LLM Model",
             default=LlmModel.GPT4O,
             description="The language model to use for answering the prompt.",
-            advanced=False,
         )
         credentials: AICredentials = AICredentialsField()
         sys_prompt: str = SchemaField(
             title="System Prompt",
             default="",
-            description="The system prompt to provide additional context to the model.",
+            description="Optional system prompt for additional context.",
         )
         retry: int = SchemaField(
             title="Retry Count",
             default=3,
-            description="Number of times to retry the LLM call if the response does not match the expected format.",
+            description="Number of times to retry the LLM call if needed.",
         )
         prompt_values: dict[str, str] = SchemaField(
             advanced=False,
             default_factory=dict,
-            description="Values used to fill in the prompt. The values can be used in the prompt by putting them in a double curly braces, e.g. {{variable_name}}.",
+            description="Values used to fill in the prompt ({{var}} syntax).",
         )
         ollama_host: str = SchemaField(
             advanced=True,
             default="localhost:11434",
-            description="Ollama host for local models",
+            description="Ollama host for local models.",
         )
         max_tokens: int | None = SchemaField(
             advanced=True,
             default=None,
-            description="The maximum number of tokens to generate in the chat completion.",
+            description="Maximum number of tokens to generate.",
         )
+        image: MediaFileType | None = SchemaField(
+            title="Media Input",
+            default=None,
+            description=(
+                "Optional media input file (URL, local path, or base64 data URI)."
+            ),
+        )
 
     class Output(BlockSchemaOutput):
         response: str = SchemaField(
             description="The response generated by the language model."
         )
-        prompt: list = SchemaField(description="The prompt sent to the language model.")
+        prompt: list = SchemaField(
+            description="The prompt sent to the language model."
+        )
 
     def __init__(self):
         super().__init__(
             id="1f292d4a-41a4-4977-9684-7c8d560b9f91",
-            description="Call a Large Language Model (LLM) to generate a string based on the given prompt.",
+            description="Generate text using a language model.",
             categories={BlockCategory.AI},
             input_schema=AITextGeneratorBlock.Input,
             output_schema=AITextGeneratorBlock.Output,
@@ -1277,26 +1336,40 @@ def __init__(self):
 
     async def llm_call(
         self,
-        input_data: AIStructuredResponseGeneratorBlock.Input,
-        credentials: APIKeyCredentials,
-    ) -> dict:
+        input_data: "AIStructuredResponseGeneratorBlock.Input",
+        credentials: "APIKeyCredentials",
+    ) -> str:
+        """Delegate to the structured response block and return only the text."""
         block = AIStructuredResponseGeneratorBlock()
         response = await block.run_once(input_data, "response", credentials=credentials)
+
+        # Track the LLM usage stats of the inner block
         self.merge_llm_stats(block)
+
+        # Return the plain response string
         return response["response"]
 
     async def run(
-        self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
-    ) -> BlockOutput:
-        object_input_data = AIStructuredResponseGeneratorBlock.Input(
+        self,
+        input_data: Input,
+        *,
+        credentials: "APIKeyCredentials",
+        **kwargs,
+    ) -> "BlockOutput":
+        """Run the block and yield its outputs."""
+        # Prepare input for the structured response generator
+        structured_input = AIStructuredResponseGeneratorBlock.Input(
             **{
                 attr: getattr(input_data, attr)
                 for attr in AITextGeneratorBlock.Input.model_fields
             },
             expected_format={},
         )
-        response = await self.llm_call(object_input_data, credentials)
-        yield "response", response
+
+        # Call the underlying LLM (mocked in tests)
+        response_text = await self.llm_call(structured_input, credentials)
+
+        # Yield outputs
+        yield "response", response_text
         yield "prompt", self.prompt
@@ -1738,7 +1811,7 @@ async def run(
         self, input_data: Input, *, credentials: APIKeyCredentials, **kwargs
     ) -> BlockOutput:
         logger.debug(f"Starting AIListGeneratorBlock.run with input data: {input_data}")
-        
+
         # Check for API key
         api_key_check = credentials.api_key.get_secret_value()
         if not api_key_check:
@@ -1780,6 +1853,7 @@ async def run(
         |Do not include any explanations or additional text, just respond with the list in the format specified above.
         |Do not include code fences or any other formatting, just the raw list.
         """
+        # If a focus is provided, add it to the prompt
         if input_data.focus:
             prompt = f"Generate a list with the following focus:\n\n\n{input_data.focus}"
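With the new `image` field in place, the block can be driven directly, mirroring what the commented-out debug endpoint below does over HTTP. A rough usage sketch using the test credentials that `llm.py` already exports (the prompt and image path are placeholders):

```python
import asyncio

from backend.blocks.llm import (
    TEST_CREDENTIALS,
    TEST_CREDENTIALS_INPUT,
    AITextGeneratorBlock,
)


async def main() -> None:
    block = AITextGeneratorBlock()
    input_data = block.Input(
        prompt="Summarize the text in this image.",
        credentials=TEST_CREDENTIALS_INPUT,
        image="screenshot.png",  # placeholder: URL, local path, or data URI
    )
    # run() yields (output_name, value) pairs.
    async for name, value in block.run(input_data, credentials=TEST_CREDENTIALS):
        print(name, value)


asyncio.run(main())
```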
""" + # If a focus is provided, add it to the prompt if input_data.focus: prompt = f"Generate a list with the following focus:\n\n\n{input_data.focus}" diff --git a/autogpt_platform/backend/backend/server/routers/v1.py b/autogpt_platform/backend/backend/server/routers/v1.py index 6cf00cbb5fff..76234c99e15a 100644 --- a/autogpt_platform/backend/backend/server/routers/v1.py +++ b/autogpt_platform/backend/backend/server/routers/v1.py @@ -23,6 +23,17 @@ Security, UploadFile, ) + +from backend.blocks.llm import ( + TEST_CREDENTIALS, + TEST_CREDENTIALS_INPUT, + AIBlockBase, + AICredentials, + AICredentialsField, + LlmModel, + LLMResponse, + llm_call, +) from fastapi.concurrency import run_in_threadpool from pydantic import BaseModel from starlette.status import HTTP_204_NO_CONTENT, HTTP_404_NOT_FOUND @@ -63,6 +74,8 @@ update_user_notification_preference, update_user_timezone, ) + +from backend.blocks.llm import AITextGeneratorBlock from backend.executor import scheduler from backend.executor import utils as execution_utils from backend.integrations.webhooks.graph_lifecycle_hooks import ( @@ -1358,3 +1371,56 @@ async def update_permissions( return await api_key_db.update_api_key_permissions( key_id, user_id, request.permissions ) + + +# @v1_router.post( +# "/VLAD/TEST", +# summary="Run AITextGeneratorBlock with prompt via Ollama", +# tags=["AI"], +# ) +# async def llm_call_endpoint( +# body: dict = Body( +# ..., +# example={ +# "prompt": "Write a poem about the ocean.", +# "sys_prompt": "You are a helpful AI assistant.", +# "credentials": TEST_CREDENTIALS_INPUT, +# "model": "llama3.2", +# "retry": 2, +# "prompt_values": {"mood": "calm"}, +# "ollama_host": "http://host.docker.internal:11434", +# "max_tokens": 512, +# "image": "https://marketplace.canva.com/EAGZEaj1Dl0/1/0/1280w/canva-beige-aesthetic-motivational-quote-instagram-post-WAe2YFurmmg.jpg" +# }, +# ), +# ): +# """Endpoint to run the AITextGeneratorBlock using Ollama.""" +# print("➡️ Entered /VLAD/TEST endpoint") +# print(body["image"]) +# ai_block = AITextGeneratorBlock() +# print('passed ai block') +# # Validate and create input schema +# try: +# input_data = ai_block.Input(**body) +# print('passed input data') +# except Exception as e: +# print("❌ Validation error:", e.errors()) +# raise HTTPException(status_code=422, detail=e.errors()) + +# # print ports and what is running on them + + +# # Run the block asynchronously +# output = {} +# async for key, value in ai_block.run(input_data, credentials=None): +# output[key] = value + +# print("✅ Block finished executing") + +# return { +# "llm_response": output.get("response"), +# "prompt_used": output.get("prompt"), +# "ollama_host": input_data.ollama_host, +# "stats": getattr(ai_block, "execution_stats", None), +# } + diff --git a/autogpt_platform/docker-compose.yml b/autogpt_platform/docker-compose.yml index 1860252f4686..8322f4202874 100644 --- a/autogpt_platform/docker-compose.yml +++ b/autogpt_platform/docker-compose.yml @@ -46,6 +46,8 @@ services: extends: file: ./docker-compose.platform.yml service: rest_server + # extra_hosts: + # - "host.docker.internal:host-gateway" executor: <<: *agpt-services