diff --git a/src/content/docs/workers-ai/platform/pricing.mdx b/src/content/docs/workers-ai/platform/pricing.mdx
index 87a8411b674551..13c1d2fc4d3604 100644
--- a/src/content/docs/workers-ai/platform/pricing.mdx
+++ b/src/content/docs/workers-ai/platform/pricing.mdx
@@ -57,6 +57,7 @@ The Price in Tokens column is equivalent to the Price in Neurons column - the di
 | @cf/openai/gpt-oss-120b | $0.350 per M input tokens<br/>$0.750 per M output tokens | 31818 neurons per M input tokens<br/>68182 neurons per M output tokens |
 | @cf/openai/gpt-oss-20b | $0.200 per M input tokens<br/>$0.300 per M output tokens | 18182 neurons per M input tokens<br/>27273 neurons per M output tokens |
 | @cf/aisingapore/gemma-sea-lion-v4-27b-it | $0.351 per M input tokens<br/>$0.555 per M output tokens | 31876 neurons per M input tokens<br/>50488 neurons per M output tokens |
+| @cf/ibm-granite/granite-4.0-h-micro | $0.017 per M input tokens<br/>$0.112 per M output tokens | 1542 neurons per M input tokens<br/>10158 neurons per M output tokens |
 
 ## Embeddings model pricing
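The USD and neuron columns in this table are tied together by Workers AI's published rate of $0.011 per 1,000 neurons: the dollar figures are the neuron figures converted at that rate and rounded to three decimals. A minimal sketch of that conversion, spot-checking the new row (the function name is illustrative, not part of the docs):

```ts
// Workers AI bills in neurons; the published rate is $0.011 per 1,000 neurons.
// The USD column is the neuron column converted at that rate and rounded
// to three decimal places.
const USD_PER_1K_NEURONS = 0.011;

function neuronsToUsdPerMTokens(neuronsPerMTokens: number): number {
  const usd = (neuronsPerMTokens / 1000) * USD_PER_1K_NEURONS;
  return Math.round(usd * 1000) / 1000;
}

// Spot-check the @cf/ibm-granite/granite-4.0-h-micro row:
console.log(neuronsToUsdPerMTokens(1542)); // 0.017 (per M input tokens)
console.log(neuronsToUsdPerMTokens(10158)); // 0.112 (per M output tokens)
```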
diff --git a/src/content/workers-ai-models/granite-4.0-h-micro.json b/src/content/workers-ai-models/granite-4.0-h-micro.json
new file mode 100644
index 00000000000000..8f5cd92ce56145
--- /dev/null
+++ b/src/content/workers-ai-models/granite-4.0-h-micro.json
@@ -0,0 +1,1019 @@
+{
+  "id": "7952d0cc-cb00-4e10-be02-667565c2ee0f",
+  "source": 1,
+  "name": "@cf/ibm-granite/granite-4.0-h-micro",
+  "description": "Granite 4.0 instruct models deliver strong performance across benchmarks, achieving industry-leading results in key agentic tasks like instruction following and function calling. These efficiencies make the models well-suited for a wide range of use cases like retrieval-augmented generation (RAG), multi-agent workflows, and edge deployments.",
+  "task": {
+    "id": "c329a1f9-323d-4e91-b2aa-582dd4188d34",
+    "name": "Text Generation",
+    "description": "Family of generative text models, such as large language models (LLM), that can be adapted for a variety of natural language tasks."
+  },
+  "created_at": "2025-10-07 18:46:29.436",
+  "tags": [],
+  "properties": [
+    {
+      "property_id": "context_window",
+      "value": "131000"
+    },
+    {
+      "property_id": "price",
+      "value": [
+        {
+          "unit": "per M input tokens",
+          "price": 0.017,
+          "currency": "USD"
+        },
+        {
+          "unit": "per M output tokens",
+          "price": 0.11,
+          "currency": "USD"
+        }
+      ]
+    }
+  ],
+  "schema": {
+    "input": {
+      "type": "object",
+      "oneOf": [
+        {
+          "title": "Prompt",
+          "properties": {
+            "prompt": {
+              "type": "string",
+              "minLength": 1,
+              "description": "The input text prompt for the model to generate a response."
+            },
+            "lora": {
+              "type": "string",
+              "description": "Name of the LoRA (Low-Rank Adaptation) model to fine-tune the base model."
+            },
+            "response_format": {
+              "title": "JSON Mode",
+              "type": "object",
+              "properties": {
+                "type": {
+                  "type": "string",
+                  "enum": [
+                    "json_object",
+                    "json_schema"
+                  ]
+                },
+                "json_schema": {}
+              }
+            },
+            "raw": {
+              "type": "boolean",
+              "default": false,
+              "description": "If true, a chat template is not applied and you must adhere to the specific model's expected formatting."
+            },
+            "stream": {
+              "type": "boolean",
+              "default": false,
+              "description": "If true, the response will be streamed back incrementally using SSE, Server Sent Events."
+            },
+            "max_tokens": {
+              "type": "integer",
+              "default": 2000,
+              "description": "The maximum number of tokens to generate in the response."
+            },
+            "temperature": {
+              "type": "number",
+              "default": 0.6,
+              "minimum": 0,
+              "maximum": 5,
+              "description": "Controls the randomness of the output; higher values produce more random results."
+            },
+            "top_p": {
+              "type": "number",
+              "minimum": 0.001,
+              "maximum": 1,
+              "description": "Adjusts the creativity of the AI's responses by controlling how many possible words it considers. Lower values make outputs more predictable; higher values allow for more varied and creative responses."
+            },
+            "top_k": {
+              "type": "integer",
+              "minimum": 1,
+              "maximum": 50,
+              "description": "Limits the AI to choose from the top 'k' most probable words. Lower values make responses more focused; higher values introduce more variety and potential surprises."
+            },
+            "seed": {
+              "type": "integer",
+              "minimum": 1,
+              "maximum": 9999999999,
+              "description": "Random seed for reproducibility of the generation."
+            },
+            "repetition_penalty": {
+              "type": "number",
+              "minimum": 0,
+              "maximum": 2,
+              "description": "Penalty for repeated tokens; higher values discourage repetition."
+            },
+            "frequency_penalty": {
+              "type": "number",
+              "minimum": -2,
+              "maximum": 2,
+              "description": "Decreases the likelihood of the model repeating the same lines verbatim."
+            },
+            "presence_penalty": {
+              "type": "number",
+              "minimum": -2,
+              "maximum": 2,
+              "description": "Increases the likelihood of the model introducing new topics."
+            }
+          },
+          "required": [
+            "prompt"
+          ]
+        },
+        {
+          "title": "Messages",
+          "properties": {
+            "messages": {
+              "type": "array",
+              "description": "An array of message objects representing the conversation history.",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "role": {
+                    "type": "string",
+                    "description": "The role of the message sender (e.g., 'user', 'assistant', 'system', 'tool')."
+                  },
+                  "content": {
+                    "type": "string",
+                    "description": "The content of the message as a string."
+                  }
+                },
+                "required": [
+                  "role",
+                  "content"
+                ]
+              }
+            },
+            "functions": {
+              "type": "array",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "name": {
+                    "type": "string"
+                  },
+                  "code": {
+                    "type": "string"
+                  }
+                },
+                "required": [
+                  "name",
+                  "code"
+                ]
+              }
+            },
+            "tools": {
+              "type": "array",
+              "description": "A list of tools available for the assistant to use.",
+              "items": {
+                "type": "object",
+                "oneOf": [
+                  {
+                    "properties": {
+                      "name": {
+                        "type": "string",
+                        "description": "The name of the tool. More descriptive the better."
+                      },
+                      "description": {
+                        "type": "string",
+                        "description": "A brief description of what the tool does."
+                      },
+                      "parameters": {
+                        "type": "object",
+                        "description": "Schema defining the parameters accepted by the tool.",
+                        "properties": {
+                          "type": {
+                            "type": "string",
+                            "description": "The type of the parameters object (usually 'object')."
+                          },
+                          "required": {
+                            "type": "array",
+                            "description": "List of required parameter names.",
+                            "items": {
+                              "type": "string"
+                            }
+                          },
+                          "properties": {
+                            "type": "object",
+                            "description": "Definitions of each parameter.",
+                            "additionalProperties": {
+                              "type": "object",
+                              "properties": {
+                                "type": {
+                                  "type": "string",
+                                  "description": "The data type of the parameter."
+                                },
+                                "description": {
+                                  "type": "string",
+                                  "description": "A description of the expected parameter."
+                                }
+                              },
+                              "required": [
+                                "type",
+                                "description"
+                              ]
+                            }
+                          }
+                        },
+                        "required": [
+                          "type",
+                          "properties"
+                        ]
+                      }
+                    },
+                    "required": [
+                      "name",
+                      "description",
+                      "parameters"
+                    ]
+                  },
+                  {
+                    "properties": {
+                      "type": {
+                        "type": "string",
+                        "description": "Specifies the type of tool (e.g., 'function')."
+                      },
+                      "function": {
+                        "type": "object",
+                        "description": "Details of the function tool.",
+                        "properties": {
+                          "name": {
+                            "type": "string",
+                            "description": "The name of the function."
+                          },
+                          "description": {
+                            "type": "string",
+                            "description": "A brief description of what the function does."
+                          },
+                          "parameters": {
+                            "type": "object",
+                            "description": "Schema defining the parameters accepted by the function.",
+                            "properties": {
+                              "type": {
+                                "type": "string",
+                                "description": "The type of the parameters object (usually 'object')."
+                              },
+                              "required": {
+                                "type": "array",
+                                "description": "List of required parameter names.",
+                                "items": {
+                                  "type": "string"
+                                }
+                              },
+                              "properties": {
+                                "type": "object",
+                                "description": "Definitions of each parameter.",
+                                "additionalProperties": {
+                                  "type": "object",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "description": "The data type of the parameter."
+                                    },
+                                    "description": {
+                                      "type": "string",
+                                      "description": "A description of the expected parameter."
+                                    }
+                                  },
+                                  "required": [
+                                    "type",
+                                    "description"
+                                  ]
+                                }
+                              }
+                            },
+                            "required": [
+                              "type",
+                              "properties"
+                            ]
+                          }
+                        },
+                        "required": [
+                          "name",
+                          "description",
+                          "parameters"
+                        ]
+                      }
+                    },
+                    "required": [
+                      "type",
+                      "function"
+                    ]
+                  }
+                ]
+              }
+            },
+            "response_format": {
+              "title": "JSON Mode",
+              "type": "object",
+              "properties": {
+                "type": {
+                  "type": "string",
+                  "enum": [
+                    "json_object",
+                    "json_schema"
+                  ]
+                },
+                "json_schema": {}
+              }
+            },
+            "raw": {
+              "type": "boolean",
+              "default": false,
+              "description": "If true, a chat template is not applied and you must adhere to the specific model's expected formatting."
+            },
+            "stream": {
+              "type": "boolean",
+              "default": false,
+              "description": "If true, the response will be streamed back incrementally using SSE, Server Sent Events."
+            },
+            "max_tokens": {
+              "type": "integer",
+              "default": 2000,
+              "description": "The maximum number of tokens to generate in the response."
+            },
+            "temperature": {
+              "type": "number",
+              "default": 0.6,
+              "minimum": 0,
+              "maximum": 5,
+              "description": "Controls the randomness of the output; higher values produce more random results."
+            },
+            "top_p": {
+              "type": "number",
+              "minimum": 0.001,
+              "maximum": 1,
+              "description": "Adjusts the creativity of the AI's responses by controlling how many possible words it considers. Lower values make outputs more predictable; higher values allow for more varied and creative responses."
+            },
+            "top_k": {
+              "type": "integer",
+              "minimum": 1,
+              "maximum": 50,
+              "description": "Limits the AI to choose from the top 'k' most probable words. Lower values make responses more focused; higher values introduce more variety and potential surprises."
+            },
+            "seed": {
+              "type": "integer",
+              "minimum": 1,
+              "maximum": 9999999999,
+              "description": "Random seed for reproducibility of the generation."
+            },
+            "repetition_penalty": {
+              "type": "number",
+              "minimum": 0,
+              "maximum": 2,
+              "description": "Penalty for repeated tokens; higher values discourage repetition."
+            },
+            "frequency_penalty": {
+              "type": "number",
+              "minimum": -2,
+              "maximum": 2,
+              "description": "Decreases the likelihood of the model repeating the same lines verbatim."
+            },
+            "presence_penalty": {
+              "type": "number",
+              "minimum": -2,
+              "maximum": 2,
+              "description": "Increases the likelihood of the model introducing new topics."
+            }
+          },
+          "required": [
+            "messages"
+          ]
+        },
+        {
+          "title": "Async Batch",
+          "type": "object",
+          "properties": {
+            "requests": {
+              "type": "array",
+              "items": {
+                "type": "object",
+                "oneOf": [
+                  {
+                    "title": "Prompt",
+                    "properties": {
+                      "prompt": {
+                        "type": "string",
+                        "minLength": 1,
+                        "description": "The input text prompt for the model to generate a response."
+                      },
+                      "lora": {
+                        "type": "string",
+                        "description": "Name of the LoRA (Low-Rank Adaptation) model to fine-tune the base model."
+                      },
+                      "response_format": {
+                        "title": "JSON Mode",
+                        "type": "object",
+                        "properties": {
+                          "type": {
+                            "type": "string",
+                            "enum": [
+                              "json_object",
+                              "json_schema"
+                            ]
+                          },
+                          "json_schema": {}
+                        }
+                      },
+                      "raw": {
+                        "type": "boolean",
+                        "default": false,
+                        "description": "If true, a chat template is not applied and you must adhere to the specific model's expected formatting."
+                      },
+                      "stream": {
+                        "type": "boolean",
+                        "default": false,
+                        "description": "If true, the response will be streamed back incrementally using SSE, Server Sent Events."
+                      },
+                      "max_tokens": {
+                        "type": "integer",
+                        "default": 256,
+                        "description": "The maximum number of tokens to generate in the response."
+                      },
+                      "temperature": {
+                        "type": "number",
+                        "default": 0.6,
+                        "minimum": 0,
+                        "maximum": 5,
+                        "description": "Controls the randomness of the output; higher values produce more random results."
+                      },
+                      "top_p": {
+                        "type": "number",
+                        "minimum": 0.001,
+                        "maximum": 1,
+                        "description": "Adjusts the creativity of the AI's responses by controlling how many possible words it considers. Lower values make outputs more predictable; higher values allow for more varied and creative responses."
+                      },
+                      "top_k": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 50,
+                        "description": "Limits the AI to choose from the top 'k' most probable words. Lower values make responses more focused; higher values introduce more variety and potential surprises."
+                      },
+                      "seed": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 9999999999,
+                        "description": "Random seed for reproducibility of the generation."
+                      },
+                      "repetition_penalty": {
+                        "type": "number",
+                        "minimum": 0,
+                        "maximum": 2,
+                        "description": "Penalty for repeated tokens; higher values discourage repetition."
+                      },
+                      "frequency_penalty": {
+                        "type": "number",
+                        "minimum": -2,
+                        "maximum": 2,
+                        "description": "Decreases the likelihood of the model repeating the same lines verbatim."
+                      },
+                      "presence_penalty": {
+                        "type": "number",
+                        "minimum": -2,
+                        "maximum": 2,
+                        "description": "Increases the likelihood of the model introducing new topics."
+                      }
+                    },
+                    "required": [
+                      "prompt"
+                    ]
+                  },
+                  {
+                    "title": "Messages",
+                    "properties": {
+                      "messages": {
+                        "type": "array",
+                        "description": "An array of message objects representing the conversation history.",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "role": {
+                              "type": "string",
+                              "description": "The role of the message sender (e.g., 'user', 'assistant', 'system', 'tool')."
+                            },
+                            "content": {
+                              "type": "string",
+                              "description": "The content of the message as a string."
+                            }
+                          },
+                          "required": [
+                            "role",
+                            "content"
+                          ]
+                        }
+                      },
+                      "functions": {
+                        "type": "array",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "name": {
+                              "type": "string"
+                            },
+                            "code": {
+                              "type": "string"
+                            }
+                          },
+                          "required": [
+                            "name",
+                            "code"
+                          ]
+                        }
+                      },
+                      "tools": {
+                        "type": "array",
+                        "description": "A list of tools available for the assistant to use.",
+                        "items": {
+                          "type": "object",
+                          "oneOf": [
+                            {
+                              "properties": {
+                                "name": {
+                                  "type": "string",
+                                  "description": "The name of the tool. More descriptive the better."
+                                },
+                                "description": {
+                                  "type": "string",
+                                  "description": "A brief description of what the tool does."
+                                },
+                                "parameters": {
+                                  "type": "object",
+                                  "description": "Schema defining the parameters accepted by the tool.",
+                                  "properties": {
+                                    "type": {
+                                      "type": "string",
+                                      "description": "The type of the parameters object (usually 'object')."
+                                    },
+                                    "required": {
+                                      "type": "array",
+                                      "description": "List of required parameter names.",
+                                      "items": {
+                                        "type": "string"
+                                      }
+                                    },
+                                    "properties": {
+                                      "type": "object",
+                                      "description": "Definitions of each parameter.",
+                                      "additionalProperties": {
+                                        "type": "object",
+                                        "properties": {
+                                          "type": {
+                                            "type": "string",
+                                            "description": "The data type of the parameter."
+                                          },
+                                          "description": {
+                                            "type": "string",
+                                            "description": "A description of the expected parameter."
+                                          }
+                                        },
+                                        "required": [
+                                          "type",
+                                          "description"
+                                        ]
+                                      }
+                                    }
+                                  },
+                                  "required": [
+                                    "type",
+                                    "properties"
+                                  ]
+                                }
+                              },
+                              "required": [
+                                "name",
+                                "description",
+                                "parameters"
+                              ]
+                            },
+                            {
+                              "properties": {
+                                "type": {
+                                  "type": "string",
+                                  "description": "Specifies the type of tool (e.g., 'function')."
+                                },
+                                "function": {
+                                  "type": "object",
+                                  "description": "Details of the function tool.",
+                                  "properties": {
+                                    "name": {
+                                      "type": "string",
+                                      "description": "The name of the function."
+                                    },
+                                    "description": {
+                                      "type": "string",
+                                      "description": "A brief description of what the function does."
+                                    },
+                                    "parameters": {
+                                      "type": "object",
+                                      "description": "Schema defining the parameters accepted by the function.",
+                                      "properties": {
+                                        "type": {
+                                          "type": "string",
+                                          "description": "The type of the parameters object (usually 'object')."
+                                        },
+                                        "required": {
+                                          "type": "array",
+                                          "description": "List of required parameter names.",
+                                          "items": {
+                                            "type": "string"
+                                          }
+                                        },
+                                        "properties": {
+                                          "type": "object",
+                                          "description": "Definitions of each parameter.",
+                                          "additionalProperties": {
+                                            "type": "object",
+                                            "properties": {
+                                              "type": {
+                                                "type": "string",
+                                                "description": "The data type of the parameter."
+                                              },
+                                              "description": {
+                                                "type": "string",
+                                                "description": "A description of the expected parameter."
+                                              }
+                                            },
+                                            "required": [
+                                              "type",
+                                              "description"
+                                            ]
+                                          }
+                                        }
+                                      },
+                                      "required": [
+                                        "type",
+                                        "properties"
+                                      ]
+                                    }
+                                  },
+                                  "required": [
+                                    "name",
+                                    "description",
+                                    "parameters"
+                                  ]
+                                }
+                              },
+                              "required": [
+                                "type",
+                                "function"
+                              ]
+                            }
+                          ]
+                        }
+                      },
+                      "response_format": {
+                        "title": "JSON Mode",
+                        "type": "object",
+                        "properties": {
+                          "type": {
+                            "type": "string",
+                            "enum": [
+                              "json_object",
+                              "json_schema"
+                            ]
+                          },
+                          "json_schema": {}
+                        }
+                      },
+                      "raw": {
+                        "type": "boolean",
+                        "default": false,
+                        "description": "If true, a chat template is not applied and you must adhere to the specific model's expected formatting."
+                      },
+                      "stream": {
+                        "type": "boolean",
+                        "default": false,
+                        "description": "If true, the response will be streamed back incrementally using SSE, Server Sent Events."
+                      },
+                      "max_tokens": {
+                        "type": "integer",
+                        "default": 256,
+                        "description": "The maximum number of tokens to generate in the response."
+                      },
+                      "temperature": {
+                        "type": "number",
+                        "default": 0.6,
+                        "minimum": 0,
+                        "maximum": 5,
+                        "description": "Controls the randomness of the output; higher values produce more random results."
+                      },
+                      "top_p": {
+                        "type": "number",
+                        "minimum": 0.001,
+                        "maximum": 1,
+                        "description": "Adjusts the creativity of the AI's responses by controlling how many possible words it considers. Lower values make outputs more predictable; higher values allow for more varied and creative responses."
+                      },
+                      "top_k": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 50,
+                        "description": "Limits the AI to choose from the top 'k' most probable words. Lower values make responses more focused; higher values introduce more variety and potential surprises."
+                      },
+                      "seed": {
+                        "type": "integer",
+                        "minimum": 1,
+                        "maximum": 9999999999,
+                        "description": "Random seed for reproducibility of the generation."
+                      },
+                      "repetition_penalty": {
+                        "type": "number",
+                        "minimum": 0,
+                        "maximum": 2,
+                        "description": "Penalty for repeated tokens; higher values discourage repetition."
+                      },
+                      "frequency_penalty": {
+                        "type": "number",
+                        "minimum": -2,
+                        "maximum": 2,
+                        "description": "Decreases the likelihood of the model repeating the same lines verbatim."
+                      },
+                      "presence_penalty": {
+                        "type": "number",
+                        "minimum": -2,
+                        "maximum": 2,
+                        "description": "Increases the likelihood of the model introducing new topics."
+                      }
+                    },
+                    "required": [
+                      "messages"
+                    ]
+                  }
+                ]
+              }
+            }
+          },
+          "required": [
+            "requests"
+          ]
+        }
+      ]
+    },
+    "output": {
+      "oneOf": [
+        {
+          "type": "object",
+          "contentType": "application/json",
+          "title": "Chat Completion Response",
+          "properties": {
+            "id": {
+              "type": "string",
+              "description": "Unique identifier for the completion"
+            },
+            "object": {
+              "type": "string",
+              "enum": [
+                "chat.completion"
+              ],
+              "description": "Object type identifier"
+            },
+            "created": {
+              "type": "number",
+              "description": "Unix timestamp of when the completion was created"
+            },
+            "model": {
+              "type": "string",
+              "description": "Model used for the completion"
+            },
+            "choices": {
+              "type": "array",
+              "description": "List of completion choices",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "index": {
+                    "type": "number",
+                    "description": "Index of the choice in the list"
+                  },
+                  "message": {
+                    "type": "object",
+                    "description": "The message generated by the model",
+                    "properties": {
+                      "role": {
+                        "type": "string",
+                        "description": "Role of the message author"
+                      },
+                      "content": {
+                        "type": "string",
+                        "description": "The content of the message"
+                      },
+                      "reasoning_content": {
+                        "type": "string",
+                        "description": "Internal reasoning content (if available)"
+                      },
+                      "tool_calls": {
+                        "type": "array",
+                        "description": "Tool calls made by the assistant",
+                        "items": {
+                          "type": "object",
+                          "properties": {
+                            "id": {
+                              "type": "string",
+                              "description": "Unique identifier for the tool call"
+                            },
+                            "type": {
+                              "type": "string",
+                              "enum": [
+                                "function"
+                              ],
+                              "description": "Type of tool call"
+                            },
+                            "function": {
+                              "type": "object",
+                              "properties": {
+                                "name": {
+                                  "type": "string",
+                                  "description": "Name of the function to call"
+                                },
+                                "arguments": {
+                                  "type": "string",
+                                  "description": "JSON string of arguments for the function"
+                                }
+                              },
+                              "required": [
+                                "name",
+                                "arguments"
+                              ]
+                            }
+                          },
+                          "required": [
+                            "id",
+                            "type",
+                            "function"
+                          ]
+                        }
+                      }
+                    },
+                    "required": [
+                      "role",
+                      "content"
+                    ]
+                  },
+                  "finish_reason": {
+                    "type": "string",
+                    "description": "Reason why the model stopped generating"
+                  },
+                  "stop_reason": {
+                    "type": [
+                      "string",
+                      "null"
+                    ],
+                    "description": "Stop reason (may be null)"
+                  },
+                  "logprobs": {
+                    "type": [
+                      "object",
+                      "null"
+                    ],
+                    "description": "Log probabilities (if requested)"
+                  }
+                }
+              }
+            },
+            "usage": {
+              "type": "object",
+              "description": "Usage statistics for the inference request",
+              "properties": {
+                "prompt_tokens": {
+                  "type": "number",
+                  "description": "Total number of tokens in input",
+                  "default": 0
+                },
+                "completion_tokens": {
+                  "type": "number",
+                  "description": "Total number of tokens in output",
+                  "default": 0
+                },
+                "total_tokens": {
+                  "type": "number",
+                  "description": "Total number of input and output tokens",
+                  "default": 0
+                }
+              }
+            },
+            "prompt_logprobs": {
+              "type": [
+                "object",
+                "null"
+              ],
+              "description": "Log probabilities for the prompt (if requested)"
+            }
+          }
+        },
+        {
+          "type": "object",
+          "contentType": "application/json",
+          "title": "Text Completion Response",
+          "properties": {
+            "id": {
+              "type": "string",
+              "description": "Unique identifier for the completion"
+            },
+            "object": {
+              "type": "string",
+              "enum": [
+                "text_completion"
+              ],
+              "description": "Object type identifier"
+            },
+            "created": {
+              "type": "number",
+              "description": "Unix timestamp of when the completion was created"
+            },
+            "model": {
+              "type": "string",
+              "description": "Model used for the completion"
+            },
+            "choices": {
+              "type": "array",
+              "description": "List of completion choices",
+              "items": {
+                "type": "object",
+                "properties": {
+                  "index": {
+                    "type": "number",
+                    "description": "Index of the choice in the list"
+                  },
+                  "text": {
+                    "type": "string",
+                    "description": "The generated text completion"
+                  },
+                  "finish_reason": {
+                    "type": "string",
+                    "description": "Reason why the model stopped generating"
+                  },
+                  "stop_reason": {
+                    "type": [
+                      "string",
+                      "null"
+                    ],
+                    "description": "Stop reason (may be null)"
+                  },
+                  "logprobs": {
+                    "type": [
+                      "object",
+                      "null"
+                    ],
+                    "description": "Log probabilities (if requested)"
+                  },
+                  "prompt_logprobs": {
+                    "type": [
+                      "object",
+                      "null"
+                    ],
+                    "description": "Log probabilities for the prompt (if requested)"
+                  }
+                },
+                "required": [
+                  "index",
+                  "text",
+                  "finish_reason"
+                ]
+              }
+            },
+            "usage": {
+              "type": "object",
+              "description": "Usage statistics for the inference request",
+              "properties": {
+                "prompt_tokens": {
+                  "type": "number",
+                  "description": "Total number of tokens in input",
+                  "default": 0
+                },
+                "completion_tokens": {
+                  "type": "number",
+                  "description": "Total number of tokens in output",
+                  "default": 0
+                },
+                "total_tokens": {
+                  "type": "number",
+                  "description": "Total number of input and output tokens",
+                  "default": 0
+                }
+              }
+            }
+          }
+        },
+        {
+          "type": "string",
+          "contentType": "text/event-stream",
+          "format": "binary"
+        },
+        {
+          "type": "object",
+          "contentType": "application/json",
+          "title": "Async response",
+          "properties": {
+            "request_id": {
+              "type": "string",
+              "description": "The async request id that can be used to obtain the results."
+            }
+          }
+        }
+      ]
+    }
+  }
+}
\ No newline at end of file