diff --git a/bin/fetch-ai-models.js b/bin/fetch-ai-models.js index 50e4593bcb1fe7..9e2a7db415a390 100644 --- a/bin/fetch-ai-models.js +++ b/bin/fetch-ai-models.js @@ -3,7 +3,7 @@ import fs from "fs"; fetch("https://ai.cloudflare.com/api/models") .then((res) => res.json()) .then((data) => { - data.models.forEach((model) => { + data.models.forEach((model) => { const fileName = model.name.split("/")[2]; fs.writeFileSync( `./src/content/workers-ai-models/${fileName}.json`, diff --git a/src/content/changelog/workers-ai/2025-03-17-new-workers-ai-models.mdx b/src/content/changelog/workers-ai/2025-03-17-new-workers-ai-models.mdx new file mode 100644 index 00000000000000..26524d1da7df31 --- /dev/null +++ b/src/content/changelog/workers-ai/2025-03-17-new-workers-ai-models.mdx @@ -0,0 +1,17 @@ +--- +title: New models in Workers AI +description: New text-to-speech, reranker, whisper, embeddings models now available! +date: 2025-03-17T17:00:00Z +--- + +Workers AI is excited to add 4 new models to the catalog, including 2 brand new classes of models with a text-to-speech and reranker model. Introducing: +- [@cf/baai/bge-m3](/workers-ai/models/bge-m3/) - a multi-lingual embeddings model that supports over 100 languages. It can also simultaneously perform dense retrieval, multi-vector retrieval, and sparse retrieval, with the ability to process inputs of different granularities. +- [@cf/baai/bge-reranker-base](/workers-ai/models/bge-reranker-base/) - our first reranker model! Rerankers are a type of text classification model that takes a query and context, and outputs a similarity score between the two. When used in RAG systems, you can use a reranker after the initial vector search to find the most relevant documents to return to a user by reranking the outputs. +- [@cf/openai/whisper-large-v3-turbo](/workers-ai/models/whisper-large-v3-turbo/) - a faster, more accurate speech-to-text model. This model was added earlier but is graduating out of beta with pricing included today. 
+- [@cf/myshell-ai/melotts](/workers-ai/models/melotts/) - our first text-to-speech model that allows users to generate an MP3 with voice audio from inputted text. + +Pricing is available for each of these models on the [Workers AI pricing page](/workers-ai/platform/pricing/). + +This docs update includes a few minor bug fixes to the model schema for llama-guard, llama-3.2-1b, which you can review on the [product changelog](/workers-ai/changelog/). + +Try it out and let us know what you think! Stay tuned for more models in the coming days. \ No newline at end of file diff --git a/src/content/docs/workers-ai/platform/pricing.mdx b/src/content/docs/workers-ai/platform/pricing.mdx index 473a95b39fb3fa..c9f84d687f4eb6 100644 --- a/src/content/docs/workers-ai/platform/pricing.mdx +++ b/src/content/docs/workers-ai/platform/pricing.mdx @@ -26,6 +26,10 @@ All limits reset daily at 00:00 UTC. If you exceed any one of the above limits, Neurons are our way of measuring AI outputs across different models, representing the GPU compute needed to perform your request. Our serverless model allows you to pay only for what you use without having to worry about renting, managing, or scaling GPUs. +:::note +The Price in Tokens column is equivalent to the Price in Neurons column - the different units are displayed so you can easily compare and understand pricing. +::: + ## LLM model pricing | Model | Price in Tokens | Price in Neurons | @@ -46,15 +50,23 @@ Neurons are our way of measuring AI outputs across different models, representin | @cf/meta/llama-2-7b-chat-fp16 | $0.556 per M input tokens
$6.667 per M output tokens | 50505 neurons per M input tokens
606061 neurons per M output tokens | | @cf/meta/llama-guard-3-8b | $0.484 per M input tokens
$0.030 per M output tokens | 44003 neurons per M input tokens
2730 neurons per M output tokens | +## Embeddings model pricing +| Model | Price in Tokens | Price in Neurons | +| ------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------ | +| @cf/baai/bge-small-en-v1.5 | $0.020 per M input tokens | 1841 neurons per M input tokens | +| @cf/baai/bge-base-en-v1.5 | $0.067 per M input tokens | 6058 neurons per M input tokens | +| @cf/baai/bge-large-en-v1.5 | $0.204 per M input tokens | 18582 neurons per M input tokens | +|@cf/baai/bge-m3 |$0.012 per M input tokens|1075 neurons per M input tokens | + ## Other model pricing | Model | Price in Tokens | Price in Neurons | | ------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------------------------ | | @cf/black-forest-labs/flux-1-schnell | $0.0000528 per 512x512 tile
$0.0001056 per step | 4.80 neurons per 512x512 tile
9.60 neurons per step | | @cf/huggingface/distilbert-sst-2-int8 | $0.026 per M input tokens | 2394 neurons per M input tokens | -| @cf/baai/bge-small-en-v1.5 | $0.020 per M input tokens | 1841 neurons per M input tokens | -| @cf/baai/bge-base-en-v1.5 | $0.067 per M input tokens | 6058 neurons per M input tokens | -| @cf/baai/bge-large-en-v1.5 | $0.204 per M input tokens | 18582 neurons per M input tokens | +|@cf/baai/bge-reranker-base |$0.003 per M input tokens|283 neurons per M input tokens | | @cf/meta/m2m100-1.2b | $0.342 per M input tokens
$0.342 per M output tokens | 31050 neurons per M input tokens
31050 neurons per M output tokens | | @cf/microsoft/resnet-50 | $2.51 per M images | 228055 neurons per M images | | @cf/openai/whisper | $0.0005 per audio minute | 41.14 neurons per audio minute | +|@cf/openai/whisper-large-v3-turbo|$0.0005 per audio minute |46.63 neurons per audio minute | +|@cf/myshell-ai/melotts |$3.416 per M input tokens|310577 neurons per M input tokens| \ No newline at end of file diff --git a/src/content/release-notes/workers-ai.yaml b/src/content/release-notes/workers-ai.yaml index 9bba62512416fa..907061df34954a 100644 --- a/src/content/release-notes/workers-ai.yaml +++ b/src/content/release-notes/workers-ai.yaml @@ -5,6 +5,12 @@ productLink: "/workers-ai/" productArea: Developer platform productAreaLink: /workers/platform/changelog/platform/ entries: + - publish_date: "2025-03-17" + title: Minor updates to the model schema for llama-3.2-1b-instruct, whisper-large-v3-turbo, llama-guard + description: |- + - [llama-3.2-1b-instruct](/workers-ai-models/llama-3.2-1b-instruct/) - updated context window to the accurate 60,000 + - [whisper-large-v3-turbo](/workers-ai-models/whisper-large-v3-turbo/) - new hyperparameters available + - [llama-guard-3-8b](/workers-ai-models/llama-guard-3-8b/) - the messages array must alternate between `user` and `assistant` to function correctly - publish_date: "2025-02-21" title: Workers AI bug fixes description: |- diff --git a/src/content/workers-ai-models/bge-m3.json b/src/content/workers-ai-models/bge-m3.json new file mode 100644 index 00000000000000..7c4bea9ffe6395 --- /dev/null +++ b/src/content/workers-ai-models/bge-m3.json @@ -0,0 +1,83 @@ +{ + "id": "eed32bc1-8775-4985-89ce-dd1405508ad8", + "source": 1, + "name": "@cf/baai/bge-m3", + "description": "Multi-Functionality, Multi-Linguality, and Multi-Granularity embeddings model.", + "task": { + "id": "0137cdcf-162a-4108-94f2-1ca59e8c65ee", + "name": "Text Embeddings", + "description": "Feature extraction models transform raw data into numerical features that 
can be processed while preserving the information in the original dataset. These models are ideal as part of building vector search applications or Retrieval Augmented Generation workflows with Large Language Models (LLM)." + }, + "tags": [], + "properties": [], + "schema": { + "input": { + "type": "object", + "properties": { + "query": { + "type": "string", + "minLength": 1, + "description": "A query you wish to perform against the provided contexts. If no query is provided the model will respond with embeddings for contexts" + }, + "contexts": { + "type": "array", + "items": { + "type": "object", + "properties": { + "text": { + "type": "string", + "minLength": 1, + "description": "One of the provided context content" + } + } + }, + "description": "List of provided contexts. Note that the index in this array is important, as the response will refer to it." + } + }, + "required": [ + "contexts" + ] + }, + "output": { + "type": "object", + "contentType": "application/json", + "oneOf": [ + { + "title": "Query", + "properties": { + "response": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "integer", + "description": "Index of the context in the request" + }, + "score": { + "type": "number", + "description": "Score of the context under the index." 
+ } + } + } + } + } + }, + { + "title": "Embedding", + "properties": { + "response": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": "number" + } + } + } + } + } + ] + } + } +} \ No newline at end of file diff --git a/src/content/workers-ai-models/bge-reranker-base.json b/src/content/workers-ai-models/bge-reranker-base.json new file mode 100644 index 00000000000000..86d7340c5abce3 --- /dev/null +++ b/src/content/workers-ai-models/bge-reranker-base.json @@ -0,0 +1,71 @@ +{ + "id": "145337e7-cec3-4ebb-8e78-16ddfc75e580", + "source": 1, + "name": "@cf/baai/bge-reranker-base", + "description": "Different from embedding model, reranker uses question and document as input and directly output similarity instead of embedding. You can get a relevance score by inputting query and passage to the reranker. And the score can be mapped to a float value in [0,1] by sigmoid function.\n\n", + "task": { + "id": "19606750-23ed-4371-aab2-c20349b53a60", + "name": "Text Classification", + "description": "Sentiment analysis or text classification is a common NLP task that classifies a text input into labels or classes." + }, + "tags": [], + "properties": [], + "schema": { + "input": { + "type": "object", + "properties": { + "query": { + "type": "string", + "minLength": 1, + "description": "A query you wish to perform against the provided contexts." + }, + "top_k": { + "type": "integer", + "default": null, + "minimum": 1, + "description": "Number of returned results starting with the best score." + }, + "contexts": { + "type": "array", + "items": { + "type": "object", + "properties": { + "text": { + "type": "string", + "minLength": 1, + "description": "One of the provided context content" + } + } + }, + "description": "List of provided contexts. Note that the index in this array is important, as the response will refer to it." 
+ } + }, + "required": [ + "query", + "contexts" + ] + }, + "output": { + "type": "object", + "contentType": "application/json", + "properties": { + "response": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "integer", + "description": "Index of the context in the request" + }, + "score": { + "type": "number", + "description": "Score of the context under the index." + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/src/content/workers-ai-models/llama-3.2-1b-instruct.json b/src/content/workers-ai-models/llama-3.2-1b-instruct.json index 2aacfcac2107da..ae45ceed6defd9 100644 --- a/src/content/workers-ai-models/llama-3.2-1b-instruct.json +++ b/src/content/workers-ai-models/llama-3.2-1b-instruct.json @@ -12,7 +12,7 @@ "properties": [ { "property_id": "context_window", - "value": "128000" + "value": "60000" }, { "property_id": "terms", diff --git a/src/content/workers-ai-models/llama-guard-3-8b.json b/src/content/workers-ai-models/llama-guard-3-8b.json index ae4e67a90d4e32..79d49e7073d743 100644 --- a/src/content/workers-ai-models/llama-guard-3-8b.json +++ b/src/content/workers-ai-models/llama-guard-3-8b.json @@ -21,12 +21,14 @@ "type": "object", "properties": { "role": { - "type": "string", - "description": "The role of the message sender (e.g., 'user', 'assistant', 'system', 'tool')." + "enum": [ + "user", + "assistant" + ], + "description": "The role of the message sender must alternate between 'user' and 'assistant'." }, "content": { "type": "string", - "maxLength": 131072, "description": "The content of the message as a string." 
} }, diff --git a/src/content/workers-ai-models/melotts.json b/src/content/workers-ai-models/melotts.json new file mode 100644 index 00000000000000..65d6e7d4506d43 --- /dev/null +++ b/src/content/workers-ai-models/melotts.json @@ -0,0 +1,53 @@ +{ + "id": "c837b2ac-4d9b-4d37-8811-34de60f0c44f", + "source": 1, + "name": "@cf/myshell-ai/melotts", + "description": "MeloTTS is a high-quality multi-lingual text-to-speech library by MyShell.ai.", + "task": { + "id": "b52660a1-9a95-4ab2-8b1d-f232be34604a", + "name": "Text-to-Speech", + "description": "Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages." + }, + "tags": [], + "properties": [], + "schema": { + "input": { + "type": "object", + "properties": { + "prompt": { + "type": "string", + "minLength": 1, + "description": "The text you want to convert to speech" + }, + "lang": { + "type": "string", + "default": "en", + "description": "The speech language (e.g., 'en' for English, 'fr' for French). 
Defaults to 'en' if not specified" + } + }, + "required": [ + "prompt" + ] + }, + "output": { + "oneOf": [ + { + "type": "object", + "contentType": "application/json", + "properties": { + "audio": { + "type": "string", + "description": "The generated audio in MP3 format, base64-encoded" + } + } + }, + { + "type": "string", + "contentType": "audio/mpeg", + "format": "binary", + "description": "The generated audio in MP3 format" + } + ] + } + } +} \ No newline at end of file diff --git a/src/content/workers-ai-models/whisper-large-v3-turbo.json b/src/content/workers-ai-models/whisper-large-v3-turbo.json index 830bd9dd2bc7a2..d56854cdaacce2 100644 --- a/src/content/workers-ai-models/whisper-large-v3-turbo.json +++ b/src/content/workers-ai-models/whisper-large-v3-turbo.json @@ -9,12 +9,7 @@ "description": "Automatic speech recognition (ASR) models convert a speech signal, typically an audio input, to text." }, "tags": [], - "properties": [ - { - "property_id": "beta", - "value": "true" - } - ], + "properties": [], "schema": { "input": { "type": "object", @@ -44,7 +39,7 @@ }, "prefix": { "type": "string", - "description": "The prefix it appended the beginning of the output of the transcription and can guide the transcription result." + "description": "The prefix is appended to the beginning of the output of the transcription and can guide the transcription result." } }, "required": [ @@ -85,52 +80,55 @@ "description": "The total number of words in the transcription." }, "segments": { - "type": "object", - "properties": { - "start": { - "type": "number", - "description": "The starting time of the segment within the audio, in seconds." - }, - "end": { - "type": "number", - "description": "The ending time of the segment within the audio, in seconds." - }, - "text": { - "type": "string", - "description": "The transcription of the segment." 
- }, - "temperature": { - "type": "number", - "description": "The temperature used in the decoding process, controlling randomness in predictions. Lower values result in more deterministic outputs." - }, - "avg_logprob": { - "type": "number", - "description": "The average log probability of the predictions for the words in this segment, indicating overall confidence." - }, - "compression_ratio": { - "type": "number", - "description": "The compression ratio of the input to the output, measuring how much the text was compressed during the transcription process." - }, - "no_speech_prob": { - "type": "number", - "description": "The probability that the segment contains no speech, represented as a decimal between 0 and 1." - }, - "words": { - "type": "array", - "items": { - "type": "object", - "properties": { - "word": { - "type": "string", - "description": "The individual word transcribed from the audio." - }, - "start": { - "type": "number", - "description": "The starting time of the word within the audio, in seconds." - }, - "end": { - "type": "number", - "description": "The ending time of the word within the audio, in seconds." + "type": "array", + "items": { + "type": "object", + "properties": { + "start": { + "type": "number", + "description": "The starting time of the segment within the audio, in seconds." + }, + "end": { + "type": "number", + "description": "The ending time of the segment within the audio, in seconds." + }, + "text": { + "type": "string", + "description": "The transcription of the segment." + }, + "temperature": { + "type": "number", + "description": "The temperature used in the decoding process, controlling randomness in predictions. Lower values result in more deterministic outputs." + }, + "avg_logprob": { + "type": "number", + "description": "The average log probability of the predictions for the words in this segment, indicating overall confidence." 
+ }, + "compression_ratio": { + "type": "number", + "description": "The compression ratio of the input to the output, measuring how much the text was compressed during the transcription process." + }, + "no_speech_prob": { + "type": "number", + "description": "The probability that the segment contains no speech, represented as a decimal between 0 and 1." + }, + "words": { + "type": "array", + "items": { + "type": "object", + "properties": { + "word": { + "type": "string", + "description": "The individual word transcribed from the audio." + }, + "start": { + "type": "number", + "description": "The starting time of the word within the audio, in seconds." + }, + "end": { + "type": "number", + "description": "The ending time of the word within the audio, in seconds." + } } } }