Commit 8db9fd9

Merge branch 'main' into xsn/ollama_utils

2 parents f259715 + cf160c7

161 files changed: +6994 −1916 lines


.github/workflows/inference-publish.yml

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ jobs:
           git tag "inference-v$BUMPED_VERSION"

       - name: "Check Deps are published before publishing this package"
-        run: pnpm -w check-deps gguf
+        run: pnpm -w check-deps tasks

       - run: pnpm publish --no-git-checks .
         env:

.github/workflows/test.yml

Lines changed: 12 additions & 0 deletions
@@ -41,6 +41,10 @@ jobs:
         run: VCR_MODE=playback pnpm --filter ...[${{ steps.since.outputs.SINCE }}] test
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_FAL_KEY: dummy
+          HF_REPLICATE_KEY: dummy
+          HF_SAMBANOVA_KEY: dummy
+          HF_TOGETHER_KEY: dummy

   browser:
     runs-on: ubuntu-latest
@@ -77,6 +81,10 @@ jobs:
         run: VCR_MODE=playback pnpm --filter ...[${{ steps.since.outputs.SINCE }}] test:browser
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_FAL_KEY: dummy
+          HF_REPLICATE_KEY: dummy
+          HF_SAMBANOVA_KEY: dummy
+          HF_TOGETHER_KEY: dummy

   e2e:
     runs-on: ubuntu-latest
@@ -140,3 +148,7 @@ jobs:
         env:
           NPM_CONFIG_REGISTRY: http://localhost:4874/
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          HF_FAL_KEY: dummy
+          HF_REPLICATE_KEY: dummy
+          HF_SAMBANOVA_KEY: dummy
+          HF_TOGETHER_KEY: dummy

CODEOWNERS

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Ownership for the Inference Package

-/packages/inference/ @vvmnnnkv @radames
+/packages/inference/ @julien-c @hanouticelina @SBrandeis @coyotte508

 # Ownership for the Tasks Package
README.md

Lines changed: 32 additions & 17 deletions
@@ -13,7 +13,7 @@
 // Programatically interact with the Hub

 await createRepo({
-  repo: {type: "model", name: "my-user/nlp-model"},
+  repo: { type: "model", name: "my-user/nlp-model" },
   accessToken: HF_TOKEN
 });

@@ -27,7 +27,7 @@ await uploadFile({
   }
 });

-// Use Inference API
+// Use HF Inference API, or external Inference Providers!

 await inference.chatCompletion({
   model: "meta-llama/Llama-3.1-8B-Instruct",
@@ -39,6 +39,7 @@ await inference.chatCompletion({
   ],
   max_tokens: 512,
   temperature: 0.5,
+  provider: "sambanova", // or together, fal-ai, replicate, …
 });

 await inference.textToImage({
@@ -53,11 +54,13 @@ await inference.textToImage({
 This is a collection of JS libraries to interact with the Hugging Face API, with TS types included.

-- [@huggingface/inference](packages/inference/README.md): Use Inference Endpoints (dedicated) and Inference API (serverless) to make calls to 100,000+ Machine Learning models
+- [@huggingface/inference](packages/inference/README.md): Use Inference API (serverless), Inference Endpoints (dedicated) and third-party Inference providers to make calls to 100,000+ Machine Learning models
 - [@huggingface/hub](packages/hub/README.md): Interact with huggingface.co to create or delete repos and commit / download files
 - [@huggingface/agents](packages/agents/README.md): Interact with HF models through a natural language interface
 - [@huggingface/gguf](packages/gguf/README.md): A GGUF parser that works on remotely hosted files.
+- [@huggingface/dduf](packages/dduf/README.md): Similar package for DDUF (DDUF Diffusers Unified Format)
 - [@huggingface/tasks](packages/tasks/README.md): The definition files and source-of-truth for the Hub's main primitives like pipeline tasks, model libraries, etc.
+- [@huggingface/jinja](packages/jinja/README.md): A minimalistic JS implementation of the Jinja templating engine, to be used for ML chat templates.
 - [@huggingface/space-header](packages/space-header/README.md): Use the Space `mini_header` outside Hugging Face
 - [@huggingface/ollama-utils](packages/ollama-utils/README.md): Various utilities for maintaining Ollama compatibility with models on Hugging Face hub.

@@ -93,7 +96,7 @@ You can run our packages with vanilla JS, without any bundler, by using a CDN or
 ```html
 <script type="module">
-  import { HfInference } from 'https://cdn.jsdelivr.net/npm/@huggingface/inference@2.8.1/+esm';
+  import { HfInference } from 'https://cdn.jsdelivr.net/npm/@huggingface/inference@3.1.2/+esm';
   import { createRepo, commit, deleteRepo, listFiles } from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]/+esm";
 </script>
 ```
@@ -143,6 +146,22 @@ for await (const chunk of inference.chatCompletionStream({
   console.log(chunk.choices[0].delta.content);
 }

+/// Using a third-party provider:
+await inference.chatCompletion({
+  model: "meta-llama/Llama-3.1-8B-Instruct",
+  messages: [{ role: "user", content: "Hello, nice to meet you!" }],
+  max_tokens: 512,
+  provider: "sambanova", // or together, fal-ai, replicate, …
+})
+
+await inference.textToImage({
+  model: "black-forest-labs/FLUX.1-dev",
+  inputs: "a picture of a green bird",
+  provider: "fal-ai",
+})
+
+
 // You can also omit "model" to use the recommended model for the task
 await inference.translation({
   inputs: "My name is Wolfgang and I live in Amsterdam",
@@ -152,28 +171,24 @@ await inference.translation({
   },
 });

-await inference.textToImage({
-  model: 'black-forest-labs/FLUX.1-dev',
-  inputs: 'a picture of a green bird',
-})
-
+// pass multimodal files or URLs as inputs
 await inference.imageToText({
+  model: 'nlpconnect/vit-gpt2-image-captioning',
   data: await (await fetch('https://picsum.photos/300/300')).blob(),
-  model: 'nlpconnect/vit-gpt2-image-captioning',
 })

 // Using your own dedicated inference endpoint: https://hf.co/docs/inference-endpoints/
 const gpt2 = inference.endpoint('https://xyz.eu-west-1.aws.endpoints.huggingface.cloud/gpt2');
 const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the universe is'});

-//Chat Completion
+// Chat Completion
 const llamaEndpoint = inference.endpoint(
   "https://api-inference.huggingface.co/models/meta-llama/Llama-3.1-8B-Instruct"
 );
 const out = await llamaEndpoint.chatCompletion({
-    model: "meta-llama/Llama-3.1-8B-Instruct",
-    messages: [{ role: "user", content: "Hello, nice to meet you!" }],
-    max_tokens: 512,
+  model: "meta-llama/Llama-3.1-8B-Instruct",
+  messages: [{ role: "user", content: "Hello, nice to meet you!" }],
+  max_tokens: 512,
 });
 console.log(out.choices[0].message);
 ```
@@ -186,7 +201,7 @@ import { createRepo, uploadFile, deleteFiles } from "@huggingface/hub";
 const HF_TOKEN = "hf_...";

 await createRepo({
-  repo: "my-user/nlp-model", // or {type: "model", name: "my-user/nlp-test"},
+  repo: "my-user/nlp-model", // or { type: "model", name: "my-user/nlp-test" },
   accessToken: HF_TOKEN
 });

@@ -201,7 +216,7 @@ await uploadFile({
 });

 await deleteFiles({
-  repo: {type: "space", name: "my-user/my-space"}, // or "spaces/my-user/my-space"
+  repo: { type: "space", name: "my-user/my-space" }, // or "spaces/my-user/my-space"
   accessToken: HF_TOKEN,
   paths: ["README.md", ".gitattributes"]
 });
@@ -210,7 +225,7 @@ await deleteFiles({
 ### @huggingface/agents example

 ```ts
-import {HfAgent, LLMFromHub, defaultTools} from '@huggingface/agents';
+import { HfAgent, LLMFromHub, defaultTools } from '@huggingface/agents';

 const HF_TOKEN = "hf_...";
packages/agents/pnpm-lock.yaml

Lines changed: 12 additions & 1 deletion
(Generated file; diff not rendered by default.)

packages/gguf/src/gguf.spec.ts

Lines changed: 5 additions & 0 deletions
@@ -283,4 +283,9 @@ describe("gguf", () => {
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf")).toEqual(undefined); // TODO: investigate IQ3_XS
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q4_0_4_4.gguf")).toEqual("Q4_0"); // TODO: investigate Q4_0_4_4
 	});
+
+	it("calculate tensor data offset", async () => {
+		const { tensorDataOffset } = await gguf(URL_LLAMA);
+		expect(tensorDataOffset).toEqual(741056n);
+	});
 });

packages/gguf/src/gguf.ts

Lines changed: 13 additions & 4 deletions
@@ -10,6 +10,8 @@ export { parseGGUFQuantLabel, GGUF_QUANT_RE, GGUF_QUANT_RE_GLOBAL } from "@huggi

 export const RE_GGUF_FILE = /\.gguf$/;
 export const RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;
+const GGUF_DEFAULT_ALIGNMENT = 32; // defined in ggml.h
+const GGML_PAD = (x: number, n: number) => (x + n - 1) & ~(n - 1); // defined in ggml.h
 const PARALLEL_DOWNLOADS = 20;

 export interface GgufShardFileInfo {
@@ -384,14 +386,18 @@ export async function gguf(
 		});
 	}

+	// calculate absolute offset of tensor data
+	const alignment: number = Number(metadata["general.alignment"] ?? GGUF_DEFAULT_ALIGNMENT);
+	const tensorDataOffset = BigInt(GGML_PAD(offset, alignment));
+
 	if (params?.computeParametersCount) {
 		const parameterCount = tensorInfos
 			.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1))
 			.reduce((acc, val) => acc + val, 0);

-		return { metadata, tensorInfos, parameterCount };
+		return { metadata, tensorInfos, tensorDataOffset, parameterCount };
 	} else {
-		return { metadata, tensorInfos };
+		return { metadata, tensorInfos, tensorDataOffset };
 	}
 }
@@ -429,7 +435,10 @@ export async function ggufAllShards(
 			parameterCount: shards.map(({ parameterCount }) => parameterCount).reduce((acc, val) => acc + val, 0),
 		};
 	} else {
-		const { metadata, tensorInfos, parameterCount } = await gguf(url, { ...params, computeParametersCount: true });
-		return { shards: [{ metadata, tensorInfos }], parameterCount };
+		const { metadata, tensorInfos, tensorDataOffset, parameterCount } = await gguf(url, {
+			...params,
+			computeParametersCount: true,
+		});
+		return { shards: [{ metadata, tensorInfos, tensorDataOffset }], parameterCount };
 	}
 }
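The new `tensorDataOffset` above is just the byte position where header parsing ended, rounded up to the model's alignment. A standalone sketch of that arithmetic (the `741_030` header-end value is hypothetical, chosen here to illustrate rounding up to the `741056n` value the new spec test expects; the real value comes from parsing the GGUF header):

```typescript
// Bit-trick round-up to the next multiple of n (n must be a power of two),
// mirroring the GGML_PAD macro from ggml.h used in the diff above.
const GGML_PAD = (x: number, n: number) => (x + n - 1) & ~(n - 1);

const GGUF_DEFAULT_ALIGNMENT = 32;

// Hypothetical end-of-header offset, not aligned to 32 bytes:
const headerEnd = 741_030;
const tensorDataOffset = BigInt(GGML_PAD(headerEnd, GGUF_DEFAULT_ALIGNMENT));

console.log(tensorDataOffset); // 741056n
```

The bit trick works because for a power-of-two `n`, `~(n - 1)` masks off the low bits, so adding `n - 1` first rounds any non-multiple up rather than down.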

packages/gguf/src/types.ts

Lines changed: 1 addition & 0 deletions
@@ -155,4 +155,5 @@ export interface GGUFTensorInfo {
 export interface GGUFParseOutput<Options extends GGUFMetadataOptions = { strict: true }> {
 	metadata: GGUFMetadata<Options>;
 	tensorInfos: GGUFTensorInfo[];
+	tensorDataOffset: bigint;
 }
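One way a caller might use the new field: since per-tensor offsets in GGUF are relative to the start of the tensor data section, `tensorDataOffset` lets you compute absolute byte positions, e.g. for an HTTP Range request against the remote file. A hedged sketch (the helper name and the byte length are hypothetical; a real byte length would be derived from the tensor's shape and dtype):

```typescript
// Hypothetical helper: build an HTTP Range header value for one tensor's bytes.
// Assumes the tensor's `offset` is relative to the start of the tensor data
// section, as in the GGUF spec; `tensorDataOffset` is the new absolute field.
const tensorRangeHeader = (tensorDataOffset: bigint, offset: bigint, byteLength: bigint): string => {
	const start = tensorDataOffset + offset;
	const end = start + byteLength - 1n; // HTTP Range ends are inclusive
	return `bytes=${start}-${end}`;
};

// e.g. a first tensor starting at the data section, assumed 1024 bytes long:
console.log(tensorRangeHeader(741056n, 0n, 1024n)); // "bytes=741056-742079"
```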

packages/inference/LICENSE

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2022 Tim Mikeladze
+Copyright (c) 2022 Tim Mikeladze and the Hugging Face team

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

packages/inference/README.md

Lines changed: 37 additions & 4 deletions
@@ -1,7 +1,7 @@
-# 🤗 Hugging Face Inference Endpoints
+# 🤗 Hugging Face Inference

-A Typescript powered wrapper for the Hugging Face Inference Endpoints API. Learn more about Inference Endpoints at [Hugging Face](https://huggingface.co/inference-endpoints).
-It works with both [Inference API (serverless)](https://huggingface.co/docs/api-inference/index) and [Inference Endpoints (dedicated)](https://huggingface.co/docs/inference-endpoints/index).
+A Typescript powered wrapper for the Hugging Face Inference API (serverless), Inference Endpoints (dedicated), and third-party Inference Providers.
+It works with [Inference API (serverless)](https://huggingface.co/docs/api-inference/index) and [Inference Endpoints (dedicated)](https://huggingface.co/docs/inference-endpoints/index), and even with supported third-party Inference Providers.

 Check out the [full documentation](https://huggingface.co/docs/huggingface.js/inference/README).

@@ -42,7 +42,40 @@ const hf = new HfInference('your access token')
 Your access token should be kept private. If you need to protect it in front-end applications, we suggest setting up a proxy server that stores the access token.

-#### Tree-shaking
+### Third-party inference providers
+
+You can send inference requests to third-party providers with the inference client.
+
+Currently, we support the following providers: [Fal.ai](https://fal.ai), [Replicate](https://replicate.com), [Together](https://together.xyz) and [Sambanova](https://sambanova.ai).
+
+To send requests to a third-party provider, you have to pass the `provider` parameter to the inference function. Make sure your request is authenticated with an access token.
+```ts
+const accessToken = "hf_..."; // Either a HF access token, or an API key from the third-party provider (Replicate in this example)
+
+const client = new HfInference(accessToken);
+await client.textToImage({
+  provider: "replicate",
+  model: "black-forest-labs/Flux.1-dev",
+  inputs: "A black forest cake"
+})
+```
+
+When authenticated with a Hugging Face access token, the request is routed through https://huggingface.co.
+When authenticated with a third-party provider key, the request is made directly against that provider's inference API.
+
+Only a subset of models are supported when requesting third-party providers. You can check the list of supported models per pipeline tasks here:
+- [Fal.ai supported models](./src/providers/fal-ai.ts)
+- [Replicate supported models](./src/providers/replicate.ts)
+- [Sambanova supported models](./src/providers/sambanova.ts)
+- [Together supported models](./src/providers/together.ts)
+- [HF Inference API (serverless)](https://huggingface.co/models?inference=warm&sort=trending)
+
+**Important note:** To be compatible, the third-party API must adhere to the "standard" shape API we expect on HF model pages for each pipeline task type.
+This is not an issue for LLMs as everyone converged on the OpenAI API anyways, but can be more tricky for other tasks like "text-to-image" or "automatic-speech-recognition" where there exists no standard API. Let us know if any help is needed or if we can make things easier for you!
+
+👋 **Want to add another provider?** Get in touch if you'd like to add support for another Inference provider, and/or request it on https://huggingface.co/spaces/huggingface/HuggingDiscussions/discussions/49
+
+### Tree-shaking

 You can import the functions you need directly from the module instead of using the `HfInference` class.