Commit fdb0650

feat: add DetectProgrammingLanguage function (#1165)

1 parent c4f24b1 commit fdb0650

8 files changed: +233 −101 lines changed
docs/docs/ops/functions.md

Lines changed: 88 additions & 77 deletions
Input data:

* `text` (*Str*): The source text to parse.
* `language` (*Optional[Str]*, default: `"json"`): The language of the source text. Only `json` is supported now.

Return: *Json*, the parsed JSON object.
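For `language="json"`, the only supported option today, the behavior is conceptually that of a standard JSON parser. A plain-Python sketch (the sample input is ours, not from the docs):

```python
import json

# ParseJson conceptually maps a JSON source string to a parsed object,
# much like json.loads does in plain Python.
source_text = '{"name": "cocoindex", "functions": ["ParseJson", "SplitRecursively"]}'
parsed = json.loads(source_text)  # the *Json* return value is this parsed object
```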

## DetectProgrammingLanguage

`DetectProgrammingLanguage` detects the programming language of a file based on its filename extension.

Input data:

* `filename` (*Str*): The filename (with extension) to detect the language for.

Return: *Str* or *Null*. Returns the programming language name if the file extension is recognized, or *Null* if the extension is not supported.

The returned string values match the language names listed in [`tree-sitter-language-pack`](https://github.com/Goldziher/tree-sitter-language-pack?tab=readme-ov-file#available-languages).
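Conceptually, the function is an extension-to-name lookup. A minimal illustration (the mapping below covers only a few sample entries and is not CocoIndex's actual table):

```python
import os
from typing import Optional

# A few sample entries; the real function recognizes every language in
# tree-sitter-language-pack.
_EXT_TO_LANGUAGE = {
    ".py": "python",
    ".js": "javascript",
    ".rs": "rust",
    ".md": "markdown",
}

def detect_programming_language(filename: str) -> Optional[str]:
    """Return the language name for a filename, or None if unrecognized."""
    ext = os.path.splitext(filename)[1].lower()
    return _EXT_TO_LANGUAGE.get(ext)
```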
## SplitRecursively

`SplitRecursively` splits a document into chunks of a given size.
For example, for a Markdown file, it identifies boundaries in this order: level-…

The spec takes the following fields:

* `custom_languages` (`list[CustomLanguageSpec]`, optional): Allows you to customize how specific languages are chunked, using regular expressions. Each `CustomLanguageSpec` is a dict with the following fields:
    * `language_name` (`str`): Name of the language.
    * `aliases` (`list[str]`, optional): A list of aliases for the language.
      It's an error if any language name or alias is duplicated.
    * `separators_regex` (`list[str]`): A list of regex patterns to split the text.
      Higher-level boundaries should come first, and lower-level ones later, e.g. `[r"\n# ", r"\n## ", r"\n\n", r"\. "]`.
      See [regex syntax](https://docs.rs/regex/latest/regex/#syntax) for supported regular expression syntax.
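For instance, a custom entry for chunking changelog-style files at release headings first, then blank lines, then sentence boundaries might look like this (the language name, alias, and patterns are hypothetical, not built into CocoIndex):

```python
# Hypothetical CustomLanguageSpec entry, written as a plain dict:
changelog_language = {
    "language_name": "changelog",
    "aliases": ["chlog"],
    # Highest-level boundary first, lowest-level last.
    "separators_regex": [r"\n## ", r"\n\n", r"\. "],
}
```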

Input data:

* `text` (*Str*): The text to split.
* `chunk_size` (*Int64*): The maximum size of each chunk, in bytes.
* `min_chunk_size` (*Int64*, default: `chunk_size / 2`): The minimum size of each chunk, in bytes.

:::note

…

:::

* `chunk_overlap` (*Optional[Int64]*, default: *None*): The maximum overlap size between adjacent chunks, in bytes.
* `language` (*Str*, default: `""`): The language of the document.
  Can be a language name (e.g. `python`, `javascript`, `markdown`) or a file extension (e.g. `.py`, `.js`, `.md`).

:::note

We use the `language` field to determine how to split the input text, following these rules:

* We match the input `language` field against the following registries, in order:
    * `custom_languages` in the spec, against the `language_name` or `aliases` field of each entry.
    * Built-in languages (see the [Supported Languages](#supported-languages) section below), against the language name, aliases, or file extensions of each entry.

  All matches are case-insensitive.

* If no match is found, the input is treated as plain text.

:::
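The matching order above can be sketched as a two-stage, case-insensitive lookup (function and registry names here are hypothetical, not CocoIndex internals):

```python
from typing import Optional

def resolve_language(
    language: str,
    custom: dict[str, str],    # lowercase name/alias -> custom language name
    builtin: dict[str, str],   # lowercase name/alias/extension -> built-in name
) -> Optional[str]:
    """Check custom languages first, then built-in ones; None means plain text."""
    key = language.lower()
    return custom.get(key) or builtin.get(key)
```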

Return: [*KTable*](/docs/core/data_types#ktable). Each row represents a chunk, with the following sub fields:

* `location` (*Range*): The location of the chunk.
* `text` (*Str*): The text of the chunk.
* `start` / `end` (*Struct*): Details about the start position (inclusive) and end position (exclusive) of the chunk. They have the following sub fields:
    * `offset` (*Int64*): The byte offset of the position.
    * `line` (*Int64*): The line number of the position, starting from 1.
    * `column` (*Int64*): The column number of the position, starting from 1.
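The splitting strategy itself can be sketched as: split at the highest-level separator, then re-split any piece still over `chunk_size` with the next, lower-level separator. A simplified sketch (it ignores `min_chunk_size` and `chunk_overlap`, and is not CocoIndex's implementation):

```python
import re

def split_recursively(text: str, separators: list[str], chunk_size: int) -> list[str]:
    """Split text at the first separator; recurse into oversized pieces."""
    if len(text.encode()) <= chunk_size or not separators:
        return [text]
    chunks: list[str] = []
    # A lookahead split keeps each boundary (e.g. "\n## ") attached to the
    # piece it introduces.
    for piece in re.split(f"(?={separators[0]})", text):
        if piece:
            chunks.extend(split_recursively(piece, separators[1:], chunk_size))
    return chunks
```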

### Supported Languages

Currently, `SplitRecursively` supports the following languages:

| Language | Aliases | File Extensions |
|----------|---------|-----------------|
| c | | `.c` |
| cpp | c++ | `.cpp`, `.cc`, `.cxx`, `.h`, `.hpp` |
| csharp | csharp, cs | `.cs` |
| css | | `.css`, `.scss` |
| dtd | | `.dtd` |
| fortran | f, f90, f95, f03 | `.f`, `.f90`, `.f95`, `.f03` |
| go | golang | `.go` |
| html | | `.html`, `.htm` |
| java | | `.java` |
| javascript | js | `.js` |
| json | | `.json` |
| kotlin | | `.kt`, `.kts` |
| markdown | md | `.md`, `.mdx` |
| pascal | pas, dpr, delphi | `.pas`, `.dpr` |
| php | | `.php` |
| python | | `.py` |
| r | | `.r` |
| ruby | | `.rb` |
| rust | rs | `.rs` |
| scala | | `.scala` |
| solidity | | `.sol` |
| sql | | `.sql` |
| swift | | `.swift` |
| toml | | `.toml` |
| tsx | | `.tsx` |
| typescript | ts | `.ts` |
| xml | | `.xml` |
| yaml | | `.yaml`, `.yml` |

## SentenceTransformerEmbed

This function requires the `sentence-transformers` library, which is an optional dependency:

```bash
pip install 'cocoindex[embeddings]'
```

:::

The spec takes the following fields:

* `model` (`str`): The name of the SentenceTransformer model to use.
* `args` (`dict[str, Any]`, optional): Additional arguments to pass to the SentenceTransformer constructor, e.g. `{"trust_remote_code": True}`.

Input data:

* `text` (*Str*): The text to embed.

Return: *Vector[Float32, N]*, where *N* is determined by the model.

## ExtractByLlm

`ExtractByLlm` extracts structured information from a text using a specified LLM. The spec takes the following fields:

* `llm_spec` (`cocoindex.LlmSpec`): The specification of the LLM to use. See [LLM Spec](/docs/ai/llm#llm-spec) for more details.
* `output_type` (`type`): The type of the output, e.g. a dataclass type name. See [Data Types](/docs/core/data_types) for all supported data types. The LLM will output values that match the schema of the type.
* `instruction` (`str`, optional): Additional instruction for the LLM.

:::tip Clear type definitions

The definition of the `output_type` is fed into the LLM as guidance to generate the output.
To improve the quality of the extracted information, giving clear definitions for your dataclasses is especially important, e.g.

* Provide readable field names for your dataclasses.
* Provide reasonable docstrings for your dataclasses.
* For any optional fields, clearly annotate that they are optional, by `SomeType | None` or `typing.Optional[SomeType]`.

:::

Input data:

* `text` (*Str*): The text to extract information from.

Return: As specified by the `output_type` field in the spec: the extracted information from the input text.
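Following the tip above, an `output_type` dataclass might look like this (the schema is a hypothetical example, not from the docs):

```python
import dataclasses
from typing import Optional

@dataclasses.dataclass
class PersonInfo:
    """A person mentioned in the input text."""

    name: str                  # Full name, exactly as written in the text
    occupation: Optional[str]  # Occupation if stated, otherwise None
```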

## EmbedText

The spec takes the following fields:

* `api_type` ([`cocoindex.LlmApiType`](/docs/ai/llm#llm-api-types)): The type of LLM API to use for embedding.
* `model` (`str`): The name of the embedding model to use.
* `address` (`str`, optional): The address of the LLM API. If not specified, uses the default address for the API type.
* `output_dimension` (`int`, optional): The expected dimension of the output embedding vector. If not specified, uses the default dimension of the model.

  For most API types, the function internally keeps a registry of the default output dimensions of known models.
  You need to explicitly specify `output_dimension` if you want to use a new model that is not in the registry yet.

* `task_type` (`str`, optional): The task type for embedding, used by some embedding models to optimize the embedding for specific use cases.

:::note Supported APIs for Text Embedding

Not all LLM APIs support text embedding. See the [LLM API Types table](/docs/ai/llm#llm-api-types).

:::

Input data:

* `text` (*Str*): The text to embed.

Return: *Vector[Float32, N]*, where *N* is the dimension of the embedding vector determined by the model.
## ColPali Functions

ColPali functions enable multimodal document retrieval using ColVision models. These functions support ALL models available in the [colpali-engine library](https://github.com/illuin-tech/colpali), including:

* **ColPali models** (colpali-*): PaliGemma-based, best for general document retrieval
* **ColQwen2 models** (colqwen-*): Qwen2-VL-based, excellent for multilingual text (29+ languages) and general vision
* **ColSmol models** (colsmol-*): Lightweight, good for resource-constrained environments
* Any future ColVision models supported by colpali-engine

These models use late interaction between image patch embeddings and text token embeddings for retrieval.

These functions require the `colpali-engine` library, which is an optional dependency:

```bash
pip install 'cocoindex[colpali]'
```

:::

### ColPaliEmbedImage

The spec takes the following fields:

* `model` (`str`): Any ColVision model name supported by colpali-engine (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0"). See the [complete list of supported models](https://github.com/illuin-tech/colpali#list-of-colvision-models).

Input data:

* `img_bytes` (*Bytes*): The image data in bytes format.

Return: *Vector[Vector[Float32, N]]*, where *N* is the hidden dimension determined by the model. This is a multi-vector format with a variable number of patches and a fixed hidden dimension.

### ColPaliEmbedQuery

This produces query embeddings compatible with ColVision image embeddings for late interaction.

The spec takes the following fields:

* `model` (`str`): Any ColVision model name supported by colpali-engine (e.g., "vidore/colpali-v1.2", "vidore/colqwen2.5-v0.2", "vidore/colsmol-v1.0"). See the [complete list of supported models](https://github.com/illuin-tech/colpali#list-of-colvision-models).

Input data:

* `query` (*Str*): The text query to embed.

Return: *Vector[Vector[Float32, N]]*, where *N* is the hidden dimension determined by the model. This is a multi-vector format with a variable number of tokens and a fixed hidden dimension.
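Late interaction scores a query against an image by matching each query-token vector to its most similar image-patch vector and summing the maxima (often called MaxSim). A dependency-free sketch over plain lists (illustrative only, not part of CocoIndex's API):

```python
def maxsim_score(query_emb: list[list[float]], image_emb: list[list[float]]) -> float:
    """query_emb: token vectors, shaped like ColPaliEmbedQuery's output;
    image_emb: patch vectors, shaped like ColPaliEmbedImage's output."""
    def dot(u: list[float], v: list[float]) -> float:
        return sum(a * b for a, b in zip(u, v))
    # For each query token, keep only its best-matching patch, then sum.
    return sum(max(dot(q, p) for p in image_emb) for q in query_emb)
```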

examples/code_embedding/main.py

Lines changed: 4 additions & 9 deletions
```diff
@@ -1,20 +1,13 @@
 from dotenv import load_dotenv
 from psycopg_pool import ConnectionPool
 from pgvector.psycopg import register_vector
-from typing import Any
 import functools
 import cocoindex
 import os
 from numpy.typing import NDArray
 import numpy as np


-@cocoindex.op.function()
-def extract_extension(filename: str) -> str:
-    """Extract the extension of a filename."""
-    return os.path.splitext(filename)[1]
-
-
 @cocoindex.transform_flow()
 def code_to_embedding(
     text: cocoindex.DataSlice[str],
@@ -53,10 +46,12 @@ def code_embedding_flow(
     code_embeddings = data_scope.add_collector()

     with data_scope["files"].row() as file:
-        file["extension"] = file["filename"].transform(extract_extension)
+        file["language"] = file["filename"].transform(
+            cocoindex.functions.DetectProgrammingLanguage()
+        )
         file["chunks"] = file["content"].transform(
             cocoindex.functions.SplitRecursively(),
-            language=file["extension"],
+            language=file["language"],
             chunk_size=1000,
             min_chunk_size=300,
             chunk_overlap=300,
```

python/cocoindex/functions/__init__.py

Lines changed: 5 additions & 10 deletions
```diff
@@ -5,13 +5,7 @@
 """

 # Import all engine builtin function specs
-from ._engine_builtin_specs import (
-    ParseJson,
-    SplitRecursively,
-    SplitBySeparators,
-    EmbedText,
-    ExtractByLlm,
-)
+from ._engine_builtin_specs import *

 # Import SentenceTransformer embedding functionality
 from .sbert import (
@@ -29,11 +23,12 @@

 __all__ = [
     # Engine builtin specs
-    "ParseJson",
-    "SplitRecursively",
-    "SplitBySeparators",
+    "DetectProgrammingLanguage",
     "EmbedText",
     "ExtractByLlm",
+    "ParseJson",
+    "SplitBySeparators",
+    "SplitRecursively",
     # SentenceTransformer
     "SentenceTransformerEmbed",
     "SentenceTransformerEmbedExecutor",
```

python/cocoindex/functions/_engine_builtin_specs.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -19,6 +19,10 @@ class CustomLanguageSpec:
     aliases: list[str] = dataclasses.field(default_factory=list)


+class DetectProgrammingLanguage(op.FunctionSpec):
+    """Detect the programming language of a file."""
+
+
 class SplitRecursively(op.FunctionSpec):
     """Split a document (in string) recursively."""
```