1+ """Utility functions and classes for code language detection and processing."""
2+
13from enum import Enum
24from typing import List , Optional
35
1416
1517
1618class Language (str , Enum ):
19+ """Supported programming languages for code chunking."""
20+
1721 PYTHON = "python"
1822 JAVASCRIPT = "javascript"
1923 TYPESCRIPT = "typescript"
2024 JAVA = "java"
2125 C = "c"
2226
2327 def file_extensions (self ) -> List [str ]:
28+ """Get the file extensions associated with this language."""
2429 if self == Language .PYTHON :
2530 return [".py" ]
2631 elif self == Language .TYPESCRIPT :
@@ -35,6 +40,7 @@ def file_extensions(self) -> List[str]:
3540 return []
3641
3742 def get_tree_sitter_language (self ):
43+ """Get the tree-sitter language object for this language."""
3844 if self == Language .PYTHON :
3945 return Lang (ts_python .language ())
4046 elif self == Language .TYPESCRIPT :
@@ -49,7 +55,7 @@ def get_tree_sitter_language(self):
4955 return None
5056
5157 def to_code_language_label (self ):
52-
58+ """Convert this language to a CodeLanguageLabel."""
5359 mapping = {
5460 Language .PYTHON : CodeLanguageLabel .PYTHON ,
5561 Language .JAVA : CodeLanguageLabel .JAVA ,
@@ -60,6 +66,7 @@ def to_code_language_label(self):
6066 return mapping .get (self , CodeLanguageLabel .UNKNOWN )
6167
6268 def get_import_query (self ) -> Optional [str ]:
69+ """Get the tree-sitter query string for finding imports in this language."""
6370 if self == Language .PYTHON :
6471 return """
6572 (import_statement) @import
@@ -101,6 +108,7 @@ def get_import_query(self) -> Optional[str]:
101108 return None
102109
103110 def get_function_name (self , node : Node ) -> Optional [str ]:
111+ """Extract the function name from a function node."""
104112 if self == Language .C :
105113 declarator = node .child_by_field_name ("declarator" )
106114 if declarator :
@@ -115,6 +123,7 @@ def get_function_name(self, node: Node) -> Optional[str]:
115123 return None
116124
117125 def is_collectable_function (self , node : Node , constructor_name : str ) -> bool :
126+ """Check if a function should be collected for chunking."""
118127 if self == Language .C :
119128 return True
120129 else :
@@ -126,6 +135,7 @@ def is_collectable_function(self, node: Node, constructor_name: str) -> bool:
126135
127136
128137def _get_default_tokenizer () -> "BaseTokenizer" :
138+ """Get the default tokenizer instance."""
129139 from docling_core .transforms .chunker .tokenizer .huggingface import (
130140 HuggingFaceTokenizer ,
131141 )
@@ -136,17 +146,20 @@ def _get_default_tokenizer() -> "BaseTokenizer":
136146
137147
def has_child(node: Node, child_name: str) -> bool:
    """Report whether ``node`` exposes a child under the field ``child_name``.

    Args:
        node: Node to inspect; a falsy value (e.g. ``None``) yields ``False``.
        child_name: Field name passed to ``node.child_by_field_name``.

    Returns:
        ``True`` when ``node`` is truthy and the field lookup returns a
        truthy value, ``False`` otherwise.
    """
    # Guard first so the lookup is never attempted on a missing node.
    if not node:
        return False
    found = node.child_by_field_name(child_name)
    return bool(found)
140151
141152
def get_children(node: Node, child_types: List[str]) -> List[Node]:
    """Collect the direct children of ``node`` whose type is in ``child_types``.

    Args:
        node: Node whose ``children`` attribute is scanned.
        child_types: Type names to keep; membership is tested against each
            child's ``type`` attribute.

    Returns:
        A new list of the matching children in their original order, or an
        empty list when ``node.children`` is empty or otherwise falsy.
    """
    matches = []
    # ``or ()`` covers a falsy ``children`` value so the loop simply does not run.
    for candidate in node.children or ():
        if candidate.type in child_types:
            matches.append(candidate)
    return matches
147159
148160
149161def to_str (node : Node ) -> str :
162+ """Convert a tree-sitter node to a string."""
150163 if not node or not node .text :
151164 return ""
152165 text = node .text .decode ()
0 commit comments