Skip to content

Commit 1a01de8

Browse files
committed
run all pre-commit hooks except pytest
1 parent 850c5cc commit 1a01de8

File tree

8 files changed

+58
-26
lines changed

8 files changed

+58
-26
lines changed

docling_core/transforms/chunker/base_code_chunker.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Base code chunker implementation for parsing and chunking code files."""
2+
13
from typing import Any, Dict, Iterator, List, Optional, Tuple
24

35
from tree_sitter import Node, Parser, Tree
@@ -58,13 +60,8 @@ def __init__(self, **data):
5860
if self.parser is None:
5961
self.parser = Parser(self.ts_language)
6062

61-
@property
62-
def max_tokens(self) -> int:
63-
"""Get maximum number of tokens allowed."""
64-
return self.tokenizer.get_max_tokens()
65-
6663
def parse_code(self, code: str) -> Tree:
67-
"""Get tree sitter parser"""
64+
"""Get tree sitter parser."""
6865
return self.parser.parse(bytes(code, self.utf8_encoding))
6966

7067
def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[CodeChunk]:
@@ -199,7 +196,10 @@ def _yield_function_chunks_with_ranges(
199196
function_content.replace(docstring, "") if docstring else function_content
200197
)
201198

202-
base_content = f"{prefix}{imports}{module_variable_definitions}{additional_context_no_docstring}{function_no_docstring}"
199+
base_content = (
200+
f"{prefix}{imports}{module_variable_definitions}"
201+
f"{additional_context_no_docstring}{function_no_docstring}"
202+
)
203203

204204
if chunk_builder:
205205
yield chunk_builder.build_function_chunk(
@@ -437,7 +437,6 @@ def find_used_imports(node):
437437

438438
def _get_node_with_comments(self, node: Node) -> str:
439439
"""Get node text including any preceding comments."""
440-
441440
current = node.prev_sibling
442441
comment_parts: List[str] = []
443442

@@ -516,7 +515,7 @@ def _build_additional_context(
516515
return context, context_no_docstring
517516

518517
def _is_docstring(self, node: Node) -> bool:
519-
"""Determines if a node is a docstring"""
518+
"""Determines if a node is a docstring."""
520519
return bool(
521520
node.type == self.expression_statement
522521
and node.named_children

docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Utility classes for code chunking operations."""
2+
13
import hashlib
24
from typing import Iterator, List, Tuple
35

@@ -20,6 +22,7 @@ class RangeTracker:
2022
"""Handles tracking and management of used byte ranges in code."""
2123

2224
def __init__(self):
25+
"""Initialize the range tracker with an empty list of used ranges."""
2326
self.used_ranges: List[Tuple[int, int]] = []
2427

2528
def mark_used(self, start_byte: int, end_byte: int) -> None:
@@ -79,6 +82,7 @@ class ChunkMetadataBuilder:
7982
"""Builds metadata for code chunks."""
8083

8184
def __init__(self, origin: DocumentOrigin):
85+
"""Initialize the metadata builder with document origin."""
8286
self.origin = origin
8387

8488
def build_function_metadata(
@@ -152,6 +156,7 @@ class ChunkBuilder:
152156
"""Builds code chunks from nodes and content."""
153157

154158
def __init__(self, origin: DocumentOrigin):
159+
"""Initialize the chunk builder with document origin."""
155160
self.metadata_builder = ChunkMetadataBuilder(origin)
156161

157162
def build_function_chunk(
@@ -231,6 +236,7 @@ class ChunkSizeProcessor:
231236
def __init__(
232237
self, tokenizer, max_tokens: int, min_chunk_size: int = 300, chunker=None
233238
):
239+
"""Initialize the chunk size processor with tokenizer and size constraints."""
234240
self.tokenizer = tokenizer
235241
self.max_tokens = max_tokens
236242
self.min_chunk_size = min_chunk_size
@@ -322,7 +328,7 @@ def _split_function_chunk(
322328

323329
new_meta = chunk.meta.model_copy()
324330
new_meta.part_name = (
325-
f"{chunk.meta.part_name}_part_{i+1}"
331+
f"{chunk.meta.part_name}_part_{i + 1}"
326332
if len(chunks) > 1
327333
else chunk.meta.part_name
328334
)

docling_core/transforms/chunker/code_chunk_utils/utils.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Utility functions and classes for code language detection and processing."""
2+
13
from enum import Enum
24
from typing import List, Optional
35

@@ -14,13 +16,16 @@
1416

1517

1618
class Language(str, Enum):
19+
"""Supported programming languages for code chunking."""
20+
1721
PYTHON = "python"
1822
JAVASCRIPT = "javascript"
1923
TYPESCRIPT = "typescript"
2024
JAVA = "java"
2125
C = "c"
2226

2327
def file_extensions(self) -> List[str]:
28+
"""Get the file extensions associated with this language."""
2429
if self == Language.PYTHON:
2530
return [".py"]
2631
elif self == Language.TYPESCRIPT:
@@ -35,6 +40,7 @@ def file_extensions(self) -> List[str]:
3540
return []
3641

3742
def get_tree_sitter_language(self):
43+
"""Get the tree-sitter language object for this language."""
3844
if self == Language.PYTHON:
3945
return Lang(ts_python.language())
4046
elif self == Language.TYPESCRIPT:
@@ -49,7 +55,7 @@ def get_tree_sitter_language(self):
4955
return None
5056

5157
def to_code_language_label(self):
52-
58+
"""Convert this language to a CodeLanguageLabel."""
5359
mapping = {
5460
Language.PYTHON: CodeLanguageLabel.PYTHON,
5561
Language.JAVA: CodeLanguageLabel.JAVA,
@@ -60,6 +66,7 @@ def to_code_language_label(self):
6066
return mapping.get(self, CodeLanguageLabel.UNKNOWN)
6167

6268
def get_import_query(self) -> Optional[str]:
69+
"""Get the tree-sitter query string for finding imports in this language."""
6370
if self == Language.PYTHON:
6471
return """
6572
(import_statement) @import
@@ -101,6 +108,7 @@ def get_import_query(self) -> Optional[str]:
101108
return None
102109

103110
def get_function_name(self, node: Node) -> Optional[str]:
111+
"""Extract the function name from a function node."""
104112
if self == Language.C:
105113
declarator = node.child_by_field_name("declarator")
106114
if declarator:
@@ -115,6 +123,7 @@ def get_function_name(self, node: Node) -> Optional[str]:
115123
return None
116124

117125
def is_collectable_function(self, node: Node, constructor_name: str) -> bool:
126+
"""Check if a function should be collected for chunking."""
118127
if self == Language.C:
119128
return True
120129
else:
@@ -126,6 +135,7 @@ def is_collectable_function(self, node: Node, constructor_name: str) -> bool:
126135

127136

128137
def _get_default_tokenizer() -> "BaseTokenizer":
138+
"""Get the default tokenizer instance."""
129139
from docling_core.transforms.chunker.tokenizer.huggingface import (
130140
HuggingFaceTokenizer,
131141
)
@@ -136,17 +146,20 @@ def _get_default_tokenizer() -> "BaseTokenizer":
136146

137147

138148
def has_child(node: Node, child_name: str) -> bool:
149+
"""Check if a node has a child with the specified name."""
139150
return bool(node and node.child_by_field_name(child_name))
140151

141152

142153
def get_children(node: Node, child_types: List[str]) -> List[Node]:
154+
"""Get all children of a node that match the specified types."""
143155
if not node.children:
144156
return []
145157

146158
return [child for child in node.children if child.type in child_types]
147159

148160

149161
def to_str(node: Node) -> str:
162+
"""Convert a tree-sitter node to a string."""
150163
if not node or not node.text:
151164
return ""
152165
text = node.text.decode()

docling_core/transforms/chunker/code_chunking_strategy.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Code chunking strategy implementations for different programming languages."""
2+
13
from typing import Any, Dict, Iterator, Optional
24

35
from docling_core.transforms.chunker.base_code_chunker import _CodeChunker
@@ -30,7 +32,6 @@ class LanguageDetector:
3032
@staticmethod
3133
def detect_from_extension(filename: Optional[str]) -> Optional[Language]:
3234
"""Detect language from file extension."""
33-
3435
if not filename:
3536
return None
3637

@@ -45,7 +46,6 @@ def detect_from_extension(filename: Optional[str]) -> Optional[Language]:
4546
@staticmethod
4647
def detect_from_content(code_text: str) -> Optional[Language]:
4748
"""Detect language from code content using heuristics."""
48-
4949
if not code_text:
5050
return None
5151

@@ -65,7 +65,20 @@ def detect_from_content(code_text: str) -> Optional[Language]:
6565
]
6666
) and not any(
6767
pattern in code_lower
68-
for pattern in ["public class", "private ", "protected ", "package "]
68+
for pattern in [
69+
"public class",
70+
"private ",
71+
"protected ",
72+
"package ",
73+
"package main",
74+
"func main()",
75+
'import "fmt"',
76+
"chan ",
77+
"interface{}",
78+
"go func",
79+
"defer ",
80+
":= ",
81+
]
6982
):
7083
return Language.PYTHON
7184

@@ -169,7 +182,6 @@ def detect_language(
169182
code_text: str, filename: Optional[str] = None
170183
) -> Optional[Language]:
171184
"""Detect language from both filename and content."""
172-
173185
if filename:
174186
lang = LanguageDetector.detect_from_extension(filename)
175187
if lang:
@@ -185,7 +197,6 @@ class CodeChunkingStrategyFactory:
185197
@staticmethod
186198
def create_chunker(language: Language, **kwargs: Any) -> _CodeChunker:
187199
"""Create a language-specific code chunker."""
188-
189200
chunker_map = {
190201
Language.PYTHON: _PythonFunctionChunker,
191202
Language.TYPESCRIPT: _TypeScriptFunctionChunker,
@@ -206,13 +217,11 @@ class DefaultCodeChunkingStrategy:
206217

207218
def __init__(self, **chunker_kwargs: Any):
208219
"""Initialize the strategy with optional chunker parameters."""
209-
210220
self.chunker_kwargs = chunker_kwargs
211221
self._chunker_cache: Dict[Language, _CodeChunker] = {}
212222

213223
def _get_chunker(self, language: Language) -> _CodeChunker:
214224
"""Get or create a chunker for the given language."""
215-
216225
if language not in self._chunker_cache:
217226
self._chunker_cache[language] = CodeChunkingStrategyFactory.create_chunker(
218227
language, **self.chunker_kwargs
@@ -228,7 +237,6 @@ def chunk_code_item(
228237
**kwargs: Any,
229238
) -> Iterator[CodeChunk]:
230239
"""Chunk a single code item using the appropriate language chunker."""
231-
232240
if not code_text.strip():
233241
return
234242

@@ -276,7 +284,6 @@ def chunk_code_item(
276284
**kwargs: Any,
277285
) -> Iterator[CodeChunk]:
278286
"""Return the code as a single chunk without further processing."""
279-
280287
if not code_text.strip():
281288
return
282289

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
Literal,
2020
Optional,
2121
Protocol,
22+
cast,
2223
)
2324

2425
from pydantic import ConfigDict, Field, StringConstraints, field_validator
@@ -134,7 +135,7 @@ class CodeDocMeta(DocMeta):
134135
default="docling_core.transforms.chunker.CodeDocMeta",
135136
alias=_KEY_SCHEMA_NAME,
136137
)
137-
doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS)
138+
doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS) # type: ignore[assignment]
138139
part_name: Optional[str] = Field(default=None)
139140
docstring: Optional[str] = Field(default=None)
140141
sha256: Optional[int] = Field(default=None)
@@ -151,7 +152,7 @@ class CodeChunk(BaseChunk):
151152

152153

153154
class CodeChunkType(str, Enum):
154-
"""Chunk type"""
155+
"""Chunk type."""
155156

156157
FUNCTION = "function"
157158
METHOD = "method"
@@ -317,8 +318,9 @@ def chunk(
317318
LanguageDetector,
318319
)
319320

321+
text_item = cast(Any, item)
320322
language = LanguageDetector.detect_language(
321-
item.text,
323+
text_item.text,
322324
(
323325
getattr(dl_doc.origin, "filename", None)
324326
if dl_doc.origin

docling_core/transforms/chunker/language_code_chunkers.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Language-specific code chunker implementations."""
2+
13
from typing import Any, Dict, List, Tuple
24

35
from pydantic import Field
@@ -31,7 +33,7 @@ class _PythonFunctionChunker(_CodeChunker):
3133
function_body: str = "block"
3234
tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
3335
min_chunk_size: int = 300
34-
max_tokens: int = 50
36+
max_tokens: int = 5000
3537
docs_types: List[str] = ["body", "comment"]
3638
dotted_name: str = "dotted_name"
3739
aliased_import: str = "aliased_import"
@@ -112,6 +114,7 @@ def _find_used_variables(self, function_node: Node) -> set:
112114
used_vars = set()
113115

114116
def collect_identifiers(node, depth=0):
117+
"""Collect identifiers from node."""
115118
" " * depth
116119
if node.type in self.identifiers:
117120
var_name = node.text.decode(self.utf8_encoding)
@@ -365,6 +368,7 @@ def _find_used_variables(self, function_node: Node) -> set:
365368
used_macros = set()
366369

367370
def collect_identifiers(node, depth=0):
371+
"""Collect identifiers from node."""
368372
" " * depth
369373
if node.type in self.identifiers:
370374
macro_name = node.text.decode(self.utf8_encoding)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ namespace_packages = true
136136
show_error_codes = true
137137
python_version = "3.9"
138138
plugins = ["pydantic.mypy"]
139+
exclude = "(^|/)test/data/.*"
139140

140141
[[tool.mypy.overrides]]
141142
module = [

test/test_code_chunker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import os
44
import pathlib
5-
from typing import List
5+
from typing import List, Optional
66

77
import git
88
import pytest
@@ -30,7 +30,7 @@ def get_latest_commit_id(file_dir: str) -> str:
3030

3131

3232
def create_documents_from_repository(
33-
file_dir: str, repo_url: str, commit_id: str = None
33+
file_dir: str, repo_url: str, commit_id: Optional[str] = None
3434
) -> List[DoclingDocument]:
3535
"""Build DoclingDocument objects from a local checkout, one per code file."""
3636

0 commit comments

Comments (0)