Skip to content

Commit 1a01de8

Browse files
committed
run all pre-commit hooks except pytest
1 parent 850c5cc commit 1a01de8

File tree

8 files changed

+58
-26
lines changed

8 files changed

+58
-26
lines changed

docling_core/transforms/chunker/base_code_chunker.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Base code chunker implementation for parsing and chunking code files."""
2+
13
from typing import Any, Dict, Iterator, List, Optional, Tuple
24

35
from tree_sitter import Node, Parser, Tree
@@ -58,13 +60,8 @@ def __init__(self, **data):
5860
if self.parser is None:
5961
self.parser = Parser(self.ts_language)
6062

61-
@property
62-
def max_tokens(self) -> int:
63-
"""Get maximum number of tokens allowed."""
64-
return self.tokenizer.get_max_tokens()
65-
6663
def parse_code(self, code: str) -> Tree:
67-
"""Get tree sitter parser"""
64+
"""Get tree sitter parser."""
6865
return self.parser.parse(bytes(code, self.utf8_encoding))
6966

7067
def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[CodeChunk]:
@@ -199,7 +196,10 @@ def _yield_function_chunks_with_ranges(
199196
function_content.replace(docstring, "") if docstring else function_content
200197
)
201198

202-
base_content = f"{prefix}{imports}{module_variable_definitions}{additional_context_no_docstring}{function_no_docstring}"
199+
base_content = (
200+
f"{prefix}{imports}{module_variable_definitions}"
201+
f"{additional_context_no_docstring}{function_no_docstring}"
202+
)
203203

204204
if chunk_builder:
205205
yield chunk_builder.build_function_chunk(
@@ -437,7 +437,6 @@ def find_used_imports(node):
437437

438438
def _get_node_with_comments(self, node: Node) -> str:
439439
"""Get node text including any preceding comments."""
440-
441440
current = node.prev_sibling
442441
comment_parts: List[str] = []
443442

@@ -516,7 +515,7 @@ def _build_additional_context(
516515
return context, context_no_docstring
517516

518517
def _is_docstring(self, node: Node) -> bool:
519-
"""Determines if a node is a docstring"""
518+
"""Determines if a node is a docstring."""
520519
return bool(
521520
node.type == self.expression_statement
522521
and node.named_children

docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Utility classes for code chunking operations."""
2+
13
import hashlib
24
from typing import Iterator, List, Tuple
35

@@ -20,6 +22,7 @@ class RangeTracker:
2022
"""Handles tracking and management of used byte ranges in code."""
2123

2224
def __init__(self):
25+
"""Initialize the range tracker with an empty list of used ranges."""
2326
self.used_ranges: List[Tuple[int, int]] = []
2427

2528
def mark_used(self, start_byte: int, end_byte: int) -> None:
@@ -79,6 +82,7 @@ class ChunkMetadataBuilder:
7982
"""Builds metadata for code chunks."""
8083

8184
def __init__(self, origin: DocumentOrigin):
85+
"""Initialize the metadata builder with document origin."""
8286
self.origin = origin
8387

8488
def build_function_metadata(
@@ -152,6 +156,7 @@ class ChunkBuilder:
152156
"""Builds code chunks from nodes and content."""
153157

154158
def __init__(self, origin: DocumentOrigin):
159+
"""Initialize the chunk builder with document origin."""
155160
self.metadata_builder = ChunkMetadataBuilder(origin)
156161

157162
def build_function_chunk(
@@ -231,6 +236,7 @@ class ChunkSizeProcessor:
231236
def __init__(
232237
self, tokenizer, max_tokens: int, min_chunk_size: int = 300, chunker=None
233238
):
239+
"""Initialize the chunk size processor with tokenizer and size constraints."""
234240
self.tokenizer = tokenizer
235241
self.max_tokens = max_tokens
236242
self.min_chunk_size = min_chunk_size
@@ -322,7 +328,7 @@ def _split_function_chunk(
322328

323329
new_meta = chunk.meta.model_copy()
324330
new_meta.part_name = (
325-
f"{chunk.meta.part_name}_part_{i+1}"
331+
f"{chunk.meta.part_name}_part_{i + 1}"
326332
if len(chunks) > 1
327333
else chunk.meta.part_name
328334
)

docling_core/transforms/chunker/code_chunk_utils/utils.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Utility functions and classes for code language detection and processing."""
2+
13
from enum import Enum
24
from typing import List, Optional
35

@@ -14,13 +16,16 @@
1416

1517

1618
class Language(str, Enum):
19+
"""Supported programming languages for code chunking."""
20+
1721
PYTHON = "python"
1822
JAVASCRIPT = "javascript"
1923
TYPESCRIPT = "typescript"
2024
JAVA = "java"
2125
C = "c"
2226

2327
def file_extensions(self) -> List[str]:
28+
"""Get the file extensions associated with this language."""
2429
if self == Language.PYTHON:
2530
return [".py"]
2631
elif self == Language.TYPESCRIPT:
@@ -35,6 +40,7 @@ def file_extensions(self) -> List[str]:
3540
return []
3641

3742
def get_tree_sitter_language(self):
43+
"""Get the tree-sitter language object for this language."""
3844
if self == Language.PYTHON:
3945
return Lang(ts_python.language())
4046
elif self == Language.TYPESCRIPT:
@@ -49,7 +55,7 @@ def get_tree_sitter_language(self):
4955
return None
5056

5157
def to_code_language_label(self):
52-
58+
"""Convert this language to a CodeLanguageLabel."""
5359
mapping = {
5460
Language.PYTHON: CodeLanguageLabel.PYTHON,
5561
Language.JAVA: CodeLanguageLabel.JAVA,
@@ -60,6 +66,7 @@ def to_code_language_label(self):
6066
return mapping.get(self, CodeLanguageLabel.UNKNOWN)
6167

6268
def get_import_query(self) -> Optional[str]:
69+
"""Get the tree-sitter query string for finding imports in this language."""
6370
if self == Language.PYTHON:
6471
return """
6572
(import_statement) @import
@@ -101,6 +108,7 @@ def get_import_query(self) -> Optional[str]:
101108
return None
102109

103110
def get_function_name(self, node: Node) -> Optional[str]:
111+
"""Extract the function name from a function node."""
104112
if self == Language.C:
105113
declarator = node.child_by_field_name("declarator")
106114
if declarator:
@@ -115,6 +123,7 @@ def get_function_name(self, node: Node) -> Optional[str]:
115123
return None
116124

117125
def is_collectable_function(self, node: Node, constructor_name: str) -> bool:
126+
"""Check if a function should be collected for chunking."""
118127
if self == Language.C:
119128
return True
120129
else:
@@ -126,6 +135,7 @@ def is_collectable_function(self, node: Node, constructor_name: str) -> bool:
126135

127136

128137
def _get_default_tokenizer() -> "BaseTokenizer":
138+
"""Get the default tokenizer instance."""
129139
from docling_core.transforms.chunker.tokenizer.huggingface import (
130140
HuggingFaceTokenizer,
131141
)
@@ -136,17 +146,20 @@ def _get_default_tokenizer() -> "BaseTokenizer":
136146

137147

138148
def has_child(node: Node, child_name: str) -> bool:
149+
"""Check if a node has a child with the specified name."""
139150
return bool(node and node.child_by_field_name(child_name))
140151

141152

142153
def get_children(node: Node, child_types: List[str]) -> List[Node]:
154+
"""Get all children of a node that match the specified types."""
143155
if not node.children:
144156
return []
145157

146158
return [child for child in node.children if child.type in child_types]
147159

148160

149161
def to_str(node: Node) -> str:
162+
"""Convert a tree-sitter node to a string."""
150163
if not node or not node.text:
151164
return ""
152165
text = node.text.decode()

docling_core/transforms/chunker/code_chunking_strategy.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Code chunking strategy implementations for different programming languages."""
2+
13
from typing import Any, Dict, Iterator, Optional
24

35
from docling_core.transforms.chunker.base_code_chunker import _CodeChunker
@@ -30,7 +32,6 @@ class LanguageDetector:
3032
@staticmethod
3133
def detect_from_extension(filename: Optional[str]) -> Optional[Language]:
3234
"""Detect language from file extension."""
33-
3435
if not filename:
3536
return None
3637

@@ -45,7 +46,6 @@ def detect_from_extension(filename: Optional[str]) -> Optional[Language]:
4546
@staticmethod
4647
def detect_from_content(code_text: str) -> Optional[Language]:
4748
"""Detect language from code content using heuristics."""
48-
4949
if not code_text:
5050
return None
5151

@@ -65,7 +65,20 @@ def detect_from_content(code_text: str) -> Optional[Language]:
6565
]
6666
) and not any(
6767
pattern in code_lower
68-
for pattern in ["public class", "private ", "protected ", "package "]
68+
for pattern in [
69+
"public class",
70+
"private ",
71+
"protected ",
72+
"package ",
73+
"package main",
74+
"func main()",
75+
'import "fmt"',
76+
"chan ",
77+
"interface{}",
78+
"go func",
79+
"defer ",
80+
":= ",
81+
]
6982
):
7083
return Language.PYTHON
7184

@@ -169,7 +182,6 @@ def detect_language(
169182
code_text: str, filename: Optional[str] = None
170183
) -> Optional[Language]:
171184
"""Detect language from both filename and content."""
172-
173185
if filename:
174186
lang = LanguageDetector.detect_from_extension(filename)
175187
if lang:
@@ -185,7 +197,6 @@ class CodeChunkingStrategyFactory:
185197
@staticmethod
186198
def create_chunker(language: Language, **kwargs: Any) -> _CodeChunker:
187199
"""Create a language-specific code chunker."""
188-
189200
chunker_map = {
190201
Language.PYTHON: _PythonFunctionChunker,
191202
Language.TYPESCRIPT: _TypeScriptFunctionChunker,
@@ -206,13 +217,11 @@ class DefaultCodeChunkingStrategy:
206217

207218
def __init__(self, **chunker_kwargs: Any):
208219
"""Initialize the strategy with optional chunker parameters."""
209-
210220
self.chunker_kwargs = chunker_kwargs
211221
self._chunker_cache: Dict[Language, _CodeChunker] = {}
212222

213223
def _get_chunker(self, language: Language) -> _CodeChunker:
214224
"""Get or create a chunker for the given language."""
215-
216225
if language not in self._chunker_cache:
217226
self._chunker_cache[language] = CodeChunkingStrategyFactory.create_chunker(
218227
language, **self.chunker_kwargs
@@ -228,7 +237,6 @@ def chunk_code_item(
228237
**kwargs: Any,
229238
) -> Iterator[CodeChunk]:
230239
"""Chunk a single code item using the appropriate language chunker."""
231-
232240
if not code_text.strip():
233241
return
234242

@@ -276,7 +284,6 @@ def chunk_code_item(
276284
**kwargs: Any,
277285
) -> Iterator[CodeChunk]:
278286
"""Return the code as a single chunk without further processing."""
279-
280287
if not code_text.strip():
281288
return
282289

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
Literal,
2020
Optional,
2121
Protocol,
22+
cast,
2223
)
2324

2425
from pydantic import ConfigDict, Field, StringConstraints, field_validator
@@ -134,7 +135,7 @@ class CodeDocMeta(DocMeta):
134135
default="docling_core.transforms.chunker.CodeDocMeta",
135136
alias=_KEY_SCHEMA_NAME,
136137
)
137-
doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS)
138+
doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS) # type: ignore[assignment]
138139
part_name: Optional[str] = Field(default=None)
139140
docstring: Optional[str] = Field(default=None)
140141
sha256: Optional[int] = Field(default=None)
@@ -151,7 +152,7 @@ class CodeChunk(BaseChunk):
151152

152153

153154
class CodeChunkType(str, Enum):
154-
"""Chunk type"""
155+
"""Chunk type."""
155156

156157
FUNCTION = "function"
157158
METHOD = "method"
@@ -317,8 +318,9 @@ def chunk(
317318
LanguageDetector,
318319
)
319320

321+
text_item = cast(Any, item)
320322
language = LanguageDetector.detect_language(
321-
item.text,
323+
text_item.text,
322324
(
323325
getattr(dl_doc.origin, "filename", None)
324326
if dl_doc.origin

docling_core/transforms/chunker/language_code_chunkers.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
"""Language-specific code chunker implementations."""
2+
13
from typing import Any, Dict, List, Tuple
24

35
from pydantic import Field
@@ -31,7 +33,7 @@ class _PythonFunctionChunker(_CodeChunker):
3133
function_body: str = "block"
3234
tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
3335
min_chunk_size: int = 300
34-
max_tokens: int = 50
36+
max_tokens: int = 5000
3537
docs_types: List[str] = ["body", "comment"]
3638
dotted_name: str = "dotted_name"
3739
aliased_import: str = "aliased_import"
@@ -112,6 +114,7 @@ def _find_used_variables(self, function_node: Node) -> set:
112114
used_vars = set()
113115

114116
def collect_identifiers(node, depth=0):
117+
"""Collect identifiers from node."""
115118
" " * depth
116119
if node.type in self.identifiers:
117120
var_name = node.text.decode(self.utf8_encoding)
@@ -365,6 +368,7 @@ def _find_used_variables(self, function_node: Node) -> set:
365368
used_macros = set()
366369

367370
def collect_identifiers(node, depth=0):
371+
"""Collect identifiers from node."""
368372
" " * depth
369373
if node.type in self.identifiers:
370374
macro_name = node.text.decode(self.utf8_encoding)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ namespace_packages = true
136136
show_error_codes = true
137137
python_version = "3.9"
138138
plugins = ["pydantic.mypy"]
139+
exclude = "(^|/)test/data/.*"
139140

140141
[[tool.mypy.overrides]]
141142
module = [

test/test_code_chunker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import os
44
import pathlib
5-
from typing import List
5+
from typing import List, Optional
66

77
import git
88
import pytest
@@ -30,7 +30,7 @@ def get_latest_commit_id(file_dir: str) -> str:
3030

3131

3232
def create_documents_from_repository(
33-
file_dir: str, repo_url: str, commit_id: str = None
33+
file_dir: str, repo_url: str, commit_id: Optional[str] = None
3434
) -> List[DoclingDocument]:
3535
"""Build DoclingDocument objects from a local checkout, one per code file."""
3636

0 commit comments

Comments (0)