perf: replace python-dotenv with custom fast parser - 5x faster cold start

vilsonrodrigues · web-flow · commit 22a20e18630f · 2025-12-03T01:48:11.000Z
diff --git a/README.md b/README.md
@@ -23,9 +23,9 @@
 
 - ✅ **High performance** - Built on msgspec for speed
 - ✅ **Type-safe** - Full type hints and validation
-- ✅ **.env support** - Automatic loading from .env files via python-dotenv
+- ✅ **.env support** - Fast built-in .env parser (no extra dependency such as python-dotenv)
 - ✅ **Nested settings** - Support for complex configuration structures
-- ✅ **Minimal dependencies** - Only msgspec and python-dotenv
+- ✅ **Minimal dependencies** - Only msgspec required
 - ✅ **Familiar API** - Easy to learn if you've used settings libraries before
 
 ## Installation
@@ -144,31 +144,31 @@ msgspec-ext provides a **faster, lighter alternative** to pydantic-settings whil
 
 ### Performance Comparison
 
-**First-time load** (what you'll see when testing):
+**Cold start** (first load, includes .env parsing):
 
 | Library | Time per load | Speed |
 |---------|---------------|-------|
-| **msgspec-ext** | **1.818ms** | **1.5x faster** ⚡ |
-| pydantic-settings | 2.814ms | Baseline |
+| **msgspec-ext** | **0.39ms** | **5.0x faster** ⚡ |
+| pydantic-settings | 1.95ms | Baseline |
 
-**With caching** (repeated loads in long-running applications):
+**Warm (cached)** (repeated loads in long-running applications):
 
 | Library | Time per load | Speed |
 |---------|---------------|-------|
-| **msgspec-ext** | **0.016ms** | **112x faster** ⚡ |
-| pydantic-settings | 1.818ms | Baseline |
+| **msgspec-ext** | **0.012ms** | **267x faster** ⚡ |
+| pydantic-settings | 3.2ms | Baseline |
 
 > *Benchmark includes .env file parsing, environment variable loading, type validation, and nested configuration (app settings, database, redis, feature flags). Run `benchmark/benchmark_cold_warm.py` to reproduce.*
 
 ### Key Advantages
 
 | Feature | msgspec-ext | pydantic-settings |
 |---------|------------|-------------------|
-| **First load** | **1.5x faster** ⚡ | Baseline |
-| **Cached loads** | **112x faster** ⚡ | Baseline |
+| **Cold start** | **5.0x faster** ⚡ | Baseline |
+| **Warm (cached)** | **267x faster** ⚡ | Baseline |
 | **Package size** | **0.49 MB** | 1.95 MB |
-| **Dependencies** | **2 (minimal)** | 5+ |
-| .env support | ✅ | ✅ |
+| **Dependencies** | **1 (msgspec only)** | 5+ |
+| .env support | ✅ Built-in | ✅ Via python-dotenv |
 | Type validation | ✅ | ✅ |
 | Advanced caching | ✅ | ❌ |
 | Nested config | ✅ | ✅ |
@@ -179,14 +179,15 @@ msgspec-ext provides a **faster, lighter alternative** to pydantic-settings whil
 
 msgspec-ext achieves its performance through:
 - **Bulk validation**: Validates all fields at once in C (via msgspec), not one-by-one in Python
-- **Smart caching**: Caches .env files, field mappings, and type information - loads after the first are 112x faster
+- **Custom .env parser**: Built-in fast parser with zero external dependencies (no python-dotenv overhead)
+- **Smart caching**: Caches .env files, field mappings, and type information - loads after the first are 267x faster
 - **Optimized file operations**: Uses fast os.path operations instead of slower pathlib alternatives
 - **Zero overhead**: Fast paths for common types (str, bool, int, float) with minimal Python code
 
 This means your application **starts faster** and uses **less memory**, especially important for:
-- 🚀 **CLI tools** - 1.5x faster startup every time you run the command
+- 🚀 **CLI tools** - 5.0x faster startup every time you run the command
 - ⚡ **Serverless functions** - Lower cold start latency means better response times
-- 🔄 **Long-running apps** - After the first load, reloading settings is 112x faster (16 microseconds!)
+- 🔄 **Long-running apps** - After the first load, reloading settings is 267x faster (12 microseconds!)
 
 ## Contributing
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,7 +14,6 @@ authors = [
 requires-python = ">=3.10"
 dependencies = [
     "msgspec>=0.19.0",
-    "python-dotenv>=1.1.1",
 ]
 classifiers = [
   "Development Status :: 4 - Beta",
diff --git a/src/msgspec_ext/fast_dotenv.py b/src/msgspec_ext/fast_dotenv.py
@@ -0,0 +1,172 @@
+r"""Fast .env file parser - optimized for performance.
+
+Key features:
+1. UTF-8 BOM support (\ufeff)
+2. Escape sequences parsing (\n, \t, etc)
+3. Whitespace preservation inside quotes
+4. Strict variable name validation (isidentifier)
+5. Robust 'export' keyword support
+6. Correct duplicate handling
+7. Special symbols in unquoted values
+"""
+
+import os
+
+# Global cache
+_FILE_CACHE: dict[str, dict[str, str]] = {}
+
+# Optimization constants
+_BOM = "\ufeff"
+_EXPORT_LEN = 6  # len("export")
+
+
+def _decode_double_quoted(text: str) -> str:
+    """Decode the backslash escapes recognised inside double-quoted values.
+
+    The text is first split on the escaped-backslash pair so that every
+    escape is decoded exactly once: a doubled backslash followed by ``n``
+    yields a literal backslash and the letter ``n``, never a newline.
+    """
+    if "\\" not in text:
+        return text
+    return "\\".join(
+        piece.replace("\\n", "\n")
+        .replace("\\r", "\r")
+        .replace("\\t", "\t")
+        .replace('\\"', '"')
+        for piece in text.split("\\\\")
+    )
+
+
+def _find_closing_quote(text: str, quote: str) -> int:
+    """Return the index of the first unescaped ``quote`` after index 0, or -1."""
+    pos = 1
+    end = len(text)
+    while pos < end:
+        char = text[pos]
+        if char == "\\":
+            # Skip the escaped character so an escaped quote is not a terminator
+            pos += 2
+            continue
+        if char == quote:
+            return pos
+        pos += 1
+    return -1
+
+
+def _strip_inline_comment(value: str) -> str:
+    """Drop a trailing ``# comment`` from an unquoted value.
+
+    A ``#`` only starts a comment at the very beginning of the value or when
+    it follows whitespace, so values such as ``abc#123`` or URL fragments are
+    kept intact.
+    """
+    pos = value.find("#")
+    while pos > 0 and not value[pos - 1].isspace():
+        pos = value.find("#", pos + 1)
+    return value if pos < 0 else value[:pos]
+
+
+def _parse_value(value: str) -> str:
+    """Convert the raw text found after ``=`` into the final variable value."""
+    if not value:
+        return ""
+
+    # Quotes are detected after optional whitespace so `KEY = "v"` works
+    candidate = value.lstrip()
+    quote = candidate[:1]
+
+    if quote in ('"', "'"):
+        inner = None
+        if len(candidate) > 1 and candidate.endswith(quote):
+            inner = candidate[1:-1]
+        else:
+            # Accept `"value" # comment`: closing quote, whitespace, comment
+            closing = _find_closing_quote(candidate, quote)
+            if closing > 0:
+                rest = candidate[closing + 1 :]
+                if rest[:1].isspace() and rest.lstrip().startswith("#"):
+                    inner = candidate[1:closing]
+
+        if inner is None:
+            # Broken or unclosed quotes -> treat as an unquoted string
+            return value.strip()
+        if quote == '"':
+            return _decode_double_quoted(inner)
+        # Single quotes are literal except for the escaped single quote itself
+        return inner.replace("\\'", "'")
+
+    # Unquoted value: leading spaces are deliberately preserved, inline
+    # comments and trailing whitespace are removed
+    if "#" in value:
+        value = _strip_inline_comment(value)
+    return value.rstrip()
+
+
+def parse_env_file(file_path: str, encoding: str | None = "utf-8") -> dict[str, str]:
+    """Parse a .env file into a dict of variable names to string values.
+
+    Results are memoised in the module-level ``_FILE_CACHE`` keyed by path and
+    encoding; the cached dict object itself is returned, so callers should
+    treat it as read-only. A file that cannot be read or decoded produces an
+    empty dict, and that empty result is cached as well.
+
+    Supported syntax: blank lines and ``#`` comment lines, an optional leading
+    ``export`` keyword, single- and double-quoted values (escape sequences are
+    decoded in double quotes), inline ``# comments`` after quoted values or
+    after whitespace in unquoted values, and a leading UTF-8 BOM. When a key
+    appears more than once the last assignment wins.
+
+    Args:
+        file_path: Path of the .env file to read.
+        encoding: Text encoding passed to ``open`` (default: "utf-8").
+
+    Returns:
+        Mapping of variable names to their parsed values.
+    """
+    cache_key = f"{file_path}:{encoding}"
+    cached = _FILE_CACHE.get(cache_key)
+    if cached is not None:
+        return cached
+
+    try:
+        with open(file_path, encoding=encoding) as f:
+            content = f.read()
+    except (OSError, LookupError, TypeError, ValueError):
+        # Best effort: a missing/unreadable file, an unknown encoding, an
+        # invalid path argument or undecodable bytes all yield no variables
+        content = ""
+
+    env_vars: dict[str, str] = {}
+
+    for raw_line in content.removeprefix(_BOM).splitlines():
+        line = raw_line.strip()
+
+        if not line or line.startswith("#"):
+            continue
+
+        # 'export' must be followed by whitespace (not a var called 'exporter')
+        if (
+            line.startswith("export")
+            and len(line) > _EXPORT_LEN
+            and line[_EXPORT_LEN].isspace()
+        ):
+            line = line[_EXPORT_LEN:].lstrip()
+
+        key, sep, value = line.partition("=")
+        if not sep:
+            continue
+
+        key = key.strip()
+
+        # isidentifier() rejects empty names, names starting with a digit and
+        # names containing hyphens, spaces or other punctuation. Note that it
+        # follows Python identifier rules, so non-ASCII letters are accepted.
+        if not key.isidentifier():
+            continue
+
+        env_vars[key] = _parse_value(value)
+
+    _FILE_CACHE[cache_key] = env_vars
+    return env_vars
+
+
+def load_dotenv(
+    dotenv_path: str | None = ".env",
+    encoding: str | None = "utf-8",
+    *,
+    override: bool = False,
+) -> bool:
+    """Copy the variables parsed from a .env file into ``os.environ``.
+
+    Args:
+        dotenv_path: Path to .env file (default: ".env")
+        encoding: File encoding (default: "utf-8")
+        override: When True, values from the file replace variables that are
+            already set; when False (default), existing variables win
+
+    Returns:
+        True if the file yielded at least one variable and it was applied,
+        False if nothing was parsed or any error occurred
+    """
+    try:
+        parsed = parse_env_file(dotenv_path, encoding)
+
+        if parsed:
+            target = os.environ
+            if override:
+                # File values take precedence over the current environment
+                target.update(parsed)
+            else:
+                # Only fill in names that are not defined yet
+                for name in parsed:
+                    target.setdefault(name, parsed[name])
+            loaded = True
+        else:
+            # Empty, missing or invalid file
+            loaded = False
+    except Exception:
+        return False
+    return loaded
diff --git a/src/msgspec_ext/settings.py b/src/msgspec_ext/settings.py
@@ -4,7 +4,8 @@
 from typing import Any, ClassVar, Union, get_args, get_origin
 
 import msgspec
-from dotenv import load_dotenv
+
+from msgspec_ext.fast_dotenv import load_dotenv
 
 __all__ = ["BaseSettings", "SettingsConfigDict"]
 
@@ -305,7 +306,7 @@ def _get_env_name(cls, field_name: str) -> str:
         return env_name
 
     @classmethod
-    def _preprocess_env_value(cls, env_value: str, field_type: type) -> Any:  # noqa: C901, PLR0912
+    def _preprocess_env_value(cls, env_value: str, field_type: type) -> Any:  # noqa: C901
         """Convert environment variable string to JSON-compatible type.
 
         Ultra-optimized to minimize type introspection overhead with caching.
@@ -356,43 +357,3 @@ def _preprocess_env_value(cls, env_value: str, field_type: type) -> Any:  # noqa
                 return cls._preprocess_env_value(env_value, resolved_type)
 
         return env_value
-
-        # Fast path: Direct type comparison (avoid get_origin when possible)
-        if field_type is str:
-            return env_value
-        if field_type is bool:
-            return env_value.lower() in ("true", "1", "yes", "y", "t")
-        if field_type is int:
-            try:
-                return int(env_value)
-            except ValueError as e:
-                raise ValueError(f"Cannot convert '{env_value}' to int") from e
-        if field_type is float:
-            try:
-                return float(env_value)
-            except ValueError as e:
-                raise ValueError(f"Cannot convert '{env_value}' to float") from e
-
-        # Only use typing introspection for complex types (Union, Optional, etc.)
-        origin = get_origin(field_type)
-        if origin is Union:
-            args = get_args(field_type)
-            non_none = [a for a in args if a is not type(None)]
-            if non_none:
-                # Cache the resolved type for future use
-                resolved_type = non_none[0]
-                cls._type_cache[field_type] = resolved_type
-                # Recursively process with the non-None type
-                return cls._preprocess_env_value(env_value, resolved_type)
-
-        return env_value
-
-        # Type conversion (required for JSON encoding)
-        if field_type is bool:
-            return env_value.lower() in ("true", "1", "yes", "y", "t")
-        if field_type is int:
-            return int(env_value)
-        if field_type is float:
-            return float(env_value)
-
-        return env_value
diff --git a/tests/test_fast_dotenv.py b/tests/test_fast_dotenv.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,6 @@ authors = [`
`14`	`14`	`requires-python = ">=3.10"`
`15`	`15`	`dependencies = [`
`16`	`16`	`"msgspec>=0.19.0",`
`17`		`- "python-dotenv>=1.1.1",`
`18`	`17`	`]`
`19`	`18`	`classifiers = [`
`20`	`19`	`"Development Status :: 4 - Beta",`