fix: Support unicode dotenv files (#393)

hhvrc · Copilot · web-flow · commit 9bebff8d9ce3 · 2025-12-09T12:49:12.000+01:00
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/scripts/utils/dotenv.py b/scripts/utils/dotenv.py
@@ -1,8 +1,7 @@
-import os
 from pathlib import Path
 from typing import Mapping
 
-LOGLEVEL_MAP = {
+LOGLEVEL_MAP: dict[str, tuple[int, str]] = {
     'none': (0, 'LOG_NONE'),
     'log_none': (0, 'LOG_NONE'),
     'error': (1, 'LOG_ERROR'),
@@ -20,17 +19,78 @@
 }
 
 
+def read_text_with_fallback(
+    path: str | Path,
+    encodings: list[str] | tuple[str, ...] | None = None,
+) -> str:
+    """
+    Read a text file using multiple attempted encodings in order.
+
+    Handles BOM automatically via utf-8-sig and utf-16 encodings.
+    Raises a clean, descriptive error if all encodings fail.
+    """
+
+    if encodings is None:
+        # You can reorder these depending on what you expect most commonly.
+        encodings = [
+            'utf-8-sig',  # handles UTF-8 BOM automatically
+            'utf-16',  # auto-detects LE/BE with BOM
+            'utf-16-le',
+            'utf-16-be',
+            'latin-1',  # fallback that never fails (for decoding)
+        ]
+
+    path = Path(path)
+    raw = path.read_bytes()
+
+    last_error: UnicodeError | None = None
+
+    for encoding in encodings:
+        try:
+            text = raw.decode(encoding)
+            return text
+        except UnicodeError as e:
+            last_error = e
+            continue
+
+    # If we reach here, all decoding attempts failed (only possible if latin-1 is not in encodings).
+    raise UnicodeDecodeError(
+        'multi-encoding-reader',
+        raw,
+        0,
+        len(raw),
+        f"failed to decode file '{path}' using encodings: {', '.join(encodings)}",
+    ) from last_error
+
+
 class DotEnv:
     def __read_dotenv(self, path: str | Path):
-        with open(path, 'r') as f:
-            for line in f:
-                line = line.strip()
-                if line == '' or line.startswith('#'):
-                    continue
+        text_data = read_text_with_fallback(path)
+
+        for line in text_data.splitlines():
+            line = line.strip()
+
+            # Skip empty lines and comments
+            if not line or line.startswith('#'):
+                continue
 
-                key, value = line.strip().split('=', 1)
+            # Ignore lines that don't contain '=' instead of raising
+            if '=' not in line:
+                continue
 
-                self.dotenv_vars[key] = value
+            key, value = line.split('=', 1)
+            key = key.strip()
+            value = value.strip()
+
+            # Skip lines with empty keys
+            if not key:
+                continue
+            # Strip optional surrounding quotes (must match)
+            if len(value) >= 2:
+                if (value[0] == '"' and value[-1] == '"') or (value[0] == "'" and value[-1] == "'"):
+                    value = value[1:-1]
+
+            self.dotenv_vars[key] = value
 
     def __init__(self, path: str | Path, environment: str):
         self.dotenv_vars: dict[str, str] = {}
@@ -45,42 +105,37 @@ def __init__(self, path: str | Path, environment: str):
         env_specific_name = '.env.' + environment
 
         # Read the .env files.
-        for path in paths:
-            env_file = path / '.env'
+        for base in paths:
+            env_file = base / '.env'
             if env_file.exists():
                 self.__read_dotenv(env_file)
 
-            env_file = path / env_specific_name
+            env_file = base / env_specific_name
             if env_file.exists():
                 self.__read_dotenv(env_file)
 
-            env_file = path / '.env.local'
+            env_file = base / '.env.local'
             if env_file.exists():
                 self.__read_dotenv(env_file)
 
-    def get_string(self, key: str):
+    def get_string(self, key: str) -> str | None:
         return self.dotenv_vars.get(key)
 
     def get_all_prefixed(self, prefix: str) -> Mapping[str, str]:
-        result: dict[str, str] = {}
-        for key, value in self.dotenv_vars.items():
-            if key.startswith(prefix):
-                result[key] = value
-        return result
+        return {k: v for k, v in self.dotenv_vars.items() if k.startswith(prefix)}
 
     def get_loglevel(self, key: str) -> int | None:
         value = self.get_string(key)
-        if value == None:
+        if value is None:
             return None
 
-        value = value.lower()
-
-        tup = LOGLEVEL_MAP.get(value)
-        if tup == None:
-            raise ValueError('Environment variable ' + key + ' (' + value + ') is not a valid log level.')
+        normalized = value.strip().lower()
+        tup = LOGLEVEL_MAP.get(normalized)
+        if tup is None:
+            raise ValueError(f'Environment variable {key} ({value}) is not a valid log level.')
 
         return tup[0]
 
 
-def read(workdir: str, environment_name: str) -> DotEnv:
+def read(workdir: str | Path, environment_name: str) -> DotEnv:
     return DotEnv(workdir, environment=environment_name)