Update dotenv.py

hhvrc · hhvrc · commit 276251963682 · 2025-12-08T13:40:09.000+01:00
diff --git a/scripts/utils/dotenv.py b/scripts/utils/dotenv.py
@@ -1,8 +1,7 @@
-import os
 from pathlib import Path
 from typing import Mapping
 
-LOGLEVEL_MAP = {
+LOGLEVEL_MAP: dict[str, tuple[int, str]] = {
     'none': (0, 'LOG_NONE'),
     'log_none': (0, 'LOG_NONE'),
     'error': (1, 'LOG_ERROR'),
@@ -20,33 +19,84 @@
 }
 
 
-class DotEnv:
-    def __read_dotenv(self, path: str | Path):
-        text_data = ''
+def read_text_with_fallback(
+    path: str | Path,
+    encodings: list[str] | tuple[str, ...] | None = None,
+) -> str:
+    """
+    Read a text file using multiple attempted encodings in order.
+
+    Supports BOM-stripping for UTF-8, UTF-16-LE, UTF-16-BE.
+    Raises a clean, descriptive error if all encodings fail.
+    """
+
+    if encodings is None:
+        # You can reorder these depending on what you expect most commonly.
+        encodings = [
+            'utf-8-sig',  # handles UTF-8 BOM automatically
+            'utf-16',  # auto-detects LE/BE with BOM
+            'utf-16-le',
+            'utf-16-be',
+            'latin-1',  # fallback that never fails (for decoding)
+        ]
+
+    path = Path(path)
+    raw = path.read_bytes()
+
+    last_error: UnicodeError | None = None
+
+    for encoding in encodings:
+        try:
+            # Special handling for UTF-16 because utf-16 may incorrectly detect encoding without BOM.
+            if encoding in ('utf-16', 'utf-16-le', 'utf-16-be'):
+                try:
+                    text = raw.decode(encoding)
+                except UnicodeError as e:
+                    last_error = e
+                    continue
+            else:
+                text = raw.decode(encoding)
+
+            return text
+
+        except UnicodeError as e:
+            last_error = e
+            continue
+
+    # If we reach here, all decoding attempts failed (only possible if latin-1 is not in encodings).
+    raise UnicodeDecodeError(
+        'multi-encoding-reader',
+        raw,
+        0,
+        len(raw),
+        f"failed to decode file '{path}' using encodings: {', '.join(encodings)}",
+    ) from last_error
 
-        with open(path, 'rb') as f:  # Open the file in binary mode first to detect BOM
-            raw_data = f.read()
 
-            # Check for BOM and strip it if present
-            if raw_data.startswith(b'\xef\xbb\xbf'):  # UTF-8 BOM
-                text_data = raw_data[3:].decode('utf-8')
-            elif raw_data.startswith(b'\xff\xfe'):  # UTF-16 LE BOM
-                text_data = raw_data[2:].decode('utf-16le')
-            elif raw_data.startswith(b'\xfe\xff'):  # UTF-16 BE BOM
-                text_data = raw_data[2:].decode('utf-16be')
+class DotEnv:
+    def __read_dotenv(self, path: str | Path):
+        text_data = read_text_with_fallback(path)
 
-        # Now process the text data
         for line in text_data.splitlines():
             line = line.strip()
-            if line == '' or line.startswith('#'):
+
+            # Skip empty lines and comments
+            if not line or line.startswith('#'):
                 continue
 
-            split = line.strip().split('=', 1)
-            if len(split) != 2:
-                print('Failed to parse: ' + line)
+            # Ignore lines that don't contain '=' instead of raising
+            if '=' not in line:
                 continue
 
-            self.dotenv_vars[line[0]] = line[1]
+            key, value = line.split('=', 1)
+            key = key.strip()
+            value = value.strip()
+
+            # Strip optional surrounding quotes
+            if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
+                value = value[1:-1]
+
+            self.dotenv_vars[key] = value
 
     def __init__(self, path: str | Path, environment: str):
         self.dotenv_vars: dict[str, str] = {}
@@ -61,42 +111,37 @@ def __init__(self, path: str | Path, environment: str):
         env_specific_name = '.env.' + environment
 
         # Read the .env files.
-        for path in paths:
-            env_file = path / '.env'
+        for base in paths:
+            env_file = base / '.env'
             if env_file.exists():
                 self.__read_dotenv(env_file)
 
-            env_file = path / env_specific_name
+            env_file = base / env_specific_name
             if env_file.exists():
                 self.__read_dotenv(env_file)
 
-            env_file = path / '.env.local'
+            env_file = base / '.env.local'
             if env_file.exists():
                 self.__read_dotenv(env_file)
 
-    def get_string(self, key: str):
+    def get_string(self, key: str) -> str | None:
         return self.dotenv_vars.get(key)
 
     def get_all_prefixed(self, prefix: str) -> Mapping[str, str]:
-        result: dict[str, str] = {}
-        for key, value in self.dotenv_vars.items():
-            if key.startswith(prefix):
-                result[key] = value
-        return result
+        return {k: v for k, v in self.dotenv_vars.items() if k.startswith(prefix)}
 
     def get_loglevel(self, key: str) -> int | None:
         value = self.get_string(key)
-        if value == None:
+        if value is None:
             return None
 
-        value = value.lower()
-
-        tup = LOGLEVEL_MAP.get(value)
-        if tup == None:
-            raise ValueError('Environment variable ' + key + ' (' + value + ') is not a valid log level.')
+        normalized = value.strip().lower()
+        tup = LOGLEVEL_MAP.get(normalized)
+        if tup is None:
+            raise ValueError(f'Environment variable {key} ({value}) is not a valid log level.')
 
         return tup[0]
 
 
-def read(workdir: str, environment_name: str) -> DotEnv:
+def read(workdir: str | Path, environment_name: str) -> DotEnv:
     return DotEnv(workdir, environment=environment_name)