
Commit 275cbb1

jsonl decoder

1 parent: e20629c

File tree

10 files changed: +254 −194 lines

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY

-__build__ = 1654
+__build__ = 1656
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1654"
+__version__ = "0.26.0-beta.1656"

 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py

opteryx/compiled/structures/jsonl_decoder.pyx

Lines changed: 185 additions & 124 deletions
Large diffs are not rendered by default.
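
The Cython module itself is collapsed above, but its calling contract is visible in the opteryx/utils/file_decoders.py hunk further down: fast_jsonl_decode_columnar(buffer, columns_to_extract, column_types, sample_size) returns (num_rows, num_cols, column_data), where column_data maps each column name to a list that pyarrow.array can consume. Below is a rough pure-Python stand-in for that contract, for orientation only; it is not the Cython implementation, and fast_jsonl_decode_columnar_sketch is a made-up name.

import json

def fast_jsonl_decode_columnar_sketch(buffer: bytes, columns, column_types, sample_size):
    # Decode each JSONL line and gather values column by column.
    # The real Cython decoder works directly on the byte buffer and uses
    # column_types (inferred from the first sample_size lines) to build typed
    # output; this sketch just collects plain Python objects and ignores sample_size.
    column_data = {name: [] for name in columns}
    num_rows = 0
    for line in buffer.split(b"\n"):
        if not line.strip():
            continue
        record = json.loads(line)
        for name in columns:
            column_data[name].append(record.get(name))  # missing keys become None
        num_rows += 1
    return num_rows, len(columns), column_data

# Example:
#   buffer = b'{"id": 1, "name": "a"}\n{"id": 2}\n'
#   fast_jsonl_decode_columnar_sketch(buffer, ["id", "name"], {"id": "int", "name": "str"}, 100)
#   -> (2, 2, {"id": [1, 2], "name": ["a", None]})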

opteryx/connectors/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -270,7 +270,7 @@ def connector_factory(dataset, statistics, **config):
             break
     else:
         # Check if dataset is a file or contains wildcards
-        has_wildcards = any(char in dataset for char in ['*', '?', '['])
+        has_wildcards = any(char in dataset for char in ["*", "?", "["])
         if os.path.isfile(dataset) or has_wildcards:
             from opteryx.connectors import file_connector

@@ -286,7 +286,7 @@ def connector_factory(dataset, statistics, **config):
     remove_prefix = connector_entry.pop("remove_prefix", False)
     if prefix and remove_prefix and dataset.startswith(prefix):
         # Remove the prefix. If there's a separator (. or //) after the prefix, skip it too
-        dataset = dataset[len(prefix):]
+        dataset = dataset[len(prefix) :]
         if dataset.startswith(".") or dataset.startswith("//"):
             dataset = dataset[1:] if dataset.startswith(".") else dataset[2:]
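
The prefix-removal change above is only a slice-spacing tweak, but for orientation this is what the branch does, using a hypothetical registered prefix "sqlite" with remove_prefix set:

prefix = "sqlite"            # hypothetical connector prefix with remove_prefix=True
dataset = "sqlite.planets"

dataset = dataset[len(prefix) :]       # ".planets"
if dataset.startswith(".") or dataset.startswith("//"):
    dataset = dataset[1:] if dataset.startswith(".") else dataset[2:]
# dataset is now "planets"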

opteryx/connectors/aws_s3_connector.py

Lines changed: 8 additions & 8 deletions
@@ -86,13 +86,13 @@ def __init__(self, credentials=None, **kwargs):
         )

         self.minio = Minio(end_point, access_key, secret_key, secure=secure)
-
+
         # Only convert dots to path separators if the dataset doesn't already contain slashes
         # Dataset references like "my.dataset.table" use dots as separators
         # File paths like "bucket/path/file.parquet" already have slashes and should not be converted
         if OS_SEP not in self.dataset and "/" not in self.dataset:
             self.dataset = self.dataset.replace(".", OS_SEP)
-
+
         # Check if dataset contains wildcards
         self.has_wildcards = paths.has_wildcards(self.dataset)
         if self.has_wildcards:

@@ -111,28 +111,28 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
         else:
             list_prefix = prefix
             filter_pattern = None
-
+
         bucket, object_path, _, _ = paths.get_parts(list_prefix)
         blobs = self.minio.list_objects(bucket_name=bucket, prefix=object_path, recursive=True)
-
+
         blob_list = []
         for blob in blobs:
             if blob.object_name.endswith("/"):
                 continue
-
+
             full_path = bucket + "/" + blob.object_name
-
+
             # Check if blob has valid extension
             if ("." + full_path.split(".")[-1].lower()) not in VALID_EXTENSIONS:
                 continue
-
+
             # If we have a wildcard pattern, filter by it
             if filter_pattern:
                 if paths.match_wildcard(filter_pattern, full_path):
                     blob_list.append(full_path)
             else:
                 blob_list.append(full_path)
-
+
         return sorted(blob_list)

     def read_dataset(
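
Both the S3 and GCS connectors handle wildcard datasets the same way: paths.split_wildcard_path yields a non-wildcard prefix for the listing call and the full pattern for filtering, and each listed blob is kept only if paths.match_wildcard accepts it. A short sketch of that flow, assuming a POSIX-style separator and an in-memory list standing in for minio.list_objects:

from opteryx.utils import paths

# Hypothetical listing result standing in for the object-store API call.
all_blobs = [
    "bucket/path/file1.parquet",
    "bucket/path/file2.jsonl",
    "bucket/path/nested/file3.parquet",
]

prefix, pattern = paths.split_wildcard_path("bucket/path/*.parquet")
# prefix == "bucket/path/"              (used as the listing prefix)
# pattern == "bucket/path/*.parquet"    (used to filter the listing)

matches = sorted(
    blob
    for blob in all_blobs
    if blob.startswith(prefix) and paths.match_wildcard(pattern, blob)
)
# ["bucket/path/file1.parquet"] -- the .jsonl file fails the pattern and the
# nested file fails because '*' does not cross directory boundaries.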

opteryx/connectors/file_connector.py

Lines changed: 13 additions & 13 deletions
@@ -136,10 +136,10 @@ def __init__(self, *args, **kwargs):
         if ".." in self.dataset or self.dataset[0] in ("\\", "/", "~"):
             # Don't find any datasets which look like path traversal
             raise DatasetNotFoundError(dataset=self.dataset)
-
+
         # Check if dataset contains wildcards
-        self.has_wildcards = any(char in self.dataset for char in ['*', '?', '['])
-
+        self.has_wildcards = any(char in self.dataset for char in ["*", "?", "["])
+
         if self.has_wildcards:
             # Expand wildcards to get list of files
             self.files = self._expand_wildcards(self.dataset)

@@ -150,43 +150,43 @@ def __init__(self, *args, **kwargs):
         else:
             self.files = [self.dataset]
         self.decoder = get_decoder(self.dataset)
-
+
     def _expand_wildcards(self, pattern: str) -> List[str]:
         """
         Expand wildcard patterns in file paths while preventing path traversal.
-
+
         Supports wildcards:
         - * matches any number of characters
-        - ? matches a single character
+        - ? matches a single character
         - [range] matches a range of characters (e.g., [0-9], [a-z])
-
+
         Args:
             pattern: File path pattern with wildcards
-
+
         Returns:
             List of matching file paths
         """
         # Additional path traversal check after expansion
         if ".." in pattern:
             raise DatasetNotFoundError(dataset=pattern)
-
+
         # Use glob to expand the pattern
         matched_files = glob.glob(pattern, recursive=False)
-
+
         # Filter out any results that might have path traversal
         # This is an extra safety check
         safe_files = []
         for file_path in matched_files:
             if ".." not in file_path and os.path.isfile(file_path):
                 safe_files.append(file_path)
-
+
         return sorted(safe_files)

     def read_dataset(
         self, columns: list = None, predicates: list = None, limit: int = None, **kwargs
     ) -> pyarrow.Table:
         rows_read = 0
-
+
         # Iterate over all matched files
         for file_path in self.files:
             morsel = read_blob(

@@ -221,7 +221,7 @@ def get_dataset_schema(self) -> RelationSchema:

         # Use the first file to get the schema
         first_file = self.files[0]
-
+
         try:
             file_descriptor = os.open(first_file, os.O_RDONLY | os.O_BINARY)
             size = os.path.getsize(first_file)

opteryx/connectors/gcp_cloudstorage_connector.py

Lines changed: 3 additions & 3 deletions
@@ -97,7 +97,7 @@ def __init__(self, credentials=None, **kwargs):
         if OS_SEP not in self.dataset and "/" not in self.dataset:
             self.dataset = self.dataset.replace(".", OS_SEP)
         self.credentials = credentials
-
+
         # Check if dataset contains wildcards
         self.has_wildcards = paths.has_wildcards(self.dataset)
         if self.has_wildcards:

@@ -231,9 +231,9 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
             name = blob["name"]
             if not name.endswith(TUPLE_OF_VALID_EXTENSIONS):
                 continue
-
+
             full_path = f"{bucket}/{name}"
-
+
             # If we have a wildcard pattern, filter by it
             if filter_pattern:
                 if paths.match_wildcard(filter_pattern, full_path):

opteryx/utils/file_decoders.py

Lines changed: 17 additions & 16 deletions
@@ -461,13 +461,13 @@ def jsonl_decoder(
     if use_fast_decoder and not just_schema and not selection and len(buffer) > 10000:
         try:
             from opteryx.compiled.structures import jsonl_decoder as cython_decoder
-
+
             # Sample first 100 lines to infer schema
             parser = simdjson.Parser()
             sample_size = min(100, buffer.count(b"\n"))
             sample_records = []
             keys_union = set()
-
+
             start = 0
             for _ in range(sample_size):
                 newline = buffer.find(b"\n", start)

@@ -483,57 +483,58 @@
                     keys_union.update(row.keys())
                 except Exception:
                     continue
-
+
             if sample_records:
                 # Infer column types from sample
                 column_types = {}
                 columns_to_extract = list(keys_union)
-
+
                 if projection:
                     # If projection specified, only extract those columns
                     columns_to_extract = [c.value for c in projection if c.value in keys_union]
-
+
                 for key in columns_to_extract:
                     for record in sample_records:
                         if key in record and record[key] is not None:
                             val = record[key]
                             if isinstance(val, bool):
-                                column_types[key] = 'bool'
+                                column_types[key] = "bool"
                             elif isinstance(val, int):
-                                column_types[key] = 'int'
+                                column_types[key] = "int"
                             elif isinstance(val, float):
-                                column_types[key] = 'float'
+                                column_types[key] = "float"
                             elif isinstance(val, str):
-                                column_types[key] = 'str'
+                                column_types[key] = "str"
                             elif isinstance(val, list):
-                                column_types[key] = 'list'
+                                column_types[key] = "list"
                             elif isinstance(val, dict):
-                                column_types[key] = 'dict'
+                                column_types[key] = "dict"
                             break
                     if key not in column_types:
-                        column_types[key] = 'str'  # Default to string
-
+                        column_types[key] = "str"  # Default to string
+
                 # Use Cython decoder
                 num_rows, num_cols, column_data = cython_decoder.fast_jsonl_decode_columnar(
                     buffer, columns_to_extract, column_types, sample_size
                 )
-
+
                 # Convert to PyArrow table
                 arrays = []
                 names = []
                 for key in sorted(columns_to_extract):
                     arrays.append(pyarrow.array(column_data[key]))
                     names.append(key)
-
+
                 if arrays:
                     table = pyarrow.Table.from_arrays(arrays, names=names)
                     if projection:
                         table = post_read_projector(table, projection)
                     return num_rows, num_cols, 0, table
-
+
         except (ImportError, Exception) as e:
             # Fall back to standard decoder if Cython version fails
             import warnings
+
             warnings.warn(f"Fast JSONL decoder failed, using standard decoder: {e}")

     parser = simdjson.Parser()
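
The inference loop above takes the first non-null value seen for each key in the sampled rows and maps its Python type to a type name, falling back to "str" when every sampled value is null or the key is missing. A tiny standalone restatement of that rule (infer_column_types is an illustrative helper, not part of the diff):

def infer_column_types(sample_records, columns):
    # First non-null value wins; anything unrecognised or always-null defaults to "str".
    column_types = {}
    for key in columns:
        for record in sample_records:
            val = record.get(key)
            if val is None:
                continue
            if isinstance(val, bool):
                column_types[key] = "bool"
            elif isinstance(val, int):
                column_types[key] = "int"
            elif isinstance(val, float):
                column_types[key] = "float"
            elif isinstance(val, str):
                column_types[key] = "str"
            elif isinstance(val, list):
                column_types[key] = "list"
            elif isinstance(val, dict):
                column_types[key] = "dict"
            break
        if key not in column_types:
            column_types[key] = "str"
    return column_types

# infer_column_types([{"id": 1, "name": "a"}, {"id": 2, "score": 1.5}], ["id", "name", "score"])
# -> {"id": "int", "name": "str", "score": "float"}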

opteryx/utils/paths.py

Lines changed: 23 additions & 23 deletions
@@ -45,83 +45,83 @@ def get_parts(path_string: str):
 def has_wildcards(path: str) -> bool:
     """
     Check if a path contains wildcard characters.
-
+
     Args:
         path: Path string to check
-
+
     Returns:
         True if path contains wildcards (*, ?, [])
     """
-    return any(char in path for char in ['*', '?', '['])
+    return any(char in path for char in ["*", "?", "["])


 def split_wildcard_path(path: str):
     """
     Split a path with wildcards into a non-wildcard prefix and wildcard pattern.
-
+
     For cloud storage, we need to list blobs with a prefix, then filter by pattern.
     This function finds the longest non-wildcard prefix for listing.
-
+
     Args:
         path: Path with potential wildcards (e.g., "bucket/path/subdir/*.parquet")
-
+
     Returns:
         tuple: (prefix, pattern) where:
             - prefix: Non-wildcard prefix for listing (e.g., "bucket/path/subdir/")
             - pattern: Full path with wildcards for matching (e.g., "bucket/path/subdir/*.parquet")
-
+
     Examples:
         >>> split_wildcard_path("bucket/path/*.parquet")
         ('bucket/path/', 'bucket/path/*.parquet')
-
+
         >>> split_wildcard_path("bucket/path/file[0-9].parquet")
         ('bucket/path/', 'bucket/path/file[0-9].parquet')
-
+
        >>> split_wildcard_path("bucket/*/data.parquet")
        ('bucket/', 'bucket/*/data.parquet')
     """
     if not has_wildcards(path):
         return path, path
-
+
     # Find the first wildcard character
     wildcard_pos = len(path)
-    for char in ['*', '?', '[']:
+    for char in ["*", "?", "["]:
         pos = path.find(char)
         if pos != -1 and pos < wildcard_pos:
             wildcard_pos = pos
-
+
     # Find the last path separator before the wildcard
     prefix = path[:wildcard_pos]
     last_sep = prefix.rfind(OS_SEP)
-
+
     if last_sep != -1:
         # Include the separator in the prefix
-        prefix = path[:last_sep + 1]
+        prefix = path[: last_sep + 1]
     else:
         # No separator before wildcard, prefix is empty or bucket name
         prefix = ""
-
+
     return prefix, path


 def match_wildcard(pattern: str, path: str) -> bool:
     """
     Match a path against a wildcard pattern using glob-like semantics.
-
+
     Unlike fnmatch, this function treats path separators specially:
     - '*' matches any characters EXCEPT path separators
     - '?' matches any single character EXCEPT path separators
     - Use '**' to match across directory boundaries (not yet supported)
-
+
     This ensures consistent behavior with glob.glob() used for local files.
-
+
     Args:
         pattern: Pattern with wildcards (e.g., "bucket/path/*.parquet")
         path: Path to match (e.g., "bucket/path/file1.parquet")
-
+
     Returns:
         True if path matches pattern
-
+
     Examples:
         >>> match_wildcard("bucket/path/*.parquet", "bucket/path/file.parquet")
         True

@@ -131,14 +131,14 @@ def match_wildcard(pattern: str, path: str) -> bool:
     # Split pattern and path into parts using OS path separator for cross-platform compatibility
     pattern_parts = pattern.split(OS_SEP)
     path_parts = path.split(OS_SEP)
-
+
     # Must have same number of path parts for a match (wildcards don't cross directory boundaries)
     if len(pattern_parts) != len(path_parts):
         return False
-
+
     # Match each part using fnmatch
     for pattern_part, path_part in zip(pattern_parts, path_parts):
         if not fnmatch.fnmatch(path_part, pattern_part):
             return False
-
+
     return True
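
Because match_wildcard compares the pattern and path segment by segment with fnmatch, a '*' never spans a path separator, which keeps cloud listings consistent with glob.glob on local files. A short usage sketch (assuming a POSIX-style OS_SEP of '/'):

from opteryx.utils.paths import has_wildcards, match_wildcard, split_wildcard_path

has_wildcards("bucket/path/file.parquet")                                    # False
has_wildcards("bucket/path/*.parquet")                                       # True

split_wildcard_path("bucket/path/*.parquet")                                 # ('bucket/path/', 'bucket/path/*.parquet')

match_wildcard("bucket/path/*.parquet", "bucket/path/file.parquet")          # True
match_wildcard("bucket/path/*.parquet", "bucket/path/nested/file.parquet")   # False: '*' stays within one segment
match_wildcard("bucket/*/data.parquet", "bucket/2024/data.parquet")          # True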

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "opteryx"
-version = "0.26.0-beta.1654"
+version = "0.26.0-beta.1656"
 description = "Query your data, where it lives"
 requires-python = '>=3.11'
 readme = {file = "README.md", content-type = "text/markdown"}
