
Commit 068b921

Copilot and joocer committed
Implement fast JSONL decoder with regex-based extraction
Co-authored-by: joocer <[email protected]>
1 parent df4418d commit 068b921
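
The core technique in miniature — a self-contained sketch, not the committed code (the keys, patterns, and sample lines are invented for illustration; the real implementation below infers keys and types from a parsed sample first):

import re

# Compile one pattern per known key, then scan the raw bytes of each line
# instead of fully parsing the JSON.
lines = [b'{"id": 1, "name": "alice"}', b'{"id": 2, "name": "bob"}']

id_pat = re.compile(rb'"id":\s*(-?\d+\.?\d*(?:[eE][+-]?\d+)?|null)')
name_pat = re.compile(rb'"name":\s*(?:"((?:[^"\\]|\\.)*)"|null)')

ids, names = [], []
for line in lines:
    m = id_pat.search(line)
    ids.append(int(m.group(1)) if m and m.group(1) != b'null' else None)
    m = name_pat.search(line)
    names.append(m.group(1).decode() if m and m.group(1) is not None else None)

print(ids)    # [1, 2]
print(names)  # ['alice', 'bob']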

2 files changed: +410 −0 lines changed

opteryx/utils/file_decoders.py

Lines changed: 204 additions & 0 deletions
@@ -429,13 +429,201 @@ def orc_decoder(
     return *full_shape, 0, table
 
 
+def fast_jsonl_decoder(
+    buffer: bytes,
+    *,
+    projection: Optional[list] = None,
+    selection: Optional[list] = None,
+    sample_size: int = 100,
+) -> Tuple[int, int, pyarrow.Table]:
+    """
+    Fast JSONL decoder that parses a sample to infer schema, then extracts values
+    using regex patterns without full JSON parsing for subsequent lines.
+
+    This is optimized for datasets with consistent schema across all records.
+    """
+    import re
+    from opteryx.third_party.tktech import csimdjson as simdjson
+
+    # Split buffer into non-empty lines
+    lines = buffer.split(b'\n')
+    lines = [line for line in lines if line.strip()]
+
+    if not lines:
+        return 0, 0, pyarrow.Table.from_pylist([])
+
+    # Parse sample lines to infer schema
+    parser = simdjson.Parser()
+    sample_records = []
+    sample_parsed_records = []
+    keys_union = set()
+
+    num_sample = min(sample_size, len(lines))
+
+    for i in range(num_sample):
+        try:
+            record = parser.parse(lines[i])
+            row = record.as_dict()
+            sample_records.append(row)
+            sample_parsed_records.append(record)
+            keys_union.update(row.keys())
+        except Exception:
+            continue
+
+    if not sample_records:
+        return 0, 0, pyarrow.Table.from_pylist([])
+
+    # If a projection is specified, only extract the projected columns
+    if projection:
+        columns_to_extract = {c.value for c in projection}
+    else:
+        columns_to_extract = keys_union
+
+    # Infer types from the sample
+    column_types = {}
+    for key in columns_to_extract:
+        for record in sample_records:
+            if key in record and record[key] is not None:
+                val = record[key]
+                if isinstance(val, bool):
+                    column_types[key] = 'bool'
+                elif isinstance(val, int):
+                    column_types[key] = 'int'
+                elif isinstance(val, float):
+                    column_types[key] = 'float'
+                elif isinstance(val, str):
+                    column_types[key] = 'str'
+                elif isinstance(val, list):
+                    column_types[key] = 'list'
+                elif isinstance(val, dict):
+                    column_types[key] = 'dict'
+                break
+        if key not in column_types:
+            column_types[key] = 'null'
+
+    # Build regex patterns for each column
+    # Pattern to match: "key": value
+    column_patterns = {}
+    for key in columns_to_extract:
+        # Escape special regex characters in the key name
+        escaped_key = re.escape(key)
+
+        # Create a pattern based on the expected type
+        col_type = column_types.get(key, 'null')
+
+        if col_type == 'bool':
+            # Match true/false
+            pattern = rb'"' + escaped_key.encode() + rb'":\s*(true|false)'
+        elif col_type in ('int', 'float'):
+            # Match numbers (including negatives, decimals, scientific notation) or null
+            pattern = rb'"' + escaped_key.encode() + rb'":\s*(-?\d+\.?\d*(?:[eE][+-]?\d+)?|null)'
+        elif col_type == 'str':
+            # Match quoted strings (handling escaped quotes) or null
+            pattern = rb'"' + escaped_key.encode() + rb'":\s*(?:"((?:[^"\\]|\\.)*)"|null)'
+        elif col_type == 'null':
+            # Match null
+            pattern = rb'"' + escaped_key.encode() + rb'":\s*null'
+        elif col_type == 'list':
+            # Match arrays (including empty arrays) or null
+            pattern = rb'"' + escaped_key.encode() + rb'":\s*(\[(?:[^\[\]]|\[.*?\])*?\]|null)'
+        elif col_type == 'dict':
+            # Match objects or null
+            # This is a simplified pattern that only works for non-nested dicts
+            pattern = rb'"' + escaped_key.encode() + rb'":\s*(\{[^{}]*\}|null)'
+        else:
+            pattern = None
+
+        if pattern:
+            column_patterns[key] = (re.compile(pattern), col_type)
+
+    # Extract values from all lines using the compiled regexes
+    column_data = {key: [] for key in columns_to_extract}
+
+    for line in lines:
+        if not line.strip():
+            continue
+
+        for key in columns_to_extract:
+            if key not in column_patterns:
+                column_data[key].append(None)
+                continue
+
+            pattern, col_type = column_patterns[key]
+            match = pattern.search(line)
+
+            if match:
+                if col_type == 'bool':
+                    value = match.group(1) == b'true'
+                elif col_type in ('int', 'float'):
+                    try:
+                        matched_val = match.group(1)
+                        if matched_val == b'null':
+                            value = None
+                        elif col_type == 'int':
+                            value = int(matched_val)
+                        else:
+                            value = float(matched_val)
+                    except (ValueError, IndexError):
+                        value = None
+                elif col_type == 'str':
+                    try:
+                        # Group 1 captures the string content (without quotes)
+                        matched_val = match.group(1)
+                        if matched_val is None:  # null was matched
+                            value = None
+                        else:
+                            # Decode and handle escaped characters
+                            raw_str = matched_val.decode('utf-8', errors='replace')
+                            # Simple unescape for common cases
+                            value = raw_str.replace('\\n', '\n').replace('\\t', '\t').replace('\\"', '"').replace('\\\\', '\\')
+                    except (UnicodeDecodeError, IndexError):
+                        value = None
+                elif col_type == 'null':
+                    value = None
+                elif col_type in ('list', 'dict'):
+                    # For complex types, fall back to JSON parsing
+                    try:
+                        matched_val = match.group(1)
+                        if matched_val == b'null':
+                            value = None
+                        else:
+                            import json
+                            value = json.loads(matched_val.decode('utf-8'))
+                            if col_type == 'dict' and isinstance(value, dict):
+                                # Convert the dict to a JSON string (similar to record[key].mini)
+                                value = json.dumps(value, ensure_ascii=False)
+                    except (json.JSONDecodeError, UnicodeDecodeError, IndexError):
+                        value = None
+                else:
+                    value = None
+            else:
+                value = None
+
+            column_data[key].append(value)
+
+    # Convert to a PyArrow table
+    arrays = []
+    names = []
+
+    for key in sorted(columns_to_extract):
+        arrays.append(pyarrow.array(column_data[key]))
+        names.append(key)
+
+    if not arrays:
+        return 0, 0, pyarrow.Table.from_pylist([])
+
+    table = pyarrow.Table.from_arrays(arrays, names=names)
+    return len(lines), len(columns_to_extract), table
+
+
 def jsonl_decoder(
     buffer: Union[memoryview, bytes, BinaryIO],
     *,
     projection: Optional[list] = None,
     selection: Optional[list] = None,
     just_schema: bool = False,
     just_statistics: bool = False,
+    use_fast_decoder: bool = True,
     **kwargs,
 ) -> Tuple[int, int, pyarrow.Table]:
     if just_statistics:
@@ -456,6 +644,22 @@ def jsonl_decoder(
         table = pyarrow.Table.from_arrays([[num_rows]], names=["$COUNT(*)"])
         return (num_rows, 0, 0, table)
 
+    # Use the fast decoder if enabled and no complex filtering is needed
+    # The fast decoder is most effective for large files with a consistent schema
+    if use_fast_decoder and not just_schema and not selection:
+        try:
+            num_rows, num_cols, table = fast_jsonl_decoder(
+                buffer, projection=projection, selection=selection
+            )
+
+            if projection:
+                table = post_read_projector(table, projection)
+
+            return num_rows, num_cols, 0, table
+        except Exception:
+            # Fall back to the traditional decoder if the fast decoder fails
+            pass
+
     parser = simdjson.Parser()
 
     # preallocate and reuse dicts
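Usage, as a hedged sketch: the call shape follows fast_jsonl_decoder as added above, but the sample data is invented. Note that the returned columns come back in sorted name order, and that jsonl_decoder only takes this path when use_fast_decoder is set, just_schema is false, and no selection is pushed down; any exception falls back to the simdjson path.

from opteryx.utils.file_decoders import fast_jsonl_decoder

buffer = b'{"id": 1, "name": "alice", "active": true}\n{"id": 2, "name": "bob", "active": false}\n'

# Schema is inferred from the sampled lines: id -> int, name -> str, active -> bool
num_rows, num_cols, table = fast_jsonl_decoder(buffer)
print(num_rows, num_cols)              # 2 3
print(table.column_names)              # ['active', 'id', 'name'] (sorted)
print(table.column('id').to_pylist())  # [1, 2]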

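One caveat worth keeping in mind with this approach: extraction is substring-based, so a matching "key": sequence inside a nested object is found first and mis-extracted, and because no exception is raised, the simdjson fallback never triggers. A small standalone illustration of the hazard (invented data):

import re

id_pat = re.compile(rb'"id":\s*(-?\d+\.?\d*(?:[eE][+-]?\d+)?|null)')

# The nested object's "id" appears first in the byte stream, so the pattern
# captures 99 rather than this record's top-level id of 7.
line = b'{"meta": {"id": 99}, "id": 7}'
print(id_pat.search(line).group(1))  # b'99'

This is consistent with the docstring's stated assumption of a consistent, flat schema across records; for data like the line above, the traditional decoder remains the safe choice.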