@@ -429,205 +429,13 @@ def orc_decoder(
429429 return * full_shape , 0 , table
430430
431431
def _infer_jsonl_column_types(sample_records: list, columns: set) -> dict:
    """Map each column name to a coarse type tag based on the first non-null sampled value."""
    column_types = {}
    for key in columns:
        tag = 'null'
        for record in sample_records:
            value = record.get(key)
            if value is None:
                continue
            # bool must be tested before int: bool is a subclass of int in Python
            if isinstance(value, bool):
                tag = 'bool'
            elif isinstance(value, int):
                tag = 'int'
            elif isinstance(value, float):
                tag = 'float'
            elif isinstance(value, str):
                tag = 'str'
            elif isinstance(value, list):
                tag = 'list'
            elif isinstance(value, dict):
                tag = 'dict'
            break  # first non-null value decides the column type
        column_types[key] = tag
    return column_types


def _build_jsonl_patterns(columns: set, column_types: dict) -> dict:
    """Compile one value-capturing regex per column, shaped by the inferred type tag."""
    import re

    patterns = {}
    for key in columns:
        # Escape the key so regex metacharacters in field names match literally
        prefix = rb'"' + re.escape(key).encode() + rb'":\s*'
        col_type = column_types.get(key, 'null')

        if col_type == 'bool':
            pattern = prefix + rb'(true|false)'
        elif col_type in ('int', 'float'):
            # Numbers: optional sign, decimals, scientific notation - or null
            pattern = prefix + rb'(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?|null)'
        elif col_type == 'str':
            # Quoted string honouring escaped characters (group 1 = content), or null
            pattern = prefix + rb'(?:"((?:[^"\\]|\\.)*)"|null)'
        elif col_type == 'null':
            pattern = prefix + rb'null'
        elif col_type == 'list':
            # Simple and single-level nested arrays, or null; deeper nesting
            # is not matched and falls out as None
            pattern = prefix + rb'(\[(?:[^\[\]]|\[.*?\])*?\]|null)'
        elif col_type == 'dict':
            # Single-level objects only, or null; nested objects fall out as None
            pattern = prefix + rb'(\{[^{}]*\}|null)'
        else:
            continue

        patterns[key] = (re.compile(pattern), col_type)
    return patterns


def _jsonl_value_from_match(match, col_type: str):
    """Convert one regex match into a Python value; returns None when conversion fails."""
    import json

    if col_type == 'bool':
        return match.group(1) == b'true'

    if col_type in ('int', 'float'):
        token = match.group(1)
        if token == b'null':
            return None
        try:
            # Promote to float when the token carries a fraction or exponent so
            # mixed int/float columns don't silently lose values to int() failures
            if col_type == 'int' and not any(c in token for c in (b'.', b'e', b'E')):
                return int(token)
            return float(token)
        except ValueError:
            return None

    if col_type == 'str':
        token = match.group(1)
        if token is None:  # the `null` alternative matched
            return None
        try:
            # Delegate unescaping to the JSON parser so every escape sequence
            # (\n, \t, \", \\, \/, \b, \f, \r, \uXXXX) is decoded correctly
            return json.loads(b'"' + token + b'"')
        except (ValueError, UnicodeDecodeError):
            return token.decode('utf-8', errors='replace')

    if col_type in ('list', 'dict'):
        token = match.group(1)
        if token == b'null':
            return None
        try:
            value = json.loads(token.decode('utf-8'))
        except (json.JSONDecodeError, UnicodeDecodeError):
            return None
        if col_type == 'dict' and isinstance(value, dict):
            # Dicts are carried as JSON strings (mirrors record[key].mini)
            return json.dumps(value, ensure_ascii=False)
        return value

    # 'null' columns (and anything unrecognised) always yield None
    return None


def fast_jsonl_decoder(
    buffer: bytes,
    *,
    projection: Optional[list] = None,
    selection: Optional[list] = None,
    sample_size: int = 100,
) -> Tuple[int, int, pyarrow.Table]:
    """
    Fast JSONL decoder that fully parses a sample of lines to infer a schema,
    then extracts column values from every line using compiled regex patterns,
    avoiding a full JSON parse per record.

    This is optimized for datasets with a consistent schema across all records.

    Parameters:
        buffer: bytes
            Raw JSONL payload (newline-separated JSON documents).
        projection: list, optional
            Columns to extract; each entry exposes the column name via its
            `.value` attribute. When omitted, every key seen in the sample
            is extracted.
        selection: list, optional
            Accepted for interface compatibility; not applied by this decoder.
        sample_size: int
            Number of leading lines fully parsed to infer the schema.

    Returns:
        Tuple of (row_count, column_count, pyarrow.Table).
    """
    from opteryx.third_party.tktech import csimdjson as simdjson

    lines = [line for line in buffer.split(b'\n') if line.strip()]
    if not lines:
        return 0, 0, pyarrow.Table.from_pylist([])

    # Fully parse a leading sample to discover the key set; malformed sample
    # lines are skipped rather than failing the whole decode.
    parser = simdjson.Parser()
    sample_records = []
    keys_union = set()
    for raw in lines[:sample_size]:
        try:
            row = parser.parse(raw).as_dict()
        except Exception:
            continue
        sample_records.append(row)
        keys_union.update(row.keys())

    if not sample_records:
        return 0, 0, pyarrow.Table.from_pylist([])

    # Honour the projection when one is supplied; otherwise take all sampled keys.
    columns = {c.value for c in projection} if projection else keys_union

    column_types = _infer_jsonl_column_types(sample_records, columns)
    column_patterns = _build_jsonl_patterns(columns, column_types)

    # Extract values from every line with the compiled per-column patterns.
    column_data = {key: [] for key in columns}
    for line in lines:
        for key in columns:
            compiled = column_patterns.get(key)
            if compiled is None:
                column_data[key].append(None)
                continue
            pattern, col_type = compiled
            match = pattern.search(line)
            column_data[key].append(
                _jsonl_value_from_match(match, col_type) if match else None
            )

    if not columns:
        return 0, 0, pyarrow.Table.from_pylist([])

    names = sorted(columns)
    arrays = [pyarrow.array(column_data[name]) for name in names]
    table = pyarrow.Table.from_arrays(arrays, names=names)
    return len(lines), len(columns), table
622-
623432def jsonl_decoder (
624433 buffer : Union [memoryview , bytes , BinaryIO ],
625434 * ,
626435 projection : Optional [list ] = None ,
627436 selection : Optional [list ] = None ,
628437 just_schema : bool = False ,
629438 just_statistics : bool = False ,
630- use_fast_decoder : bool = True ,
631439 ** kwargs ,
632440) -> Tuple [int , int , pyarrow .Table ]:
633441 if just_statistics :
@@ -648,26 +456,6 @@ def jsonl_decoder(
648456 table = pyarrow .Table .from_arrays ([[num_rows ]], names = ["$COUNT(*)" ])
649457 return (num_rows , 0 , 0 , table )
650458
651- # Use fast decoder if enabled and no complex filtering is needed
652- # Fast decoder is most effective for large files with consistent schema
653- if use_fast_decoder and not just_schema and not selection :
654- try :
655- num_rows , num_cols , table = fast_jsonl_decoder (
656- buffer , projection = projection , selection = selection
657- )
658-
659- if projection :
660- table = post_read_projector (table , projection )
661-
662- return num_rows , num_cols , 0 , table
663- except Exception as e :
664- # Fall back to traditional decoder if fast decoder fails
665- # This ensures robustness even with unexpected data
666- import warnings
667- warnings .warn (f"Fast JSONL decoder failed, falling back to standard decoder: { e } " )
668-
669- parser = simdjson .Parser ()
670-
671459 parser = simdjson .Parser ()
672460
673461 # preallocate and reuse dicts
0 commit comments