bitsandbytes-foundation
diff --git a/‎ast_test.py‎
Lines changed: 254 additions & 0 deletions b/‎ast_test.py‎
Lines changed: 254 additions & 0 deletions
diff --git a/‎bitsandbytes/backends/cuda/ops.py‎
Lines changed: 34 additions & 10 deletions b/‎bitsandbytes/backends/cuda/ops.py‎
Lines changed: 34 additions & 10 deletions
@@ -0,0 +1,254 @@
+import ast
+import networkx as nx
+import random
+from pathlib import Path
+import sys
+import argparse
+from typing import Dict, Set, List, Tuple, Optional
+
+
+class FunctionCallVisitor(ast.NodeVisitor):
+    def __init__(self):
+        self.graph = nx.DiGraph()
+        self.current_function = None
+        self.objects_used = {}  # Track objects used in each function
+        self.function_locations = {}  # Track line numbers
+
+    def visit_FunctionDef(self, node):
+        function_name = node.name
+        self.graph.add_node(function_name)
+        self.function_locations[function_name] = node.lineno
+
+        # Store previous function to handle nesting
+        previous_function = self.current_function
+        self.current_function = function_name
+
+        # Visit all children in the function body
+        self.objects_used[function_name] = set()
+        self.generic_visit(node)
+
+        # Restore previous function context
+        self.current_function = previous_function
+
+    def visit_Call(self, node):
+        if self.current_function:
+            called_function = None
+            line_num = getattr(node, 'lineno', None)
+
+            # Handle different types of function calls
+            if isinstance(node.func, ast.Name):
+                called_function = node.func.id
+            elif isinstance(node.func, ast.Attribute):
+                # For method calls like obj.method()
+                obj_name = self._get_attribute_source(node.func)
+                method_name = node.func.attr
+                called_function = f"{obj_name}.{method_name}"
+
+                # Track line number for method calls
+                if line_num and called_function not in self.function_locations:
+                    self.function_locations[called_function] = line_num
+
+            if called_function:
+                self.graph.add_node(called_function)
+                self.graph.add_edge(self.current_function, called_function)
+
+        # Visit arguments to find more function calls
+        self.generic_visit(node)
+
+    def _get_attribute_source(self, node):
+        """Helper to get the source of an attribute (e.g., 'obj' from 'obj.method')"""
+        if isinstance(node.value, ast.Name):
+            return node.value.id
+        elif isinstance(node.value, ast.Attribute):
+            # Handle nested attributes like a.b.method()
+            return self._get_attribute_source(node.value) + "." + node.value.attr
+        elif isinstance(node.value, ast.Call):
+            # Handle method calls on function returns like func().method()
+            return "(result)"
+        return "object"
+
+    def visit_Name(self, node):
+        # Track objects/variables used in function
+        if self.current_function and isinstance(node.ctx, ast.Load):
+            self.objects_used[self.current_function].add(node.id)
+        self.generic_visit(node)
+
+
+def build_call_graph(file_path: str) -> Tuple[nx.DiGraph, Dict[str, Set[str]], Dict[str, int]]:
+    """Build a call graph from a Python file."""
+    with open(file_path, 'r') as file:
+        source_code = file.read()
+
+    # Parse the AST
+    tree = ast.parse(source_code)
+
+    # Visit the AST and build the call graph
+    visitor = FunctionCallVisitor()
+    visitor.visit(tree)
+
+    # Find functions that are called but not defined in this file
+    # (likely imported functions or methods on objects)
+    all_called = set()
+    for _, successors in nx.bfs_successors(visitor.graph, list(visitor.graph.nodes())[0] if visitor.graph.nodes() else None):
+        all_called.update(successors)
+
+    defined_funcs = set(visitor.function_locations.keys())
+    external_funcs = all_called - defined_funcs
+
+    print(f"External function calls: {len(external_funcs)}")
+
+    return visitor.graph, visitor.objects_used, visitor.function_locations
+
+
+def generate_random_trace(graph: nx.DiGraph, min_depth: int = 3, max_depth: int = 10) -> List[str]:
+    """Generate a random trace through the call graph with a minimum depth when possible."""
+    if not graph.nodes():
+        return []
+
+    # Find nodes that have outgoing edges as potential starting points
+    starting_candidates = [n for n in graph.nodes() if graph.out_degree(n) > 0]
+    if not starting_candidates:
+        starting_candidates = list(graph.nodes())
+
+    current = random.choice(starting_candidates)
+    trace = [current]
+    visited = set([current])
+
+    # Follow a random path down the graph
+    depth_attempts = 0
+    while depth_attempts < 50:  # Allow more attempts for deeper traces
+        # Get successors that haven't been visited to avoid cycles
+        successors = [s for s in graph.successors(current) if s not in visited]
+
+        if not successors:
+            # If we're at a leaf node but haven't reached min_depth, try backtracking
+            if len(trace) < min_depth and len(trace) > 1:
+                # Remove the current dead-end
+                visited.remove(current)
+                trace.pop()
+                current = trace[-1]
+                continue
+            # Otherwise, we've reached a valid end point
+            break
+
+        # Choose a successor with preference for those that have their own successors
+        weighted_successors = []
+        for s in successors:
+            # Assign weight based on number of outgoing edges
+            weight = max(1, graph.out_degree(s))
+            weighted_successors.extend([s] * weight)
+
+        if weighted_successors:
+            current = random.choice(weighted_successors)
+        else:
+            current = random.choice(successors)
+
+        trace.append(current)
+        visited.add(current)
+
+        # Stop if we've reached desired max depth
+        if len(trace) >= max_depth:
+            break
+
+        depth_attempts += 1
+
+    return trace
+
+
+def print_stack_trace(trace: List[str], file_path: str, objects_used: Dict[str, Set[str]],
+                     function_locations: Dict[str, int]):
+    """Print a stack-trace-like representation of the function call path."""
+    if not trace:
+        print("No functions found in the trace.")
+        return
+
+    file_name = Path(file_path).name
+    print(f"\n{'=' * 60}")
+    print(f"RANDOM FUNCTION CALL TRACE IN: {file_name}")
+    print(f"{'=' * 60}")
+
+    for i, func in enumerate(trace):
+        line_num = function_locations.get(func, '?')
+        indent = '  ' * i
+
+        # For the function name display
+        if i < len(trace) - 1:
+            arrow = "↓ calls"
+        else:
+            arrow = "⊥ (end)"
+
+        # Handle method calls differently
+        if '.' in func:
+            obj_name, method_name = func.rsplit('.', 1)
+            print(f"{indent}File \"{file_name}\", line {line_num}, in {obj_name} object")
+            print(f"{indent}  Method call: {method_name}()")
+        else:
+            print(f"{indent}File \"{file_name}\", line {line_num}, in {func}()")
+
+        # Show objects used in this function
+        if func in objects_used and objects_used[func]:
+            obj_list = ", ".join(objects_used[func])
+            print(f"{indent}  [Objects used: {obj_list}]")
+
+        # Show the arrow for the next function call
+        if i < len(trace) - 1:
+            print(f"{indent}  {arrow}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate a random function call trace from Python code")
+    parser.add_argument("file", help="Python file to analyze")
+    parser.add_argument("--traces", type=int, default=1, help="Number of random traces to generate")
+    parser.add_argument("--min-depth", type=int, default=3, help="Minimum depth of trace to try to achieve")
+    parser.add_argument("--max-depth", type=int, default=15, help="Maximum depth of trace")
+    parser.add_argument("--max-attempts", type=int, default=100, help="Maximum number of attempts to find a trace that meets min-depth")
+    args = parser.parse_args()
+
+    try:
+        graph, objects_used, function_locations = build_call_graph(args.file)
+
+        if not graph.nodes():
+            print(f"No functions found in {args.file}")
+            return
+
+        # Calculate graph stats
+        total_functions = len(graph.nodes())
+        total_calls = len(graph.edges())
+        max_call_depth = nx.dag_longest_path_length(nx.DiGraph(graph)) if nx.is_directed_acyclic_graph(graph) else "unknown (contains cycles)"
+
+        print(f"Found {total_functions} functions with {total_calls} call relationships in {args.file}")
+        print(f"Maximum theoretical call depth: {max_call_depth}")
+        print(f"Generating {args.traces} traces with minimum depth {args.min_depth}...")
+
+        traces_generated = 0
+        attempts = 0
+
+        while traces_generated < args.traces and attempts < args.max_attempts:
+            trace = generate_random_trace(graph, args.min_depth, args.max_depth)
+            attempts += 1
+
+            if len(trace) >= args.min_depth:
+                print_stack_trace(trace, args.file, objects_used, function_locations)
+                print(f"Trace depth: {len(trace)} (found after {attempts} attempts)")
+                traces_generated += 1
+                attempts = 0  # Reset attempts counter for next trace
+
+        if traces_generated < args.traces:
+            print(f"\nWARNING: Could only generate {traces_generated} traces of minimum depth {args.min_depth} "
+                  f"after {args.max_attempts} attempts.")
+            print(f"The codebase may not have enough deep call chains to satisfy the requested minimum depth.")
+            # Optionally suggest a smaller depth
+            if args.min_depth > 2:
+                print(f"Try using a smaller --min-depth value (e.g., {args.min_depth - 1}).")
+
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -248,17 +248,22 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
 @register_kernel("bitsandbytes::quantize_blockwise_kbit", "cuda")
 def _(A: torch.Tensor, k: int, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
-    torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+    torch._check(blocksize == 32, lambda: f"Only blocksize=32 is supported for k-bit quantization, got {blocksize}")
     torch._check(A.device.type == "cuda", lambda: "Input tensor must be on CUDA device")
     torch._check(code.device.type == "cuda", lambda: "Code tensor must be on CUDA device")
     torch._check(code.dtype == torch.float32, lambda: "Code must be float32")
     torch._check(A.is_contiguous(), lambda: "A must be contiguous")
     torch._check(code.is_contiguous(), lambda: "Code must be contiguous")
 
     n = A.numel()
-    blocks = -(n // -blocksize)
+    blocks = (n + 31) // 32  # Round up for 32-element blocks
     absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
-    out = torch.zeros_like(A, dtype=torch.uint8)
+    
+    # Calculate packed output size: ceil(n * k / 32) * 4 bytes (as uint32 words)
+    elements_per_word = 32 // k
+    packed_words = (n + elements_per_word - 1) // elements_per_word
+    packed_bytes = packed_words * 4
+    out = torch.zeros((packed_bytes,), device=A.device, dtype=torch.uint8)
 
     with torch.cuda.device_of(A):
         args = (
@@ -286,8 +291,19 @@ def _(A: torch.Tensor, k: int, code: torch.Tensor, blocksize: int) -> tuple[torc
 @register_kernel("bitsandbytes::dequantize_blockwise_kbit", "cuda")
 def _(A: torch.Tensor, k: int, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
     torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
-    out = torch.empty_like(A, dtype=dtype)
-    _dequantize_blockwise_kbit_impl(A, k, absmax, code, blocksize, dtype, out=out)
+    torch._check(blocksize == 32, lambda: f"Only blocksize=32 is supported for k-bit quantization, got {blocksize}")
+    
+    # Calculate original number of elements from packed tensor and absmax
+    elements_per_word = 32 // k
+    packed_words = A.numel() // 4  # A is uint8 tensor, 4 bytes per uint32 word
+    max_elements = packed_words * elements_per_word
+    
+    # Use absmax size to determine actual number of elements
+    blocks = absmax.numel()
+    n_elements = min(max_elements, blocks * 32)  # Each block has up to 32 elements
+    
+    out = torch.empty((n_elements,), device=A.device, dtype=dtype)
+    _dequantize_blockwise_kbit_impl(A, k, absmax, code, blocksize, dtype, n_elements, out=out)
     return out
 
 
@@ -302,15 +318,23 @@ def _(
     out: torch.Tensor,
 ) -> None:
     torch._check(k >= 2 and k <= 8, lambda: f"k must be between 2 and 8, got {k}")
+    torch._check(blocksize == 32, lambda: f"Only blocksize=32 is supported for k-bit quantization, got {blocksize}")
     torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
-    torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
-    _dequantize_blockwise_kbit_impl(A, k, absmax, code, blocksize, dtype, out=out)
+    
+    # Calculate expected number of elements
+    elements_per_word = 32 // k
+    packed_words = A.numel() // 4
+    blocks = absmax.numel()
+    n_elements = min(packed_words * elements_per_word, blocks * 32)
+    
+    torch._check(out.numel() == n_elements, lambda: f"Expected out.numel() == {n_elements}, got {out.numel()}")
+    _dequantize_blockwise_kbit_impl(A, k, absmax, code, blocksize, dtype, n_elements, out=out)
 
 
 def _dequantize_blockwise_kbit_impl(
-    A: torch.Tensor, k: int, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+    A: torch.Tensor, k: int, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, n_elements: int, out: torch.Tensor
 ) -> None:
-    torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+    torch._check(blocksize == 32, lambda: f"Only blocksize=32 is supported for k-bit quantization, got {blocksize}")
     torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
     torch._check(
         dtype in [torch.float16, torch.bfloat16, torch.float32],
@@ -326,7 +350,7 @@ def _dequantize_blockwise_kbit_impl(
             get_ptr(absmax),
             get_ptr(out),
             ct.c_int32(blocksize),
-            ct.c_int(A.numel()),
+            ct.c_int(n_elements),  # Use calculated n_elements instead of A.numel()
             _get_tensor_stream(A),
         )