|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +# Copyright 2025 Google LLC |
| 3 | +# |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +# |
| 10 | +# Unless required by applicable law or agreed to in writing, software |
| 11 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +# See the License for the specific language governing permissions and |
| 14 | +# limitations under the License. |
| 15 | +# |
| 16 | + |
| 17 | +""" |
| 18 | +A dual-purpose module for Python code analysis and BigQuery client generation. |
| 19 | +
|
| 20 | +When run as a script, it generates the BigQueryClient source code. |
| 21 | +When imported, it provides utility functions for parsing and exploring |
| 22 | +any Python codebase using the `ast` module. |
| 23 | +""" |
| 24 | + |
| 25 | +import ast |
| 26 | +import os |
| 27 | +from collections import defaultdict |
| 28 | +from typing import List, Dict, Any |
| 29 | + |
| 30 | +from . import utils |
| 31 | + |
| 32 | +# ============================================================================= |
| 33 | +# Section 1: Generic AST Analysis Utilities |
| 34 | +# ============================================================================= |
| 35 | + |
| 36 | + |
| 37 | +class CodeAnalyzer(ast.NodeVisitor): |
| 38 | + """ |
| 39 | + A node visitor to traverse an AST and extract structured information |
| 40 | + about classes, methods, and their arguments. |
| 41 | + """ |
| 42 | + |
| 43 | + def __init__(self): |
| 44 | + self.structure: List[Dict[str, Any]] = [] |
| 45 | + self.imports: set[str] = set() |
| 46 | + self.types: set[str] = set() |
| 47 | + self._current_class_info: Dict[str, Any] | None = None |
| 48 | + self._is_in_method: bool = False |
| 49 | + |
| 50 | + def _get_type_str(self, node: ast.AST | None) -> str | None: |
| 51 | + """Recursively reconstructs a type annotation string from an AST node.""" |
| 52 | + if node is None: |
| 53 | + return None |
| 54 | + # Handles simple names like 'str', 'int', 'HttpRequest' |
| 55 | + if isinstance(node, ast.Name): |
| 56 | + return node.id |
| 57 | + # Handles dotted names like 'service.GetDatasetRequest' |
| 58 | + if isinstance(node, ast.Attribute): |
| 59 | + # Attempt to reconstruct the full dotted path |
| 60 | + parts = [] |
| 61 | + curr = node |
| 62 | + while isinstance(curr, ast.Attribute): |
| 63 | + parts.append(curr.attr) |
| 64 | + curr = curr.value |
| 65 | + if isinstance(curr, ast.Name): |
| 66 | + parts.append(curr.id) |
| 67 | + return ".".join(reversed(parts)) |
| 68 | + # Handles subscripted types like 'list[str]', 'Optional[...]' |
| 69 | + if isinstance(node, ast.Subscript): |
| 70 | + value_str = self._get_type_str(node.value) |
| 71 | + slice_str = self._get_type_str(node.slice) |
| 72 | + return f"{value_str}[{slice_str}]" |
| 73 | + # Handles tuples inside subscripts, e.g., 'dict[str, int]' |
| 74 | + if isinstance(node, ast.Tuple): |
| 75 | + return ", ".join( |
| 76 | + [s for s in (self._get_type_str(e) for e in node.elts) if s] |
| 77 | + ) |
| 78 | + # Handles forward references as strings, e.g., '"Dataset"' |
| 79 | + if isinstance(node, ast.Constant): |
| 80 | + return repr(node.value) |
| 81 | + return None # Fallback for unhandled types |
| 82 | + |
| 83 | + def _collect_types_from_node(self, node: ast.AST | None) -> None: |
| 84 | + """Recursively traverses an annotation node to find and collect all type names.""" |
| 85 | + if node is None: |
| 86 | + return |
| 87 | + |
| 88 | + if isinstance(node, ast.Name): |
| 89 | + self.types.add(node.id) |
| 90 | + elif isinstance(node, ast.Attribute): |
| 91 | + type_str = self._get_type_str(node) |
| 92 | + if type_str: |
| 93 | + self.types.add(type_str) |
| 94 | + elif isinstance(node, ast.Subscript): |
| 95 | + self._collect_types_from_node(node.value) |
| 96 | + self._collect_types_from_node(node.slice) |
| 97 | + elif isinstance(node, (ast.Tuple, ast.List)): |
| 98 | + for elt in node.elts: |
| 99 | + self._collect_types_from_node(elt) |
| 100 | + elif isinstance(node, ast.Constant) and isinstance(node.value, str): |
| 101 | + self.types.add(node.value) |
| 102 | + elif isinstance(node, ast.BinOp) and isinstance( |
| 103 | + node.op, ast.BitOr |
| 104 | + ): # For | union type |
| 105 | + self._collect_types_from_node(node.left) |
| 106 | + self._collect_types_from_node(node.right) |
| 107 | + |
| 108 | + def visit_Import(self, node: ast.Import) -> None: |
| 109 | + """Catches 'import X' and 'import X as Y' statements.""" |
| 110 | + for alias in node.names: |
| 111 | + if alias.asname: |
| 112 | + self.imports.add(f"import {alias.name} as {alias.asname}") |
| 113 | + else: |
| 114 | + self.imports.add(f"import {alias.name}") |
| 115 | + self.generic_visit(node) |
| 116 | + |
| 117 | + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: |
| 118 | + """Catches 'from X import Y' statements.""" |
| 119 | + module = node.module or "" |
| 120 | + if not module: |
| 121 | + module = "." * node.level |
| 122 | + else: |
| 123 | + module = "." * node.level + module |
| 124 | + |
| 125 | + names = [] |
| 126 | + for alias in node.names: |
| 127 | + if alias.asname: |
| 128 | + names.append(f"{alias.name} as {alias.asname}") |
| 129 | + else: |
| 130 | + names.append(alias.name) |
| 131 | + |
| 132 | + if names: |
| 133 | + self.imports.add(f"from {module} import {', '.join(names)}") |
| 134 | + self.generic_visit(node) |
| 135 | + |
| 136 | + def visit_ClassDef(self, node: ast.ClassDef) -> None: |
| 137 | + """Visits a class definition node.""" |
| 138 | + class_info = { |
| 139 | + "class_name": node.name, |
| 140 | + "methods": [], |
| 141 | + "attributes": [], |
| 142 | + } |
| 143 | + |
| 144 | + # Extract class-level attributes (for proto.Message classes) |
| 145 | + for item in node.body: |
| 146 | + if isinstance(item, ast.AnnAssign) and isinstance(item.target, ast.Name): |
| 147 | + attr_name = item.target.id |
| 148 | + type_str = self._get_type_str(item.annotation) |
| 149 | + class_info["attributes"].append({"name": attr_name, "type": type_str}) |
| 150 | + |
| 151 | + self.structure.append(class_info) |
| 152 | + self._current_class_info = class_info |
| 153 | + self.generic_visit(node) |
| 154 | + self._current_class_info = None |
| 155 | + |
| 156 | + def visit_FunctionDef(self, node: ast.FunctionDef) -> None: |
| 157 | + """Visits a function/method definition node.""" |
| 158 | + if self._current_class_info: # This is a method |
| 159 | + args_info = [] |
| 160 | + |
| 161 | + # Get default values |
| 162 | + defaults = [self._get_type_str(d) for d in node.args.defaults] |
| 163 | + num_defaults = len(defaults) |
| 164 | + num_args = len(node.args.args) |
| 165 | + |
| 166 | + for i, arg in enumerate(node.args.args): |
| 167 | + arg_data = {"name": arg.arg, "type": self._get_type_str(arg.annotation)} |
| 168 | + |
| 169 | + # Match defaults to arguments from the end |
| 170 | + default_index = i - (num_args - num_defaults) |
| 171 | + if default_index >= 0: |
| 172 | + arg_data["default"] = defaults[default_index] |
| 173 | + |
| 174 | + args_info.append(arg_data) |
| 175 | + self._collect_types_from_node(arg.annotation) |
| 176 | + |
| 177 | + # Collect return type |
| 178 | + return_type = self._get_type_str(node.returns) |
| 179 | + self._collect_types_from_node(node.returns) |
| 180 | + |
| 181 | + method_info = { |
| 182 | + "method_name": node.name, |
| 183 | + "args": args_info, |
| 184 | + "return_type": return_type, |
| 185 | + } |
| 186 | + self._current_class_info["methods"].append(method_info) |
| 187 | + |
| 188 | + # Visit nodes inside the method to find instance attributes. |
| 189 | + self._is_in_method = True |
| 190 | + self.generic_visit(node) |
| 191 | + self._is_in_method = False |
| 192 | + |
| 193 | + def _add_attribute(self, attr_name: str, attr_type: str | None = None): |
| 194 | + """Adds a unique attribute to the current class context. |
| 195 | +
|
| 196 | + Assumes self._current_class_info is not None, as this method |
| 197 | + is only called from within visit_Assign and visit_AnnAssign |
| 198 | + after checking for an active class context. |
| 199 | + """ |
| 200 | + # Create a list of attribute names for easy lookup |
| 201 | + attr_names = [ |
| 202 | + attr.get("name") for attr in self._current_class_info["attributes"] |
| 203 | + ] |
| 204 | + if attr_name not in attr_names: |
| 205 | + self._current_class_info["attributes"].append( |
| 206 | + {"name": attr_name, "type": attr_type} |
| 207 | + ) |
| 208 | + |
| 209 | + def visit_Assign(self, node: ast.Assign) -> None: |
| 210 | + """Handles attribute assignments: `x = ...` and `self.x = ...`.""" |
| 211 | + if self._current_class_info: |
| 212 | + for target in node.targets: |
| 213 | + # Instance attribute: self.x = ... |
| 214 | + if ( |
| 215 | + isinstance(target, ast.Attribute) |
| 216 | + and isinstance(target.value, ast.Name) |
| 217 | + and target.value.id == "self" |
| 218 | + ): |
| 219 | + self._add_attribute(target.attr) |
| 220 | + # Class attribute: x = ... (only if not inside a method) |
| 221 | + elif isinstance(target, ast.Name) and not self._is_in_method: |
| 222 | + self._add_attribute(target.id) |
| 223 | + self.generic_visit(node) |
| 224 | + |
| 225 | + def visit_AnnAssign(self, node: ast.AnnAssign) -> None: |
| 226 | + """Handles annotated assignments: `x: int = ...` and `self.x: int = ...`.""" |
| 227 | + if self._current_class_info: |
| 228 | + target = node.target |
| 229 | + # Instance attribute: self.x: int = ... |
| 230 | + if ( |
| 231 | + isinstance(target, ast.Attribute) |
| 232 | + and isinstance(target.value, ast.Name) |
| 233 | + and target.value.id == "self" |
| 234 | + ): |
| 235 | + self._add_attribute(target.attr, self._get_type_str(node.annotation)) |
| 236 | + # Class attribute: x: int = ... |
| 237 | + # We identify it as a class attribute if the assignment happens |
| 238 | + # directly within the class body, not inside a method. |
| 239 | + elif isinstance(target, ast.Name) and not self._is_in_method: |
| 240 | + self._add_attribute(target.id, self._get_type_str(node.annotation)) |
| 241 | + self.generic_visit(node) |
| 242 | + |
| 243 | + |
def parse_code(code: str) -> tuple[List[Dict[str, Any]], set[str], set[str]]:
    """
    Analyzes Python source text with a ``CodeAnalyzer``.

    Args:
        code: A string containing Python code.

    Returns:
        A 3-tuple ``(structure, imports, types)``:
          - ``structure``: a list of dictionaries, one per class found.
          - ``imports``: a set of import statements as strings.
          - ``types``: a set of type annotation names as strings.
    """
    module_tree = ast.parse(code)
    visitor = CodeAnalyzer()
    visitor.visit(module_tree)
    return (visitor.structure, visitor.imports, visitor.types)
| 262 | + |
| 263 | + |
def parse_file(file_path: str) -> tuple[List[Dict[str, Any]], set[str], set[str]]:
    """
    Reads a Python file as UTF-8 text and analyzes it with ``parse_code``.

    Args:
        file_path: The absolute path to the Python file.

    Returns:
        The ``(structure, imports, types)`` tuple produced by ``parse_code``:
        the class structure, a set of import statements, and a set of
        type annotations.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        source_text = handle.read()
    # Parsing happens after the file has been closed.
    result = parse_code(source_text)
    return result
| 279 | + |
| 280 | + |
def list_code_objects(
    path: str,
    show_methods: bool = False,
    show_attributes: bool = False,
    show_arguments: bool = False,
) -> Any:
    """
    Lists classes and optionally their methods, attributes, and arguments
    from a given Python file or directory.

    This function consolidates the functionality of the various `list_*` functions.

    Args:
        path (str): The absolute path to a Python file or directory. A path
            that is neither a ``.py`` file nor a directory yields an empty
            result.
        show_methods (bool): Whether to include methods in the output.
        show_attributes (bool): Whether to include attributes in the output.
        show_arguments (bool): If True, includes method arguments. Implies show_methods.

    Returns:
        - If `show_methods` and `show_attributes` are both False, returns a
          sorted `List[str]` of class names (mimicking `list_classes`).
        - Otherwise, returns a `Dict[str, Dict[str, Any]]` containing the
          requested details about each class:
            - ``"attributes"``: the class's ``{"name", "type"}`` dicts,
              sorted by attribute name.
            - ``"methods"``: a sorted list of method names, or, with
              `show_arguments`, a dict mapping each method name to its
              list of argument dicts.
          When `path` is a directory, each key has the form
          ``"<class name> (in <file basename>)"``.
    """
    # If show_arguments is True, we must show methods.
    if show_arguments:
        show_methods = True

    results = defaultdict(dict)
    all_class_keys = []

    def process_structure(
        structure: List[Dict[str, Any]], file_name: str | None = None
    ):
        """Populates the results dictionary from the parsed AST structure."""
        for class_info in structure:
            key = class_info["class_name"]
            if file_name:
                key = f"{key} (in {file_name})"

            all_class_keys.append(key)

            if show_attributes:
                # Attributes are dicts, which do not support ordering
                # comparisons; a bare sorted() raises TypeError as soon as a
                # class has two of them, so sort on the name explicitly.
                results[key]["attributes"] = sorted(
                    class_info["attributes"], key=lambda attr: attr["name"]
                )

            if show_methods:
                # Sort methods by name for consistent output
                methods = sorted(
                    class_info["methods"], key=lambda m: m["method_name"]
                )
                if show_arguments:
                    results[key]["methods"] = {
                        m["method_name"]: m["args"] for m in methods
                    }
                else:
                    results[key]["methods"] = [m["method_name"] for m in methods]

    # Determine if the path is a file or directory and process accordingly
    if os.path.isfile(path) and path.endswith(".py"):
        structure, _, _ = parse_file(path)
        process_structure(structure)
    elif os.path.isdir(path):
        # This assumes `utils.walk_codebase` is defined elsewhere; presumably
        # it yields the paths of the Python files under `path` - TODO confirm.
        for file_path in utils.walk_codebase(path):
            structure, _, _ = parse_file(file_path)
            process_structure(structure, file_name=os.path.basename(file_path))

    # Return the data in the desired format based on the flags
    if not show_methods and not show_attributes:
        return sorted(all_class_keys)
    return dict(results)
0 commit comments