diff --git a/codegen-examples/examples/repo_analytics/README.md b/codegen-examples/examples/repo_analytics/README.md new file mode 100644 index 000000000..e072eb20e --- /dev/null +++ b/codegen-examples/examples/repo_analytics/README.md @@ -0,0 +1,140 @@ +# Repository Analytics + +This example demonstrates how to use Codegen to analyze repository metrics and generate comprehensive codebase analytics. For a complete walkthrough, check out our [tutorial](https://docs.codegen.com/tutorials/codebase-analytics-dashboard). + +## How the Analytics Script Works + +The script calculates five main categories of metrics: + +1. **Line Metrics** + + ```python + def count_lines(source: str): + """Count different types of lines in source code.""" + lines = [line.strip() for line in source.splitlines()] + loc = len(lines) + sloc = len([line for line in lines if line]) + # ... additional line counting logic ... + return loc, lloc, sloc, comments + ``` + + - Lines of Code (LOC): Total lines including blanks and comments + - Logical Lines (LLOC): Lines containing functional statements + - Source Lines (SLOC): Non-blank lines of code + - Comment Density: Percentage of comments relative to total lines + +1. **Cyclomatic Complexity** + + ```python + def calculate_cyclomatic_complexity(function): + def analyze_statement(statement): + complexity = 0 + if isinstance(statement, IfBlockStatement): + complexity += 1 + if hasattr(statement, "elif_statements"): + complexity += len(statement.elif_statements) + # ... additional complexity calculations ... + return complexity + + return 1 + analyze_block(function.code_block) + ``` + + - Measures number of code paths through the program + - Higher values indicate more complex control flow + - Helps identify functions that may need refactoring + +1. 
**Halstead Volume** + + ```python + def calculate_halstead_volume(operators, operands): + n1 = len(set(operators)) # unique operators + n2 = len(set(operands)) # unique operands + N1 = len(operators) # total operators + N2 = len(operands) # total operands + N = N1 + N2 + n = n1 + n2 + # ... volume calculation ... + return volume, N1, N2, n1, n2 + ``` + + - Measures program size based on vocabulary and length + - Uses distinct operators and operands to calculate volume + - Indicates cognitive load and program understanding effort + +1. **Depth of Inheritance** + + ```python + def calculate_doi(cls): + """Calculate the depth of inheritance for a given class.""" + return len(cls.superclasses) + ``` + + - Length of inheritance chain for classes + +1. **Maintainability Index** + + ```python + def calculate_maintainability_index(halstead_volume: float, cyclomatic_complexity: float, loc: int) -> int: + """Calculate the normalized maintainability index for a given function.""" + raw_mi = 171 - 5.2 * math.log(max(1, halstead_volume)) - 0.23 * cyclomatic_complexity - 16.2 * math.log(max(1, loc)) + normalized_mi = max(0, min(100, raw_mi * 100 / 171)) + return int(normalized_mi) + ``` + + - Normalized score (0-100) based on complexity, volume, and size + - Higher scores indicate better maintainability + +## Running the Analysis + +```bash +# Install Codegen +pip install codegen + +# Run the analysis +python run.py +``` + +The script will output a detailed report including: + +- Basic repository statistics +- Line metrics and comment density +- Complexity measurements +- Object-oriented metrics +- Overall maintainability scores + +## Example Output + +``` +📊 Repository Analysis Report 📊 +================================================== +📁 Repository: codegen-sh/codegen +📝 Description: [Repository description from GitHub] + +📈 Basic Metrics: + • Files: 42 + • Functions: 156 + • Classes: 23 + +📏 Line Metrics: + • Lines of Code: 4,521 + • Logical Lines: 2,845 + 
def calculate_cyclomatic_complexity(function):
    """Estimate the cyclomatic complexity of a function.

    The score starts at 1 and grows by one for every ``if``/``elif`` branch,
    every ``for``/``while`` loop and every ``except`` block, plus one for each
    ``" and "`` / ``" or "`` found in a string-valued ``condition``. Nested
    code blocks are walked recursively.

    Args:
        function: Object whose ``code_block`` exposes ``statements``.

    Returns:
        The complexity as an int; 1 when ``function`` has no usable
        ``code_block``.
    """

    def analyze_statement(statement):
        complexity = 0

        if isinstance(statement, IfBlockStatement):
            complexity += 1
            if hasattr(statement, "elif_statements"):
                complexity += len(statement.elif_statements)
        elif isinstance(statement, (ForLoopStatement, WhileStatement)):
            complexity += 1
        elif isinstance(statement, TryCatchStatement):
            complexity += len(getattr(statement, "except_blocks", []))

        # Boolean operators are only counted when the condition is plain text.
        condition = getattr(statement, "condition", None)
        if isinstance(condition, str):
            complexity += condition.count(" and ") + condition.count(" or ")

        if hasattr(statement, "nested_code_blocks"):
            for block in statement.nested_code_blocks:
                complexity += analyze_block(block)

        return complexity

    def analyze_block(block):
        if not block or not hasattr(block, "statements"):
            return 0
        return sum(analyze_statement(stmt) for stmt in block.statements)

    # analyze_block() returns 0 for a missing/empty block, so the base
    # complexity of 1 is preserved for functions without a code block.
    return 1 + analyze_block(getattr(function, "code_block", None))


# Inclusive upper bound of each rank; anything above the last bound is "F".
_CC_RANK_UPPER_BOUNDS = ((5, "A"), (10, "B"), (20, "C"), (30, "D"), (40, "E"))


def cc_rank(complexity):
    """Map a cyclomatic complexity score to a letter rank from "A" to "F".

    Thresholds are upper bounds, so 0 and fractional scores (for example an
    average of 5.5) land in a proper rank instead of falling between integer
    ranges and being reported as "F".

    Args:
        complexity: Non-negative complexity score (int or float).

    Returns:
        One of "A", "B", "C", "D", "E", "F".

    Raises:
        ValueError: If ``complexity`` is negative.
    """
    if complexity < 0:
        raise ValueError("Complexity must be a non-negative value")

    for upper_bound, rank in _CC_RANK_UPPER_BOUNDS:
        if complexity <= upper_bound:
            return rank
    return "F"


def calculate_doi(cls):
    """Calculate the depth of inheritance for a given class.

    Returns the number of entries in ``cls.superclasses``.
    """
    return len(cls.superclasses)


def _collect_expression_tokens(expr, operators, operands):
    """Append the operators/operands of one expression to the given lists."""
    if isinstance(expr, (BinaryExpression, ComparisonExpression)):
        operators.extend(op.source for op in expr.operators)
        operands.extend(elem.source for elem in expr.elements)
    elif isinstance(expr, UnaryExpression):
        operators.append(expr.ts_node.type)
        operands.append(expr.argument.source)


def get_operators_and_operands(function):
    """Collect Halstead operators and operands from a function's statements.

    Function calls contribute their name as an operator and the source of each
    argument as an operand. Binary, unary and comparison expressions found in
    a statement's ``expressions`` or ``expression`` attribute contribute their
    operators and elements. Only the statements directly in
    ``function.code_block.statements`` are visited.

    Args:
        function: Object whose ``code_block`` exposes ``statements``.

    Returns:
        Tuple ``(operators, operands)`` of lists of strings; both are empty
        when the function has no code block.
    """
    operators = []
    operands = []

    code_block = getattr(function, "code_block", None)
    statements = getattr(code_block, "statements", None) or []

    for statement in statements:
        for call in statement.function_calls:
            operators.append(call.name)
            operands.extend(arg.source for arg in call.args)

        if hasattr(statement, "expressions"):
            for expr in statement.expressions:
                _collect_expression_tokens(expr, operators, operands)

        if hasattr(statement, "expression"):
            _collect_expression_tokens(statement.expression, operators, operands)

    return operators, operands


def calculate_halstead_volume(operators, operands):
    """Compute the Halstead volume ``N * log2(n)``.

    Args:
        operators: Sequence of operator tokens (duplicates allowed).
        operands: Sequence of operand tokens (duplicates allowed).

    Returns:
        Tuple ``(volume, N1, N2, n1, n2)`` where ``N1``/``N2`` are the total
        operator/operand counts and ``n1``/``n2`` the distinct counts. The
        volume is 0 when there are no tokens at all.
    """
    n1 = len(set(operators))
    n2 = len(set(operands))
    N1 = len(operators)
    N2 = len(operands)

    vocabulary = n1 + n2
    length = N1 + N2

    volume = length * math.log2(vocabulary) if vocabulary > 0 else 0
    return volume, N1, N2, n1, n2


_TRIPLE_QUOTES = ('"""', "'''")
_CONTINUATION_ENDINGS = ("\\", ",", "{", "[", "(")


def _find_comment_start(line):
    """Return the index of the first ``#`` outside a quoted string, or -1."""
    quote = None
    i = 0
    while i < len(line):
        ch = line[i]
        if quote is not None:
            if ch == "\\":
                i += 2  # skip the escaped character
                continue
            if ch == quote:
                quote = None
        elif ch in "\"'":
            quote = ch
        elif ch == "#":
            return i
        i += 1
    return -1


def count_lines(source: str):
    """Count different types of lines in source code.

    Args:
        source: Source text to analyse.

    Returns:
        Tuple ``(loc, lloc, sloc, comments)``:

        * ``loc`` - all lines, including blanks.
        * ``lloc`` - logical lines: ``;``-separated statements on code lines,
          with continuation lines (after a trailing ``\\``, ``,``, ``{``,
          ``[`` or ``(``) not counted again.
        * ``sloc`` - non-blank lines.
        * ``comments`` - lines carrying a ``#`` comment plus every line that
          belongs to a multi-line triple-quoted string. Each line is counted
          at most once.

        All four are 0 for empty or whitespace-only input.
    """
    if not source.strip():
        return 0, 0, 0, 0

    lines = [line.strip() for line in source.splitlines()]
    loc = len(lines)
    sloc = sum(1 for line in lines if line)

    comments = 0
    code_lines = []
    open_delim = None  # delimiter of the multi-line string we are inside, if any

    for line in lines:
        if open_delim is not None:
            comments += 1
            # An odd number of delimiters on the line closes the string.
            if line.count(open_delim) % 2 == 1:
                open_delim = None
            continue

        code_part = line
        comment_start = _find_comment_start(line)
        if comment_start != -1:
            comments += 1
            code_part = line[:comment_start].strip()

        # An odd number of a triple-quote delimiter opens a multi-line string.
        opened = next((d for d in _TRIPLE_QUOTES if code_part.count(d) % 2 == 1), None)
        if opened is not None:
            open_delim = opened
            if comment_start == -1:
                comments += 1
            if code_part.startswith(_TRIPLE_QUOTES):
                code_part = ""  # a bare docstring opener carries no code

        if code_part:
            code_lines.append(code_part)

    lloc = 0
    continued_line = False
    for line in code_lines:
        continues = line.rstrip().endswith(_CONTINUATION_ENDINGS)
        if continued_line:
            # Still inside a statement started on an earlier line.
            continued_line = continues
            continue

        lloc += sum(1 for stmt in line.split(";") if stmt.strip())
        continued_line = continues

    return loc, lloc, sloc, comments


def calculate_maintainability_index(halstead_volume: float, cyclomatic_complexity: float, loc: int) -> int:
    """Calculate the normalized maintainability index for a given function.

    Args:
        halstead_volume: Halstead volume; values below 1 are treated as 1.
        cyclomatic_complexity: Cyclomatic complexity of the function.
        loc: Number of lines in the function body.

    Returns:
        An int in ``[0, 100]``. 100 when ``loc`` is not positive, and 0 when
        the inputs cannot be used in the formula (ValueError/TypeError).
    """
    if loc <= 0:
        return 100

    try:
        raw_mi = (
            171
            - 5.2 * math.log(max(1, halstead_volume))
            - 0.23 * cyclomatic_complexity
            - 16.2 * math.log(max(1, loc))
        )
        # Rescale from the raw 0..171 range to 0..100 and clamp.
        return int(max(0, min(100, raw_mi * 100 / 171)))
    except (ValueError, TypeError):
        return 0


def get_github_repo_description(repo_url):
    """Fetch a repository description from the GitHub REST API.

    Args:
        repo_url: Repository in ``owner/name`` form.

    Returns:
        The description string; ``"No description available"`` when the
        repository has none; ``""`` when the request fails or the API does not
        answer with HTTP 200.
    """
    api_url = f"https://api.github.com/repos/{repo_url}"

    try:
        # A timeout keeps the report from hanging on a stalled connection.
        response = requests.get(api_url, timeout=10)
    except requests.RequestException:
        return ""

    if response.status_code != 200:
        return ""

    # The key may be present with a null value, so fall back on any falsy value.
    return response.json().get("description") or "No description available"


def analyze_repo(repo_url: str) -> Dict[str, Any]:
    """Analyze a repository and return comprehensive metrics.

    Args:
        repo_url: Repository in ``owner/name`` form, passed to
            ``Codebase.from_repo`` and to the GitHub description lookup.

    Returns:
        Dict with the keys ``repo_url``, ``line_metrics``,
        ``cyclomatic_complexity``, ``depth_of_inheritance``,
        ``halstead_metrics``, ``maintainability_index``, ``description``,
        ``num_files``, ``num_functions`` and ``num_classes``. Averages are 0
        when there is nothing to average over.
    """
    codebase = Codebase.from_repo(repo_url)

    num_files = len(codebase.files(extensions="*"))
    num_functions = len(codebase.functions)
    num_classes = len(codebase.classes)

    total_loc = total_lloc = total_sloc = total_comments = 0
    for file in codebase.files:
        loc, lloc, sloc, comments = count_lines(file.source)
        total_loc += loc
        total_lloc += lloc
        total_sloc += sloc
        total_comments += comments

    callables = codebase.functions + [m for c in codebase.classes for m in c.methods]

    total_complexity = 0
    total_volume = 0
    total_mi = 0
    num_callables = 0
    for func in callables:
        # Skip callables without a body; the metrics below need one.
        code_block = getattr(func, "code_block", None)
        if code_block is None:
            continue

        complexity = calculate_cyclomatic_complexity(func)
        operators, operands = get_operators_and_operands(func)
        volume, _, _, _, _ = calculate_halstead_volume(operators, operands)
        func_loc = len(code_block.source.splitlines())

        total_complexity += complexity
        total_volume += volume
        total_mi += calculate_maintainability_index(volume, complexity, func_loc)
        num_callables += 1

    total_doi = sum(calculate_doi(cls) for cls in codebase.classes)

    desc = get_github_repo_description(repo_url)

    return {
        "repo_url": repo_url,
        "line_metrics": {
            "total": {
                "loc": total_loc,
                "lloc": total_lloc,
                "sloc": total_sloc,
                "comments": total_comments,
                "comment_density": (total_comments / total_loc * 100) if total_loc > 0 else 0,
            },
        },
        "cyclomatic_complexity": {
            # Divide by the number of analysed callables to get a real average.
            "average": total_complexity / num_callables if num_callables > 0 else 0,
        },
        "depth_of_inheritance": {
            "average": total_doi / num_classes if num_classes else 0,
        },
        "halstead_metrics": {
            "total_volume": int(total_volume),
            "average_volume": int(total_volume / num_callables) if num_callables > 0 else 0,
        },
        "maintainability_index": {
            "average": int(total_mi / num_callables) if num_callables > 0 else 0,
        },
        "description": desc,
        "num_files": num_files,
        "num_functions": num_functions,
        "num_classes": num_classes,
    }


def main():
    """Analyze the example repository and print a formatted report."""
    repo_url = "codegen-sh/codegen"
    results = analyze_repo(repo_url)

    print("\n📊 Repository Analysis Report 📊")
    print("=" * 50)
    print(f"📁 Repository: {results['repo_url']}")
    print(f"📝 Description: {results['description']}")

    print("\n📈 Basic Metrics:")
    print(f"  • Files: {results['num_files']}")
    print(f"  • Functions: {results['num_functions']}")
    print(f"  • Classes: {results['num_classes']}")

    print("\n📏 Line Metrics:")
    line_metrics = results["line_metrics"]["total"]
    print(f"  • Lines of Code: {line_metrics['loc']:,}")
    print(f"  • Logical Lines: {line_metrics['lloc']:,}")
    print(f"  • Source Lines: {line_metrics['sloc']:,}")
    print(f"  • Comments: {line_metrics['comments']:,}")
    print(f"  • Comment Density: {line_metrics['comment_density']:.1f}%")

    print("\n🔄 Complexity Metrics:")
    print(f"  • Average Cyclomatic Complexity: {results['cyclomatic_complexity']['average']:.1f}")
    print(f"  • Average Maintainability Index: {results['maintainability_index']['average']}")
    print(f"  • Average Depth of Inheritance: {results['depth_of_inheritance']['average']:.1f}")
    print(f"  • Total Halstead Volume: {results['halstead_metrics']['total_volume']:,}")
    print(f"  • Average Halstead Volume: {results['halstead_metrics']['average_volume']:,}")


if __name__ == "__main__":
    main()