diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..5c1da0f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,48 @@ +# Git +.git +.github +.gitignore + +# Docker +.dockerignore +Dockerfile + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.nox/ + +# DB files (unless you need them) +*.db + +# Benchmark results +benchmark_results/ + +# VSCode +.vscode/ + +# Other +*.log +*.swp +.DS_Store \ No newline at end of file diff --git a/.github/workflows/test-changelog-scripts.yml b/.github/workflows/test-changelog-scripts.yml new file mode 100644 index 0000000..d7020c8 --- /dev/null +++ b/.github/workflows/test-changelog-scripts.yml @@ -0,0 +1,48 @@ +name: Test Changelog Scripts (Disabled) + +# Workflow is disabled by only allowing manual triggers +# To re-enable automatic triggers, uncomment the push/pull_request sections +on: + # Manual trigger only + workflow_dispatch: + # push: + # paths: + # - 'scripts/**' + # - '.github/workflows/test-changelog-scripts.yml' + # pull_request: + # paths: + # - 'scripts/**' + # - '.github/workflows/test-changelog-scripts.yml' + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install markdown matplotlib + # Install any other dependencies your project needs + if [ -f requirements.txt ]; then + pip install -r requirements.txt + fi + + - name: Run unit tests for extract_changelog_context.py + run: python -m scripts.test_extract_changelog_context + + - name: Run unit tests for generate_changelog_entry.py + run: python -m scripts.test_generate_changelog_entry + + - name: Run unit tests for 
name: Update Changelog on PR

on:
  pull_request:
    # `closed` is required: the update-changelog-on-merge job below checks
    # `github.event.pull_request.merged == true`, which is only observable
    # on a `closed` event. Without it that job could never run.
    types: [opened, synchronize, reopened, closed]
    branches:
      - main

# The merge job pushes to main and the preview job comments on the PR;
# default GITHUB_TOKEN permissions may be read-only, so request them here.
permissions:
  contents: write
  pull-requests: write

jobs:
  generate-pr-changelog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install openai markdown matplotlib numpy tiktoken

      - name: Get PR diff information
        id: get_pr_diff
        run: |
          # Get the base and head commits for the PR
          PR_HEAD_SHA="${{ github.event.pull_request.head.sha }}"
          PR_BASE_SHA="${{ github.event.pull_request.base.sha }}"

          # Get commits in this PR
          git log --pretty=format:"%h - %s (%an)" $PR_BASE_SHA..$PR_HEAD_SHA > pr_commits.txt

          # Get files changed in this PR
          git diff --name-status $PR_BASE_SHA..$PR_HEAD_SHA > pr_files_changed.txt

          # Get detailed diff of changed files (for context)
          git diff --stat $PR_BASE_SHA..$PR_HEAD_SHA > pr_diff_stats.txt

          # Output for debugging
          echo "PR diff between: $PR_BASE_SHA and $PR_HEAD_SHA"

      - name: Analyze PR labels
        run: |
          PR_NUMBER="${{ github.event.pull_request.number }}"
          gh pr view $PR_NUMBER --json labels -q '.labels[].name' > pr_labels.txt
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Gather library context
        id: gather_context
        run: |
          # Run the context extraction script
          python scripts/extract_changelog_context.py

      - name: Create context embeddings
        id: create_embeddings
        env:
          OPENAI_API_KEY: ${{ vars.OPENAI_API_KEY || secrets.OPENAI_API_KEY }}
        run: |
          # Generate embeddings for all context files to reduce token usage
          python scripts/create_changelog_embeddings.py

          # Store embedding statistics for monitoring
          echo "Embedding stats:"
          python -c "import json; from pathlib import Path; data = json.loads(Path('changelog_embeddings.json').read_text()); print(f'Total files embedded: {len(data[\"embeddings\"])}'); total_tokens = sum(data['token_counts'].values()); print(f'Total tokens in original files: {total_tokens}')"

      - name: Generate changelog entry for PR
        id: generate_pr_changelog
        env:
          OPENAI_API_KEY: ${{ vars.OPENAI_API_KEY || secrets.OPENAI_API_KEY }}
          PR_TITLE: ${{ github.event.pull_request.title }}
          PR_BODY: ${{ github.event.pull_request.body }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          REPO_NAME: ${{ github.repository }}
        run: |
          # Build the request payload with jq from the PR_* env vars rather
          # than interpolating ${{ }} expressions directly into JSON: the PR
          # title/body are attacker-controlled and direct interpolation
          # allows JSON/script injection.
          #
          # NOTE: the previous version appended the raw embeddings file under
          # a "context_embeddings" key. That is not an accepted Chat
          # Completions parameter (the API rejects unknown fields), and the
          # payload was also left without a closing brace. Both made every
          # request fail while the `jq` pipeline silently wrote "null".
          jq -n \
            --arg title "$PR_TITLE" \
            --arg body "$PR_BODY" \
            --arg number "$PR_NUMBER" \
            '{
              model: "gpt-4o",
              messages: [
                {role: "system", content: "You are a changelog generator that creates detailed, structured entries for pull requests. Generate a concise changelog entry in markdown format for the PR."},
                {role: "user", content: ("Generate a changelog entry for PR #" + $number + " with title: \"" + $title + "\". The PR description is: \"" + $body + "\". Use the repository context to create an appropriate entry.")}
              ]
            }' > payload.json

          # Make the API call; `// empty` leaves the file empty (instead of
          # the literal string "null") on error so the fallback below fires.
          curl -s -X POST https://api.openai.com/v1/chat/completions \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer $OPENAI_API_KEY" \
            --data-binary @payload.json | \
            jq -r '.choices[0].message.content // empty' > pr_changelog_entry.md

          # Fallback to traditional method if the API approach fails
          if [ ! -s pr_changelog_entry.md ]; then
            echo "API approach failed, falling back to traditional method..."
            python scripts/generate_changelog_entry.py > pr_changelog_entry.md
          fi

          # Save the entry as a named multi-line step output; bare
          # `cat file >> $GITHUB_OUTPUT` is not valid output syntax.
          {
            echo "changelog<<CHANGELOG_EOF"
            cat pr_changelog_entry.md
            echo "CHANGELOG_EOF"
          } >> "$GITHUB_OUTPUT"

      - name: Generate visual changelog
        env:
          PR_TITLE: ${{ github.event.pull_request.title }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          REPO_NAME: ${{ github.repository }}
        run: |
          # Generate visual HTML changelog
          python scripts/generate_changelog_html.py

      - name: Create or update PR changelog file
        run: |
          PR_NUMBER="${{ github.event.pull_request.number }}"

          # Create the PR comment and save to a file
          cat pr_changelog_entry.md > "PR_${PR_NUMBER}_CHANGELOG.md"

          # Prepare comment with links to assets
          echo "## Changelog Preview for this PR:" > pr_comment.txt
          echo "" >> pr_comment.txt
          cat pr_changelog_entry.md >> pr_comment.txt
          echo "" >> pr_comment.txt
          echo "### Additional resources:" >> pr_comment.txt
          echo "- [Release Notes](RELEASE_NOTES.md)" >> pr_comment.txt
          echo "" >> pr_comment.txt
          echo "This will be automatically added to CHANGELOG.md when merged." >> pr_comment.txt

          # Add a comment to the PR with the changelog preview
          gh pr comment $PR_NUMBER --body-file pr_comment.txt
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload changelog assets
        uses: actions/upload-artifact@v4
        with:
          name: changelog-assets
          path: |
            pr_changelog_entry.md
            RELEASE_NOTES.md
            changelog_visual.html
            impact_analysis.txt
            test_coverage_analysis.txt
            impact_analysis.json
            commit_categories.json
            commit_categories.txt
            changelog_embeddings.json

  update-changelog-on-merge:
    runs-on: ubuntu-latest
    # Only run when the PR has just been merged into main. `merged` is only
    # true on the `closed` event, which the trigger above now includes.
    if: github.event.action == 'closed' && github.event.pull_request.merged == true
    needs: generate-pr-changelog
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          # We need a token with write permissions to push to main
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Download changelog assets
        uses: actions/download-artifact@v4
        with:
          name: changelog-assets

      - name: Update CHANGELOG.md
        run: |
          # Prepend the new entry, keeping any existing history below it.
          if [ -f CHANGELOG.md ]; then
            NEW_ENTRY=$(cat pr_changelog_entry.md)
            EXISTING=$(cat CHANGELOG.md)
            echo -e "$NEW_ENTRY\n\n$EXISTING" > CHANGELOG.md
          else
            HEADER="# Changelog\n\n"
            NEW_ENTRY=$(cat pr_changelog_entry.md)
            echo -e "$HEADER$NEW_ENTRY" > CHANGELOG.md
          fi

      - name: Commit and push changes
        run: |
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git add CHANGELOG.md

          # Also add release assets if available
          if [ -f RELEASE_NOTES.md ]; then
            git add RELEASE_NOTES.md
          fi
          if [ -f changelog_visual.html ]; then
            git add changelog_visual.html
          fi

          git commit -m "Update CHANGELOG.md with changes from PR #${{ github.event.pull_request.number }}"
          git push
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
#!/usr/bin/env python3
"""Estimate token usage and API cost for the changelog-generation prompt.

Reads the same context files that generate_changelog_entry.py consumes,
rebuilds a realistic prompt from them, counts its tokens with tiktoken,
and writes a cost summary to changelog_token_analysis.json.
"""
import json
import os
import sys


def truncate_text(content, max_chars):
    """Return *content* cut to at most *max_chars* characters.

    Appends "..." when truncation occurs; a falsy *max_chars* (None or 0)
    disables truncation. Shared by the per-file token counting below so the
    same limits as the real prompt builder are applied.
    """
    if max_chars and len(content) > max_chars:
        return content[:max_chars] + "..."
    return content


def count_tokens_in_file(file_path, max_chars=None):
    """Count tokens in a file using tiktoken, optionally truncating content.

    Returns a ``(token_count, content)`` tuple; on any read/encoding error
    the file is treated as empty and ``(0, "")`` is returned.
    """
    # Imported lazily so this module stays importable (and the installer in
    # __main__ can run) even when tiktoken is not installed yet.
    import tiktoken

    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        content = truncate_text(content, max_chars)

        # cl100k_base is the tokenizer used by GPT-4 and other recent models.
        encoder = tiktoken.get_encoding("cl100k_base")
        return len(encoder.encode(content)), content
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return 0, ""


def count_changelog_context_tokens():
    """Count tokens in files used as context for changelog generation.

    Mirrors the prompt assembly in generate_changelog_entry.py, prints a
    per-file breakdown plus a cost estimate, and saves the results to
    changelog_token_analysis.json.
    """
    import tiktoken

    # (file, max_chars) pairs matching the truncation limits applied by
    # generate_changelog_entry.py; None means "no truncation".
    context_files = [
        ('readme_content.txt', 800),       # README content
        ('module_info.txt', 1000),         # Module information
        ('project_structure.txt', 500),    # Project structure
        ('changelog_history.txt', 500),    # Previous changelog entries
        ('commit_categories.txt', None),   # Conventional commits analysis
        ('impact_analysis.txt', None),     # Code impact analysis
        ('pr_commits.txt', None),          # PR commits
        ('pr_files_changed.txt', None),    # Files changed in PR
        ('pr_diff_stats.txt', None),       # Diff statistics
    ]

    total_tokens = 0
    file_token_counts = {}
    truncated_contents = {}

    # Per-file token counts (informational breakdown).
    for file_name, max_chars in context_files:
        if os.path.exists(file_name):
            tokens, truncated_content = count_tokens_in_file(file_name, max_chars)
            file_token_counts[file_name] = tokens
            truncated_contents[file_name] = truncated_content
            total_tokens += tokens
            truncation_note = f" (truncated to {max_chars} chars)" if max_chars else ""
            print(f"{file_name}: {tokens:,} tokens{truncation_note}")
        else:
            print(f"{file_name}: File not found")

    # Simulated PR metadata normally supplied through environment variables.
    pr_content = {
        'PR_NUMBER': '123',
        'PR_TITLE': 'Sample PR title',
        'PR_BODY': 'This is a sample PR description that might be several sentences long.',
        'REPO_NAME': 'username/repository-name'
    }

    encoder = tiktoken.get_encoding("cl100k_base")
    pr_content_str = "\n".join(f"{k}: {v}" for k, v in pr_content.items())
    pr_content_tokens = len(encoder.encode(pr_content_str))

    print(f"PR metadata (env vars): ~{pr_content_tokens:,} tokens (simulated)")
    total_tokens += pr_content_tokens
    file_token_counts['PR metadata (env vars)'] = pr_content_tokens

    # Reconstruct the prompt the way generate_changelog_entry.py assembles
    # it so the token count reflects what is actually sent to the API.
    project_context = f"""
    # Library Context
    Repository: {pr_content['REPO_NAME']}

    ## Project Structure (directories only):
    {truncated_contents.get('project_structure.txt', '(Sample project structure would be here)')}

    ## Brief README Summary:
    {truncated_contents.get('readme_content.txt', '(Sample README content would be here)')}

    ## Key Modules:
    {truncated_contents.get('module_info.txt', '(Sample module info would be here)')}

    ## Changelog Format and History:
    {truncated_contents.get('changelog_history.txt', '(Sample changelog history would be here)')}
    """

    # These sections are only included when the corresponding analysis files
    # exist, matching the conditional logic of the real workflow.
    conventional_commit_info = ""
    if os.path.exists('commit_categories.txt'):
        conventional_commit_info = f"""
    ## Conventional Commits Analysis:
    {truncated_contents.get('commit_categories.txt', '')}
    """

    impact_analysis_info = ""
    if os.path.exists('impact_analysis.txt'):
        impact_analysis_info = f"""
    ## Impact Analysis:
    {truncated_contents.get('impact_analysis.txt', '')}
    """

    # Build the complete prompt as used in the actual workflow.
    sample_prompt = f"""
    Based on the following PR information, generate a concise changelog entry in markdown format for a new version.

    PR #{pr_content['PR_NUMBER']}: {pr_content['PR_TITLE']}

    PR Description:
    {pr_content['PR_BODY']}

    PR Labels: enhancement, bug

    Commits in this PR:
    {truncated_contents.get('pr_commits.txt', '(Sample commit data would be here)')}

    Files changed:
    {truncated_contents.get('pr_files_changed.txt', '(Sample files changed would be here)')}

    Diff statistics:
    {truncated_contents.get('pr_diff_stats.txt', '(Sample diff stats would be here)')}
    {conventional_commit_info}
    {impact_analysis_info}

    {project_context}

    Based on the analysis, the suggested semantic version bump would be minor (from 1.0.0).
    The current date is: 2023-12-01.

    Follow this exact format for the changelog:
    ## [VERSION] - DATE

    ### Category1
    - Description of changes:
      - Detailed point 1
      - Detailed point 2

    ### Category2
    - Description of other changes
    """

    prompt_tokens = len(encoder.encode(sample_prompt))
    print(f"Actual prompt with truncated content: {prompt_tokens:,} tokens")

    # The per-file counts above are informational only; what is actually sent
    # is the assembled prompt, so the running total is replaced by it here.
    total_tokens = prompt_tokens
    file_token_counts['Complete prompt'] = prompt_tokens

    print(f"\nTotal tokens for realistic changelog context: {total_tokens:,}")
    print(f"Estimated token usage for OpenAI API: {total_tokens:,}")

    # Cost estimates (as of current OpenAI GPT-4 pricing).
    gpt4_input_price = 0.01 / 1000   # $0.01 per 1K input tokens
    gpt4_output_price = 0.03 / 1000  # $0.03 per 1K output tokens
    estimated_output_tokens = 500    # typical size of a generated entry

    input_cost = total_tokens * gpt4_input_price
    output_cost = estimated_output_tokens * gpt4_output_price
    total_cost = input_cost + output_cost

    print(f"\nEstimated cost per API call (GPT-4):")
    print(f"Input tokens: {total_tokens:,} tokens = ${input_cost:.4f}")
    print(f"Output tokens: ~{estimated_output_tokens:,} tokens = ${output_cost:.4f}")
    print(f"Total estimated cost: ${total_cost:.4f} per changelog generation")

    # Persist the analysis for downstream tooling.
    with open('changelog_token_analysis.json', 'w') as f:
        json.dump({
            'total_tokens': total_tokens,
            'file_token_counts': file_token_counts,
            'estimated_output_tokens': estimated_output_tokens,
            'estimated_cost': {
                'input_cost': input_cost,
                'output_cost': output_cost,
                'total_cost': total_cost
            }
        }, f, indent=2)

    print("\nResults saved to changelog_token_analysis.json")


if __name__ == "__main__":
    # The original imported tiktoken at module scope, which made this
    # install-on-demand fallback unreachable (the ImportError fired before
    # the guard could run). The import is now deferred into the functions.
    try:
        import tiktoken  # noqa: F401
    except ImportError:
        print("Installing tiktoken...")
        import subprocess
        # Use the running interpreter's pip, not whatever `pip` is on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "tiktoken"])

    count_changelog_context_tokens()
def get_openai_embedding(text, model="text-embedding-3-small"):
    """Return an embedding vector for *text*.

    This demo delegates to simulate_embedding(); the real OpenAI call is
    kept below for reference and requires the `openai` package plus an
    OPENAI_API_KEY in the environment.
    """
    # Production implementation (requires OpenAI API access):
    #
    #     from openai import OpenAI
    #
    #     client = OpenAI()
    #     response = client.embeddings.create(
    #         input=text,
    #         model=model
    #     )
    #     return response.data[0].embedding
    #
    # For demonstration, we'll use our simulation function.
    return simulate_embedding(text)


def truncate_text(text, max_chars):
    """Cut *text* to at most *max_chars* characters, appending "..." when
    truncation occurs. A falsy *max_chars* (None/0) disables truncation."""
    if max_chars and len(text) > max_chars:
        return text[:max_chars] + "..."
    return text


def count_tokens(text):
    """Count tokens in *text* using tiktoken's cl100k_base encoding."""
    # Deferred import so the install-on-demand fallback in __main__ can run
    # even when tiktoken is missing at module-import time.
    import tiktoken

    encoder = tiktoken.get_encoding("cl100k_base")
    return len(encoder.encode(text))


def create_changelog_context_embeddings():
    """Create embeddings for the changelog context files.

    Reads each known context file (truncated to its character budget),
    embeds it, and writes changelog_embeddings.json plus a sample prompt
    file demonstrating how the embeddings would be attached to a request.
    """
    # Context files with their maximum character limits (None = no limit).
    context_files = [
        ('readme_content.txt', 800),       # README content
        ('module_info.txt', 1000),         # Module information
        ('project_structure.txt', 500),    # Project structure
        ('changelog_history.txt', 500),    # Previous changelog entries
        ('commit_categories.txt', None),   # Conventional commits analysis
        ('impact_analysis.txt', None),     # Code impact analysis
        ('pr_commits.txt', None),          # PR commits
        ('pr_files_changed.txt', None),    # Files changed in PR
        ('pr_diff_stats.txt', None),       # Diff statistics
    ]

    embeddings = {}
    token_counts = {}
    content_samples = {}

    # Embed each context file that exists on disk.
    for file_name, max_chars in context_files:
        if os.path.exists(file_name):
            with open(file_name, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            truncated_content = truncate_text(content, max_chars)
            token_count = count_tokens(truncated_content)
            embedding = get_openai_embedding(truncated_content)

            embeddings[file_name] = embedding
            token_counts[file_name] = token_count
            content_samples[file_name] = truncated_content[:100] + "..." if len(truncated_content) > 100 else truncated_content

            print(f"Processed {file_name}: {token_count} tokens")
        else:
            print(f"{file_name}: File not found")

    # Save embeddings and metadata to file.
    output = {
        "embeddings": embeddings,
        "token_counts": token_counts,
        "content_samples": content_samples
    }

    with open('changelog_embeddings.json', 'w') as f:
        json.dump(output, f, indent=2)

    print(f"\nEmbeddings saved to changelog_embeddings.json")

    # Generate a sample prompt that incorporates embeddings.
    sample_prompt = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant that generates changelog entries based on PR information."},
            {"role": "user", "content": "Generate a changelog entry for this PR"}
        ],
        "context_embeddings": embeddings
    }

    with open('sample_embedding_prompt.json', 'w') as f:
        json.dump(sample_prompt, f, indent=2)

    print(f"Sample embedding prompt saved to sample_embedding_prompt.json")

    # Show how to use this in a GitHub workflow.
    workflow_example = """
# Example GitHub workflow snippet that uses embeddings
- name: Generate embeddings for context
  run: |
    python scripts/create_changelog_embeddings.py

- name: Generate changelog with embeddings
  id: generate_changelog
  run: |
    # Read the embeddings
    EMBEDDINGS=$(cat changelog_embeddings.json)

    # Prepare the API call with embeddings
    curl -X POST https://api.openai.com/v1/chat/completions \\
      -H "Content-Type: application/json" \\
      -H "Authorization: Bearer $OPENAI_API_KEY" \\
      -d '{
        "model": "gpt-4o",
        "messages": [
          {"role": "system", "content": "You are a helpful assistant that generates changelog entries."},
          {"role": "user", "content": "Generate a changelog entry for PR #${{ github.event.pull_request.number }}"}
        ],
        "context_embeddings": '$EMBEDDINGS'
      }'
    """

    print("\nExample of how to use embeddings in a GitHub workflow:")
    print(workflow_example)


if __name__ == "__main__":
    # The original imported tiktoken at module scope, making this fallback
    # unreachable; the import is now deferred into count_tokens() above.
    try:
        import tiktoken  # noqa: F401
    except ImportError:
        print("Installing tiktoken...")
        import subprocess
        import sys
        # Install with the running interpreter's pip for virtualenv safety.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "tiktoken"])

    create_changelog_context_embeddings()
#!/usr/bin/env python3
"""Summarize token counts and compression statistics from
changelog_embeddings.json (produced by create_changelog_embeddings.py)."""
import json


def main():
    """Load changelog_embeddings.json and print token/embedding statistics."""
    with open('changelog_embeddings.json', 'r') as f:
        data = json.load(f)

    # Display token counts.
    print("Token counts for each context file:")
    for file_name, token_count in data['token_counts'].items():
        print(f"{file_name}: {token_count} tokens")

    print("\nContent samples:")
    for file_name, sample in data['content_samples'].items():
        print(f"\n{file_name}:")
        print(f"{sample}")

    # Guard against an empty embeddings map: the original `next(iter(...))`
    # raised StopIteration (and later a ZeroDivisionError) in that case.
    if not data['embeddings']:
        print("\nNo embeddings found; skipping dimension/compression stats.")
        return

    # Embedding dimensions (all embeddings share the same length).
    first_embedding_key = next(iter(data['embeddings']))
    embedding_size = len(data['embeddings'][first_embedding_key])
    print(f"\nEmbedding dimensions: {embedding_size}")

    # Count total tokens.
    total_tokens = sum(data['token_counts'].values())
    print(f"Total tokens across all files: {total_tokens}")

    # Approximate size reduction: treat ~6 embedding floats as one token's
    # worth of payload. NOTE(review): this heuristic looks hand-wavy —
    # confirm the 6:1 ratio against how the embeddings are actually sent.
    raw_tokens = total_tokens
    embedding_tokens = len(data['embeddings']) * (embedding_size / 6)
    print(f"\nOriginal tokens: {raw_tokens}")
    print(f"Embedding size in token equivalent: ~{int(embedding_tokens)}")
    print(f"Compression ratio: {raw_tokens/embedding_tokens:.2f}x")


if __name__ == "__main__":
    main()
def simulate_embedding(text: str, embedding_dim: int = 1536) -> List[float]:
    """
    Simulate an embedding vector based on the text.
    Used as a fallback when the OpenAI API is not available.

    Args:
        text: The text to create an embedding for
        embedding_dim: The dimension of the embedding vector

    Returns:
        A deterministic, unit-length list of floats (the same text always
        yields the same vector)
    """
    # Create a deterministic hash of the text
    text_hash = hashlib.md5(text.encode()).hexdigest()

    def simple_prng(seed: str) -> float:
        """Simple PRNG based on hash value; returns a value in [0, 1)."""
        value = int(seed, 16)
        return (value % 10000) / 10000.0

    # Generate embedding values from hash fragments; each fragment hash
    # contributes up to 32 values until the vector is full.
    values: List[float] = []
    hash_fragments = math.ceil(embedding_dim / 8)

    for i in range(hash_fragments):
        # Generate a new hash for each fragment
        fragment_hash = hashlib.md5(f"{text_hash}{i}".encode()).hexdigest()

        # Rotate the fragment hash to derive each value
        for j in range(0, min(32, embedding_dim - len(values))):
            seed = fragment_hash[j % len(fragment_hash):] + fragment_hash[:j % len(fragment_hash)]
            values.append(simple_prng(seed) * 2 - 1)  # Scale to (-1, 1)

    # Normalize the vector (unit length)
    magnitude = math.sqrt(sum(v * v for v in values))
    return [v / magnitude for v in values]


def get_openai_embedding(text: str, model: str = "text-embedding-3-small") -> List[float]:
    """
    Get embeddings for text using the OpenAI API, falling back to the
    deterministic simulation when the API (or the package) is unavailable.

    Args:
        text: The text to create an embedding for
        model: The OpenAI model to use

    Returns:
        A list of floats representing the embedding vector
    """
    try:
        # Imported lazily: the original module-level `from openai import
        # OpenAI` crashed the whole script when the package was missing,
        # which defeated the simulated fallback this module was built around.
        from openai import OpenAI

        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

        response = client.embeddings.create(
            input=text,
            model=model
        )

        embedding = response.data[0].embedding

        print(f"✅ Successfully generated REAL OpenAI embedding using model: {model}")
        return embedding
    except Exception as e:
        print(f"❌ Error calling OpenAI API: {e}")
        print("⚠️ Using SIMULATED embeddings instead")
        return simulate_embedding(text)


def truncate_text(text: str, max_chars: Optional[int]) -> str:
    """
    Truncate text to a maximum number of characters.

    Args:
        text: The text to truncate
        max_chars: Maximum number of characters to include (None = no limit)

    Returns:
        Truncated text with "..." appended if truncation occurred
    """
    if max_chars is None or len(text) <= max_chars:
        return text

    return text[:max_chars] + "..."


def count_tokens(text: str) -> int:
    """
    Count the number of tokens in the text using tiktoken.

    Args:
        text: The text to count tokens for

    Returns:
        Number of tokens (falls back to ~1 token per 4 characters when
        tiktoken is unavailable or fails)
    """
    try:
        # Imported inside the try so a missing tiktoken actually reaches the
        # documented fallback instead of failing at module import time.
        import tiktoken

        encoding = tiktoken.encoding_for_model("gpt-4")
        return len(encoding.encode(text))
    except Exception as e:
        print(f"Error counting tokens: {e}")
        # Fallback: estimate 1 token per 4 characters
        return len(text) // 4


def create_changelog_context_embeddings() -> None:
    """
    Process context files, generate embeddings, and save them to a JSON file.

    This function:
    1. Embeds PR metadata taken from the PR_* environment variables
    2. Reads, truncates, and embeds each known context file
    3. Saves embeddings, token counts, and content samples to
       changelog_embeddings.json
    """
    embeddings: Dict[str, List[float]] = {}
    token_counts: Dict[str, int] = {}
    content_samples: Dict[str, str] = {}

    # Process PR metadata from environment variables.
    pr_metadata = ""
    pr_env_vars = ["PR_NUMBER", "PR_TITLE", "PR_BODY", "REPO_NAME"]

    for var in pr_env_vars:
        if var in os.environ:
            pr_metadata += f"{var}: {os.environ[var]}\n"

    if pr_metadata:
        truncated_pr_metadata = truncate_text(pr_metadata, 5000)
        content_samples["pr_metadata"] = truncated_pr_metadata
        token_counts["pr_metadata"] = count_tokens(truncated_pr_metadata)
        embeddings["pr_metadata"] = get_openai_embedding(truncated_pr_metadata)

    # Process each context file (context_files is declared at module scope).
    for file_name, max_chars in context_files:
        file_path = Path(file_name)

        # Skip if file doesn't exist
        if not file_path.exists():
            print(f"Warning: Context file '{file_name}' not found. Skipping.")
            continue

        content = file_path.read_text(encoding="utf-8")
        truncated_content = truncate_text(content, max_chars)

        content_samples[file_name] = truncated_content
        token_count = count_tokens(truncated_content)
        token_counts[file_name] = token_count

        embedding = get_openai_embedding(truncated_content)
        embeddings[file_name] = embedding

        print(f"Processed {file_name}: {token_count} tokens")

    # Create a sample prompt for generating changelog entries
    sample_prompt = """
    Given the PR information and the context of the repository,
    generate a clear and concise changelog entry for this PR
    in markdown format.
    """

    # Save embeddings and metadata to JSON file
    output = {
        "embeddings": embeddings,
        "token_counts": token_counts,
        "content_samples": content_samples,
        "sample_prompt": sample_prompt
    }

    with open("changelog_embeddings.json", "w") as f:
        json.dump(output, f, indent=2)

    print(f"Saved embeddings for {len(embeddings)} context files to changelog_embeddings.json")


if __name__ == "__main__":
    create_changelog_context_embeddings()
def extract_module_info():
    """Collect top-level docstrings from every Python module in the repo."""
    module_info = {}

    # Gather candidate .py files, skipping bytecode cache directories.
    python_files = []
    for root, _, files in os.walk('.'):
        if "__pycache__" in root:
            continue
        python_files.extend(
            os.path.join(root, name) for name in files if name.endswith('.py')
        )

    # Parse each module and keep its docstring when one exists.
    for filepath in sorted(python_files):
        try:
            with open(filepath, "r", encoding="utf-8") as mf:
                source = mf.read()
        except Exception:
            continue
        try:
            docstring = ast.get_docstring(ast.parse(source))
        except Exception:
            # Unparseable files (syntax errors etc.) are simply skipped.
            continue
        if docstring:
            module_info[filepath] = docstring.strip()

    # Persist a (truncated) summary for the changelog generator.
    with open("module_info.txt", "w", encoding="utf-8") as f:
        for module, doc in module_info.items():
            if len(doc) > 100:  # keep each entry short
                doc = doc[:100] + "..."
            f.write(f"{module}:\n{doc}\n\n")


def extract_project_structure():
    """Write the repository's directory layout to project_structure.txt."""
    # Load simple ignore rules from .gitignore, if present.
    gitignore_patterns = []
    if os.path.exists('.gitignore'):
        with open('.gitignore', 'r', encoding='utf-8') as f:
            gitignore_patterns = [
                ln.strip() for ln in f
                if ln.strip() and not ln.strip().startswith('#')
            ]

    def is_ignored(path):
        """Approximate gitignore matching: dir-prefix, single '*', exact name."""
        path = path.replace('\\', '/')
        if path.startswith('./'):
            path = path[2:]
        for pattern in gitignore_patterns:
            if pattern.endswith('/'):
                # Directory pattern: ignore everything beneath it.
                if path.startswith(pattern) or path.startswith(f"./{pattern}"):
                    return True
            elif '*' in pattern:
                # Only single-wildcard patterns like "*.log" are supported.
                pieces = pattern.split('*')
                if len(pieces) == 2 and path.startswith(pieces[0]) and path.endswith(pieces[1]):
                    return True
            elif path == pattern or path.endswith(f"/{pattern}"):
                return True
        return False

    project_structure = []
    for root, dirs, files in os.walk('.'):
        # Prune hidden dirs, virtualenvs, caches, and ignored paths in place.
        dirs[:] = [
            d for d in dirs
            if not d.startswith('.')
            and d != 'venv'
            and d != '__pycache__'
            and not is_ignored(os.path.join(root, d))
        ]
        if root != '.' and not is_ignored(root):
            project_structure.append(root)

    with open("project_structure.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{directory}\n" for directory in sorted(project_structure))
def extract_changelog_history():
    """Copy the newest (up to three) changelog entries into changelog_history.txt."""
    if not os.path.exists('CHANGELOG.md'):
        with open("changelog_history.txt", "w") as f:
            f.write("No CHANGELOG.md found")
        return

    try:
        with open("CHANGELOG.md", "r") as f:
            content = f.read()

        # Locate every "## [x.y.z]" version header.
        headers = list(re.finditer(r"## \[\d+\.\d+\.\d+\]", content))
        keep = min(3, len(headers))

        if keep > 0:
            # Everything before the fourth header (or EOF) spans three entries.
            cutoff = headers[keep].start() if keep < len(headers) else len(content)
            with open("changelog_history.txt", "w") as f:
                f.write(content[:cutoff].strip())
        else:
            with open("changelog_history.txt", "w") as f:
                f.write("No changelog entries found")
    except Exception as e:
        with open("changelog_history.txt", "w") as f:
            f.write(f"Error extracting changelog: {str(e)}")


def extract_readme():
    """Copy README.md into readme_content.txt (or note its absence)."""
    if os.path.exists('README.md'):
        with open('README.md', 'r') as f:
            text = f.read()
        with open('readme_content.txt', 'w') as f:
            f.write(text)
    else:
        with open('readme_content.txt', 'w') as f:
            f.write("No README.md found")
def analyze_conventional_commits():
    """Bucket PR commits by conventional-commit prefix into text/JSON reports."""
    if not os.path.exists('pr_commits.txt'):
        with open('commit_categories.json', 'w') as f:
            f.write('{}')
        return

    try:
        with open('pr_commits.txt', 'r') as f:
            commits = f.readlines()

        categories = {
            'feat': {'title': 'Features', 'items': []},
            'fix': {'title': 'Bug Fixes', 'items': []},
            'docs': {'title': 'Documentation', 'items': []},
            'style': {'title': 'Styling', 'items': []},
            'refactor': {'title': 'Code Refactoring', 'items': []},
            'perf': {'title': 'Performance', 'items': []},
            'test': {'title': 'Tests', 'items': []},
            'build': {'title': 'Build System', 'items': []},
            'ci': {'title': 'CI', 'items': []},
            'chore': {'title': 'Chores', 'items': []},
        }
        other_commits = []

        for raw in commits:
            message = raw.strip()
            # "prefix:" or "prefix(scope):" assigns the commit to a bucket.
            prefix = next(
                (p for p in categories if re.match(f'^{p}(\\(.*\\))?:', message)),
                None,
            )
            if prefix is not None:
                categories[prefix]['items'].append(message)
            else:
                other_commits.append(message)

        # Commits without a recognized prefix get their own bucket.
        if other_commits:
            categories['other'] = {'title': 'Other Changes', 'items': other_commits}

        # Human-readable summary...
        with open('commit_categories.txt', 'w') as f:
            for data in categories.values():
                if data['items']:
                    f.write(f"## {data['title']} ({len(data['items'])})\n")
                    for item in data['items']:
                        f.write(f"- {item}\n")
                    f.write("\n")

        # ...and machine-readable JSON for downstream scripts.
        with open('commit_categories.json', 'w') as f:
            json.dump(categories, f)

    except Exception as e:
        with open('commit_categories.json', 'w') as f:
            json.dump({"error": str(e)}, f)
def analyze_code_impact():
    """Summarize which components and file types a PR touches.

    Reads pr_files_changed.txt (git name-status output), aggregates counts
    per top-level directory and per file type, and writes both a text report
    (impact_analysis.txt) and JSON (impact_analysis.json).

    Fix: blank lines in the input previously raised IndexError via
    ``split()[-1]``, turning the whole report into an error message; they
    are now skipped.
    """
    if not os.path.exists('pr_files_changed.txt'):
        with open('impact_analysis.txt', 'w') as f:
            f.write("No files changed information available")
        return

    try:
        with open('pr_files_changed.txt', 'r') as f:
            files = f.readlines()

        components = {}
        extensions = {}
        # Friendly display names per file extension; unmapped extensions
        # fall through as themselves.
        file_types = {
            'py': 'Python',
            'js': 'JavaScript',
            'ts': 'TypeScript',
            'jsx': 'React',
            'tsx': 'React TypeScript',
            'go': 'Go',
            'java': 'Java',
            'c': 'C',
            'cpp': 'C++',
            'h': 'Headers',
            'md': 'Documentation',
            'yml': 'Configuration',
            'yaml': 'Configuration',
            'json': 'Configuration',
            'toml': 'Configuration',
            'sql': 'Database',
            'html': 'Frontend',
            'css': 'Frontend',
            'scss': 'Frontend',
        }

        for file_line in files:
            stripped = file_line.strip()
            if not stripped:
                # Skip blank lines (previously crashed with IndexError).
                continue

            # The path is the last whitespace-separated field; the git
            # status code (M, A, D, R100, ...) precedes it when present.
            file_path = stripped.split()[-1]

            # Deleted files do not contribute to the impact counts.
            if stripped.startswith('D'):
                continue

            # Component = top-level directory, when the file is nested.
            parts = file_path.split('/')
            if len(parts) > 1:
                component = parts[0]
                components[component] = components.get(component, 0) + 1

            # File-type tally keyed by the friendly extension name.
            ext = file_path.split('.')[-1] if '.' in file_path else 'unknown'
            file_type = file_types.get(ext, ext)
            extensions[file_type] = extensions.get(file_type, 0) + 1

        with open('impact_analysis.txt', 'w') as f:
            f.write("# Component Impact Analysis\n\n")

            f.write("## Components Changed\n")
            for component, count in sorted(components.items(), key=lambda x: x[1], reverse=True):
                f.write(f"{component}: {count} files\n")

            f.write("\n## File Types\n")
            for ext, count in sorted(extensions.items(), key=lambda x: x[1], reverse=True):
                f.write(f"{ext}: {count} files\n")

        # JSON copy for programmatic consumers (chart generation, etc.).
        with open('impact_analysis.json', 'w') as f:
            json.dump({
                'components': components,
                'file_types': extensions
            }, f)

    except Exception as e:
        with open('impact_analysis.txt', 'w') as f:
            f.write(f"Error analyzing impact: {str(e)}")
def analyze_pr_labels():
    """Map PR labels to broad change categories and dump label_analysis.json."""
    if not os.path.exists('pr_labels.txt'):
        return

    try:
        with open('pr_labels.txt', 'r') as f:
            labels = [label.strip() for label in f.readlines() if label.strip()]

        # Substring keywords mapped to their display categories.
        label_categories = {
            'bug': 'Bug Fix',
            'enhancement': 'Enhancement',
            'feature': 'New Feature',
            'breaking': 'Breaking Change',
            'documentation': 'Documentation',
            'refactor': 'Refactoring',
            'test': 'Testing',
            'performance': 'Performance',
            'dependency': 'Dependencies',
            'security': 'Security',
        }

        categorized_labels = {}
        for label in labels:
            for key, category in label_categories.items():
                if key in label.lower():
                    categorized_labels[label] = category
                    break
            if label not in categorized_labels:
                categorized_labels[label] = 'Other'

        with open('label_analysis.json', 'w') as f:
            json.dump({
                'raw_labels': labels,
                'categorized_labels': categorized_labels
            }, f)

    except Exception as e:
        with open('label_analysis.json', 'w') as f:
            json.dump({"error": str(e)}, f)


def analyze_test_coverage():
    """Compare modified test files against modified source files.

    Fixes:
    - classify by basename (test_*.py) or a tests/ path component instead of
      a path-prefix check, so e.g. ``scripts/test_foo.py`` is recognized as
      a test file;
    - skip blank lines, which previously raised IndexError via
      ``split()[-1]``.
    """
    if not os.path.exists('pr_files_changed.txt'):
        return

    try:
        with open('pr_files_changed.txt', 'r') as f:
            files = [line.strip().split()[-1] for line in f.readlines() if line.strip()]

        def _is_test_file(path):
            # Pytest-style test module anywhere in the tree, or under tests/.
            name = os.path.basename(path)
            parts = path.replace('\\', '/').split('/')
            return name.startswith('test') or 'tests' in parts[:-1]

        py_files = [f for f in files if f.endswith('.py')]
        test_files = [f for f in py_files if _is_test_file(f)]
        source_files = [f for f in py_files if not _is_test_file(f)]

        coverage_ratio = len(test_files) / len(source_files) if source_files else 0

        with open('test_coverage_analysis.txt', 'w') as f:
            f.write(f'Source files modified: {len(source_files)}\n')
            f.write(f'Test files modified: {len(test_files)}\n')
            f.write(f'Test coverage ratio: {coverage_ratio:.2f}\n')

            if coverage_ratio < 0.5 and source_files:
                f.write('\nWarning: Test coverage might be insufficient.\n')
                f.write('The following source files have changes:\n')
                for file in source_files:
                    f.write(f'- {file}\n')

    except Exception as e:
        with open('test_coverage_analysis.txt', 'w') as f:
            f.write(f"Error analyzing test coverage: {str(e)}")


def main():
    """Run every context-extraction step in sequence."""
    print("Extracting repository context...")
    extract_readme()
    extract_module_info()
    extract_project_structure()
    extract_changelog_history()
    analyze_conventional_commits()
    analyze_code_impact()
    analyze_pr_labels()
    analyze_test_coverage()
    print("Repository context extraction complete.")


if __name__ == "__main__":
    main()
def get_latest_version():
    """Return the newest version found in CHANGELOG.md, or '0.0.0'."""
    try:
        with open('CHANGELOG.md', 'r') as f:
            match = re.search(r'## \[(\d+\.\d+\.\d+)\]', f.read())
        if match:
            return match.group(1)
    except FileNotFoundError:
        pass
    return '0.0.0'


def bump_major_version(version):
    """Increment the major version (x.0.0)."""
    major, _, _ = version.split('.')
    return f"{int(major) + 1}.0.0"


def bump_minor_version(version):
    """Increment the minor version (0.x.0)."""
    major, minor, _ = version.split('.')
    return f"{major}.{int(minor) + 1}.0"


def increment_patch_version(version):
    """Increment the patch version (0.0.x)."""
    major, minor, patch = version.split('.')
    return f"{major}.{minor}.{int(patch) + 1}"


def _load_commit_categories():
    """Load commit_categories.json once; return {} when absent or unreadable."""
    if not os.path.exists('commit_categories.json'):
        return {}
    try:
        with open('commit_categories.json', 'r') as f:
            return json.load(f)
    except Exception:
        return {}


def determine_version_bump(changes, current_version, labels=None):
    """Pick a semantic-version bump (major/minor/patch) for the changes.

    Signals, in priority order: PR labels, conventional-commit analysis,
    then keyword heuristics on the change text. Defaults to a patch bump.

    Fix: commit_categories.json was previously opened and parsed twice;
    it is now loaded once via _load_commit_categories().
    """
    change_text = changes.lower()

    # --- Major bump signals ---
    if labels and any('breaking' in label.lower() or 'major' in label.lower()
                      for label in labels):
        return bump_major_version(current_version)

    categories = _load_commit_categories()
    for data in categories.values():
        if not isinstance(data, dict):
            # e.g. an {"error": "..."} payload — nothing to inspect.
            continue
        for item in data.get('items', []):
            # "BREAKING CHANGE" footer or "type!:" marker per the
            # Conventional Commits spec.
            if 'BREAKING CHANGE' in item or '!' in item.split(':')[0]:
                return bump_major_version(current_version)

    if any(keyword in change_text for keyword in
           ['break', 'breaking', 'incompatible', 'major update', 'not backward compatible']):
        return bump_major_version(current_version)

    # --- Minor bump signals ---
    if labels and any('feature' in label.lower() or 'enhancement' in label.lower()
                      for label in labels):
        return bump_minor_version(current_version)

    feat = categories.get('feat')
    if isinstance(feat, dict) and feat.get('items', []):
        return bump_minor_version(current_version)

    if any(keyword in change_text for keyword in
           ['feat', 'feature', 'add', 'new', 'implement', 'support for']):
        return bump_minor_version(current_version)

    # --- Default ---
    return increment_patch_version(current_version)


def read_file_or_default(filepath, default="No content available"):
    """Read a text file, returning *default* when it cannot be read.

    Fix: catch only OSError/UnicodeDecodeError rather than a bare
    ``except:``, which also swallowed KeyboardInterrupt and SystemExit.
    """
    try:
        with open(filepath, 'r') as f:
            return f.read()
    except (OSError, UnicodeDecodeError):
        return default


def read_labels():
    """Return PR labels from pr_labels.txt, or an empty list."""
    labels = []
    if os.path.exists('pr_labels.txt'):
        try:
            with open('pr_labels.txt', 'r') as f:
                labels = [label.strip() for label in f.readlines() if label.strip()]
        except Exception:
            pass
    return labels
def generate_with_openai(api_key, pr_info):
    """Generate a changelog entry via the OpenAI API.

    Builds a prompt from the PR details plus repository context and asks
    gpt-4o for a formatted entry. Any API failure falls back to
    generate_basic_entry().
    """
    try:
        client = OpenAI(api_key=api_key)

        def clip(text, limit):
            # Shorten long context sections so the prompt stays compact.
            return text[:limit] + '...' if len(text) > limit else text

        project_context = f"""
    # Library Context
    Repository: {pr_info['repo_name']}

    ## Project Structure (directories only):
    {clip(pr_info['project_structure'], 500)}

    ## Brief README Summary:
    {clip(pr_info['readme'], 800)}

    ## Key Modules:
    {clip(pr_info['module_info'], 1000)}

    ## Changelog Format and History:
    {clip(pr_info['changelog_history'], 500)}
    """

        # Optional analysis sections, included only when their source
        # files were produced by the context-extraction step.
        conventional_commit_info = ""
        if os.path.exists('commit_categories.txt'):
            conventional_commit_info = f"""
    ## Conventional Commits Analysis:
    {pr_info['commit_categories']}
    """

        impact_analysis_info = ""
        if os.path.exists('impact_analysis.txt'):
            impact_analysis_info = f"""
    ## Impact Analysis:
    {pr_info['impact_analysis']}
    """

        test_coverage_info = ""
        if os.path.exists('test_coverage_analysis.txt'):
            test_coverage_info = f"""
    ## Test Coverage Analysis:
    {pr_info['test_coverage']}
    """

        # Heuristic semver suggestion passed to the model as guidance.
        suggested_version = determine_version_bump(
            pr_info['pr_title'] + ' ' + pr_info['pr_body'] + ' ' + pr_info['commits'],
            pr_info['latest_version'],
            pr_info['labels'],
        )

        prompt = f"""
    Based on the following PR information, generate a concise changelog entry in markdown format for a new version.

    PR #{pr_info['pr_number']}: {pr_info['pr_title']}

    PR Description:
    {pr_info['pr_body']}

    PR Labels: {', '.join(pr_info['labels']) if pr_info['labels'] else 'None'}

    Commits in this PR:
    {pr_info['commits']}

    Files changed:
    {pr_info['files_changed']}

    Diff statistics:
    {pr_info['diff_stats']}
    {conventional_commit_info}
    {impact_analysis_info}
    {test_coverage_info}

    {project_context}

    Based on the analysis, the suggested semantic version bump would be {suggested_version} (from {pr_info['latest_version']}).
    The current date is: {pr_info['current_date']}.

    Follow this exact format for the changelog:
    ## [VERSION] - DATE

    ### Category1
    - Description of changes:
      - Detailed point 1
      - Detailed point 2

    ### Category2
    - Description of other changes
    """

        response = client.chat.completions.create(
            model='gpt-4o',
            messages=[{'role': 'user', 'content': prompt}],
            max_tokens=800
        )

        changelog_entry = response.choices[0].message.content
        print(f"Generated changelog entry for PR #{pr_info['pr_number']}")
        return changelog_entry

    except Exception as e:
        print(f"Error generating changelog with OpenAI: {str(e)}")
        return generate_basic_entry(pr_info)
def generate_basic_entry(pr_info):
    """Build a changelog entry without the OpenAI API.

    Prefers the conventional-commit categorization when available,
    otherwise emits a single-line entry from the PR title.
    """
    print("Generating basic changelog entry.")

    new_version = determine_version_bump(
        pr_info['pr_title'] + ' ' + pr_info['pr_body'] + ' ' + pr_info['commits'],
        pr_info['latest_version'],
        pr_info['labels'],
    )
    entry = f"## [{new_version}] - {pr_info['current_date']}\n\n"

    if os.path.exists('commit_categories.json'):
        try:
            with open('commit_categories.json', 'r') as f:
                categories = json.load(f)

            for data in categories.values():
                items = data.get('items', [])
                if items:
                    entry += f"### {data['title']}\n"
                    for item in items:
                        # Drop the "type(scope):" prefix, keep the message.
                        message = item.split(':', 1)[1].strip() if ':' in item else item
                        entry += f"- {message}\n"
                    entry += "\n"
            return entry
        except Exception:
            pass

    # Fallback when no conventional-commit data is available.
    entry += "### Changes\n"
    entry += f"- {pr_info['pr_title']} (#{pr_info['pr_number']})\n"
    return entry


def generate_release_notes(changelog_entry, pr_info):
    """Expand a changelog entry into RELEASE_NOTES.md with extra sections."""
    try:
        full_repo = pr_info['repo_name']
        repo_name = full_repo.split('/')[-1] if '/' in full_repo else full_repo

        version_match = re.search(r'## \[(\d+\.\d+\.\d+)\]', changelog_entry)
        version = version_match.group(1) if version_match else 'new-version'

        sections = [f"# Release Notes for {repo_name} v{version}\n\n", changelog_entry]

        sections.append("\n\n## Installation\n\n")
        sections.append(f"```\npip install {repo_name}\n```\n\n")

        # Optional analysis sections, when the extraction step produced them.
        if os.path.exists('impact_analysis.txt'):
            impact_analysis = read_file_or_default('impact_analysis.txt')
            sections.append(f"\n\n## Impact Analysis\n\n{impact_analysis}\n\n")
        if os.path.exists('test_coverage_analysis.txt'):
            test_coverage = read_file_or_default('test_coverage_analysis.txt')
            sections.append(f"\n\n## Test Coverage\n\n{test_coverage}\n\n")

        sections.append("## Additional Information\n\n")
        sections.append(
            f"For more details, see the [full changelog](https://github.com/{full_repo}/blob/main/CHANGELOG.md).\n"
        )

        with open('RELEASE_NOTES.md', 'w') as f:
            f.write("".join(sections))

        print("Release notes generated in RELEASE_NOTES.md")
    except Exception as e:
        print(f"Error generating release notes: {str(e)}")


def main():
    """Assemble PR context, generate the changelog entry, and save outputs."""
    pr_info = {
        'pr_number': os.getenv('PR_NUMBER', '0'),
        'pr_title': os.getenv('PR_TITLE', 'No title provided'),
        'pr_body': os.getenv('PR_BODY', 'No description provided'),
        'repo_name': os.getenv('REPO_NAME', 'Unknown repository'),
        'latest_version': get_latest_version(),
        'current_date': datetime.now().strftime('%Y-%m-%d'),
        'commits': read_file_or_default('pr_commits.txt'),
        'files_changed': read_file_or_default('pr_files_changed.txt'),
        'diff_stats': read_file_or_default('pr_diff_stats.txt'),
        'readme': read_file_or_default('readme_content.txt'),
        'module_info': read_file_or_default('module_info.txt'),
        'changelog_history': read_file_or_default('changelog_history.txt'),
        'project_structure': read_file_or_default('project_structure.txt'),
        'commit_categories': read_file_or_default('commit_categories.txt'),
        'impact_analysis': read_file_or_default('impact_analysis.txt'),
        'test_coverage': read_file_or_default('test_coverage_analysis.txt'),
        'labels': read_labels(),
    }

    # Use the API when both the SDK and a key are present; otherwise the
    # deterministic fallback.
    openai_api_key = os.getenv('OPENAI_API_KEY', '')
    if OPENAI_AVAILABLE and openai_api_key:
        changelog_entry = generate_with_openai(openai_api_key, pr_info)
    else:
        changelog_entry = generate_basic_entry(pr_info)

    with open('pr_changelog_entry.md', 'w') as f:
        f.write(changelog_entry)

    generate_release_notes(changelog_entry, pr_info)

    print("Changelog entry generated and saved to 'pr_changelog_entry.md'")

    # Echo to stdout so the GitHub workflow can capture it.
    print(changelog_entry)


if __name__ == "__main__":
    main()

{title}

+ +
+ Release Date: {date}
+ PR: #{pr_number} - {pr_title}
+ Repository: {repo_name} +
+ +
+ {charts_html} +
+ +
+ {changelog_html} +
+ +
+

Impact Analysis

+ {impact_html} +
+ + +""" + +def figure_to_base64(fig): + """Convert a matplotlib figure to a base64 string for embedding in HTML""" + buf = BytesIO() + fig.savefig(buf, format='png', dpi=100, bbox_inches='tight') + buf.seek(0) + img_str = base64.b64encode(buf.read()).decode('ascii') + return f"data:image/png;base64,{img_str}" + + +def create_commit_type_chart(): + """Create a pie chart showing commit types""" + if not MATPLOTLIB_AVAILABLE or not os.path.exists('commit_categories.json'): + return None, "No commit type data available" + + try: + with open('commit_categories.json', 'r') as f: + categories = json.load(f) + + # Extract counts and labels + labels = [] + counts = [] + + for category, data in categories.items(): + if data.get('items', []): + labels.append(data.get('title', category)) + counts.append(len(data.get('items', []))) + + if not counts: + return None, "No commit categories found" + + # Create figure + fig, ax = plt.subplots(figsize=(6, 4)) + ax.pie(counts, labels=labels, autopct='%1.1f%%', startangle=90) + ax.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle + plt.title('Commit Types') + + return fig, "Distribution of Commit Types" + except Exception as e: + return None, f"Error creating commit type chart: {str(e)}" + + +def create_file_impact_chart(): + """Create a bar chart showing file impact by component""" + if not MATPLOTLIB_AVAILABLE or not os.path.exists('impact_analysis.json'): + return None, "No impact analysis data available" + + try: + with open('impact_analysis.json', 'r') as f: + impact = json.load(f) + + components = impact.get('components', {}) + + if not components: + return None, "No component data found" + + # Sort by count and take top 10 + sorted_items = sorted(components.items(), key=lambda x: x[1], reverse=True)[:10] + labels = [item[0] for item in sorted_items] + counts = [item[1] for item in sorted_items] + + # Create figure + fig, ax = plt.subplots(figsize=(7, 5)) + ax.barh(labels, counts) + ax.set_xlabel('Number 
of Files') + ax.set_title('Files Changed by Component') + + # Add value labels + for i, v in enumerate(counts): + ax.text(v + 0.1, i, str(v), va='center') + + plt.tight_layout() + + return fig, "Files Changed by Component" + except Exception as e: + return None, f"Error creating file impact chart: {str(e)}" + + +def create_file_type_chart(): + """Create a bar chart showing file types changed""" + if not MATPLOTLIB_AVAILABLE or not os.path.exists('impact_analysis.json'): + return None, "No file type data available" + + try: + with open('impact_analysis.json', 'r') as f: + impact = json.load(f) + + file_types = impact.get('file_types', {}) + + if not file_types: + return None, "No file type data found" + + # Sort by count and take top 8 + sorted_items = sorted(file_types.items(), key=lambda x: x[1], reverse=True)[:8] + labels = [item[0] for item in sorted_items] + counts = [item[1] for item in sorted_items] + + # Create figure + fig, ax = plt.subplots(figsize=(6, 4)) + ax.bar(labels, counts) + ax.set_ylabel('Number of Files') + ax.set_title('Files Changed by Type') + + # Rotate x-axis labels if needed + plt.xticks(rotation=45, ha='right') + + plt.tight_layout() + + return fig, "Files Changed by Type" + except Exception as e: + return None, f"Error creating file type chart: {str(e)}" + + +def generate_html_changelog(pr_info): + """Generate HTML changelog with visual elements""" + if not MARKDOWN_AVAILABLE: + print("Error: markdown package required but not available.") + return + + # Get changelog content + changelog_content = "" + if os.path.exists('pr_changelog_entry.md'): + with open('pr_changelog_entry.md', 'r') as f: + changelog_content = f.read() + else: + print("Error: No changelog entry found at pr_changelog_entry.md") + return + + # Extract version and date + version_match = re.search(r'## \[(\d+\.\d+\.\d+)\]', changelog_content) + version = version_match.group(1) if version_match else "new-version" + + # Convert markdown to HTML + changelog_html = 
markdown.markdown(changelog_content) + + # Generate charts if matplotlib is available + charts_html = "" + if MATPLOTLIB_AVAILABLE: + # Create charts + commit_chart, commit_title = create_commit_type_chart() + impact_chart, impact_title = create_file_impact_chart() + file_type_chart, file_type_title = create_file_type_chart() + + # Add charts if available + if commit_chart: + img_data = figure_to_base64(commit_chart) + charts_html += f'
{commit_title}

{commit_title}

' + plt.close(commit_chart) + + if impact_chart: + img_data = figure_to_base64(impact_chart) + charts_html += f'
{impact_title}

{impact_title}

' + plt.close(impact_chart) + + if file_type_chart: + img_data = figure_to_base64(file_type_chart) + charts_html += f'
{file_type_title}

{file_type_title}

' + plt.close(file_type_chart) + + if not charts_html: + charts_html = "

No charts available. Install matplotlib for visual charts.

" + + # Get impact analysis content + impact_html = "" + if os.path.exists('impact_analysis.txt'): + with open('impact_analysis.txt', 'r') as f: + impact_content = f.read() + impact_html = markdown.markdown(impact_content) + else: + impact_html = "

No impact analysis available.

" + + # Test coverage analysis + test_coverage_html = "" + if os.path.exists('test_coverage_analysis.txt'): + with open('test_coverage_analysis.txt', 'r') as f: + test_coverage = f.read() + test_coverage_html = f"

Test Coverage

{test_coverage}
" + impact_html += test_coverage_html + + # Extract PR info + pr_number = pr_info.get('pr_number', '0') + pr_title = pr_info.get('pr_title', 'No title') + repo_name = pr_info.get('repo_name', 'Unknown') + current_date = pr_info.get('current_date', datetime.now().strftime('%Y-%m-%d')) + + # Generate HTML + html = HTML_TEMPLATE.format( + title=f"Changelog for v{version}", + date=current_date, + pr_number=pr_number, + pr_title=pr_title, + repo_name=repo_name, + charts_html=charts_html, + changelog_html=changelog_html, + impact_html=impact_html + ) + + # Write HTML to file + with open('changelog_visual.html', 'w') as f: + f.write(html) + + print(f"Visual changelog generated in changelog_visual.html") + + +def main(): + """Main entry point""" + # Get PR info from environment or default values + pr_info = { + 'pr_number': os.getenv('PR_NUMBER', '0'), + 'pr_title': os.getenv('PR_TITLE', 'No title provided'), + 'repo_name': os.getenv('REPO_NAME', 'Unknown repository'), + 'current_date': datetime.now().strftime('%Y-%m-%d') + } + + generate_html_changelog(pr_info) + + +if __name__ == "__main__": + main() \ No newline at end of file