github
diff --git a/auth.py b/auth.py
Lines changed: 102 additions & 19 deletions
diff --git a/config.py b/config.py
Lines changed: 78 additions & 10 deletions
diff --git a/markdown_helpers.py b/markdown_helpers.py
Lines changed: 74 additions & 13 deletions
@@ -1,4 +1,24 @@
-"""This is the module that contains functions related to authenticating to GitHub with a personal access token."""
+"""GitHub authentication module for the InnerSource measurement tool.
+
+This module provides functions for authenticating with GitHub using either Personal Access
+Tokens (PAT) or GitHub App installations. It supports both GitHub.com and GitHub Enterprise
+Server installations.
+
+Authentication Methods:
+    1. Personal Access Token (PAT) - Simple token-based authentication
+    2. GitHub App Installation - More secure app-based authentication with JWT
+
+The module handles the complexity of different authentication methods and provides
+a unified interface for establishing authenticated connections to GitHub's API.
+
+Functions:
+    auth_to_github: Create an authenticated GitHub client connection
+    get_github_app_installation_token: Obtain installation tokens for GitHub Apps
+
+Dependencies:
+    - github3.py: GitHub API client library
+    - requests: HTTP library for API calls
+"""
 
 import github3
 import requests
@@ -13,19 +33,52 @@ def auth_to_github(
     gh_app_enterprise_only: bool,
 ) -> github3.GitHub:
     """
-    Connect to GitHub.com or GitHub Enterprise, depending on env variables.
-
+    Establish an authenticated connection to GitHub.com or GitHub Enterprise.
+    
+    This function creates an authenticated GitHub client using either Personal Access Token
+    or GitHub App authentication. It supports both GitHub.com and GitHub Enterprise
+    installations.
+    
+    Authentication Priority:
+    1. GitHub App authentication (if all app credentials are provided)
+    2. Personal Access Token authentication (if token is provided)
+    
     Args:
-        token (str): the GitHub personal access token
-        gh_app_id (int | None): the GitHub App ID
-        gh_app_installation_id (int | None): the GitHub App Installation ID
-        gh_app_private_key_bytes (bytes): the GitHub App Private Key
-        ghe (str): the GitHub Enterprise URL
-        gh_app_enterprise_only (bool): Set this to true if the GH APP is created
-                                       on GHE and needs to communicate with GHE api only
+        token (str): The GitHub personal access token for authentication.
+                    Can be empty if using GitHub App authentication.
+        gh_app_id (int | None): The GitHub App ID for app-based authentication.
+                               Required along with other app credentials for app auth.
+        gh_app_installation_id (int | None): The GitHub App Installation ID.
+                                            Required for app-based authentication.
+        gh_app_private_key_bytes (bytes): The GitHub App Private Key as bytes.
+                                         Required for app-based authentication.
+        ghe (str): The GitHub Enterprise URL (e.g., "https://github.company.com").
+                  Leave empty for GitHub.com.
+        gh_app_enterprise_only (bool): Set to True if the GitHub App is created
+                                      on GitHub Enterprise and should only communicate
+                                      with the GHE API endpoint.
 
     Returns:
-        github3.GitHub: the GitHub connection object
+        github3.GitHub: An authenticated GitHub client object that can be used
+                       to make API calls to GitHub.
+    
+    Raises:
+        ValueError: If authentication fails due to:
+                   - Missing required credentials (no token or incomplete app credentials)
+                   - Unable to establish connection to GitHub
+    
+    Examples:
+        >>> # Using Personal Access Token
+        >>> client = auth_to_github(token="ghp_...", gh_app_id=None, 
+        ...                        gh_app_installation_id=None, 
+        ...                        gh_app_private_key_bytes=b"", 
+        ...                        ghe="", gh_app_enterprise_only=False)
+        
+        >>> # Using GitHub App
+        >>> client = auth_to_github(token="", gh_app_id=12345, 
+        ...                        gh_app_installation_id=67890,
+        ...                        gh_app_private_key_bytes=private_key_bytes,
+        ...                        ghe="", gh_app_enterprise_only=False)
     """
     if gh_app_id and gh_app_private_key_bytes and gh_app_installation_id:
         if ghe and gh_app_enterprise_only:
@@ -58,17 +111,47 @@ def get_github_app_installation_token(
     gh_app_installation_id: str,
 ) -> str | None:
     """
-    Get a GitHub App Installation token.
-    API: https://docs.github.com/en/apps/creating-github-apps/authenticating-with-a-github-app/authenticating-as-a-github-app-installation # noqa: E501
-
+    Obtain a GitHub App Installation access token using JWT authentication.
+    
+    This function creates a JWT token using the GitHub App's private key and exchanges
+    it for an installation access token that can be used to authenticate API requests
+    on behalf of the installed app.
+    
+    Reference: https://docs.github.com/en/apps/creating-github-apps/authenticating-with-a-github-app/authenticating-as-a-github-app-installation
+    
     Args:
-        ghe (str): the GitHub Enterprise endpoint
-        gh_app_id (str): the GitHub App ID
-        gh_app_private_key_bytes (bytes): the GitHub App Private Key
-        gh_app_installation_id (str): the GitHub App Installation ID
+        ghe (str): The GitHub Enterprise endpoint URL (e.g., "https://github.company.com").
+                  Leave empty for GitHub.com.
+        gh_app_id (str): The GitHub App ID as a string.
+        gh_app_private_key_bytes (bytes): The GitHub App Private Key in bytes format.
+                                         This should be the complete private key including
+                                         the header and footer.
+        gh_app_installation_id (str): The GitHub App Installation ID as a string.
+                                     This identifies the specific installation of the app.
 
     Returns:
-        str: the GitHub App token
+        str | None: The installation access token if successful, None if the request
+                   fails or if there's an error in the authentication process.
+    
+    Raises:
+        No exceptions are raised directly, but request failures are handled gracefully
+        and logged to the console.
+    
+    Notes:
+        - The token has a default expiration time (typically 1 hour)
+        - The token provides access to resources the app installation has been granted
+        - Network errors and API failures are handled gracefully with None return
+    
+    Examples:
+        >>> private_key = b"-----BEGIN PRIVATE KEY-----\\n...\\n-----END PRIVATE KEY-----"
+        >>> token = get_github_app_installation_token(
+        ...     ghe="", 
+        ...     gh_app_id="12345",
+        ...     gh_app_private_key_bytes=private_key,
+        ...     gh_app_installation_id="67890"
+        ... )
+        >>> if token:
+        ...     print("Successfully obtained installation token")
     """
     jwt_headers = github3.apps.create_jwt_headers(gh_app_private_key_bytes, gh_app_id)
     api_endpoint = f"{ghe}/api/v3" if ghe else "https://api.github.com"
 
@@ -85,14 +85,27 @@ def __repr__(self):
 
 
 def get_bool_env_var(env_var_name: str, default: bool = False) -> bool:
-    """Get a boolean environment variable.
+    """Get a boolean environment variable with proper type conversion.
+    
+    This function retrieves an environment variable and converts it to a boolean.
+    Only the string "true" (case-insensitive) is considered True; all other
+    values are considered False.
 
     Args:
-        env_var_name: The name of the environment variable to retrieve.
-        default: The default value to return if the environment variable is not set.
+        env_var_name (str): The name of the environment variable to retrieve.
+        default (bool, optional): The default value to return if the environment 
+                                 variable is not set or is empty. Defaults to False.
 
     Returns:
-        The value of the environment variable as a boolean.
+        bool: True if the environment variable is set to "true" (case-insensitive),
+              False otherwise, or the default value if the variable is not set.
+    
+    Examples:
+        >>> os.environ['TEST_VAR'] = 'true'
+        >>> get_bool_env_var('TEST_VAR')
+        True
+        >>> get_bool_env_var('NONEXISTENT_VAR', default=True)
+        True
     """
     ev = os.environ.get(env_var_name, "")
     if ev == "" and default:
@@ -101,13 +114,27 @@ def get_bool_env_var(env_var_name: str, default: bool = False) -> bool:
 
 
 def get_int_env_var(env_var_name: str) -> int | None:
-    """Get an integer environment variable.
+    """Get an integer environment variable with proper type conversion and validation.
+    
+    This function retrieves an environment variable and attempts to convert it to an integer.
+    If the conversion fails or the variable is not set, it returns None.
 
     Args:
-        env_var_name: The name of the environment variable to retrieve.
+        env_var_name (str): The name of the environment variable to retrieve.
 
     Returns:
-        The value of the environment variable as an integer or None.
+        int | None: The value of the environment variable as an integer, or None if
+                   the variable is not set, empty, or cannot be converted to an integer.
+    
+    Examples:
+        >>> os.environ['PORT'] = '8080'
+        >>> get_int_env_var('PORT')
+        8080
+        >>> get_int_env_var('NONEXISTENT_VAR') is None
+        True
+        >>> os.environ['INVALID_INT'] = 'not-a-number'
+        >>> get_int_env_var('INVALID_INT') is None
+        True
     """
     env_var = os.environ.get(env_var_name)
     if env_var is None or not env_var.strip():
@@ -120,9 +147,50 @@ def get_int_env_var(env_var_name: str) -> int | None:
 
 def get_env_vars(test: bool = False) -> EnvVars:
     """
-    Get the environment variables for use in the script.
-
-    Returns EnvVars object with all environment variables
+    Get and validate all environment variables required for the InnerSource measurement tool.
+    
+    This function loads environment variables from the system and an optional .env file,
+    validates them, and returns a structured EnvVars object containing all configuration
+    needed to run the tool.
+    
+    Args:
+        test (bool, optional): If True, skip loading the .env file (used for testing).
+                              Defaults to False.
+    
+    Returns:
+        EnvVars: A structured object containing all validated environment variables
+                and configuration settings.
+    
+    Raises:
+        ValueError: If required environment variables are missing or invalid:
+                   - Missing GitHub authentication (GH_TOKEN or GitHub App credentials)
+                   - Missing or invalid REPOSITORY format (must be "owner/repo")
+                   - Incomplete GitHub App credentials (missing ID, key, or installation ID)
+    
+    Environment Variables Required:
+        Authentication (choose one):
+        - GH_TOKEN: GitHub personal access token
+        - GH_APP_ID + GH_APP_PRIVATE_KEY + GH_APP_INSTALLATION_ID: GitHub App credentials
+        
+        Repository:
+        - REPOSITORY: Repository to analyze in "owner/repo" format
+        
+        Optional:
+        - GH_ENTERPRISE_URL: GitHub Enterprise URL (for on-premises installations)
+        - GITHUB_APP_ENTERPRISE_ONLY: Set to "true" for GHE-only GitHub Apps
+        - REPORT_TITLE: Custom title for the report (default: "InnerSource Report")
+        - OUTPUT_FILE: Output filename (default: "innersource_report.md")
+        - RATE_LIMIT_BYPASS: Set to "true" to bypass rate limiting
+        - CHUNK_SIZE: Number of items to process at once (default: 100, minimum: 10)
+    
+    Examples:
+        >>> os.environ['GH_TOKEN'] = 'ghp_...'
+        >>> os.environ['REPOSITORY'] = 'octocat/Hello-World'
+        >>> env_vars = get_env_vars()
+        >>> print(env_vars.owner)
+        octocat
+        >>> print(env_vars.repo)
+        Hello-World
     """
     if not test:  # pragma: no cover
         dotenv_path = join(dirname(__file__), ".env")
 
@@ -1,17 +1,49 @@
-"""Helper functions for working with markdown files."""
+"""Markdown file processing utilities for the InnerSource measurement tool.
+
+This module provides helper functions for working with markdown files, particularly
+for handling large files that may exceed GitHub's character limits for issue bodies.
+
+GitHub issues have a maximum character limit of 65,535 characters for the body content.
+When InnerSource reports are large, they need to be split into smaller files that can
+fit within this limit.
+
+Functions:
+    markdown_too_large_for_issue_body: Check if a markdown file is too large for GitHub issues
+    split_markdown_file: Split large markdown files into smaller, manageable chunks
+
+Common Use Cases:
+    - Splitting large InnerSource reports for GitHub issue compatibility
+    - Managing file sizes for various markdown-based systems with character limits
+    - Preparing reports for different output formats with size constraints
+"""
 
 
 def markdown_too_large_for_issue_body(file_path: str, max_char_count: int) -> bool:
     """
-    Check if the markdown file is too large to fit into a github issue.
-
-    Inputs:
-    file_path: str - the path to the markdown file to check
-    max_char_count: int - the maximum number of characters allowed in a github issue body
+    Check if a markdown file exceeds GitHub's issue body character limit.
+    
+    GitHub issues have a maximum character limit for the body content. This function
+    reads a markdown file and determines if it would exceed this limit.
+    
+    Args:
+        file_path (str): The path to the markdown file to check. Must be a valid
+                        file path that exists and is readable.
+        max_char_count (int): The maximum number of characters allowed in a GitHub
+                             issue body. For GitHub.com, this is typically 65,535.
 
     Returns:
-    bool - True if the file is too large, False otherwise
-
+        bool: True if the file contents exceed the character limit, False otherwise.
+    
+    Raises:
+        FileNotFoundError: If the specified file does not exist.
+        PermissionError: If the file cannot be read due to permission issues.
+        UnicodeDecodeError: If the file contains invalid UTF-8 encoding.
+    
+    Examples:
+        >>> # Check if a report is too large for GitHub issues
+        >>> is_too_large = markdown_too_large_for_issue_body("report.md", 65535)
+        >>> if is_too_large:
+        ...     print("File needs to be split for GitHub issues")
     """
     with open(file_path, "r", encoding="utf-8") as file:
         file_contents = file.read()
@@ -20,12 +52,41 @@ def markdown_too_large_for_issue_body(file_path: str, max_char_count: int) -> bo
 
 def split_markdown_file(file_path: str, max_char_count: int) -> None:
     """
-    Split the markdown file into smaller files.
-
-    Inputs:
-    file_path: str - the path to the markdown file to split
-    max_char_count: int - the maximum number of characters allowed before splitting markdown file
+    Split a large markdown file into smaller files that fit within size limits.
+    
+    This function reads a markdown file and splits it into multiple smaller files
+    when the original file is too large for GitHub issues or other systems with
+    character limits.
+    
+    Args:
+        file_path (str): The path to the markdown file to split. The file must exist
+                        and be readable. The function will create new files with
+                        numbered suffixes in the same directory.
+        max_char_count (int): The maximum number of characters allowed in each split
+                             file. Content will be split at this boundary.
 
+    Returns:
+        None: This function performs file operations and creates new split files.
+    
+    Side Effects:
+        - Creates new files with names like "{original_name}_0.md", "{original_name}_1.md", etc.
+        - Each new file contains a portion of the original content
+        - Files are created in the same directory as the original file
+        - The original file is not modified or deleted
+    
+    File Naming:
+        - Original file: "report.md"
+        - Split files: "report_0.md", "report_1.md", "report_2.md", etc.
+    
+    Raises:
+        FileNotFoundError: If the specified file does not exist.
+        PermissionError: If the file cannot be read or new files cannot be created.
+        UnicodeDecodeError: If the file contains invalid UTF-8 encoding.
+    
+    Examples:
+        >>> # Split a large report into smaller files
+        >>> split_markdown_file("large_report.md", 65535)
+        >>> # This creates: large_report_0.md, large_report_1.md, etc.
     """
     with open(file_path, "r", encoding="utf-8") as file:
         file_contents = file.read()