Skip to content

Commit 6f85395

Browse files
authored
Merge pull request #18 from github/zkoppert-chunking
perf: Implement chunking and dictionary based lookups
2 parents 2d0487a + 83b11fe commit 6f85395

8 files changed

+390
-32
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ This action can be configured to authenticate with GitHub App Installation or Pe
159159
| `OUTPUT_FILE` | False | `innersource_report.md` | Output filename. |
160160
| `REPORT_TITLE` | False | `"InnerSource Report"` | Title to have on the report issue. |
161161
| `REPOSITORY` | True | `""` | The name of the repository you are trying to measure. Format `owner/repo` ie. `github/measure-innersource` |
162+
| `CHUNK_SIZE` | False | `100` | Number of items to process at once when fetching data. Increasing can improve performance but uses more memory. Minimum value is 10. |
162163

163164
## Understanding the Results
164165

config.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class EnvVars:
3434
repo (str): The name of the repository to measure InnerSource collaboration in
3535
output_file (str): The name of the file to write the report to
3636
rate_limit_bypass (bool): If set to TRUE, bypass the rate limit for the GitHub API
37+
chunk_size (int): The number of items to process at once when fetching data (for memory efficiency)
3738
"""
3839

3940
def __init__(
@@ -49,6 +50,7 @@ def __init__(
4950
repo: str,
5051
output_file: str,
5152
rate_limit_bypass: bool = False,
53+
chunk_size: int = 100,
5254
):
5355
self.gh_app_id = gh_app_id
5456
self.gh_app_installation_id = gh_app_installation_id
@@ -61,6 +63,7 @@ def __init__(
6163
self.repo = repo
6264
self.output_file = output_file
6365
self.rate_limit_bypass = rate_limit_bypass
66+
self.chunk_size = chunk_size
6467

6568
def __repr__(self):
6669
return (
@@ -75,7 +78,9 @@ def __repr__(self):
7578
f"{self.owner},"
7679
f"{self.repo},"
7780
f"{self.output_file},"
78-
f"{self.rate_limit_bypass}"
81+
f"{self.rate_limit_bypass},"
82+
f"{self.chunk_size}"
83+
")"
7984
)
8085

8186

@@ -159,6 +164,16 @@ def get_env_vars(test: bool = False) -> EnvVars:
159164
output_file = "innersource_report.md"
160165
rate_limit_bypass = get_bool_env_var("RATE_LIMIT_BYPASS", False)
161166

167+
# Get the chunk size for processing data in batches (for memory efficiency)
168+
chunk_size_str = os.getenv("CHUNK_SIZE", "100")
169+
try:
170+
chunk_size = int(chunk_size_str)
171+
# Ensure a reasonable minimum chunk size
172+
chunk_size = max(chunk_size, 10)
173+
except ValueError:
174+
# Default to 100 if not a valid integer
175+
chunk_size = 100
176+
162177
return EnvVars(
163178
gh_app_id,
164179
gh_app_installation_id,
@@ -171,4 +186,5 @@ def get_env_vars(test: bool = False) -> EnvVars:
171186
repo,
172187
output_file,
173188
rate_limit_bypass,
189+
chunk_size,
174190
)

markdown_writer.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ def write_to_markdown(
4242
report_file.write("no op\n\n")
4343
return
4444
report_file.write(f"## Repository: {repo_data.full_name}\n\n")
45-
innersource_ratio = innersource_ratio if innersource_ratio is not None else 0.0
4645
report_file.write(f"### InnerSource Ratio: {innersource_ratio:.2%}\n\n")
4746
report_file.write(
4847
f"### Original Commit Author: {original_commit_author} (Manager: {original_commit_author_manager})\n\n"
@@ -58,6 +57,8 @@ def write_to_markdown(
5857
if all_contributors:
5958
for contributor in all_contributors:
6059
report_file.write(f"- {contributor}\n")
60+
else:
61+
report_file.write("No contributors found.\n")
6162

6263
report_file.write("\n## Innersource Contributors:\n")
6364
if innersource_contributors:
@@ -70,9 +71,17 @@ def write_to_markdown(
7071
if innersource_contribution_counts:
7172
for contributor, count in innersource_contribution_counts.items():
7273
report_file.write(f"- {contributor}: {count} contributions\n")
74+
else:
75+
report_file.write("No InnerSource contribution counts available.\n")
7376

7477
report_file.write("\n## Team Member Contribution Counts:\n")
75-
if team_member_contribution_counts is not None:
78+
if team_member_contribution_counts:
79+
found_contributions = False
7680
for member, count in team_member_contribution_counts.items():
7781
if count > 0:
82+
found_contributions = True
7883
report_file.write(f"- {member}: {count} contributions\n")
84+
if not found_contributions:
85+
report_file.write("No team member contributions found.\n")
86+
else:
87+
report_file.write("No team member contribution counts available.\n")

measure_innersource.py

Lines changed: 72 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ def main(): # pragma: no cover
119119
f"Original commit author: {original_commit_author}, \
120120
with manager: {original_commit_author_manager}"
121121
)
122-
123122
# Create a dictionary mapping users to their managers for faster lookups
124123
user_to_manager = {}
125124
manager_to_reports = {}
@@ -169,56 +168,100 @@ def main(): # pragma: no cover
169168
print(f"All contributors: {all_contributors}")
170169
print(f"Innersource contributors: {innersource_contributors}")
171170

172-
# Fetch all PRs and issues once
173-
print("Fetching all pull requests...")
174-
all_pulls = list(repo_data.pull_requests(state="all"))
175-
print(f"Found {len(all_pulls)} pull requests")
176-
177-
print("Fetching all issues...")
178-
all_issues = list(repo_data.issues(state="all"))
179-
print(f"Found {len(all_issues)} issues")
171+
# Process data in chunks to avoid memory issues while maintaining performance
172+
chunk_size = env_vars.chunk_size
173+
print(f"Using chunk size of {chunk_size} for data processing")
180174

181-
# Pre-process all data to create mappings of user to contribution counts
182175
print("Pre-processing contribution data...")
183176

184177
# Create mapping of commit authors to commit counts
178+
print("Processing commits...")
185179
commit_author_counts = {}
186180
for commit in commit_list:
187181
if hasattr(commit.author, "login"):
188182
author = commit.author.login
189183
commit_author_counts[author] = commit_author_counts.get(author, 0) + 1
190184

191-
# Create mapping of PR authors to PR counts
185+
# Process pull requests in chunks
186+
print("Processing pull requests in chunks...")
192187
pr_author_counts = {}
193-
for pull in all_pulls:
194-
author = pull.user.login
195-
pr_author_counts[author] = pr_author_counts.get(author, 0) + 1
196-
197-
# Create mapping of issue authors to issue counts
188+
total_prs = 0
189+
190+
# GitHub API returns an iterator that internally handles pagination
191+
# We'll manually chunk it to avoid loading everything at once
192+
pulls_iterator = repo_data.pull_requests(state="all")
193+
while True:
194+
# Process a chunk of pull requests
195+
chunk = []
196+
for _ in range(chunk_size):
197+
try:
198+
chunk.append(next(pulls_iterator))
199+
except StopIteration:
200+
break
201+
202+
if not chunk:
203+
break
204+
205+
# Update counts for this chunk
206+
for pull in chunk:
207+
if hasattr(pull.user, "login"):
208+
author = pull.user.login
209+
pr_author_counts[author] = pr_author_counts.get(author, 0) + 1
210+
211+
total_prs += len(chunk)
212+
print(f" Processed {total_prs} pull requests so far...")
213+
214+
print(f"Found and processed {total_prs} pull requests")
215+
216+
# Process issues in chunks
217+
print("Processing issues in chunks...")
198218
issue_author_counts = {}
199-
for issue in all_issues:
200-
if hasattr(issue.user, "login"):
201-
author = issue.user.login
202-
issue_author_counts[author] = issue_author_counts.get(author, 0) + 1
203-
204-
# Count contributions for each innersource contributor
219+
total_issues = 0
220+
221+
# GitHub API returns an iterator that internally handles pagination
222+
# We'll manually chunk it to avoid loading everything at once
223+
issues_iterator = repo_data.issues(state="all")
224+
while True:
225+
# Process a chunk of issues
226+
chunk = []
227+
for _ in range(chunk_size):
228+
try:
229+
chunk.append(next(issues_iterator))
230+
except StopIteration:
231+
break
232+
233+
if not chunk:
234+
break
235+
236+
# Update counts for this chunk
237+
for issue in chunk:
238+
if hasattr(issue.user, "login"):
239+
author = issue.user.login
240+
issue_author_counts[author] = issue_author_counts.get(author, 0) + 1
241+
242+
total_issues += len(chunk)
243+
print(f" Processed {total_issues} issues so far...")
244+
245+
print(f"Found and processed {total_issues} issues")
246+
247+
# Count contributions for each innersource contributor using precompiled dictionaries
205248
innersource_contribution_counts = {}
206249
print("Counting contributions for each innersource contributor...")
207250
for contributor in innersource_contributors:
208251
# Initialize counter for this contributor
209252
innersource_contribution_counts[contributor] = 0
210253

211-
# Add commit counts
254+
# Add commit counts from the precompiled dictionary
212255
innersource_contribution_counts[contributor] += commit_author_counts.get(
213256
contributor, 0
214257
)
215258

216-
# Add PR counts
259+
# Add PR counts from the precompiled dictionary
217260
innersource_contribution_counts[contributor] += pr_author_counts.get(
218261
contributor, 0
219262
)
220263

221-
# Add issue counts
264+
# Add issue counts from the precompiled dictionary
222265
innersource_contribution_counts[contributor] += issue_author_counts.get(
223266
contributor, 0
224267
)
@@ -227,22 +270,22 @@ def main(): # pragma: no cover
227270
for contributor, count in innersource_contribution_counts.items():
228271
print(f" {contributor}: {count} contributions")
229272

230-
# count contributions for each user in team_members_that_own_the_repo
273+
# Count contributions for each team member using precompiled dictionaries
231274
team_member_contribution_counts = {}
232275
print("Counting contributions for each team member that owns the repo...")
233276
for member in team_members_that_own_the_repo:
234277
# Initialize counter for this team member
235278
team_member_contribution_counts[member] = 0
236279

237-
# Add commit counts
280+
# Add commit counts from the precompiled dictionary
238281
team_member_contribution_counts[member] += commit_author_counts.get(
239282
member, 0
240283
)
241284

242-
# Add PR counts
285+
# Add PR counts from the precompiled dictionary
243286
team_member_contribution_counts[member] += pr_author_counts.get(member, 0)
244287

245-
# Add issue counts
288+
# Add issue counts from the precompiled dictionary
246289
team_member_contribution_counts[member] += issue_author_counts.get(
247290
member, 0
248291
)

test_config_chunk_size.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Tests for config.py specifically for the chunk_size parameter
3+
"""
4+
5+
from config import get_env_vars
6+
7+
8+
class TestChunkSize:
    """Tests for the CHUNK_SIZE environment variable handling in config.py."""

    @staticmethod
    def _configure_env(monkeypatch, chunk_size=None):
        # Set the environment variables get_env_vars requires; CHUNK_SIZE is
        # only set when an explicit value is supplied by the test.
        monkeypatch.setenv("REPOSITORY", "owner/repo")
        monkeypatch.setenv("GH_TOKEN", "token")
        if chunk_size is not None:
            monkeypatch.setenv("CHUNK_SIZE", chunk_size)

    def test_get_env_vars_with_default_chunk_size(self, monkeypatch):
        """chunk_size falls back to the default (100) when CHUNK_SIZE is unset."""
        self._configure_env(monkeypatch)
        assert get_env_vars(test=True).chunk_size == 100

    def test_get_env_vars_with_custom_chunk_size(self, monkeypatch):
        """chunk_size honours an explicit CHUNK_SIZE value."""
        self._configure_env(monkeypatch, "200")
        assert get_env_vars(test=True).chunk_size == 200

    def test_get_env_vars_with_small_chunk_size(self, monkeypatch):
        """chunk_size is clamped up to the minimum of 10 when the value is too small."""
        self._configure_env(monkeypatch, "5")
        assert get_env_vars(test=True).chunk_size == 10

    def test_get_env_vars_with_invalid_chunk_size(self, monkeypatch):
        """chunk_size reverts to the default (100) when CHUNK_SIZE is not an integer."""
        self._configure_env(monkeypatch, "not_a_number")
        assert get_env_vars(test=True).chunk_size == 100

0 commit comments

Comments (0)