Skip to content

Commit 6f85395

Browse files
authored
Merge pull request #18 from github/zkoppert-chunking
perf: Implement chunking and dictionary based lookups
2 parents 2d0487a + 83b11fe commit 6f85395

8 files changed

+390
-32
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ This action can be configured to authenticate with GitHub App Installation or Pe
159159
| `OUTPUT_FILE` | False | `innersource_report.md` | Output filename. |
160160
| `REPORT_TITLE` | False | `"InnerSource Report"` | Title to have on the report issue. |
161161
| `REPOSITORY` | True | `""` | The name of the repository you are trying to measure. Format `owner/repo` ie. `github/measure-innersource` |
162+
| `CHUNK_SIZE` | False | `100` | Number of items to process at once when fetching data. Increasing can improve performance but uses more memory. Minimum value is 10. |
162163

163164
## Understanding the Results
164165

config.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class EnvVars:
3434
repo (str): The name of the repository to measure InnerSource collaboration in
3535
output_file (str): The name of the file to write the report to
3636
rate_limit_bypass (bool): If set to TRUE, bypass the rate limit for the GitHub API
37+
chunk_size (int): The number of items to process at once when fetching data (for memory efficiency)
3738
"""
3839

3940
def __init__(
@@ -49,6 +50,7 @@ def __init__(
4950
repo: str,
5051
output_file: str,
5152
rate_limit_bypass: bool = False,
53+
chunk_size: int = 100,
5254
):
5355
self.gh_app_id = gh_app_id
5456
self.gh_app_installation_id = gh_app_installation_id
@@ -61,6 +63,7 @@ def __init__(
6163
self.repo = repo
6264
self.output_file = output_file
6365
self.rate_limit_bypass = rate_limit_bypass
66+
self.chunk_size = chunk_size
6467

6568
def __repr__(self):
6669
return (
@@ -75,7 +78,9 @@ def __repr__(self):
7578
f"{self.owner},"
7679
f"{self.repo},"
7780
f"{self.output_file},"
78-
f"{self.rate_limit_bypass}"
81+
f"{self.rate_limit_bypass},"
82+
f"{self.chunk_size}"
83+
")"
7984
)
8085

8186

@@ -159,6 +164,16 @@ def get_env_vars(test: bool = False) -> EnvVars:
159164
output_file = "innersource_report.md"
160165
rate_limit_bypass = get_bool_env_var("RATE_LIMIT_BYPASS", False)
161166

167+
# Get the chunk size for processing data in batches (for memory efficiency)
168+
chunk_size_str = os.getenv("CHUNK_SIZE", "100")
169+
try:
170+
chunk_size = int(chunk_size_str)
171+
# Ensure a reasonable minimum chunk size
172+
chunk_size = max(chunk_size, 10)
173+
except ValueError:
174+
# Default to 100 if not a valid integer
175+
chunk_size = 100
176+
162177
return EnvVars(
163178
gh_app_id,
164179
gh_app_installation_id,
@@ -171,4 +186,5 @@ def get_env_vars(test: bool = False) -> EnvVars:
171186
repo,
172187
output_file,
173188
rate_limit_bypass,
189+
chunk_size,
174190
)

markdown_writer.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ def write_to_markdown(
4242
report_file.write("no op\n\n")
4343
return
4444
report_file.write(f"## Repository: {repo_data.full_name}\n\n")
45-
innersource_ratio = innersource_ratio if innersource_ratio is not None else 0.0
4645
report_file.write(f"### InnerSource Ratio: {innersource_ratio:.2%}\n\n")
4746
report_file.write(
4847
f"### Original Commit Author: {original_commit_author} (Manager: {original_commit_author_manager})\n\n"
@@ -58,6 +57,8 @@ def write_to_markdown(
5857
if all_contributors:
5958
for contributor in all_contributors:
6059
report_file.write(f"- {contributor}\n")
60+
else:
61+
report_file.write("No contributors found.\n")
6162

6263
report_file.write("\n## Innersource Contributors:\n")
6364
if innersource_contributors:
@@ -70,9 +71,17 @@ def write_to_markdown(
7071
if innersource_contribution_counts:
7172
for contributor, count in innersource_contribution_counts.items():
7273
report_file.write(f"- {contributor}: {count} contributions\n")
74+
else:
75+
report_file.write("No InnerSource contribution counts available.\n")
7376

7477
report_file.write("\n## Team Member Contribution Counts:\n")
75-
if team_member_contribution_counts is not None:
78+
if team_member_contribution_counts:
79+
found_contributions = False
7680
for member, count in team_member_contribution_counts.items():
7781
if count > 0:
82+
found_contributions = True
7883
report_file.write(f"- {member}: {count} contributions\n")
84+
if not found_contributions:
85+
report_file.write("No team member contributions found.\n")
86+
else:
87+
report_file.write("No team member contribution counts available.\n")

measure_innersource.py

Lines changed: 72 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,6 @@ def main(): # pragma: no cover
119119
f"Original commit author: {original_commit_author}, \
120120
with manager: {original_commit_author_manager}"
121121
)
122-
123122
# Create a dictionary mapping users to their managers for faster lookups
124123
user_to_manager = {}
125124
manager_to_reports = {}
@@ -169,56 +168,100 @@ def main(): # pragma: no cover
169168
print(f"All contributors: {all_contributors}")
170169
print(f"Innersource contributors: {innersource_contributors}")
171170

172-
# Fetch all PRs and issues once
173-
print("Fetching all pull requests...")
174-
all_pulls = list(repo_data.pull_requests(state="all"))
175-
print(f"Found {len(all_pulls)} pull requests")
176-
177-
print("Fetching all issues...")
178-
all_issues = list(repo_data.issues(state="all"))
179-
print(f"Found {len(all_issues)} issues")
171+
# Process data in chunks to avoid memory issues while maintaining performance
172+
chunk_size = env_vars.chunk_size
173+
print(f"Using chunk size of {chunk_size} for data processing")
180174

181-
# Pre-process all data to create mappings of user to contribution counts
182175
print("Pre-processing contribution data...")
183176

184177
# Create mapping of commit authors to commit counts
178+
print("Processing commits...")
185179
commit_author_counts = {}
186180
for commit in commit_list:
187181
if hasattr(commit.author, "login"):
188182
author = commit.author.login
189183
commit_author_counts[author] = commit_author_counts.get(author, 0) + 1
190184

191-
# Create mapping of PR authors to PR counts
185+
# Process pull requests in chunks
186+
print("Processing pull requests in chunks...")
192187
pr_author_counts = {}
193-
for pull in all_pulls:
194-
author = pull.user.login
195-
pr_author_counts[author] = pr_author_counts.get(author, 0) + 1
196-
197-
# Create mapping of issue authors to issue counts
188+
total_prs = 0
189+
190+
# GitHub API returns an iterator that internally handles pagination
191+
# We'll manually chunk it to avoid loading everything at once
192+
pulls_iterator = repo_data.pull_requests(state="all")
193+
while True:
194+
# Process a chunk of pull requests
195+
chunk = []
196+
for _ in range(chunk_size):
197+
try:
198+
chunk.append(next(pulls_iterator))
199+
except StopIteration:
200+
break
201+
202+
if not chunk:
203+
break
204+
205+
# Update counts for this chunk
206+
for pull in chunk:
207+
if hasattr(pull.user, "login"):
208+
author = pull.user.login
209+
pr_author_counts[author] = pr_author_counts.get(author, 0) + 1
210+
211+
total_prs += len(chunk)
212+
print(f" Processed {total_prs} pull requests so far...")
213+
214+
print(f"Found and processed {total_prs} pull requests")
215+
216+
# Process issues in chunks
217+
print("Processing issues in chunks...")
198218
issue_author_counts = {}
199-
for issue in all_issues:
200-
if hasattr(issue.user, "login"):
201-
author = issue.user.login
202-
issue_author_counts[author] = issue_author_counts.get(author, 0) + 1
203-
204-
# Count contributions for each innersource contributor
219+
total_issues = 0
220+
221+
# GitHub API returns an iterator that internally handles pagination
222+
# We'll manually chunk it to avoid loading everything at once
223+
issues_iterator = repo_data.issues(state="all")
224+
while True:
225+
# Process a chunk of issues
226+
chunk = []
227+
for _ in range(chunk_size):
228+
try:
229+
chunk.append(next(issues_iterator))
230+
except StopIteration:
231+
break
232+
233+
if not chunk:
234+
break
235+
236+
# Update counts for this chunk
237+
for issue in chunk:
238+
if hasattr(issue.user, "login"):
239+
author = issue.user.login
240+
issue_author_counts[author] = issue_author_counts.get(author, 0) + 1
241+
242+
total_issues += len(chunk)
243+
print(f" Processed {total_issues} issues so far...")
244+
245+
print(f"Found and processed {total_issues} issues")
246+
247+
# Count contributions for each innersource contributor using precompiled dictionaries
205248
innersource_contribution_counts = {}
206249
print("Counting contributions for each innersource contributor...")
207250
for contributor in innersource_contributors:
208251
# Initialize counter for this contributor
209252
innersource_contribution_counts[contributor] = 0
210253

211-
# Add commit counts
254+
# Add commit counts from the precompiled dictionary
212255
innersource_contribution_counts[contributor] += commit_author_counts.get(
213256
contributor, 0
214257
)
215258

216-
# Add PR counts
259+
# Add PR counts from the precompiled dictionary
217260
innersource_contribution_counts[contributor] += pr_author_counts.get(
218261
contributor, 0
219262
)
220263

221-
# Add issue counts
264+
# Add issue counts from the precompiled dictionary
222265
innersource_contribution_counts[contributor] += issue_author_counts.get(
223266
contributor, 0
224267
)
@@ -227,22 +270,22 @@ def main(): # pragma: no cover
227270
for contributor, count in innersource_contribution_counts.items():
228271
print(f" {contributor}: {count} contributions")
229272

230-
# count contributions for each user in team_members_that_own_the_repo
273+
# Count contributions for each team member using precompiled dictionaries
231274
team_member_contribution_counts = {}
232275
print("Counting contributions for each team member that owns the repo...")
233276
for member in team_members_that_own_the_repo:
234277
# Initialize counter for this team member
235278
team_member_contribution_counts[member] = 0
236279

237-
# Add commit counts
280+
# Add commit counts from the precompiled dictionary
238281
team_member_contribution_counts[member] += commit_author_counts.get(
239282
member, 0
240283
)
241284

242-
# Add PR counts
285+
# Add PR counts from the precompiled dictionary
243286
team_member_contribution_counts[member] += pr_author_counts.get(member, 0)
244287

245-
# Add issue counts
288+
# Add issue counts from the precompiled dictionary
246289
team_member_contribution_counts[member] += issue_author_counts.get(
247290
member, 0
248291
)

test_config_chunk_size.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Tests for config.py specifically for the chunk_size parameter
3+
"""
4+
5+
from config import get_env_vars
6+
7+
8+
class TestChunkSize:
    """Tests for the CHUNK_SIZE environment variable handling in config.py."""

    @staticmethod
    def _configure_env(monkeypatch, chunk_size=None):
        # Set the environment variables get_env_vars requires; CHUNK_SIZE is
        # only set when an explicit value is supplied by the test.
        monkeypatch.setenv("REPOSITORY", "owner/repo")
        monkeypatch.setenv("GH_TOKEN", "token")
        if chunk_size is not None:
            monkeypatch.setenv("CHUNK_SIZE", chunk_size)

    def test_get_env_vars_with_default_chunk_size(self, monkeypatch):
        """chunk_size falls back to the default (100) when CHUNK_SIZE is unset."""
        self._configure_env(monkeypatch)
        assert get_env_vars(test=True).chunk_size == 100

    def test_get_env_vars_with_custom_chunk_size(self, monkeypatch):
        """chunk_size honours an explicit CHUNK_SIZE value."""
        self._configure_env(monkeypatch, "200")
        assert get_env_vars(test=True).chunk_size == 200

    def test_get_env_vars_with_small_chunk_size(self, monkeypatch):
        """chunk_size is clamped up to the minimum of 10 when the value is too small."""
        self._configure_env(monkeypatch, "5")
        assert get_env_vars(test=True).chunk_size == 10

    def test_get_env_vars_with_invalid_chunk_size(self, monkeypatch):
        """chunk_size reverts to the default (100) when CHUNK_SIZE is not an integer."""
        self._configure_env(monkeypatch, "not_a_number")
        assert get_env_vars(test=True).chunk_size == 100

0 commit comments

Comments (0)