
Commit 2f532ae

Thomas Stromberg authored and committed
Improve how we calculate PR tracking costs
1 parent 6d94ed2 commit 2f532ae

File tree

5 files changed: +90, -221 lines


README.md

Lines changed: 33 additions & 0 deletions
@@ -186,6 +186,39 @@ Calibrated on Windows Vista development data showing 4% weekly code churn. A PR

 **Reference**: Nagappan, N., et al. (2008). Organizational Structure and Software Quality. *ICSE '08*.

+### 6. PR Tracking Overhead: Empirical Organizational Studies
+
+Models the cost of managing and triaging the open PR backlog. This captures planning and coordination overhead, **excluding actual code review time** (counted separately in future review costs). Based on research showing that developers spend significant time on PR discovery, triage, and project-management activities beyond reviewing code.
+
+**Formula**:
+```
+tracking_hours_per_day = openPRs × log₂(activeContributors + 1) × 0.005
+```
+
+**Components**:
+- **Linear with PR count**: More open PRs require more organizational scanning/triage overhead
+- **Logarithmic with team size**: Larger teams develop specialization, tooling, and distributed ownership that reduce per-capita burden
+- **Constant (0.005)**: Calibrated to ~20 seconds per PR per day of planning/coordination time, excluding actual review
+
+**Validation Examples**:
+- 20 PRs, 5 contributors: ~15 min/day total (~3 min/person/day)
+- 200 PRs, 50 contributors: ~6 hours/day total (~7 min/person/day)
+- 1000 PRs, 100 contributors: ~33 hours/day total (~20 min/person/day)
+
+**Activities Captured** (non-review overhead only):
+- Sprint/milestone planning discussions about open PRs
+- Daily standup mentions and status coordination
+- Searching for duplicate work before starting new PRs
+- Identifying related PRs that may conflict or depend on each other
+- Quarterly/monthly mass triage of stale PRs
+- Project/product management tracking of feature delivery
+- Estimating and re-prioritizing work based on the open PR backlog
+
+**References**:
+- Bacchelli, A., & Bird, C. (2013). Expectations, Outcomes, and Challenges of Modern Code Review. *ICSE '13*.
+- Rigby, P. C., & Bird, C. (2013). Convergent Contemporary Software Peer Review Practices. *FSE '13*.
+- Uwano, H., et al. (2006). Analyzing Individual Performance of Source Code Review Using Reviewers' Eye Movement. *ETRA '06*.
+
 ## Model Limitations

 **Individual Estimates**: High variance (CV > 1.0) due to developer and task heterogeneity.
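
The formula and validation numbers above can be reproduced with a few lines of Go. This is a minimal standalone sketch, not part of the package; the function name `trackingHoursPerDay` and the hard-coded scenarios are illustrative only.

```go
package main

import (
	"fmt"
	"math"
)

// trackingHoursPerDay mirrors the README formula:
// openPRs × log₂(activeContributors + 1) × 0.005
func trackingHoursPerDay(openPRs, activeContributors int) float64 {
	if openPRs <= 0 || activeContributors <= 0 {
		return 0
	}
	return float64(openPRs) * math.Log2(float64(activeContributors)+1) * 0.005
}

func main() {
	cases := []struct{ prs, contributors int }{
		{20, 5},     // ~0.26 h/day total, ~3 min/person/day
		{200, 50},   // ~5.7 h/day total, ~7 min/person/day
		{1000, 100}, // ~33 h/day total, ~20 min/person/day
	}
	for _, c := range cases {
		hours := trackingHoursPerDay(c.prs, c.contributors)
		perPersonMin := hours * 60 / float64(c.contributors)
		fmt.Printf("%4d PRs, %3d contributors: %5.1f h/day total, %4.1f min/person/day\n",
			c.prs, c.contributors, hours, perPersonMin)
	}
}
```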

pkg/cost/extrapolate.go

Lines changed: 17 additions & 4 deletions
@@ -1,6 +1,9 @@
 package cost

-import "log/slog"
+import (
+	"log/slog"
+	"math"
+)

 // PRMergeStatus represents merge status information for a PR (for calculating merge rate).
 type PRMergeStatus struct {
@@ -322,11 +325,21 @@ func ExtrapolateFromSamples(breakdowns []Breakdown, totalPRs, totalAuthors, actu
 	extCodeChurnCost := sumCodeChurnCost / samples * multiplier
 	extAutomatedUpdatesCost := sumAutomatedUpdatesCost / samples * multiplier
 	// Calculate Open PR Tracking cost based on actual open PRs (not from samples)
-	// Formula: actualOpenPRs × uniqueUsers × (tracking_minutes_per_day_per_person / 60) × daysInPeriod × hourlyRate
-	// This scales with team size: larger teams spend more total time tracking open PRs
+	// Formula: openPRs × log2(activeContributors + 1) × 0.005 × daysInPeriod × hourlyRate
+	// This represents planning/coordination overhead ONLY (excludes actual code review)
+	// - Linear with PR count: more PRs = more planning/triage overhead
+	// - Logarithmic with team size: larger teams have specialization/better processes
+	// - Constant 0.005: calibrated to ~20 seconds per PR per day of planning/coordination time
+	//   (excludes actual review time, which is counted separately in FutureReviewCost)
 	hourlyRate := cfg.AnnualSalary * cfg.BenefitsMultiplier / cfg.HoursPerYear
 	uniqueUserCount := len(uniqueNonBotUsers)
-	extPRTrackingHours := float64(actualOpenPRs) * float64(uniqueUserCount) * (cfg.PRTrackingMinutesPerDay / 60.0) * float64(daysInPeriod)
+	var extPRTrackingHours float64
+	if uniqueUserCount > 0 && actualOpenPRs > 0 {
+		// log2(n+1) to handle log(0) and provide smooth scaling
+		teamScaleFactor := math.Log2(float64(uniqueUserCount) + 1)
+		// 0.005 hours = 0.30 minutes per PR per day (organizational average for planning/coordination only)
+		extPRTrackingHours = float64(actualOpenPRs) * teamScaleFactor * 0.005 * float64(daysInPeriod)
+	}
 	extPRTrackingCost := extPRTrackingHours * hourlyRate
 	extFutureReviewCost := sumFutureReviewCost / samples * multiplier
 	extFutureMergeCost := sumFutureMergeCost / samples * multiplier
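
For a sense of how the new path turns tracking hours into a dollar figure over an analysis window, here is a standalone sketch that mirrors the logic above. The helper name and the concrete salary, benefits-multiplier, and hours-per-year values are assumptions for illustration; in the package they come from `cfg`.

```go
package main

import (
	"fmt"
	"math"
)

// prTrackingCost mirrors the extrapolation path added above: planning/coordination
// hours per day, multiplied over the analysis window and priced at the loaded
// hourly rate. It is a sketch, not the package's API.
func prTrackingCost(openPRs, contributors, daysInPeriod int, annualSalary, benefitsMultiplier, hoursPerYear float64) float64 {
	var hours float64
	if openPRs > 0 && contributors > 0 {
		teamScaleFactor := math.Log2(float64(contributors) + 1) // log2(n+1) avoids log(0)
		hours = float64(openPRs) * teamScaleFactor * 0.005 * float64(daysInPeriod)
	}
	hourlyRate := annualSalary * benefitsMultiplier / hoursPerYear
	return hours * hourlyRate
}

func main() {
	// Assumed inputs: 200 open PRs, 50 contributors, a 30-day window,
	// $180k salary, 1.3x benefits load, 2080 working hours per year.
	fmt.Printf("estimated tracking cost over 30 days: $%.0f\n",
		prTrackingCost(200, 50, 30, 180_000, 1.3, 2080))
}
```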

pkg/github/fetch.go

Lines changed: 19 additions & 62 deletions
@@ -39,7 +39,7 @@ func PRDataFromPRX(prData *prx.PullRequestData) cost.PRData {
 	// Fallback bot detection: if prx didn't mark it as a bot, check common bot names
 	authorBot := pr.AuthorBot
 	if !authorBot {
-		authorBot = isCommonBot(pr.Author)
+		authorBot = IsBot(pr.Author)
 		if authorBot {
 			slog.Info("Bot detected by name pattern (prx missed it)",
 				"author", pr.Author,
@@ -71,45 +71,6 @@ func PRDataFromPRX(prData *prx.PullRequestData) cost.PRData {
 	return data
 }

-// isCommonBot checks if a username matches common bot patterns.
-// This is a fallback in case prx doesn't correctly mark the AuthorBot field.
-func isCommonBot(username string) bool {
-	lowerName := strings.ToLower(username)
-
-	// Common bot account names
-	botPatterns := []string{
-		"dependabot",
-		"renovate",
-		"github-actions",
-		"codecov",
-		"greenkeeper",
-		"snyk-bot",
-		"allcontributors",
-		"imgbot",
-		"stalebot",
-		"mergify",
-		"netlify",
-		"vercel",
-		"codefactor-io",
-		"deepsource-autofix",
-		"pre-commit-ci",
-		"ready-to-review",
-	}
-
-	for _, pattern := range botPatterns {
-		if strings.Contains(lowerName, pattern) {
-			return true
-		}
-	}
-
-	// Check for [bot] suffix
-	if strings.HasSuffix(lowerName, "[bot]") {
-		return true
-	}
-
-	return false
-}
-
 // FetchPRData retrieves pull request information from GitHub and converts it
 // to the format needed for cost calculation.
 //
@@ -137,7 +98,7 @@ func FetchPRData(ctx context.Context, prURL string, token string, updatedAt time
 	slog.Debug("Parsed PR URL", "owner", owner, "repo", repo, "number", number)

 	// Get cache directory from user's cache directory
-	cacheDir, err := getCacheDir()
+	userCacheDir, err := os.UserCacheDir()
 	if err != nil {
 		slog.Warn("Failed to get cache directory, using non-cached client", "error", err)
 		// Fallback to non-cached client
@@ -151,6 +112,20 @@ func FetchPRData(ctx context.Context, prURL string, token string, updatedAt time
 		return result, nil
 	}

+	cacheDir := filepath.Join(userCacheDir, "prcost")
+	if err := os.MkdirAll(cacheDir, 0o700); err != nil {
+		slog.Warn("Failed to create cache directory, using non-cached client", "error", err)
+		// Fallback to non-cached client
+		client := prx.NewClient(token)
+		prData, err := client.PullRequest(ctx, owner, repo, number)
+		if err != nil {
+			slog.Error("GitHub API call failed", "owner", owner, "repo", repo, "pr", number, "error", err)
+			return cost.PRData{}, fmt.Errorf("failed to fetch PR data: %w", err)
+		}
+		result := PRDataFromPRX(prData)
+		return result, nil
+	}
+
 	// Create prx cache client for disk-based caching
 	client, err := prx.NewCacheClient(token, cacheDir)
 	if err != nil {
@@ -229,9 +204,9 @@ func extractParticipantEvents(events []prx.Event) []cost.ParticipantEvent {
 		}

 		// Skip bots: check both prx's Bot field and common bot patterns
-		isBot := event.Bot || event.Actor == "github" || isCommonBot(event.Actor)
-		if isBot {
-			if !event.Bot && isCommonBot(event.Actor) {
+		isBotEvent := event.Bot || event.Actor == "github" || IsBot(event.Actor)
+		if isBotEvent {
+			if !event.Bot && IsBot(event.Actor) {
 				slog.Debug("Bot event detected by name pattern (prx missed it)",
 					"actor", event.Actor,
 					"kind", event.Kind,
@@ -250,21 +225,3 @@ func extractParticipantEvents(events []prx.Event) []cost.ParticipantEvent {

 	return participantEvents
 }
-
-// getCacheDir returns the cache directory for prx client.
-// Uses OS-specific user cache directory with prcost subdirectory.
-func getCacheDir() (string, error) {
-	userCacheDir, err := os.UserCacheDir()
-	if err != nil {
-		return "", fmt.Errorf("get user cache dir: %w", err)
-	}
-
-	cacheDir := filepath.Join(userCacheDir, "prcost")
-
-	// Ensure cache directory exists
-	if err := os.MkdirAll(cacheDir, 0o700); err != nil {
-		return "", fmt.Errorf("create cache dir: %w", err)
-	}
-
-	return cacheDir, nil
-}
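
The calls above now go to an exported `IsBot`, whose definition is not included in this diff. A sketch that would preserve the removed `isCommonBot` behavior (case-insensitive substring match against known bot names, plus a `[bot]` suffix check) is shown below; it is an assumption, not the committed implementation.

```go
package github

import "strings"

// Known automation accounts; mirrors the list from the removed isCommonBot.
var botPatterns = []string{
	"dependabot", "renovate", "github-actions", "codecov", "greenkeeper",
	"snyk-bot", "allcontributors", "imgbot", "stalebot", "mergify",
	"netlify", "vercel", "codefactor-io", "deepsource-autofix",
	"pre-commit-ci", "ready-to-review",
}

// IsBot reports whether a username looks like a known automation account
// (case-insensitive substring match, or a trailing "[bot]").
func IsBot(username string) bool {
	lowerName := strings.ToLower(username)
	for _, pattern := range botPatterns {
		if strings.Contains(lowerName, pattern) {
			return true
		}
	}
	return strings.HasSuffix(lowerName, "[bot]")
}
```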

pkg/github/fetch_test.go

Lines changed: 0 additions & 142 deletions
@@ -3,7 +3,6 @@ package github
 import (
 	"encoding/json"
 	"os"
-	"strings"
 	"testing"
 	"time"

@@ -277,77 +276,6 @@ func TestPRDataFromPRXWithRealData(t *testing.T) {
 	t.Logf("PR 1891: %d human events out of %d total events", len(costData.Events), len(prxData.Events))
 }

-func TestGetCacheDir(t *testing.T) {
-	dir, err := getCacheDir()
-	if err != nil {
-		t.Fatalf("getCacheDir() error = %v", err)
-	}
-	if dir == "" {
-		t.Error("getCacheDir() returned empty string")
-	}
-
-	// Should contain prcost in the path
-	if !strings.Contains(dir, "prcost") {
-		t.Errorf("getCacheDir() = %q, expected to contain 'prcost'", dir)
-	}
-}
-
-func TestIsCommonBot(t *testing.T) {
-	tests := []struct {
-		name     string
-		username string
-		want     bool
-	}{
-		{"dependabot", "dependabot[bot]", true},
-		{"renovate", "renovate-bot", true},
-		{"github-actions", "github-actions", true},
-		{"codecov", "codecov-commenter", true},
-		{"greenkeeper", "greenkeeper[bot]", true},
-		{"snyk", "snyk-bot", true},
-		{"allcontributors", "allcontributors[bot]", true},
-		{"imgbot", "ImgBot", true}, // Case insensitive
-		{"stalebot", "stalebot", true},
-		{"mergify", "mergify[bot]", true},
-		{"netlify", "netlify[bot]", true},
-		{"vercel", "vercel[bot]", true},
-		{"codefactor", "codefactor-io", true},
-		{"deepsource", "deepsource-autofix[bot]", true},
-		{"pre-commit", "pre-commit-ci[bot]", true},
-		{"regular user", "john-doe", false},
-		{"bot in middle", "robot-person", false},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := isCommonBot(tt.username)
-			if got != tt.want {
-				t.Errorf("isCommonBot(%q) = %v, want %v", tt.username, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestIsCommonBotCaseSensitivity(t *testing.T) {
-	tests := []struct {
-		name     string
-		username string
-		want     bool
-	}{
-		{"uppercase BOT", "DEPENDABOT[bot]", true},
-		{"mixed case", "DePeNdAbOt[bot]", true},
-		{"lowercase all", "dependabot[bot]", true},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := isCommonBot(tt.username)
-			if got != tt.want {
-				t.Errorf("isCommonBot(%q) = %v, want %v", tt.username, got, tt.want)
-			}
-		})
-	}
-}
-
 func TestExtractParticipantEventsEdgeCases(t *testing.T) {
 	now := time.Now()

@@ -438,73 +366,3 @@ func TestPRDataFromPRXWithRealSprinklerData(t *testing.T) {

 	t.Logf("Sprinkler PR 37: %d human events out of %d total events", len(costData.Events), len(prxData.Events))
 }
-
-func TestGetCacheDirCreatesDirectory(t *testing.T) {
-	// This test actually calls getCacheDir to improve coverage
-	dir, err := getCacheDir()
-	if err != nil {
-		t.Fatalf("getCacheDir() error = %v", err)
-	}
-	if dir == "" {
-		t.Error("getCacheDir() returned empty string")
-	}
-
-	// Verify directory was created
-	info, err := os.Stat(dir)
-	if err != nil {
-		t.Errorf("Cache directory was not created: %v", err)
-	}
-	if !info.IsDir() {
-		t.Error("Cache path is not a directory")
-	}
-}
-
-func TestIsCommonBotVariations(t *testing.T) {
-	tests := []struct {
-		username string
-		want     bool
-	}{
-		{"dependabot", true},
-		{"dependabot[bot]", true},
-		{"renovate", true},
-		{"renovate-bot", true},
-		{"github-actions", true},
-		{"github-actions[bot]", true},
-		{"codecov", true},
-		{"codecov-commenter", true},
-		{"greenkeeper", true},
-		{"greenkeeper[bot]", true},
-		{"snyk-bot", true},
-		{"allcontributors", true},
-		{"allcontributors[bot]", true},
-		{"imgbot", true},
-		{"ImgBot", true}, // case insensitive
-		{"stalebot", true},
-		{"mergify", true},
-		{"mergify[bot]", true},
-		{"netlify", true},
-		{"netlify[bot]", true},
-		{"vercel", true},
-		{"vercel[bot]", true},
-		{"codefactor-io", true},
-		{"deepsource-autofix", true},
-		{"deepsource-autofix[bot]", true},
-		{"pre-commit-ci", true},
-		{"pre-commit-ci[bot]", true},
-		{"ready-to-review", true},
-		{"ready-to-review[bot]", true},
-		{"regular-user", false},
-		{"robot", false},
-		{"botman", false},
-		{"john-doe", false},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.username, func(t *testing.T) {
-			got := isCommonBot(tt.username)
-			if got != tt.want {
-				t.Errorf("isCommonBot(%q) = %v, want %v", tt.username, got, tt.want)
-			}
-		})
-	}
-}
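
The deleted tests covered the old unexported helper; equivalent coverage for the exported `IsBot` is not visible in this diff. A hypothetical table-driven replacement, assuming `IsBot` keeps the old matching behavior, might look like:

```go
package github

import "testing"

// Hypothetical coverage for the exported IsBot, mirroring the deleted cases.
func TestIsBot(t *testing.T) {
	tests := []struct {
		username string
		want     bool
	}{
		{"dependabot[bot]", true},
		{"Renovate-Bot", true}, // matching is expected to be case-insensitive
		{"github-actions", true},
		{"some-user[bot]", true}, // generic [bot] suffix
		{"john-doe", false},
		{"robot-person", false},
	}
	for _, tt := range tests {
		t.Run(tt.username, func(t *testing.T) {
			if got := IsBot(tt.username); got != tt.want {
				t.Errorf("IsBot(%q) = %v, want %v", tt.username, got, tt.want)
			}
		})
	}
}
```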
