Skip to content

Commit ab9a8ab

Browse files
authored
Merge pull request #28 from kalverra/survey
Add Survey command
2 parents a175e15 + 33b6332 commit ab9a8ab

28 files changed

+2155
-153
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ go.work.sum
3030

3131
octometrics.log.json
3232
octometrics.monitor.json
33+
octometrics.log.jsonl
34+
octometrics.monitor.jsonl
3335
data/
3436
observe_output/
3537
test_results/

cmd/gather.go

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ import (
1515
var (
1616
githubToken string
1717
githubClient *gather.GitHubClient
18-
forceUpdate bool
19-
noObserve bool
2018
)
2119

2220
var gatherCmd = &cobra.Command{
@@ -34,10 +32,6 @@ var gatherCmd = &cobra.Command{
3432
return nil
3533
},
3634
RunE: func(_ *cobra.Command, _ []string) error {
37-
logger.Debug().
38-
Bool("force-update", forceUpdate).
39-
Msg("gather flags")
40-
4135
startTime := time.Now()
4236

4337
logger = logger.With().Str("owner", cfg.Owner).Str("repo", cfg.Repo).Logger()
@@ -53,7 +47,7 @@ var gatherCmd = &cobra.Command{
5347

5448
opts := []gather.Option{}
5549

56-
if forceUpdate {
50+
if cfg.ForceUpdate {
5751
opts = append(opts, gather.ForceUpdate())
5852
}
5953

@@ -72,7 +66,7 @@ var gatherCmd = &cobra.Command{
7266
logger.Info().Str("duration", time.Since(startTime).String()).Msg("Gathered data")
7367
fmt.Println("Gathered data")
7468

75-
if noObserve {
69+
if cfg.NoObserve {
7670
return nil
7771
}
7872

@@ -94,14 +88,16 @@ var gatherCmd = &cobra.Command{
9488

9589
func init() {
9690
gatherCmd.Flags().
97-
BoolVar(&noObserve, "no-observe", false, "Skip launching the interactive observer after gathering")
98-
gatherCmd.Flags().BoolVarP(&forceUpdate, "force-update", "u", false, "Force update of existing data")
91+
Bool("no_observe", false, "Skip launching the interactive observer after gathering data")
92+
gatherCmd.Flags().BoolP("force_update", "u", false, "Force update of existing data")
9993
gatherCmd.Flags().StringP("owner", "o", "", "Repository owner")
10094
gatherCmd.Flags().StringP("repo", "r", "", "Repository name")
101-
gatherCmd.Flags().StringP("commit-sha", "c", "", "Commit SHA")
95+
gatherCmd.Flags().StringP("commit_sha", "c", "", "Commit SHA")
10296
gatherCmd.Flags().Int64P("workflow_run_id", "w", 0, "Workflow run ID")
10397
gatherCmd.Flags().IntP("pull_request_number", "p", 0, "Pull request number")
10498
gatherCmd.Flags().StringP("github_token", "t", "", "GitHub API token (env: GITHUB_TOKEN)")
99+
gatherCmd.Flags().
100+
Bool("gather_cost", false, "Gather cost data for workflow runs (can significantly increase runtime)")
105101

106102
rootCmd.AddCommand(gatherCmd)
107103
}

cmd/monitor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,5 +58,5 @@ func init() {
5858
monitorCmd.Flags().DurationVarP(&duration, "duration", "d", 0, "Duration to monitor, defaults to indefinite")
5959
monitorCmd.Flags().DurationVarP(&interval, "interval", "i", 1*time.Second, "At what interval to observe metrics")
6060
monitorCmd.Flags().
61-
StringVarP(&outputFile, "output-file", "o", "octometrics.monitor.json", "Output file for the monitor data")
61+
StringVarP(&outputFile, "output-file", "o", monitor.DataFile, "Output file for the monitor data")
6262
}

cmd/observe.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,20 +22,6 @@ var observeCmd = &cobra.Command{
2222
return os.RemoveAll(observe.OutputDir)
2323
},
2424
RunE: func(_ *cobra.Command, _ []string) error {
25-
// if workflowRunID != 0 {
26-
// err := observe.WorkflowRun(githubClient, owner, repo, workflowRunID, outputTypes)
27-
// if err != nil {
28-
// return fmt.Errorf("failed to observe workflow run: %w", err)
29-
// }
30-
// }
31-
32-
// if pullRequestID != 0 {
33-
// err := observe.PullRequest(githubClient, owner, repo, pullRequestID, outputTypes)
34-
// if err != nil {
35-
// return fmt.Errorf("failed to observe pull request: %w", err)
36-
// }
37-
// }
38-
3925
return observe.Interactive(logger, githubClient, "")
4026
},
4127
}

cmd/octometrics.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ import (
1616
)
1717

1818
const (
19-
logFileName = "octometrics.log.json"
19+
logFileName = "octometrics.log.jsonl"
2020
)
2121

2222
var (
@@ -62,6 +62,12 @@ Octometrics aims to help you easily visualize what your workflows look like, hel
6262
if err != nil {
6363
return fmt.Errorf("failed to setup logging: %w", err)
6464
}
65+
66+
if cfg.GitHubToken == "" {
67+
logger.Warn().Msg("GitHub token not provided, will likely hit rate limits quickly")
68+
fmt.Println("WARNING: GitHub token not provided, will likely hit rate limits quickly")
69+
}
70+
6571
return nil
6672
},
6773
}

cmd/survey.go

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package cmd
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"time"
7+
8+
"github.com/spf13/cobra"
9+
10+
"github.com/kalverra/octometrics/gather"
11+
"github.com/kalverra/octometrics/internal/config"
12+
"github.com/kalverra/octometrics/observe"
13+
)
14+
15+
var surveyCmd = &cobra.Command{
16+
Use: "survey",
17+
Short: "Survey CI suite run times across a time period and identify p50/p75/p95 runs",
18+
Long: `Survey lists all completed workflow runs for a repository in a given time period,
19+
groups them by commit to compute per-commit CI suite duration, and identifies the
20+
commits at the p50, p75, and p95 percentiles. Detailed data is then gathered only
21+
for those representative commits, keeping API usage minimal.`,
22+
PreRunE: func(_ *cobra.Command, _ []string) error {
23+
if err := cfg.ValidateSurvey(); err != nil {
24+
return err
25+
}
26+
var err error
27+
githubClient, err = gather.NewGitHubClient(logger, githubToken, nil)
28+
if err != nil {
29+
return fmt.Errorf("failed to create GitHub client: %w", err)
30+
}
31+
return nil
32+
},
33+
RunE: func(_ *cobra.Command, _ []string) error {
34+
startTime := time.Now()
35+
36+
logger = logger.With().
37+
Str("owner", cfg.Owner).
38+
Str("repo", cfg.Repo).
39+
Str("event", cfg.Event).
40+
Time("since", cfg.Since).
41+
Time("until", cfg.Until).
42+
Logger()
43+
44+
fmt.Printf("Surveying %s/%s workflow runs from %s to %s...\n",
45+
cfg.Owner, cfg.Repo,
46+
cfg.Since.Format("2006-01-02"), cfg.Until.Format("2006-01-02"),
47+
)
48+
49+
opts := []gather.Option{}
50+
if cfg.ForceUpdate {
51+
opts = append(opts, gather.ForceUpdate())
52+
}
53+
54+
result, err := gather.Survey(logger, githubClient, cfg)
55+
if err != nil {
56+
return fmt.Errorf("survey failed: %w", err)
57+
}
58+
59+
fmt.Printf("Analyzed %d commits from %d workflow runs\n", len(result.Commits), result.TotalRuns)
60+
61+
for _, label := range []string{"p50", "p75", "p95"} {
62+
if cs, ok := result.Percentiles[label]; ok {
63+
fmt.Printf(" %s: %s (commit %s, %d workflows)\n",
64+
label, cs.Duration.Round(time.Second), cs.SHA[:7], len(cs.WorkflowRuns),
65+
)
66+
}
67+
}
68+
69+
// Phase 2: gather detailed data for representative commits
70+
fmt.Println("\nGathering detailed data for percentile commits...")
71+
for label, cs := range result.Percentiles {
72+
fmt.Printf(" Gathering %s commit %s...\n", label, cs.SHA[:7])
73+
_, err := gather.Commit(logger, githubClient, cfg.Owner, cfg.Repo, cs.SHA, opts...)
74+
if err != nil {
75+
logger.Warn().Err(err).Str("label", label).Str("sha", cs.SHA).
76+
Msg("Failed to gather detailed data for percentile commit")
77+
fmt.Printf(" Warning: failed to gather details for %s commit %s: %v\n", label, cs.SHA, err)
78+
}
79+
}
80+
81+
logger.Info().Str("duration", time.Since(startTime).String()).Msg("Survey complete")
82+
fmt.Printf("\nSurvey complete in %s\n", time.Since(startTime).Round(time.Second))
83+
84+
if cfg.NoObserve {
85+
return nil
86+
}
87+
88+
if err := os.RemoveAll(observe.OutputDir); err != nil {
89+
return fmt.Errorf("failed to clean observe output: %w", err)
90+
}
91+
92+
surveyFile := fmt.Sprintf("/%s/%s/surveys/%s.html",
93+
cfg.Owner, cfg.Repo,
94+
gather.SurveyFileBaseName(cfg.Event, cfg.Since, cfg.Until),
95+
)
96+
return observe.Interactive(logger, githubClient, surveyFile)
97+
},
98+
}
99+
100+
func init() {
101+
surveyCmd.Flags().StringP("owner", "o", "", "Repository owner")
102+
surveyCmd.Flags().StringP("repo", "r", "", "Repository name")
103+
surveyCmd.Flags().String("event", "all", "Filter by event type (all, pull_request, merge_group, push)")
104+
surveyCmd.Flags().Time("since", config.DefaultSince, []string{"2006-01-02"}, "Start analysis date")
105+
surveyCmd.Flags().Time("until", config.DefaultUntil, []string{"2006-01-02"}, "End analysis date")
106+
surveyCmd.Flags().StringP("github_token", "t", "", "GitHub API token (env: GITHUB_TOKEN)")
107+
surveyCmd.Flags().BoolP("force_update", "u", false, "Force update of existing data")
108+
surveyCmd.Flags().Bool("no_observe", false, "Skip launching the interactive observer after survey")
109+
110+
rootCmd.AddCommand(surveyCmd)
111+
}

design.md

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Octometrics Design
2+
3+
Octometrics is a Go CLI tool that gathers detailed GitHub Actions workflow runtime data via the GitHub REST and GraphQL APIs, stores it locally as JSON, and visualizes it as interactive Gantt-style timelines in the browser. It supports per-commit, per-PR, and aggregate percentile views of CI suite performance.
4+
5+
## Command Flow
6+
7+
```mermaid
8+
flowchart TD
9+
subgraph commands [CLI Commands]
10+
Gather[gather]
11+
Survey[survey]
12+
Observe[observe]
13+
Monitor[monitor]
14+
end
15+
16+
subgraph gatherPkg [gather package]
17+
GatherCommit["Commit()"]
18+
GatherPR["PullRequest()"]
19+
GatherWF["WorkflowRun()"]
20+
GatherSurvey["Survey()"]
21+
end
22+
23+
subgraph observePkg [observe package]
24+
ObsCommit["Commit()"]
25+
ObsPR["PullRequest()"]
26+
ObsWF["WorkflowRun()"]
27+
ObsSurvey["SurveyFromFile()"]
28+
Interactive["Interactive()"]
29+
end
30+
31+
Gather --> GatherCommit
32+
Gather --> GatherPR
33+
Gather --> GatherWF
34+
Survey --> GatherSurvey
35+
Survey --> GatherCommit
36+
37+
Observe --> Interactive
38+
Interactive --> ObsCommit
39+
Interactive --> ObsPR
40+
Interactive --> ObsWF
41+
Interactive --> ObsSurvey
42+
43+
GatherCommit --> JSON[(data/owner/repo/*.json)]
44+
GatherPR --> JSON
45+
GatherWF --> JSON
46+
GatherSurvey --> JSON
47+
48+
ObsCommit --> HTML[(observe_output/html/)]
49+
ObsPR --> HTML
50+
ObsWF --> HTML
51+
ObsSurvey --> HTML
52+
53+
HTML --> Browser[Browser :8080]
54+
```
55+
56+
## Survey Two-Phase Architecture
57+
58+
The `survey` command efficiently identifies p50/p75/p95 CI suite runs without exhausting GitHub API rate limits. It uses a two-phase approach: a lightweight listing phase, then targeted detail gathering.
59+
60+
```mermaid
61+
flowchart LR
62+
subgraph phase1 [Phase 1: Survey]
63+
A["ListRepositoryWorkflowRuns\n~10-50 API calls"] --> B[Group by HeadSHA]
64+
B --> C[Compute per-commit duration]
65+
C --> D[Sort and find p50/p75/p95]
66+
end
67+
68+
subgraph phase2 [Phase 2: Detail]
69+
D --> E["Commit(p50) ~30 calls"]
70+
D --> F["Commit(p75) ~30 calls"]
71+
D --> G["Commit(p95) ~30 calls"]
72+
end
73+
74+
subgraph render [Visualize]
75+
E --> H[Survey HTML with Gantt charts]
76+
F --> H
77+
G --> H
78+
end
79+
```
80+
81+
## Key Design Decisions
82+
83+
- **Local JSON cache**: All gathered data is stored as JSON in `data/` and re-read on subsequent runs, avoiding redundant API calls. `ForceUpdate` (enabled with the `--force_update` / `-u` flag) bypasses the cache.
84+
- **Rate limit awareness**: The REST client uses `go-github-ratelimit` to automatically sleep when rate-limited. Survey's two-phase design reduces total API calls from O(commits × workflows × jobs) to O(listing_pages + 3 × detail_calls), where 3 is the number of representative commits gathered in detail (one each for p50, p75, and p95).
85+
- **Real representative commits for percentiles**: Rather than constructing synthetic "average" timelines, the survey picks actual commits whose CI duration falls at each percentile. This shows real job distributions and integrates with the existing Gantt visualization.
86+
- **Mermaid Gantt for timelines**: Workflow/job/step timing is rendered as Mermaid Gantt charts, giving a visual representation of parallelism and duration without requiring a charting library.
87+
- **Plotly for monitoring data**: CPU, memory, disk, and I/O metrics from optional `octometrics monitor` instrumentation are rendered using Plotly.js.

gather/commit.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
package gather
44

55
import (
6-
"context"
76
"encoding/json"
87
"fmt"
98
"net/http"
@@ -200,7 +199,7 @@ func Commit(
200199
return nil, fmt.Errorf("GitHub client is nil")
201200
}
202201

203-
ctx, cancel := context.WithTimeoutCause(ghCtx, timeoutDur, errGitHubTimeout)
202+
ctx, cancel := ghCtx()
204203
commit, resp, err := client.Rest.Repositories.GetCommit(ctx, owner, repo, sha, nil)
205204
cancel()
206205
if err != nil {
@@ -254,7 +253,7 @@ func checkRunsForCommit(
254253

255254
for {
256255
var checkRuns *github.ListCheckRunsResults
257-
ctx, cancel := context.WithTimeoutCause(ghCtx, timeoutDur, errGitHubTimeout)
256+
ctx, cancel := ghCtx()
258257
checkRuns, resp, err = client.Rest.Checks.ListCheckRunsForRef(ctx, owner, repo, sha, listOpts)
259258
cancel()
260259
if err != nil {

0 commit comments

Comments
 (0)