Skip to content

Commit 5a7db19

Browse files
issue-138 - bug: Program Hangs at Launch
1 parent 7578f85 commit 5a7db19

File tree

5 files changed

+57
-34
lines changed

5 files changed

+57
-34
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ Following the tradition of Unix/Linux `top` tools, `ktop` is a tool that display
2424

2525
## Installation
2626

27-
### kubectl plugin (recommended)
27+
### kubectl plugin
2828

2929
```bash
3030
kubectl krew install ktop

cmd/ktop.go

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -100,18 +100,19 @@ func NewKtopCmd() *cobra.Command {
100100
}
101101

102102
// tryPrometheus attempts to create, start, and verify a prometheus metrics source.
103-
// It performs a test scrape to verify connectivity before returning.
103+
// It performs a connectivity test FIRST before starting the expensive collection.
104104
func tryPrometheus(ctx context.Context, restConfig *rest.Config, cfg *promMetrics.PromConfig) (*promMetrics.PromMetricsSource, error) {
105105
source, err := promMetrics.NewPromMetricsSource(restConfig, cfg)
106106
if err != nil {
107107
return nil, err
108108
}
109-
if err := source.Start(ctx); err != nil {
110-
source.Stop()
109+
// TEST FIRST - quick connectivity check before starting expensive collection
110+
// This prevents hanging on Start() if the cluster is unreachable
111+
if err := source.TestConnection(ctx); err != nil {
111112
return nil, err
112113
}
113-
// Verify connectivity with a test scrape
114-
if err := source.TestConnection(ctx); err != nil {
114+
// THEN start collection (now non-blocking)
115+
if err := source.Start(ctx); err != nil {
115116
source.Stop()
116117
return nil, err
117118
}

docs/overrides/home.html

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,9 @@
2525
}
2626

2727
.tx-hero__logo {
28-
height: 6rem;
29-
margin-bottom: 1rem;
28+
width: 200px;
29+
height: auto;
30+
margin-bottom: 0.5rem;
3031
}
3132

3233
.tx-hero__content {

prom/controller.go

Lines changed: 10 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -65,17 +65,11 @@ func (cc *CollectorController) Stop() error {
6565
// TestScrape performs a quick test to verify connectivity to prometheus endpoints.
6666
// It makes a direct API call to test RBAC permissions for nodes/proxy.
6767
// Returns nil if the metrics endpoints are accessible.
68+
// This can be called before Start() to verify connectivity before starting collection.
6869
func (cc *CollectorController) TestScrape(ctx context.Context) error {
69-
cc.mutex.RLock()
70-
if !cc.running {
71-
cc.mutex.RUnlock()
72-
return fmt.Errorf("controller is not running")
73-
}
74-
kubeConfig := cc.kubeConfig
75-
cc.mutex.RUnlock()
76-
7770
// Create a quick client to test connectivity
78-
clientset, err := kubernetes.NewForConfig(kubeConfig)
71+
// Note: kubeConfig is set at construction time, no lock needed
72+
clientset, err := kubernetes.NewForConfig(cc.kubeConfig)
7973
if err != nil {
8074
return fmt.Errorf("creating test client: %w", err)
8175
}
@@ -128,8 +122,13 @@ func (cc *CollectorController) runCollector(ctx context.Context) {
128122
return
129123
}
130124

131-
// Run immediate first collection (don't wait for ticker)
132-
cc.collectFromAllComponents(ctx)
125+
// Run first collection NON-BLOCKING with timeout
126+
// This prevents startup hangs - UI will show loading state while metrics populate
127+
go func() {
128+
firstCtx, cancel := context.WithTimeout(ctx, cc.config.Timeout)
129+
defer cancel()
130+
cc.collectFromAllComponents(firstCtx)
131+
}()
133132

134133
ticker := time.NewTicker(cc.config.Interval)
135134
defer ticker.Stop()

prom/scraper.go

Lines changed: 37 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -97,41 +97,58 @@ func (ks *KubernetesScraper) ScrapeComponent(ctx context.Context, component Comp
9797
return ks.scrapeTarget(ctx, target)
9898
}
9999

100-
// scrapeAllTargets scrapes all targets and merges results into a single ScrapedMetrics
100+
// scrapeAllTargets scrapes all targets IN PARALLEL and merges results into a single ScrapedMetrics
101101
// This is used for node-based components where we need metrics from all nodes
102102
func (ks *KubernetesScraper) scrapeAllTargets(ctx context.Context, targets []*ScrapeTarget) (*ScrapedMetrics, error) {
103103
if len(targets) == 0 {
104104
return nil, fmt.Errorf("no targets to scrape")
105105
}
106106

107-
// Merged result
107+
// Result type for collecting scrape results from goroutines
108+
type scrapeResult struct {
109+
target *ScrapeTarget
110+
metrics *ScrapedMetrics
111+
err error
112+
}
113+
114+
startTime := time.Now()
115+
results := make(chan scrapeResult, len(targets))
116+
117+
// Scrape all targets in parallel
118+
for _, target := range targets {
119+
go func(t *ScrapeTarget) {
120+
metrics, err := ks.scrapeTarget(ctx, t)
121+
results <- scrapeResult{target: t, metrics: metrics, err: err}
122+
}(target)
123+
}
124+
125+
// Collect results and merge families
108126
mergedFamilies := make(map[string]*MetricFamily)
109127
var firstEndpoint string
110128
var totalDuration time.Duration
111-
startTime := time.Now()
112129
var lastErr error
113130

114-
for _, target := range targets {
115-
metrics, err := ks.scrapeTarget(ctx, target)
116-
if err != nil {
117-
lastErr = err
131+
for i := 0; i < len(targets); i++ {
132+
result := <-results
133+
if result.err != nil {
134+
lastErr = result.err
118135
continue // Skip failed targets but continue with others
119136
}
120137

121138
if firstEndpoint == "" {
122-
firstEndpoint = metrics.Endpoint
139+
firstEndpoint = result.metrics.Endpoint
123140
}
124-
totalDuration += metrics.ScrapeDuration
141+
totalDuration += result.metrics.ScrapeDuration
125142

126143
// Merge families, adding node label to each time series
127-
for name, family := range metrics.Families {
144+
for name, family := range result.metrics.Families {
128145
// Add node label to each time series in this family
129146
for _, ts := range family.TimeSeries {
130147
// Add node label if this is a node-based target
131-
if target.NodeName != "" {
148+
if result.target.NodeName != "" {
132149
ts.Labels = append(ts.Labels, labels.Label{
133150
Name: "node",
134-
Value: target.NodeName,
151+
Value: result.target.NodeName,
135152
})
136153
}
137154
}
@@ -340,17 +357,22 @@ func (ks *KubernetesScraper) scrapeComponentPeriodically(ctx context.Context, co
340357

341358
// scrapeTarget scrapes metrics from a single target using RESTClient
342359
func (ks *KubernetesScraper) scrapeTarget(ctx context.Context, target *ScrapeTarget) (*ScrapedMetrics, error) {
360+
// Add per-request timeout to prevent indefinite blocking on slow/unresponsive nodes
361+
reqCtx, cancel := context.WithTimeout(ctx, ks.config.Timeout)
362+
defer cancel()
363+
343364
startTime := time.Now()
344365

345366
var result rest.Result
346367
var endpoint string
347368

348369
// Build the appropriate RESTClient request based on target type
370+
// Use reqCtx (with timeout) for all requests to prevent indefinite blocking
349371
switch target.Component {
350372
case ComponentAPIServer:
351373
// API server metrics via direct path
352374
endpoint = "/metrics"
353-
result = ks.restClient.Get().AbsPath("/metrics").Do(ctx)
375+
result = ks.restClient.Get().AbsPath("/metrics").Do(reqCtx)
354376

355377
case ComponentKubelet, ComponentCAdvisor:
356378
// Node-based components via node proxy
@@ -360,7 +382,7 @@ func (ks *KubernetesScraper) scrapeTarget(ctx context.Context, target *ScrapeTar
360382
Name(target.NodeName).
361383
SubResource("proxy").
362384
Suffix(target.Path).
363-
Do(ctx)
385+
Do(reqCtx)
364386

365387
case ComponentEtcd, ComponentScheduler, ComponentControllerManager, ComponentKubeProxy:
366388
// Pod-based components via pod proxy
@@ -372,7 +394,7 @@ func (ks *KubernetesScraper) scrapeTarget(ctx context.Context, target *ScrapeTar
372394
Name(podNameWithPort).
373395
SubResource("proxy").
374396
Suffix(target.Path).
375-
Do(ctx)
397+
Do(reqCtx)
376398

377399
default:
378400
return nil, fmt.Errorf("unsupported component type: %s", target.Component)

0 commit comments

Comments
 (0)