Skip to content

Commit 8565552

Browse files
Merge pull request #1034 from jhadvig/OCPBUGS_telemetry
OCPBUGS-58094: Consolidate telemetry values oscilation
2 parents 31d58bd + 5885c0e commit 8565552

File tree

2 files changed

+152
-7
lines changed

2 files changed

+152
-7
lines changed

pkg/console/telemetry/telemetry.go

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"io"
77
"net/http"
88
"net/url"
9+
"sync"
910
"time"
1011

1112
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
@@ -24,6 +25,17 @@ const (
2425
TelemetryAnnotationPrefix = "telemetry.console.openshift.io/"
2526
TelemeterClientDeploymentNamespace = "openshift-monitoring"
2627
PullSecretName = "pull-secret"
28+
// FetchInterval defines how often we can fetch from OCM after the last successful fetch
29+
FetchInterval = 24 * time.Hour
30+
// FailureBackoffInterval defines how often we retry after a failed or missing-data attempt
31+
FailureBackoffInterval = 5 * time.Minute
32+
)
33+
34+
var (
35+
// Global timestamps for rate limiting
36+
lastAttemptTime time.Time
37+
lastSuccessTime time.Time
38+
fetchMutex sync.RWMutex
2739
)
2840

2941
func IsTelemeterClientAvailable(deploymentLister appsv1listers.DeploymentLister) (bool, error) {
@@ -93,24 +105,79 @@ func GetOrganizationMeta(telemetryConfig map[string]string, cachedOrganizationID
93105
return customOrganizationID, customAccountMail, false
94106
}
95107

108+
// If both cached values are available, prefer them without fetching
96109
if cachedOrganizationID != "" && cachedAccountEmail != "" {
97110
klog.V(4).Infoln("telemetry config: using cached organization metadata")
98111
return cachedOrganizationID, cachedAccountEmail, false
99112
}
100113

101-
fetchedOCMRespose, err := FetchSubscription(clusterID, accessToken)
114+
// Attempt rate-limited fetch of subscription
115+
// We need to do this bacause in some cases the organization ID and account mail are not
116+
// not available in the telemetry configmap, so we need to fetch it from OCM. But event there
117+
// one of the value might not be available, so we need to check periodically.
118+
fetchedOCMRespose, fetched, err := getSubscriptionWithRateLimit(clusterID, accessToken)
102119
if err != nil {
103120
klog.Errorf("telemetry config error: %s", err)
104-
return "", "", false // Ensure safe return in case of error
105121
}
122+
// If not fetched or error, proceed with cached values without clearing
123+
124+
// Merge per-field: only overwrite when the fetched value is non-empty
125+
resolvedOrgID := cachedOrganizationID
126+
if fetched && fetchedOCMRespose != nil && fetchedOCMRespose.Organization.ExternalId != "" {
127+
resolvedOrgID = fetchedOCMRespose.Organization.ExternalId
128+
}
129+
130+
resolvedAccountMail := cachedAccountEmail
131+
if fetched && fetchedOCMRespose != nil && fetchedOCMRespose.Creator.Email != "" {
132+
resolvedAccountMail = fetchedOCMRespose.Creator.Email
133+
}
134+
135+
refresh := resolvedOrgID != cachedOrganizationID || resolvedAccountMail != cachedAccountEmail
136+
return resolvedOrgID, resolvedAccountMail, refresh
137+
}
138+
139+
// getSubscriptionWithRateLimit applies rate limiting using last attempt and last success timestamps
140+
// and returns a subscription only if a fetch was performed and succeeded.
141+
// fetched indicates whether a fetch was attempted and succeeded.
142+
func getSubscriptionWithRateLimit(clusterID, accessToken string) (*Subscription, bool, error) {
143+
// Check freshness windows
144+
fetchMutex.RLock()
145+
successFresh := !lastSuccessTime.IsZero() && time.Since(lastSuccessTime) < FetchInterval
146+
attemptFresh := !lastAttemptTime.IsZero() && time.Since(lastAttemptTime) < FailureBackoffInterval
147+
fetchMutex.RUnlock()
148+
149+
if successFresh || attemptFresh {
150+
return nil, false, nil
151+
}
152+
153+
// Mark attempt time, guarding against races
154+
fetchMutex.Lock()
155+
if !lastSuccessTime.IsZero() && time.Since(lastSuccessTime) < FetchInterval {
156+
fetchMutex.Unlock()
157+
return nil, false, nil
158+
}
159+
if !lastAttemptTime.IsZero() && time.Since(lastAttemptTime) < FailureBackoffInterval {
160+
fetchMutex.Unlock()
161+
return nil, false, nil
162+
}
163+
lastAttemptTime = time.Now()
164+
fetchMutex.Unlock()
106165

107-
// Check if the fetched response is nil before accessing fields
108-
if fetchedOCMRespose == nil {
109-
klog.Errorf("telemetry config error: FetchSubscription returned nil response")
110-
return "", "", false
166+
// Perform fetch
167+
subscription, err := FetchSubscription(clusterID, accessToken)
168+
if err != nil {
169+
return nil, false, err
111170
}
171+
if subscription == nil {
172+
return nil, false, fmt.Errorf("nil subscription response")
173+
}
174+
175+
// Mark success time
176+
fetchMutex.Lock()
177+
lastSuccessTime = time.Now()
178+
fetchMutex.Unlock()
112179

113-
return fetchedOCMRespose.Organization.ExternalId, fetchedOCMRespose.Creator.Email, true
180+
return subscription, true, nil
114181
}
115182

116183
// Needed to create our own types for OCM Subscriptions since their types and client are useless
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
package telemetry
2+
3+
import (
4+
"testing"
5+
"time"
6+
)
7+
8+
// Helpers to set rate-limit timestamps and restore after test
9+
func withLastAttemptTime(t *testing.T, ts time.Time) {
10+
prev := lastAttemptTime
11+
lastAttemptTime = ts
12+
t.Cleanup(func() { lastAttemptTime = prev })
13+
}
14+
15+
func withLastSuccessTime(t *testing.T, ts time.Time) {
16+
prev := lastSuccessTime
17+
lastSuccessTime = ts
18+
t.Cleanup(func() { lastSuccessTime = prev })
19+
}
20+
21+
func TestGetOrganizationMeta_UsesCustomOverrides(t *testing.T) {
22+
telemetryConfig := map[string]string{
23+
"ORGANIZATION_ID": "org-custom",
24+
"ACCOUNT_MAIL": "[email protected]",
25+
}
26+
orgID, account, refresh := GetOrganizationMeta(telemetryConfig, "", "", "cluster-id", "token")
27+
28+
if orgID != "org-custom" || account != "[email protected]" {
29+
t.Fatalf("expected custom values, got orgID=%q account=%q", orgID, account)
30+
}
31+
if refresh {
32+
t.Fatalf("expected refresh=false when using custom overrides")
33+
}
34+
}
35+
36+
func TestGetOrganizationMeta_UsesCachedWhenAvailable(t *testing.T) {
37+
telemetryConfig := map[string]string{}
38+
orgID, account, refresh := GetOrganizationMeta(telemetryConfig, "org-cached", "[email protected]", "cluster-id", "token")
39+
40+
if orgID != "org-cached" || account != "[email protected]" {
41+
t.Fatalf("expected cached values, got orgID=%q account=%q", orgID, account)
42+
}
43+
if refresh {
44+
t.Fatalf("expected refresh=false when using cached values")
45+
}
46+
}
47+
48+
func TestGetOrganizationMeta_SkipsFetch_WhenSuccessRecent_OneMissing(t *testing.T) {
49+
telemetryConfig := map[string]string{}
50+
51+
// Emulate recent successful fetch within 24h
52+
withLastSuccessTime(t, time.Now())
53+
54+
orgID, account, refresh := GetOrganizationMeta(telemetryConfig, "org-cached", "", "cluster-id", "token")
55+
56+
if orgID != "org-cached" || account != "" {
57+
t.Fatalf("expected to return cached org and empty account without fetching, got orgID=%q account=%q", orgID, account)
58+
}
59+
if refresh {
60+
t.Fatalf("expected refresh=false when fetch is rate-limited by last success")
61+
}
62+
}
63+
64+
func TestGetOrganizationMeta_SkipsFetch_WhenAttemptRecent_NoCache(t *testing.T) {
65+
telemetryConfig := map[string]string{}
66+
67+
// Emulate recent attempt within backoff window (no success)
68+
withLastAttemptTime(t, time.Now())
69+
70+
orgID, account, refresh := GetOrganizationMeta(telemetryConfig, "", "", "cluster-id", "token")
71+
72+
if orgID != "" || account != "" {
73+
t.Fatalf("expected empty values when no cache and fetch is rate-limited by last attempt, got orgID=%q account=%q", orgID, account)
74+
}
75+
if refresh {
76+
t.Fatalf("expected refresh=false when fetch is rate-limited by last attempt")
77+
}
78+
}

0 commit comments

Comments
 (0)