|
| 1 | +// Copyright 2021 The Cockroach Authors. |
| 2 | +// |
| 3 | +// Use of this software is governed by the CockroachDB Software License |
| 4 | +// included in the /LICENSE file. |
| 5 | + |
| 6 | +package admission |
| 7 | + |
| 8 | +import ( |
| 9 | + "fmt" |
| 10 | + "strconv" |
| 11 | + "strings" |
| 12 | + |
| 13 | + "github.com/cockroachdb/cockroach/pkg/util/syncutil" |
| 14 | + "github.com/olekukonko/tablewriter" |
| 15 | +) |
| 16 | + |
// resourceTier specifies the tier of a resource group, in descending levels of importance.
// That is, tier 0 is the most important. The token bucket sizes must be such that the
// non-burstable token bucket size of tier-i must be greater than the burstable token bucket
// size of tier-(i+1); see cpuTimeTokenGranter for details on this.
//
// The tier determination for a request happens at a layer outside the admission package.
//
// TODO(josh): Add docs re: tentative usage after a discussion with Sumeer.
//
// NB: Inter-tenant fair sharing only works within a tier.
//
// TODO(josh): Move this definition to admission.go. Export publicly.
type resourceTier uint8

// numResourceTiers is the number of resource tiers. It sizes the per-tier
// arrays in cpuTimeTokenGranter (requester and mu.buckets).
const numResourceTiers resourceTier = 2
| 32 | + |
// cpuTimeTokenChildGranter implements granter. It stores a resourceTier and
// proxies to cpuTimeTokenGranter. See the declaration comment for
// cpuTimeTokenGranter for more details.
//
// Each "child" granter is paired with a requester, since the requester (in practice, a
// WorkQueue for a certain resourceTier) does not need to know about the others.
// An alternative would be to make resourceTier an argument to the various granter methods,
// but this approach seems cleaner.
type cpuTimeTokenChildGranter struct {
	// tier is the resourceTier that tryGet passes to the parent granter.
	tier resourceTier
	// parent is the granter that every method call is forwarded to.
	parent *cpuTimeTokenGranter
}
| 45 | + |
| 46 | +var _ granter = &cpuTimeTokenChildGranter{} |
| 47 | + |
| 48 | +// tryGet implements granter. |
| 49 | +func (cg *cpuTimeTokenChildGranter) tryGet(qual burstQualification, count int64) bool { |
| 50 | + return cg.parent.tryGet(cg.tier, qual, count) |
| 51 | +} |
| 52 | + |
// returnGrant implements granter. It forwards count to the parent granter.
// The tier is not passed along, since the parent's returnGrant takes only
// count.
func (cg *cpuTimeTokenChildGranter) returnGrant(count int64) {
	cg.parent.returnGrant(count)
}
| 57 | + |
// tookWithoutPermission implements granter. It forwards count to the parent
// granter. The tier is not passed along, since the parent's
// tookWithoutPermission takes only count.
func (cg *cpuTimeTokenChildGranter) tookWithoutPermission(count int64) {
	cg.parent.tookWithoutPermission(count)
}
| 62 | + |
| 63 | +// continueGrantChain implements granter. |
| 64 | +func (cg *cpuTimeTokenChildGranter) continueGrantChain(grantChainID grantChainID) { |
| 65 | + // Ignore since grant chains are not used. |
| 66 | +} |
| 67 | + |
// cpuTimeTokenGranter uses token buckets to limit CPU usage. There is one
// token bucket per type of request. Requests are only admitted (tryGet
// only returns true), if the bucket for the type of request to be done has
// positive tokens. Before a request is admitted, tokens are deducted from all
// buckets, not just the bucket that was checked initially. This enables
// setting up a hierarchy of types of requests, where some types can use more
// CPU than others.
//
// For example, on an 8 vCPU machine, it might be set up like this:
//
//   - Burstable tier-0 work -> 6 seconds of CPU time per second
//   - Non-burstable tier-0 work -> 5 seconds of CPU time per second
//   - Burstable tier-1 work -> 2 seconds of CPU time per second
//   - Non-burstable tier-1 work -> 1 second of CPU time per second
//
// A request for 5s of burstable tier-0 work would be admitted immediately,
// since the burstable tier-0 bucket is positive. It would deduct from all
// four buckets, resulting in a balance of (1,0,-3,-4). Non-burstable tier-0
// work and all tier-1 work would now have to wait for their respective buckets
// to refill, while burstable tier-0 work is still admissible.
//
// The immediate purpose of this is to achieve low goroutine scheduling latencies
// even in the case of a Serverless tenant sending a large workload to a multi-host
// cluster. The above rates will be set so as to limit CPU utilization to some
// cluster-setting-configurable maximum. For example, if the target max is 80%, and
// if the machine has 8 vCPUs, then the non-burstable tier-0 work will be allowed
// 6.4 seconds of CPU time per second. In limiting CPU usage to some max, goroutine
// scheduling latency can be kept low.
//
// TODO(josh): Add docs about resourceTier after a discussion with Sumeer.
//
// Note that cpuTimeTokenGranter does not handle replenishing the buckets.
//
// For more, see the initial design sketch:
// https://docs.google.com/document/d/1-Kr2gRFTk0QV8kBs7AXRXUwFpK2ZxR1cqIwWCuOx22Q/edit?tab=t.0
// TODO(josh): Turn into a proper design document.
type cpuTimeTokenGranter struct {
	// requester holds one requester per resourceTier, indexed by tier.
	// tryGrantLocked walks it in index order, i.e. tier 0 first.
	requester [numResourceTiers]requester
	// mu bundles the mutex with the bucket state. The methods in this file
	// access buckets only while holding the mutex.
	mu struct {
		// TODO(josh): I suspect putting the mutex here is better than in
		// CPUTimeTokenGrantCoordinator, but for now the decision is tentative.
		// It will be easier to decide once there is a PR that introduces
		// CPUTimeTokenGrantCoordinator & cpuTimeTokenAdjusterNew.
		syncutil.Mutex
		// Invariant #1: For any two buckets A & B, if A has a lower ordinal resourceTier,
		// then A must have more tokens than B.
		// Invariant #2: For any two buckets A & B, if A & B have the same resourceTier,
		// and if A has a lower ordinal burstQualification, then A must have more tokens than B.
		//
		// Since admission deducts from all buckets, these invariants are true, so long as token bucket
		// replenishing respects them also. Token bucket replenishing is not yet implemented. See
		// tryGrantLocked for a situation where invariant #1 is relied on.
		//
		// buckets is indexed first by resourceTier, then by burstQualification.
		buckets [numResourceTiers][numBurstQualifications]tokenBucket
	}
}
| 123 | + |
// tokenBucket holds the token balance of a single bucket. The balance can go
// negative, because cpuTimeTokenGranter deducts from every bucket on
// admission regardless of each bucket's current balance.
//
// TODO(josh): Make this observable. See here for one approach:
// https://github.com/cockroachdb/cockroach/commit/06967f5fa72115348d57fc66fe895aec514261d5#diff-6212d039fab53dd464bd989bdbd537947b11d37a9c8fe77ca497870b49e28a9cR367
type tokenBucket struct {
	// tokens is the current balance of the bucket.
	tokens int64
}
| 129 | + |
| 130 | +func (stg *cpuTimeTokenGranter) String() string { |
| 131 | + stg.mu.Lock() |
| 132 | + defer stg.mu.Unlock() |
| 133 | + var buf strings.Builder |
| 134 | + tw := tablewriter.NewWriter(&buf) |
| 135 | + hdrs := [numBurstQualifications + 1]string{} |
| 136 | + hdrs[0] = "cpuTTG" |
| 137 | + for gk := canBurst; gk < numBurstQualifications; gk++ { |
| 138 | + hdrs[1+gk] = gk.String() |
| 139 | + } |
| 140 | + tw.SetAlignment(tablewriter.ALIGN_LEFT) |
| 141 | + tw.SetAutoFormatHeaders(false) |
| 142 | + tw.SetBorder(false) |
| 143 | + tw.SetColumnSeparator("") |
| 144 | + tw.SetHeader(hdrs[:]) |
| 145 | + tw.SetHeaderLine(false) |
| 146 | + tw.SetNoWhiteSpace(true) |
| 147 | + tw.SetTablePadding(" ") |
| 148 | + tw.SetTrimWhiteSpaceAtEOL(true) |
| 149 | + |
| 150 | + for tier := 0; tier < int(numResourceTiers); tier++ { |
| 151 | + row := [1 + numBurstQualifications]string{} |
| 152 | + row[0] = "tier" + strconv.Itoa(tier) |
| 153 | + for gk := canBurst; gk < numBurstQualifications; gk++ { |
| 154 | + row[gk+1] = fmt.Sprint(stg.mu.buckets[tier][gk].tokens) |
| 155 | + } |
| 156 | + tw.Append(row[:]) |
| 157 | + } |
| 158 | + tw.Render() |
| 159 | + return buf.String() |
| 160 | +} |
| 161 | + |
| 162 | +// tryGet is the helper for implementing granter.tryGet. |
| 163 | +func (stg *cpuTimeTokenGranter) tryGet( |
| 164 | + tier resourceTier, qual burstQualification, count int64, |
| 165 | +) bool { |
| 166 | + stg.mu.Lock() |
| 167 | + defer stg.mu.Unlock() |
| 168 | + if stg.mu.buckets[tier][qual].tokens <= 0 { |
| 169 | + return false |
| 170 | + } |
| 171 | + stg.tookWithoutPermissionLocked(count) |
| 172 | + return true |
| 173 | +} |
| 174 | + |
// returnGrant is the helper for implementing granter.returnGrant. It adds
// count back to every bucket and then tries to admit waiting requests, all
// while holding stg.mu.
func (stg *cpuTimeTokenGranter) returnGrant(count int64) {
	stg.mu.Lock()
	defer stg.mu.Unlock()
	// Deducting a negative count adds count tokens to every bucket.
	stg.tookWithoutPermissionLocked(-count)
	// count must be positive. Thus above always adds tokens to the buckets.
	// Thus returnGrant should always attempt to grant admission to waiting requests.
	stg.grantUntilNoWaitingRequestsLocked()
}
| 184 | + |
// tookWithoutPermission is the helper for implementing
// granter.tookWithoutPermission. It deducts count from every bucket while
// holding stg.mu, without checking any balance first.
func (stg *cpuTimeTokenGranter) tookWithoutPermission(count int64) {
	stg.mu.Lock()
	defer stg.mu.Unlock()
	stg.tookWithoutPermissionLocked(count)
}
| 191 | + |
| 192 | +func (stg *cpuTimeTokenGranter) tookWithoutPermissionLocked(count int64) { |
| 193 | + for tier := range stg.mu.buckets { |
| 194 | + for qual := range stg.mu.buckets[tier] { |
| 195 | + stg.mu.buckets[tier][qual].tokens -= count |
| 196 | + } |
| 197 | + } |
| 198 | +} |
| 199 | + |
| 200 | +// grantUntilNoWaitingRequestsLocked grants admission to all queued requests |
| 201 | +// that can be granted, given the current state of the token buckets, etc. |
| 202 | +// It prioritizes requesters from higher class work in the sense of resourceTier |
| 203 | +// That is, multiple waiting tier-0 requests will be granted before a single tier-1 |
| 204 | +// request. |
| 205 | +func (stg *cpuTimeTokenGranter) grantUntilNoWaitingRequestsLocked() { |
| 206 | + // TODO(josh): If there are a lot of tokens, this could hold the mutex for a long |
| 207 | + // time. We may want to drop and reacquire the mutex after every 1000 requests or so. |
| 208 | + for stg.tryGrantLocked() { |
| 209 | + } |
| 210 | +} |
| 211 | + |
| 212 | +// tryGrantLocked attempts to grant admission to a single queued request. |
| 213 | +// It prioritizes requesters from higher class work, in the sense of |
| 214 | +// resourceTier. |
| 215 | +func (stg *cpuTimeTokenGranter) tryGrantLocked() bool { |
| 216 | + for tier := range stg.requester { |
| 217 | + hasWaitingRequests, qual := stg.requester[tier].hasWaitingRequests() |
| 218 | + if !hasWaitingRequests { |
| 219 | + continue |
| 220 | + } |
| 221 | + if stg.mu.buckets[tier][qual].tokens <= 0 { |
| 222 | + // tryGrantLocked does not need to continue here, since there are |
| 223 | + // no more requests to grant. The detailed reason for this is: |
| 224 | + // |
| 225 | + // - stg.requester is ordered by resourceTier. |
| 226 | + // - Given two buckets A & B, if A is for a lower ordinal resourceTier, |
| 227 | + // more tokens will be in bucket A than bucket B (see cpuTimeTokenGranter |
| 228 | + // for more on this invariant). |
| 229 | + // - Thus, if no tokens in A, there are no tokens in B. |
| 230 | + // |
| 231 | + // Note that it is up to the requester which is the next request |
| 232 | + // to admit. So tryGrantLocked only needs to check the bucket that |
| 233 | + // corresponds to the burstQualification of that request, as is done |
| 234 | + // below. |
| 235 | + return false |
| 236 | + } |
| 237 | + tokens := stg.requester[tier].granted(noGrantChain) |
| 238 | + if tokens == 0 { |
| 239 | + // Did not accept grant. |
| 240 | + continue |
| 241 | + } |
| 242 | + stg.tookWithoutPermissionLocked(tokens) |
| 243 | + return true |
| 244 | + } |
| 245 | + return false |
| 246 | +} |
0 commit comments