|
17 | 17 |
|
18 | 18 | package org.apache.spark.sql.catalyst.plans.logical.statsEstimation
|
19 | 19 |
|
| 20 | +import scala.collection.mutable.ArrayBuffer |
20 | 21 | import scala.math.BigDecimal.RoundingMode
|
21 | 22 |
|
22 | 23 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap}
|
@@ -212,4 +213,172 @@ object EstimationUtils {
|
212 | 213 | }
|
213 | 214 | }
|
214 | 215 |
|
/**
 * Returns overlapped ranges between two histograms, in the given value range
 * [lowerBound, upperBound].
 *
 * Bins of both histograms that intersect [lowerBound, upperBound] are first trimmed to that
 * range with [[trimBin]]. Every (left, right) pair of trimmed bins that intersects then yields
 * one [[OverlappedRange]] covering [max(left.lo, right.lo), min(left.hi, right.hi)]:
 *  - if the intersection is a single value, each side contributes one distinct value and
 *    `height / ndv` rows. This also covers a single-valued bin lying strictly inside the other
 *    side's bin, which would otherwise get a ratio of 0 and drop out of the estimate;
 *  - otherwise each side contributes its ndv and number of rows scaled by the fraction of its
 *    (trimmed) width that falls inside the intersection.
 *
 * @param leftHistogram histogram of the left join key.
 * @param rightHistogram histogram of the right join key.
 * @param lowerBound lower bound of the value range to consider.
 * @param upperBound upper bound of the value range to consider.
 * @return the overlapped ranges, ordered by left bin and then by right bin.
 */
def getOverlappedRanges(
    leftHistogram: Histogram,
    rightHistogram: Histogram,
    lowerBound: Double,
    upperBound: Double): Seq[OverlappedRange] = {
  // Only bins whose range intersects [lowerBound, upperBound] have join possibility.
  // Trimming depends on a single bin only, so do it once per bin rather than once per pair.
  def trimmedBins(histogram: Histogram) = {
    histogram.bins
      .filter(b => b.lo <= upperBound && b.hi >= lowerBound)
      .map(b => trimBin(b, histogram.height, lowerBound, upperBound))
      .toSeq
  }

  val leftBins = trimmedBins(leftHistogram)
  val rightBins = trimmedBins(rightHistogram)

  val overlappedRanges = new ArrayBuffer[OverlappedRange]()
  leftBins.foreach { case (left, leftHeight) =>
    rightBins.foreach { case (right, rightHeight) =>
      // Only collect overlapped ranges.
      if (left.lo <= right.hi && left.hi >= right.lo) {
        // The intersection of two closed intervals. This single formula covers all relative
        // positions of the two bins: partial overlap on either side, or one containing the other.
        val lo = math.max(left.lo, right.lo)
        val hi = math.min(left.hi, right.hi)
        val range = if (lo == hi) {
          // The overlapped range has only one value: the bins touch at a boundary, or one of
          // them is a single-valued bin. Assuming uniform distribution, each side holds
          // height / ndv rows for that value.
          OverlappedRange(
            lo = lo,
            hi = hi,
            leftNdv = 1,
            rightNdv = 1,
            leftNumRows = leftHeight / left.ndv,
            rightNumRows = rightHeight / right.ndv
          )
        } else {
          // hi > lo implies both bins have non-zero width, so the divisions are safe. A bin
          // fully contained in the other gets a ratio of exactly 1.
          val width = hi - lo
          val leftRatio = width / (left.hi - left.lo)
          val rightRatio = width / (right.hi - right.lo)
          OverlappedRange(
            lo = lo,
            hi = hi,
            leftNdv = left.ndv * leftRatio,
            rightNdv = right.ndv * rightRatio,
            leftNumRows = leftHeight * leftRatio,
            rightNumRows = rightHeight * rightRatio
          )
        }
        overlappedRanges += range
      }
    }
  }
  overlappedRanges
}
| 325 | + |
/**
 * Cuts a histogram bin down to the part that lies inside [lowerBound, upperBound] and scales
 * its statistics to match.
 *
 * @param bin the input histogram bin.
 * @param height the number of rows of the given histogram bin inside an equi-height histogram.
 * @param lowerBound lower bound of the given range.
 * @param upperBound upper bound of the given range.
 * @return the trimmed bin together with the number of rows attributed to it.
 */
def trimBin(bin: HistogramBin, height: Double, lowerBound: Double, upperBound: Double)
  : (HistogramBin, Double) = {
  val startsAtOrBeforeLower = bin.lo <= lowerBound
  val endsAtOrAfterUpper = bin.hi >= upperBound

  val (lo, hi) =
    if (startsAtOrBeforeLower && endsAtOrAfterUpper) {
      // The bin spans the whole range: keep exactly the range.
      (lowerBound, upperBound)
    } else if (startsAtOrBeforeLower && bin.hi >= lowerBound) {
      // The bin straddles the lower bound only: cut off its left part.
      (lowerBound, bin.hi)
    } else if (endsAtOrAfterUpper && bin.lo <= upperBound) {
      // The bin straddles the upper bound only: cut off its right part.
      (bin.lo, upperBound)
    } else {
      // What remains must be a bin lying entirely inside the range; nothing to cut.
      assert(bin.lo >= lowerBound && bin.hi <= upperBound)
      (bin.lo, bin.hi)
    }

  if (hi != lo) {
    // A trimmed part with non-zero width implies the original bin has non-zero width as well.
    assert(bin.hi != bin.lo)
    // Scale ndv (rounded up) and row count by the fraction of the bin's width that is kept.
    val ratio = (hi - lo) / (bin.hi - bin.lo)
    (HistogramBin(lo, hi, math.ceil(bin.ndv * ratio).toLong), height * ratio)
  } else {
    // Single-value result (includes the case where the bin itself has bin.hi == bin.lo):
    // one distinct value holding an even share of the bin's rows.
    (HistogramBin(lo, hi, 1), height / bin.ndv)
  }
}
| 365 | + |
/**
 * One piece of the overlap between two equi-height histograms that are being joined.
 * Joining two histograms can yield many such pieces; each one comes from part of a single bin
 * of the left histogram and part of a single bin of the right histogram.
 *
 * @param lo lower bound of this overlapped range.
 * @param hi higher bound of this overlapped range.
 * @param leftNdv number of distinct values the left part has in this range.
 * @param rightNdv number of distinct values the right part has in this range.
 * @param leftNumRows number of rows the left part has in this range.
 * @param rightNumRows number of rows the right part has in this range.
 */
case class OverlappedRange(
    lo: Double,
    hi: Double,
    leftNdv: Double,
    rightNdv: Double,
    leftNumRows: Double,
    rightNumRows: Double)
215 | 384 | }
|
0 commit comments