|
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.*
import java.io.DataInputStream
import java.nio.file.Files
import java.nio.file.Path
import java.util.Locale
import kotlin.io.path.Path
import kotlin.io.path.inputStream
import kotlin.io.path.name
import kotlin.io.path.walk
import kotlin.streams.asSequence
import kotlin.time.Duration.Companion.nanoseconds
| 11 | + |
| 12 | +private const val entrySize = 8 + 4 + 4 + 2 |
| 13 | + |
| 14 | +private val logger = KotlinLogging.logger { } |
| 15 | + |
| 16 | +private val shardFolderRegex = Regex("""shard-(\d+)-(?:zstd|zlib)""") |
| 17 | +private val logFileRegex = Regex("""log-(\d+)\.bin""") |
| 18 | + |
| 19 | +private const val minThroughput: Double = 100.0 |
| 20 | + |
| 21 | +suspend fun main(args: Array<String>) { |
| 22 | + require(args.size == 1) { |
| 23 | + "One argument must be present for the logs input directory (decompression-logs)" |
| 24 | + } |
| 25 | + |
| 26 | + val dispatcher = Dispatchers.IO.limitedParallelism(12) |
| 27 | + |
| 28 | + val zstdSmallShards = arrayListOf<List<Path>>() |
| 29 | + val zstdBigShards = arrayListOf<List<Path>>() |
| 30 | + val zlibShards = arrayListOf<List<Path>>() |
| 31 | + |
| 32 | + val logsDirectory = Path(args[0]) |
| 33 | + withContext(Dispatchers.IO) { |
| 34 | + Files.walk(logsDirectory, 1) |
| 35 | + .filter { it.name.matches(shardFolderRegex) } |
| 36 | + .sorted(Comparator.comparingInt { shardFolder -> |
| 37 | + shardFolderRegex.matchEntire(shardFolder.name)!!.groupValues[1].toInt() |
| 38 | + }) |
| 39 | + .forEach { shardFolder -> |
| 40 | + val shardId = shardFolderRegex.matchEntire(shardFolder.name)!!.groupValues[1].toInt() |
| 41 | + val shards = when (shardId % 3) { |
| 42 | + 0 -> zstdSmallShards |
| 43 | + 1 -> zstdBigShards |
| 44 | + 2 -> zlibShards |
| 45 | + else -> error("Unhandled shard $shardId") |
| 46 | + } |
| 47 | + |
| 48 | + shardFolder.walk() |
| 49 | + .filter { it.name.matches(logFileRegex) } |
| 50 | + .sortedBy { logFile -> |
| 51 | + // Uses a timestamp not an index |
| 52 | + logFileRegex.matchEntire(logFile.name)!!.groupValues[1].toLong() |
| 53 | + } |
| 54 | + .toList() |
| 55 | + .also(shards::add) |
| 56 | + } |
| 57 | + } |
| 58 | + |
| 59 | + fun List<List<Path>>.computeEntryCount(): Int { |
| 60 | + return sumOf { shard -> |
| 61 | + shard.sumOf { logFile -> |
| 62 | + var retainedEntries = 0 |
| 63 | + readLogThroughputs(logFile) { throughput -> |
| 64 | + if (throughput > minThroughput) { |
| 65 | + retainedEntries++ |
| 66 | + } |
| 67 | + } |
| 68 | + retainedEntries |
| 69 | + } |
| 70 | + } |
| 71 | + } |
| 72 | + |
| 73 | + val zstdSmallMetrics = DecompressionMetrics(zstdSmallShards.computeEntryCount()) |
| 74 | + val zstdBigMetrics = DecompressionMetrics(zstdBigShards.computeEntryCount()) |
| 75 | + val zlibMetrics = DecompressionMetrics(zlibShards.computeEntryCount()) |
| 76 | + |
| 77 | + coroutineScope { |
| 78 | + launch(dispatcher) { processShard(zstdSmallShards, zstdSmallMetrics) } |
| 79 | + launch(dispatcher) { processShard(zstdBigShards, zstdBigMetrics) } |
| 80 | + launch(dispatcher) { processShard(zlibShards, zlibMetrics) } |
| 81 | + } |
| 82 | + |
| 83 | + coroutineScope { |
| 84 | + launch(dispatcher) { zstdSmallMetrics.finish() } |
| 85 | + launch(dispatcher) { zstdBigMetrics.finish() } |
| 86 | + launch(dispatcher) { zlibMetrics.finish() } |
| 87 | + } |
| 88 | + |
| 89 | + println("zstdSmall = $zstdSmallMetrics") |
| 90 | + println("zstdBig = $zstdBigMetrics") |
| 91 | + println("zlib = $zlibMetrics") |
| 92 | + |
| 93 | + fun DecompressionEntry.decompressTimeStats(): String = "$timeToDecompress<br>${compressedSize.prettySize()} -> ${decompressedSize.prettySize()}" |
| 94 | + fun DecompressionEntry.compressedStats(): String = "**${compressedSize.prettySize()}** -> ${decompressedSize.prettySize()}<br>$timeToDecompress" |
| 95 | + fun DecompressionEntry.decompressedStats(): String = "${compressedSize.prettySize()} -> **${decompressedSize.prettySize()}**<br>$timeToDecompress" |
| 96 | + |
| 97 | + println(""" |
| 98 | + | Stat | Zlib | Zstd (8K buf) | Zstd (128K buf) | |
| 99 | + |------|------|---------------|-----------------| |
| 100 | + | Entries | ${zlibMetrics.addedEntries.pretty()} | ${zstdSmallMetrics.addedEntries.pretty()} | ${zstdBigMetrics.addedEntries.pretty()} | |
| 101 | + | Total compressed | ${zlibMetrics.totalCompressed.prettySize()} | ${zstdSmallMetrics.totalCompressed.prettySize()} | ${zstdBigMetrics.totalCompressed.prettySize()} | |
| 102 | + | Total decompressed | ${zlibMetrics.totalDecompressed.prettySize()} | ${zstdSmallMetrics.totalDecompressed.prettySize()} | ${zstdBigMetrics.totalDecompressed.prettySize()} | |
| 103 | + | Total time to decompress | ${zlibMetrics.totalTimeToDecompress} | ${zstdSmallMetrics.totalTimeToDecompress} | ${zstdBigMetrics.totalTimeToDecompress} | |
| 104 | + | Min decompress time | ${zlibMetrics.minDecompressTime.decompressTimeStats()} | ${zstdSmallMetrics.minDecompressTime.decompressTimeStats()} | ${zstdBigMetrics.minDecompressTime.decompressTimeStats()} | |
| 105 | + | Average decompress time | ${zlibMetrics.averageDecompressTime} | ${zstdSmallMetrics.averageDecompressTime} | ${zstdBigMetrics.averageDecompressTime} | |
| 106 | + | Median decompress time | ${zlibMetrics.medianDecompressTime} | ${zstdSmallMetrics.medianDecompressTime} | ${zstdBigMetrics.medianDecompressTime} | |
| 107 | + | Max decompress time | ${zlibMetrics.maxDecompressTime.decompressTimeStats()} | ${zstdSmallMetrics.maxDecompressTime.decompressTimeStats()} | ${zstdBigMetrics.maxDecompressTime.decompressTimeStats()} | |
| 108 | + | Min throughput (B/µs) | ${zlibMetrics.minThroughput} | ${zstdSmallMetrics.minThroughput} | ${zstdBigMetrics.minThroughput} | |
| 109 | + | Average throughput (B/µs) | ${zlibMetrics.averageThroughput} | ${zstdSmallMetrics.averageThroughput} | ${zstdBigMetrics.averageThroughput} | |
| 110 | + | Max throughput (B/µs) | ${zlibMetrics.maxThroughput} | ${zstdSmallMetrics.maxThroughput} | ${zstdBigMetrics.maxThroughput} | |
| 111 | + | Min compressed size | ${zlibMetrics.minCompressedSize.compressedStats()} | ${zstdSmallMetrics.minCompressedSize.compressedStats()} | ${zstdBigMetrics.minCompressedSize.compressedStats()} | |
| 112 | + | Average compressed size (B) | ${zlibMetrics.averageCompressedSize} | ${zstdSmallMetrics.averageCompressedSize} | ${zstdBigMetrics.averageCompressedSize} | |
| 113 | + | Median compressed size (B) | ${zlibMetrics.medianCompressedSize} | ${zstdSmallMetrics.medianCompressedSize} | ${zstdBigMetrics.medianCompressedSize} | |
| 114 | + | Max compressed size | ${zlibMetrics.maxCompressedSize.compressedStats()} | ${zstdSmallMetrics.maxCompressedSize.compressedStats()} | ${zstdBigMetrics.maxCompressedSize.compressedStats()} | |
| 115 | + | Min decompressed size | ${zlibMetrics.minDecompressedSize.decompressedStats()} | ${zstdSmallMetrics.minDecompressedSize.decompressedStats()} | ${zstdBigMetrics.minDecompressedSize.decompressedStats()} | |
| 116 | + | Average decompressed size (B) | ${zlibMetrics.averageDecompressedSize} | ${zstdSmallMetrics.averageDecompressedSize} | ${zstdBigMetrics.averageDecompressedSize} | |
| 117 | + | Median decompressed size (B) | ${zlibMetrics.medianDecompressedSize} | ${zstdSmallMetrics.medianDecompressedSize} | ${zstdBigMetrics.medianDecompressedSize} | |
| 118 | + | Max decompressed size | ${zlibMetrics.maxDecompressedSize.decompressedStats()} | ${zstdSmallMetrics.maxDecompressedSize.decompressedStats()} | ${zstdBigMetrics.maxDecompressedSize.decompressedStats()} | |
| 119 | + """.trimIndent()) |
| 120 | +} |
| 121 | + |
| 122 | +private fun CoroutineScope.processShard(shards: List<List<Path>>, metrics: DecompressionMetrics) { |
| 123 | + var i = 0 |
| 124 | + for (shard in shards) { |
| 125 | + val shardId = i++ |
| 126 | + |
| 127 | + shard.forEachIndexed { logIndex, logFile -> |
| 128 | + launch { |
| 129 | + logger.info { "Reading shard $shardId file $logIndex" } |
| 130 | + |
| 131 | + val logIndexByte = logIndex.toUByte() |
| 132 | + readLogFile(logFile, logIndexByte) { entry -> |
| 133 | + // Take only those with a throughput of 100 bytes per microsecond |
| 134 | + if (entry.throughput <= minThroughput) return@readLogFile |
| 135 | + metrics.accept(entry) |
| 136 | + } |
| 137 | + } |
| 138 | + } |
| 139 | + } |
| 140 | +} |
| 141 | + |
| 142 | +private fun Int.pretty(): String { |
| 143 | + return toLong().pretty() |
| 144 | +} |
| 145 | + |
| 146 | +private fun Long.pretty(): String { |
| 147 | + val str = toString() |
| 148 | + if (str.length <= 3) return str |
| 149 | + |
| 150 | + return buildString { |
| 151 | + str.reversed().forEachIndexed { index, ch -> |
| 152 | + if (index != 0 && index % 3 == 0) { |
| 153 | + append('_') |
| 154 | + } |
| 155 | + append(ch) |
| 156 | + } |
| 157 | + }.reversed() |
| 158 | +} |
| 159 | + |
| 160 | +private fun Int.prettySize(): String { |
| 161 | + return toLong().prettySize() |
| 162 | +} |
| 163 | + |
| 164 | +private fun Long.prettySize(): String { |
| 165 | + if (this > 10000) { |
| 166 | + var prettySize = this.toDouble() / 1024.0 |
| 167 | + var prettyUnit = "KB" |
| 168 | + if (prettySize > 1024.0) { |
| 169 | + prettySize /= 1024.0 |
| 170 | + prettyUnit = "MB" |
| 171 | + } |
| 172 | + if (prettySize > 1024.0) { |
| 173 | + prettySize /= 1024.0 |
| 174 | + prettyUnit = "GB" |
| 175 | + } |
| 176 | + return "$this B (${"%.1f".format(prettySize)} ${prettyUnit})" |
| 177 | + } else { |
| 178 | + return "$this B" |
| 179 | + } |
| 180 | +} |
| 181 | + |
| 182 | +private fun readLogFile(logFile: Path, logIndex: UByte, entryConsumer: (DecompressionEntry) -> Unit) { |
| 183 | + logFile.inputStream().buffered().let(::DataInputStream).use { input -> |
| 184 | + var entryIndex = 0u |
| 185 | + var available = input.available() |
| 186 | + while (available > 0) { |
| 187 | + val timeToDecompress = input.readLong() |
| 188 | + val compressedSize = input.readInt() |
| 189 | + val decompressedSize = input.readInt() |
| 190 | + input.skipBytes(2) // Separator |
| 191 | + |
| 192 | + entryConsumer(DecompressionEntry(logIndex, entryIndex++, timeToDecompress.nanoseconds, compressedSize, decompressedSize)) |
| 193 | + |
| 194 | + available -= entrySize |
| 195 | + if (available <= 0) { |
| 196 | + available = input.available() |
| 197 | + } |
| 198 | + } |
| 199 | + } |
| 200 | +} |
| 201 | + |
| 202 | +private fun readLogThroughputs(logFile: Path, consumer: (throughput: Double) -> Unit) { |
| 203 | + logFile.inputStream().buffered().let(::DataInputStream).use { input -> |
| 204 | + var available = input.available() |
| 205 | + while (available > 0) { |
| 206 | + val timeToDecompress = input.readLong() |
| 207 | + input.skipBytes(4) // Compressed size |
| 208 | + val decompressedSize = input.readInt() |
| 209 | + input.skipBytes(2) // Separator |
| 210 | + |
| 211 | + consumer(decompressedSize / timeToDecompress.toDouble() * 1000.0) |
| 212 | + |
| 213 | + available -= entrySize |
| 214 | + if (available <= 0) { |
| 215 | + available = input.available() |
| 216 | + } |
| 217 | + } |
| 218 | + } |
| 219 | +} |
0 commit comments