diff --git a/.gitignore b/.gitignore index 36a342d..f43aaee 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ .idea .DS_Store .specstory/ +delta-tables/ +test_data/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index ba33e02..6886e8e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -94,6 +94,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "const-random", "getrandom 0.3.3", "once_cell", "serde", @@ -319,12 +320,449 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "arrow" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" +dependencies = [ + "arrow-arith 55.2.0", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-csv 55.2.0", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-json 55.2.0", + "arrow-ord 55.2.0", + "arrow-row 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "arrow-string 55.2.0", +] + +[[package]] +name = "arrow" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +dependencies = [ + "arrow-arith 56.2.0", + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-cast 56.2.0", + "arrow-csv 56.2.0", + "arrow-data 56.2.0", + "arrow-ipc 56.2.0", + "arrow-json 56.2.0", + "arrow-ord 56.2.0", + "arrow-row 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "arrow-string 56.2.0", +] + +[[package]] +name = "arrow-arith" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "num", +] + +[[package]] +name = "arrow-arith" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.15.5", + "num", +] + +[[package]] +name = "arrow-array" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.16.0", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "55.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +dependencies = [ + "bytes 1.10.1", + "half", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +dependencies = [ + "bytes 1.10.1", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "atoi", + "base64 0.22.1", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-cast" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "atoi", + "base64 0.22.1", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +dependencies = [ + "arrow-array 55.2.0", + "arrow-cast 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-csv" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +dependencies = [ + "arrow-array 56.2.0", + "arrow-cast 56.2.0", + "arrow-schema 56.2.0", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +dependencies = [ + "arrow-buffer 55.2.0", + "arrow-schema 55.2.0", + "half", + "num", +] + +[[package]] +name = "arrow-data" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +dependencies = [ + "arrow-buffer 56.2.0", + "arrow-schema 56.2.0", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "flatbuffers", + "lz4_flex", +] + +[[package]] +name = "arrow-ipc" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "flatbuffers", + "lz4_flex", + "zstd 0.13.3", +] + +[[package]] +name = "arrow-json" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" 
+dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "chrono", + "half", + "indexmap 2.11.0", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-json" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-cast 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "chrono", + "half", + "indexmap 2.11.0", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", +] + +[[package]] +name = "arrow-ord" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", +] + +[[package]] +name = "arrow-row" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "half", +] + +[[package]] +name = "arrow-row" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "half", +] + +[[package]] +name = "arrow-schema" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "arrow-schema" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" +dependencies = [ + "bitflags 2.9.3", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-select" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +dependencies = [ + "ahash 0.8.12", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "num", +] + +[[package]] +name = "arrow-select" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "num", +] + +[[package]] +name = "arrow-string" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +dependencies = [ + "arrow-array 55.2.0", + "arrow-buffer 
55.2.0", + "arrow-data 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "memchr", + "num", + "regex", + "regex-syntax 0.8.6", +] + +[[package]] +name = "arrow-string" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "memchr", + "num", + "regex", + "regex-syntax 0.8.6", +] + [[package]] name = "ascii-canvas" version = "4.0.0" @@ -372,15 +810,16 @@ dependencies = [ [[package]] name = "async-compression" version = "0.4.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddb939d66e4ae03cee6091612804ba446b12878410cfa17f785f4dd67d4014e8" +source = "git+https://github.com/nolouch/async-compression?rev=ba69fdc#ba69fdcf9a5071d1678d2bb5ceaa248007d1dd80" dependencies = [ "brotli", + "bzip2 0.6.0", "flate2", "futures-core", "memchr", "pin-project-lite", "tokio", + "xz2", "zstd 0.13.3", "zstd-safe 7.2.4", ] @@ -454,7 +893,7 @@ dependencies = [ "futures-timer", "futures-util", "http 1.3.1", - "indexmap 2.10.0", + "indexmap 2.11.0", "mime", "multer", "num-traits", @@ -503,7 +942,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34ecdaff7c9cffa3614a9f9999bf9ee4c3078fe3ce4d6a6e161736b56febf2de" dependencies = [ "bytes 1.10.1", - "indexmap 2.10.0", + "indexmap 2.11.0", "serde", "serde_json", ] @@ -743,9 +1182,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-config" -version = "1.8.5" +version = "1.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c478f5b10ce55c9a33f87ca3404ca92768b144fc1bfdede7c0121214a8283a25" +checksum = "8bc1b40fb26027769f16960d2f4a6bc20c4bb755d403e552c8c1a73af433c246" dependencies = [ "aws-credential-types", "aws-runtime", @@ -773,9 +1212,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.5" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1541072f81945fa1251f8795ef6c92c4282d74d59f88498ae7d4bf00f0ebdad9" +checksum = "799a1290207254984cb7c05245111bc77958b92a3c9bb449598044b36341cce6" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -783,6 +1222,29 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-lc-rs" +version = "1.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c953fe1ba023e6b7730c0d4b031d06f267f23a46167dcbd40316644b10a17ba" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff" +dependencies = [ + "bindgen", + "cc", + "cmake", + "dunce", + "fs_extra", +] + [[package]] name = "aws-runtime" version = "1.5.10" @@ -836,9 +1298,9 @@ dependencies = [ [[package]] name = "aws-sdk-cloudwatchlogs" -version = "1.99.0" +version = "1.100.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "971dcaf0eca0b6887ed3e3251815e0456a79baa729f05dc87c5284a6a4ad9829" +checksum = "ddd943f79a325b4ec4304a80b3f83d29232fa3eb3baf4478c49ea8a74d0455e8" dependencies = [ "aws-credential-types", "aws-runtime", @@ -858,10 +1320,10 @@ dependencies = [ ] [[package]] -name = "aws-sdk-firehose" -version = "1.89.0" +name = "aws-sdk-dynamodb" +version = 
"1.93.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be79fdeca6c84a12c270b36b9fb6b72f0a97bf37111821d822a3a64ed0487cc" +checksum = "6d5b0656080dc4061db88742d2426fc09369107eee2485dfedbc7098a04f21d1" dependencies = [ "aws-credential-types", "aws-runtime", @@ -880,15 +1342,14 @@ dependencies = [ ] [[package]] -name = "aws-sdk-kinesis" -version = "1.85.0" +name = "aws-sdk-firehose" +version = "1.90.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee302a867d89907dceebe77306d26dcf12b9268435316a9bcd4fd2adefa1ed8a" +checksum = "b920b0fc5d8a944971aadb5d5e4dd8a6822d1aa88ad91641a0fb568952c6397a" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", - "aws-smithy-eventstream", "aws-smithy-http", "aws-smithy-json", "aws-smithy-runtime", @@ -903,14 +1364,15 @@ dependencies = [ ] [[package]] -name = "aws-sdk-kms" -version = "1.84.0" +name = "aws-sdk-kinesis" +version = "1.86.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98037a2a0745914d2f0fee41acb6cf88a76f0ed31dd75753b4dc318aa5a4da39" +checksum = "2ee45ea02aafba6989594302fa04c68db57ab59612b69d287ac32d8a42712405" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", + "aws-smithy-eventstream", "aws-smithy-http", "aws-smithy-json", "aws-smithy-runtime", @@ -925,17 +1387,39 @@ dependencies = [ ] [[package]] -name = "aws-sdk-s3" -version = "1.103.0" +name = "aws-sdk-kms" +version = "1.84.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af040a86ae4378b7ed2f62c83b36be1848709bbbf5757ec850d0e08596a26be9" +checksum = "98037a2a0745914d2f0fee41acb6cf88a76f0ed31dd75753b4dc318aa5a4da39" dependencies = [ "aws-credential-types", "aws-runtime", - "aws-sigv4", "aws-smithy-async", - "aws-smithy-checksums", - "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes 1.10.1", + "fastrand 2.3.0", + "http 0.2.12", + "regex-lite", + "tracing 0.1.41", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.103.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af040a86ae4378b7ed2f62c83b36be1848709bbbf5757ec850d0e08596a26be9" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", "aws-smithy-http", "aws-smithy-json", "aws-smithy-runtime", @@ -1005,9 +1489,9 @@ dependencies = [ [[package]] name = "aws-sdk-sqs" -version = "1.81.0" +version = "1.82.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17688adfe7471c885396c7f314736f349c425cb3536e8f80233ddebe64269cd" +checksum = "fe6858d5bd13b69709fe602f62e8a0be7f43ba0e71bfae1c65a638ffac5123e6" dependencies = [ "aws-credential-types", "aws-runtime", @@ -1027,9 +1511,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.81.0" +version = "1.84.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79ede098271e3471036c46957cba2ba30888f53bda2515bf04b560614a30a36e" +checksum = "357a841807f6b52cb26123878b3326921e2a25faca412fabdd32bd35b7edd5d3" dependencies = [ "aws-credential-types", "aws-runtime", @@ -1049,9 +1533,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.82.0" +version = "1.86.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43326f724ba2cc957e6f3deac0ca1621a3e5d4146f5970c24c8a108dac33070f" +checksum = 
"9d1cc7fb324aa12eb4404210e6381195c5b5e9d52c2682384f295f38716dd3c7" dependencies = [ "aws-credential-types", "aws-runtime", @@ -1071,9 +1555,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.83.0" +version = "1.86.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5468593c47efc31fdbe6c902d1a5fde8d9c82f78a3f8ccfe907b1e9434748cb" +checksum = "e7d835f123f307cafffca7b9027c14979f1d403b417d8541d67cf252e8a21e35" dependencies = [ "aws-credential-types", "aws-runtime", @@ -1165,9 +1649,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.10" +version = "0.60.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "604c7aec361252b8f1c871a7641d5e0ba3a7f5a586e51b66bc9510a5519594d9" +checksum = "182b03393e8c677347fb5705a04a9392695d47d20ef0a2f8cfe28c8e6b9b9778" dependencies = [ "aws-smithy-types", "bytes 1.10.1", @@ -1197,9 +1681,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.0.6" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f108f1ca850f3feef3009bdcc977be201bca9a91058864d9de0684e64514bee0" +checksum = "734b4282fbb7372923ac339cc2222530f8180d9d4745e582de19a18cee409fd8" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -1207,18 +1691,29 @@ dependencies = [ "h2 0.3.27", "h2 0.4.12", "http 0.2.12", + "http 1.3.1", "http-body 0.4.6", "hyper 0.14.32", + "hyper 1.7.0", + "hyper-rustls 0.24.2", + "hyper-rustls 0.27.7", + "hyper-util", "pin-project-lite", + "rustls 0.21.12", + "rustls 0.23.31", + "rustls-native-certs 0.8.1", + "rustls-pki-types", "tokio", + "tokio-rustls 0.26.2", + "tower 0.5.2", "tracing 0.1.41", ] [[package]] name = "aws-smithy-json" -version = "0.61.4" +version = "0.61.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a16e040799d29c17412943bdbf488fd75db04112d0c0d4b9290bacf5ae0014b9" +checksum = "eaa31b350998e703e9826b2104dd6f63be0508666e1aba88137af060e8944047" dependencies = [ "aws-smithy-types", ] @@ -1244,9 +1739,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.8.6" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e107ce0783019dbff59b3a244aa0c114e4a8c9d93498af9162608cd5474e796" +checksum = "4fa63ad37685ceb7762fa4d73d06f1d5493feb88e3f27259b9ed277f4c01b185" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -1268,9 +1763,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.8.7" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75d52251ed4b9776a3e8487b2a01ac915f73b2da3af8fc1e77e0fce697a550d4" +checksum = "07f5e0fc8a6b3f2303f331b94504bbf754d85488f402d6f1dd7a6080f99afe56" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -1606,6 +2101,42 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba" +[[package]] +name = "bigdecimal" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + 
"bitflags 2.9.3", + "cexpr", + "clang-sys", + "itertools 0.12.1", + "lazy_static", + "lazycell", + "log", + "prettyplease 0.2.37", + "proc-macro2", + "quote", + "regex", + "rustc-hash 1.1.0", + "shlex", + "syn 2.0.106", + "which 4.4.2", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -1629,9 +2160,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.2" +version = "2.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a65b545ab31d687cff52899d4890855fec459eb6afe0da6417b8a18da87aa29" +checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d" dependencies = [ "serde", ] @@ -1658,6 +2189,28 @@ dependencies = [ "wyz", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -1714,7 +2267,7 @@ dependencies = [ "home", "http 1.3.1", "http-body-util", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-named-pipe", "hyper-rustls 0.27.7", "hyper-util", @@ -1730,7 +2283,7 @@ dependencies = [ "serde_json", "serde_repr", "serde_urlencoded", - "thiserror 2.0.15", + "thiserror 2.0.16", "tokio", "tokio-util", "tower-service", @@ -1790,7 +2343,7 @@ dependencies = [ "getrandom 0.2.16", "getrandom 0.3.3", "hex", - "indexmap 2.10.0", + "indexmap 2.11.0", "js-sys", "once_cell", "rand 0.9.2", @@ -1892,6 +2445,34 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3c8f83209414aacf0eeae3cf730b18d6981697fba62f200fcfb92b9f082acba" +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea8dcd42434048e4f7a304411d9273a411f647446c1234a65ce0554923f4cff" +dependencies = [ + "libbz2-rs-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cassowary" version = "0.3.0" @@ -1918,9 +2499,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.33" +version = "1.2.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ee0f8803222ba5a7e2777dd72ca451868909b1ac410621b676adf07280e9b5f" +checksum = "42bc4aea80032b7bf409b0bc7ccad88853858911b7713a8062fdc0623867bedc" dependencies = [ "jobserver", "libc", @@ -1933,6 +2514,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom 
7.1.3", +] + [[package]] name = "cfb-mode" version = "0.8.2" @@ -1944,9 +2534,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.1" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" [[package]] name = "cfg_aliases" @@ -2058,6 +2648,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" version = "4.5.45" @@ -2119,6 +2720,15 @@ dependencies = [ "digest", ] +[[package]] +name = "cmake" +version = "0.1.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +dependencies = [ + "cc", +] + [[package]] name = "codecs" version = "0.1.0" @@ -2142,7 +2752,7 @@ dependencies = [ "serde_json", "serde_with 3.14.0", "smallvec", - "snafu 0.8.6", + "snafu 0.8.7", "syslog_loose", "tokio", "tokio-util", @@ -2195,6 +2805,19 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "comfy-table" +version = "7.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" +dependencies = [ + "crossterm 0.27.0", + "crossterm 0.28.1", + "strum 0.26.3", + "strum_macros 0.26.4", + "unicode-width 0.2.0", +] + [[package]] name = "community-id" version = "0.2.3" @@ -2238,12 +2861,38 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.16", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_fn" version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "convert_case" version = "0.4.0" @@ -2293,7 +2942,7 @@ checksum = "2eac901828f88a5241ee0600950ab981148a18f2f756900ffba1b125ca6a3ef9" dependencies = [ "cookie", "document-features", - "idna 1.0.3", + "idna 1.1.0", "log", "publicsuffix", "serde", @@ -2423,13 +3072,26 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crossterm" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" +dependencies = [ + "bitflags 2.9.3", + "crossterm_winapi", + "libc", + "parking_lot", + 
"winapi", +] + [[package]] name = "crossterm" version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "crossterm_winapi", "mio", "parking_lot", @@ -2445,7 +3107,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "crossterm_winapi", "document-features", "futures-core", @@ -2627,98 +3289,1510 @@ dependencies = [ ] [[package]] -name = "darling_macro" -version = "0.20.11" +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "data-encoding" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" + +[[package]] +name = "data-url" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be1e0bca6c3637f992fc1cc7cbc52a78c1ef6db076dbf1059c4323d6a2048376" + +[[package]] +name = "databend-client" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d689ffeaa08b1e4be3f035fcdadd4ea69db3dbf529ec5668c6911b8a301fc06" +dependencies = [ + "cookie", + "log", + "once_cell", + "parking_lot", + "percent-encoding", + "reqwest 0.12.23", + "semver 1.0.26", + "serde", + "serde_json", + "tokio", + "tokio-retry", + "tokio-stream", + "tokio-util", + "url", + "uuid", +] + +[[package]] +name = "datafusion" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a11e19a7ccc5bb979c95c1dceef663eab39c9061b3bbf8d1937faf0f03bf41f" +dependencies = [ + "arrow 55.2.0", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "async-trait", + "bytes 1.10.1", + "bzip2 0.5.2", + "chrono", + "datafusion-catalog 48.0.1", + "datafusion-catalog-listing 48.0.1", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-datasource-csv 48.0.1", + "datafusion-datasource-json 48.0.1", + "datafusion-datasource-parquet 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-functions 48.0.1", + "datafusion-functions-aggregate 48.0.1", + "datafusion-functions-nested 48.0.1", + "datafusion-functions-table 48.0.1", + "datafusion-functions-window 48.0.1", + "datafusion-optimizer 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-optimizer 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "datafusion-sql 48.0.1", + "flate2", + "futures 
0.3.31", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet 55.2.0", + "rand 0.9.2", + "regex", + "sqlparser 0.55.0", + "tempfile", + "tokio", + "url", + "uuid", + "xz2", + "zstd 0.13.3", +] + +[[package]] +name = "datafusion" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af15bb3c6ffa33011ef579f6b0bcbe7c26584688bd6c994f548e44df67f011a" +dependencies = [ + "arrow 56.2.0", + "arrow-ipc 56.2.0", + "arrow-schema 56.2.0", + "async-trait", + "bytes 1.10.1", + "bzip2 0.6.0", + "chrono", + "datafusion-catalog 50.3.0", + "datafusion-catalog-listing 50.3.0", + "datafusion-common 50.3.0", + "datafusion-common-runtime 50.3.0", + "datafusion-datasource 50.3.0", + "datafusion-datasource-csv 50.3.0", + "datafusion-datasource-json 50.3.0", + "datafusion-datasource-parquet 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-expr-common 50.3.0", + "datafusion-functions 50.3.0", + "datafusion-functions-aggregate 50.3.0", + "datafusion-functions-nested 50.3.0", + "datafusion-functions-table 50.3.0", + "datafusion-functions-window 50.3.0", + "datafusion-optimizer 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common 50.3.0", + "datafusion-physical-optimizer 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-session 50.3.0", + "datafusion-sql 50.3.0", + "flate2", + "futures 0.3.31", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet 56.2.0", + "rand 0.9.2", + "regex", + "sqlparser 0.58.0", + "tempfile", + "tokio", + "url", + "uuid", + "xz2", + "zstd 0.13.3", +] + +[[package]] +name = "datafusion-catalog" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94985e67cab97b1099db2a7af11f31a45008b282aba921c1e1d35327c212ec18" +dependencies = [ + "arrow 55.2.0", + "async-trait", + "dashmap", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "datafusion-sql 48.0.1", + "futures 0.3.31", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "187622262ad8f7d16d3be9202b4c1e0116f1c9aa387e5074245538b755261621" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "dashmap", + "datafusion-common 50.3.0", + "datafusion-common-runtime 50.3.0", + "datafusion-datasource 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-session 50.3.0", + "datafusion-sql 50.3.0", + "futures 0.3.31", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e002df133bdb7b0b9b429d89a69aa77b35caeadee4498b2ce1c7c23a99516988" +dependencies = [ + "arrow 55.2.0", + "async-trait", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 
48.0.1", + "datafusion-session 48.0.1", + "futures 0.3.31", + "log", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9657314f0a32efd0382b9a46fdeb2d233273ece64baa68a7c45f5a192daf0f83" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "datafusion-catalog 50.3.0", + "datafusion-common 50.3.0", + "datafusion-datasource 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-session 50.3.0", + "futures 0.3.31", + "log", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-common" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13242fc58fd753787b0a538e5ae77d356cb9d0656fa85a591a33c5f106267f6" +dependencies = [ + "ahash 0.8.12", + "arrow 55.2.0", + "arrow-ipc 55.2.0", + "base64 0.22.1", + "half", + "hashbrown 0.14.5", + "indexmap 2.11.0", + "libc", + "log", + "object_store", + "parquet 55.2.0", + "paste", + "recursive", + "sqlparser 0.55.0", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a83760d9a13122d025fbdb1d5d5aaf93dd9ada5e90ea229add92aa30898b2d1" +dependencies = [ + "ahash 0.8.12", + "arrow 56.2.0", + "arrow-ipc 56.2.0", + "base64 0.22.1", + "chrono", + "half", + "hashbrown 0.14.5", + "indexmap 2.11.0", + "libc", + "log", + "object_store", + "parquet 56.2.0", + "paste", + "recursive", + "sqlparser 0.58.0", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2239f964e95c3a5d6b4a8cde07e646de8995c1396a7fd62c6e784f5341db499" +dependencies = [ + "futures 0.3.31", + "log", + "tokio", +] + +[[package]] +name = "datafusion-common-runtime" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b6234a6c7173fe5db1c6c35c01a12b2aa0f803a3007feee53483218817f8b1e" +dependencies = [ + "futures 0.3.31", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cf792579bc8bf07d1b2f68c2d5382f8a63679cce8fbebfd4ba95742b6e08864" +dependencies = [ + "arrow 55.2.0", + "async-compression", + "async-trait", + "bytes 1.10.1", + "bzip2 0.5.2", + "chrono", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "flate2", + "futures 0.3.31", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "parquet 55.2.0", + "rand 0.9.2", + "tempfile", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd 0.13.3", +] + +[[package]] +name = "datafusion-datasource" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7256c9cb27a78709dd42d0c80f0178494637209cac6e29d5c93edd09b6721b86" +dependencies = [ + "arrow 56.2.0", + "async-compression", + "async-trait", + "bytes 1.10.1", + "bzip2 0.6.0", + "chrono", + "datafusion-common 50.3.0", + "datafusion-common-runtime 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 
50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-session 50.3.0", + "flate2", + "futures 0.3.31", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "parquet 56.2.0", + "rand 0.9.2", + "tempfile", + "tokio", + "tokio-util", + "url", + "xz2", + "zstd 0.13.3", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfc114f9a1415174f3e8d2719c371fc72092ef2195a7955404cfe6b2ba29a706" +dependencies = [ + "arrow 55.2.0", + "async-trait", + "bytes 1.10.1", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "futures 0.3.31", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64533a90f78e1684bfb113d200b540f18f268134622d7c96bbebc91354d04825" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "bytes 1.10.1", + "datafusion-catalog 50.3.0", + "datafusion-common 50.3.0", + "datafusion-common-runtime 50.3.0", + "datafusion-datasource 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-session 50.3.0", + "futures 0.3.31", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88dd5e215c420a52362b9988ecd4cefd71081b730663d4f7d886f706111fc75" +dependencies = [ + "arrow 55.2.0", + "async-trait", + "bytes 1.10.1", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "futures 0.3.31", + "object_store", + "serde_json", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d7ebeb12c77df0aacad26f21b0d033aeede423a64b2b352f53048a75bf1d6e6" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "bytes 1.10.1", + "datafusion-catalog 50.3.0", + "datafusion-common 50.3.0", + "datafusion-common-runtime 50.3.0", + "datafusion-datasource 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-session 50.3.0", + "futures 0.3.31", + "object_store", + "serde_json", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33692acdd1fbe75280d14f4676fe43f39e9cb36296df56575aa2cac9a819e4cf" +dependencies = [ + "arrow 55.2.0", + "async-trait", + "bytes 1.10.1", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + 
"datafusion-common-runtime 48.0.1", + "datafusion-datasource 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions-aggregate 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-optimizer 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-session 48.0.1", + "futures 0.3.31", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet 55.2.0", + "rand 0.9.2", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09e783c4c7d7faa1199af2df4761c68530634521b176a8d1331ddbc5a5c75133" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "bytes 1.10.1", + "datafusion-catalog 50.3.0", + "datafusion-common 50.3.0", + "datafusion-common-runtime 50.3.0", + "datafusion-datasource 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-functions-aggregate 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common 50.3.0", + "datafusion-physical-optimizer 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-pruning", + "datafusion-session 50.3.0", + "futures 0.3.31", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "parquet 56.2.0", + "rand 0.9.2", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0e7b648387b0c1937b83cb328533c06c923799e73a9e3750b762667f32662c0" + +[[package]] +name = "datafusion-doc" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99ee6b1d9a80d13f9deb2291f45c07044b8e62fb540dbde2453a18be17a36429" + +[[package]] +name = "datafusion-execution" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9609d83d52ff8315283c6dad3b97566e877d8f366fab4c3297742f33dcd636c7" +dependencies = [ + "arrow 55.2.0", + "dashmap", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "futures 0.3.31", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-execution" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4cec0a57653bec7b933fb248d3ffa3fa3ab3bd33bd140dc917f714ac036f531" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "dashmap", + "datafusion-common 50.3.0", + "datafusion-expr 50.3.0", + "futures 0.3.31", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e75230cd67f650ef0399eb00f54d4a073698f2c0262948298e5299fc7324da63" +dependencies = [ + "arrow 55.2.0", + "chrono", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-functions-aggregate-common 48.0.1", + "datafusion-functions-window-common 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "indexmap 2.11.0", + "paste", + "recursive", + "serde_json", + "sqlparser 0.55.0", +] + +[[package]] +name = "datafusion-expr" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef76910bdca909722586389156d0aa4da4020e1631994d50fadd8ad4b1aa05fe" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "chrono", + 
"datafusion-common 50.3.0", + "datafusion-doc 50.3.0", + "datafusion-expr-common 50.3.0", + "datafusion-functions-aggregate-common 50.3.0", + "datafusion-functions-window-common 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "indexmap 2.11.0", + "paste", + "recursive", + "serde_json", + "sqlparser 0.58.0", +] + +[[package]] +name = "datafusion-expr-common" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70fafb3a045ed6c49cfca0cd090f62cf871ca6326cc3355cb0aaf1260fa760b6" +dependencies = [ + "arrow 55.2.0", + "datafusion-common 48.0.1", + "indexmap 2.11.0", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-expr-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d155ccbda29591ca71a1344dd6bed26c65a4438072b400df9db59447f590bb6" +dependencies = [ + "arrow 56.2.0", + "datafusion-common 50.3.0", + "indexmap 2.11.0", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdf9a9cf655265861a20453b1e58357147eab59bdc90ce7f2f68f1f35104d3bb" +dependencies = [ + "arrow 55.2.0", + "arrow-buffer 55.2.0", + "base64 0.22.1", + "blake2", + "blake3", + "chrono", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-macros 48.0.1", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "rand 0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7de2782136bd6014670fd84fe3b0ca3b3e4106c96403c3ae05c0598577139977" +dependencies = [ + "arrow 56.2.0", + "arrow-buffer 56.2.0", + "base64 0.22.1", + "blake2", + "blake3", + "chrono", + "datafusion-common 50.3.0", + "datafusion-doc 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-expr-common 50.3.0", + "datafusion-macros 50.3.0", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "rand 0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f07e49733d847be0a05235e17b884d326a2fd402c97a89fe8bcf0bfba310005" +dependencies = [ + "ahash 0.8.12", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions-aggregate-common 48.0.1", + "datafusion-macros 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07331fc13603a9da97b74fd8a273f4238222943dffdbbed1c4c6f862a30105bf" +dependencies = [ + "ahash 0.8.12", + "arrow 56.2.0", + "datafusion-common 50.3.0", + "datafusion-doc 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-functions-aggregate-common 50.3.0", + "datafusion-macros 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "48.0.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4512607e10d72b0b0a1dc08f42cb5bd5284cb8348b7fea49dc83409493e32b1b" +dependencies = [ + "ahash 0.8.12", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-physical-expr-common 48.0.1", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5951e572a8610b89968a09b5420515a121fbc305c0258651f318dc07c97ab17" +dependencies = [ + "ahash 0.8.12", + "arrow 56.2.0", + "datafusion-common 50.3.0", + "datafusion-expr-common 50.3.0", + "datafusion-physical-expr-common 50.3.0", +] + +[[package]] +name = "datafusion-functions-nested" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab331806e34f5545e5f03396e4d5068077395b1665795d8f88c14ec4f1e0b7a" +dependencies = [ + "arrow 55.2.0", + "arrow-ord 55.2.0", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions 48.0.1", + "datafusion-functions-aggregate 48.0.1", + "datafusion-macros 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-nested" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdacca9302c3d8fc03f3e94f338767e786a88a33f5ebad6ffc0e7b50364b9ea3" +dependencies = [ + "arrow 56.2.0", + "arrow-ord 56.2.0", + "datafusion-common 50.3.0", + "datafusion-doc 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-functions 50.3.0", + "datafusion-functions-aggregate 50.3.0", + "datafusion-functions-aggregate-common 50.3.0", + "datafusion-macros 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ac2c0be983a06950ef077e34e0174aa0cb9e346f3aeae459823158037ade37" +dependencies = [ + "arrow 55.2.0", + "async-trait", + "datafusion-catalog 48.0.1", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-plan 48.0.1", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37ff8a99434fbbad604a7e0669717c58c7c4f14c472d45067c4b016621d981" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "datafusion-catalog 50.3.0", + "datafusion-common 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-physical-plan 50.3.0", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f3d92731de384c90906941d36dcadf6a86d4128409a9c5cd916662baed5f53" +dependencies = [ + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-doc 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions-window-common 48.0.1", + "datafusion-macros 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e2aea7c79c926cffabb13dc27309d4eaeb130f4a21c8ba91cdd241c813652b" +dependencies = [ + "arrow 56.2.0", + 
"datafusion-common 50.3.0", + "datafusion-doc 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-functions-window-common 50.3.0", + "datafusion-macros 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c679f8bf0971704ec8fd4249fcbb2eb49d6a12cc3e7a840ac047b4928d3541b5" +dependencies = [ + "datafusion-common 48.0.1", + "datafusion-physical-expr-common 48.0.1", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fead257ab5fd2ffc3b40fda64da307e20de0040fe43d49197241d9de82a487f" +dependencies = [ + "datafusion-common 50.3.0", + "datafusion-physical-expr-common 50.3.0", +] + +[[package]] +name = "datafusion-macros" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2821de7cb0362d12e75a5196b636a59ea3584ec1e1cc7dc6f5e34b9e8389d251" +dependencies = [ + "datafusion-expr 48.0.1", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "datafusion-macros" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec6f637bce95efac05cdfb9b6c19579ed4aa5f6b94d951cfa5bb054b7bb4f730" +dependencies = [ + "datafusion-expr 50.3.0", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "datafusion-optimizer" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1594c7a97219ede334f25347ad8d57056621e7f4f35a0693c8da876e10dd6a53" +dependencies = [ + "arrow 55.2.0", + "chrono", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "indexmap 2.11.0", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax 0.8.6", +] + +[[package]] +name = "datafusion-optimizer" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6583ef666ae000a613a837e69e456681a9faa96347bf3877661e9e89e141d8a" +dependencies = [ + "arrow 56.2.0", + "chrono", + "datafusion-common 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-expr-common 50.3.0", + "datafusion-physical-expr 50.3.0", + "indexmap 2.11.0", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax 0.8.6", +] + +[[package]] +name = "datafusion-physical-expr" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc6da0f2412088d23f6b01929dedd687b5aee63b19b674eb73d00c3eb3c883b7" +dependencies = [ + "ahash 0.8.12", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-functions-aggregate-common 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "half", + "hashbrown 0.14.5", + "indexmap 2.11.0", + "itertools 0.14.0", + "log", + "paste", + "petgraph 0.8.2", +] + +[[package]] +name = "datafusion-physical-expr" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8668103361a272cbbe3a61f72eca60c9b7c706e87cc3565bcf21e2b277b84f6" +dependencies = [ + "ahash 0.8.12", + "arrow 56.2.0", + "datafusion-common 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-expr-common 50.3.0", + "datafusion-functions-aggregate-common 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "half", + "hashbrown 0.14.5", + "indexmap 2.11.0", + "itertools 0.14.0", + "log", + 
"parking_lot", + "paste", + "petgraph 0.8.2", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "815acced725d30601b397e39958e0e55630e0a10d66ef7769c14ae6597298bb0" +dependencies = [ + "arrow 56.2.0", + "datafusion-common 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-functions 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb0dbd9213078a593c3fe28783beaa625a4e6c6a6c797856ee2ba234311fb96" +dependencies = [ + "ahash 0.8.12", + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-expr-common 48.0.1", + "hashbrown 0.14.5", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6652fe7b5bf87e85ed175f571745305565da2c0b599d98e697bcbedc7baa47c3" +dependencies = [ + "ahash 0.8.12", + "arrow 56.2.0", + "datafusion-common 50.3.0", + "datafusion-expr-common 50.3.0", + "hashbrown 0.14.5", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d140854b2db3ef8ac611caad12bfb2e1e1de827077429322a6188f18fc0026a" +dependencies = [ + "arrow 55.2.0", + "datafusion-common 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-expr-common 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "datafusion-physical-plan 48.0.1", + "itertools 0.14.0", + "log", + "recursive", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49b7d623eb6162a3332b564a0907ba00895c505d101b99af78345f1acf929b5c" +dependencies = [ + "arrow 56.2.0", + "datafusion-common 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-expr-common 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-pruning", + "itertools 0.14.0", + "log", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b46cbdf21a01206be76d467f325273b22c559c744a012ead5018dfe79597de08" +dependencies = [ + "ahash 0.8.12", + "arrow 55.2.0", + "arrow-ord 55.2.0", + "arrow-schema 55.2.0", + "async-trait", + "chrono", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-functions-window-common 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-expr-common 48.0.1", + "futures 0.3.31", + "half", + "hashbrown 0.14.5", + "indexmap 2.11.0", + "itertools 0.14.0", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-physical-plan" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2f7f778a1a838dec124efb96eae6144237d546945587557c9e6936b3414558c" +dependencies = [ + "ahash 0.8.12", + "arrow 56.2.0", + "arrow-ord 56.2.0", + "arrow-schema 56.2.0", + "async-trait", + "chrono", + "datafusion-common 50.3.0", + 
"datafusion-common-runtime 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-functions-aggregate-common 50.3.0", + "datafusion-functions-window-common 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "futures 0.3.31", + "half", + "hashbrown 0.14.5", + "indexmap 2.11.0", + "itertools 0.14.0", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7df9f606892e6af45763d94d210634eec69b9bb6ced5353381682ff090028a3" +dependencies = [ + "arrow 56.2.0", + "chrono", + "datafusion 50.3.0", + "datafusion-common 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-proto-common", + "object_store", + "prost 0.13.5", +] + +[[package]] +name = "datafusion-proto-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4b14f288ca4ef77743d9672cafecf3adfffff0b9b04af9af79ecbeaaf736901" +dependencies = [ + "arrow 56.2.0", + "datafusion-common 50.3.0", + "prost 0.13.5", +] + +[[package]] +name = "datafusion-pruning" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1e59e2ca14fe3c30f141600b10ad8815e2856caa59ebbd0e3e07cd3d127a65" +dependencies = [ + "arrow 56.2.0", + "arrow-schema 56.2.0", + "datafusion-common 50.3.0", + "datafusion-datasource 50.3.0", + "datafusion-expr-common 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-expr-common 50.3.0", + "datafusion-physical-plan 50.3.0", + "itertools 0.14.0", + "log", +] + +[[package]] +name = "datafusion-session" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a72733766ddb5b41534910926e8da5836622316f6283307fd9fb7e19811a59c" +dependencies = [ + "arrow 55.2.0", + "async-trait", + "dashmap", + "datafusion-common 48.0.1", + "datafusion-common-runtime 48.0.1", + "datafusion-execution 48.0.1", + "datafusion-expr 48.0.1", + "datafusion-physical-expr 48.0.1", + "datafusion-physical-plan 48.0.1", + "datafusion-sql 48.0.1", + "futures 0.3.31", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-session" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21ef8e2745583619bd7a49474e8f45fbe98ebb31a133f27802217125a7b3d58d" +dependencies = [ + "arrow 56.2.0", + "async-trait", + "dashmap", + "datafusion-common 50.3.0", + "datafusion-common-runtime 50.3.0", + "datafusion-execution 50.3.0", + "datafusion-expr 50.3.0", + "datafusion-physical-expr 50.3.0", + "datafusion-physical-plan 50.3.0", + "datafusion-sql 50.3.0", + "futures 0.3.31", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-sql" +version = "48.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5162338cdec9cc7ea13a0e6015c361acad5ec1d88d83f7c86301f789473971f" +dependencies = [ + "arrow 55.2.0", + "bigdecimal", + "datafusion-common 48.0.1", + "datafusion-expr 48.0.1", + "indexmap 2.11.0", + "log", + "recursive", + "regex", + "sqlparser 0.55.0", +] + +[[package]] +name = "datafusion-sql" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89abd9868770386fede29e5a4b14f49c0bf48d652c3b9d7a8a0332329b87d50b" +dependencies = [ + "arrow 56.2.0", + "bigdecimal", + "datafusion-common 50.3.0", + 
"datafusion-expr 50.3.0", + "indexmap 2.11.0", + "log", + "recursive", + "regex", + "sqlparser 0.58.0", +] + +[[package]] +name = "dbl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd2735a791158376708f9347fe8faba9667589d82427ef3aed6794a8981de3d9" +dependencies = [ + "generic-array", +] + +[[package]] +name = "deadpool" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ed5957ff93768adf7a65ab167a17835c3d2c3c50d084fe305174c112f468e2f" +dependencies = [ + "deadpool-runtime", + "num_cpus", + "tokio", +] + +[[package]] +name = "deadpool-runtime" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" +dependencies = [ + "tokio", +] + +[[package]] +name = "delta_kernel" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb6b80fa39021744edf13509bbdd7caef94c1bf101e384990210332dbddddf44" +dependencies = [ + "arrow 56.2.0", + "bytes 1.10.1", + "chrono", + "comfy-table", + "delta_kernel_derive", + "futures 0.3.31", + "indexmap 2.11.0", + "itertools 0.14.0", + "object_store", + "parquet 56.2.0", + "reqwest 0.12.23", + "roaring", + "rustc_version 0.4.1", + "serde", + "serde_json", + "strum 0.27.2", + "thiserror 2.0.16", + "tokio", + "tracing 0.1.41", + "url", + "uuid", + "z85", +] + +[[package]] +name = "delta_kernel_derive" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +checksum = "ae1d02d9f5d886ae8bb7fc3f7a3cb8f1b75cd0f5c95f9b5f45bba308f1a0aa58" dependencies = [ - "darling_core 0.20.11", + "proc-macro2", "quote", "syn 2.0.106", ] [[package]] -name = "dary_heap" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" - -[[package]] -name = "dashmap" -version = "6.1.0" +name = "deltalake" +version = "0.29.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +checksum = "e5db40b24da295184823a5be90e8c0f093cd52d864d8c6898b888e8836f8b685" dependencies = [ - "cfg-if", - "crossbeam-utils", - "hashbrown 0.14.5", - "lock_api", - "once_cell", - "parking_lot_core", + "delta_kernel", + "deltalake-aws", + "deltalake-core", ] [[package]] -name = "data-encoding" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a2330da5de22e8a3cb63252ce2abb30116bf5265e89c0e01bc17015ce30a476" - -[[package]] -name = "data-url" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c297a1c74b71ae29df00c3e22dd9534821d60eb9af5a0192823fa2acea70c2a" - -[[package]] -name = "databend-client" -version = "0.28.1" +name = "deltalake-aws" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f39579be4d900955f61afeffc24de2255bc18dd9dddb6133cddf76eca23d6ed5" +checksum = "c0210d644f4ab27e6d477da99e4b4bf0c7d739fd399ac38c005b6d0dfa4fe132" dependencies = [ - "cookie", - "log", - "once_cell", - "parking_lot", - "percent-encoding", - "reqwest 0.12.23", - "semver 1.0.26", - "serde", - "serde_json", + "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-dynamodb", + "aws-sdk-sts", + "aws-smithy-runtime-api", + "backon", + "bytes 1.10.1", + 
"chrono", + "deltalake-core", + "futures 0.3.31", + "object_store", + "regex", + "thiserror 2.0.16", "tokio", - "tokio-retry", - "tokio-stream", - "tokio-util", + "tracing 0.1.41", "url", "uuid", ] [[package]] -name = "dbl" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd2735a791158376708f9347fe8faba9667589d82427ef3aed6794a8981de3d9" -dependencies = [ - "generic-array", -] - -[[package]] -name = "deadpool" -version = "0.12.2" +name = "deltalake-core" +version = "0.29.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ed5957ff93768adf7a65ab167a17835c3d2c3c50d084fe305174c112f468e2f" +checksum = "5f190f9efb4f7be3e4fa032ccd1dcc59b19399ce086c74ab7b4e4cc66f545bb1" dependencies = [ - "deadpool-runtime", + "arrow 56.2.0", + "arrow-arith 56.2.0", + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-cast 56.2.0", + "arrow-ipc 56.2.0", + "arrow-json 56.2.0", + "arrow-ord 56.2.0", + "arrow-row 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "async-trait", + "bytes 1.10.1", + "cfg-if", + "chrono", + "dashmap", + "datafusion 50.3.0", + "datafusion-proto", + "delta_kernel", + "deltalake-derive", + "dirs", + "either", + "futures 0.3.31", + "humantime", + "indexmap 2.11.0", + "itertools 0.14.0", "num_cpus", + "object_store", + "parking_lot", + "parquet 56.2.0", + "percent-encoding", + "percent-encoding-rfc3986", + "pin-project-lite", + "rand 0.8.5", + "regex", + "serde", + "serde_json", + "sqlparser 0.59.0", + "strum 0.27.2", + "thiserror 2.0.16", "tokio", + "tracing 0.1.41", + "url", + "uuid", + "validator", ] [[package]] -name = "deadpool-runtime" -version = "0.1.4" +name = "deltalake-derive" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" +checksum = "1a785b4702c2d1b6ff286075f375fb2fd52dfbb2fadf17b9233f4d5eea35c6ec" dependencies = [ - "tokio", + "convert_case 0.8.0", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.106", ] [[package]] @@ -2840,6 +4914,15 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys", +] + [[package]] name = "dirs-next" version = "2.0.0" @@ -2850,6 +4933,18 @@ dependencies = [ "dirs-sys-next", ] +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users 0.5.2", + "windows-sys 0.60.2", +] + [[package]] name = "dirs-sys-next" version = "0.1.2" @@ -2857,7 +4952,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ "libc", - "redox_users", + "redox_users 0.4.6", "winapi", ] @@ -2874,9 +4969,9 @@ dependencies = [ [[package]] name = "dns-lookup" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91adf1f5ae09290d87cca8f4f0a8e49bcc30672993eb8aa11a5c9d8872d16a98" +checksum = "cf5597a4b7fe5275fc9dcf88ce26326bc8e4cb87d0130f33752d4c5f717793cf" dependencies = [ "cfg-if", "libc", @@ -2891,7 +4986,7 @@ source = "git+https://github.com/vectordotdev/vector?tag=v0.49.0#dc7e79278323d13 dependencies = [ 
"data-encoding", "hickory-proto", - "snafu 0.8.6", + "snafu 0.8.7", ] [[package]] @@ -2907,7 +5002,7 @@ dependencies = [ "paste", "prost 0.12.6", "prost-build 0.12.6", - "snafu 0.8.6", + "snafu 0.8.7", "tracing 0.1.41", "vector-lib", "vrl", @@ -2967,6 +5062,12 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.20" @@ -3344,8 +5445,8 @@ checksum = "d6215aee357f8c7c989ebb4b8466ca4d7dc93b3957039f2fc3ea2ade8ea5f279" dependencies = [ "bit-set", "derivative", - "regex-automata 0.4.9", - "regex-syntax 0.8.5", + "regex-automata 0.4.10", + "regex-syntax 0.8.6", ] [[package]] @@ -3355,8 +5456,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf04c5ec15464ace8355a7b440a33aece288993475556d461154d7a62ad9947c" dependencies = [ "bit-set", - "regex-automata 0.4.9", - "regex-syntax 0.8.5", + "regex-automata 0.4.10", + "regex-syntax 0.8.6", ] [[package]] @@ -3403,7 +5504,7 @@ dependencies = [ "flate2", "futures 0.3.31", "glob", - "indexmap 2.10.0", + "indexmap 2.11.0", "libc", "scan_fmt", "serde", @@ -3427,6 +5528,16 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flatbuffers" +version = "25.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" +dependencies = [ + "bitflags 2.9.3", + "rustc_version 0.4.1", +] + [[package]] name = "flate2" version = "1.1.2" @@ -3504,9 +5615,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "form_urlencoded" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" dependencies = [ "percent-encoding", ] @@ -3521,6 +5632,12 @@ dependencies = [ "num", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "fsevent-sys" version = "4.1.0" @@ -3690,9 +5807,9 @@ dependencies = [ [[package]] name = "generator" -version = "0.8.5" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d18470a76cb7f8ff746cf1f7470914f900252ec36bbc40b569d74b1258446827" +checksum = "605183a538e3e2a9c1038635cc5c2d194e2ee8fd0d1b66b8349fad7dbacce5a2" dependencies = [ "cc", "cfg-if", @@ -3902,7 +6019,7 @@ dependencies = [ "parking_lot", "prost 0.12.6", "rand 0.9.2", - "snafu 0.8.6", + "snafu 0.8.7", "tokio", "tokio-stream", "tonic 0.11.0", @@ -3943,7 +6060,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.10.0", + "indexmap 2.11.0", "slab", "tokio", "tokio-util", @@ -3962,7 +6079,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.3.1", - "indexmap 2.10.0", + "indexmap 2.11.0", "slab", "tokio", "tokio-util", @@ -3977,6 +6094,7 @@ checksum = 
"459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" dependencies = [ "cfg-if", "crunchy", + "num-traits", ] [[package]] @@ -4000,6 +6118,15 @@ dependencies = [ "ahash 0.7.8", ] +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +dependencies = [ + "ahash 0.8.12", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -4021,6 +6148,12 @@ dependencies = [ "foldhash", ] +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + [[package]] name = "hashlink" version = "0.10.0" @@ -4030,6 +6163,15 @@ dependencies = [ "hashbrown 0.15.5", ] +[[package]] +name = "hashlru" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2c782df656366ccb42fbbac7c7c180535a43d906a138ffb491ec3ed56a239ab" +dependencies = [ + "hashbrown 0.13.2", +] + [[package]] name = "headers" version = "0.3.9" @@ -4239,20 +6381,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502" dependencies = [ "async-trait", - "bitflags 2.9.2", + "bitflags 2.9.3", "cfg-if", "data-encoding", "enum-as-inner 0.6.1", "futures-channel", "futures-io", "futures-util", - "idna 1.0.3", + "idna 1.1.0", "ipnet", "once_cell", "rand 0.9.2", "ring", "rustls-pki-types", - "thiserror 2.0.15", + "thiserror 2.0.16", "time", "tinyvec", "tracing 0.1.41", @@ -4444,13 +6586,14 @@ dependencies = [ [[package]] name = "hyper" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" +checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" dependencies = [ + "atomic-waker", "bytes 1.10.1", "futures-channel", - "futures-util", + "futures-core", "h2 0.4.12", "http 1.3.1", "http-body 1.0.1", @@ -4458,6 +6601,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", + "pin-utils", "smallvec", "tokio", "want", @@ -4473,7 +6617,7 @@ dependencies = [ "futures-util", "headers 0.4.1", "http 1.3.1", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-rustls 0.27.7", "hyper-util", "pin-project-lite", @@ -4490,7 +6634,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278" dependencies = [ "hex", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-util", "pin-project-lite", "tokio", @@ -4523,7 +6667,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "527d4d619ca2c2aafa31ec139a3d1d60bf557bf7578a1f20f743637eccd9ca19" dependencies = [ "http 1.3.1", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-util", "linked_hash_set", "once_cell", @@ -4561,7 +6705,9 @@ dependencies = [ "futures-util", "http 0.2.12", "hyper 0.14.32", + "log", "rustls 0.21.12", + "rustls-native-certs 0.6.3", "tokio", "tokio-rustls 0.24.1", ] @@ -4573,7 +6719,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" dependencies = [ "http 1.3.1", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-util", "log", "rustls 0.23.31", @@ -4603,7 +6749,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" dependencies = [ - "hyper 1.6.0", + "hyper 1.7.0", "hyper-util", "pin-project-lite", "tokio", @@ -4631,7 +6777,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" dependencies = [ "bytes 1.10.1", "http-body-util", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-util", "native-tls", "tokio", @@ -4652,7 +6798,7 @@ dependencies = [ "futures-util", "http 1.3.1", "http-body 1.0.1", - "hyper 1.6.0", + "hyper 1.7.0", "ipnet", "libc", "percent-encoding", @@ -4671,7 +6817,7 @@ checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7" dependencies = [ "hex", "http-body-util", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-util", "pin-project-lite", "tokio", @@ -4807,9 +6953,9 @@ dependencies = [ [[package]] name = "idna" -version = "1.0.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" dependencies = [ "idna_adapter", "smallvec", @@ -4839,9 +6985,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.10.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" +checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9" dependencies = [ "equivalent", "hashbrown 0.15.5", @@ -4879,7 +7025,7 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "inotify-sys", "libc", ] @@ -4925,11 +7071,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "inventory" -version = "0.3.20" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab08d7cd2c5897f2c949e5383ea7c7db03fb19130ffcfbf7eda795137ae3cb83" +checksum = "bc61209c082fbeb19919bee74b176221b27223e27b65d781eb91af24eb1fb46e" dependencies = [ "rustversion", ] @@ -4947,11 +7099,11 @@ dependencies = [ [[package]] name = "io-uring" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" +checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "cfg-if", "libc", ] @@ -5085,9 +7237,9 @@ checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] name = "jobserver" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ "getrandom 0.3.3", "libc", @@ -5140,7 +7292,7 @@ dependencies = [ "pest_derive", "regex", "serde_json", - "thiserror 2.0.15", + "thiserror 2.0.16", ] [[package]] @@ -5166,7 +7318,7 @@ dependencies = [ "email_address", "fancy-regex 0.16.1", "fraction", - "idna 1.0.3", + "idna 1.1.0", "itoa", "num-cmp", "num-traits", @@ -5174,7 +7326,7 @@ dependencies = [ 
"percent-encoding", "referencing", "regex", - "regex-syntax 0.8.5", + "regex-syntax 0.8.6", "serde", "serde_json", "uuid-simd", @@ -5272,7 +7424,7 @@ dependencies = [ "http 1.3.1", "http-body 1.0.1", "http-body-util", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-openssl 0.10.2", "hyper-timeout 0.5.2", "hyper-util", @@ -5308,7 +7460,7 @@ dependencies = [ "http 1.3.1", "http-body 1.0.1", "http-body-util", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-http-proxy", "hyper-rustls 0.27.7", "hyper-timeout 0.5.2", @@ -5322,7 +7474,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "thiserror 2.0.15", + "thiserror 2.0.16", "tokio", "tokio-util", "tower 0.5.2", @@ -5360,7 +7512,7 @@ dependencies = [ "serde", "serde-value", "serde_json", - "thiserror 2.0.15", + "thiserror 2.0.16", ] [[package]] @@ -5404,7 +7556,7 @@ dependencies = [ "lalrpop-util", "petgraph 0.7.1", "regex", - "regex-syntax 0.8.5", + "regex-syntax 0.8.6", "sha3", "string_cache", "term 1.1.0", @@ -5418,7 +7570,7 @@ version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5baa5e9ff84f1aefd264e6869907646538a52147a755d494517a8007fb48733" dependencies = [ - "regex-automata 0.4.9", + "regex-automata 0.4.10", "rustversion", ] @@ -5453,11 +7605,87 @@ dependencies = [ "spin", ] +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lexical-core" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" -version = "0.2.175" +version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" [[package]] name = "libflate" @@ -5483,6 +7711,16 @@ dependencies = [ "rle-decode-fast", ] +[[package]] +name = "libloading" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" +dependencies = [ + "cfg-if", + "windows-targets 0.53.3", +] + [[package]] name = "libm" version = "0.2.15" @@ -5495,7 +7733,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "391290121bad3d37fbddad76d8f5d1c1c314cfc646d143d7e07a3086ddff0ce3" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "libc", "redox_syscall", ] @@ -5697,6 +7935,17 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "macaddr" version = "1.0.1" @@ -5750,7 +7999,7 @@ dependencies = [ "memchr", "serde", "simdutf8", - "thiserror 2.0.15", + "thiserror 2.0.16", ] [[package]] @@ -5771,9 +8020,9 @@ checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "memmap2" -version = "0.9.7" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "483758ad303d734cec05e5c12b41d7e93e6a6390c5e9dae6bdeb7c1259012d28" +checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" dependencies = [ "libc", ] @@ -5812,7 +8061,7 @@ version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1ada651cd6bdffe01e5f35067df53491f1fe853d2b154008ca2bd30b3d3fcf6" dependencies = [ - "indexmap 2.10.0", + "indexmap 2.11.0", "itoa", "lockfree-object-pool", "metrics", @@ -5833,7 +8082,7 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "hashbrown 0.15.5", - "indexmap 2.10.0", + "indexmap 2.11.0", "metrics", "ordered-float 4.6.0", "quanta", @@ -5896,7 +8145,7 @@ dependencies = [ "mlua_derive", "num-traits", "parking_lot", - "rustc-hash", + "rustc-hash 2.1.1", "rustversion", ] @@ -6108,9 +8357,9 @@ checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" [[package]] name = "newtype-uuid" -version = "1.2.4" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17d82edb1c8a6c20c238747ae7aae9181133e766bc92cd2556fdd764407d0d1" +checksum = "980493932a63b13905b6732671f5295dd11c53d763c91dbde8a7a780611c9189" dependencies = [ "uuid", ] @@ -6155,7 +8404,7 @@ version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "cfg-if", "cfg_aliases", "libc", @@ -6248,7 +8497,7 @@ version = "8.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d3d07927151ff8575b7087f245456e549fea62edf0ec4e565a5ee50c8402bc3" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "fsevent-sys", "inotify", "kqueue", @@ -6509,7 +8758,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", ] 
[[package]] @@ -6524,7 +8773,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "900831247d2fe1a09a683278e5384cfb8c80c79fe6b166f9d14bfdde0ea1b03c" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "objc2", ] @@ -6547,6 +8796,44 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efc4f07659e11cd45a341cd24d71e683e3be65d9ff1f8150061678fe60437496" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes 1.10.1", + "chrono", + "form_urlencoded", + "futures 0.3.31", + "http 1.3.1", + "http-body-util", + "httparse", + "humantime", + "hyper 1.7.0", + "itertools 0.14.0", + "md-5", + "parking_lot", + "percent-encoding", + "quick-xml 0.38.3", + "rand 0.9.2", + "reqwest 0.12.23", + "ring", + "rustls-pemfile 2.2.0", + "serde", + "serde_json", + "serde_urlencoded", + "thiserror 2.0.16", + "tokio", + "tracing 0.1.41", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "octseq" version = "0.5.2" @@ -6589,7 +8876,7 @@ version = "6.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "libc", "once_cell", "onig_sys", @@ -6675,7 +8962,7 @@ version = "0.10.73" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "cfg-if", "foreign-types", "libc", @@ -6742,6 +9029,12 @@ dependencies = [ "vrl", ] +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + [[package]] name = "ordered-float" version = "2.10.1" @@ -6831,6 +9124,79 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "parquet" +version = "55.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" +dependencies = [ + "ahash 0.8.12", + "arrow-array 55.2.0", + "arrow-buffer 55.2.0", + "arrow-cast 55.2.0", + "arrow-data 55.2.0", + "arrow-ipc 55.2.0", + "arrow-schema 55.2.0", + "arrow-select 55.2.0", + "base64 0.22.1", + "brotli", + "bytes 1.10.1", + "chrono", + "flate2", + "futures 0.3.31", + "half", + "hashbrown 0.15.5", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd 0.13.3", +] + +[[package]] +name = "parquet" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +dependencies = [ + "ahash 0.8.12", + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-cast 56.2.0", + "arrow-data 56.2.0", + "arrow-ipc 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "base64 0.22.1", + "brotli", + "bytes 1.10.1", + "chrono", + "flate2", + "futures 0.3.31", + "half", + "hashbrown 0.16.0", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "ring", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd 0.13.3", +] + [[package]] name = "parse-size" version = "1.1.0" @@ -6885,9 +9251,15 @@ dependencies = [ [[package]] name = "percent-encoding" -version 
= "2.3.1" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "percent-encoding-rfc3986" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "3637c05577168127568a64e9dc5a6887da720efef07b3d9472d45f63ab191166" [[package]] name = "pest" @@ -6896,7 +9268,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1db05f56d34358a8b1066f67cbb203ee3e7ed2ba674a6263a1d5ec6db2204323" dependencies = [ "memchr", - "thiserror 2.0.15", + "thiserror 2.0.16", "ucd-trie", ] @@ -6940,7 +9312,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset 0.4.2", - "indexmap 2.10.0", + "indexmap 2.11.0", ] [[package]] @@ -6950,7 +9322,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset 0.5.7", - "indexmap 2.10.0", + "indexmap 2.11.0", +] + +[[package]] +name = "petgraph" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" +dependencies = [ + "fixedbitset 0.5.7", + "hashbrown 0.15.5", + "indexmap 2.11.0", + "serde", ] [[package]] @@ -7225,9 +9609,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.36" +version = "0.2.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff24dfcda44452b9816fff4cd4227e1bb73ff5a2f1bc1105aa92fb8565ce44d2" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", "syn 2.0.106", @@ -7322,12 +9706,12 @@ name = "prometheus-parser" version = "0.1.0" source = "git+https://github.com/vectordotdev/vector?tag=v0.49.0#dc7e79278323d1323bcafe3741d7e258b0c37fb4" dependencies = [ - "indexmap 2.10.0", + "indexmap 2.11.0", "nom 8.0.0", "prost 0.12.6", "prost-build 0.12.6", "prost-types 0.12.6", - "snafu 0.8.6", + "snafu 0.8.7", "vector-common", ] @@ -7396,7 +9780,7 @@ dependencies = [ "multimap 0.10.1", "once_cell", "petgraph 0.6.5", - "prettyplease 0.2.36", + "prettyplease 0.2.37", "prost 0.12.6", "prost-types 0.12.6", "regex", @@ -7416,7 +9800,7 @@ dependencies = [ "multimap 0.10.1", "once_cell", "petgraph 0.7.1", - "prettyplease 0.2.36", + "prettyplease 0.2.37", "prost 0.13.5", "prost-types 0.13.5", "regex", @@ -7506,9 +9890,9 @@ dependencies = [ [[package]] name = "psl" -version = "2.1.133" +version = "2.1.136" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f2122d897153251f66b73cb51a4c2d61ebd76a9953bec65d9d2d32b8c64b9f7" +checksum = "0a4d5ec1bed313b61a9d525e8549493538aea497056a5484f5398b1a05bb8261" dependencies = [ "psl-types", ] @@ -7519,6 +9903,15 @@ version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" +[[package]] +name = "psm" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" +dependencies = [ + "cc", +] + [[package]] name = "ptr_meta" version = "0.1.4" @@ -7545,7 +9938,7 @@ version = "2.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42ea446cab60335f76979ec15e12619a2165b5ae2c12166bef27d283a9fadf" dependencies = [ - "idna 1.0.3", + "idna 1.1.0", "psl-types", ] @@ -7614,11 +10007,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ed1a693391a16317257103ad06a88c6529ac640846021da7c435a06fffdacd7" dependencies = [ "chrono", - "indexmap 2.10.0", + "indexmap 2.11.0", "newtype-uuid", "quick-xml 0.37.5", "strip-ansi-escapes", - "thiserror 2.0.15", + "thiserror 2.0.16", "uuid", ] @@ -7642,6 +10035,16 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.38.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quickcheck" version = "1.0.3" @@ -7664,10 +10067,10 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash", + "rustc-hash 2.1.1", "rustls 0.23.31", "socket2 0.5.10", - "thiserror 2.0.15", + "thiserror 2.0.16", "tokio", "tracing 0.1.41", "web-time", @@ -7684,11 +10087,11 @@ dependencies = [ "lru-slab", "rand 0.9.2", "ring", - "rustc-hash", + "rustc-hash 2.1.1", "rustls 0.23.31", "rustls-pki-types", "slab", - "thiserror 2.0.15", + "thiserror 2.0.16", "tinyvec", "tracing 0.1.41", "web-time", @@ -7861,7 +10264,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabd94c2f37801c20583fc49dd5cd6b0ba68c716787c2dd6ed18571e1e63117b" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "cassowary", "compact_str", "crossterm 0.28.1", @@ -7882,7 +10285,7 @@ version = "11.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6df7ab838ed27997ba19a4664507e6f82b41fe6e20be42929332156e5e85146" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", ] [[package]] @@ -7896,6 +10299,26 @@ dependencies = [ "futures-io", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.106", +] + [[package]] name = "redis" version = "0.32.5" @@ -7929,7 +10352,7 @@ version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", ] [[package]] @@ -7943,6 +10366,17 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "redox_users" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" +dependencies = [ + "getrandom 0.2.16", + "libredox", + "thiserror 2.0.16", +] + [[package]] name = "ref-cast" version = "1.0.24" @@ -7979,14 +10413,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.1" +version = "1.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" dependencies 
= [ "aho-corasick", "memchr", - "regex-automata 0.4.9", - "regex-syntax 0.8.5", + "regex-automata 0.4.10", + "regex-syntax 0.8.6", ] [[package]] @@ -8000,13 +10434,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.5", + "regex-syntax 0.8.6", ] [[package]] @@ -8016,18 +10450,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c11639076bf147be211b90e47790db89f4c22b6c8a9ca6e960833869da67166" dependencies = [ "aho-corasick", - "indexmap 2.10.0", + "indexmap 2.11.0", "itertools 0.13.0", "nohash", "regex", - "regex-syntax 0.8.5", + "regex-syntax 0.8.6", ] [[package]] name = "regex-lite" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" +checksum = "943f41321c63ef1c92fd763bfe054d2668f7f225a5c29f0105903dc2fc04ba30" [[package]] name = "regex-syntax" @@ -8037,9 +10471,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" [[package]] name = "rend" @@ -8109,10 +10543,11 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", + "h2 0.4.12", "http 1.3.1", "http-body 1.0.1", "http-body-util", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-rustls 0.27.7", "hyper-tls 0.6.0", "hyper-util", @@ -8124,6 +10559,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.31", + "rustls-native-certs 0.8.1", "rustls-pki-types", "serde", "serde_json", @@ -8313,6 +10749,12 @@ version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.1" @@ -8367,7 +10809,7 @@ version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "errno", "libc", "linux-raw-sys 0.4.15", @@ -8380,7 +10822,7 @@ version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "errno", "libc", "linux-raw-sys 0.9.4", @@ -8419,6 +10861,7 @@ version = "0.23.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" dependencies = [ + "aws-lc-rs", "log", "once_cell", "ring", @@ -8520,6 +10963,7 @@ version = "0.103.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" dependencies = [ + "aws-lc-rs", "ring", 
"rustls-pki-types", "untrusted", @@ -8537,7 +10981,7 @@ version = "16.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62fd9ca5ebc709e8535e8ef7c658eb51457987e48c98ead2be482172accc408d" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "cfg-if", "clipboard-win", "libc", @@ -8683,7 +11127,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "core-foundation 0.9.4", "core-foundation-sys", "libc", @@ -8696,7 +11140,7 @@ version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80fb1d92c5028aa318b4b8bd7302a5bfcf48be96a37fc6fc790f806b0004ee0c" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -8737,6 +11181,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.219" @@ -8798,11 +11248,11 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.142" +version = "1.0.143" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7" +checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" dependencies = [ - "indexmap 2.10.0", + "indexmap 2.11.0", "itoa", "memchr", "ryu", @@ -8900,7 +11350,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.10.0", + "indexmap 2.11.0", "schemars 0.9.0", "schemars 1.0.4", "serde", @@ -8940,7 +11390,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.10.0", + "indexmap 2.11.0", "itoa", "ryu", "serde", @@ -9147,13 +11597,13 @@ dependencies = [ [[package]] name = "snafu" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320b01e011bf8d5d7a4a4a4be966d9160968935849c83b918827f6a435e7f627" +checksum = "0062a372b26c4a6e9155d099a3416d732514fd47ae2f235b3695b820afcee74a" dependencies = [ "futures-core", "pin-project", - "snafu-derive 0.8.6", + "snafu-derive 0.8.7", ] [[package]] @@ -9170,9 +11620,9 @@ dependencies = [ [[package]] name = "snafu-derive" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1961e2ef424c1424204d3a5d6975f934f56b6d50ff5732382d84ebf460e147f7" +checksum = "7e5fd9e3263fc19d73abd5107dbd4d43e37949212d2b15d4d334ee5db53022b8" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -9244,6 +11694,49 @@ dependencies = [ "der", ] +[[package]] +name = "sqlparser" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser" +version = "0.58.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec4b661c54b1e4b603b37873a18c59920e4c51ea8ea2cf527d925424dbd4437c" +dependencies = [ 
+ "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +dependencies = [ + "log", + "recursive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "sqlx" version = "0.8.6" @@ -9276,20 +11769,22 @@ dependencies = [ "futures-util", "hashbrown 0.15.5", "hashlink", - "indexmap 2.10.0", + "indexmap 2.11.0", "log", "memchr", "once_cell", "percent-encoding", + "rustls 0.23.31", "serde", "serde_json", "sha2", "smallvec", - "thiserror 2.0.15", + "thiserror 2.0.16", "tokio", "tokio-stream", "tracing 0.1.41", "url", + "webpki-roots 0.26.11", ] [[package]] @@ -9338,7 +11833,7 @@ checksum = "aa003f0038df784eb8fecbbac13affe3da23b45194bd57dba231c8f48199c526" dependencies = [ "atoi", "base64 0.22.1", - "bitflags 2.9.2", + "bitflags 2.9.3", "byteorder", "bytes 1.10.1", "chrono", @@ -9368,7 +11863,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.15", + "thiserror 2.0.16", "tracing 0.1.41", "whoami", ] @@ -9381,7 +11876,7 @@ checksum = "db58fcd5a53cf07c184b154801ff91347e4c30d17a3562a635ff028ad5deda46" dependencies = [ "atoi", "base64 0.22.1", - "bitflags 2.9.2", + "bitflags 2.9.3", "byteorder", "chrono", "crc", @@ -9406,7 +11901,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.15", + "thiserror 2.0.16", "tracing 0.1.41", "whoami", ] @@ -9431,7 +11926,7 @@ dependencies = [ "serde", "serde_urlencoded", "sqlx-core", - "thiserror 2.0.15", + "thiserror 2.0.16", "tracing 0.1.41", "url", ] @@ -9442,6 +11937,19 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stacker" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -9524,6 +12032,15 @@ dependencies = [ "strum_macros 0.26.4", ] +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros 0.27.2", +] + [[package]] name = "strum_macros" version = "0.25.3" @@ -9550,6 +12067,18 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "subtle" version = "2.6.1" @@ -9693,15 +12222,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" dependencies = [ "fastrand 2.3.0", "getrandom 0.3.3", 
"once_cell", "rustix 1.0.8", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -9754,11 +12283,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d76d3f064b981389ecb4b6b7f45a0bf9fdac1d5b9204c7bd6714fecc302850" +checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" dependencies = [ - "thiserror-impl 2.0.15", + "thiserror-impl 2.0.16", ] [[package]] @@ -9774,9 +12303,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d29feb33e986b6ea906bd9c3559a856983f92371b3eaa5e83782a351623de0" +checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" dependencies = [ "proc-macro2", "quote", @@ -9792,6 +12321,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float 2.10.1", +] + [[package]] name = "tikv-jemalloc-sys" version = "0.6.0+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" @@ -9846,6 +12386,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinystr" version = "0.8.1" @@ -9858,9 +12407,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ "tinyvec_macros", ] @@ -10070,13 +12619,13 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75129e1dc5000bfbaa9fee9d1b21f974f9fbad9daec557a521ee6e080825f6e8" dependencies = [ - "indexmap 2.10.0", + "indexmap 2.11.0", "serde", "serde_spanned", "toml_datetime 0.7.0", "toml_parser", "toml_writer", - "winnow 0.7.12", + "winnow 0.7.13", ] [[package]] @@ -10100,7 +12649,7 @@ version = "0.19.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ - "indexmap 2.10.0", + "indexmap 2.11.0", "toml_datetime 0.6.11", "winnow 0.5.40", ] @@ -10111,9 +12660,9 @@ version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ - "indexmap 2.10.0", + "indexmap 2.11.0", "toml_datetime 0.6.11", - "winnow 0.7.12", + "winnow 0.7.13", ] [[package]] @@ -10122,7 +12671,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b551886f449aa90d4fe2bdaa9f4a2577ad2dde302c61ecf262d80b116db95c10" dependencies = [ - "winnow 0.7.12", + "winnow 0.7.13", ] [[package]] @@ -10179,7 +12728,7 @@ dependencies = [ "http 1.3.1", "http-body 1.0.1", "http-body-util", - "hyper 1.6.0", + "hyper 1.7.0", "hyper-timeout 0.5.2", "hyper-util", "percent-encoding", @@ -10216,7 +12765,7 @@ version = "0.11.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "be4ef6dd70a610078cb4e338a0f79d06bc759ff1b22d2120c2ff02ae264ba9c2" dependencies = [ - "prettyplease 0.2.36", + "prettyplease 0.2.37", "proc-macro2", "prost-build 0.12.6", "quote", @@ -10229,7 +12778,7 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" dependencies = [ - "prettyplease 0.2.36", + "prettyplease 0.2.37", "proc-macro2", "prost-build 0.13.5", "prost-types 0.13.5", @@ -10265,7 +12814,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", - "indexmap 2.10.0", + "indexmap 2.11.0", "pin-project-lite", "slab", "sync_wrapper 1.0.2", @@ -10283,7 +12832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" dependencies = [ "async-compression", - "bitflags 2.9.2", + "bitflags 2.9.3", "bytes 1.10.1", "futures-core", "futures-util", @@ -10305,7 +12854,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ "base64 0.21.7", - "bitflags 2.9.2", + "bitflags 2.9.3", "bytes 1.10.1", "http 1.3.1", "http-body 1.0.1", @@ -10324,7 +12873,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ "base64 0.22.1", - "bitflags 2.9.2", + "bitflags 2.9.3", "bytes 1.10.1", "futures-util", "http 1.3.1", @@ -10776,12 +13325,12 @@ dependencies = [ [[package]] name = "url" -version = "2.5.4" +version = "2.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" dependencies = [ "form_urlencoded", - "idna 1.0.3", + "idna 1.1.0", "percent-encoding", "serde", ] @@ -10840,6 +13389,36 @@ dependencies = [ "vsimd", ] +[[package]] +name = "validator" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0b4a29d8709210980a09379f27ee31549b73292c87ab9899beee1c0d3be6303" +dependencies = [ + "idna 1.1.0", + "once_cell", + "regex", + "serde", + "serde_derive", + "serde_json", + "url", + "validator_derive", +] + +[[package]] +name = "validator_derive" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bac855a2ce6f843beb229757e6e570a42e837bcb15e5f449dd48d5747d41bf77" +dependencies = [ + "darling 0.20.11", + "once_cell", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "valuable" version = "0.1.1" @@ -10940,7 +13519,7 @@ dependencies = [ "hyper 0.14.32", "hyper-openssl 0.9.2", "hyper-proxy", - "indexmap 2.10.0", + "indexmap 2.11.0", "indoc", "inventory", "ipnet", @@ -11002,7 +13581,7 @@ dependencies = [ "serde_yaml", "smallvec", "smpl_jwt", - "snafu 0.8.6", + "snafu 0.8.7", "snap", "socket2 0.5.10", "sqlx", @@ -11088,7 +13667,7 @@ dependencies = [ "paste", "rkyv", "serde", - "snafu 0.8.6", + "snafu 0.8.7", "tokio", "tokio-util", "tracing 0.1.41", @@ -11107,7 +13686,7 @@ dependencies = [ "crossbeam-utils", "derivative", "futures 0.3.31", - "indexmap 2.10.0", + "indexmap 2.11.0", "metrics", "paste", "pin-project", @@ -11130,14 +13709,14 @@ dependencies = [ "chrono-tz", 
"encoding_rs", "http 0.2.12", - "indexmap 2.10.0", + "indexmap 2.11.0", "inventory", "no-proxy", "num-traits", "serde", "serde_json", "serde_with 3.14.0", - "snafu 0.8.6", + "snafu 0.8.7", "toml", "tracing 0.1.41", "url", @@ -11195,7 +13774,7 @@ dependencies = [ "headers 0.3.9", "http 0.2.12", "hyper-proxy", - "indexmap 2.10.0", + "indexmap 2.11.0", "inventory", "ipnet", "metrics", @@ -11219,7 +13798,7 @@ dependencies = [ "serde_json", "serde_with 3.14.0", "smallvec", - "snafu 0.8.6", + "snafu 0.8.7", "socket2 0.5.10", "tokio", "tokio-openssl", @@ -11243,14 +13822,20 @@ dependencies = [ name = "vector-extensions" version = "0.49.0" dependencies = [ + "arrow 56.2.0", "async-recursion", "async-trait", + "aws-config", "aws-sdk-s3", + "aws-sdk-sts", "aws-smithy-types", "azure_storage_blobs", "base64 0.22.1", "bytes 1.10.1", "chrono", + "crc32fast", + "datafusion 48.0.1", + "deltalake", "etcd-client", "exitcode", "file-source", @@ -11258,6 +13843,7 @@ dependencies = [ "futures 0.3.31", "futures-util", "goauth", + "hashlru", "hex", "http 0.2.12", "hyper 0.14.32", @@ -11268,15 +13854,18 @@ dependencies = [ "md-5", "metrics", "ordered-float 4.6.0", + "parquet 55.2.0", "prost 0.12.6", "prost-build 0.12.6", "prost-types 0.12.6", "rand 0.9.2", "regex", "reqwest 0.11.27", + "rustls 0.23.31", "serde", "serde_json", - "snafu 0.8.6", + "snafu 0.8.7", + "sqlx", "tokio", "tokio-openssl", "tokio-stream", @@ -11423,8 +14012,8 @@ dependencies = [ "hmac", "hostname 0.4.1", "iana-time-zone", - "idna 1.0.3", - "indexmap 2.10.0", + "idna 1.1.0", + "indexmap 2.11.0", "indoc", "influxdb-line-protocol", "itertools 0.14.0", @@ -11466,12 +14055,12 @@ dependencies = [ "sha2", "sha3", "simdutf8", - "snafu 0.8.6", + "snafu 0.8.7", "snap", "strip-ansi-escapes", "syslog_loose", "termcolor", - "thiserror 2.0.15", + "thiserror 2.0.16", "tokio", "tracing 0.1.41", "ua-parser", @@ -11705,6 +14294,15 @@ version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + "webpki-roots 1.0.2", +] + [[package]] name = "webpki-roots" version = "1.0.2" @@ -11779,11 +14377,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -11891,7 +14489,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "193cae8e647981c35bc947fdd57ba7928b1fa0d4a79305f6dd2dc55221ac35ac" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", "widestring 1.2.0", "windows-sys 0.59.0", ] @@ -12213,9 +14811,9 @@ dependencies = [ [[package]] name = "winnow" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3edebf492c8125044983378ecb5766203ad3b4c2f7a922bd7dd207f6d443e95" +checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" dependencies = [ "memchr", ] @@ -12242,7 +14840,7 @@ version = "0.39.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.9.2", + "bitflags 2.9.3", ] [[package]] @@ -12276,6 +14874,15 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.8.0" @@ -12300,6 +14907,12 @@ dependencies = [ "synstructure", ] +[[package]] +name = "z85" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b3a41ce106832b4da1c065baa4c31cf640cf965fa1483816402b7f6b96f0a64" + [[package]] name = "zerocopy" version = "0.8.26" diff --git a/Cargo.toml b/Cargo.toml index d56ec9f..15b85c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,10 @@ readme = "README.md" publish = false default-run = "vector" +# [profile.release] +# codegen-units = 1 +# lto = "fat" + [[bin]] name = "vector" path = "src/main.rs" @@ -13,12 +17,17 @@ path = "src/main.rs" [dependencies] async-recursion = "1.1.1" async-trait = { version = "0.1.88", default-features = false } +arrow = { version = "56.2.0" } +aws-config = { version = "1.0" } aws-sdk-s3 = { version = "1.82.0", default-features = false, features = ["behavior-version-latest"] } +aws-sdk-sts = { version = "1.0" } aws-smithy-types = { version = "1.3.2", default-features = false } azure_storage_blobs = { version = "0.17.0", default-features = false, features = ["enable_reqwest"] } base64 = { version = "0.22.1", default-features = false } bytes = { version = "1.10.1", default-features = false, features = ["serde"] } chrono = { version = "0.4.41", default-features = false, features = ["clock", "serde"] } +deltalake = { version = "0.29.3", features = ["datafusion", "s3"] } +datafusion = { version = "48"} etcd-client = { version = "0.14", features = ["tls-roots"] } exitcode = { version = "1.1.2", default-features = false } file-source = { git = "https://github.com/vectordotdev/vector", tag = "v0.49.0" } @@ -35,13 +44,16 @@ kube = { version = "1.0.0" } md-5 = { version = "0.10", default-features = false } metrics = "0.24.2" ordered-float = { version = "4.6.0", default-features = false } +parquet = { version = "55.2.0" } prost = { version = "0.12", default-features = false, features = ["std"] } prost-types = { version = "0.12", default-features = false } rand = "0.9.2" reqwest = { version = "0.11", features = ["native-tls"] } +rustls = { version = "0.23", features = ["aws-lc-rs"] } serde = { version = "1.0.219", default-features = false, features = ["derive"] } serde_json = { version = "1.0.142", default-features = false, features = ["std", "raw_value"] } snafu = { version = "0.8.6", default-features = false, features = ["futures"] } +sqlx = { version = "0.8", features = ["mysql", "runtime-tokio-rustls", "chrono"] } tokio = { version = "1.45.1", default-features = false, features = ["full"] } tokio-openssl = { version = "0.6.5", default-features = false } tokio-stream = { version = "0.1.17", default-features = false, features = ["net", "sync", "time"] } @@ -55,9 +67,11 @@ url = { version = "2.5.4", default-features = false, features = ["serde"] } vector = { git = "https://github.com/vectordotdev/vector", tag = "v0.49.0", default-features = 
false, features = ["aws-config", "sinks-aws_s3", "gcp", "sinks-gcp"] } vector-config = { git = "https://github.com/vectordotdev/vector", tag = "v0.49.0", default-features = false } vector-lib = { git = "https://github.com/vectordotdev/vector", tag = "v0.49.0", default-features = false } +hashlru = "0.11.1" +lazy_static = "1.4.0" +crc32fast = "1.4.2" [dev-dependencies] -lazy_static = "1.4.0" regex = "1.10.3" [build-dependencies] @@ -178,3 +192,6 @@ sinks-metrics = [ "vector/sinks-vector", "vector/sinks-splunk_hec" ] + +[patch.crates-io] +async-compression = { git = "https://github.com/nolouch/async-compression", rev = "ba69fdc" } diff --git a/DELTALAKE_S3_CHANGES.md b/DELTALAKE_S3_CHANGES.md new file mode 100644 index 0000000..7959dc3 --- /dev/null +++ b/DELTALAKE_S3_CHANGES.md @@ -0,0 +1,116 @@ +# Delta Lake S3 Support with Complete AWS Authentication + +## Summary + +This enhancement adds complete S3 storage support to the Delta Lake sink, enabling it to write Delta Lake tables directly to S3 with full AWS authentication support, **completely mirroring the `aws_s3_upload_file` implementation**. The implementation now provides the same level of AWS integration and authentication options as the existing `aws_s3_upload_file` sink. + +## Changes Made + +### 1. Enhanced Configuration (`src/sinks/deltalake/mod.rs`) + +**Completely mirrored** `aws_s3_upload_file` configuration by adding these fields to `DeltaLakeConfig`: + +- `bucket`: S3 bucket name for remote storage +- `options`: Complete S3Options support (flattened, same as aws_s3_upload_file) +- `region`: AWS region or endpoint configuration (flattened, same as aws_s3_upload_file) +- `tls`: TLS configuration support (same as aws_s3_upload_file) +- `auth`: Full AwsAuthentication support with default (same as aws_s3_upload_file) +- `force_path_style`: S3 addressing style configuration (same as aws_s3_upload_file) + +### 2. Complete S3Service Integration + +**Mirrored `aws_s3_upload_file` architecture** by implementing: + +- `create_service()`: Creates S3Service using s3_common::config::create_service (identical to aws_s3_upload_file) +- Full S3Service integration for authentication and configuration +- Real S3 healthcheck using s3_common::config::build_healthcheck (identical to aws_s3_upload_file) +- Automatic AWS region and endpoint configuration from S3Service + +### 3. Enhanced Storage Options + +The sink now automatically configures Delta Lake storage options based on AWS configuration: + +- `AWS_STORAGE_ALLOW_HTTP`: Enables HTTP for local testing +- `AWS_REGION`: Set from Vector's AWS configuration +- `AWS_ENDPOINT_URL`: Set for custom endpoints +- `AWS_S3_ADDRESSING_STYLE`: Configures path-style or virtual-hosted-style addressing +- Full integration with Vector's AWS credential chain + +### 4. Enhanced Writer (`src/sinks/deltalake/writer.rs`) + +Updated `DeltaLakeWriter` to: + +- Detect S3 URLs in table paths (`s3://` prefix) +- Pass storage options to Delta Lake table builder +- Handle both local filesystem and S3 storage paths +- Use Delta Lake's `with_storage_options()` API for configuration + +### 5. 
Improved Processor (`src/sinks/deltalake/processor.rs`) + +Enhanced `DeltaLakeSink` to: + +- Properly construct S3 table paths by appending table names to S3 base paths +- Maintain backward compatibility with local filesystem paths + +## Configuration Example + +```yaml +sinks: + deltalake_s3: + type: "deltalake" + inputs: ["your_source"] + + # S3 configuration + base_path: "s3://your-bucket/deltalake-tables" + bucket: "your-bucket" + region: "us-west-2" + + # Assume role authentication + auth: + assume_role: "arn:aws:iam::123456789012:role/YourDeltaLakeRole" + + # Delta Lake settings + batch_size: 1000 + timeout_secs: 30 + compression: "snappy" + + # Optional S3 storage options + storage_options: + AWS_STORAGE_ALLOW_HTTP: "true" + + acknowledgements: + enabled: true +``` + +## Authentication Methods Supported + +**Identical to `aws_s3_upload_file`**, supporting all AWS authentication methods: + +1. **Assume Role**: Use `auth.assume_role` with optional `external_id` and `role_session_name` +2. **Static Credentials**: Use `auth.access_key_id`, `auth.secret_access_key`, and optional `auth.token` +3. **Default Credential Chain**: Uses AWS standard credential chain: + - Environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_SESSION_TOKEN`) + - IAM Instance Profile (when running on EC2) + - AWS Config/Credentials Files (`~/.aws/credentials`, `~/.aws/config`) + - ECS/Fargate task roles + - Web Identity Token (for OIDC/SAML) +4. **Custom Endpoints**: Full support for custom S3-compatible endpoints +5. **TLS Configuration**: Complete TLS options for secure connections + +## Backward Compatibility + +- Existing local filesystem configurations remain unchanged +- All existing Delta Lake functionality is preserved +- S3 support is opt-in via the new configuration fields + +## Benefits + +1. **Centralized Storage**: Store Delta Lake tables in S3 for centralized access +2. **Scalability**: Leverage S3's scalability and durability +3. **Security**: Use AWS assume roles for secure, temporary access +4. **Cost Efficiency**: Benefit from S3's cost-effective storage tiers +5. **Integration**: Seamless integration with existing AWS infrastructure + +## Testing + +The implementation has been tested to compile successfully with the existing Vector codebase and Delta Lake dependencies. diff --git a/build.rs b/build.rs index 6ce380a..d96eb1f 100644 --- a/build.rs +++ b/build.rs @@ -3,6 +3,7 @@ fn main() { println!("cargo:rerun-if-changed=proto/tidb.proto"); println!("cargo:rerun-if-changed=proto/tikv.proto"); + println!("cargo:rerun-if-changed=proto/tipb_simple.proto"); println!("cargo:rerun-if-changed=proto/resource_tag.proto"); let mut prost_build = prost_build::Config::new(); @@ -14,6 +15,7 @@ fn main() { &[ "proto/tidb.proto", "proto/tikv.proto", + "proto/tipb_simple.proto", "proto/resource_tag.proto", ], &["proto/"], diff --git a/check_network.sh b/check_network.sh new file mode 100755 index 0000000..631c03c --- /dev/null +++ b/check_network.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +echo "🔍 Checking network connectivity and AWS service accessibility:" +echo "" + +echo "1. Checking internet connection:" +if ping -c 2 8.8.8.8 > /dev/null 2>&1; then + echo "✅ Internet connection is normal" +else + echo "❌ Internet connection failed" +fi + +echo "" +echo "2. Checking AWS STS service DNS resolution:" +if nslookup sts.us-west-2.amazonaws.com > /dev/null 2>&1; then + echo "✅ AWS STS DNS resolution is normal" +else + echo "❌ AWS STS DNS resolution failed" +fi + +echo "" +echo "3. 
Checking AWS STS service connectivity:"
+if curl -s --max-time 5 https://sts.us-west-2.amazonaws.com > /dev/null 2>&1; then
+    echo "✅ AWS STS service is reachable"
+else
+    echo "❌ AWS STS service is not reachable"
+fi
+
+echo ""
+echo "4. Checking AWS credentials environment:"
+echo "AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-❌ Not set}"
+echo "AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-❌ Not set}"
+echo "AWS_PROFILE: ${AWS_PROFILE:-Not set}"
+echo "AWS config file: $(ls ~/.aws/credentials 2>/dev/null && echo "✅ Exists" || echo "❌ Does not exist")"
+
+echo ""
+echo "5. Diagnostic conclusion:"
+if ping -c 1 8.8.8.8 > /dev/null 2>&1; then
+    if curl -s --max-time 3 https://sts.us-west-2.amazonaws.com > /dev/null 2>&1; then
+        echo "🟢 Network connection is normal, AWS STS is reachable"
+        echo "   - dispatch failure may be due to missing valid AWS credentials"
+        echo "   - Recommend setting correct AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY"
+    else
+        echo "🟡 Network connection is normal, but AWS STS is not reachable"
+        echo "   - May be a firewall or proxy issue"
+        echo "   - May be an AWS service region issue"
+    fi
+else
+    echo "🔴 Network connection problem"
+    echo "   - Check internet connection"
+    echo "   - Check network configuration"
+fi
diff --git a/config_deltalake_s3_example.yaml b/config_deltalake_s3_example.yaml
new file mode 100644
index 0000000..757be01
--- /dev/null
+++ b/config_deltalake_s3_example.yaml
@@ -0,0 +1,114 @@
+# Example configuration for DeltaLake sink with S3 storage and AWS authentication
+# This configuration shows how to write Delta Lake tables to S3 using various AWS authentication methods
+
+sources:
+  conprof_add_meta:
+    type: "stdin"  # Replace with your actual source
+    decoding:
+      codec: "json"
+
+sinks:
+  # Delta Lake sink writing to S3 with assume role authentication
+  deltalake_s3_assume_role:
+    type: "deltalake"
+    inputs: ["conprof_add_meta"]
+
+    # S3 base path where Delta Lake tables will be stored
+    # Tables will be created as subdirectories under this path
+    base_path: "s3://o11y-test-shared-us-west-2/deltalake-tables"
+
+    # S3 bucket name (required for S3 storage)
+    bucket: "o11y-test-shared-us-west-2"
+
+    # AWS region configuration
+    region: "${REGION:-us-west-2}"
+
+    # AWS authentication using assume role (fully mirrors aws_s3_upload_file)
+    auth:
+      assume_role: "arn:aws:iam::123456789012:role/YourDeltaLakeRole"
+      external_id: "your-external-id"  # Optional
+      role_session_name: "vector-deltalake"  # Optional
+
+    # S3 options (same S3Options as aws_s3_upload_file)
+    storage_class: "STANDARD"  # S3 storage class
+    server_side_encryption: "AES256"  # Server-side encryption
+
+    # TLS configuration (optional)
+    tls:
+      verify_certificate: true
+      verify_hostname: true
+
+    # Force path-style addressing for S3 (optional)
+    force_path_style: false
+
+    # Delta Lake configuration
+    batch_size: 1000  # Number of events to batch before writing
+    timeout_secs: 30  # Timeout for write operations
+    compression: "snappy"  # Compression format (snappy, gzip, none)
+
+    # Storage options (optional) - these are passed directly to Delta Lake
+    storage_options:
+      AWS_STORAGE_ALLOW_HTTP: "true"
+      # Additional S3/AWS options can be added here
+
+    # Acknowledgments
+    acknowledgements:
+      enabled: true
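The assume-role sink above ultimately hands these settings to the `deltalake` crate as storage options. The sketch below illustrates that flow in isolation; it is not the sink's actual code path. The bucket, table path, role ARN, and region are placeholders, and it assumes the `deltalake` 0.29 API surface referenced elsewhere in this change (`aws::register_handlers`, `open_table_with_storage_options`, `DeltaOps::write`).

```rust
use std::collections::HashMap;

use deltalake::{arrow::record_batch::RecordBatch, DeltaOps};

/// Minimal sketch: open an existing Delta table on S3 with the same kind of
/// storage options the sink derives from its `auth`/`region` settings, then
/// append one Arrow batch. Bucket, path, ARN, and region are placeholders.
async fn append_batch(batch: RecordBatch) -> Result<(), Box<dyn std::error::Error>> {
    // Register the S3 object-store handlers once per process (the sink does this too).
    deltalake::aws::register_handlers(None);

    let mut storage_options: HashMap<String, String> = HashMap::new();
    storage_options.insert("AWS_REGION".into(), "us-west-2".into());
    // An `auth.assume_role` setting maps onto the IAM role storage options.
    storage_options.insert(
        "AWS_IAM_ROLE_ARN".into(),
        "arn:aws:iam::123456789012:role/YourDeltaLakeRole".into(),
    );
    storage_options.insert("AWS_IAM_ROLE_SESSION_NAME".into(), "vector-deltalake".into());

    // Open the table with the storage options, mirroring the writer's
    // `with_storage_options()` usage described above.
    let table = deltalake::open_table_with_storage_options(
        "s3://your-bucket/deltalake-tables/your_table",
        storage_options,
    )
    .await?;

    // Append the batch as a new Delta commit.
    DeltaOps(table).write(vec![batch]).await?;
    Ok(())
}
```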
+
+  # Delta Lake sink with static AWS credentials
+  deltalake_s3_static_credentials:
+    type: "deltalake"
+    inputs: ["conprof_add_meta"]
+
+    base_path: "s3://my-bucket/deltalake-tables"
+    bucket: "my-bucket"
+    region: "us-east-1"
+
+    # Static credentials
+    auth:
+      access_key_id: "${AWS_ACCESS_KEY_ID}"
+      secret_access_key: "${AWS_SECRET_ACCESS_KEY}"
+      token: "${AWS_SESSION_TOKEN}"  # Optional, for temporary credentials
+
+    batch_size: 1000
+    timeout_secs: 30
+    compression: "snappy"
+
+    acknowledgements:
+      enabled: true
+
+  # Delta Lake sink using default AWS credential chain
+  deltalake_s3_default_auth:
+    type: "deltalake"
+    inputs: ["conprof_add_meta"]
+
+    base_path: "s3://my-bucket/deltalake-tables"
+    bucket: "my-bucket"
+    region: "us-east-1"
+
+    # Default authentication (uses AWS credential chain)
+    # This will try: environment variables -> IAM role -> AWS config files
+
+    batch_size: 1000
+    timeout_secs: 30
+    compression: "snappy"
+
+    acknowledgements:
+      enabled: true
+
+  # Alternative configuration for local Delta Lake storage
+  deltalake_local:
+    type: "deltalake"
+    inputs: ["conprof_add_meta"]
+
+    # Local filesystem path
+    base_path: "./delta-tables"
+
+    # Delta Lake configuration
+    batch_size: 1000
+    timeout_secs: 30
+    compression: "snappy"
+
+    acknowledgements:
+      enabled: true
diff --git a/config_example.yaml b/config_example.yaml
new file mode 100644
index 0000000..c683d24
--- /dev/null
+++ b/config_example.yaml
@@ -0,0 +1,86 @@
+# Example configuration for system_tables source and deltalake sink
+
+sources:
+  system_tables:
+    type: "system_tables"
+
+    # PD address for legacy mode (to discover TiDB instances)
+    pd_address: "127.0.0.1:2379"
+
+    # Database connection configuration
+    database:
+      username: "root"
+      password: ""
+      host: "127.0.0.1"
+      port: 4000
+      database: "test"
+      max_connections: 10
+      connect_timeout: 30
+
+    # Collection interval configuration
+    collection:
+      short_interval: 5      # 5 seconds for high-frequency tables
+      long_interval: 1800    # 30 minutes for low-frequency tables
+      retention_days: 7
+
+    # Tables to collect data from
+    tables:
+      - source_schema: "information_schema"
+        source_table: "PROCESSLIST"
+        dest_table: "hist_processlist"
+        collection_interval: "short"
+        where_clause: "command != 'Sleep'"
+        enabled: true
+
+      - source_schema: "information_schema"
+        source_table: "INNODB_TRX"
+        dest_table: "hist_innodb_trx"
+        collection_interval: "short"
+        enabled: true
+
+      - source_schema: "information_schema"
+        source_table: "TIDB_TRX"
+        dest_table: "hist_tidb_trx"
+        collection_interval: "short"
+        enabled: true
+
+      - source_schema: "information_schema"
+        source_table: "TIDB_INDEX_USAGE"
+        dest_table: "hist_tidb_index_usage"
+        collection_interval: "long"
+        enabled: true
+
+      - source_schema: "information_schema"
+        source_table: "MEMORY_USAGE"
+        dest_table: "hist_memory_usage"
+        collection_interval: "short"
+        enabled: true
+
+sinks:
+  deltalake:
+    type: "deltalake"
+    inputs: ["system_tables"]
+
+    # Base path for Delta Lake tables
+    base_path: "./delta-tables"
+
+    # Tables are automatically discovered from events (_vector_table field)
+    # All tables are automatically partitioned by date and _vector_instance
+
+    # Write configuration
+    write:
+      batch_size: 1000
+      timeout_secs: 30
+      compression: "snappy"
+
+    # Storage options for cloud storage (optional)
+    # storage_options:
+    #   AWS_ACCESS_KEY_ID: "your-key"
+    #   AWS_SECRET_ACCESS_KEY: "your-secret"
+    #   AWS_DEFAULT_REGION: "us-west-2"
+
+# Global configuration
+data_dir: "/tmp/vector"
+log_schema:
+  source_type: "log"
+  timestamp: "timestamp"
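The config_example.yaml above relies on the sink discovering one Delta table per `_vector_table` value and partitioning it by a `date` column derived from `_vector_timestamp`. The sketch below isolates that derivation; it mirrors logic that appears in this diff (the `date` derivation in `src/common/deltalake_writer.rs` and the per-table path handling described in DELTALAKE_S3_CHANGES.md), but the helper names here are illustrative and not part of the actual module.

```rust
use chrono::{DateTime, Utc};

// Illustrative helpers only; the real logic lives in the Delta Lake writer/sink.

/// Join the configured base path and the table name taken from `_vector_table`,
/// e.g. "./delta-tables" + "hist_processlist" -> "./delta-tables/hist_processlist".
fn table_path(base_path: &str, table: &str) -> String {
    format!("{}/{}", base_path.trim_end_matches('/'), table)
}

/// Derive the `date` partition value from `_vector_timestamp` (RFC 3339),
/// falling back to the first ten characters or to today's date, as the writer does.
fn date_partition(timestamp: &str) -> String {
    if let Ok(dt) = DateTime::parse_from_rfc3339(timestamp) {
        dt.format("%Y-%m-%d").to_string()
    } else if timestamp.len() >= 10 {
        timestamp[..10].to_string()
    } else {
        Utc::now().format("%Y-%m-%d").to_string()
    }
}

fn main() {
    assert_eq!(
        table_path("./delta-tables", "hist_processlist"),
        "./delta-tables/hist_processlist"
    );
    assert_eq!(date_partition("2024-05-01T12:34:56Z"), "2024-05-01");
}
```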
diff --git a/examples/system_tables_config.toml b/examples/system_tables_config.toml
new file mode 100644
index 0000000..0aed72f
--- /dev/null
+++ b/examples/system_tables_config.toml
@@ -0,0 +1,107 @@
+# TiDB System Tables Source configuration example
+# This configuration file demonstrates the various configuration options of the system_tables source
+
+# Data source configuration
+[sources.tidb_system_tables]
+type = "tidb_system_tables"
+collection_method = "coprocessor"  # Options: sql, http_api, coprocessor
+instance = "tidb-cluster-1"
+enabled = true
+
+# Topology discovery configuration
+[sources.tidb_system_tables.topology]
+discovery_method = "static"  # Options: static, consul, etcd
+refresh_interval = "30s"
+
+# Coprocessor collector configuration (high-performance mode)
+[sources.tidb_system_tables.coprocessor]
+host = "127.0.0.1"
+port = 4000
+grpc_timeout_secs = 30
+max_retries = 3
+
+# SQL collector configuration (compatibility mode)
+[sources.tidb_system_tables.sql]
+host = "127.0.0.1"
+port = 4000
+username = "root"
+password = ""
+database = "information_schema"
+connection_pool_size = 5
+query_timeout_secs = 30
+
+# HTTP API collector configuration (lightweight mode)
+[sources.tidb_system_tables.http_api]
+host = "127.0.0.1"
+port = 10080
+timeout_secs = 30
+max_retries = 3
+
+# Table configuration - CLUSTER_STATEMENTS_SUMMARY (cluster table)
+[[sources.tidb_system_tables.tables]]
+source_schema = "information_schema"
+source_table = "CLUSTER_STATEMENTS_SUMMARY"
+dest_table = "cluster_statements_summary"
+collection_interval = "short"  # Options: short (30s), medium (5m), long (1h)
+enabled = true
+
+# Table configuration - CLUSTER_TIDB_STATEMENTS_STATS (cluster table)
+[[sources.tidb_system_tables.tables]]
+source_schema = "information_schema"
+source_table = "CLUSTER_TIDB_STATEMENTS_STATS"
+dest_table = "cluster_tidb_statements_stats"
+collection_interval = "short"
+enabled = true
+
+# Table configuration - STATEMENTS_SUMMARY (non-cluster table, collected via SQL)
+[[sources.tidb_system_tables.tables]]
+source_schema = "information_schema"
+source_table = "STATEMENTS_SUMMARY"
+dest_table = "statements_summary"
+collection_interval = "medium"
+enabled = true
+
+# Table configuration - SLOW_QUERY (slow query table)
+[[sources.tidb_system_tables.tables]]
+source_schema = "information_schema"
+source_table = "SLOW_QUERY"
+dest_table = "slow_query"
+collection_interval = "long"
+enabled = true
+
+# Delta Lake output configuration
+[sinks.deltalake]
+type = "deltalake"
+inputs = ["tidb_system_tables"]
+base_path = "./data/deltalake"
+batch_size = 1000
+timeout_secs = 30
+compression = "snappy"
+
+# Local storage options
+[sinks.deltalake.storage_options]
+"file.enable_move" = "true"
+
+# S3 storage options (optional)
+# [sinks.deltalake.storage_options]
+# "aws.access_key_id" = "your_access_key"
+# "aws.secret_access_key" = "your_secret_key"
+# "aws.region" = "us-west-2"
+# "aws.bucket" = "your-bucket"
+
+# Logging configuration
+[log]
+level = "info"
+format = "json"
+
+# Metrics configuration
+[metrics]
+enabled = true
+port = 9598
+path = "/metrics"
+
+# Health check configuration
+[health_check]
+enabled = true
+port = 9599
+path = "/health"
diff --git a/proto/tidb.proto b/proto/tidb.proto
index 7b262a7..abe5be5 100644
--- a/proto/tidb.proto
+++ b/proto/tidb.proto
@@ -17,6 +17,8 @@ message TopSQLRecordItem {
   map stmt_kv_exec_count = 4; // target => count
   uint64 stmt_duration_sum_ns = 5;
   uint64 stmt_duration_count = 6;
+  uint64 stmt_network_in_bytes = 7; // traffic from client
+  uint64 stmt_network_out_bytes = 8; // traffic to client
 }
 
 message SQLMeta {
diff --git a/proto/tikv.proto b/proto/tikv.proto
index cdab879..4c3599a 100644
--- a/proto/tikv.proto
+++ b/proto/tikv.proto
@@ -16,6 +16,7 @@ message EmptyResponse {}
 message ResourceUsageRecord {
   oneof record_oneof {
     GroupTagRecord record = 1;
+    RegionRecord region_record = 2;
   }
 }
 
@@ -25,9 +26,19 @@ message GroupTagRecord {
   repeated GroupTagRecordItem items = 2;
 }
 
+// RegionRecord is a set of resource usage data grouped by region.
+message RegionRecord { + uint64 region_id = 1; + repeated GroupTagRecordItem items = 2; +} + message GroupTagRecordItem { uint64 timestamp_sec = 1; uint32 cpu_time_ms = 2; uint32 read_keys = 3; uint32 write_keys = 4; + uint64 network_in_bytes = 5; + uint64 network_out_bytes = 6; + uint64 logical_read_bytes = 7; + uint64 logical_write_bytes = 8; } diff --git a/proto/tipb_official.proto b/proto/tipb_official.proto new file mode 100644 index 0000000..4962936 --- /dev/null +++ b/proto/tipb_official.proto @@ -0,0 +1,214 @@ +syntax = "proto2"; + +package tipb; + +option java_multiple_files = true; +option java_package = "com.pingcap.tidb.tipb"; + +import "executor.proto"; + +import "gogoproto/gogo.proto"; +import "rustproto.proto"; + +option (gogoproto.marshaler_all) = true; +option (gogoproto.sizer_all) = true; +option (gogoproto.unmarshaler_all) = true; +option (gogoproto.goproto_unkeyed_all) = false; +option (gogoproto.goproto_unrecognized_all) = false; +option (gogoproto.goproto_sizecache_all) = false; +option (rustproto.lite_runtime_all) = true; + +// values are all in text format. +message Row { + optional bytes handle = 1; + optional bytes data = 2; +} + +message Error { + optional int32 code = 1 [(gogoproto.nullable) = false]; + optional string msg = 2 [(gogoproto.nullable) = false]; +} + +// It is the data of a intermidiate output channel +message IntermediateOutput { + optional EncodeType encode_type = 1 [(gogoproto.nullable) = false]; + repeated Chunk chunks = 2 [(gogoproto.nullable) = false]; +} + +// Response for SelectRequest. +message SelectResponse { + optional Error error = 1; + + // Result rows. + repeated Row rows = 2; + + // Use multiple chunks to reduce memory allocation and + // avoid allocating large contiguous memory. + repeated Chunk chunks = 3 [(gogoproto.nullable) = false]; + repeated Error warnings = 4; + repeated int64 output_counts = 5; + optional int64 warning_count = 6; + + // Not used any more + // optional bytes row_batch_data = 7 [(gogoproto.customtype) = "github.com/pingcap/tipb/sharedbytes.SharedBytes", (gogoproto.nullable) = false]; + + // The execution summary of each executor, in the order in request. + repeated ExecutorExecutionSummary execution_summaries = 8; + // It indicates the encode type of response. + optional EncodeType encode_type = 9 [(gogoproto.nullable) = false]; + // ndvs collects the number of distinct value information per range. It will be used to serve as execution feedback information. + // Helping us improve the table's statistics information. + repeated int64 ndvs = 10; + // It contains all the intermedidate outputs. + repeated IntermediateOutput intermediate_outputs = 11; +} + +// Chunk contains multiple rows data and rows meta. +message Chunk { + // Data for all rows in the chunk. + optional bytes rows_data = 3 [(gogoproto.customtype) = "github.com/pingcap/tipb/sharedbytes.SharedBytes", (gogoproto.nullable) = false]; + + // Meta data for every row. + repeated RowMeta rows_meta = 4 [(gogoproto.nullable) = false]; +} + +// IntermediateOutputChannel is the channel description for the intermediate ouput. +// The SelectResponse of a DAGRequest may output some intermediate data because not all rows can be processed in DAG. +// For example, the executor IndexLookUp scans the index records and look up the rows locally. +// If a related row of a index is not found locally, this index record should be ouput into the intermediate channel +// for the further processment in the TiDB side. 
+message IntermediateOutputChannel { + // executor_idx indicates which executor outputs this intermediate result. + required uint32 executor_idx = 1 [(gogoproto.nullable) = false]; + // It represents which columns we should output. + repeated uint32 output_offsets = 2; +} + +// RowMeta contains row handle and length of a row. +message RowMeta { + optional int64 handle = 1 [(gogoproto.nullable) = false]; + optional int64 length = 2 [(gogoproto.nullable) = false]; +} + +// DAGRequest represents the request that will be handled with DAG mode. +message DAGRequest { + // Transaction start timestamp. + // Deprecated. Start Ts has been moved to coprocessor.Request. + optional uint64 start_ts_fallback = 1; + + // It represents push down Executors and follows the order of depth-first search with post-order traversal. + // That is: left child first, then right child, then parent. + // For example, a DAG: + // A + // / + // B + // / \ + // C D + // / / \ + // E F G + // / + // H + // Its order should be: [H, E, C, F, G, D, B, A] + // In most cases, there is only one child for each parent, that makes executors simple array from the srouce + // to the out most executors, and the response only need to output the final rows. + // But when a executor has more than one children, for example, IndexLookUp, some intermedidate result is required to output. + // The field `intermediate_output_channels` describes it. + repeated Executor executors = 2; + + // time zone offset in seconds + optional int64 time_zone_offset = 3 [(gogoproto.nullable) = false]; + + // flags are used to store flags that change the execution mode, it contains: + // ignore_truncate = 1 + // truncate error should be ignore if set. + // truncate_as_warning = 1 << 1 + // when ignored_truncate is not set, return warning instead of error if this flag is set. + // ... + // add more when needed. + optional uint64 flags = 4 [(gogoproto.nullable) = false]; + + // It represents which columns we should output. + repeated uint32 output_offsets = 5; + + // It represents whether we collect the detailed scan counts in each range. + optional bool collect_range_counts = 6; + + // It indicates the maximum number of warning, + // which is the number of messages that SHOW WARNINGS displays. + optional uint64 max_warning_count = 7; + + // It indicates the encode type of response. + optional EncodeType encode_type = 8 [(gogoproto.nullable) = false]; + + // It indicates the sql_mode. + optional uint64 sql_mode = 9; + + // It indicates whether the sql mode is strict. + // Deprecated. Don't use. + // optional bool is_strict_sql_mode = 10; + + // supply offset is not enough since we have daylight saving time present in some regions + optional string time_zone_name = 11 [(gogoproto.nullable) = false]; + + // It represents whether or not TiKV should collect execution summaries. + // Execution summaries will be collected into `execution_summaries` field + // in the response. + optional bool collect_execution_summaries = 12; + + // Represents the maximum size of one packet, any generated string, or any parameter sent as long data. + optional uint64 max_allowed_packet = 13; + + // Represents the chunk memory layout. + optional ChunkMemoryLayout chunk_memory_layout = 14; + + // Represents whether the expression use RPN form. + optional bool is_rpn_expr = 15; + + // UserIdentity uses to do privilege check. It is only used in TiDB cluster memory table. 
+ optional UserIdentity user = 16; + + // Represents tree struct based executors, if this field is set, should ignore the executors field, currently only used in TiFlash + optional Executor root_executor = 17; + + // Force using the encode type specified by encode_type, currently only used in TiFlash + optional bool force_encode_type = 18; + + // It indicates the number of digits by which to increase the scale of the result of division operations performed with the / operator. + optional uint32 div_precision_increment = 19; + + // It inidcates the intermdidate result channels. + repeated IntermediateOutputChannel intermediate_output_channels = 20; +} + +enum EncodeType { + TypeDefault = 0; + TypeChunk = 1; + // TypeCHBlock is used by TiSpark and TiFlash, in this encode mode, TiFlash will encode the data using native ch block format + TypeCHBlock = 2; +} + +message ChunkMemoryLayout { + // Represents the endian. + optional Endian endian = 1 [(gogoproto.nullable) = false]; +} + +enum Endian { + LittleEndian = 0; + BigEndian = 1; +} + +message UserIdentity { + optional string user_name = 1 [(gogoproto.nullable) = false]; + optional string user_host = 2 [(gogoproto.nullable) = false]; +} + +message StreamResponse { + optional Error error = 1; + // Data for all rows + optional bytes data = 3 [(gogoproto.customtype) = "github.com/pingcap/tipb/sharedbytes.SharedBytes", (gogoproto.nullable) = false]; + repeated Error warnings = 4; + // output row count for each executor + repeated int64 output_counts = 5; + optional int64 warning_count = 6; + repeated int64 ndvs = 7; +} diff --git a/proto/tipb_simple.proto b/proto/tipb_simple.proto new file mode 100644 index 0000000..d2d30ad --- /dev/null +++ b/proto/tipb_simple.proto @@ -0,0 +1,500 @@ +syntax = "proto3"; + +package tipb; + +// Complete proto definitions based on official tipb and kvproto + +// Basic enums +enum EncodeType { + TypeDefault = 0; + TypeChunk = 1; + TypeCHBlock = 2; +} + +enum Endian { + ENDIAN_UNSPECIFIED = 0; + LittleEndian = 1; + BigEndian = 2; +} + +enum ExecType { + TypeTableScan = 0; + TypeIndexScan = 1; + TypeSelection = 2; + TypeAggregation = 3; + TypeTopN = 4; + TypeLimit = 5; + TypeStreamAgg = 6; + TypeJoin = 7; + TypeKill = 8; + TypeExchangeSender = 9; + TypeExchangeReceiver = 10; + TypeProjection = 11; + TypePartitionTableScan = 12; + TypeSort = 13; + TypeWindow = 14; + TypeExpand = 15; + TypeExpand2 = 16; + TypeBroadcastQuery = 17; + TypeCTESink = 18; + TypeCTESource = 19; + TypeIndexLookUp = 20; +} + +enum Op { + Put = 0; + Del = 1; + Lock = 2; + Rollback = 3; + Insert = 4; + CheckNotExists = 5; +} + +// Basic structures +message ChunkMemoryLayout { + Endian endian = 1; +} + +message UserIdentity { + string user_name = 1; + string user_host = 2; +} + +message IntermediateOutputChannel { + uint32 executor_idx = 1; + repeated uint32 output_offsets = 2; +} + +message ColumnInfo { + int64 column_id = 1; + int32 tp = 2; +} + +// Expression definition +message Expr { + int32 tp = 1; + bytes val = 2; + repeated Expr children = 3; + int32 sig = 4; + FieldType field_type = 5; +} + +message TableScan { + int64 table_id = 1; + repeated ColumnInfo columns = 2; + bool desc = 3; +} + +// Add missing message types for Executor +message IndexScan { + int64 table_id = 1; + int64 index_id = 2; + repeated ColumnInfo columns = 3; + bool desc = 4; +} + +message Selection { + repeated Expr conditions = 1; +} + +message Aggregation { + repeated Expr group_by = 1; + repeated Expr agg_func = 2; +} + +message TopN { + repeated ByItem order_by = 
1; + uint64 limit = 2; +} + +message Limit { + uint64 limit = 1; +} + +message ExchangeReceiver { + repeated FieldType field_types = 1; +} + +message Join { + int32 join_type = 1; + repeated Expr left_join_keys = 2; + repeated Expr right_join_keys = 3; +} + +message Kill { + uint64 conn_id = 1; +} + +message ExchangeSender { + int32 tp = 1; + repeated bytes encoded_task_meta = 2; +} + +message Projection { + repeated Expr exprs = 1; +} + +message PartitionTableScan { + int64 table_id = 1; + repeated ColumnInfo columns = 2; + repeated int64 partition_ids = 3; +} + +message Sort { + repeated ByItem by_items = 1; +} + +message Window { + repeated Expr func_desc = 1; + repeated ByItem order_by = 2; + repeated Expr partition_by = 3; +} + +message Expand { + repeated GroupingSet grouping_sets = 1; +} + +message Expand2 { + repeated GroupingSet grouping_sets = 1; +} + +message BroadcastQuery { + int32 query_type = 1; +} + +message CTESink { + int32 cte_id = 1; +} + +message CTESource { + int32 cte_id = 1; +} + +message IndexLookUp { + IndexScan index_scan = 1; + TableScan table_scan = 2; +} + +// Supporting message types +message ByItem { + Expr expr = 1; + bool desc = 2; +} + +message GroupingSet { + repeated uint64 grouping_exprs = 1; +} + +message FieldType { + int32 tp = 1; + uint32 flag = 2; + int32 flen = 3; + int32 decimal = 4; + string charset = 5; + string collate = 6; +} + +message Executor { + ExecType tp = 1; + TableScan tbl_scan = 2; + IndexScan idx_scan = 3; + Selection selection = 4; + Aggregation aggregation = 5; + TopN topN = 6; + Limit limit = 7; + ExchangeReceiver exchange_receiver = 8; + Join join = 9; + string executor_id = 10; + Kill kill = 11; + ExchangeSender exchange_sender = 12; + Projection projection = 13; + PartitionTableScan partition_table_scan = 14; + Sort sort = 15; + Window window = 16; + uint64 fine_grained_shuffle_stream_count = 17; + uint64 fine_grained_shuffle_batch_size = 18; + Expand expand = 19; + Expand2 expand2 = 20; + BroadcastQuery broadcast_query = 21; + CTESink cte_sink = 22; + CTESource cte_source = 23; + IndexLookUp index_lookup = 24; + uint32 parent_idx = 25; +} + +message DagRequest { + uint64 start_ts_fallback = 1; + repeated Executor executors = 2; + int64 time_zone_offset = 3; + uint64 flags = 4; + repeated uint32 output_offsets = 5; + bool collect_range_counts = 6; + uint64 max_warning_count = 7; + EncodeType encode_type = 8; + uint64 sql_mode = 9; + string time_zone_name = 11; + bool collect_execution_summaries = 12; + uint64 max_allowed_packet = 13; + ChunkMemoryLayout chunk_memory_layout = 14; + bool is_rpn_expr = 15; + UserIdentity user = 16; + Executor root_executor = 17; + bool force_encode_type = 18; + uint32 div_precision_increment = 19; + repeated IntermediateOutputChannel intermediate_output_channels = 20; +} + +// Context and region related +message RegionEpoch { + uint64 conf_ver = 1; + uint64 version = 2; +} + +message Peer { + uint64 id = 1; + uint64 store_id = 2; +} + +message SourceStmt { + uint64 connection_id = 1; + string session_alias = 2; +} + +message Context { + uint64 region_id = 1; + RegionEpoch region_epoch = 2; + Peer peer = 3; + SourceStmt source_stmt = 4; +} + +message KeyRange { + bytes start = 1; + bytes end = 2; +} + +// Coprocessor request/response +message CoprocessorRequest { + Context context = 1; + int64 tp = 2; + bytes data = 3; + repeated KeyRange ranges = 4; + bool is_cache_enabled = 5; + uint64 cache_if_match_version = 6; + uint64 start_ts = 7; + int64 schema_ver = 8; + bool is_trace_enabled = 9; + 
uint64 paging_size = 10; + uint64 connection_id = 12; + string connection_alias = 13; +} + +// Error handling +message RegionError { + string message = 1; + NotLeader not_leader = 2; + RegionNotFound region_not_found = 3; + KeyNotInRegion key_not_in_region = 4; + EpochNotMatch epoch_not_match = 5; + ServerIsBusy server_is_busy = 6; + StaleCommand stale_command = 7; + StoreNotMatch store_not_match = 8; + RaftEntryTooLarge raft_entry_too_large = 9; + MaxTimestampNotSynced max_timestamp_not_synced = 10; + ReadIndexNotReady read_index_not_ready = 11; + ProposalInMergingMode proposal_in_merging_mode = 12; + DataIsNotReady data_is_not_ready = 13; + RegionNotInitialized region_not_initialized = 14; + DiskFull disk_full = 15; +} + +message NotLeader { + uint64 region_id = 1; + Peer leader = 2; +} + +message RegionNotFound { + uint64 region_id = 1; +} + +message KeyNotInRegion { + bytes key = 1; + uint64 region_id = 2; + bytes start_key = 3; + bytes end_key = 4; +} + +message EpochNotMatch { + repeated Region current_regions = 1; +} + +message Region { + uint64 id = 1; + bytes start_key = 2; + bytes end_key = 3; + RegionEpoch region_epoch = 4; + repeated Peer peers = 5; +} + +message ServerIsBusy { + string reason = 1; + uint64 backoff_ms = 2; +} + +message StaleCommand {} + +message StoreNotMatch { + uint64 request_store_id = 1; + uint64 actual_store_id = 2; +} + +message RaftEntryTooLarge { + uint64 region_id = 1; + uint64 entry_size = 2; +} + +message MaxTimestampNotSynced {} + +message ReadIndexNotReady { + string reason = 1; + uint64 region_id = 2; +} + +message ProposalInMergingMode { + uint64 region_id = 1; +} + +message DataIsNotReady { + uint64 region_id = 1; + uint64 peer_id = 2; + uint64 safe_ts = 3; +} + +message RegionNotInitialized { + uint64 region_id = 1; +} + +message DiskFull {} + +message LockInfo { + bytes primary_lock = 1; + uint64 lock_version = 2; + bytes key = 3; + uint64 lock_ttl = 4; + uint64 txn_size = 5; + Op lock_type = 6; + uint64 lock_for_update_ts = 7; + bool use_async_commit = 8; + repeated bytes secondaries = 9; + uint64 min_commit_ts = 10; +} + +// Execution details +message TimeDetail { + uint64 process_wall_time_ms = 1; + uint64 process_cpu_time_ms = 2; + uint64 total_keys = 3; + uint64 processed_keys = 4; +} + +message TimeDetailV2 { + uint64 process_wall_time_ns = 1; + uint64 process_cpu_time_ns = 2; + uint64 total_keys = 3; + uint64 processed_keys = 4; +} + +message ScanDetail { + uint64 total = 1; + uint64 processed = 2; +} + +message ScanDetailV2 { + uint64 processed_versions = 1; + uint64 total_versions = 2; + uint64 rocksdb_delete_skipped_count = 3; + uint64 rocksdb_key_skipped_count = 4; + uint64 rocksdb_block_cache_hit_count = 5; + uint64 rocksdb_block_read_count = 6; + uint64 rocksdb_block_read_byte = 7; +} + +message WriteDetail { + uint64 store_batch_wait_duration = 1; + uint64 propose_send_wait_duration = 2; + uint64 persist_log_duration = 3; + uint64 raft_db_write_leader_wait_duration = 4; + uint64 raft_db_sync_log_duration = 5; + uint64 raft_db_write_memtable_duration = 6; + uint64 commit_log_duration = 7; + uint64 apply_batch_wait_duration = 8; + uint64 apply_log_duration = 9; + uint64 apply_mutate_duration = 10; + uint64 apply_wait_duration = 11; +} + +message ExecDetails { + TimeDetail time_detail = 1; + ScanDetail scan_detail = 2; + WriteDetail write_detail = 3; +} + +message ExecDetailsV2 { + TimeDetailV2 time_detail_v2 = 1; + ScanDetailV2 scan_detail_v2 = 2; + WriteDetail write_detail = 3; +} + +// CoprocessorResponse - official definition 
from kvproto
+message CoprocessorResponse {
+  bytes data = 1;
+  RegionError region_error = 2;
+  LockInfo locked = 3;
+  string other_error = 4;
+  KeyRange range = 5;
+  ExecDetails exec_details = 6;
+  bool is_cache_hit = 7;
+  uint64 cache_last_version = 8;
+  bool can_be_cached = 9;
+  ExecDetailsV2 exec_details_v2 = 11;
+  uint64 latest_buckets_version = 12;
+}
+
+// SelectResponse and related messages
+message SelectResponse {
+  Error error = 1;
+  repeated Row rows = 2;
+  repeated Chunk chunks = 3;
+  repeated Error warnings = 4;
+  repeated int64 output_counts = 5;
+  int64 warning_count = 6;
+  repeated ExecutorExecutionSummary execution_summaries = 8;
+  EncodeType encode_type = 9;
+  repeated int64 ndvs = 10;
+}
+
+message Row {
+  bytes handle = 1;
+  bytes data = 2;
+}
+
+message Error {
+  int32 code = 1;
+  string msg = 2;
+}
+
+message Chunk {
+  bytes rows_data = 3;
+  repeated RowMeta rows_meta = 4;
+}
+
+message RowMeta {
+  int64 handle = 1;
+  int64 length = 2;
+}
+
+message ExecutorExecutionSummary {
+  uint64 time_processed_ns = 1;
+  uint64 num_produced_rows = 2;
+  uint64 num_iterations = 3;
+  uint64 concurrency = 4;
+}
diff --git a/src/common/deltalake_writer.rs b/src/common/deltalake_writer.rs
new file mode 100644
index 0000000..71d6703
--- /dev/null
+++ b/src/common/deltalake_writer.rs
@@ -0,0 +1,1303 @@
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::path::PathBuf;
+
+use deltalake::kernel::{DataType as DeltaDataType, StructField, TableFeatures};
+use {
+    arrow::array::{
+        ArrayRef, BooleanBuilder, Float64Builder, Int16Builder, Int32Builder, Int64Builder,
+        Int8Builder, StringArray, StringBuilder, UInt32Builder, UInt64Builder,
+    },
+    arrow::datatypes::{DataType, Field, Schema, TimeUnit},
+    arrow::record_batch::RecordBatch,
+    deltalake::operations::create::CreateBuilder,
+    deltalake::DeltaOps,
+};
+
+use vector::{
+    aws::{AwsAuthentication, RegionOrEndpoint},
+    sinks::s3_common::service::S3Service,
+};
+
+use vector_lib::event::Event;
+use vector_lib::event::{LogEvent, Value as LogValue};
+use url::Url;
+
+/// Delta table configuration
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DeltaTableConfig {
+    /// Table name
+    pub name: String,
+
+    /// Partition columns
+    pub partition_by: Option<Vec<String>>,
+
+    /// Enable schema evolution
+    pub schema_evolution: Option<bool>,
+
+    /// Standard columns to include
+    pub standard_columns: Option<Vec<String>>,
+}
+
+/// Write configuration
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WriteConfig {
+    /// Batch size for writing
+    #[serde(default = "default_batch_size")]
+    pub batch_size: usize,
+
+    /// Max count of row group in a single parquet file
+    #[serde(default = "default_max_row_group_size")]
+    pub max_row_group_size: usize,
+
+    /// Write timeout in seconds
+    #[serde(default = "default_timeout_secs")]
+    pub timeout_secs: u64,
+
+    /// Compression format
+    #[serde(default = "default_compression")]
+    pub compression: String,
+}
+
+/// Compression format
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum CompressionFormat {
+    /// Snappy compression
+    Snappy,
+    /// Gzip compression
+    Gzip,
+    /// No compression
+    None,
+}
+
+pub const fn default_batch_size() -> usize {
+    1000
+}
+
+pub const fn default_max_row_group_size() -> usize {
+    8192
+}
+
+pub const fn default_timeout_secs() -> u64 {
+    30
+}
+
+pub fn default_compression() -> String {
+    "snappy".to_string()
+}
+
+pub struct StorageOptionsBuilder {
+    /// AWS region or endpoint
+    pub region: Option<RegionOrEndpoint>,
+
+    /// Specifies which addressing style to use
+    pub force_path_style: Option<bool>,
+
+    /// AWS authentication
+    pub auth: AwsAuthentication,
+}
+
+impl StorageOptionsBuilder {
+    pub fn new(
+        region: Option<RegionOrEndpoint>,
+        force_path_style: Option<bool>,
+        auth: AwsAuthentication,
+    ) -> Self {
+        Self {
+            region,
+            force_path_style,
+            auth,
+        }
+    }
+
+    pub async fn build(
+        &self,
+        storage_options: &mut HashMap<String, String>,
+        _service: &S3Service,
+    ) -> vector::Result<()> {
+        info!("=== Applying S3 storage options (aws_s3_upload_file style) ===");
+        debug!("Initial storage_options: {:?}", storage_options);
+
+        // Initialize S3 handlers for Delta Lake
+        deltalake::aws::register_handlers(None);
+        debug!("Delta Lake S3 handlers registered");
+
+        // Set AWS storage options for Delta Lake
+        storage_options.insert("AWS_STORAGE_ALLOW_HTTP".to_string(), "true".to_string());
+
+        // Set region from configuration
+        if let Some(region) = &self.region {
+            // Convert region to string - this will be picked up by Delta Lake
+            if let Some(region_str) = region.region() {
+                storage_options.insert("AWS_REGION".to_string(), region_str.to_string());
+            }
+
+            // Set endpoint if using custom endpoint
+            if let Some(endpoint) = region.endpoint() {
+                storage_options.insert("AWS_ENDPOINT_URL".to_string(), endpoint);
+            }
+        }
+
+        // Set addressing style
+        if let Some(force_path_style) = self.force_path_style {
+            if force_path_style {
+                storage_options.insert("AWS_S3_ADDRESSING_STYLE".to_string(), "path".to_string());
+            } else {
+                storage_options
+                    .insert("AWS_S3_ADDRESSING_STYLE".to_string(), "virtual".to_string());
+            }
+        }
+
+        // Configure AWS authentication for Delta Lake using storage_options
+        // Delta Lake's object_store crate supports multiple authentication methods:
+        // 1. Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN)
+        // 2. IAM Role ARN (AWS_IAM_ROLE_ARN + AWS_IAM_ROLE_SESSION_NAME) - for AssumeRole
+        // 3. AWS Profile (AWS_PROFILE + AWS_SHARED_CREDENTIALS_FILE)
+        // 4. EC2/ECS/Lambda instance roles (automatic)
+        //
+        // This matches aws_s3_upload_file behavior which uses the same AWS SDK credential chain
+        info!("Configuring AWS authentication for Delta Lake (storage_options approach)");
+
+        // Check Vector's auth configuration and map to Delta Lake storage_options
+        match &self.auth {
+            AwsAuthentication::Role {
+                assume_role,
+                external_id,
+                ..
+            } => {
+                // Configure IAM Role ARN for AssumeRole
+                // Delta Lake's object_store will automatically call AssumeRole with these settings
+                info!("Configuring Delta Lake with IAM Role ARN: {}", assume_role);
+                storage_options.insert("AWS_IAM_ROLE_ARN".to_string(), assume_role.clone());
+                storage_options.insert(
+                    "AWS_IAM_ROLE_SESSION_NAME".to_string(),
+                    "vector-deltalake".to_string(),
+                );
+
+                if let Some(ext_id) = external_id {
+                    storage_options.insert("AWS_IAM_ROLE_EXTERNAL_ID".to_string(), ext_id.clone());
+                    info!("✓ Using external ID for role assumption");
+                }
+
+                info!("✓ Delta Lake will use AssumeRole with IAM Role ARN");
+            }
+            AwsAuthentication::AccessKey {
+                access_key_id,
+                secret_access_key,
+                session_token,
+                assume_role,
+                ..
+            } => {
+                // Use static credentials
+                storage_options.insert("AWS_ACCESS_KEY_ID".to_string(), access_key_id.to_string());
+                storage_options.insert(
+                    "AWS_SECRET_ACCESS_KEY".to_string(),
+                    secret_access_key.to_string(),
+                );
+
+                if let Some(token) = session_token {
+                    storage_options.insert("AWS_SESSION_TOKEN".to_string(), token.to_string());
+                }
+
+                if let Some(role_arn) = assume_role {
+                    info!("Using access key with assume role: {}", role_arn);
+                    // Can also configure AssumeRole with base credentials
+                    storage_options.insert("AWS_IAM_ROLE_ARN".to_string(), role_arn.clone());
+                    storage_options.insert(
+                        "AWS_IAM_ROLE_SESSION_NAME".to_string(),
+                        "vector-deltalake".to_string(),
+                    );
+                }
+
+                info!("✓ Delta Lake using static AWS credentials");
+            }
+            AwsAuthentication::File {
+                credentials_file,
+                profile,
+                ..
+            } => {
+                // Use AWS profile
+                storage_options.insert("AWS_PROFILE".to_string(), profile.clone());
+                storage_options.insert(
+                    "AWS_SHARED_CREDENTIALS_FILE".to_string(),
+                    credentials_file.clone(),
+                );
+                info!("✓ Delta Lake using AWS profile: {}", profile);
+            }
+            AwsAuthentication::Default { .. } => {
+                // Use default AWS credential chain (environment variables, instance roles, etc.)
+                // Check environment variables and pass them to Delta Lake
+                info!("Using default AWS credential chain");
+
+                if let Ok(access_key) = std::env::var("AWS_ACCESS_KEY_ID") {
+                    storage_options.insert("AWS_ACCESS_KEY_ID".to_string(), access_key);
+                }
+                if let Ok(secret_key) = std::env::var("AWS_SECRET_ACCESS_KEY") {
+                    storage_options.insert("AWS_SECRET_ACCESS_KEY".to_string(), secret_key);
+                }
+                if let Ok(session_token) = std::env::var("AWS_SESSION_TOKEN") {
+                    storage_options.insert("AWS_SESSION_TOKEN".to_string(), session_token);
+                }
+                if let Ok(profile) = std::env::var("AWS_PROFILE") {
+                    storage_options.insert("AWS_PROFILE".to_string(), profile);
+                }
+
+                // Set default credentials file path if it exists
+                if let Ok(home) = std::env::var("HOME") {
+                    let default_creds_file = format!("{}/.aws/credentials", home);
+                    if std::path::Path::new(&default_creds_file).exists() {
+                        storage_options.insert(
+                            "AWS_SHARED_CREDENTIALS_FILE".to_string(),
+                            default_creds_file,
+                        );
+                    }
+                }
+
+                info!("✓ Delta Lake will use AWS SDK's default credential chain");
+            }
+        }
+
+        info!("✓ AWS authentication configured for Delta Lake via storage_options");
+
+        debug!("=== Completed apply_s3_storage_options ===");
+        debug!("Final storage_options: {:?}", storage_options);
+        info!("✓ S3 storage options applied successfully");
+        // Log final storage options for debugging
+        info!(
+            "Final Delta Lake storage options configured: {:?}",
+            storage_options
+        );
+
+        Ok(())
+    }
+}
+
+/// Delta Lake table writer
+pub struct DeltaLakeWriter {
+    table_path: PathBuf,
+    table_config: DeltaTableConfig,
+    #[allow(dead_code)]
+    write_config: WriteConfig,
+    #[allow(dead_code)]
+    storage_options: Option<HashMap<String, String>>,
+    schema: Option<Schema>,
+    /// Cached schema information from source (table_name -> (field_name -> mysql_type))
+    cached_source_schemas: HashMap<String, HashMap<String, String>>,
+    /// Fixed Arrow schema for this table to ensure consistency across batches
+    fixed_arrow_schema: Option<Schema>,
+}
+
+impl DeltaLakeWriter {
+    /// Create a new Delta Lake writer
+    pub fn new(
+        table_path: PathBuf,
+        table_config: DeltaTableConfig,
+        write_config: WriteConfig,
+        storage_options: Option<HashMap<String, String>>,
+    ) -> Self {
+        // Initialize S3 handlers if this is an S3 path
+        if table_path.to_string_lossy().starts_with("s3://") {
+            deltalake::aws::register_handlers(None);
+            info!(
+                "Registered Delta Lake S3 handlers for path: {}",
+                table_path.display()
+            );
+        }
+
+        Self {
+            table_path,
+            table_config,
+            write_config,
+            storage_options,
+            schema: None,
+            cached_source_schemas: HashMap::new(),
+            fixed_arrow_schema: None,
+        }
+    }
+
+    /// Write events to Delta Lake table
+    pub async fn write_events(
+        &mut self,
+        events: Vec<Event>,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        if events.is_empty() {
+            return Ok(());
+        }
+
+        // Log batch summary
+        info!("Writer processing {} events", events.len());
+
+        // Convert events to Arrow record batch
+        let record_batch = self.events_to_record_batch(events)?;
+
+        // Write to Delta Lake
+        self.write_to_delta_lake(record_batch).await?;
+
+        Ok(())
+    }
+
+    /// Convert Vector events to Arrow record batch
+    fn events_to_record_batch(
+        &mut self,
+        events: Vec<Event>,
+    ) -> Result<RecordBatch, Box<dyn std::error::Error + Send + Sync>> {
+        use std::sync::Arc;
+
+        if events.is_empty() {
+            return Err("No events to convert".into());
+        }
+
+        // Get or create fixed schema for this table
+        let schema = if let Some(ref fixed_schema) = self.fixed_arrow_schema {
+            fixed_schema.clone()
+        } else {
+            // Build fixed schema from first event and cache it
+            let first_event = &events[0];
+            let schema = self.build_fixed_schema(first_event)?;
+            self.fixed_arrow_schema = Some(schema.clone());
+            self.schema = Some(schema.clone());
+            schema
+        };
+
+        // Convert events to columns
+        let mut columns: Vec<ArrayRef> = Vec::new();
+
+        for field in &schema.fields {
+            let column = self.create_column(field, &events)?;
+            columns.push(column);
+        }
+
+        // Create record batch
+        let record_batch = RecordBatch::try_new(Arc::new(schema), columns)?;
+        Ok(record_batch)
+    }
+
+    /// Build a fixed schema from the first event that will be consistent across all batches
+    fn build_fixed_schema(
+        &mut self,
+        event: &Event,
+    ) -> Result<Schema, Box<dyn std::error::Error + Send + Sync>> {
+        if let Event::Log(log_event) = event {
+            let mut fields = Vec::new();
+            let mut added_fields = std::collections::HashSet::new();
+
+            // First, extract and cache the MySQL schema metadata from the event
+            self.extract_and_cache_mysql_schema(log_event);
+
+            // Get table name for schema lookup
+            let table_name = log_event
+                .get("_vector_table")
+                .and_then(|v| v.as_str())
+                .map(|s| s.to_string())
+                .unwrap_or_else(|| "unknown_table".to_string());
+
+            // Build fixed field list based on cached MySQL schema and Vector system fields
+
+            // 1. Add Vector system fields first
+            if let Some(standard_columns) = &self.table_config.standard_columns {
+                for field_name in standard_columns {
+                    fields.push(Field::new(field_name, DataType::Utf8, false));
+                    added_fields.insert(field_name.to_string());
+                }
+            };
+
+            // Add date field for partitioning (derived from _vector_timestamp)
+            fields.push(Field::new("date", DataType::Utf8, false));
+            added_fields.insert("date".to_string());
+
+            // 3.
Add all MySQL data fields from cached schema (in deterministic order) + if let Some(table_schema) = self.cached_source_schemas.get(&table_name) { + // Sort field names to ensure consistent order + let mut field_names: Vec<_> = table_schema.keys().collect(); + field_names.sort(); + + for field_name in field_names { + // Skip if conflicts with Vector system fields or is metadata field + if !added_fields.contains(field_name) + && !field_name.starts_with("_schema_metadata") + { + if let Some(mysql_type) = table_schema.get(field_name) { + let data_type = self.mysql_type_to_arrow_type(mysql_type); + fields.push(Field::new(field_name, data_type, true)); + added_fields.insert(field_name.to_string()); + } + } + } + } else { + // Fallback: add fields from current event if no schema cache available + warn!( + "No cached schema found for table {}, using fields from current event", + table_name + ); + if let Some(iter) = log_event.all_event_fields() { + let mut event_fields: Vec<_> = iter + .map(|(key, value)| (key.as_ref().to_string(), value)) + .collect(); + event_fields.sort_by_key(|(key, _)| key.clone()); + + for (key_str, value) in event_fields { + if !added_fields.contains(&key_str) + && !key_str.starts_with("_schema_metadata") + { + let data_type = + self.get_arrow_type_from_schema(log_event, &key_str, value); + fields.push(Field::new(&key_str, data_type, true)); + added_fields.insert(key_str); + } + } + } + } + + info!( + "Built fixed schema with {} fields for table {}", + fields.len(), + table_name + ); + Ok(Schema::new(fields)) + } else { + Err("Event is not a log event".into()) + } + } + + /// Extract MySQL schema metadata from event and cache it + fn extract_and_cache_mysql_schema(&mut self, log_event: &LogEvent) { + // Get table name for schema cache key + let table_name = log_event + .get("_vector_table") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| "unknown_table".to_string()); + + // Only extract if not already cached + if !self.cached_source_schemas.contains_key(&table_name) { + if let Some(schema_metadata) = log_event.get("_schema_metadata") { + if let Some(schema_obj) = schema_metadata.as_object() { + let mut table_schema = HashMap::new(); + for (field, info) in schema_obj { + if let Some(mysql_type) = info.get("mysql_type").and_then(|v| v.as_str()) { + table_schema.insert(field.to_string(), mysql_type.to_string()); + } + } + + info!( + "Cached MySQL schema for table {} with {} fields", + table_name, + table_schema.len() + ); + self.cached_source_schemas.insert(table_name, table_schema); + } + } + } + } + + /// Get Arrow data type from cached schema or extract from event and cache + fn get_arrow_type_from_schema( + &mut self, + log_event: &LogEvent, + field_name: &str, + value: &LogValue, + ) -> DataType { + // Get table name for schema cache key + let table_name = log_event + .get("_vector_table") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| "unknown_table".to_string()); + + // Check if we already have cached schema for this table + if let Some(table_schema) = self.cached_source_schemas.get(&table_name) { + if let Some(mysql_type) = table_schema.get(field_name) { + let arrow_type = self.mysql_type_to_arrow_type(mysql_type); + // Using cached schema + return arrow_type; + } + } + + // Try to extract and cache schema from this event's _schema_metadata + if let Some(schema_metadata) = log_event.get("_schema_metadata") { + if let Some(schema_obj) = schema_metadata.as_object() { + // Cache the entire schema for this table + let 
mut table_schema = HashMap::new(); + for (field, info) in schema_obj { + if let Some(mysql_type) = info.get("mysql_type").and_then(|v| v.as_str()) { + table_schema.insert(field.to_string(), mysql_type.to_string()); + } + } + + // Schema cached successfully + self.cached_source_schemas + .insert(table_name.clone(), table_schema); + + // Now get the type for current field + if let Some(cached_schema) = self.cached_source_schemas.get(&table_name) { + if let Some(mysql_type) = cached_schema.get(field_name) { + let arrow_type = self.mysql_type_to_arrow_type(mysql_type); + // Using newly cached schema + return arrow_type; + } + } + } + } + + // Fallback to inference if schema not available + self.infer_arrow_type(field_name, value) + } + + /// Convert Arrow Field to Delta StructField + fn arrow_field_to_delta_field(&self, field: &Field) -> StructField { + let delta_type = self.arrow_type_to_delta_type(field.data_type()); + StructField::new(field.name().clone(), delta_type, field.is_nullable()) + } + + /// Convert Arrow DataType to Delta DataType + fn arrow_type_to_delta_type(&self, arrow_type: &DataType) -> DeltaDataType { + match arrow_type { + DataType::Boolean => DeltaDataType::BOOLEAN, + DataType::Int8 => DeltaDataType::BYTE, + DataType::Int16 => DeltaDataType::SHORT, + DataType::Int32 => DeltaDataType::INTEGER, + DataType::Int64 => DeltaDataType::LONG, + DataType::UInt32 => DeltaDataType::INTEGER, // Delta Lake doesn't have unsigned types + DataType::UInt64 => DeltaDataType::LONG, + DataType::Float32 => DeltaDataType::FLOAT, + DataType::Float64 => DeltaDataType::DOUBLE, + DataType::Utf8 => DeltaDataType::STRING, + DataType::LargeUtf8 => DeltaDataType::STRING, + DataType::Binary => DeltaDataType::BINARY, + DataType::LargeBinary => DeltaDataType::BINARY, + DataType::Timestamp(_, _) => DeltaDataType::TIMESTAMP, + DataType::Date32 => DeltaDataType::DATE, + DataType::Date64 => DeltaDataType::DATE, + _ => DeltaDataType::STRING, // Default fallback + } + } + + /// Convert MySQL type to Arrow DataType + fn mysql_type_to_arrow_type(&self, mysql_type: &str) -> DataType { + let mysql_type_lower = mysql_type.to_lowercase(); + + if mysql_type_lower.contains("tinyint(1)") { + DataType::Boolean + } else if mysql_type_lower.contains("bigint") { + if mysql_type_lower.contains("unsigned") { + DataType::UInt64 + } else { + DataType::Int64 + } + } else if mysql_type_lower.contains("tinyint") { + DataType::Int8 + } else if mysql_type_lower.contains("smallint") { + DataType::Int16 + } else if mysql_type_lower.contains("mediumint") || mysql_type_lower.contains("int") { + if mysql_type_lower.contains("unsigned") { + DataType::UInt32 + } else { + DataType::Int32 + } + } else if mysql_type_lower.contains("float") { + DataType::Float32 + } else if mysql_type_lower.contains("double") || mysql_type_lower.contains("real") { + DataType::Float64 + } else if mysql_type_lower.contains("decimal") || mysql_type_lower.contains("numeric") { + // For decimal, we'll use Float64 as a reasonable approximation + DataType::Float64 + } else if mysql_type_lower.contains("timestamp") { + // Use Timestamp for TIMESTAMP columns to enable native TIMESTAMP support + DataType::Timestamp(TimeUnit::Microsecond, None) + } else if mysql_type_lower.contains("datetime") { + // Use Utf8 for DATETIME columns (they don't have timezone info) + DataType::Utf8 + } else if mysql_type_lower.contains("date") { + DataType::Date32 + } else if mysql_type_lower.contains("time") { + DataType::Time64(arrow::datatypes::TimeUnit::Microsecond) + } else if 
mysql_type_lower.contains("longtext") + || mysql_type_lower.contains("mediumtext") + || mysql_type_lower.contains("text") + || mysql_type_lower.contains("varchar") + || mysql_type_lower.contains("char") + || mysql_type_lower.contains("blob") + || mysql_type_lower.contains("longblob") + || mysql_type_lower.contains("mediumblob") + { + // Handle all text and blob types as Utf8 + DataType::Utf8 + } else { + // Default to Utf8 for any unknown types + DataType::Utf8 + } + } + + /// Convert a JSON value to Arrow data type + fn value_to_arrow_type(&self, value: &LogValue) -> DataType { + let data_type = match value { + LogValue::Bytes(_) => DataType::Utf8, + LogValue::Integer(_) => DataType::Int64, + LogValue::Float(_) => DataType::Float64, + LogValue::Boolean(_) => DataType::Boolean, + LogValue::Null => DataType::Utf8, // Default for null values + _ => DataType::Utf8, + }; + + // Converting LogValue to Arrow type + data_type + } + + /// Infer Arrow data type from field name and value + /// This function now relies primarily on _schema_metadata for type inference + fn infer_arrow_type(&self, _field_name: &str, value: &LogValue) -> DataType { + // If we have a concrete value, use its type + if !matches!(value, LogValue::Null) { + return self.value_to_arrow_type(value); + } + + // For null values, we should rely on _schema_metadata + // If no schema metadata is available, default to Utf8 + // This is a fallback that should rarely be used with proper schema metadata + DataType::Utf8 + } + + /// Create a column for a specific field + fn create_column( + &self, + field: &Field, + events: &[Event], + ) -> Result> { + use std::sync::Arc; + + match field.data_type() { + DataType::Utf8 => { + let mut builder = StringBuilder::with_capacity(events.len(), events.len() * 8); + for event in events.iter() { + if let Event::Log(log_event) = event { + let value_opt = match field.name().as_str() { + "_vector_table" => log_event + .get("_vector_table") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + "_vector_source_table" => log_event + .get("_vector_source_table") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + "_vector_source_schema" => log_event + .get("_vector_source_schema") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + "_vector_instance" => log_event + .get("_vector_instance") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + "_vector_timestamp" => log_event + .get("_vector_timestamp") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()), + "date" => { + // Extract date from _vector_timestamp for partitioning + let date_str = log_event + .get("_vector_timestamp") + .and_then(|v| v.as_str()) + .map(|timestamp_str| { + // Parse ISO 8601 timestamp and extract date part + if let Ok(dt) = + chrono::DateTime::parse_from_rfc3339(&timestamp_str) + { + dt.format("%Y-%m-%d").to_string() + } else { + // Fallback: try to extract date from other timestamp formats + if timestamp_str.len() >= 10 { + timestamp_str[..10].to_string() + } else { + chrono::Utc::now().format("%Y-%m-%d").to_string() + } + } + }) + .unwrap_or_else(|| { + // Ensure we always have a date value for consistency + chrono::Utc::now().format("%Y-%m-%d").to_string() + }); + Some(date_str) + } + + _ => { + // For data fields, try exact match first, then case-insensitive match + let field_name = field.name(); + if let Some(value) = log_event.get(field_name.as_str()) { + Some(value.to_string()) + } else { + // Try case-insensitive match for data fields + if let Some(iter) = log_event.all_event_fields() { + let mut found_value = 
None; + for (key, value) in iter { + if key.as_ref().to_lowercase() + == field_name.to_lowercase() + { + found_value = Some(value.to_string()); + break; + } + } + found_value + } else { + None + } + } + } + }; + if let Some(s) = value_opt { + // Trim quotes from string values to avoid query issues + let trimmed = s.trim_matches('"'); + builder.append_value(trimmed); + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::Int64 => { + let mut builder = Int64Builder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + let value_opt = match field.name().as_str() { + "_vector_id" => { + log_event.get("_vector_id").and_then(|v| v.as_integer()) + } + _ => match log_event.get(field.name().as_str()) { + Some(LogValue::Integer(i)) => Some(*i), + Some(LogValue::Bytes(bytes)) => { + // Try to parse string as integer + if let Ok(s) = std::str::from_utf8(bytes.as_ref()) { + s.parse::().ok() + } else { + None + } + } + // Accept null values gracefully + _ => None, + }, + }; + + if let Some(value) = value_opt { + builder.append_value(value); + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::Int32 => { + let mut builder = Int32Builder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + match log_event.get(field.name().as_str()) { + Some(LogValue::Integer(i)) => { + if *i >= i32::MIN as i64 && *i <= i32::MAX as i64 { + builder.append_value(*i as i32); + } else { + builder.append_null(); + } + } + Some(LogValue::Bytes(bytes)) => { + // Try to parse string as integer + if let Ok(s) = std::str::from_utf8(bytes.as_ref()) { + if let Ok(i) = s.parse::() { + builder.append_value(i); + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + } + // Accept null values gracefully + _ => builder.append_null(), + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::UInt32 => { + let mut builder = UInt32Builder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + match log_event.get(field.name().as_str()) { + Some(LogValue::Integer(i)) => { + if *i >= 0 && *i <= u32::MAX as i64 { + builder.append_value(*i as u32); + } else { + builder.append_null(); + } + } + Some(LogValue::Bytes(bytes)) => { + // Try to parse string as unsigned integer + if let Ok(s) = std::str::from_utf8(bytes.as_ref()) { + if let Ok(u) = s.parse::() { + builder.append_value(u); + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + } + // Accept null values gracefully + _ => builder.append_null(), + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::Int16 => { + let mut builder = Int16Builder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + match log_event.get(field.name().as_str()) { + Some(LogValue::Integer(i)) => { + if *i >= i16::MIN as i64 && *i <= i16::MAX as i64 { + builder.append_value(*i as i16); + } else { + builder.append_null(); + } + } + Some(LogValue::Bytes(bytes)) => { + // Try to parse string as integer + if let Ok(s) = std::str::from_utf8(bytes.as_ref()) { + if let Ok(i) = s.parse::() { + builder.append_value(i); + } else { + builder.append_null(); + } + 
} else { + builder.append_null(); + } + } + // Accept null values gracefully + _ => builder.append_null(), + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::Int8 => { + let mut builder = Int8Builder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + match log_event.get(field.name().as_str()) { + Some(LogValue::Integer(i)) => { + if *i >= i8::MIN as i64 && *i <= i8::MAX as i64 { + builder.append_value(*i as i8); + } else { + builder.append_null(); + } + } + Some(LogValue::Bytes(bytes)) => { + // Try to parse string as integer + if let Ok(s) = std::str::from_utf8(bytes.as_ref()) { + if let Ok(i) = s.parse::() { + builder.append_value(i); + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + } + // Accept null values gracefully + _ => builder.append_null(), + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::UInt64 => { + let mut builder = UInt64Builder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + match log_event.get(field.name().as_str()) { + Some(LogValue::Integer(i)) => { + if *i >= 0 { + builder.append_value(*i as u64); + } else { + builder.append_null(); + } + } + Some(LogValue::Bytes(bytes)) => { + // Try to parse string as unsigned integer + if let Ok(s) = std::str::from_utf8(bytes.as_ref()) { + if let Ok(u) = s.parse::() { + builder.append_value(u); + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + } + // Accept null values gracefully + _ => builder.append_null(), + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::Float64 => { + let mut builder = Float64Builder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + match log_event.get(field.name().as_str()) { + Some(LogValue::Float(f)) => builder.append_value((*f).into_inner()), + Some(LogValue::Integer(i)) => builder.append_value(*i as f64), + Some(LogValue::Bytes(bytes)) => { + // Try to parse string as float + if let Ok(s) = std::str::from_utf8(bytes.as_ref()) { + if let Ok(f) = s.parse::() { + builder.append_value(f); + } else { + builder.append_null(); + } + } else { + builder.append_null(); + } + } + _ => builder.append_null(), + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::Boolean => { + let mut builder = BooleanBuilder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + match log_event.get(field.name().as_str()) { + Some(LogValue::Boolean(b)) => builder.append_value(*b), + _ => builder.append_null(), + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + DataType::Timestamp(TimeUnit::Microsecond, None) => { + let mut builder = + arrow::array::TimestampMicrosecondBuilder::with_capacity(events.len()); + for event in events.iter() { + if let Event::Log(log_event) = event { + match log_event.get(field.name().as_str()) { + Some(LogValue::Integer(microseconds)) => { + // Direct microseconds value from TiDB packed time + builder.append_value(*microseconds); + } + Some(LogValue::Bytes(bytes)) => { + // Try to parse timestamp string + if let Ok(s) = std::str::from_utf8(bytes.as_ref()) { + if let Ok(timestamp) = 
chrono::DateTime::parse_from_rfc3339(s) { + let microseconds = timestamp.timestamp_micros(); + builder.append_value(microseconds); + } else if let Ok(naive_dt) = + chrono::NaiveDateTime::parse_from_str( + s, + "%Y-%m-%d %H:%M:%S", + ) + { + let microseconds = naive_dt.and_utc().timestamp_micros(); + builder.append_value(microseconds); + } else { + warn!( + "Failed to parse timestamp '{}' for field '{}'", + s, + field.name() + ); + builder.append_null(); + } + } else { + warn!( + "Failed to decode bytes as UTF-8 for timestamp field '{}'", + field.name() + ); + builder.append_null(); + } + } + Some(LogValue::Null) => { + builder.append_null(); + } + Some(other_value) => { + warn!( + "Timestamp field '{}' received unexpected value type: {:?}", + field.name(), + other_value + ); + builder.append_null(); + } + None => { + builder.append_null(); + } + } + } else { + builder.append_null(); + } + } + let array = builder.finish(); + Ok(Arc::new(array)) + } + _ => { + // Default to Utf8 representation for any other types + let values: Vec> = events + .iter() + .map(|event| { + if let Event::Log(log_event) = event { + log_event.get(field.name().as_str()).map(|v| { + // Trim quotes from string values to avoid query issues + let s = v.to_string(); + s.trim_matches('"').to_string() + }) + } else { + None + } + }) + .collect(); + let array = StringArray::from(values); + Ok(Arc::new(array)) + } + } + } + + /// Write record batch to Delta Lake + async fn write_to_delta_lake( + &self, + record_batch: RecordBatch, + ) -> Result<(), Box> { + // For local paths, ensure table directory exists + let mut table_path_str = self.table_path.to_string_lossy().to_string(); + // Build Delta table URI + if !table_path_str.starts_with("s3://") { + table_path_str = format!("file://{}", table_path_str).to_string(); + std::fs::create_dir_all(&self.table_path)?; + } + let table_uri = Url::parse(table_path_str.as_str()).unwrap(); + info!("Writing to Delta Lake table at: {}", table_uri); + + // Use DeltaOps for improved S3 support, following the successful test pattern + let table_ops = if let Some(storage_options) = &self.storage_options { + info!( + "Using storage options for S3 authentication: {:?}", + storage_options + ); + DeltaOps::try_from_uri_with_storage_options(table_uri.clone(), storage_options.clone()).await? + } else { + info!("No storage options provided, using default credential chain {}", table_uri); + DeltaOps::try_from_uri(table_uri.clone()).await? 
+ }; + + // Try to write directly first (avoid load() which can panic in deltalake-core 0.28.1) + info!("Attempting to write to Delta table at {} {} {}", table_uri, record_batch.num_rows(), self.write_config.max_row_group_size); + let mut write_builder = table_ops.write(vec![record_batch.clone()]); + // Always pass partition columns on write; for new tables this applies partitioning, + // for existing tables it validates consistency + if let Some(partitions) = &self.table_config.partition_by { + write_builder = write_builder.with_partition_columns(partitions.clone()); + } + // Allow protocol/schema update so timestamp ntz writer feature can be enabled when needed + write_builder = write_builder + .with_schema_mode(deltalake::operations::write::SchemaMode::Merge); + + let write_result = write_builder.await; + + match write_result { + Ok(table) => { + info!("✅ Successfully wrote to Delta table at {}", table_uri); + info!("Table version: {:?}", table.version()); + return Ok(()); + } + Err(e) => { + // Check if error is due to table not existing + let error_str = e.to_string(); + if error_str.contains("does not exist") + || error_str.contains("not found") + || error_str.contains("Not a Delta table") + { + info!( + "Table doesn't exist, will create it. Error was: {}", + error_str + ); + // Fall through to table creation below + } else { + // Other error, fail immediately + error!("Failed to write to Delta table: {}", e); + return Err(e.into()); + } + } + } + + // If we reach here, table doesn't exist and needs to be created + // Create new table first + info!( + "Creating new Delta table at {} for table {}, max_row_group_size {}", + table_uri, self.table_config.name, self.write_config.max_row_group_size, + ); + let schema = self.schema.as_ref().ok_or("Schema not available")?; + + let mut create_builder = CreateBuilder::new().with_location(&table_path_str).with_columns( + schema + .fields() + .iter() + .map(|field| self.arrow_field_to_delta_field(field)), + ); + + // Add storage options for S3 + if let Some(storage_options) = &self.storage_options { + create_builder = create_builder.with_storage_options(storage_options.clone()); + } + + // Add partition columns if configured + if let Some(partition_cols) = &self.table_config.partition_by { + info!( + "Setting partition columns for table {}: {:?}", + self.table_config.name, partition_cols + ); + create_builder = create_builder.with_partition_columns(partition_cols.clone()); + } else { + info!( + "No partition columns configured for table {}", + self.table_config.name + ); + } + + create_builder.await?; + info!("Successfully created new Delta table"); + + // Add TimestampWithoutTimezone feature to support Timestamp columns + info!("Adding TimestampWithoutTimezone feature to Delta table"); + let table_ops_for_feature = if let Some(storage_options) = &self.storage_options { + DeltaOps::try_from_uri_with_storage_options(table_uri.clone(), storage_options.clone()).await? + } else { + DeltaOps::try_from_uri(table_uri.clone()).await? 
+ }; + + // Load the table first to ensure state is initialized + match table_ops_for_feature.load().await { + Ok((loaded_table, _stream)) => { + // Now try to add the feature with the loaded table + match DeltaOps::from(loaded_table) + .add_feature() + .with_feature(TableFeatures::TimestampWithoutTimezone) + .with_allow_protocol_versions_increase(true) + .await + { + Ok(_) => { + info!( + "✅ Successfully added TimestampWithoutTimezone feature to Delta table" + ); + } + Err(e) => { + warn!("Failed to add TimestampWithoutTimezone feature: {}. Continuing without it.", e); + } + } + } + Err(e) => { + warn!("Failed to load table for feature addition: {}. Continuing without TimestampWithoutTimezone feature.", e); + } + } + + // Now write the data using DeltaOps - reload the table_ops to get the created table + let table_ops = if let Some(storage_options) = &self.storage_options { + DeltaOps::try_from_uri_with_storage_options(table_uri.clone(), storage_options.clone()).await? + } else { + DeltaOps::try_from_uri(table_uri.clone()).await? + }; + + let mut write_builder = table_ops.write(vec![record_batch]); + if let Some(partitions) = &self.table_config.partition_by { + write_builder = write_builder.with_partition_columns(partitions.clone()); + } + write_builder = write_builder + .with_schema_mode(deltalake::operations::write::SchemaMode::Merge); + let write_result = write_builder.await?; + info!( + "Successfully wrote data to Delta Lake table at {}, version: {:?} max_row_group_size: {}", + table_uri, + write_result.version(), + self.write_config.max_row_group_size, + ); + + Ok(()) + } + + /// Check if Delta table exists at the given URI + #[allow(dead_code)] + async fn table_exists( + &self, + table_uri: &str, + ) -> Result> { + if table_uri.starts_with("s3://") { + // For S3, we need to check if _delta_log exists + // This is a simplified check - in practice you'd use the Delta Lake APIs + // For now, we'll always return false for S3 to trigger table creation logic + Ok(false) + } else { + // For local filesystem + Ok(self.table_path.join("_delta_log").exists()) + } + } + + /// Fallback: write events to JSON files + #[allow(dead_code)] + async fn write_to_json_files( + &self, + events: Vec, + ) -> Result<(), Box> { + use chrono::Utc; + use tokio::fs::OpenOptions; + use tokio::io::AsyncWriteExt; + + // Ensure table directory exists + std::fs::create_dir_all(&self.table_path)?; + + // Create file with timestamp + let timestamp = Utc::now().format("%Y%m%d_%H%M%S_%3f"); + let file_path = self.table_path.join(format!("data_{}.json", timestamp)); + + let mut file = OpenOptions::new() + .create(true) + .append(true) + .open(file_path) + .await?; + + for event in events { + if let Event::Log(log_event) = event { + let json_line = serde_json::to_string(&log_event)?; + file.write_all(json_line.as_bytes()).await?; + file.write_all(b"\n").await?; + } + } + + file.flush().await?; + Ok(()) + } +} diff --git a/src/common/mod.rs b/src/common/mod.rs index c54456d..0fda3a2 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -1,2 +1,4 @@ pub mod checkpointer; +pub mod deltalake_writer; pub mod features; +pub mod topology; diff --git a/src/sources/topsql/topology/fetch/mock/mod.rs b/src/common/topology/fetch/mock/mod.rs similarity index 100% rename from src/sources/topsql/topology/fetch/mock/mod.rs rename to src/common/topology/fetch/mock/mod.rs diff --git a/src/sources/topsql/topology/fetch/mock/pd.rs b/src/common/topology/fetch/mock/pd.rs similarity index 100% rename from 
src/sources/topsql/topology/fetch/mock/pd.rs rename to src/common/topology/fetch/mock/pd.rs diff --git a/src/sources/topsql/topology/fetch/mock/store.rs b/src/common/topology/fetch/mock/store.rs similarity index 100% rename from src/sources/topsql/topology/fetch/mock/store.rs rename to src/common/topology/fetch/mock/store.rs diff --git a/src/sources/topsql/topology/fetch/mod.rs b/src/common/topology/fetch/mod.rs similarity index 97% rename from src/sources/topsql/topology/fetch/mod.rs rename to src/common/topology/fetch/mod.rs index 37e8a39..15a3905 100644 --- a/src/sources/topsql/topology/fetch/mod.rs +++ b/src/common/topology/fetch/mod.rs @@ -10,7 +10,7 @@ mod tikv_nextgen; #[cfg(test)] mod mock; -use crate::sources::topsql::topology::Component; +use crate::common::topology::Component; use snafu::{ResultExt, Snafu}; use std::collections::HashSet; @@ -131,9 +131,10 @@ impl LegacyTopologyFetcher { tls_config: &Option, ) -> Result { let etcd_connect_opt = Self::build_etcd_connect_opt(tls_config)?; - let etcd_client: etcd_client::Client = etcd_client::Client::connect(&[pd_address], etcd_connect_opt) - .await - .context(BuildEtcdClientSnafu)?; + let etcd_client: etcd_client::Client = + etcd_client::Client::connect(&[pd_address], etcd_connect_opt) + .await + .context(BuildEtcdClientSnafu)?; Ok(etcd_client) } @@ -248,8 +249,8 @@ impl TopologyFetcher { }) } else { // In legacy mode, pd_address is required - let pd_address = pd_address.ok_or_else(|| FetchError::ConfigurationError { - message: "PD address is required in legacy mode".to_string() + let pd_address = pd_address.ok_or_else(|| FetchError::ConfigurationError { + message: "PD address is required in legacy mode".to_string(), })?; let fetcher = LegacyTopologyFetcher::new(pd_address, tls_config, proxy_config).await?; Ok(Self { diff --git a/src/sources/topsql/topology/fetch/models.rs b/src/common/topology/fetch/models.rs similarity index 100% rename from src/sources/topsql/topology/fetch/models.rs rename to src/common/topology/fetch/models.rs diff --git a/src/sources/topsql/topology/fetch/pd.rs b/src/common/topology/fetch/pd.rs similarity index 96% rename from src/sources/topsql/topology/fetch/pd.rs rename to src/common/topology/fetch/pd.rs index 15f7481..5c8fbf7 100644 --- a/src/sources/topsql/topology/fetch/pd.rs +++ b/src/common/topology/fetch/pd.rs @@ -3,8 +3,8 @@ use std::collections::HashSet; use snafu::{ResultExt, Snafu}; use vector::http::HttpClient; -use crate::sources::topsql::topology::fetch::{models, utils}; -use crate::sources::topsql::topology::{Component, InstanceType}; +use crate::common::topology::fetch::{models, utils}; +use crate::common::topology::{Component, InstanceType}; #[derive(Debug, Snafu)] pub enum FetchError { diff --git a/src/sources/topsql/topology/fetch/store.rs b/src/common/topology/fetch/store.rs similarity index 96% rename from src/sources/topsql/topology/fetch/store.rs rename to src/common/topology/fetch/store.rs index 90a79f0..51d21b4 100644 --- a/src/sources/topsql/topology/fetch/store.rs +++ b/src/common/topology/fetch/store.rs @@ -3,8 +3,8 @@ use std::collections::HashSet; use snafu::{ResultExt, Snafu}; use vector::http::HttpClient; -use crate::sources::topsql::topology::fetch::{models, utils}; -use crate::sources::topsql::topology::{Component, InstanceType}; +use crate::common::topology::fetch::{models, utils}; +use crate::common::topology::{Component, InstanceType}; #[derive(Debug, Snafu)] pub enum FetchError { diff --git a/src/sources/topsql/topology/fetch/tidb.rs b/src/common/topology/fetch/tidb.rs 
similarity index 97% rename from src/sources/topsql/topology/fetch/tidb.rs rename to src/common/topology/fetch/tidb.rs index b21141e..14eb754 100644 --- a/src/sources/topsql/topology/fetch/tidb.rs +++ b/src/common/topology/fetch/tidb.rs @@ -3,8 +3,8 @@ use std::time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH}; use snafu::{ResultExt, Snafu}; -use crate::sources::topsql::topology::fetch::{models, utils}; -use crate::sources::topsql::topology::{Component, InstanceType}; +use crate::common::topology::fetch::{models, utils}; +use crate::common::topology::{Component, InstanceType}; #[derive(Debug, Snafu)] pub enum FetchError { diff --git a/src/sources/topsql/topology/fetch/tidb_nextgen.rs b/src/common/topology/fetch/tidb_nextgen.rs similarity index 97% rename from src/sources/topsql/topology/fetch/tidb_nextgen.rs rename to src/common/topology/fetch/tidb_nextgen.rs index 654a70f..9e204fd 100644 --- a/src/sources/topsql/topology/fetch/tidb_nextgen.rs +++ b/src/common/topology/fetch/tidb_nextgen.rs @@ -1,4 +1,4 @@ -use crate::sources::topsql::topology::{Component, InstanceType}; +use crate::common::topology::{Component, InstanceType}; use std::collections::HashSet; diff --git a/src/sources/topsql/topology/fetch/tikv_nextgen.rs b/src/common/topology/fetch/tikv_nextgen.rs similarity index 97% rename from src/sources/topsql/topology/fetch/tikv_nextgen.rs rename to src/common/topology/fetch/tikv_nextgen.rs index 2dcbbc2..c1516be 100644 --- a/src/sources/topsql/topology/fetch/tikv_nextgen.rs +++ b/src/common/topology/fetch/tikv_nextgen.rs @@ -1,4 +1,4 @@ -use crate::sources::topsql::topology::{Component, InstanceType}; +use crate::common::topology::{Component, InstanceType}; use std::collections::HashSet; diff --git a/src/sources/topsql/topology/fetch/utils.rs b/src/common/topology/fetch/utils.rs similarity index 100% rename from src/sources/topsql/topology/fetch/utils.rs rename to src/common/topology/fetch/utils.rs diff --git a/src/sources/topsql/topology/mod.rs b/src/common/topology/mod.rs similarity index 100% rename from src/sources/topsql/topology/mod.rs rename to src/common/topology/mod.rs diff --git a/src/main.rs b/src/main.rs index 438b2f1..80f7f62 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,6 +12,10 @@ mod utils; #[cfg(unix)] fn main() -> ExitCode { + rustls::crypto::aws_lc_rs::default_provider() + .install_default() + .expect("Failed to install default crypto provider"); + let exit_code = Application::run(ExtraContext::default()) .code() .unwrap_or(exitcode::UNAVAILABLE) as u8; diff --git a/src/sinks/deltalake/mod.rs b/src/sinks/deltalake/mod.rs new file mode 100644 index 0000000..4ca9a5b --- /dev/null +++ b/src/sinks/deltalake/mod.rs @@ -0,0 +1,342 @@ +use std::collections::HashMap; +use std::path::PathBuf; + +use serde::{Deserialize, Serialize}; +use vector::{ + aws::{AwsAuthentication, RegionOrEndpoint}, + config::{GenerateConfig, SinkConfig, SinkContext}, + sinks::{ + s3_common::{self, config::S3Options, service::S3Service}, + Healthcheck, + }, +}; + +use vector_lib::{ + config::proxy::ProxyConfig, + config::{AcknowledgementsConfig, DataType, Input}, + configurable::configurable_component, + sink::VectorSink, + tls::TlsConfig, +}; + +use crate::common::deltalake_writer::{DeltaTableConfig, WriteConfig}; +use crate::{ + common::deltalake_writer::StorageOptionsBuilder, sinks::deltalake::processor::DeltaLakeSink, +}; + +mod processor; + +/// Configuration for the deltalake sink +#[configurable_component(sink("deltalake"))] +#[derive(Debug, Clone)] +#[serde(deny_unknown_fields)] 
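+// A minimal sketch of how this sink might be configured (hypothetical component
+// names and values; the keys mirror the fields of the struct below):
+//
+//   [sinks.delta_out]
+//   type = "deltalake"
+//   inputs = ["my_source"]
+//   base_path = "./delta-tables"
+//   batch_size = 1000
+//   max_row_group_size = 8192
+//   timeout_secs = 30
+//   compression = "snappy"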
+pub struct DeltaLakeConfig { + /// Base path for Delta Lake tables + pub base_path: String, + + /// Batch size for writing + #[serde(default = "default_batch_size")] + pub batch_size: usize, + + /// Max counter of row group in a single parquet file + #[serde(default = "default_max_row_group_size")] + pub max_row_group_size: usize, + + /// Write timeout in seconds + #[serde(default = "default_timeout_secs")] + pub timeout_secs: u64, + + /// Compression format + #[serde(default = "default_compression")] + pub compression: String, + + /// Storage options for cloud storage + pub storage_options: Option>, + + /// S3 bucket name for remote storage + pub bucket: Option, + + /// S3 options + #[serde(flatten)] + pub options: Option, + + /// AWS region or endpoint + #[serde(flatten)] + pub region: Option, + + /// TLS configuration + pub tls: Option, + + /// AWS authentication + #[serde(default)] + pub auth: AwsAuthentication, + + /// Specifies which addressing style to use + #[serde(default = "default_force_path_style")] + pub force_path_style: Option, + + /// Acknowledgments configuration + #[serde( + default, + deserialize_with = "vector::serde::bool_or_struct", + skip_serializing_if = "vector::serde::is_default" + )] + pub acknowledgements: AcknowledgementsConfig, +} + +/// Compression format +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum CompressionFormat { + /// Snappy compression + Snappy, + /// Gzip compression + Gzip, + /// No compression + None, +} + +pub const fn default_batch_size() -> usize { + 1000 +} + +pub const fn default_timeout_secs() -> u64 { + 30 +} + +pub const fn default_max_row_group_size() -> usize { + 8192 +} + +pub fn default_compression() -> String { + "snappy".to_string() +} + +pub fn default_force_path_style() -> Option { + None +} + +impl GenerateConfig for DeltaLakeConfig { + fn generate_config() -> toml::Value { + toml::Value::try_from(Self { + base_path: "./delta-tables".to_owned(), + batch_size: default_batch_size(), + max_row_group_size: default_max_row_group_size(), + timeout_secs: default_timeout_secs(), + compression: default_compression(), + storage_options: None, + bucket: None, + options: None, + region: None, + tls: None, + auth: AwsAuthentication::default(), + force_path_style: None, + acknowledgements: Default::default(), + }) + .unwrap() + } +} + +#[async_trait::async_trait] +#[typetag::serde(name = "deltalake")] +impl SinkConfig for DeltaLakeConfig { + async fn build(&self, cx: SinkContext) -> vector::Result<(VectorSink, Healthcheck)> { + error!( + "DEBUG: Building Delta Lake sink with bucket: {:?}", + self.bucket + ); + + // Create S3 service if bucket is configured + let s3_service = if self.bucket.is_some() { + error!("DEBUG: Bucket configured, creating S3 service"); + match self.create_service(&cx.proxy).await { + Ok(service) => { + info!("S3 service created successfully"); + Some(service) + } + Err(e) => { + error!( + "Failed to create S3 service, falling back to credential-less mode: {}", + e + ); + // Don't fail completely, but continue without S3Service + // Delta Lake will handle authentication through storage_options + None + } + } + } else { + info!("No bucket configured, using local filesystem"); + None + }; + + info!("Building sink processor"); + let sink = self.build_processor(s3_service.as_ref(), cx).await?; + + info!("Building healthcheck"); + let healthcheck = self.build_healthcheck(s3_service.as_ref())?; + + info!("Delta Lake sink build completed successfully"); + Ok((sink, healthcheck)) + } + + fn input(&self) -> Input { + 
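+ // Only log events are accepted by this sink.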
Input::new(DataType::Log) + } + + fn acknowledgements(&self) -> &AcknowledgementsConfig { + &self.acknowledgements + } +} + +impl DeltaLakeConfig { + async fn build_processor( + &self, + s3_service: Option<&S3Service>, + _cx: SinkContext, + ) -> vector::Result { + let base_path = PathBuf::from(&self.base_path); + + // Tables are discovered dynamically from events + // Default partition configuration will be applied to all tables + let table_configs: Vec = Vec::new(); + + let write_config = WriteConfig { + batch_size: self.batch_size, + max_row_group_size: self.max_row_group_size, + timeout_secs: self.timeout_secs, + compression: self.compression.clone(), + }; + let mut storage_options = self.storage_options.clone().unwrap_or_default(); + + // Add S3 storage options if S3 service is available + if let Some(service) = s3_service { + info!("Applying S3 storage options - S3 service found"); + let _ = StorageOptionsBuilder::new( + self.region.clone(), + self.force_path_style, + self.auth.clone(), + ) + .build(&mut storage_options, service) + .await; + } else { + info!("No S3 service available - using default storage options only"); + } + + let sink = DeltaLakeSink::new( + base_path, + table_configs, + write_config, + Some(storage_options), + ); + + Ok(VectorSink::from_event_streamsink(sink)) + } + + pub async fn create_service(&self, proxy: &ProxyConfig) -> vector::Result { + error!( + "DEBUG: Creating S3 service for Delta Lake with bucket: {:?}", + self.bucket + ); + + // Ensure we have a region configured + let region = self.region.as_ref().cloned().unwrap_or_else(|| { + info!("No region specified, using default us-east-1"); + RegionOrEndpoint::with_region("us-east-1".to_string()) + }); + + info!("Using region: {:?} for S3 service", region); + info!("Using auth: {:?} for S3 service", self.auth); + info!( + "Force path style: {:?}", + self.force_path_style.unwrap_or(true) + ); + + let result = s3_common::config::create_service( + ®ion, + &self.auth, + proxy, + self.tls.as_ref(), + self.force_path_style.unwrap_or(true), + ) + .await; + + match &result { + Ok(_) => info!("S3 service created successfully for Delta Lake"), + Err(e) => { + error!("Failed to create S3 service for Delta Lake: {}", e); + error!("Auth config: {:?}", self.auth); + error!("Region config: {:?}", region); + } + } + + result + } + + fn build_healthcheck(&self, s3_service: Option<&S3Service>) -> vector::Result { + info!( + "Building healthcheck for bucket: {:?}, s3_service: {}, base_path: {}", + self.bucket, + s3_service.is_some(), + self.base_path + ); + + if let (Some(bucket), Some(_service)) = (&self.bucket, s3_service) { + info!( + "S3 configuration detected - using simplified healthcheck for bucket: {}", + bucket + ); + // For Delta Lake S3, we'll use a simplified healthcheck that always passes + // The actual S3 connectivity will be tested during the first write operation + // This avoids credential issues that can occur during Vector startup + let healthcheck = Box::pin(async move { + info!("Delta Lake S3 healthcheck: Skipping detailed S3 connectivity test"); + info!("S3 connectivity will be verified during actual write operations"); + Ok(()) + }); + return Ok(healthcheck); + } + + info!( + "Using local filesystem healthcheck for path: {}", + self.base_path + ); + // Local filesystem healthcheck + let base_path = PathBuf::from(&self.base_path); + + let healthcheck = Box::pin(async move { + // Check if directory exists and is writable + if !base_path.exists() { + if let Err(e) = std::fs::create_dir_all(&base_path) { + 
return Err(format!( + "Failed to create directory {}: {}", + base_path.display(), + e + ) + .into()); + } + } + + // Try to create a test file + let test_file = base_path.join(".healthcheck"); + if let Err(e) = std::fs::write(&test_file, "test") { + return Err(format!("Failed to write to {}: {}", base_path.display(), e).into()); + } + + // Clean up test file + let _ = std::fs::remove_file(test_file); + + Ok(()) + }); + + Ok(healthcheck) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generate_config() { + vector::test_util::test_generate_config::(); + } +} diff --git a/src/sinks/deltalake/processor.rs b/src/sinks/deltalake/processor.rs new file mode 100644 index 0000000..361028f --- /dev/null +++ b/src/sinks/deltalake/processor.rs @@ -0,0 +1,162 @@ +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; + +use futures::{stream::BoxStream, StreamExt}; +use tokio::sync::Mutex; +use vector_lib::event::Event; +use vector_lib::sink::StreamSink; + +use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; + +/// Delta Lake sink processor +pub struct DeltaLakeSink { + base_path: PathBuf, + tables: Vec, + write_config: WriteConfig, + storage_options: Option>, + writers: Arc>>, +} + +impl DeltaLakeSink { + /// Create a new Delta Lake sink + pub fn new( + base_path: PathBuf, + tables: Vec, + write_config: WriteConfig, + storage_options: Option>, + ) -> Self { + Self { + base_path, + tables, + write_config, + storage_options, + writers: Arc::new(Mutex::new(HashMap::new())), + } + } + + /// Process events and write to Delta Lake + async fn process_events( + &self, + events: Vec, + ) -> Result<(), Box> { + if events.is_empty() { + return Ok(()); + } + + // Log batch summary + info!("Sink processing batch: {} events", events.len()); + + // Group events by table (prefer dest_table, fallback to table) + let mut table_events: HashMap> = HashMap::new(); + + for event in events { + if let Event::Log(log_event) = event { + let table_name = log_event + .get("_vector_table") + .and_then(|v| v.as_str()) + .or_else(|| log_event.get("dest_table").and_then(|v| v.as_str())) + .or_else(|| log_event.get("table").and_then(|v| v.as_str())); + if let Some(table_name) = table_name { + table_events + .entry(table_name.to_string()) + .or_insert_with(Vec::new) + .push(Event::Log(log_event)); + } + } + } + + // Write each table's events + for (table_name, table_events) in table_events { + if let Err(e) = self.write_table_events(&table_name, table_events).await { + let error_msg = e.to_string(); + if error_msg.contains("log segment") + || error_msg.contains("Invalid table version") + || error_msg.contains("not found") + || error_msg.contains("No such file or directory") + { + panic!( + "Delta Lake corruption detected for table {}: {}", + table_name, error_msg + ); + } else { + error!("Failed to write events to table {}: {}", table_name, e); + } + } + } + + Ok(()) + } + + /// Write events to a specific table + async fn write_table_events( + &self, + table_name: &str, + events: Vec, + ) -> Result<(), Box> { + // Get or create writer for this table + let mut writers = self.writers.lock().await; + let writer = writers.entry(table_name.to_string()).or_insert_with(|| { + let table_path = if self.base_path.to_string_lossy().starts_with("s3://") { + // For S3 paths, append the table name to the S3 path + PathBuf::from(format!( + "{}/{}", + self.base_path.to_string_lossy(), + table_name + )) + } else { + // For local paths, use join as before + 
self.base_path.join(table_name) + }; + + let table_config = self + .tables + .iter() + .find(|t| t.name == table_name) + .cloned() + .unwrap_or_else(|| DeltaTableConfig { + name: table_name.to_string(), + partition_by: Some(vec!["date".to_string()]), + schema_evolution: Some(true), + standard_columns: Some(vec![ + "_vector_table".to_string(), + "_vector_source_table".to_string(), + "_vector_source_schema".to_string(), + "_vector_instance".to_string(), + "_vector_timestamp".to_string(), + ]), + }); + DeltaLakeWriter::new( + table_path, + table_config, + self.write_config.clone(), + self.storage_options.clone(), + ) + }); + + // Write events + writer.write_events(events).await?; + + Ok(()) + } +} + +#[async_trait::async_trait] +impl StreamSink for DeltaLakeSink { + async fn run(self: Box, input: BoxStream<'_, Event>) -> Result<(), ()> { + info!( + "Delta Lake sink starting with batch_size: {}, timeout_secs: {}", + self.write_config.batch_size, self.write_config.timeout_secs + ); + + let mut input = input.ready_chunks(self.write_config.batch_size); + + while let Some(events) = input.next().await { + if let Err(e) = self.process_events(events).await { + error!("Failed to process events: {}", e); + } + } + + Ok(()) + } +} diff --git a/src/sinks/mod.rs b/src/sinks/mod.rs index f67581c..512011d 100644 --- a/src/sinks/mod.rs +++ b/src/sinks/mod.rs @@ -1,4 +1,6 @@ pub mod aws_s3_upload_file; pub mod azure_blob_upload_file; +pub mod deltalake; pub mod gcp_cloud_storage_upload_file; +pub mod topsql_deltalake; pub mod vm_import; diff --git a/src/sinks/topsql_deltalake/mod.rs b/src/sinks/topsql_deltalake/mod.rs new file mode 100644 index 0000000..06a29ef --- /dev/null +++ b/src/sinks/topsql_deltalake/mod.rs @@ -0,0 +1,343 @@ +use std::collections::HashMap; +use std::path::PathBuf; + +use serde::{Deserialize, Serialize}; +use vector::{ + aws::{AwsAuthentication, RegionOrEndpoint}, + config::{GenerateConfig, SinkConfig, SinkContext}, + sinks::{ + s3_common::{self, config::S3Options, service::S3Service}, + Healthcheck, + }, +}; + +use vector_lib::{ + config::proxy::ProxyConfig, + config::{AcknowledgementsConfig, DataType, Input}, + configurable::configurable_component, + sink::VectorSink, + tls::TlsConfig, +}; + +use crate::common::deltalake_writer::{DeltaTableConfig, WriteConfig}; +use crate::{ + common::deltalake_writer::StorageOptionsBuilder, + sinks::topsql_deltalake::processor::TopSQLDeltaLakeSink, +}; + +mod processor; + +/// Configuration for the topsql deltalake sink +#[configurable_component(sink("topsql_deltalake"))] +#[derive(Debug, Clone)] +#[serde(deny_unknown_fields)] +pub struct TopSQLDeltaLakeConfig { + /// Base path for Delta Lake tables + pub base_path: String, + + /// Batch size for writing + #[serde(default = "default_batch_size")] + pub batch_size: usize, + + /// Max counter of row group in a single parquet file + #[serde(default = "default_max_row_group_size")] + pub max_row_group_size: usize, + + /// Write timeout in seconds + #[serde(default = "default_timeout_secs")] + pub timeout_secs: u64, + + /// Compression format + #[serde(default = "default_compression")] + pub compression: String, + + /// Storage options for cloud storage + pub storage_options: Option>, + + /// S3 bucket name for remote storage + pub bucket: Option, + + /// S3 options + #[serde(flatten)] + pub options: Option, + + /// AWS region or endpoint + #[serde(flatten)] + pub region: Option, + + /// TLS configuration + pub tls: Option, + + /// AWS authentication + #[serde(default)] + pub auth: AwsAuthentication, + + 
/// Specifies which addressing style to use + #[serde(default = "default_force_path_style")] + pub force_path_style: Option, + + /// Acknowledgments configuration + #[serde( + default, + deserialize_with = "vector::serde::bool_or_struct", + skip_serializing_if = "vector::serde::is_default" + )] + pub acknowledgements: AcknowledgementsConfig, +} + +/// Compression format +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum CompressionFormat { + /// Snappy compression + Snappy, + /// Gzip compression + Gzip, + /// No compression + None, +} + +pub const fn default_batch_size() -> usize { + 1000 +} + +pub const fn default_max_row_group_size() -> usize { + 8192 +} + +pub const fn default_timeout_secs() -> u64 { + 30 +} + +pub fn default_compression() -> String { + "snappy".to_string() +} + +pub fn default_force_path_style() -> Option { + None +} + +impl GenerateConfig for TopSQLDeltaLakeConfig { + fn generate_config() -> toml::Value { + toml::Value::try_from(Self { + base_path: "./delta-tables".to_owned(), + batch_size: default_batch_size(), + max_row_group_size: default_max_row_group_size(), + timeout_secs: default_timeout_secs(), + compression: default_compression(), + storage_options: None, + bucket: None, + options: None, + region: None, + tls: None, + auth: AwsAuthentication::default(), + force_path_style: None, + acknowledgements: Default::default(), + }) + .unwrap() + } +} + +#[async_trait::async_trait] +#[typetag::serde(name = "topsql_deltalake")] +impl SinkConfig for TopSQLDeltaLakeConfig { + async fn build(&self, cx: SinkContext) -> vector::Result<(VectorSink, Healthcheck)> { + error!( + "DEBUG: Building Delta Lake sink with bucket: {:?}", + self.bucket + ); + + // Create S3 service if bucket is configured + let s3_service = if self.bucket.is_some() { + error!("DEBUG: Bucket configured, creating S3 service"); + match self.create_service(&cx.proxy).await { + Ok(service) => { + info!("S3 service created successfully"); + Some(service) + } + Err(e) => { + error!( + "Failed to create S3 service, falling back to credential-less mode: {}", + e + ); + // Don't fail completely, but continue without S3Service + // Delta Lake will handle authentication through storage_options + None + } + } + } else { + info!("No bucket configured, using local filesystem"); + None + }; + + info!("Building sink processor"); + let sink = self.build_processor(s3_service.as_ref(), cx).await?; + + info!("Building healthcheck"); + let healthcheck = self.build_healthcheck(s3_service.as_ref())?; + + info!("Delta Lake sink build completed successfully"); + Ok((sink, healthcheck)) + } + + fn input(&self) -> Input { + Input::new(DataType::Log) + } + + fn acknowledgements(&self) -> &AcknowledgementsConfig { + &self.acknowledgements + } +} + +impl TopSQLDeltaLakeConfig { + async fn build_processor( + &self, + s3_service: Option<&S3Service>, + _cx: SinkContext, + ) -> vector::Result { + let base_path = PathBuf::from(&self.base_path); + + // Tables are discovered dynamically from events + // Default partition configuration will be applied to all tables + let table_configs: Vec = Vec::new(); + + let write_config = WriteConfig { + batch_size: self.batch_size, + max_row_group_size: self.max_row_group_size, + timeout_secs: self.timeout_secs, + compression: self.compression.clone(), + }; + let mut storage_options = self.storage_options.clone().unwrap_or_default(); + + // Add S3 storage options if S3 service is available + if let Some(service) = s3_service { + info!("Applying S3 storage options - S3 service found"); + let _ = 
StorageOptionsBuilder::new( + self.region.clone(), + self.force_path_style, + self.auth.clone(), + ) + .build(&mut storage_options, service) + .await; + } else { + info!("No S3 service available - using default storage options only"); + } + + let sink = TopSQLDeltaLakeSink::new( + base_path, + table_configs, + write_config, + Some(storage_options), + ); + + Ok(VectorSink::from_event_streamsink(sink)) + } + + pub async fn create_service(&self, proxy: &ProxyConfig) -> vector::Result { + error!( + "DEBUG: Creating S3 service for Delta Lake with bucket: {:?}", + self.bucket + ); + + // Ensure we have a region configured + let region = self.region.as_ref().cloned().unwrap_or_else(|| { + info!("No region specified, using default us-east-1"); + RegionOrEndpoint::with_region("us-east-1".to_string()) + }); + + info!("Using region: {:?} for S3 service", region); + info!("Using auth: {:?} for S3 service", self.auth); + info!( + "Force path style: {:?}", + self.force_path_style.unwrap_or(true) + ); + + let result = s3_common::config::create_service( + ®ion, + &self.auth, + proxy, + self.tls.as_ref(), + self.force_path_style.unwrap_or(true), + ) + .await; + + match &result { + Ok(_) => info!("S3 service created successfully for Delta Lake"), + Err(e) => { + error!("Failed to create S3 service for Delta Lake: {}", e); + error!("Auth config: {:?}", self.auth); + error!("Region config: {:?}", region); + } + } + + result + } + + fn build_healthcheck(&self, s3_service: Option<&S3Service>) -> vector::Result { + info!( + "Building healthcheck for bucket: {:?}, s3_service: {}, base_path: {}", + self.bucket, + s3_service.is_some(), + self.base_path + ); + + if let (Some(bucket), Some(_service)) = (&self.bucket, s3_service) { + info!( + "S3 configuration detected - using simplified healthcheck for bucket: {}", + bucket + ); + // For Delta Lake S3, we'll use a simplified healthcheck that always passes + // The actual S3 connectivity will be tested during the first write operation + // This avoids credential issues that can occur during Vector startup + let healthcheck = Box::pin(async move { + info!("Delta Lake S3 healthcheck: Skipping detailed S3 connectivity test"); + info!("S3 connectivity will be verified during actual write operations"); + Ok(()) + }); + return Ok(healthcheck); + } + + info!( + "Using local filesystem healthcheck for path: {}", + self.base_path + ); + // Local filesystem healthcheck + let base_path = PathBuf::from(&self.base_path); + + let healthcheck = Box::pin(async move { + // Check if directory exists and is writable + if !base_path.exists() { + if let Err(e) = std::fs::create_dir_all(&base_path) { + return Err(format!( + "Failed to create directory {}: {}", + base_path.display(), + e + ) + .into()); + } + } + + // Try to create a test file + let test_file = base_path.join(".healthcheck"); + if let Err(e) = std::fs::write(&test_file, "test") { + return Err(format!("Failed to write to {}: {}", base_path.display(), e).into()); + } + + // Clean up test file + let _ = std::fs::remove_file(test_file); + + Ok(()) + }); + + Ok(healthcheck) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generate_config() { + vector::test_util::test_generate_config::(); + } +} diff --git a/src/sinks/topsql_deltalake/processor.rs b/src/sinks/topsql_deltalake/processor.rs new file mode 100644 index 0000000..e34c05d --- /dev/null +++ b/src/sinks/topsql_deltalake/processor.rs @@ -0,0 +1,1183 @@ +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; +use 
tokio::sync::MutexGuard; + +use futures::{stream::BoxStream, StreamExt}; +use hashlru::Cache; +use tokio::sync::Mutex; +use tokio::sync::mpsc; +use vector_lib::event::Event; +use vector_lib::event::Value as LogValue; +use vector_lib::sink::StreamSink; + +use crate::common::deltalake_writer::{DeltaLakeWriter, DeltaTableConfig, WriteConfig}; +use crate::sources::topsql::upstream::consts::LABEL_INSTANCE; +use crate::sources::topsql::upstream::consts::LABEL_INSTANCE_TYPE; +use crate::sources::topsql::upstream::consts::METRIC_NAME_CPU_TIME_MS; +use crate::sources::topsql::upstream::consts::METRIC_NAME_STMT_DURATION_COUNT; +use crate::sources::topsql::upstream::consts::METRIC_NAME_STMT_DURATION_SUM_NS; +use crate::sources::topsql::upstream::consts::{ + LABEL_PLAN_DIGEST, LABEL_REGION_ID, + LABEL_SQL_DIGEST, METRIC_NAME_LOGICAL_READ_BYTES, METRIC_NAME_LOGICAL_WRITE_BYTES, + METRIC_NAME_NETWORK_IN_BYTES, METRIC_NAME_NETWORK_OUT_BYTES, METRIC_NAME_READ_KEYS, + METRIC_NAME_STMT_EXEC_COUNT, METRIC_NAME_WRITE_KEYS, +}; + +use lazy_static::lazy_static; +lazy_static! { + static ref TOPSQL_SCHEMA: serde_json::Map = { + let mut schema_info = serde_json::Map::new(); + schema_info.insert( + "timestamps".into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": false + }), + ); + schema_info.insert( + "datetime".into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + LABEL_INSTANCE_TYPE.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + LABEL_INSTANCE.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + "instance_partition_id".into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": false + }), + ); + schema_info.insert( + LABEL_SQL_DIGEST.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + LABEL_PLAN_DIGEST.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + METRIC_NAME_CPU_TIME_MS.into(), + serde_json::json!({ + "mysql_type": "int", + "is_nullable": false + }), + ); + schema_info.insert( + METRIC_NAME_STMT_EXEC_COUNT.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + schema_info.insert( + METRIC_NAME_STMT_DURATION_SUM_NS.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + schema_info.insert( + METRIC_NAME_STMT_DURATION_COUNT.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + schema_info.insert( + METRIC_NAME_NETWORK_IN_BYTES.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + schema_info.insert( + METRIC_NAME_NETWORK_OUT_BYTES.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + // tikv specific columns + schema_info.insert( + METRIC_NAME_READ_KEYS.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + schema_info.insert( + METRIC_NAME_WRITE_KEYS.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + schema_info.insert( + METRIC_NAME_LOGICAL_READ_BYTES.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + schema_info.insert( + METRIC_NAME_LOGICAL_WRITE_BYTES.into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": true + }), + ); + // tikv region specific fields + 
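+ // region_id is kept as a text column so top-region rows fit the shared topsql_data schema.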
schema_info.insert( + LABEL_REGION_ID.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info + }; + static ref INSTANCE_SCHEMA: serde_json::Map = { + let mut schema_info = serde_json::Map::new(); + schema_info.insert( + "timestamps".into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": false + }), + ); + schema_info.insert( + LABEL_INSTANCE_TYPE.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + LABEL_INSTANCE.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + "tidb_cluster_id".into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": true + }), + ); + schema_info.insert( + "keyspace_name".into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": true + }), + ); + schema_info.insert( + "vm_account_id".into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": true + }), + ); + schema_info.insert( + "vm_project_id".into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": true + }), + ); + schema_info + }; + static ref PARTITION_SCHEMA: serde_json::Map = { + let mut schema_info = serde_json::Map::new(); + schema_info.insert( + "timestamps".into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": false + }), + ); + schema_info.insert( + LABEL_INSTANCE_TYPE.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + LABEL_INSTANCE.into(), + serde_json::json!({ + "mysql_type": "text", + "is_nullable": false + }), + ); + schema_info.insert( + "instance_partition_id".into(), + serde_json::json!({ + "mysql_type": "bigint", + "is_nullable": false + }), + ); + schema_info + }; +} + +#[derive(Default, Eq, PartialEq, Clone, Hash)] +struct TiKVExecCountKey { + sql_digest: String, + plan_digest: String, + timestamps: u64, + instance: String, +} + +/// Delta Lake sink processor +pub struct TopSQLDeltaLakeSink { + base_path: PathBuf, + tables: Vec, + write_config: WriteConfig, + storage_options: Option>, + writers: Arc>>, + tikv_exec_count_cache: Arc>>, + tidb_event_cache: Arc>>, + instance_events_counter: std::sync::atomic::AtomicU64, + tx: Arc>>>, +} + +impl TopSQLDeltaLakeSink { + /// Create a new Delta Lake sink + pub fn new( + base_path: PathBuf, + tables: Vec, + write_config: WriteConfig, + storage_options: Option>, + ) -> Self { + // Create a channel with capacity 1 + let (tx, rx) = mpsc::channel(1); + let tx = Arc::new(tx); + + // Create sink instance + let sink = Arc::new(Self { + base_path, + tables, + write_config, + storage_options, + writers: Arc::new(Mutex::new(HashMap::new())), + tikv_exec_count_cache: Arc::new(Mutex::new(Cache::new(100000))), + tidb_event_cache: Arc::new(Mutex::new(Vec::new())), + instance_events_counter: std::sync::atomic::AtomicU64::new(0), + tx: Arc::clone(&tx), + }); + + // Spawn process_events_loop as a separate tokio task to avoid blocking + let sink_clone = Arc::clone(&sink); + tokio::spawn(async move { + sink_clone.process_events_loop(rx).await; + }); + + // Return the sink (Arc::try_unwrap will fail because tokio task holds a reference, + // so we use unsafe to manually get the inner value without decrementing the reference count) + // Safety: We know there's exactly one more reference (the tokio task), + // but we need to return Self, not Arc. 
The tokio task will continue + // to hold its reference, which is safe because TopSQLDeltaLakeSink contains + // only Arc and atomic types that are safe to share. + // We use into_raw to get a raw pointer, then manually reconstruct the value. + unsafe { + let ptr = Arc::into_raw(sink); + // Get a reference to the inner value + let inner_ref = &*ptr; + // Clone the value (TopSQLDeltaLakeSink contains only Arc and atomic types, so cloning is safe) + let inner_value = TopSQLDeltaLakeSink { + base_path: inner_ref.base_path.clone(), + tables: inner_ref.tables.clone(), + write_config: inner_ref.write_config.clone(), + storage_options: inner_ref.storage_options.clone(), + writers: Arc::clone(&inner_ref.writers), + tikv_exec_count_cache: Arc::clone(&inner_ref.tikv_exec_count_cache), + tidb_event_cache: Arc::clone(&inner_ref.tidb_event_cache), + instance_events_counter: std::sync::atomic::AtomicU64::new( + inner_ref.instance_events_counter.load(std::sync::atomic::Ordering::Relaxed) + ), + tx: Arc::clone(&inner_ref.tx), + }; + // Reconstruct the Arc (so the tokio task's reference remains valid) + let _ = Arc::from_raw(ptr); + inner_value + } + } + + #[cfg(test)] + /// Create a new Delta Lake sink for testing, returning both the sink and the receiver + /// The receiver can be used to verify messages sent through the channel + /// Note: process_events_loop is NOT started automatically - test code should handle the receiver + pub fn new_for_test( + base_path: PathBuf, + tables: Vec, + write_config: WriteConfig, + storage_options: Option>, + ) -> (Self, mpsc::Receiver>>) { + // Create a channel with capacity 1 + let (tx, rx): (mpsc::Sender>>, mpsc::Receiver>>) = mpsc::channel(1); + let tx = Arc::new(tx); + + // Create sink instance (without starting process_events_loop) + let sink = Self { + base_path, + tables, + write_config, + storage_options, + writers: Arc::new(Mutex::new(HashMap::new())), + tikv_exec_count_cache: Arc::new(Mutex::new(Cache::new(100))), + tidb_event_cache: Arc::new(Mutex::new(Vec::new())), + instance_events_counter: std::sync::atomic::AtomicU64::new(0), + tx, + }; + + // Return the sink and receiver for testing + (sink, rx) + } + + fn process_tidb_records_events<'a>( + &self, + table_events: &mut HashMap>, + tikv_exec_count_cache: &mut MutexGuard<'a, Cache>, + tidb_event_cache: &mut MutexGuard<'a, Vec>, + ) { + for event in tidb_event_cache.iter_mut() { + if let Event::Log(ref mut log_event) = event { + // Enrich SQL and Plan from cache + let mut tikv_exec_count_key = TiKVExecCountKey::default(); + if let Some(sql_digest) = log_event.get(LABEL_SQL_DIGEST).and_then(|v| v.as_str()) { + tikv_exec_count_key.sql_digest = sql_digest.to_string(); + } + if let Some(plan_digest) = log_event.get(LABEL_PLAN_DIGEST).and_then(|v| v.as_str()) + { + tikv_exec_count_key.plan_digest = plan_digest.to_string(); + } + if let Some(timestamps) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + tikv_exec_count_key.timestamps = timestamps as u64; + } + { + let tikv_exec_map = log_event + .get("topsql_tikv_stmt_exec_count") + .and_then(|v| v.as_object()); + if let Some(tikv_exec_map) = tikv_exec_map { + for (key, value) in tikv_exec_map { + tikv_exec_count_key.instance = key.to_string(); + let count = value.as_integer().unwrap_or(0) as u64; + if count == 0 { + continue; + } + + let handle = tikv_exec_count_cache.get_mut(&tikv_exec_count_key); + if let Some(handle) = handle { + *handle += count; + } else { + tikv_exec_count_cache.insert(tikv_exec_count_key.clone(), count); + } + } + } + } + 
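+ // After folding its per-TiKV exec counts into the cache, forward the TiDB record to the shared topsql_data table.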
table_events + .entry("topsql_data".into()) + .or_insert_with(Vec::new) + .push(Event::Log(log_event.clone())); + } + } + } + + /// Process events from channel and write to Delta Lake + async fn process_events_loop( + &self, + mut rx: mpsc::Receiver>>, + ) { + while let Some(events_vec) = rx.recv().await { + if let Err(e) = self.process_events(events_vec).await { + error!("Failed to process events: {}", e); + } + } + } + + /// Process events and write to Delta Lake + async fn process_events( + &self, + events_vec: Vec>, + ) -> Result<(), Box> { + if events_vec.is_empty() { + return Ok(()); + } + // Group events by source_table + let mut table_events: HashMap> = HashMap::new(); + let mut tikv_exec_count_cache = self.tikv_exec_count_cache.lock().await; + let mut tidb_event_cache = self.tidb_event_cache.lock().await; + let mut tidb_event_cache_cleared = false; + let mut local_instance_counter : u64 = 0; + + for events in events_vec { + for event in events { + if let Event::Log(mut log_event) = event { + let table_name: String; + { + let table_name_ref = log_event.get("source_table").and_then(|v| v.as_str()); + if let Some(table_name_v2) = table_name_ref { + table_name = table_name_v2.to_string(); + } else { + continue; + } + } + match table_name.as_str() { + "tidb_topsql" => { + tidb_event_cache.push(Event::Log(log_event.clone())); + continue; + } + "tikv_topsql" => { + // handle tidb events first, since tikv events may depend on tidb's tikv_exec_count info + if !tidb_event_cache_cleared { + self.process_tidb_records_events( + &mut table_events, + &mut tikv_exec_count_cache, + &mut tidb_event_cache, + ); + tidb_event_cache.clear(); + tidb_event_cache_cleared = true; + } + + let mut tikv_exec_count_key = TiKVExecCountKey::default(); + // Enrich SQL and Plan from cache + if let Some(sql_digest) = + log_event.get(LABEL_SQL_DIGEST).and_then(|v| v.as_str()) + { + tikv_exec_count_key.sql_digest = sql_digest.to_string(); + } + if let Some(plan_digest) = + log_event.get(LABEL_PLAN_DIGEST).and_then(|v| v.as_str()) + { + tikv_exec_count_key.plan_digest = plan_digest.to_string(); + } + if let Some(timestamps) = + log_event.get("timestamps").and_then(|v| v.as_integer()) + { + tikv_exec_count_key.timestamps = timestamps as u64; + } + if let Some(instance) = log_event.get("instance").and_then(|v| v.as_str()) { + tikv_exec_count_key.instance = instance.to_string(); + } + { + let exec_count = tikv_exec_count_cache + .get(&tikv_exec_count_key) + .unwrap_or(&0); + log_event + .insert(METRIC_NAME_STMT_EXEC_COUNT, LogValue::from(*exec_count)); + } + + table_events + .entry("topsql_data".into()) + .or_insert_with(Vec::new) + .push(Event::Log(log_event)); + } + "tikv_topregion" => { + table_events + .entry("topsql_data".into()) + .or_insert_with(Vec::new) + .push(Event::Log(log_event)); + } + "instance" => { + local_instance_counter += 1; + table_events + .entry("topsql_instance".into()) + .or_insert_with(Vec::new) + .push(Event::Log(log_event)); + } + "instance_partition" => { + table_events + .entry("topsql_instance_partition".into()) + .or_insert_with(Vec::new) + .push(Event::Log(log_event)); + } + _ => { + // Ignore other tables + } + } + } + } + } + + self.instance_events_counter.fetch_add(local_instance_counter, std::sync::atomic::Ordering::Relaxed); + if local_instance_counter > 0 { + warn!("InstanceEvents count {}", self.instance_events_counter.load(std::sync::atomic::Ordering::Relaxed)); + } + + // Writeh table's events + for (table_name, mut table_events) in table_events { + 
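+             // For each grouped table: attach schema metadata to the first event so the
+             // Delta Lake writer can create or evolve the table schema, then write the
+             // batch. Errors that look like table corruption abort the process, while
+             // other write failures are only logged and skipped.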
self.add_schema_info(&table_name, &mut table_events); + if let Err(e) = self.write_table_events(&table_name, table_events).await { + let error_msg = e.to_string(); + if error_msg.contains("log segment") + || error_msg.contains("Invalid table version") + || error_msg.contains("not found") + || error_msg.contains("No such file or directory") + { + panic!( + "Delta Lake corruption detected for table {}: {}", + table_name, error_msg + ); + } else { + error!("Failed to write events to table {}: {}", table_name, e); + } + } + } + + Ok(()) + } + + /// Write events to a specific table + fn add_schema_info(&self, source_table_name: &str, events: &mut Vec) { + if events.is_empty() { + return; + } + match source_table_name { + "topsql_data" => { + let first_event = &mut events[0]; + let log = first_event.as_mut_log(); + log.insert( + "_schema_metadata", + serde_json::Value::Object(TOPSQL_SCHEMA.clone()), + ); + } + "topsql_instance" => { + let first_event = &mut events[0]; + let log = first_event.as_mut_log(); + log.insert( + "_schema_metadata", + serde_json::Value::Object(INSTANCE_SCHEMA.clone()), + ); + } + "topsql_instance_partition" => { + let first_event = &mut events[0]; + let log = first_event.as_mut_log(); + log.insert( + "_schema_metadata", + serde_json::Value::Object(PARTITION_SCHEMA.clone()), + ); + } + _ => {} + } + } + + /// Write events to a specific table + async fn write_table_events( + &self, + table_name: &str, + events: Vec, + ) -> Result<(), Box> { + // Get or create writer for this table + let mut writers = self.writers.lock().await; + let writer = writers.entry(table_name.to_string()).or_insert_with(|| { + let table_path = if self.base_path.to_string_lossy().starts_with("s3://") { + // For S3 paths, append the table name to the S3 path + PathBuf::from(format!( + "{}/{}", + self.base_path.to_string_lossy(), + table_name + )) + } else { + // For local paths, use join as before + self.base_path.join(table_name) + }; + + let partition_by = if table_name == "topsql_data" { + Some(vec!["datetime".to_string(), "instance_partition_id".to_string()]) + } else { + Some(vec!["date".to_string()]) + }; + + let table_config = self + .tables + .iter() + .find(|t| t.name == table_name) + .cloned() + .unwrap_or_else(|| DeltaTableConfig { + name: table_name.to_string(), + partition_by, + schema_evolution: Some(true), + standard_columns: None, + }); + DeltaLakeWriter::new( + table_path, + table_config, + self.write_config.clone(), + self.storage_options.clone(), + ) + }); + + // Write events + writer.write_events(events).await?; + + Ok(()) + } +} + +#[async_trait::async_trait] +impl StreamSink for TopSQLDeltaLakeSink { + async fn run(self: Box, input: BoxStream<'_, Event>) -> Result<(), ()> { + // Convert self to Arc for sharing + let sink = Arc::new(*self); + info!( + "Delta Lake sink starting with batch_size: {}", + sink.write_config.batch_size + ); + + // Use the channel sender from the sink + let tx = Arc::clone(&sink.tx); + + let mut input = input.ready_chunks(sink.write_config.batch_size); + let mut events_cache = vec![]; + let mut cur_cached_size = 0; + let mut oldest_timestamp = 0; + let mut latest_timestamp = 0; + while let Some(events) = input.next().await { + let events_count = events.len(); + if events_count == 0 { + continue; + } + + // Extract timestamp from first event + if let Event::Log(ref log_event) = events[0] { + if let Some(timestamps) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + latest_timestamp = timestamps; + if cur_cached_size == 0 { + oldest_timestamp = 
timestamps; + } + } + } + + cur_cached_size += events_count; + events_cache.push(events); + + // Allow max delay to 3 minutes, continue if not ready to send + if events_count + cur_cached_size < sink.write_config.batch_size + && latest_timestamp < oldest_timestamp + 180 { + continue; + } + + // Send events to process_events through channel + let should_drop_on_full = latest_timestamp >= oldest_timestamp + 180; + match tx.try_send(events_cache) { + Ok(_) => { + // Successfully sent, clear the cache + cur_cached_size = 0; + events_cache = vec![]; + } + Err(tokio::sync::mpsc::error::TrySendError::Full(restored_events)) => { + if should_drop_on_full { + // Timeout exceeded, drop the data + error!("Channel full and timeout exceeded, dropping events"); + cur_cached_size = 0; + events_cache = vec![]; + } else { + // Keep in cache for next retry + // Keep cur_cached_size unchanged so we can retry + events_cache = restored_events; + } + } + Err(tokio::sync::mpsc::error::TrySendError::Closed(restored_events)) => { + // Receiver closed, restore events_cache and keep it for next retry + error!("Channel closed, keeping events in cache"); + events_cache = restored_events; + // Keep cur_cached_size unchanged so we can retry + } + } + } + + // When the input stream ends, try to send any remaining cached events + if !events_cache.is_empty() { + // Send remaining events, wait if channel is full + if let Err(_) = tx.send(events_cache).await { + // Receiver closed, log error + error!("Channel closed when flushing remaining events, dropping events"); + } + } + + // Note: We don't drop tx here as it's owned by the sink and may be used by other run() calls + // The channel will be closed when the sink is dropped + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use futures::stream; + use vector_lib::event::LogEvent; + + fn create_test_event(timestamp: i64) -> Event { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + log.insert("source_table", "tidb_topsql"); + log.insert("timestamps", LogValue::from(timestamp)); + log.insert("time", LogValue::from(timestamp)); + event + } + + fn create_test_sink_with_receiver(batch_size: usize) -> (TopSQLDeltaLakeSink, mpsc::Receiver>>) { + TopSQLDeltaLakeSink::new_for_test( + PathBuf::from("/tmp/test"), + vec![], + WriteConfig { + batch_size, + max_row_group_size: 8192, + timeout_secs: 0, + compression: "snappy".to_string(), + }, + None, + ) + } + + #[tokio::test] + async fn test_send_when_batch_size_reached() { + let batch_size = 5; + let (sink, mut rx) = create_test_sink_with_receiver(batch_size); + + // Create events that will reach batch size + let events: Vec = (0..batch_size) + .map(|i| create_test_event(1000 + i as i64)) + .collect(); + + let input_stream = stream::iter(events.clone()).boxed(); + let sink_box = Box::new(sink); + + // Run the function in a task + let run_handle = tokio::spawn(async move { + sink_box.run(input_stream).await + }); + + // Wait a bit for the message to be sent + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Verify that a message was sent through the channel + let received = tokio::time::timeout( + tokio::time::Duration::from_millis(500), + rx.recv() + ).await; + + assert!(received.is_ok(), "Should receive a message from channel"); + if let Ok(Some(events_vec)) = received { + // Verify the message content + // Count total events + let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); + assert_eq!(total_events, batch_size, "Should receive exactly 
batch_size events"); + + // Verify event structure + assert!(!events_vec.is_empty(), "Events vector should not be empty"); + for event_batch in &events_vec { + assert!(!event_batch.is_empty(), "Each event batch should not be empty"); + } + } else { + panic!("Failed to receive message from channel"); + } + + // Wait for run to complete + let _ = run_handle.await; + } + + #[tokio::test] + async fn test_send_when_timeout_reached() { + let batch_size = 100; // Large batch size so we don't reach it + let (sink, mut rx) = create_test_sink_with_receiver(batch_size); + + // Create events with timestamps that exceed timeout (180 seconds) + let oldest_ts = 1000; + let latest_ts = oldest_ts + 181; // Exceeds 180 second timeout + + // Create two events: one at the start, one after timeout + let events = vec![ + create_test_event(oldest_ts), + create_test_event(latest_ts), + ]; + + let input_stream = stream::iter(events.clone()).boxed(); + let sink_box = Box::new(sink); + + // Run the function in a task + let run_handle = tokio::spawn(async move { + sink_box.run(input_stream).await + }); + + // Wait a bit for the message to be sent + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Verify that a message was sent through the channel due to timeout + let received = tokio::time::timeout( + tokio::time::Duration::from_millis(500), + rx.recv() + ).await; + + assert!(received.is_ok(), "Should receive a message from channel due to timeout"); + if let Ok(Some(events_vec)) = received { + // Verify the message content + // Verify events were sent + let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); + assert_eq!(total_events, 2, "Should receive both events (oldest and latest)"); + } else { + panic!("Failed to receive message from channel"); + } + + // Wait for run to complete + let _ = run_handle.await; + } + + #[tokio::test] + async fn test_channel_full_keep_cache_when_not_timeout() { + let batch_size = 5; + let (sink, mut rx) = create_test_sink_with_receiver(batch_size); + + // Create many events to fill the channel (capacity 1) + // The first batch will fill the channel, second batch should be kept in cache + // and retried later + let events: Vec = (0..batch_size * 2) + .map(|i| create_test_event(1000 + i as i64)) // All within timeout window + .collect(); + + let input_stream = stream::iter(events.clone()).boxed(); + let sink_box = Box::new(sink); + + // Run the function in a task + let run_handle = tokio::spawn(async move { + sink_box.run(input_stream).await + }); + + // Don't consume from rx immediately to fill the channel + // Wait a bit for the first message to be sent + // The channel should be full now, and subsequent sends should keep data in cache + // Since we're not consuming, the channel stays full + // After a bit more time, the run should complete + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Now consume the first message + let first_msg = rx.recv().await; + assert!(first_msg.is_some(), "Should receive first message"); + if let Some(events_vec) = first_msg { + // Verify first message content + let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); + assert_eq!(total_events, batch_size, "First message should contain batch_size events"); + } + + // Wait a bit more - the second batch should be sent after channel has space + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Check if second message was sent (data was kept in cache and retried) + let second_msg = tokio::time::timeout( + 
tokio::time::Duration::from_millis(200), + rx.recv() + ).await; + + // The second batch should eventually be sent (kept in cache and retried) + assert!(second_msg.is_ok(), "Should eventually receive second message after retry"); + if let Ok(Some(events_vec)) = second_msg { + // Verify second message content + let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); + assert_eq!(total_events, batch_size, "Second message should contain batch_size events"); + } + + // Wait for run to complete + let _ = run_handle.await; + } + + #[tokio::test] + async fn test_channel_full_drop_when_timeout() { + let batch_size = 5; + let (sink, mut rx) = create_test_sink_with_receiver(batch_size); + + // Create events with timeout: first batch, then events after timeout + let mut events = vec![]; + // First batch at timestamp 1000 + for i in 0..batch_size { + events.push(create_test_event(1000 + i as i64)); + } + // Then an event at 1181 (exceeds timeout) + for i in 0..batch_size { + events.push(create_test_event(1005 + i as i64)); + } + events.push(create_test_event(1186)); + + let input_stream = stream::iter(events.clone()).boxed(); + let sink_box = Box::new(sink); + + // Run the function in a task + let run_handle = tokio::spawn(async move { + sink_box.run(input_stream).await + }); + + // Don't consume from rx to fill the channel + // Wait for first message to be sent + // Channel should be full now + // When the timeout event arrives and channel is full, data should be dropped + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Consume the first message + let first_msg = rx.recv().await; + assert!(first_msg.is_some(), "Should receive first message"); + if let Some(events_vec) = first_msg { + // Verify first message content + let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); + assert_eq!(total_events, batch_size, "First message should contain batch_size events"); + + // Verify timestamps are from the first batch (1000-1004) + for event_batch in &events_vec { + for event in event_batch { + if let Event::Log(ref log_event) = event { + if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + assert!(timestamp >= 1000 && timestamp < 1000 + batch_size as i64, + "First message should contain events from first batch"); + } + } + } + } + } + + // Wait a bit more - the timeout event should have been dropped, not sent + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Check if a second message was sent (it shouldn't be, as data was dropped) + let second_msg = tokio::time::timeout( + tokio::time::Duration::from_millis(200), + rx.recv() + ).await; + // The second message should NOT be sent because data was dropped due to timeout + assert!(second_msg.is_err() || second_msg.unwrap().is_none(), + "Should NOT receive second message as data was dropped due to timeout"); + + // Wait for run to complete + let _ = run_handle.await; + } + + #[tokio::test] + async fn test_not_send_when_batch_size_and_timeout_not_reached() { + let batch_size = 10; + let (sink, mut rx) = create_test_sink_with_receiver(batch_size); + + // Create events that don't reach batch size and don't timeout + let events: Vec = (0..3) + .map(|i| create_test_event(1000 + i)) + .collect(); + + let input_stream = stream::iter(events.clone()).boxed(); + let sink_box = Box::new(sink); + + // Run the function in a task + let run_handle = tokio::spawn(async move { + sink_box.run(input_stream).await + }); + + // Wait for run to complete + let result = 
run_handle.await; + assert!(result.is_ok()); + assert!(result.unwrap().is_ok()); + + // Verify that no message was sent (data doesn't meet send conditions) + // Note: When stream ends, remaining data might be flushed, but with only 3 events + // and batch_size 10, and no timeout, it should not send immediately + // However, when the stream ends, the loop exits and remaining cache might be sent + // Let's check if any message was received + let received = tokio::time::timeout( + tokio::time::Duration::from_millis(200), + rx.recv() + ).await; + + // With the current implementation, when stream ends, remaining cache might be sent + // So we check if a message was received and verify its content + if let Ok(Some(events_vec)) = received { + // Verify the message content + let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); + assert_eq!(total_events, 3, "Should receive the 3 events that were cached"); + } else { + // If no message was received, that's also valid - data wasn't sent + // This depends on implementation details of when remaining cache is flushed + } + } + + #[tokio::test] + async fn test_batch_size_sending_behavior() { + let batch_size = 3; + let (sink, mut rx) = create_test_sink_with_receiver(batch_size); + + // Create exactly batch_size events + let events: Vec = (0..batch_size) + .map(|i| create_test_event(1000 + i as i64)) + .collect(); + + let input_stream = stream::iter(events.clone()).boxed(); + let sink_box = Box::new(sink); + + // Run the function in a task + let run_handle = tokio::spawn(async move { + sink_box.run(input_stream).await + }); + + // Wait a bit for the message to be sent + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Verify that a message was sent through the channel + let received = tokio::time::timeout( + tokio::time::Duration::from_millis(500), + rx.recv() + ).await; + + assert!(received.is_ok(), "Should receive a message from channel"); + if let Ok(Some(events_vec)) = received { + // Verify the message content + // Count total events + let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); + assert_eq!(total_events, batch_size, "Should receive exactly batch_size events"); + + // Verify event timestamps + for event_batch in events_vec { + for (i, event) in event_batch.iter().enumerate() { + if let Event::Log(ref log_event) = event { + if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + assert_eq!(timestamp, 1000 + i as i64, "Event timestamp should match"); + } + } + } + } + } else { + panic!("Failed to receive message from channel"); + } + + // Wait for run to complete + let _ = run_handle.await; + } + + #[tokio::test] + async fn test_timeout_sending_behavior() { + let batch_size = 100; // Large batch size + let (sink, mut rx) = create_test_sink_with_receiver(batch_size); + + // Create events with large time gap (exceeding 180 seconds) + let oldest_ts = 1000; + let latest_ts = 1181; // 181 seconds later, exceeds timeout + let events = vec![ + create_test_event(oldest_ts), + create_test_event(latest_ts), + ]; + + let input_stream = stream::iter(events.clone()).boxed(); + let sink_box = Box::new(sink); + + // Run the function in a task + let run_handle = tokio::spawn(async move { + sink_box.run(input_stream).await + }); + + // Wait a bit for the message to be sent + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Verify that a message was sent through the channel due to timeout + let received = tokio::time::timeout( + 
tokio::time::Duration::from_millis(500), + rx.recv() + ).await; + + assert!(received.is_ok(), "Should receive a message from channel due to timeout"); + if let Ok(Some(events_vec)) = received { + // Verify the message content + // Count total events + let total_events: usize = events_vec.iter().map(|v| v.len()).sum(); + assert_eq!(total_events, 2, "Should receive both events"); + + // Verify event timestamps + let mut timestamps = Vec::new(); + for event_batch in &events_vec { + for event in event_batch { + if let Event::Log(ref log_event) = event { + if let Some(timestamp) = log_event.get("timestamps").and_then(|v| v.as_integer()) { + timestamps.push(timestamp); + } + } + } + } + timestamps.sort(); + assert_eq!(timestamps, vec![oldest_ts, latest_ts], "Should receive events with correct timestamps"); + } else { + panic!("Failed to receive message from channel"); + } + + // Wait for run to complete + let _ = run_handle.await; + } + + #[tokio::test] + async fn test_multiple_batches() { + let batch_size = 3; + let (sink, mut rx) = create_test_sink_with_receiver(batch_size); + + // Create multiple batches worth of events + let total_events = batch_size * 3; + let events: Vec = (0..total_events) + .map(|i| create_test_event(1000 + i as i64)) + .collect(); + + let input_stream = stream::iter(events.clone()).boxed(); + let sink_box = Box::new(sink); + + // Run the function in a task + let run_handle = tokio::spawn(async move { + sink_box.run(input_stream).await + }); + + // Collect all messages from the channel + let mut received_messages = Vec::new(); + let expected_batches = (total_events + batch_size - 1) / batch_size; // Ceiling division + + // Wait for all batches to be sent + for _ in 0..expected_batches { + let received = tokio::time::timeout( + tokio::time::Duration::from_millis(500), + rx.recv() + ).await; + if let Ok(Some(msg)) = received { + received_messages.push(msg); + } else { + break; + } + } + + // Verify we received the expected number of batches + assert!(received_messages.len() >= 1); + // Verify total events received + let total_received: usize = received_messages.iter() + .map(|events_vec| events_vec.iter().map(|v| v.len()).sum::()) + .sum(); + assert_eq!(total_received, total_events, "Should receive all events across batches"); + + // Verify each message + for events_vec in &received_messages { + assert!(!events_vec.is_empty(), "Each batch should contain events"); + } + + // Wait for run to complete + let _ = run_handle.await; + } +} diff --git a/src/sources/keyviz.rs b/src/sources/keyviz.rs index 2306a4a..014e529 100644 --- a/src/sources/keyviz.rs +++ b/src/sources/keyviz.rs @@ -18,7 +18,7 @@ use vector_lib::{ tls::TlsConfig, }; -use super::topsql::topology::{InstanceType, TopologyFetcher}; +use crate::common::topology::{InstanceType, TopologyFetcher}; use crate::utils::http::build_reqwest_client; const DEFAULT_MAX_REGIONS_PER_PD_REQUEST: usize = 51200; diff --git a/src/sources/mocked_topsql/controller.rs b/src/sources/mocked_topsql/controller.rs new file mode 100644 index 0000000..578178a --- /dev/null +++ b/src/sources/mocked_topsql/controller.rs @@ -0,0 +1,539 @@ +use chrono::{DateTime, Timelike}; +use ordered_float::NotNan; +use serde_json::Value; +use vector_lib::event::{Event, KeyString, LogEvent, Value as LogValue}; +use tracing::instrument::Instrument; +use futures::StreamExt; +use tokio::time; +use tokio_stream::wrappers::IntervalStream; +use vector::shutdown::ShutdownSignal; +use crate::sources::mocked_topsql::shutdown::{pair, ShutdownNotifier, 
ShutdownSubscriber}; +use vector::{internal_events::StreamClosedError, SourceSender}; +use std::time::Duration; +use rand::Rng; +use rand::distr::{Alphanumeric, Uniform, StandardUniform}; +use std::collections::BTreeMap; +use crc32fast::Hasher as Crc32Hasher; + +const SQL_CONSTANT: &str = "SELECT + `tbl_test_001`.`column0`, + `tbl_test_001`.`column1`, + `tbl_test_001`.`column2`, + `tbl_test_001`.`column3`, + `tbl_test_001`.`column4`, + `tbl_test_001`.`column5`, + `tbl_test_001`.`column6`, + `tbl_test_001`.`column7`, + `tbl_test_001`.`column8`, + `tbl_test_001`.`column9`, + `tbl_test_001`.`column10`, + `tbl_test_001`.`column11`, + `tbl_test_001`.`column12`, + `tbl_test_001`.`column13`, + `tbl_test_001`.`column14`, + `tbl_test_001`.`column15`, + `tbl_test_001`.`column16`, + `tbl_test_001`.`column17`, + `tbl_test_001`.`column18`, + `tbl_test_001`.`column19`, + `tbl_test_001`.`column20`, + `tbl_test_001`.`column21`, + `tbl_test_001`.`column22`, + `tbl_test_001`.`column23`, + `tbl_test_001`.`column24`, + `tbl_test_001`.`column25`, + `tbl_test_001`.`column26`, + `tbl_test_001`.`column27`, + `tbl_test_001`.`column28`, + `tbl_test_001`.`column29`, + `tbl_test_001`.`column30`, + `tbl_test_001`.`column31`, + `tbl_test_001`.`column32`, + `tbl_test_001`.`column33`, + `tbl_test_001`.`column34`, + `tbl_test_001`.`column35`, + `tbl_test_001`.`column36`, + `tbl_test_001`.`column37`, + `tbl_test_001`.`column38`, + `tbl_test_001`.`column39`, + `tbl_test_001`.`column40`, + `tbl_test_001`.`column41`, + `tbl_test_001`.`column42`, + `tbl_test_001`.`column43`, + `tbl_test_001`.`column44`, + `tbl_test_001`.`column45`, + `tbl_test_001`.`column46`, + `tbl_test_001`.`column47`, + `tbl_test_001`.`column48`, + `tbl_test_001`.`column49`, + `tbl_test_001`.`column50`, + `tbl_test_001`.`column51`, + `tbl_test_001`.`column52`, + `tbl_test_001`.`column53`, + `tbl_test_001`.`column54`, + `tbl_test_001`.`column55`, + `tbl_test_001`.`column56`, + `tbl_test_001`.`column57`, + `tbl_test_001`.`column58`, + `tbl_test_001`.`column59`, + `tbl_test_001`.`column60`, + `tbl_test_001`.`column61`, + `tbl_test_001`.`column62`, + `tbl_test_001`.`column63`, + `tbl_test_001`.`column64`, + `tbl_test_001`.`column65`, + `tbl_test_001`.`column66` +FROM + `tbl_test_001` +WHERE + `column0` = ? + AND `column1` = ? 
+LIMIT + ?"; + +const PLAN_CONSTANT: &str = " Projection root db_test_0001.tbl_test_001.column0, db_test_0001.tbl_test_001.column1, db_test_0001.tbl_test_001.column2, db_test_0001.tbl_test_001.column3, db_test_0001.tbl_test_001.column4, db_test_0001.tbl_test_001.column5, db_test_0001.tbl_test_001.column6, db_test_0001.tbl_test_001.column7, db_test_0001.tbl_test_001.column8, db_test_0001.tbl_test_001.column9, db_test_0001.tbl_test_001.column10, db_test_0001.tbl_test_001.column11, db_test_0001.tbl_test_001.column12, db_test_0001.tbl_test_001.column13, db_test_0001.tbl_test_001.column14, db_test_0001.tbl_test_001.column15, db_test_0001.tbl_test_001.column16, db_test_0001.tbl_test_001.column17, db_test_0001.tbl_test_001.column18, db_test_0001.tbl_test_001.column19, db_test_0001.tbl_test_001.column20, db_test_0001.tbl_test_001.column21, db_test_0001.tbl_test_001.column22, db_test_0001.tbl_test_001.column23, db_test_0001.tbl_test_001.column24, db_test_0001.tbl_test_001.column25, db_test_0001.tbl_test_001.column26, db_test_0001.tbl_test_001.column27, db_test_0001.tbl_test_001.column28, db_test_0001.tbl_test_001.column29, db_test_0001.tbl_test_001.column30, db_test_0001.tbl_test_001.column31, db_test_0001.tbl_test_001.column32, db_test_0001.tbl_test_001.column33, db_test_0001.tbl_test_001.column34, db_test_0001.tbl_test_001.column35, db_test_0001.tbl_test_001.column36, db_test_0001.tbl_test_001.column37, db_test_0001.tbl_test_001.column38, db_test_0001.tbl_test_001.column39, db_test_0001.tbl_test_001.column40, db_test_0001.tbl_test_001.column41, db_test_0001.tbl_test_001.column42, db_test_0001.tbl_test_001.column43, db_test_0001.tbl_test_001.column44, db_test_0001.tbl_test_001.column45, db_test_0001.tbl_test_001.column46, db_test_0001.tbl_test_001.column47, db_test_0001.tbl_test_001.column48, db_test_0001.tbl_test_001.column49, db_test_0001.tbl_test_001.column50, db_test_0001.tbl_test_001.column51, db_test_0001.tbl_test_001.column52, db_test_0001.tbl_test_001.column53, db_test_0001.tbl_test_001.column54, db_test_0001.tbl_test_001.column55, db_test_0001.tbl_test_001.column56, db_test_0001.tbl_test_001.column57, db_test_0001.tbl_test_001.column58, db_test_0001.tbl_test_001.column59, db_test_0001.tbl_test_001.column60, db_test_0001.tbl_test_001.column61, db_test_0001.tbl_test_001.column62, db_test_0001.tbl_test_001.column63, db_test_0001.tbl_test_001.column64, db_test_0001.tbl_test_001.column65, db_test_0001.tbl_test_001.column66 + └─Limit root + └─Point_Get root table:tbl_test_001, index:udx_column0_useridx_column1(column0, column1)"; + +fn generate_random_int() -> Vec { + let mut rng = rand::rng(); + let arr1: [i32; 1000] = rng.random(); + arr1.to_vec() +} + +fn generate_random_bigint() -> Vec { + let mut rng = rand::rng(); + let arr1: [i64; 1000] = rng.random(); + arr1.to_vec() +} + +fn generate_random_string(num_strings: i32, string_length: usize) -> Vec { + let random_strings: Vec = (0..num_strings) + .map(|_| { + rand::thread_rng() // 获取线程局部的随机数生成器 + .sample_iter(&Alphanumeric) // 从 Alphanumeric 分布中创建迭代器 + .take(string_length) // 取指定长度的字符 + .map(char::from) // 将 u8 转换为 char + .collect() // 收集成 String + }) + .collect(); // 收集成 Vec + random_strings +} +fn generate_random_digest() -> Vec { + generate_random_string(100000, 64) +} + +fn create_event_for_instance_partition(timestamp: i64, tidb_number: usize, tikv_number: usize, instance_part: usize) -> (Vec, Vec, Vec) { + let mut events = vec![]; + let mut tidb_instance_partition_vec = vec![]; + let mut tikv_instance_partition_vec = vec![]; + for i 
in 0..tidb_number { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "instance_partition"); + log.insert("timestamps", LogValue::from(timestamp)); + log.insert("instance_type", "tidb"); + log.insert("instance", format!("127.0.1.{}", i)); + + // Calculate CRC32 for (instance, instance_type) + let instance_key = format!("127.0.1.{}_tidb", i); + let mut hasher = Crc32Hasher::new(); + hasher.update(instance_key.as_bytes()); + let crc_value = hasher.finalize(); + + // Calculate partition by taking modulo + // Use max(1, partition_number) to avoid division by zero + let partition_mod = if instance_part == 0 { 1 } else { instance_part }; + let calculated_partition = (crc_value % partition_mod as u32) as u32; + tidb_instance_partition_vec.push(calculated_partition); + log.insert("instance_partition_id", LogValue::from(calculated_partition)); + events.push(event); + } + for i in 0..tikv_number { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + log.insert("source_table", "instance_partition"); + log.insert("timestamps", LogValue::from(timestamp)); + log.insert("instance_type", "tikv"); + log.insert("instance", format!("127.0.0.{}", i)); + // Calculate CRC32 for (instance, instance_type) + let instance_key = format!("127.0.0.{}_tikv", i); + let mut hasher = Crc32Hasher::new(); + hasher.update(instance_key.as_bytes()); + let crc_value = hasher.finalize(); + + // Calculate partition by taking modulo + // Use max(1, partition_number) to avoid division by zero + let partition_mod = if instance_part == 0 { 1 } else { instance_part }; + let calculated_partition = (crc_value % partition_mod as u32) as u32; + tikv_instance_partition_vec.push(calculated_partition); + log.insert("instance_partition_id", LogValue::from(calculated_partition)); + events.push(event); + } + (events, tidb_instance_partition_vec, tikv_instance_partition_vec) +} + +fn create_event_for_tidb_instance(index : usize) -> Event { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "instance"); + log.insert("timestamps", LogValue::from(chrono::Utc::now().timestamp())); + log.insert("time", LogValue::from(chrono::Utc::now().timestamp())); + log.insert("instance_type", "tidb"); + log.insert("instance", format!("10.2.12.{}", index)); + event +} +fn create_event_for_tikv_instance(index : usize) -> Event { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "instance"); + log.insert("timestamps", LogValue::from(chrono::Utc::now().timestamp())); + log.insert("time", LogValue::from(chrono::Utc::now().timestamp())); + log.insert("instance_type", "tikv"); + log.insert("instance", format!("10.2.12.{}", index)); + event +} + +/// Create a Vector event from tidb sql meta +fn create_event_for_tidb_sql_plan_meta(sql_digest: &Vec, plan_digest: &Vec, random_str_vec: &Vec, offset: usize) -> (Vec, Vec) { + let mut sql_events = vec![]; + for index in 0..5000 { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tidb_sql_meta"); + log.insert("sql_digest", 
LogValue::from(sql_digest[offset*5000+index].to_string())); + log.insert("normalized_sql", LogValue::from(SQL_CONSTANT.to_string().replace("tbl_test_001", random_str_vec[offset*5000+index].as_str()))); + sql_events.push(event); + } + let mut plan_events = vec![]; + for index in 0..5000 { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tidb_plan_meta"); + log.insert("plan_digest", LogValue::from(plan_digest[offset*5000+index].to_string())); + log.insert("normalized_plan", LogValue::from(PLAN_CONSTANT.to_string().replace("tbl_test_001", random_str_vec[offset*5000+index].as_str()))); + plan_events.push(event); + } + (sql_events, plan_events) +} +/// Create a Vector event from table data +fn create_event_for_tidb_sql(index: usize, timestamp: i64, sql_digest_vec: &Vec, plan_digest_vec: &Vec, + cpu_time_vec: &Vec, tikv_exec_count_vec: &Vec, + stmt_exec_count_vec: &Vec, stmt_duration_sum_vec: &Vec, + stmt_duration_count_vec: &Vec, top_n: usize, instance_part: usize) -> Vec { + let mut events = vec![]; + for i in 0..(top_n + top_n / 2) { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tidb_topsql"); + log.insert("timestamps", LogValue::from(timestamp)); + log.insert("time", LogValue::from(timestamp)); + // Calculate datetime string: %Y-%m-%d %H where %H is time slot index (0-3) + // Skip current event if timestamp conversion fails + let dt = match DateTime::from_timestamp(timestamp, 0) { + Some(dt) => dt, + None => continue, + }; + let naive_dt = dt.naive_utc(); + let date = naive_dt.date(); + let hour = naive_dt.hour(); + // Calculate time slot index: 0-6=0, 6-12=1, 12-18=2, 18-24=3 + let time_slot = (hour / 6) as u32; + let datetime_str = format!("{}-{}", date.format("%Y-%m-%d"), time_slot); + log.insert("datetime", LogValue::from(datetime_str)); + log.insert("instance_type", "tidb"); + log.insert("instance", format!("127.0.1.{}", index)); + log.insert("instance_partition_id", LogValue::from(instance_part)); + log.insert("sql_digest", sql_digest_vec[i+index].clone()); + log.insert("plan_digest", plan_digest_vec[i+index].clone()); + log.insert("topsql_cpu_time_ms", LogValue::from(cpu_time_vec[i])); + log.insert("topsql_stmt_exec_count", LogValue::from(stmt_exec_count_vec[i])); + log.insert("topsql_stmt_duration_sum_ns", LogValue::from(stmt_duration_sum_vec[i])); + log.insert("topsql_stmt_duration_count", LogValue::from(stmt_duration_count_vec[i])); + let mut tikv_exec_count = BTreeMap::::new(); + tikv_exec_count.insert( + KeyString::from(format!("127.0.0.{}", index+i)), + LogValue::from(tikv_exec_count_vec[i]), + ); + tikv_exec_count.insert( + KeyString::from(format!("127.0.0.{}", index+i+1)), + LogValue::from(tikv_exec_count_vec[i]), + ); + log.insert( + "topsql_tikv_stmt_exec_count", + LogValue::Object(tikv_exec_count), + ); + events.push(event); + } + events +} + +/// Create a Vector event from table data +fn create_event_for_tikv_sql( + index: usize, timestamp: i64, sql_digest_vec: &Vec, plan_digest_vec: &Vec, + cpu_time_vec: &Vec, read_keys_vec: &Vec, + network_in_vec: &Vec, network_out_vec: &Vec, + logical_read_vec: &Vec, logical_write_vec: &Vec, top_n: usize, instance_part: usize) -> Vec { + let mut events = vec![]; + for i in 0..(top_n + top_n) { + let mut event = Event::Log(LogEvent::default()); + let log = 
event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tikv_topsql"); + log.insert("timestamps", LogValue::from(timestamp)); + log.insert("time", LogValue::from(timestamp)); + // Calculate datetime string: %Y-%m-%d %H where %H is time slot index (0-3) + // Skip current event if timestamp conversion fails + let dt = match DateTime::from_timestamp(timestamp, 0) { + Some(dt) => dt, + None => continue, + }; + let naive_dt = dt.naive_utc(); + let date = naive_dt.date(); + let hour = naive_dt.hour(); + // Calculate time slot index: 0-6=0, 6-12=1, 12-18=2, 18-24=3 + let time_slot = (hour / 6) as u32; + let datetime_str = format!("{}-{}", date.format("%Y-%m-%d"), time_slot); + log.insert("datetime", LogValue::from(datetime_str)); + log.insert("instance_type", "tikv"); + log.insert("instance", format!("127.0.0.{}", index)); + log.insert("instance_partition_id", LogValue::from(instance_part)); + log.insert("sql_digest", sql_digest_vec[i+index].clone()); + log.insert("plan_digest", plan_digest_vec[i+index].clone()); + log.insert("topsql_cpu_time_ms", LogValue::from(cpu_time_vec[i])); + log.insert("topsql_read_keys", LogValue::from(read_keys_vec[i])); + log.insert("topsql_write_keys", LogValue::from(0)); + log.insert( + "topsql_network_in_bytes", + LogValue::from(network_in_vec[i]), + ); + log.insert( + "topsql_network_out_bytes", + LogValue::from(network_out_vec[i]), + ); + log.insert( + "topsql_logical_read_bytes", + LogValue::from(logical_read_vec[i]), + ); + log.insert( + "topsql_logical_write_bytes", + LogValue::from(logical_write_vec[i]), + ); + events.push(event); + } + events +} + +/// Create a Vector event from table data +fn create_event_for_tikv_region( + index: usize, timestamp: i64, region_id_vec: &Vec, + cpu_time_vec: &Vec, read_keys_vec: &Vec, + network_in_vec: &Vec, network_out_vec: &Vec, + logical_read_vec: &Vec, logical_write_vec: &Vec, top_n: usize, instance_part: usize) -> Vec { + let mut events = vec![]; + for i in 0..(top_n + top_n) { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tikv_topregion"); + log.insert("timestamps", LogValue::from(timestamp)); + log.insert("time", LogValue::from(timestamp)); + // Calculate datetime string: %Y-%m-%d %H where %H is time slot index (0-3) + // Skip current event if timestamp conversion fails + let dt = match DateTime::from_timestamp(timestamp, 0) { + Some(dt) => dt, + None => continue, + }; + let naive_dt = dt.naive_utc(); + let date = naive_dt.date(); + let hour = naive_dt.hour(); + // Calculate time slot index: 0-6=0, 6-12=1, 12-18=2, 18-24=3 + let time_slot = (hour / 6) as u32; + let datetime_str = format!("{}-{}", date.format("%Y-%m-%d"), time_slot); + log.insert("datetime", LogValue::from(datetime_str)); + log.insert("instance_type", "tikv"); + log.insert("instance", format!("127.0.0.{}", index)); + log.insert("instance_partition_id", LogValue::from(instance_part)); + log.insert("region_id", LogValue::from(region_id_vec[i])); + log.insert("topsql_cpu_time_ms", LogValue::from(cpu_time_vec[i])); + log.insert("topsql_read_keys", LogValue::from(read_keys_vec[i])); + log.insert("topsql_write_keys", LogValue::from(0)); + log.insert( + "topsql_network_in_bytes", + LogValue::from(network_in_vec[i]), + ); + log.insert( + "topsql_network_out_bytes", + LogValue::from(network_out_vec[i]), + ); + log.insert( + "topsql_logical_read_bytes", + 
LogValue::from(logical_read_vec[i]), + ); + log.insert( + "topsql_logical_write_bytes", + LogValue::from(logical_write_vec[i]), + ); + events.push(event); + } + events +} + +pub struct Controller { + shutdown_notifier: ShutdownNotifier, + shutdown_subscriber: ShutdownSubscriber, + top_n: usize, + downsampling_interval: u32, + tidb_number: usize, + tikv_number: usize, + extra_column_number: u32, + instance_part_number: usize, + out: SourceSender, +} + +impl Controller { + pub async fn new( + top_n: usize, + downsampling_interval: u32, + tidb_number: usize, + tikv_number: usize, + extra_column_number: u32, + instance_part_number: usize, + out: SourceSender, + ) -> vector::Result { + let (shutdown_notifier, shutdown_subscriber) = pair(); + Ok(Self { + shutdown_notifier, + shutdown_subscriber, + top_n, + downsampling_interval, + tidb_number, + tikv_number, + extra_column_number, + instance_part_number, + out, + }) + } + + pub async fn run(mut self, mut shutdown: ShutdownSignal) { + tokio::select! { + _ = self.run_loop() => {}, + _ = &mut shutdown => {}, + } + + info!("TopSQL PubSub Controller is shutting down."); + self.shutdown_all_components().await; + } + + async fn run_loop(&mut self) { + let mut batch = vec![]; + let (mut tidb_events, tidb_instance_partition_vec, tikv_instance_partition_vec) = create_event_for_instance_partition(chrono::Utc::now().timestamp(), self.tidb_number, self.tikv_number, self.instance_part_number); + batch.append(&mut tidb_events); + if self.out.send_batch(batch).await.is_err() { + info!(message = "Downstream is closed, stopping TopSQL source."); + return; + } + let sql_random_vec = generate_random_string(100000, 10); + let mut tick_stream = IntervalStream::new(time::interval(Duration::from_secs(1))); + let mut worker_stream = IntervalStream::new(time::interval(Duration::from_secs(60))); + let mut instance_stream = IntervalStream::new(time::interval(Duration::from_secs(30))); + let mut trigger_counter : u64 = 0; + let mut instance_events_counter : u64 = 0; + loop { + tokio::select! 
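+             // Three timers drive the mock source: worker_stream (60s) regenerates digests
+             // and emits sql/plan meta plus tidb/tikv/region topsql batches, instance_stream
+             // (30s) emits instance heartbeat events, and tick_stream (1s) only adds a short
+             // sleep as pacing.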
{ + _ = worker_stream.next() => { + let timestamp = chrono::Utc::now().timestamp(); + let sql_digest_vec = generate_random_digest(); + let int_vec_1 = generate_random_int(); + let int_vec_2 = generate_random_int(); + let int_vec_3 = generate_random_int(); + let bigint_vec_1 = generate_random_bigint(); + let bigint_vec_2 = generate_random_bigint(); + let bigint_vec_3 = generate_random_bigint(); + let bigint_vec_4 = generate_random_bigint(); + for index in 0..self.tidb_number { + let mut batch = vec![]; + let (mut sql_events, mut plan_events) = create_event_for_tidb_sql_plan_meta(&sql_digest_vec, &sql_digest_vec, &sql_random_vec, index%20); + batch.append(sql_events.as_mut()); + if self.out.send_batch(batch).await.is_err() { + info!(message = "Downstream is closed, stopping TopSQL source."); + break; + } + let mut batch = vec![]; + batch.append(plan_events.as_mut()); + if self.out.send_batch(batch).await.is_err() { + info!(message = "Downstream is closed, stopping TopSQL source."); + break; + } + } + let mut loop_count = 1; + if self.downsampling_interval != 0 { + loop_count = 60 / self.downsampling_interval; + } + for _ in 0..loop_count { + for index in 0..self.tidb_number { + let mut batch = vec![]; + let mut tidb_events = create_event_for_tidb_sql(index, timestamp, &sql_digest_vec, &sql_digest_vec, &int_vec_1, + &bigint_vec_1, &bigint_vec_2, &bigint_vec_3, &bigint_vec_4, self.top_n, tidb_instance_partition_vec[index] as usize); + batch.append(tidb_events.as_mut()); + if self.out.send_batch(batch).await.is_err() { + info!(message = "Downstream is closed, stopping TopSQL source."); + break; + } + } + for index in 0..self.tikv_number { + let mut batch = vec![]; + batch.append(create_event_for_tikv_sql(index, timestamp, &sql_digest_vec, &sql_digest_vec, &int_vec_1, + &int_vec_2, &bigint_vec_1, &bigint_vec_2, &bigint_vec_3, &bigint_vec_4, self.top_n, tikv_instance_partition_vec[index] as usize).as_mut()); + batch.append(create_event_for_tikv_region(index, timestamp, &int_vec_1, + &int_vec_2, &int_vec_3, &bigint_vec_1, &bigint_vec_2, &bigint_vec_3, &bigint_vec_4, self.top_n, tikv_instance_partition_vec[index] as usize).as_mut()); + if self.out.send_batch(batch).await.is_err() { + info!(message = "Downstream is closed, stopping TopSQL source. 
{}",); + break; + } + } + } + } + _ = tick_stream.next() => tokio::time::sleep(Duration::from_millis(50)).await, + _ = instance_stream.next() => { + trigger_counter += 1; + instance_events_counter += self.tidb_number as u64; + instance_events_counter += self.tikv_number as u64; + for index in 0..self.tidb_number { + let instance_event = create_event_for_tidb_instance(index); + if self.out.send_event(instance_event).await.is_err() { + info!(message = "Downstream is closed, stopping TopSQL source."); + break; + } + } + for index in 0..self.tikv_number { + let instance_event = create_event_for_tikv_instance(index); + if self.out.send_event(instance_event).await.is_err() { + info!(message = "Downstream is closed, stopping TopSQL source."); + break; + } + } + warn!(message = "Mocked TopSQL source sent {} times, sent {} data.", trigger_counter, instance_events_counter); + } + } + }; + } + + async fn shutdown_all_components(mut self) { + self.shutdown_notifier.shutdown(); + self.shutdown_notifier.wait_for_exit().await; + info!(message = "All TopSQL sources have been shut down."); + } +} diff --git a/src/sources/mocked_topsql/mod.rs b/src/sources/mocked_topsql/mod.rs new file mode 100644 index 0000000..af75207 --- /dev/null +++ b/src/sources/mocked_topsql/mod.rs @@ -0,0 +1,118 @@ +use vector::config::{GenerateConfig, SourceConfig, SourceContext}; +use vector_lib::{ + config::{DataType, LogNamespace, SourceOutput}, + configurable::configurable_component, + source::Source, +}; + +use crate::sources::mocked_topsql::controller::Controller; + +mod controller; +pub mod shutdown; + +/// PLACEHOLDER +#[configurable_component(source("mocked_topsql"))] +#[derive(Debug, Clone)] +pub struct MockedTopSQLConfig { + /// Top N queries to collect + #[serde(default = "default_top_n")] + pub top_n: usize, + + /// Downsampling interval + #[serde(default = "default_downsampling_interval")] + pub downsampling_interval: u32, + + /// TiDB node number + pub tidb_number: usize, + + /// TiKV node number + pub tikv_number: usize, + + /// part instance number + pub instance_partition_number: usize, + + /// Extra column number + #[serde(default = "default_extra_column_number")] + pub extra_column_number: u32, +} + +pub const fn default_top_n() -> usize { + 0 +} + +pub const fn default_downsampling_interval() -> u32 { + 0 +} + +pub const fn default_tidb_number() -> usize { + 5 +} + +pub const fn default_instance_part_number() -> usize { + 1 +} + +pub const fn default_tikv_number() -> usize { + 5 +} + +pub const fn default_extra_column_number() -> u32 { + 0 +} + +impl GenerateConfig for MockedTopSQLConfig { + fn generate_config() -> toml::Value { + toml::Value::try_from(Self { + top_n: default_top_n(), + downsampling_interval: default_downsampling_interval(), + tidb_number: default_tidb_number(), + tikv_number: default_tikv_number(), + extra_column_number: default_extra_column_number(), + instance_partition_number: default_instance_part_number(), + }) + .unwrap() + } +} + +#[async_trait::async_trait] +#[typetag::serde(name = "mocked_topsql")] +impl SourceConfig for MockedTopSQLConfig { + async fn build(&self, cx: SourceContext) -> vector::Result { + let top_n = self.top_n; + let downsampling_interval = self.downsampling_interval; + let tidb_number = self.tidb_number; + let tikv_number = self.tikv_number; + let extra_column_number = self.extra_column_number; + let instance_part_number = self.instance_partition_number; + + Ok(Box::pin(async move { + let controller = Controller::new( + top_n, + downsampling_interval, + tidb_number, 
+ tikv_number, + extra_column_number, + instance_part_number, + cx.out, + ) + .await + .map_err(|error| error!(message = "Source failed.", %error))?; + + controller.run(cx.shutdown).await; + + Ok(()) + })) + } + + fn outputs(&self, _: LogNamespace) -> Vec { + vec![SourceOutput { + port: None, + ty: DataType::Log, + schema_definition: None, + }] + } + + fn can_acknowledge(&self) -> bool { + false + } +} diff --git a/src/sources/mocked_topsql/shutdown.rs b/src/sources/mocked_topsql/shutdown.rs new file mode 100644 index 0000000..cf9ef12 --- /dev/null +++ b/src/sources/mocked_topsql/shutdown.rs @@ -0,0 +1,242 @@ +use async_recursion::async_recursion; +use tokio::sync::watch; + +pub fn pair() -> (ShutdownNotifier, ShutdownSubscriber) { + let (tx, rx) = watch::channel(()); + ( + ShutdownNotifier { tx }, + ShutdownSubscriber { parent: None, rx }, + ) +} + +#[derive(Clone)] +pub struct ShutdownNotifier { + tx: watch::Sender<()>, +} + +impl ShutdownNotifier { + pub fn shutdown(&self) { + let _ = self.tx.send(()); + } + + pub async fn wait_for_exit(&self) { + self.tx.closed().await; + } +} + +#[derive(Clone)] +pub struct ShutdownSubscriber { + parent: Option>, + rx: watch::Receiver<()>, +} + +impl ShutdownSubscriber { + #[async_recursion] + pub async fn done(&mut self) { + let rx = &mut self.rx; + match self.parent.as_mut() { + None => { + let _ = rx.changed().await; + } + Some(parent) => { + let parent = parent.as_mut(); + tokio::select! { + _ = parent.done() => {} + _ = rx.changed() => {} + } + } + } + } + + pub fn extend(&self) -> (ShutdownNotifier, ShutdownSubscriber) { + let (tx, rx) = watch::channel(()); + ( + ShutdownNotifier { tx }, + ShutdownSubscriber { + parent: Some(Box::new(self.clone())), + rx, + }, + ) + } + + #[allow(dead_code)] + pub async fn wait_for_shutdown(&mut self) { + self.done().await + } + + pub fn subscribe(&self) -> watch::Receiver<()> { + self.rx.clone() + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + use tokio::time::timeout; + + use super::*; + + #[tokio::test] + async fn ten_subscribers() { + let (notifier, subscriber) = pair(); + + const COUNT: usize = 10; + let done = Arc::new(AtomicUsize::new(0)); + let mut handles = vec![]; + for _ in 0..COUNT { + let done = done.clone(); + let mut subscriber = subscriber.clone(); + handles.push(tokio::spawn(async move { + subscriber.done().await; + done.fetch_add(1, Ordering::SeqCst); + })); + } + drop(subscriber); + + notifier.shutdown(); + notifier.wait_for_exit().await; + assert_eq!(done.load(Ordering::SeqCst), COUNT); + + let _ = futures::future::join_all(handles).await; + } + + #[tokio::test] + async fn no_subscribers() { + let (notifier, _) = pair(); + + notifier.shutdown(); + notifier.wait_for_exit().await; + } + + #[tokio::test] + async fn subscribers_drop_before_wait() { + let (notifier, subscriber) = pair(); + + let mut handles = vec![]; + for _ in 0..5 { + let subscriber = subscriber.clone(); + handles.push(tokio::spawn(async move { + let _s = subscriber; + })); + } + drop(subscriber); + + notifier.shutdown(); + notifier.wait_for_exit().await; + + let _ = futures::future::join_all(handles).await; + } + + #[tokio::test] + async fn notifier_drop_after_spawn() { + let (notifier, subscriber) = pair(); + + let mut handles = vec![]; + for _ in 0..5 { + let mut subscriber = subscriber.clone(); + handles.push(tokio::spawn(async move { + subscriber.done().await; + })); + } + drop((notifier, subscriber)); + + let _ = futures::future::join_all(handles).await; + } + 
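+     // Illustrative sketch (not part of the original suite): the raw watch receiver
+     // obtained via subscribe() also observes a shutdown signal. The test name and
+     // body are assumptions added for illustration.
+     #[tokio::test]
+     async fn subscribe_receiver_sees_shutdown() {
+         let (notifier, subscriber) = pair();
+         // Clone of the underlying watch receiver.
+         let mut rx = subscriber.subscribe();
+         notifier.shutdown();
+         // changed() resolves Ok: a value was sent and the sender is still alive.
+         assert!(rx.changed().await.is_ok());
+     }
+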
+ #[tokio::test] + async fn notifier_drop_before_spawn() { + let (notifier, subscriber) = pair(); + + drop(notifier); + let mut handles = vec![]; + for _ in 0..5 { + let mut subscriber = subscriber.clone(); + handles.push(tokio::spawn(async move { + subscriber.done().await; + })); + } + drop(subscriber); + + let _ = futures::future::join_all(handles).await; + } + + #[tokio::test] + async fn really_wait_for_exit() { + let (notifier, mut subscriber) = pair(); + + let (cont_tx, mut cont_rx) = tokio::sync::mpsc::unbounded_channel(); + let handle = tokio::spawn(async move { + let _ = cont_rx.recv().await; + subscriber.done().await; + }); + + notifier.shutdown(); + + // subscriber is blocked on something and cannot exit, so wait_for_exit is also blocked + assert!( + timeout(std::time::Duration::from_secs(1), notifier.wait_for_exit()) + .await + .is_err() + ); + + // unblock subscriber and wait_for_exit should act well + let _ = cont_tx.send(()); + notifier.wait_for_exit().await; + + let _ = handle.await; + } + + #[tokio::test] + async fn nested_inner_shutdown() { + let (notifier, subscriber) = pair(); + + let handle = tokio::spawn(async move { + let (sub_notifier, mut sub_subscriber) = subscriber.extend(); + + let handle = tokio::spawn(async move { + sub_subscriber.done().await; + }); + + sub_notifier.shutdown(); + sub_notifier.wait_for_exit().await; + let _ = handle.await; + }); + + notifier.wait_for_exit().await; + let _ = handle.await; + } + + #[tokio::test] + async fn nested_outer_shutdown() { + let (notifier, subscriber) = pair(); + + let mut handles = vec![]; + for _ in 0..3 { + let mut subscriber = subscriber.clone(); + handles.push(tokio::spawn(async move { + let mut handles = vec![]; + { + let (sub_notifier, sub_subscriber) = subscriber.extend(); + for _ in 0..3 { + let mut subscriber = sub_subscriber.clone(); + handles.push(tokio::spawn(async move { + subscriber.done().await; + })); + } + drop(sub_subscriber); + sub_notifier.wait_for_exit().await; + } + + subscriber.done().await; + let _ = futures::future::join_all(handles).await; + })); + } + drop(subscriber); + + notifier.shutdown(); + notifier.wait_for_exit().await; + let _ = futures::future::join_all(handles).await; + } +} diff --git a/src/sources/mod.rs b/src/sources/mod.rs index 6c8b098..6d00dbc 100644 --- a/src/sources/mod.rs +++ b/src/sources/mod.rs @@ -1,4 +1,6 @@ pub mod conprof; pub mod filename; pub mod keyviz; +pub mod system_tables; pub mod topsql; +pub mod mocked_topsql; diff --git a/src/sources/system_tables/cluster_statements_summary_client/README.md b/src/sources/system_tables/cluster_statements_summary_client/README.md new file mode 100644 index 0000000..e49f76c --- /dev/null +++ b/src/sources/system_tables/cluster_statements_summary_client/README.md @@ -0,0 +1,173 @@ +# TiDB CLUSTER_STATEMENTS_SUMMARY gRPC Coprocessor Client + +This project demonstrates how to directly query TiDB cluster's `CLUSTER_STATEMENTS_SUMMARY` data through the gRPC coprocessor protocol, simulating TiDB's internal mechanism for handling cluster table queries. 
+ +## Project Features + +- 🚀 **Direct gRPC Communication**: Bypasses MySQL protocol, directly uses TiDB's internal coprocessor protocol +- 🔍 **Real Process Simulation**: Completely simulates TiDB's internal process for handling `CLUSTER_STATEMENTS_SUMMARY` queries +- 🌐 **Parallel Queries**: Sends requests to all TiDB nodes in the cluster simultaneously +- 📊 **Data Merging**: Automatically merges results from different nodes and adds instance identifiers +- ⚡ **High Performance**: Avoids MySQL protocol overhead, directly accesses memory data + +## Technical Principles + +### TiDB Internal Mechanism + +When executing `SELECT * FROM CLUSTER_STATEMENTS_SUMMARY`, TiDB internally: + +1. **Node Discovery**: Gets all TiDB node information from etcd +2. **Task Distribution**: Creates coprocessor tasks for each node +3. **Parallel Requests**: Sends requests to each node's StatusPort via gRPC +4. **Data Collection**: Each node returns local statement statistics data from memory +5. **Result Merging**: Adds instance address to each row and merges final results + +### Differences from Traditional Approach + +| Aspect | Traditional MySQL Query | gRPC Coprocessor | +|--------|------------------------|------------------| +| **Protocol** | MySQL Protocol | gRPC protobuf | +| **Port** | 4000 (MySQL) | 10080 (StatusPort) | +| **Data Path** | SQL parsing→executor→result set | Direct memory access | +| **Performance** | Protocol conversion overhead | Zero-copy, high performance | +| **Concurrency** | Single connection serial | Multiple connection parallel | + +## Build and Run + +### 1. Dependency Management + +```bash +cd cluster_statements_summary_client +go mod tidy +``` + +### 2. Build + +```bash +go build -o cluster_client cluster_statements_summary_client.go +``` + +### 3. Run + +```bash +./cluster_client +``` + +## Configuration + +### Server Configuration + +Configure your TiDB cluster information in the `GetTiDBServersFromEtcd()` function in `main()`: + +```go +return []ServerInfo{ + { + ServerType: "tidb", + Address: "127.0.0.1:4000", // MySQL port + StatusAddr: "127.0.0.1:10080", // gRPC port + StatusPort: 10080, + IP: "127.0.0.1", + }, + // Add more nodes... +} +``` + +### Important Port Information + +- **MySQL Port (4000)**: Used for regular SQL queries +- **Status Port (10080)**: Used for gRPC coprocessor requests, this is the port used by this client + +## Code Structure + +``` +cluster_statements_summary_client.go +├── ServerInfo # Server information structure +├── ClusterStatementsSummaryClient # Main client class +├── QueryClusterStatementsSummary() # Parallel query entry point +├── queryServerViaCoprocessor() # Single node query +├── buildCoprocessorRequest() # Build DAG request +├── parseCoprocessorResponse() # Parse response data +└── mergeResults() # Merge results +``` + +## Example Output + +``` +=== TiDB CLUSTER_STATEMENTS_SUMMARY gRPC Coprocessor Client === + +Found 2 TiDB servers: + - MySQL port: 127.0.0.1:4000, gRPC port: 127.0.0.1:10080 + - MySQL port: 127.0.0.1:4001, gRPC port: 127.0.0.1:10081 + +Starting CLUSTER_STATEMENTS_SUMMARY query via gRPC coprocessor... 
+
+Querying server: 127.0.0.1:4000
+Querying server: 127.0.0.1:4001
+
+Server 127.0.0.1:4000 returned 4 rows
+  Row 1: Instance=127.0.0.1:4000, DigestText=SELECT * FROM table_1, ExecCount=100, TotalTime=1000
+  Row 2: Instance=127.0.0.1:4000, DigestText=UPDATE table_1 SET col = ?, ExecCount=50, TotalTime=800
+  Row 3: Instance=127.0.0.1:4000, DigestText=SELECT * FROM table_2, ExecCount=110, TotalTime=1100
+  ...
+
+=== Merged Results ===
+Successfully queried servers: 2
+Failed servers: 0
+Total rows: 8
+
+=== Statistics by Server ===
+Server 127.0.0.1:4000: 4 rows
+Server 127.0.0.1:4001: 4 rows
+```
+
+## Technical Details
+
+### DAG Request Construction
+
+The client builds a single table-scan DAG whose table ID and column list come from the schema fetched over the status HTTP API:
+
+```go
+dagReq := &tipb.DAGRequest{
+    TimeZoneName:   "Asia/Shanghai",
+    TimeZoneOffset: 28800,
+    EncodeType:     tipb.EncodeType_TypeDefault,
+    User: &tipb.UserIdentity{
+        UserName: "root",
+        UserHost: "%",
+    },
+    Executors: []*tipb.Executor{
+        {
+            Tp: tipb.ExecType_TypeTableScan,
+            TblScan: &tipb.TableScan{
+                TableId: tableSchema.ID,                        // CLUSTER_STATEMENTS_SUMMARY table ID from /schema
+                Columns: c.buildColumnsFromSchema(tableSchema), // column IDs and types
+            },
+        },
+    },
+    OutputOffsets:             c.buildOutputOffsetsFromSchema(tableSchema),
+    CollectExecutionSummaries: &[]bool{true}[0],
+}
+```
+
+### Response Parsing
+
+The response payload is a `tipb.SelectResponse`, containing:
+- `Chunks[]`: encoded data chunks
+- `Warnings[]`: warning information
+- `ExecutionSummaries[]`: execution statistics
+
+## Notes
+
+1. **Network Connection**: Ensure the TiDB nodes' StatusPort (default 10080) is reachable
+2. **Permission Requirements**: Privileges to read the statements summary tables are required
+3. **Version Compatibility**: Use protobuf definitions compatible with the target TiDB version
+4. **Timeout Settings**: Adjust network timeouts to the cluster size
+
+## Extension Usage
+
+This client framework can be extended to query other cluster tables:
+- `CLUSTER_SLOW_QUERY`
+- `CLUSTER_PROCESSLIST`
+- `CLUSTER_CONFIG`
+- `CLUSTER_HARDWARE`
+- `CLUSTER_LOAD`
+
+Simply fetch the target table's schema and reuse its table ID and column definitions in `buildCoprocessorRequest()` (see the sketch below).
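+
+For example, a hypothetical `getTableSchemaFor` helper (illustrative only, not part of the shipped client) could reuse the same machinery for `CLUSTER_PROCESSLIST` by parameterizing the schema URL; the returned schema carries the table ID and columns, so `buildCoprocessorRequest()` itself needs no changes:
+
+```go
+// getTableSchemaFor is an illustrative variant of getTableSchema that takes the
+// target cluster table name instead of hard-coding cluster_statements_summary.
+func (c *ClusterStatementsSummaryClient) getTableSchemaFor(ctx context.Context, server ServerInfo, table string) (*TableSchema, error) {
+	url := fmt.Sprintf("http://%s/schema/information_schema/%s", server.StatusAddr, strings.ToLower(table))
+	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+	resp, err := (&http.Client{Timeout: 10 * time.Second}).Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("HTTP request failed, status code: %d", resp.StatusCode)
+	}
+
+	// Decode the JSON schema exactly as getTableSchema does for the summary table.
+	var schema TableSchema
+	if err := json.NewDecoder(resp.Body).Decode(&schema); err != nil {
+		return nil, err
+	}
+	return &schema, nil
+}
+
+// Usage: schema, err := client.getTableSchemaFor(ctx, servers[0], "CLUSTER_PROCESSLIST")
+```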
\ No newline at end of file diff --git a/src/sources/system_tables/cluster_statements_summary_client/cluster_statements_summary_client.go b/src/sources/system_tables/cluster_statements_summary_client/cluster_statements_summary_client.go new file mode 100644 index 0000000..eccce42 --- /dev/null +++ b/src/sources/system_tables/cluster_statements_summary_client/cluster_statements_summary_client.go @@ -0,0 +1,1707 @@ +package main + +import ( + "context" + "encoding/binary" + "encoding/json" + "fmt" + "io" + "log" + "math" + "net" + "net/http" + "strconv" + "strings" + "sync" + "time" + + "github.com/gogo/protobuf/proto" + "github.com/pingcap/kvproto/pkg/coprocessor" + "github.com/pingcap/kvproto/pkg/kvrpcpb" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/kvproto/pkg/tikvpb" + "github.com/pingcap/tipb/go-tipb" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +// Simplified type definitions without dependency on TiDB internal packages +type MySQLType int32 + +const ( + TypeTiny MySQLType = 1 + TypeShort MySQLType = 2 + TypeInt24 MySQLType = 9 + TypeLong MySQLType = 3 + TypeLonglong MySQLType = 8 + TypeFloat MySQLType = 4 + TypeDouble MySQLType = 5 + TypeString MySQLType = 254 + TypeVarString MySQLType = 15 + TypeVarchar MySQLType = 15 + TypeTinyBlob MySQLType = 249 + TypeMediumBlob MySQLType = 250 + TypeLongBlob MySQLType = 251 + TypeBlob MySQLType = 252 + TypeJSON MySQLType = 245 + TypeDate MySQLType = 10 + TypeDatetime MySQLType = 12 + TypeTimestamp MySQLType = 7 + TypeDuration MySQLType = 11 + TypeNewDecimal MySQLType = 246 + TypeEnum MySQLType = 247 + TypeSet MySQLType = 248 + TypeBit MySQLType = 16 +) + +// Simplified Chunk structure +type Chunk struct { + columns []*Column +} + +type Column struct { + data []byte + offsets []int64 + length int + nullBitmap []byte +} + +func (c *Chunk) NumRows() int { + if len(c.columns) == 0 { + return 0 + } + return c.columns[0].length +} + +func (c *Chunk) Column(colIdx int) *Column { + if colIdx >= len(c.columns) { + return nil + } + return c.columns[colIdx] +} + +func (c *Column) IsNull(rowIdx int) bool { + if rowIdx >= c.length || len(c.nullBitmap) == 0 { + return false + } + byteIdx := rowIdx / 8 + bitIdx := rowIdx % 8 + return (c.nullBitmap[byteIdx] & (1 << bitIdx)) == 0 +} + +func (c *Column) GetInt64(rowIdx int) int64 { + if c.IsNull(rowIdx) { + return 0 + } + offset := rowIdx * 8 + if offset+8 > len(c.data) { + return 0 + } + return int64(binary.LittleEndian.Uint64(c.data[offset : offset+8])) +} + +func (c *Column) GetString(rowIdx int) string { + if c.IsNull(rowIdx) { + return "" + } + if len(c.offsets) <= rowIdx+1 { + return "" + } + start := int(c.offsets[rowIdx]) + end := int(c.offsets[rowIdx+1]) + if start >= end || start >= len(c.data) || end > len(c.data) { + return "" + } + return string(c.data[start:end]) +} + +func (c *Column) GetFloat32(rowIdx int) float32 { + if c.IsNull(rowIdx) { + return 0 + } + offset := rowIdx * 4 + if offset+4 > len(c.data) { + return 0 + } + return math.Float32frombits(binary.LittleEndian.Uint32(c.data[offset : offset+4])) +} + +func (c *Column) GetFloat64(rowIdx int) float64 { + if c.IsNull(rowIdx) { + return 0 + } + offset := rowIdx * 8 + if offset+8 > len(c.data) { + return 0 + } + return math.Float64frombits(binary.LittleEndian.Uint64(c.data[offset : offset+8])) +} + +func (c *Column) GetBytes(rowIdx int) []byte { + if c.IsNull(rowIdx) { + return nil + } + if len(c.offsets) <= rowIdx+1 { + return nil + } + start := int(c.offsets[rowIdx]) + end := 
int(c.offsets[rowIdx+1]) + if start >= end || start >= len(c.data) || end > len(c.data) { + return nil + } + return c.data[start:end] +} + +// TableSchemaColumn represents column information in table schema +type TableSchemaColumn struct { + ID int64 `json:"id"` + Name struct { + O string `json:"O"` // Original name + L string `json:"L"` // Lowercase name + } `json:"name"` + Type struct { + Tp int32 `json:"tp"` + Flag uint32 `json:"flag"` + Flen int32 `json:"flen"` + Decimal int32 `json:"decimal"` + Charset string `json:"charset"` + Collate string `json:"collate"` + } `json:"type"` +} + +// ServerInfo represents TiDB server information +type ServerInfo struct { + ServerType string + Address string + StatusAddr string + StatusPort uint + IP string +} + +// TableSchema represents table schema information +type TableSchema struct { + ID int64 `json:"id"` + Name struct { + O string `json:"O"` // Original name + L string `json:"L"` // Lowercase name + } `json:"name"` + Columns []struct { + ID int64 `json:"id"` + Name struct { + O string `json:"O"` // Original name + L string `json:"L"` // Lowercase name + } `json:"name"` + Type struct { + Tp int32 `json:"tp"` + Flag uint32 `json:"flag"` + Flen int32 `json:"flen"` + Decimal int32 `json:"decimal"` + Charset string `json:"charset"` + Collate string `json:"collate"` + } `json:"type"` + } `json:"cols"` +} + +// ClusterStatementsSummaryClient client for querying CLUSTER_STATEMENTS_SUMMARY via gRPC coprocessor +type ClusterStatementsSummaryClient struct { + servers []ServerInfo // List of TiDB server information +} + +// NewClusterStatementsSummaryClient creates a new client +func NewClusterStatementsSummaryClient(servers []ServerInfo) *ClusterStatementsSummaryClient { + return &ClusterStatementsSummaryClient{ + servers: servers, + } +} + +// getTableSchema gets table schema information via HTTP API +func (c *ClusterStatementsSummaryClient) getTableSchema(ctx context.Context, server ServerInfo) (*TableSchema, error) { + // Construct HTTP request URL + url := fmt.Sprintf("http://%s/schema/information_schema/cluster_statements_summary", server.StatusAddr) + + // Create HTTP client + client := &http.Client{ + Timeout: 10 * time.Second, + } + + // Send request + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return nil, fmt.Errorf("failed to create HTTP request: %v", err) + } + + resp, err := client.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to send HTTP request: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("HTTP request failed, status code: %d", resp.StatusCode) + } + + // Read response + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response: %v", err) + } + + // Parse JSON + var schema TableSchema + err = json.Unmarshal(body, &schema) + if err != nil { + return nil, fmt.Errorf("failed to parse JSON: %v", err) + } + + return &schema, nil +} + +// checkTiDBStatus checks TiDB instance status and statement summary configuration +func (c *ClusterStatementsSummaryClient) checkTiDBStatus(ctx context.Context, server ServerInfo) error { + // Check TiDB status + statusURL := fmt.Sprintf("http://%s/status", server.StatusAddr) + log.Printf("Checking TiDB status: %s", statusURL) + + client := &http.Client{ + Timeout: 10 * time.Second, + } + + resp, err := client.Get(statusURL) + if err != nil { + return fmt.Errorf("failed to check TiDB status: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != 
http.StatusOK { + return fmt.Errorf("TiDB status check failed: HTTP %d", resp.StatusCode) + } + + // Check statement summary related configuration + configURL := fmt.Sprintf("http://%s/config", server.StatusAddr) + log.Printf("Checking TiDB configuration: %s", configURL) + + resp, err = client.Get(configURL) + if err != nil { + log.Printf("Warning: unable to get TiDB configuration: %v", err) + } else { + defer resp.Body.Close() + if resp.StatusCode == http.StatusOK { + body, err := io.ReadAll(resp.Body) + if err == nil { + log.Printf("TiDB configuration info: %s", string(body)[:min(200, len(body))]) + } + } + } + + return nil +} + +// QueryClusterStatementsSummary queries cluster statements summary +func (c *ClusterStatementsSummaryClient) QueryClusterStatementsSummary(ctx context.Context) error { + var wg sync.WaitGroup + resultChan := make(chan QueryResult, len(c.servers)) + + // First get schema information from the first server + var tableSchema *TableSchema + for _, server := range c.servers { + fmt.Printf("Getting schema information from server %s...\n", server.Address) + schema, err := c.getTableSchema(ctx, server) + if err != nil { + log.Printf("Failed to get schema from server %s: %v", server.Address, err) + continue + } + tableSchema = schema + fmt.Printf("Successfully got schema info: table ID=%d, column count=%d\n", schema.ID, len(schema.Columns)) + break + } + + if tableSchema == nil { + return fmt.Errorf("unable to get schema information from any server") + } + + // Send coprocessor requests to all TiDB servers in parallel + for _, server := range c.servers { + wg.Add(1) + go func(srv ServerInfo) { + defer wg.Done() + fmt.Printf("Querying server: %s\n", srv.Address) + + // First check TiDB status + if err := c.checkTiDBStatus(ctx, srv); err != nil { + log.Printf("Failed to check TiDB status %s: %v", srv.StatusAddr, err) + } + + result := c.queryServerViaCoprocessor(ctx, srv, tableSchema) + resultChan <- result + }(server) + } + + // Wait for all requests to complete + go func() { + wg.Wait() + close(resultChan) + }() + + // Collect and merge results + return c.mergeResults(resultChan) +} + +// QueryResult query result +type QueryResult struct { + ServerAddr string + Rows []Row + Error error +} + +// Row represents a row of data - based on the complete structure of CLUSTER_STATEMENTS_SUMMARY table +type Row struct { + // Basic information + Instance string // INSTANCE - instance address + SummaryBeginTime string // SUMMARY_BEGIN_TIME - summary begin time + SummaryEndTime string // SUMMARY_END_TIME - summary end time + StmtType string // STMT_TYPE - statement type + SchemaName string // SCHEMA_NAME - database name + Digest string // DIGEST - statement digest + DigestText string // DIGEST_TEXT - statement digest text + TableNames string // TABLE_NAMES - table names + IndexNames string // INDEX_NAMES - index names + SampleUser string // SAMPLE_USER - sample user + + // Execution statistics + ExecCount int64 // EXEC_COUNT - execution count + SumLatency int64 // SUM_LATENCY - total latency + MaxLatency int64 // MAX_LATENCY - maximum latency + MinLatency int64 // MIN_LATENCY - minimum latency + AvgLatency int64 // AVG_LATENCY - average latency + + // Parse and compile latency + AvgParseLatency int64 // AVG_PARSE_LATENCY - average parse latency + MaxParseLatency int64 // MAX_PARSE_LATENCY - maximum parse latency + AvgCompileLatency int64 // AVG_COMPILE_LATENCY - average compile latency + MaxCompileLatency int64 // MAX_COMPILE_LATENCY - maximum compile latency + + // Resource usage + 
AvgMem int64 // AVG_MEM - average memory usage + MaxMem int64 // MAX_MEM - maximum memory usage + AvgDisk int64 // AVG_DISK - average disk usage + MaxDisk int64 // MAX_DISK - maximum disk usage + AvgAffectedRows float64 // AVG_AFFECTED_ROWS - average affected rows (double type) + + // Time information + FirstSeen string // FIRST_SEEN - first seen time + LastSeen string // LAST_SEEN - last seen time + + // Sample information + SampleSQL string // SAMPLE_SQL - sample SQL + PrevSampleText string // PREV_SAMPLE_TEXT - previous sample text + + // Plan information + PlanDigest string // PLAN_DIGEST - plan digest + Plan string // PLAN - execution plan + PlanCacheHits int64 // PLAN_CACHE_HITS - plan cache hits + PlanInCache int64 // PLAN_IN_CACHE - plan in cache + PlanInBinding int64 // PLAN_IN_BINDING - plan in binding + + // Query sample information + QuerySampleText string // QUERY_SAMPLE_TEXT - query sample text + PrevSampleSQL string // PREV_SAMPLE_SQL - previous sample SQL + PlanDigestText string // PLAN_DIGEST_TEXT - plan digest text + QuerySampleUser string // QUERY_SAMPLE_USER - query sample user + QuerySampleHost string // QUERY_SAMPLE_HOST - query sample host + QuerySampleDB string // QUERY_SAMPLE_DB - query sample database + QuerySampleState string // QUERY_SAMPLE_STATE - query sample state + QuerySampleInfo string // QUERY_SAMPLE_INFO - query sample info + + // Transaction information + QuerySampleTransType string // QUERY_SAMPLE_TRANS_TYPE - query sample transaction type + QuerySampleTransIsolation string // QUERY_SAMPLE_TRANS_ISOLATION - query sample transaction isolation + QuerySampleTransStartTime string // QUERY_SAMPLE_TRANS_START_TIME - query sample transaction start time + QuerySampleTransDuration int64 // QUERY_SAMPLE_TRANS_DURATION - query sample transaction duration + QuerySampleTransState string // QUERY_SAMPLE_TRANS_STATE - query sample transaction state + QuerySampleTransError string // QUERY_SAMPLE_TRANS_ERROR - query sample transaction error + QuerySampleTransTables string // QUERY_SAMPLE_TRANS_TABLES - query sample transaction tables + QuerySampleTransIndexes string // QUERY_SAMPLE_TRANS_INDEXES - query sample transaction indexes + QuerySampleTransLockKeys string // QUERY_SAMPLE_TRANS_LOCK_KEYS - query sample transaction lock keys + QuerySampleTransLockTime int64 // QUERY_SAMPLE_TRANS_LOCK_TIME - query sample transaction lock time + QuerySampleTransWaitTime int64 // QUERY_SAMPLE_TRANS_WAIT_TIME - query sample transaction wait time + QuerySampleTransBackoffTime int64 // QUERY_SAMPLE_TRANS_BACKOFF_TIME - query sample transaction backoff time + QuerySampleTransResolveLockTime int64 // QUERY_SAMPLE_TRANS_RESOLVE_LOCK_TIME - query sample transaction resolve lock time + QuerySampleTransLocalLatchWaitTime int64 // QUERY_SAMPLE_TRANS_LOCAL_LATCH_WAIT_TIME - query sample transaction local latch wait time + QuerySampleTransWriteKeys int64 // QUERY_SAMPLE_TRANS_WRITE_KEYS - query sample transaction write keys + QuerySampleTransWriteSize int64 // QUERY_SAMPLE_TRANS_WRITE_SIZE - query sample transaction write size + QuerySampleTransPrewriteRegionNum int64 // QUERY_SAMPLE_TRANS_PREWRITE_REGION_NUM - query sample transaction prewrite region num + QuerySampleTransTxnRetry int64 // QUERY_SAMPLE_TRANS_TXN_RETRY - query sample transaction retry + QuerySampleTransBackoffTypes string // QUERY_SAMPLE_TRANS_BACKOFF_TYPES - query sample transaction backoff types + + // Extended fields - for storing other column data + ExtraFields map[string]interface{} // Store other undefined fields + + // 
Backward compatibility fields + TotalTime int64 // Total latency - same as SumLatency, for backward compatibility +} + +// queryServerViaCoprocessor queries a single server via coprocessor +func (c *ClusterStatementsSummaryClient) queryServerViaCoprocessor(ctx context.Context, server ServerInfo, tableSchema *TableSchema) QueryResult { + result := QueryResult{ + ServerAddr: server.Address, + } + + // Build gRPC connection address (using StatusPort) + grpcAddr := net.JoinHostPort(server.IP, strconv.FormatUint(uint64(server.StatusPort), 10)) + + // Establish gRPC connection + conn, err := grpc.DialContext(ctx, grpcAddr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + grpc.WithTimeout(10*time.Second), + ) + if err != nil { + result.Error = fmt.Errorf("failed to connect to gRPC service: %v", err) + return result + } + defer conn.Close() + + // Create TiKV client + client := tikvpb.NewTikvClient(conn) + + // Build coprocessor request + copReq, err := c.buildCoprocessorRequest(tableSchema) + if err != nil { + result.Error = fmt.Errorf("failed to build coprocessor request: %v", err) + return result + } + + // Send request + resp, err := client.Coprocessor(ctx, copReq) + if err != nil { + result.Error = fmt.Errorf("failed to send coprocessor request: %v", err) + return result + } + + // Parse response + rows, err := c.parseCoprocessorResponse(resp, server.Address, tableSchema) + if err != nil { + result.Error = fmt.Errorf("failed to parse response: %v", err) + return result + } + + result.Rows = rows + return result +} + +// buildCoprocessorRequest builds coprocessor request +func (c *ClusterStatementsSummaryClient) buildCoprocessorRequest(tableSchema *TableSchema) (*coprocessor.Request, error) { + // Build DAG request + dagReq := &tipb.DAGRequest{ + TimeZoneName: "Asia/Shanghai", + TimeZoneOffset: 28800, + Flags: 0, + EncodeType: tipb.EncodeType_TypeDefault, + User: &tipb.UserIdentity{ + UserName: "root", + UserHost: "%", + }, + Executors: []*tipb.Executor{ + { + Tp: tipb.ExecType_TypeTableScan, + TblScan: &tipb.TableScan{ + TableId: tableSchema.ID, + Columns: c.buildColumnsFromSchema(tableSchema), + Desc: false, + }, + }, + }, + OutputOffsets: c.buildOutputOffsetsFromSchema(tableSchema), + CollectExecutionSummaries: &[]bool{true}[0], + } + + // Serialize DAG request + data, err := proto.Marshal(dagReq) + if err != nil { + return nil, fmt.Errorf("failed to serialize DAG request: %v", err) + } + + log.Printf("=== GO VERSION DEBUG ===") + log.Printf("DAG request details:") + log.Printf(" - Executors: %d", len(dagReq.Executors)) + log.Printf(" - OutputOffsets length: %d, first 10: %v", len(dagReq.OutputOffsets), dagReq.OutputOffsets[:10]) // First 10 offsets + log.Printf(" - EncodeType: %v (numeric value: %d)", dagReq.EncodeType, int32(dagReq.EncodeType)) + log.Printf(" - TimeZoneName: %s", dagReq.TimeZoneName) + log.Printf(" - TimeZoneOffset: %d", dagReq.TimeZoneOffset) + log.Printf(" - CollectExecutionSummaries: %v", dagReq.CollectExecutionSummaries) + log.Printf("Serialized DAG request size: %d bytes", len(data)) + log.Printf("DAG request first 64 bytes: %v", data[:min(64, len(data))]) + + // Build coprocessor request + copReq := &coprocessor.Request{ + Tp: 103, // kv.ReqTypeDAG + Data: data, + Ranges: []*coprocessor.KeyRange{ + { + Start: []byte{0x74, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}, + End: []byte{0x74, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02}, + }, + }, + Context: &kvrpcpb.Context{ + RegionId: 1, + RegionEpoch: &metapb.RegionEpoch{ + 
ConfVer: 1, + Version: 1, + }, + Peer: &metapb.Peer{ + Id: 1, + StoreId: 1, + }, + SourceStmt: &kvrpcpb.SourceStmt{ + ConnectionId: 12345, + SessionAlias: "cluster_statements_summary_client", + }, + }, + StartTs: uint64(time.Now().Unix()), + } + + // Serialize the full coprocessor request for comparison + copReqData, err := proto.Marshal(copReq) + if err == nil { + log.Printf("Serialized coprocessor request size: %d bytes", len(copReqData)) + log.Printf("Coprocessor request first 64 bytes: %v", copReqData[:min(64, len(copReqData))]) + } + log.Printf("Request details: tp=%d, ranges_count=%d, context_region_id=%d, start_ts=%d", + copReq.Tp, len(copReq.Ranges), copReq.Context.RegionId, copReq.StartTs) + log.Printf("=== END GO VERSION DEBUG ===") + + return copReq, nil +} + +// buildColumnsFromSchema builds column information based on schema +func (c *ClusterStatementsSummaryClient) buildColumnsFromSchema(tableSchema *TableSchema) []*tipb.ColumnInfo { + columns := make([]*tipb.ColumnInfo, len(tableSchema.Columns)) + for i, col := range tableSchema.Columns { + columns[i] = &tipb.ColumnInfo{ + ColumnId: col.ID, + Tp: col.Type.Tp, + } + } + return columns +} + +// buildOutputOffsetsFromSchema builds output offsets based on schema +func (c *ClusterStatementsSummaryClient) buildOutputOffsetsFromSchema(tableSchema *TableSchema) []uint32 { + offsets := make([]uint32, len(tableSchema.Columns)) + for i := range tableSchema.Columns { + offsets[i] = uint32(i) + } + return offsets +} + +// parseCoprocessorResponse parses coprocessor response +func (c *ClusterStatementsSummaryClient) parseCoprocessorResponse(resp *coprocessor.Response, serverAddr string, tableSchema *TableSchema) ([]Row, error) { + if resp.OtherError != "" { + return nil, fmt.Errorf("server returned error: %s", resp.OtherError) + } + + // Parse SelectResponse + var selectResp tipb.SelectResponse + err := proto.Unmarshal(resp.Data, &selectResp) + if err != nil { + return nil, fmt.Errorf("failed to deserialize response: %v", err) + } + + // Log EncodeType for debugging + log.Printf("GO: SelectResponse EncodeType: %v", selectResp.EncodeType) + + // Check warnings + if len(selectResp.Warnings) > 0 { + log.Printf("Server %s received warnings:", serverAddr) + for i, warning := range selectResp.Warnings { + log.Printf(" Warning %d: [%d] %s", i+1, warning.Code, warning.Msg) + } + } + + // Check errors + if selectResp.Error != nil { + return nil, fmt.Errorf("server execution error: [%d] %s", selectResp.Error.Code, selectResp.Error.Msg) + } + + var allRows []Row + + // Parse data chunks + for i, chunk := range selectResp.Chunks { + blockRows, err := c.parseChunkDataWithEncodeType(&chunk, serverAddr, tableSchema, selectResp.EncodeType) + if err != nil { + log.Printf("Failed to parse data chunk %d: %v", i+1, err) + continue + } + allRows = append(allRows, blockRows...) 
+ } + + return allRows, nil +} + +// parseChunkDataWithEncodeType parses data chunk with EncodeType awareness +func (c *ClusterStatementsSummaryClient) parseChunkDataWithEncodeType(chunk *tipb.Chunk, serverAddr string, tableSchema *TableSchema, encodeType tipb.EncodeType) ([]Row, error) { + var rows []Row + + // Check if there is data + if len(chunk.RowsData) == 0 { + return rows, nil + } + + // Create intelligent parsing based on schema information + rows = c.parseDataWithSchema(chunk.RowsData, serverAddr, tableSchema) + + return rows, nil +} + +// parseChunkData parses data chunk (legacy function) +func (c *ClusterStatementsSummaryClient) parseChunkData(chunk *tipb.Chunk, serverAddr string, tableSchema *TableSchema) ([]Row, error) { + // Default to TypeChunk for backward compatibility + return c.parseChunkDataWithEncodeType(chunk, serverAddr, tableSchema, tipb.EncodeType_TypeChunk) +} + +// parseDataWithSchema parses real binary data based on schema information +func (c *ClusterStatementsSummaryClient) parseDataWithSchema(data []byte, serverAddr string, tableSchema *TableSchema) []Row { + var rows []Row + + // Check if data is empty + if len(data) == 0 { + return rows + } + + // First try chunk decoding + chunk, err := c.decodeChunkData(data, tableSchema) + if err != nil { + log.Printf("chunk decoding failed: %v, trying row encoding decode", err) + // If chunk decoding fails, try row encoding decode + rows = c.decodeRowData(data, serverAddr, tableSchema) + return rows + } + + // Extract row data from chunk + rowCount := chunk.NumRows() + log.Printf("Successfully decoded chunk containing %d rows of data", rowCount) + + for i := 0; i < rowCount; i++ { + row := c.extractRowFromChunk(chunk, i, serverAddr, tableSchema) + rows = append(rows, row) + } + + return rows +} + +// decodeRowData decodes row encoded data +func (c *ClusterStatementsSummaryClient) decodeRowData(data []byte, serverAddr string, tableSchema *TableSchema) []Row { + var rows []Row + offset := 0 + rowIndex := 0 + + log.Printf("GO: Starting decodeRowData with %d bytes of data, %d columns in schema", len(data), len(tableSchema.Columns)) + + for offset < len(data) && rowIndex < 100 { + log.Printf("GO: Starting row %d at offset %d (remaining bytes: %d)", rowIndex, offset, len(data)-offset) + + row := Row{ + Instance: serverAddr, + ExtraFields: make(map[string]interface{}), + } + + rowDecoded := false + colIndex := 0 + for _, col := range tableSchema.Columns { + if offset >= len(data) { + log.Printf("GO: Row %d reached end of data at column %d (%s)", rowIndex, colIndex, col.Name.O) + break + } + + value, newOffset, err := c.decodeValueFromBytes(data, offset, col) + if err != nil { + log.Printf("GO: Row %d column %d (%s) decode failed at offset %d: %v", rowIndex, colIndex, col.Name.O, offset, err) + break + } + + // Log only first few rows and important columns to avoid spam + if rowIndex < 3 && (colIndex < 20 || col.Name.O == "EXEC_COUNT" || col.Name.O == "DIGEST" || col.Name.O == "DIGEST_TEXT") { + log.Printf("GO: Row %d Column %d (%s): value=%v, offset %d->%d", rowIndex, colIndex, col.Name.O, value, offset, newOffset) + } + + offset = newOffset + rowDecoded = true + columnName := col.Name.O + + switch columnName { + case "INSTANCE": + row.Instance = c.safeStringValue(value) + case "STMT_TYPE": + row.StmtType = c.safeStringValue(value) + case "SCHEMA_NAME": + row.SchemaName = c.safeStringValue(value) + case "DIGEST_TEXT": + row.DigestText = c.safeStringValue(value) + case "EXEC_COUNT": + row.ExecCount = c.safeInt64Value(value) + 
case "SUM_LATENCY": + row.SumLatency = c.safeInt64Value(value) + case "MAX_LATENCY": + row.MaxLatency = c.safeInt64Value(value) + case "AVG_LATENCY": + row.AvgLatency = c.safeInt64Value(value) + default: + row.ExtraFields[columnName] = value + } + colIndex++ + } + + if !rowDecoded { + log.Printf("GO: Row %d had no columns decoded, stopping at offset %d", rowIndex, offset) + break + } + + // Log summary for first few rows + if rowIndex < 5 { + log.Printf("GO: Row %d summary - DIGEST: %v, EXEC_COUNT: %d, DIGEST_TEXT_len: %d, final_offset: %d", + rowIndex, row.ExtraFields["DIGEST"], row.ExecCount, len(row.DigestText), offset) + } + + row.TotalTime = row.SumLatency + rows = append(rows, row) + rowIndex++ + } + + log.Printf("GO: decodeRowData completed: decoded %d rows, final offset %d/%d", len(rows), offset, len(data)) + return rows +} + +// decodeValueFromBytes decodes a single value using TiDB's DecodeOne logic +func (c *ClusterStatementsSummaryClient) decodeValueFromBytes(data []byte, offset int, col TableSchemaColumn) (interface{}, int, error) { + if offset >= len(data) { + return nil, offset, fmt.Errorf("insufficient data") + } + + flag := data[offset] + offset++ + + switch flag { + case 0x00: // NilFlag + return nil, offset, nil + case 0x01: // bytesFlag + val, err := c.decodeBytes(data, &offset) + return val, offset, err + case 0x02: // compactBytesFlag + val, err := c.decodeCompactBytes(data, &offset) + return val, offset, err + case 0x03: // intFlag + val, err := c.decodeInt(data, &offset) + return val, offset, err + case 0x04: // uintFlag + val, err := c.decodeUint(data, &offset) + return val, offset, err + case 0x05: // floatFlag + val, err := c.decodeFloat(data, &offset) + return val, offset, err + case 0x06: // decimalFlag + val, err := c.decodeDecimal(data, &offset) + return val, offset, err + case 0x07: // durationFlag + val, err := c.decodeDuration(data, &offset) + return val, offset, err + case 0x08: // varintFlag + val, err := c.decodeVarint(data, &offset) + return val, offset, err + case 0x09: // uvarintFlag + val, err := c.decodeUvarint(data, &offset) + return val, offset, err + case 0x0A: // jsonFlag + val, err := c.decodeJSON(data, &offset) + return val, offset, err + case 0x14: // vectorFloat32Flag + val, err := c.decodeVectorFloat32(data, &offset) + return val, offset, err + case 0xFA: // maxFlag + return "MAX_VALUE", offset, nil + default: + // Try to decode as time type (possibly special encoding) + if flag >= 0x20 && flag <= 0x30 { + val, err := c.decodeTimeValue(data, &offset, flag) + return val, offset, err + } + return nil, offset, fmt.Errorf("unknown encoding flag: 0x%02x", flag) + } +} + +// decodeInt decodes signed integer +func (c *ClusterStatementsSummaryClient) decodeInt(data []byte, offset *int) (int64, error) { + if *offset+8 > len(data) { + return 0, fmt.Errorf("insufficient data, cannot decode integer") + } + val := int64(binary.LittleEndian.Uint64(data[*offset:])) + *offset += 8 + return val, nil +} + +// decodeUint decodes unsigned integer +func (c *ClusterStatementsSummaryClient) decodeUint(data []byte, offset *int) (uint64, error) { + if *offset+8 > len(data) { + return 0, fmt.Errorf("insufficient data, cannot decode unsigned integer") + } + val := binary.LittleEndian.Uint64(data[*offset:]) + *offset += 8 + return val, nil +} + +// decodeVarint decodes variable length signed integer +func (c *ClusterStatementsSummaryClient) decodeVarint(data []byte, offset *int) (int64, error) { + val, n := binary.Varint(data[*offset:]) + if n <= 0 { + return 0, 
fmt.Errorf("cannot decode variable length integer") + } + *offset += n + return val, nil +} + +// decodeUvarint decodes variable length unsigned integer +func (c *ClusterStatementsSummaryClient) decodeUvarint(data []byte, offset *int) (uint64, error) { + val, n := binary.Uvarint(data[*offset:]) + if n <= 0 { + return 0, fmt.Errorf("cannot decode variable length unsigned integer") + } + *offset += n + return val, nil +} + +// decodeFloat decodes floating point number +func (c *ClusterStatementsSummaryClient) decodeFloat(data []byte, offset *int) (float64, error) { + if *offset+8 > len(data) { + return 0, fmt.Errorf("insufficient data, cannot decode float") + } + val := math.Float64frombits(binary.LittleEndian.Uint64(data[*offset:])) + *offset += 8 + return val, nil +} + +// decodeBytes decodes byte array +func (c *ClusterStatementsSummaryClient) decodeBytes(data []byte, offset *int) ([]byte, error) { + if *offset+4 > len(data) { + return nil, fmt.Errorf("insufficient data, cannot decode byte array length") + } + length := int(binary.LittleEndian.Uint32(data[*offset:])) + *offset += 4 + + if *offset+length > len(data) { + return nil, fmt.Errorf("insufficient data, cannot decode byte array data") + } + val := data[*offset : *offset+length] + *offset += length + return val, nil +} + +// decodeCompactBytes decodes compact byte array +func (c *ClusterStatementsSummaryClient) decodeCompactBytes(data []byte, offset *int) ([]byte, error) { + if *offset >= len(data) { + return nil, fmt.Errorf("insufficient data, cannot decode compact byte array") + } + + // Debug: Show the raw bytes before varint decode + initialOffset := *offset + if *offset < 100 { + previewLen := 10 + if *offset+previewLen > len(data) { + previewLen = len(data) - *offset + } + log.Printf("GO decodeCompactBytes before varint: offset=%d, raw_bytes=%x", *offset, data[*offset:*offset+previewLen]) + } + + // Read length + length, n := binary.Varint(data[*offset:]) + if n <= 0 { + return nil, fmt.Errorf("cannot decode compact byte array length") + } + *offset += n + + if *offset+int(length) > len(data) { + return nil, fmt.Errorf("insufficient data, cannot decode compact byte array data") + } + val := data[*offset : *offset+int(length)] + *offset += int(length) + + // Debug: Log the compact bytes decoding details + if initialOffset < 100 { + previewLen := len(val) + if previewLen > 16 { + previewLen = 16 + } + log.Printf("GO decodeCompactBytes: initial_offset=%d, varint_bytes=%d, length=%d, final_offset=%d, val_len=%d, val_preview=%x", + initialOffset, n, length, *offset, len(val), val[:previewLen]) + } + + return val, nil +} + +// decodeDecimal decodes decimal number +func (c *ClusterStatementsSummaryClient) decodeDecimal(data []byte, offset *int) (string, error) { + if *offset+4 > len(data) { + return "", fmt.Errorf("insufficient data, cannot decode decimal data") + } + val := fmt.Sprintf("decimal_%d", *offset) + *offset += 4 + return val, nil +} + +// decodeDuration decodes duration +func (c *ClusterStatementsSummaryClient) decodeDuration(data []byte, offset *int) (string, error) { + val, err := c.decodeInt(data, offset) + if err != nil { + return "", err + } + return time.Duration(val).String(), nil +} + +// decodeJSON decodes JSON +func (c *ClusterStatementsSummaryClient) decodeJSON(data []byte, offset *int) (string, error) { + if *offset >= len(data) { + return "", fmt.Errorf("insufficient data, cannot decode JSON") + } + val := fmt.Sprintf("json_data_%d", *offset) + *offset += 4 + return val, nil +} + +// decodeVectorFloat32 
decodes vector float32
+func (c *ClusterStatementsSummaryClient) decodeVectorFloat32(data []byte, offset *int) (string, error) {
+	if *offset >= len(data) {
+		return "", fmt.Errorf("insufficient data, cannot decode vector float32")
+	}
+	val := fmt.Sprintf("vector_float32_%d", *offset)
+	*offset += 4
+	return val, nil
+}
+
+// decodeTimeValue decodes time value
+func (c *ClusterStatementsSummaryClient) decodeTimeValue(data []byte, offset *int, flag byte) (string, error) {
+	if *offset+8 > len(data) {
+		return "", fmt.Errorf("insufficient data, cannot decode time value")
+	}
+	packedTime := binary.LittleEndian.Uint64(data[*offset:])
+	*offset += 8
+	val := fmt.Sprintf("time_0x%02x_%d", flag, packedTime)
+	return val, nil
+}
+
+// decodeChunkData decodes data using simplified chunk decoder
+func (c *ClusterStatementsSummaryClient) decodeChunkData(data []byte, tableSchema *TableSchema) (*Chunk, error) {
+	chunk := &Chunk{
+		columns: make([]*Column, len(tableSchema.Columns)),
+	}
+
+	offset := 0
+	for i, col := range tableSchema.Columns {
+		column := &Column{}
+		var err error
+		offset, err = c.decodeColumn(data, offset, column, col)
+		if err != nil {
+			return nil, fmt.Errorf("failed to decode column %d: %v", i, err)
+		}
+		chunk.columns[i] = column
+	}
+
+	return chunk, nil
+}
+
+// decodeColumn decodes a single column
+func (c *ClusterStatementsSummaryClient) decodeColumn(data []byte, offset int, col *Column, colInfo TableSchemaColumn) (int, error) {
+	if offset+8 > len(data) {
+		return offset, fmt.Errorf("insufficient data, cannot read column length")
+	}
+
+	initialOffset := offset
+
+	// Debug: Log raw bytes for critical columns
+	rawBytesLen := 32
+	if offset+rawBytesLen > len(data) {
+		rawBytesLen = len(data) - offset
+	}
+	if rawBytesLen > 0 {
+		log.Printf("GO: Column %d decode start at offset=%d, raw_bytes=%x", colInfo.ID, offset, data[offset:offset+rawBytesLen])
+	}
+
+	// Decode the length (number of rows)
+	col.length = int(binary.LittleEndian.Uint32(data[offset:]))
+	offset += 4
+
+	// Decode the nullCount
+	nullCount := int(binary.LittleEndian.Uint32(data[offset:]))
+	offset += 4
+
+	// Debug: Log parsed header values
+	log.Printf("GO: Column %d parsed header: length=%d, nullCount=%d, type=%d", colInfo.ID, col.length, nullCount, colInfo.Type.Tp)
+
+	// Decode the null bitmap, following TiDB's logic
+	if nullCount > 0 {
+		numNullBitmapBytes := (col.length + 7) / 8
+		if offset+numNullBitmapBytes > len(data) {
+			return offset, fmt.Errorf("insufficient data, cannot read null bitmap: need %d bytes, %d bytes remaining", numNullBitmapBytes, len(data)-offset)
+		}
+		col.nullBitmap = data[offset : offset+numNullBitmapBytes]
+		offset += numNullBitmapBytes
+	} else {
+		// When nullCount is 0, no null bitmap is stored; mark every position as not null instead
+		c.setAllNotNull(col)
+	}
+
+	// Decode the offsets and the data
+	isFixed := c.isFixedLengthType(MySQLType(colInfo.Type.Tp))
+	log.Printf("GO: Column %d type decision: isFixed=%v, type=%d", colInfo.ID, isFixed, colInfo.Type.Tp)
+
+	if isFixed {
+		// Fixed-length type
+		fixedLen := c.getFixedLength(MySQLType(colInfo.Type.Tp))
+		log.Printf("GO: Column %d fixed-length processing: fixedLen=%d", colInfo.ID, fixedLen)
+		if fixedLen > 0 {
+			dataLen := fixedLen * col.length
+			if offset+dataLen > len(data) {
+				return offset, fmt.Errorf("insufficient data, cannot read fixed-length data: need %d bytes, %d bytes remaining", dataLen, len(data)-offset)
+			}
+			log.Printf("GO: Column %d reading fixed data: offset=%d, dataLen=%d", colInfo.ID, offset, dataLen)
+			col.data = data[offset : offset+dataLen]
+			offset += dataLen
+		}
+	} else {
+		// Variable-length type
+		numOffsetBytes := (col.length + 1) * 8
+		if offset+numOffsetBytes > len(data) {
+			return offset, fmt.Errorf("insufficient data, cannot read offsets: need %d bytes, %d bytes remaining", numOffsetBytes, len(data)-offset)
+		}
+
+		log.Printf("GO: Column %d reading variable-length offsets: offset=%d, numOffsetBytes=%d", colInfo.ID, offset, numOffsetBytes)
+
+		// Decode the offsets
+		col.offsets = make([]int64, col.length+1)
+		for i := 0; i <= col.length; i++ {
+			col.offsets[i] = int64(binary.LittleEndian.Uint64(data[offset:]))
+			offset += 8
+		}
+
+		log.Printf("GO: Column %d offsets: %v", colInfo.ID, col.offsets)
+
+		// Decode the data
+		dataLen := int(col.offsets[col.length])
+		if offset+dataLen > len(data) {
+			return offset, fmt.Errorf("insufficient data, cannot read variable-length data: need %d bytes, %d bytes remaining", dataLen, len(data)-offset)
+		}
+		log.Printf("GO: Column %d reading variable data: offset=%d, dataLen=%d", colInfo.ID, offset, dataLen)
+		col.data = data[offset : offset+dataLen]
+		offset += dataLen
+	}
+
+	// Debug: Log final offset
+	log.Printf("GO: Column %d decode complete: final_offset=%d (consumed %d bytes)", colInfo.ID, offset, offset-initialOffset)
+
+	return offset, nil
+}
+
+// setAllNotNull marks every position as not null
+func (c *ClusterStatementsSummaryClient) setAllNotNull(col *Column) {
+	numNullBitmapBytes := (col.length + 7) / 8
+	col.nullBitmap = make([]byte, numNullBitmapBytes)
+	for i := 0; i < numNullBitmapBytes; i++ {
+		col.nullBitmap[i] = 0xFF
+	}
+}
+
+// isFixedLengthType reports whether the type is fixed-length
+func (c *ClusterStatementsSummaryClient) isFixedLengthType(tp MySQLType) bool {
+	switch tp {
+	case TypeTiny, TypeShort, TypeInt24, TypeLong, TypeLonglong, TypeFloat, TypeDouble:
+		return true
+	default:
+		return false
+	}
+}
+
+// getFixedLength returns the byte width of a fixed-length type
+func (c *ClusterStatementsSummaryClient) getFixedLength(tp MySQLType) int {
+	switch tp {
+	case TypeTiny:
+		return 1
+	case TypeShort:
+		return 2
+	case TypeInt24:
+		return 3
+	case TypeLong:
+		return 4
+	case TypeLonglong:
+		return 8
+	case TypeFloat:
+		return 4
+	case TypeDouble:
+		return 8
+	default:
+		return -1
+	}
+}
+
+// extractRowFromChunk extracts a single row from the chunk
+func (c *ClusterStatementsSummaryClient) extractRowFromChunk(chunk *Chunk, rowIdx int, serverAddr string, tableSchema *TableSchema) Row {
+	row := Row{
+		Instance:    serverAddr,
+		ExtraFields: make(map[string]interface{}),
+	}
+
+	// Iterate over all columns and extract their values
+	for colIdx, col := range tableSchema.Columns {
+		columnName := col.Name.O
+		value := c.extractValueFromColumn(chunk, rowIdx, colIdx, col)
+
+		// Set the corresponding field based on the column name
+		switch columnName {
+		case "INSTANCE":
+			row.Instance = c.safeStringValue(value)
+		case "SUMMARY_BEGIN_TIME":
+			row.SummaryBeginTime = c.safeStringValue(value)
+		case "SUMMARY_END_TIME":
+			row.SummaryEndTime = c.safeStringValue(value)
+		case "STMT_TYPE":
+			row.StmtType = c.safeStringValue(value)
+		case "SCHEMA_NAME":
+			row.SchemaName = c.safeStringValue(value)
+		case "DIGEST":
+			row.Digest = c.safeStringValue(value)
+		case "DIGEST_TEXT":
+			row.DigestText = c.safeStringValue(value)
+		case "TABLE_NAMES":
+			row.TableNames = c.safeStringValue(value)
+		case "INDEX_NAMES":
+			row.IndexNames = c.safeStringValue(value)
+		case "SAMPLE_USER":
+			row.SampleUser = c.safeStringValue(value)
+		case "EXEC_COUNT":
+			row.ExecCount = c.safeInt64Value(value)
+		case "SUM_LATENCY":
+			row.SumLatency = c.safeInt64Value(value)
+		case "MAX_LATENCY":
+			row.MaxLatency = c.safeInt64Value(value)
+		case "MIN_LATENCY":
+			row.MinLatency = c.safeInt64Value(value)
+		case "AVG_LATENCY":
+			row.AvgLatency = c.safeInt64Value(value)
+		case "AVG_PARSE_LATENCY":
+			row.AvgParseLatency = c.safeInt64Value(value)
+		case "MAX_PARSE_LATENCY":
+			row.MaxParseLatency = c.safeInt64Value(value)
+		case "AVG_COMPILE_LATENCY":
+			row.AvgCompileLatency = c.safeInt64Value(value)
+		case "MAX_COMPILE_LATENCY":
+
row.MaxCompileLatency = c.safeInt64Value(value) + case "AVG_MEM": + row.AvgMem = c.safeInt64Value(value) + case "MAX_MEM": + row.MaxMem = c.safeInt64Value(value) + case "AVG_DISK": + row.AvgDisk = c.safeInt64Value(value) + case "MAX_DISK": + row.MaxDisk = c.safeInt64Value(value) + case "AVG_AFFECTED_ROWS": + row.AvgAffectedRows = c.safeFloat64Value(value) + case "FIRST_SEEN": + row.FirstSeen = c.safeStringValue(value) + case "LAST_SEEN": + row.LastSeen = c.safeStringValue(value) + case "SAMPLE_SQL": + row.SampleSQL = c.safeStringValue(value) + case "PREV_SAMPLE_TEXT": + row.PrevSampleText = c.safeStringValue(value) + case "PLAN_DIGEST": + row.PlanDigest = c.safeStringValue(value) + case "PLAN": + row.Plan = c.safeStringValue(value) + case "PLAN_CACHE_HITS": + row.PlanCacheHits = c.safeInt64Value(value) + case "PLAN_IN_CACHE": + row.PlanInCache = c.safeInt64Value(value) + case "PLAN_IN_BINDING": + row.PlanInBinding = c.safeInt64Value(value) + case "QUERY_SAMPLE_TEXT": + row.QuerySampleText = c.safeStringValue(value) + case "PREV_SAMPLE_SQL": + row.PrevSampleSQL = c.safeStringValue(value) + case "PLAN_DIGEST_TEXT": + row.PlanDigestText = c.safeStringValue(value) + case "QUERY_SAMPLE_USER": + row.QuerySampleUser = c.safeStringValue(value) + case "QUERY_SAMPLE_HOST": + row.QuerySampleHost = c.safeStringValue(value) + case "QUERY_SAMPLE_DB": + row.QuerySampleDB = c.safeStringValue(value) + case "QUERY_SAMPLE_STATE": + row.QuerySampleState = c.safeStringValue(value) + case "QUERY_SAMPLE_INFO": + row.QuerySampleInfo = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_TYPE": + row.QuerySampleTransType = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_ISOLATION": + row.QuerySampleTransIsolation = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_START_TIME": + row.QuerySampleTransStartTime = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_DURATION": + row.QuerySampleTransDuration = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_STATE": + row.QuerySampleTransState = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_ERROR": + row.QuerySampleTransError = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_TABLES": + row.QuerySampleTransTables = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_INDEXES": + row.QuerySampleTransIndexes = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_LOCK_KEYS": + row.QuerySampleTransLockKeys = c.safeStringValue(value) + case "QUERY_SAMPLE_TRANS_LOCK_TIME": + row.QuerySampleTransLockTime = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_WAIT_TIME": + row.QuerySampleTransWaitTime = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_BACKOFF_TIME": + row.QuerySampleTransBackoffTime = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_RESOLVE_LOCK_TIME": + row.QuerySampleTransResolveLockTime = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_LOCAL_LATCH_WAIT_TIME": + row.QuerySampleTransLocalLatchWaitTime = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_WRITE_KEYS": + row.QuerySampleTransWriteKeys = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_WRITE_SIZE": + row.QuerySampleTransWriteSize = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_PREWRITE_REGION_NUM": + row.QuerySampleTransPrewriteRegionNum = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_TXN_RETRY": + row.QuerySampleTransTxnRetry = c.safeInt64Value(value) + case "QUERY_SAMPLE_TRANS_BACKOFF_TYPES": + row.QuerySampleTransBackoffTypes = c.safeStringValue(value) + default: + // 存储到扩展字段中 + row.ExtraFields[columnName] = value + } + } + + // 设置 TotalTime 为 SumLatency 的别名,保持向后兼容 
+	row.TotalTime = row.SumLatency
+
+	return row
+}
+
+// extractValueFromColumn extracts a value from a chunk column
+func (c *ClusterStatementsSummaryClient) extractValueFromColumn(chunk *Chunk, rowIdx, colIdx int, col TableSchemaColumn) interface{} {
+	column := chunk.Column(colIdx)
+
+	// Check for NULL
+	if column.IsNull(rowIdx) {
+		return nil
+	}
+
+	// Extract the value according to the column type
+	switch MySQLType(col.Type.Tp) {
+	case TypeTiny, TypeShort, TypeInt24, TypeLong, TypeLonglong:
+		return column.GetInt64(rowIdx)
+	case TypeFloat:
+		return column.GetFloat32(rowIdx)
+	case TypeDouble:
+		return column.GetFloat64(rowIdx)
+	case TypeString, TypeVarString, TypeTinyBlob,
+		TypeMediumBlob, TypeLongBlob, TypeBlob, TypeJSON:
+		return column.GetString(rowIdx)
+	case TypeDate, TypeDatetime, TypeTimestamp:
+		// Time types are converted to strings (simplified handling)
+		bytes := column.GetBytes(rowIdx)
+		if len(bytes) >= 8 {
+			// Assume a timestamp encoding
+			timestamp := int64(binary.LittleEndian.Uint64(bytes))
+			return time.Unix(timestamp, 0).Format("2006-01-02 15:04:05")
+		}
+		return column.GetString(rowIdx)
+	case TypeDuration:
+		// Duration type (simplified handling)
+		bytes := column.GetBytes(rowIdx)
+		if len(bytes) >= 8 {
+			duration := int64(binary.LittleEndian.Uint64(bytes))
+			return time.Duration(duration).String()
+		}
+		return column.GetString(rowIdx)
+	case TypeNewDecimal:
+		// Decimal type (simplified handling)
+		return column.GetString(rowIdx)
+	case TypeEnum:
+		// Enum type (simplified handling)
+		return column.GetString(rowIdx)
+	case TypeSet:
+		// Set type (simplified handling)
+		return column.GetString(rowIdx)
+	case TypeBit:
+		// Bit type
+		bytes := column.GetBytes(rowIdx)
+		return string(bytes)
+	default:
+		// Fall back to reading a string
+		return column.GetString(rowIdx)
+	}
+}
+
+// safeStringValue safely converts an interface{} value to a string
+func (c *ClusterStatementsSummaryClient) safeStringValue(value interface{}) string {
+	if value == nil {
+		return ""
+	}
+	if str, ok := value.(string); ok {
+		return str
+	}
+	// Handle []uint8 (byte slice)
+	if bytes, ok := value.([]uint8); ok {
+		return string(bytes)
+	}
+	// Handle []byte
+	if bytes, ok := value.([]byte); ok {
+		return string(bytes)
+	}
+	return ""
+}
+
+// safeInt64Value safely converts an interface{} value to an int64
+func (c *ClusterStatementsSummaryClient) safeInt64Value(value interface{}) int64 {
+	if value == nil {
+		return 0
+	}
+	if val, ok := value.(int64); ok {
+		return val
+	}
+	// Handle uint64
+	if val, ok := value.(uint64); ok {
+		return int64(val)
+	}
+	// Handle int
+	if val, ok := value.(int); ok {
+		return int64(val)
+	}
+	// Handle uint
+	if val, ok := value.(uint); ok {
+		return int64(val)
+	}
+	return 0
+}
+
+// safeFloat64Value safely converts interface{} to float64
+func (c *ClusterStatementsSummaryClient) safeFloat64Value(value interface{}) float64 {
+	if value == nil {
+		return 0.0
+	}
+	if val, ok := value.(float64); ok {
+		return val
+	}
+	// Handle float32 type
+	if val, ok := value.(float32); ok {
+		return float64(val)
+	}
+	// Handle int64 type
+	if val, ok := value.(int64); ok {
+		return float64(val)
+	}
+	// Handle uint64 type
+	if val, ok := value.(uint64); ok {
+		return float64(val)
+	}
+	// Handle int type
+	if val, ok := value.(int); ok {
+		return float64(val)
+	}
+	// Handle uint type
+	if val, ok := value.(uint); ok {
+		return float64(val)
+	}
+	return 0.0
+}
+
+// parseRowData parses a single row of data
+func (c *ClusterStatementsSummaryClient) parseRowData(data []byte, offset int, serverAddr string, columnNames map[int64]string, tableSchema *TableSchema) (*Row, int, error) {
+	if offset >= len(data) {
+		return nil, offset, nil
+	}
+
+	// Create a map for the row's column values
+	rowData := make(map[string]interface{})
+
+	// Parse each column's data
+	for i, col := range tableSchema.Columns {
+		if offset >= len(data) {
+			break
+		}
+
+
columnName := col.Name.O
+		value, newOffset, err := c.parseColumnValue(data, offset, col.Type.Tp)
+		if err != nil {
+			return nil, offset, fmt.Errorf("failed to parse column %s: %v", columnName, err)
+		}
+
+		rowData[columnName] = value
+		offset = newOffset
+
+		// Limit the number of parsed columns to avoid decoding too much data
+		if i >= 20 { // only parse the first 20 important columns
+			break
+		}
+	}
+
+	// Build the Row object
+	row := &Row{
+		Instance: serverAddr,
+	}
+
+	// Extract the key fields
+	if digestText, ok := rowData["DIGEST_TEXT"].(string); ok {
+		row.DigestText = digestText
+	} else {
+		row.DigestText = "UNKNOWN_QUERY"
+	}
+
+	if execCount, ok := rowData["EXEC_COUNT"].(int64); ok {
+		row.ExecCount = execCount
+	} else {
+		row.ExecCount = 0
+	}
+
+	if sumLatency, ok := rowData["SUM_LATENCY"].(int64); ok {
+		row.TotalTime = sumLatency
+	} else {
+		row.TotalTime = 0
+	}
+
+	return row, offset, nil
+}
+
+// parseColumnValue parses a single column value
+func (c *ClusterStatementsSummaryClient) parseColumnValue(data []byte, offset int, columnType int32) (interface{}, int, error) {
+	if offset >= len(data) {
+		return nil, offset, nil
+	}
+
+	// Read the length prefix
+	if offset+1 > len(data) {
+		return nil, offset, fmt.Errorf("insufficient data, cannot read length prefix")
+	}
+
+	length := int(data[offset])
+	offset++
+
+	if offset+length > len(data) {
+		return nil, offset, fmt.Errorf("insufficient data: length=%d, remaining=%d", length, len(data)-offset)
+	}
+
+	valueData := data[offset : offset+length]
+	offset += length
+
+	// Parse the value according to its type
+	switch columnType {
+	case 15: // VARCHAR
+		return string(valueData), offset, nil
+	case 8: // BIGINT
+		if length == 8 {
+			value := int64(valueData[0])<<56 | int64(valueData[1])<<48 | int64(valueData[2])<<40 | int64(valueData[3])<<32 |
+				int64(valueData[4])<<24 | int64(valueData[5])<<16 | int64(valueData[6])<<8 | int64(valueData[7])
+			return value, offset, nil
+		}
+		return int64(0), offset, nil
+	case 1: // TINYINT
+		if length == 1 {
+			return int64(valueData[0]), offset, nil
+		}
+		return int64(0), offset, nil
+	case 12: // DATETIME
+		// Simplified handling: return a string
+		return string(valueData), offset, nil
+	default:
+		// Treat the value as a string by default
+		return string(valueData), offset, nil
+	}
+}
+
+// mergeResults merges the results from all nodes
+func (c *ClusterStatementsSummaryClient) mergeResults(resultChan <-chan QueryResult) error {
+	var allRows []Row
+	errorCount := 0
+	successCount := 0
+
+	fmt.Println("\n=== Processing Query Results ===")
+
+	for result := range resultChan {
+		if result.Error != nil {
+			log.Printf("❌ Query to server %s failed: %v", result.ServerAddr, result.Error)
+			errorCount++
+			continue
+		}
+
+		successCount++
+		fmt.Printf("✅ Server %s returned %d rows\n", result.ServerAddr, len(result.Rows))
+
+		// Detailed rows are not printed here; they are shown as a single table at the end
+
+		allRows = append(allRows, result.Rows...)
+	}
+
+	fmt.Printf("\n=== Final Statistics ===\n")
+	fmt.Printf("Servers queried successfully: %d/%d\n", successCount, len(c.servers))
+	fmt.Printf("Servers failed: %d/%d\n", errorCount, len(c.servers))
+	fmt.Printf("Total rows: %d\n", len(allRows))
+
+	// Per-server statistics
+	if len(allRows) > 0 {
+		serverStats := make(map[string]int)
+		for _, row := range allRows {
+			serverStats[row.Instance]++
+		}
+
+		fmt.Printf("\n=== Data Distribution ===\n")
+		for server, count := range serverStats {
+			fmt.Printf("Server %s: %d rows\n", server, count)
+		}
+
+		// Display the data as a table
+		c.printTableFormat(allRows)
+	}
+
+	return nil
+}
+
+// printTableFormat prints the data in table form
+func (c *ClusterStatementsSummaryClient) printTableFormat(rows []Row) {
+	fmt.Printf("\n=== CLUSTER_STATEMENTS_SUMMARY Data Table ===\n")
+
+	// Define the table columns
+	columns := []struct {
+		name   string
+		width  int
+		getter func(Row) string
+	}{
+		{"Instance", 20, func(r Row) string { return r.Instance }},
+		{"StmtType", 10, func(r Row) string { return r.StmtType }},
+		{"SchemaName", 15, func(r Row) string { return r.SchemaName }},
+		{"DigestText", 50, func(r Row) string { return c.truncateString(r.DigestText, 47) }},
+		{"ExecCount", 10, func(r Row) string { return fmt.Sprintf("%d", r.ExecCount) }},
+		{"SumLatency(μs)", 15, func(r Row) string { return fmt.Sprintf("%d", r.SumLatency) }},
+		{"MaxLatency(μs)", 15, func(r Row) string { return fmt.Sprintf("%d", r.MaxLatency) }},
+		{"AvgLatency(μs)", 15, func(r Row) string { return fmt.Sprintf("%d", r.AvgLatency) }},
+		{"AvgMem(KB)", 12, func(r Row) string { return fmt.Sprintf("%d", r.AvgMem) }},
+		{"MaxMem(KB)", 12, func(r Row) string { return fmt.Sprintf("%d", r.MaxMem) }},
+		{"PlanCacheHits", 15, func(r Row) string { return fmt.Sprintf("%d", r.PlanCacheHits) }},
+		{"PlanInCache", 12, func(r Row) string { return fmt.Sprintf("%d", r.PlanInCache) }},
+		{"PlanInBinding", 15, func(r Row) string { return fmt.Sprintf("%d", r.PlanInBinding) }},
+	}
+
+	// Print the header
+	fmt.Print("|")
+	for _, col := range columns {
+		fmt.Printf(" %-*s |", col.width, col.name)
+	}
+	fmt.Println()
+
+	// Print the separator line
+	fmt.Print("|")
+	for _, col := range columns {
+		fmt.Printf(" %-*s |", col.width, strings.Repeat("-", col.width))
+	}
+	fmt.Println()
+
+	// Print the data rows
+	for i, row := range rows {
+		fmt.Print("|")
+		for _, col := range columns {
+			value := col.getter(row)
+			fmt.Printf(" %-*s |", col.width, value)
+		}
+		fmt.Println()
+
+		// Add a separator line every 10 rows
+		if (i+1)%10 == 0 && i < len(rows)-1 {
+			fmt.Print("|")
+			for _, col := range columns {
+				fmt.Printf(" %-*s |", col.width, strings.Repeat("-", col.width))
+			}
+			fmt.Println()
+		}
+	}
+
+	fmt.Printf("\n%d rows displayed in total\n", len(rows))
+}
+
+// truncateString truncates a string to the given maximum length
+func (c *ClusterStatementsSummaryClient) truncateString(s string, maxLen int) string {
+	if len(s) <= maxLen {
+		return s
+	}
+	return s[:maxLen-3] + "..."
+}
+
+// GetTiDBServersFromEtcd gets TiDB server information from etcd (mock implementation)
+func GetTiDBServersFromEtcd(ctx context.Context) ([]ServerInfo, error) {
+	// In a real deployment this would read server information from etcd
+	// For demonstration, a static configuration is used
+	return []ServerInfo{
+		{
+			ServerType: "tidb",
+			Address:    "127.0.0.1:4000",
+			StatusAddr: "127.0.0.1:10080",
+			StatusPort: 10080,
+			IP:         "127.0.0.1",
+		},
+		{
+			ServerType: "tidb",
+			Address:    "127.0.0.1:4001",
+			StatusAddr: "127.0.0.1:10081",
+			StatusPort: 10081,
+			IP:         "127.0.0.1",
+		},
+	}, nil
+}
+
+func main() {
+	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
+	defer cancel()
+
+	fmt.Println("=== TiDB CLUSTER_STATEMENTS_SUMMARY gRPC Coprocessor Client ===")
+	fmt.Println("Version: v2.1 - fixes the SourceStmt nil-pointer issue")
+	fmt.Println("Fix: added the Context.SourceStmt field to resolve a server-side panic")
+	fmt.Println()
+
+	// Get TiDB server information
+	servers, err := GetTiDBServersFromEtcd(ctx)
+	if err != nil {
+		log.Fatalf("Failed to get server information: %v", err)
+	}
+
+	if len(servers) == 0 {
+		log.Fatal("No TiDB servers found")
+	}
+
+	fmt.Printf("Found %d TiDB servers:\n", len(servers))
+	for _, server := range servers {
+		fmt.Printf(" - MySQL port: %s, gRPC port: %s\n", server.Address, server.StatusAddr)
+	}
+	fmt.Println()
+
+	// Create client
+	client := NewClusterStatementsSummaryClient(servers)
+
+	// Query cluster statements summary
+	fmt.Println("Starting CLUSTER_STATEMENTS_SUMMARY query via gRPC coprocessor...")
+	fmt.Println("Note: This will directly send coprocessor requests to each TiDB node's StatusPort")
+	fmt.Println("If errors occur, that is expected, since these are test requests")
+	fmt.Println()
+
+	err = client.QueryClusterStatementsSummary(ctx)
+	if err != nil {
+		log.Fatalf("Query failed: %v", err)
+	}
+
+	fmt.Println("\n=== Query Complete ===")
+	fmt.Println("This client demonstrates TiDB's internal mechanism for handling CLUSTER_STATEMENTS_SUMMARY queries:")
+	fmt.Println("1. ✅ Connect to each TiDB node's StatusPort via gRPC")
+	fmt.Println("2. ✅ Send coprocessor DAG requests")
+	fmt.Println("3. ✅ Obtain responses from each node in parallel")
+	fmt.Println("4. ✅ Merge results and add instance identifiers")
+	fmt.Println("5. 
✅ Error handling and fault tolerance mechanisms") + fmt.Println() + fmt.Println("Note: Since CLUSTER_STATEMENTS_SUMMARY requires specific table structure and permissions,") + fmt.Println("actual production environments require more precise DAG request construction.") +} diff --git a/src/sources/system_tables/cluster_statements_summary_client/go.mod b/src/sources/system_tables/cluster_statements_summary_client/go.mod new file mode 100644 index 0000000..45cec3f --- /dev/null +++ b/src/sources/system_tables/cluster_statements_summary_client/go.mod @@ -0,0 +1,19 @@ +module cluster_statements_summary_client + +go 1.24.3 + +require ( + github.com/gogo/protobuf v1.3.2 + github.com/pingcap/kvproto v0.0.0-20250928021345-732fa68a7b72 + github.com/pingcap/tipb v0.0.0-20250928030846-9fd33ded6f2c + google.golang.org/grpc v1.75.1 +) + +require ( + github.com/golang/protobuf v1.5.4 // indirect + golang.org/x/net v0.41.0 // indirect + golang.org/x/sys v0.33.0 // indirect + golang.org/x/text v0.26.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 // indirect + google.golang.org/protobuf v1.36.6 // indirect +) diff --git a/src/sources/system_tables/cluster_statements_summary_client/go.sum b/src/sources/system_tables/cluster_statements_summary_client/go.sum new file mode 100644 index 0000000..9068dd8 --- /dev/null +++ b/src/sources/system_tables/cluster_statements_summary_client/go.sum @@ -0,0 +1,71 @@ +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/pingcap/kvproto v0.0.0-20250928021345-732fa68a7b72 h1:uJNzG3mFTTLyBPTo6cov1DbICznYgE8OWuYcuAPXbFE= +github.com/pingcap/kvproto v0.0.0-20250928021345-732fa68a7b72/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= +github.com/pingcap/tipb v0.0.0-20250928030846-9fd33ded6f2c h1:tddMjEiXU0d1VlJ+yHwim4gINeHmFR9CCkitatuby2c= +github.com/pingcap/tipb v0.0.0-20250928030846-9fd33ded6f2c/go.mod h1:RM8iRcMalzOthG2XJxnNBniM4xFGb/lDwHUwqkaVzt4= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel 
v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= +go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= +go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= +go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= +go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= +golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= +golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod 
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7 h1:pFyd6EwwL2TqFf8emdthzeX+gZE1ElRq3iM8pui4KBY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250707201910-8d1bb00bc6a7/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/grpc v1.75.1 h1:/ODCNEuf9VghjgO3rqLcfg8fiOP0nSluljWFlDxELLI= +google.golang.org/grpc v1.75.1/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= +google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= diff --git a/src/sources/system_tables/collector_factory.rs b/src/sources/system_tables/collector_factory.rs new file mode 100644 index 0000000..a988058 --- /dev/null +++ b/src/sources/system_tables/collector_factory.rs @@ -0,0 +1,33 @@ +use crate::sources::system_tables::data_collector::{ + CollectionError, CollectionMethod, CollectorConfig, DataCollector, +}; + +use crate::sources::system_tables::collectors::{CoprocessorCollector, SqlCollector}; + +/// Simplified collector factory - direct creation without complex abstractions +pub struct CollectorFactory; + +impl CollectorFactory { + /// Create a collector instance based on method and config + pub fn create_collector( + method: CollectionMethod, + config: CollectorConfig, + ) -> Result, CollectionError> { + match method { + CollectionMethod::Sql => { + let collector = SqlCollector::new(config)?; + Ok(Box::new(collector)) + } + CollectionMethod::Coprocessor => { + let collector = CoprocessorCollector::new(config)?; + Ok(Box::new(collector)) + } + CollectionMethod::HttpApi => Err(CollectionError::ConfigurationError( + "HTTP API collection method not implemented yet".to_string(), + )), + CollectionMethod::CustomGrpc => Err(CollectionError::ConfigurationError( + "Custom gRPC collection method not implemented yet".to_string(), + )), + } + } +} diff --git a/src/sources/system_tables/collectors/coprocessor_collector.rs b/src/sources/system_tables/collectors/coprocessor_collector.rs new file mode 100644 index 0000000..13767f6 --- /dev/null +++ b/src/sources/system_tables/collectors/coprocessor_collector.rs @@ -0,0 +1,2021 @@ +use std::collections::HashMap; +use std::error::Error as StdError; +use std::time::{Duration, Instant}; + +use async_trait::async_trait; +use http; +use prost::Message; +use serde_json::Value; +use tonic::transport::Channel; +use tracing::{debug, error, info, warn}; + +// TLS proxy implementation for gRPC connections +mod tls_proxy { + use std::pin::Pin; + + use tokio::io::AsyncWriteExt; + use tokio::net::{TcpListener, TcpStream}; + use tokio_openssl::SslStream; + use tracing::{error, info}; + use vector_lib::tls::{tls_connector_builder, MaybeTlsSettings, TlsConfig}; + + /// Create a TLS proxy for gRPC connections, similar to topsql implementation + pub async fn create_tls_proxy( + tls_config: Option<&TlsConfig>, + address: &str, + ) -> Result> { + info!("Creating TLS proxy for address: {}", address); 
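+ // Flow of the proxy set up below: the TLS handshake to the real endpoint is
+ // performed up front via tls_connect(), then a plaintext TcpListener is bound
+ // on an ephemeral local port ("0.0.0.0:0"). The caller points tonic at
+ // http://127.0.0.1:<port>, and the spawned task copies bytes in both
+ // directions, so the gRPC channel itself stays plain HTTP/2 while the wire
+ // traffic to the remote endpoint is TLS-wrapped.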
+ + let outbound = tls_connect(tls_config, address).await?; + let listener = TcpListener::bind("0.0.0.0:0").await?; + let local_address = listener.local_addr()?; + + info!("TLS proxy listening on port: {}", local_address.port()); + + tokio::spawn(async move { + let res = accept_and_proxy(listener, outbound).await; + if let Err(error) = res { + error!("TLS proxy failed: {}", error); + } + }); + + Ok(local_address.port()) + } + + async fn tls_connect( + tls_config: Option<&TlsConfig>, + address: &str, + ) -> Result, Box> { + let uri = address.parse::()?; + let host = uri.host().unwrap_or_default(); + let port = uri.port().map(|p| p.as_u16()).unwrap_or(443); + + info!("Connecting to TLS endpoint: {}:{}", host, port); + + let raw_stream = TcpStream::connect(format!("{}:{}", &host, port)).await?; + + let tls_settings = MaybeTlsSettings::tls_client(tls_config)?; + let mut config_builder = tls_connector_builder(&tls_settings)?; + config_builder.set_alpn_protos(b"\x02h2")?; + + let config = config_builder.build().configure()?; + let ssl = config.into_ssl(host)?; + + let mut stream = SslStream::new(ssl, raw_stream)?; + Pin::new(&mut stream).connect().await?; + + info!("TLS connection established to {}:{}", host, port); + Ok(stream) + } + + async fn accept_and_proxy( + listener: TcpListener, + outbound: SslStream, + ) -> Result<(), Box> { + let (inbound, _) = listener.accept().await?; + drop(listener); + transfer(inbound, outbound).await?; + Ok(()) + } + + async fn transfer( + mut inbound: tokio::net::TcpStream, + outbound: SslStream, + ) -> Result<(), Box> { + let (mut ri, mut wi) = inbound.split(); + let (mut ro, mut wo) = tokio::io::split(outbound); + + let client_to_server = async { + tokio::io::copy(&mut ri, &mut wo).await?; + wo.shutdown().await + }; + + let server_to_client = async { + tokio::io::copy(&mut ro, &mut wi).await?; + wi.shutdown().await + }; + + tokio::try_join!(client_to_server, server_to_client)?; + + Ok(()) + } +} + +use crate::sources::system_tables::data_collector::{ + CollectionError, CollectionMetadata, CollectionMethod, CollectionResult, CollectorConfig, + CollectorConfigType, DataCollector, +}; +use crate::sources::system_tables::TableConfig; + +// Use generated protobuf types from proto/tipb_simple.proto +include!(concat!(env!("OUT_DIR"), "/tipb.rs")); + +// Note: All proto types are now defined in the generated code + +pub struct TikvClient { + inner: tonic::client::Grpc, +} + +impl TikvClient +where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: tonic::codegen::Body + Send + 'static, + ::Error: Into + Send, +{ + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + + pub async fn coprocessor( + &mut self, + request: impl tonic::IntoRequest, + ) -> Result, tonic::Status> { + self.inner.ready().await.map_err(|_| { + tonic::Status::new(tonic::Code::Unknown, "Service was not ready".to_string()) + })?; + let codec = tonic::codec::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static("/tikvpb.Tikv/Coprocessor"); + self.inner.unary(request.into_request(), path, codec).await + } +} + +/// Coprocessor-based data collector using gRPC protocol +pub struct CoprocessorCollector { + config: CollectorConfig, + grpc_endpoint: String, + client_channel: Option, + cached_schemas: std::sync::Mutex>, +} + +impl CoprocessorCollector { + /// Create a new coprocessor collector + pub fn new(config: CollectorConfig) -> Result { + // Extract coprocessor-specific config + let (host, port) = match 
&config.config_type { + CollectorConfigType::Coprocessor { host, port, .. } => (host.clone(), *port), + _ => { + return Err(CollectionError::ConfigurationError( + "Invalid config type for CoprocessorCollector".to_string(), + )) + } + }; + + // Build gRPC endpoint using topsql-style port calculation + // TiDB secondary port is typically used for gRPC (status port) + // For standard TiDB setup: MySQL port 4000 -> status port 10080 + let grpc_port = if port == 4000 { 10080 } else { port + 6080 }; + let grpc_scheme = if matches!( + config.config_type, + CollectorConfigType::Coprocessor { tls: Some(_), .. } + ) { + "https" + } else { + "http" + }; + let grpc_endpoint = format!("{}://{}:{}", grpc_scheme, host, grpc_port); + + Ok(Self { + config, + grpc_endpoint, + client_channel: None, + cached_schemas: std::sync::Mutex::new(HashMap::new()), + }) + } + + /// Establish gRPC connection using topsql-style TLS proxy approach + async fn create_grpc_connection(&self) -> Result { + info!("Creating gRPC connection to: {}", self.grpc_endpoint); + + // Extract TLS config for topsql-style handling + let tls_config = match &self.config.config_type { + CollectorConfigType::Coprocessor { tls, .. } => tls.as_ref(), + _ => None, + }; + + let endpoint = if tls_config.is_none() { + // No TLS - direct connection like topsql + info!("No TLS config, using direct HTTP connection"); + Channel::from_shared(self.grpc_endpoint.clone()) + .map_err(|e| { + CollectionError::ConfigurationError(format!("Invalid endpoint: {}", e)) + })? + .http2_keep_alive_interval(Duration::from_secs(300)) + .keep_alive_timeout(Duration::from_secs(10)) + .keep_alive_while_idle(true) + } else { + // TLS enabled - use topsql-style TLS proxy approach + info!("TLS enabled, creating TLS proxy for gRPC connection"); + + // Convert our TlsConfig to vector_lib::tls::TlsConfig + let vector_tls_config = self.convert_to_vector_tls_config(tls_config.unwrap())?; + + // Create TLS proxy and get local port + let proxy_port = + tls_proxy::create_tls_proxy(Some(&vector_tls_config), &self.grpc_endpoint) + .await + .map_err(|e| { + CollectionError::ConfigurationError(format!( + "Failed to create TLS proxy: {}", + e + )) + })?; + + info!("TLS proxy created on local port: {}", proxy_port); + + // Connect to local proxy instead of remote endpoint + let proxy_endpoint = format!("http://127.0.0.1:{}", proxy_port); + Channel::from_shared(proxy_endpoint) + .map_err(|e| { + CollectionError::ConfigurationError(format!("Invalid proxy endpoint: {}", e)) + })? + .http2_keep_alive_interval(Duration::from_secs(300)) + .keep_alive_timeout(Duration::from_secs(10)) + .keep_alive_while_idle(true) + }; + + let channel = endpoint.connect().await.map_err(|e| { + let error_details = format!( + "gRPC connection failed: {} (endpoint: {}, TLS enabled: {})", + e, + self.grpc_endpoint, + matches!( + self.config.config_type, + CollectorConfigType::Coprocessor { tls: Some(_), .. 
} + ) + ); + + // Print additional error context for debugging + if let Some(source) = e.source() { + error!("gRPC connection error source: {}", source); + } + + CollectionError::ConnectionError(error_details) + })?; + + Ok(channel) + } + + /// Convert our TlsConfig to vector_lib::tls::TlsConfig for TLS proxy + fn convert_to_vector_tls_config( + &self, + tls: &crate::sources::system_tables::TlsConfig, + ) -> Result { + let mut vector_tls = vector_lib::tls::TlsConfig::default(); + + // Set verification options + if let Some(verify_certificate) = tls.verify_certificate { + vector_tls.verify_certificate = Some(verify_certificate); + } + if let Some(verify_hostname) = tls.verify_hostname { + vector_tls.verify_hostname = Some(verify_hostname); + } + + // Set certificate files + if let Some(ca_file) = &tls.ca_file { + vector_tls.ca_file = Some(ca_file.clone()); + } + if let Some(crt_file) = &tls.crt_file { + vector_tls.crt_file = Some(crt_file.clone()); + } + if let Some(key_file) = &tls.key_file { + vector_tls.key_file = Some(key_file.clone()); + } + + Ok(vector_tls) + } + + /// Get table schema via HTTP API + async fn get_table_schema_via_http( + &self, + table: &TableConfig, + ) -> Result { + // Extract host, port, and TLS config from coprocessor config + let (host, port, tls) = match &self.config.config_type { + CollectorConfigType::Coprocessor { + host, port, tls, .. + } => (host, *port, tls), + _ => { + return Err(CollectionError::ConfigurationError( + "Invalid config type for coprocessor table schema fetch".to_string(), + )) + } + }; + let grpc_port = if port == 4000 { 10080 } else { port + 6080 }; // TiDB status port + + // Use HTTPS if TLS config is provided, otherwise use HTTP + let protocol = if tls.is_some() { "https" } else { "http" }; + let url = format!( + "{}://{}:{}/schema/{}/{}", + protocol, host, grpc_port, table.source_schema, table.source_table + ); + + info!("Fetching schema from: {}", url); + + // Create HTTP client using shared helper to ensure consistent TLS behavior across the project + let client = crate::utils::http::build_reqwest_client( + tls.clone(), + Some(Duration::from_secs(10)), + Some(Duration::from_secs(5)), + ) + .await + .map_err(|e| { + CollectionError::ConfigurationError(format!("Failed to build HTTP client: {}", e)) + })?; + + let response = client + .get(&url) + .timeout(Duration::from_secs(10)) + .send() + .await + .map_err(|e| CollectionError::NetworkError(format!("HTTP request failed: {}", e)))?; + + if !response.status().is_success() { + return Err(CollectionError::NetworkError(format!( + "HTTP request failed with status: {}", + response.status() + ))); + } + + let schema_json: serde_json::Value = response + .json() + .await + .map_err(|e| CollectionError::ParseError(format!("Failed to parse JSON: {}", e)))?; + + // Parse schema JSON and create TableSchema + let table_id = schema_json["id"].as_i64().unwrap_or(0); + + info!( + "Parsed table schema from HTTP API: table_id={}, columns_count={}", + table_id, + schema_json["cols"].as_array().map(|a| a.len()).unwrap_or(0) + ); + + let columns = if let Some(cols) = schema_json["cols"].as_array() { + let parsed_cols: Vec<_> = cols + .iter() + .map(|col| { + let name = col["name"]["O"].as_str().map(|s| s.to_string()); + TableColumn { + id: col["id"].as_i64().unwrap_or(0), + tp: col["type"]["Tp"].as_i64().unwrap_or(15) as i32, // Use "Tp" not "tp" + name, + } + }) + .collect(); + + info!("Parsed {} columns from schema", parsed_cols.len()); + + parsed_cols + } else { + info!("No columns found in schema"); + 
Vec::new() + }; + + Ok(TableSchema { + id: table_id, + columns, + }) + } + + /// Build coprocessor request for the given table schema + fn build_coprocessor_request( + &self, + table_schema: &TableSchema, + ) -> Result { + info!( + "Building coprocessor request for table_id: {}, columns: {}", + table_schema.id, + table_schema.columns.len() + ); + + // Build DAG request using tipb proto + // Ensure we have schema information - required for proper column mapping + if table_schema.columns.is_empty() { + return Err(CollectionError::ConfigurationError(format!( + "No schema information available for table_id {}. Schema is required for coprocessor requests.", + table_schema.id + ))); + } + + let output_offsets: Vec = (0..table_schema.columns.len() as u32).collect(); + + // Build columns for TableScan + let columns: Vec = table_schema + .columns + .iter() + .map(|col| ColumnInfo { + column_id: col.id, + tp: col.tp, + }) + .collect(); + + let dag_request = DagRequest { + start_ts_fallback: 0, + executors: vec![Executor { + tp: ExecType::TypeTableScan as i32, + tbl_scan: Some(TableScan { + table_id: table_schema.id, + columns, + desc: false, + }), + executor_id: format!("table_scan_{}", table_schema.id), + // Set all other fields to None/default to match Go version + idx_scan: None, + selection: None, + aggregation: None, + top_n: None, + limit: None, + exchange_receiver: None, + join: None, + kill: None, + exchange_sender: None, + projection: None, + partition_table_scan: None, + sort: None, + window: None, + fine_grained_shuffle_stream_count: 0, + fine_grained_shuffle_batch_size: 0, + expand: None, + expand2: None, + broadcast_query: None, + cte_sink: None, + cte_source: None, + index_lookup: None, + parent_idx: 0, + }], + time_zone_offset: 28800, // Use Asia/Shanghai timezone like Go version + flags: 0, + output_offsets: output_offsets.clone(), + collect_range_counts: false, + max_warning_count: 0, + encode_type: EncodeType::TypeDefault as i32, // Use TypeDefault like Go version + sql_mode: 0, + time_zone_name: "Asia/Shanghai".to_string(), // Use Asia/Shanghai like Go version + collect_execution_summaries: true, // Enable execution summaries like Go version + max_allowed_packet: 1024 * 1024 * 16, // 16MB + chunk_memory_layout: Some(ChunkMemoryLayout { + endian: Endian::LittleEndian as i32, + }), + is_rpn_expr: false, + user: Some(UserIdentity { + user_name: "root".to_string(), // Use root like Go code + user_host: "%".to_string(), + }), + root_executor: None, + force_encode_type: false, + div_precision_increment: 4, + intermediate_output_channels: vec![], + }; + + info!( + "DAG request: executors={}, output_offsets={:?}, encode_type={:?}", + dag_request.executors.len(), + dag_request.output_offsets, + dag_request.encode_type + ); + + // Serialize DAG request + let data = dag_request.encode_to_vec(); + info!("Serialized DAG request size: {} bytes", data.len()); + + // Build coprocessor request with proper field order matching official proto + let cop_request = CoprocessorRequest { + context: Some(Context { + region_id: 1, + region_epoch: Some(RegionEpoch { + conf_ver: 1, + version: 1, + }), + peer: Some(Peer { id: 1, store_id: 1 }), + source_stmt: Some(SourceStmt { + connection_id: 12345, + session_alias: "cluster_statements_summary_client".to_string(), + }), + }), + tp: 103, // ReqTypeDAG + data, + ranges: { + // Generate KeyRange dynamically based on table_id + // Following TiDB's key encoding: "t[tableID]_r" + // 1. 't' prefix (1 byte) + // 2. 
table_id encoded with XOR signMask and big-endian (8 bytes) + // 3. '_r' separator (2 bytes) + + // Encode table ID using TiDB's codec.EncodeInt: + // EncodeIntToCmpUint(v) = v XOR 0x8000000000000000 + const SIGN_MASK: u64 = 0x8000000000000000; + let encoded_table_id = (table_schema.id as u64) ^ SIGN_MASK; + let table_id_bytes = encoded_table_id.to_be_bytes(); // Big-endian + + // Build start key: 't' + encoded_table_id (table prefix only) + let mut start = vec![b't']; // 't' prefix + start.extend_from_slice(&table_id_bytes); // Encoded table ID (8 bytes) + + // Build end key: 't' + (encoded_table_id + 1) (next table prefix) + // This represents the next table's prefix boundary + let end_table_id = encoded_table_id + 1; + let end_table_id_bytes = end_table_id.to_be_bytes(); + let mut end = vec![b't']; // 't' prefix + end.extend_from_slice(&end_table_id_bytes); // Next table ID (8 bytes) + + let key_range = KeyRange { + start: start.clone(), + end: end.clone(), + }; + + info!( + "Using TiDB-encoded KeyRange for table_id {}: start={:?}, end={:?}", + table_schema.id, key_range.start, key_range.end + ); + + vec![key_range] + }, + is_cache_enabled: false, + cache_if_match_version: 0, + start_ts: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(), + schema_ver: 0, + is_trace_enabled: false, + paging_size: 0, + connection_id: 12345, + connection_alias: "cluster_statements_summary_client".to_string(), + }; + + Ok(cop_request) + } + + /// Perform actual coprocessor collection via gRPC + async fn perform_coprocessor_collection( + &self, + request: &CoprocessorRequest, + table: &TableConfig, + ) -> Result>, CollectionError> { + let channel = self.client_channel.as_ref().ok_or_else(|| { + CollectionError::ConfigurationError("gRPC channel not initialized".to_string()) + })?; + + // Create TiKV client + let mut client = TikvClient::new(channel.clone()); + + // Debug logging for request details + info!( + "Sending coprocessor request for table: {}, tp: {}, ranges: {}, has_context: {}", + table.source_table, + request.tp, + request.ranges.len(), + request.context.is_some() + ); + + // Debug the serialized request + let serialized = request.encode_to_vec(); + info!( + "Serialized coprocessor request size: {} bytes, first 64 bytes: {:?}", + serialized.len(), + &serialized[..std::cmp::min(64, serialized.len())] + ); + + let response = client + .coprocessor(request.clone()) + .await + .map_err(|e| CollectionError::NetworkError(format!("gRPC request failed: {}", e)))?; + + let cop_response = response.into_inner(); + + // Debug response details + info!( + "Received coprocessor response: data_size={}, other_error='{}'", + cop_response.data.len(), + cop_response.other_error + ); + + // Debug first 100 bytes of response data + if !cop_response.data.is_empty() { + let preview_size = std::cmp::min(100, cop_response.data.len()); + info!( + "Response data preview ({} bytes): {:?}", + preview_size, + &cop_response.data[..preview_size] + ); + } else { + info!("Response data is completely empty - no bytes received"); + } + + // Check for errors in response + if !cop_response.other_error.is_empty() { + return Err(CollectionError::QueryError(format!( + "Coprocessor error: {}", + cop_response.other_error + ))); + } + + // Note: Our protobuf definition may be incomplete. + // The TiDB CoprocessorResponse should have region_error and other fields, + // but our current definition only has data and other_error. 
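+ // Parsing flow (see parse_coprocessor_response below): the response `data`
+ // bytes are decoded as a tipb SelectResponse, the table schema is fetched via
+ // the HTTP API (or served from the in-process cache), each Chunk's rows_data
+ // is walked row by row, and individual columns are decoded with the TiDB
+ // datum codec (flag byte + payload), keyed by the column names from the
+ // schema.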
+ + // Parse the response data + self.parse_coprocessor_response(&cop_response, table).await + } + + /// Parse coprocessor response data with enhanced schema management and generalization + async fn parse_coprocessor_response( + &self, + response: &CoprocessorResponse, + table: &TableConfig, + ) -> Result>, CollectionError> { + info!( + "Parsing coprocessor response for table {}.{}: data_size={} bytes", + table.source_schema, + table.source_table, + response.data.len() + ); + + // Validate response data + if response.data.is_empty() { + warn!( + "Coprocessor response is empty for table {}.{}. This may indicate incorrect request parameters or no data.", + table.source_schema, table.source_table + ); + return Ok(Vec::new()); + } + + // Get or fetch table schema with caching + let table_schema = self.get_or_cache_table_schema(table).await?; + + info!( + "Using schema for table {}.{}: {} columns (table_id={})", + table.source_schema, + table.source_table, + table_schema.columns.len(), + table_schema.id + ); + + // Decode SelectResponse from response data + let select_response = SelectResponse::decode(&response.data[..]).map_err(|e| { + CollectionError::ParseError(format!( + "Failed to decode SelectResponse for table {}.{}: {}", + table.source_schema, table.source_table, e + )) + })?; + + // Validate SelectResponse + self.validate_select_response(&select_response, table)?; + + // Process chunks with schema-aware parsing + let all_rows = self + .process_response_chunks(&select_response, table, &table_schema) + .await?; + + info!( + "Successfully parsed {} rows from coprocessor response for table {}.{}", + all_rows.len(), + table.source_schema, + table.source_table + ); + + Ok(all_rows) + } + + /// Get table schema from cache or fetch and cache it + async fn get_or_cache_table_schema( + &self, + table: &TableConfig, + ) -> Result { + let cache_key = format!("{}.{}", table.source_schema, table.source_table); + + // Try to get from cache first + { + let schemas = self.cached_schemas.lock().unwrap(); + if let Some(cached_schema) = schemas.get(&cache_key) { + debug!("Using cached schema for table {}", cache_key); + return Ok(cached_schema.clone()); + } + } + + // Cache miss - fetch schema + info!("Fetching schema for table {} (cache miss)", cache_key); + let schema = self.get_table_schema_via_http(table).await?; + + // Validate schema before caching + if schema.columns.is_empty() { + return Err(CollectionError::ConfigurationError(format!( + "Retrieved schema for table {} has no columns", + cache_key + ))); + } + + // Cache the fetched schema + { + let mut schemas = self.cached_schemas.lock().unwrap(); + schemas.insert(cache_key.clone(), schema.clone()); + info!( + "Cached schema for table {} ({} columns)", + cache_key, + schema.columns.len() + ); + } + + Ok(schema) + } + + /// Validate SelectResponse for errors and warnings + fn validate_select_response( + &self, + select_response: &SelectResponse, + table: &TableConfig, + ) -> Result<(), CollectionError> { + info!( + "SelectResponse for table {}.{}: chunks={}, warnings={}, has_error={}, encode_type={:?}", + table.source_schema, table.source_table, + select_response.chunks.len(), + select_response.warnings.len(), + select_response.error.is_some(), + select_response.encode_type + ); + + // Check for execution errors + if let Some(error) = &select_response.error { + return Err(CollectionError::QueryError(format!( + "TiDB execution error for table {}.{} [{}]: {}", + table.source_schema, table.source_table, error.code, error.msg + ))); + } + + // Log 
warnings but don't fail + for (i, warning) in select_response.warnings.iter().enumerate() { + warn!( + "TiDB warning {} for table {}.{} [{}]: {}", + i + 1, + table.source_schema, + table.source_table, + warning.code, + warning.msg + ); + } + + Ok(()) + } + + /// Process all chunks in the SelectResponse with schema-aware parsing + async fn process_response_chunks( + &self, + select_response: &SelectResponse, + table: &TableConfig, + table_schema: &TableSchema, + ) -> Result>, CollectionError> { + let mut all_rows = Vec::new(); + let chunk_count = select_response.chunks.len(); + + if chunk_count == 0 { + info!( + "No chunks in SelectResponse for table {}.{}", + table.source_schema, table.source_table + ); + return Ok(all_rows); + } + + for (chunk_idx, chunk) in select_response.chunks.iter().enumerate() { + info!( + "Processing chunk {}/{} for table {}.{}: rows_data_size={} bytes", + chunk_idx + 1, + chunk_count, + table.source_schema, + table.source_table, + chunk.rows_data.len() + ); + + if chunk.rows_data.is_empty() { + debug!( + "Skipping empty chunk {} for table {}.{}", + chunk_idx, table.source_schema, table.source_table + ); + continue; + } + + // Parse chunk data with schema context and encode type awareness + let chunk_rows = self.parse_data_with_schema_and_encode_type( + &chunk.rows_data, + table, + table_schema, + select_response.encode_type, + )?; + + info!( + "Parsed {} rows from chunk {}/{} for table {}.{}", + chunk_rows.len(), + chunk_idx + 1, + chunk_count, + table.source_schema, + table.source_table + ); + + all_rows.extend(chunk_rows); + } + + Ok(all_rows) + } + + /// Parse data with schema context and encode type awareness (main parsing dispatcher) + fn parse_data_with_schema_and_encode_type( + &self, + data: &[u8], + table: &TableConfig, + table_schema: &TableSchema, + encode_type: i32, + ) -> Result>, CollectionError> { + debug!( + "Parsing data for table {}.{}: {} bytes, {} columns, encode_type={}", + table.source_schema, + table.source_table, + data.len(), + table_schema.columns.len(), + encode_type + ); + + // Choose parsing strategy based on encode_type + match encode_type { + 0 => { + // TypeDefault - use row format parsing (most common case) + info!( + "Using row format parsing (encode_type=TypeDefault) for table {}.{}", + table.source_schema, table.source_table + ); + self.parse_row_format_with_schema(data, table_schema, &table.source_table) + } + 1 => { + // TypeChunk - chunk format parsing (currently not fully implemented) + info!( + "TypeChunk detected for table {}.{}, falling back to row format parsing", + table.source_schema, table.source_table + ); + warn!("Chunk format parsing is not fully implemented yet, using row format as fallback"); + self.parse_row_format_with_schema(data, table_schema, &table.source_table) + } + _ => { + warn!( + "Unknown encode_type {} for table {}.{}, defaulting to row format", + encode_type, table.source_schema, table.source_table + ); + self.parse_row_format_with_schema(data, table_schema, &table.source_table) + } + } + } + + /// Parse row format data with explicit schema (optimized version) + fn parse_row_format_with_schema( + &self, + data: &[u8], + table_schema: &TableSchema, + table_name: &str, + ) -> Result>, CollectionError> { + debug!( + "Parsing row format with provided schema: {} bytes, {} columns", + data.len(), + table_schema.columns.len() + ); + + let mut rows = Vec::new(); + let mut offset = 0; + let mut row_index = 0; + + // Debug: Show first 32 bytes of raw data for comparison with Go + let preview_len = 
std::cmp::min(32, data.len()); + let hex_preview: String = data[..preview_len] + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "); + info!( + "RUST: First {} bytes of raw data: {}", + preview_len, hex_preview + ); + + // Parse all available rows + while offset < data.len() { + info!( + "RUST: Starting row {} at offset {} (remaining bytes: {})", + row_index, + offset, + data.len() - offset + ); + + let mut row = HashMap::new(); + let mut row_decoded = false; + + // Add default INSTANCE value + row.insert( + "INSTANCE".to_string(), + Value::String(self.config.instance.clone()), + ); + + // For each row, decode ALL columns in schema order (matching Go exactly) + for (col_idx, table_col) in table_schema.columns.iter().enumerate() { + if offset >= data.len() { + info!( + "Reached end of data at column {} for row {}", + col_idx, row_index + ); + break; + } + + let column_name = self.get_column_name(table_col, col_idx); + + match self.decode_value_from_bytes_with_type(data, offset, table_col.tp) { + Ok((value, new_offset)) => { + // Only log first few columns and rows to avoid spam + if row_index < 3 && col_idx < 30 { + info!( + "Row {} Column {} ({}): value={:?}, offset {}->{}", + row_index, col_idx, column_name, value, offset, new_offset + ); + } + + // Special debug for request unit columns to verify float decoding + if column_name.contains("REQUEST_UNIT") && row_index < 3 { + info!( + "DEBUG REQUEST_UNIT: Row {} Col {} Name {} Type {} Raw value={:?}", + row_index, col_idx, column_name, table_col.tp, value + ); + } + offset = new_offset; + row_decoded = true; + + // Apply data type and column-based processing + let final_value = + self.process_column_value(&column_name, &value, table_col); + + // Store the column value + row.insert(column_name, final_value); + } + Err(decode_err) => { + // Like Go: if we can't decode this column, break the column loop for this row + // But continue processing this row with the columns we did decode + if row_index < 3 || col_idx < 30 { + info!( + "Failed to decode column {} (index {}) at offset {} for row {}: {}", + column_name, col_idx, offset, row_index, decode_err + ); + } + break; // Break column loop, but continue with this row + } + } + } + + if !row_decoded { + info!( + "No columns decoded for row {}, stopping row processing", + row_index + ); + break; + } + + // Log summary for first few rows (only for CLUSTER_STATEMENTS_SUMMARY table) + if row_index < 5 && table_name == "CLUSTER_STATEMENTS_SUMMARY" { + let digest = row.get("DIGEST").unwrap_or(&Value::Null); + let exec_count = row.get("EXEC_COUNT").unwrap_or(&Value::Null); + let digest_text_len = row + .get("DIGEST_TEXT") + .and_then(|v| { + if let Value::String(s) = v { + Some(s.len()) + } else { + None + } + }) + .unwrap_or(0); + debug!("Row {} summary: DIGEST={:?}, EXEC_COUNT={:?}, DIGEST_TEXT_len={:?}, final_offset={}", + row_index, digest, exec_count, digest_text_len, offset); + } + + rows.push(row); + row_index += 1; + } + + info!( + "RUST: parse_row_format completed: decoded {} rows, final offset {}/{}", + rows.len(), + offset, + data.len() + ); + + Ok(rows) + } + + /// Decode value from bytes using TiDB codec (EXACTLY matching Go decodeValueFromBytes) + fn decode_value_from_bytes( + &self, + data: &[u8], + offset: usize, + ) -> Result<(Value, usize), CollectionError> { + if offset >= data.len() { + return Err(CollectionError::ParseError("Insufficient data".to_string())); + } + + let flag = data[offset]; + let mut new_offset = offset + 1; + + let value = match flag { + 0x00 
=> Value::Null, // NilFlag + 0x01 => { + // bytesFlag + let (bytes, consumed_offset) = self.decode_bytes(data, new_offset)?; + new_offset = consumed_offset; + Value::String(String::from_utf8_lossy(&bytes).to_string()) + } + 0x02 => { + // compactBytesFlag + let (bytes, consumed_offset) = self.decode_compact_bytes(data, new_offset)?; + new_offset = consumed_offset; + Value::String(String::from_utf8_lossy(&bytes).to_string()) + } + 0x03 => { + // intFlag + let (int_val, consumed_offset) = self.decode_int(data, new_offset)?; + new_offset = consumed_offset; + Value::Number(int_val.into()) + } + 0x04 => { + // uintFlag + let (uint_val, consumed_offset) = self.decode_uint(data, new_offset)?; + new_offset = consumed_offset; + Value::Number(uint_val.into()) + } + 0x05 => { + // floatFlag + let (float_val, consumed_offset) = self.decode_float(data, new_offset)?; + new_offset = consumed_offset; + Value::Number( + serde_json::Number::from_f64(float_val).unwrap_or(serde_json::Number::from(0)), + ) + } + 0x06 => { + // decimalFlag + let (decimal_str, consumed_offset) = self.decode_decimal(data, new_offset)?; + new_offset = consumed_offset; + Value::String(decimal_str) + } + 0x07 => { + // durationFlag + let (duration_str, consumed_offset) = self.decode_duration(data, new_offset)?; + new_offset = consumed_offset; + Value::String(duration_str) + } + 0x08 => { + // varintFlag + let (varint_val, consumed_offset) = self.decode_varint(data, new_offset)?; + new_offset = consumed_offset; + Value::Number(varint_val.into()) + } + 0x09 => { + // uvarintFlag + let (uvarint_val, consumed_offset) = self.decode_uvarint(data, new_offset)?; + new_offset = consumed_offset; + Value::Number(uvarint_val.into()) + } + 0x0A => { + // jsonFlag + let (json_str, consumed_offset) = self.decode_json(data, new_offset)?; + new_offset = consumed_offset; + Value::String(json_str) + } + 0x14 => { + // vectorFloat32Flag + let (vector_str, consumed_offset) = self.decode_vector_float32(data, new_offset)?; + new_offset = consumed_offset; + Value::String(vector_str) + } + 0xFA => { + // maxFlag + Value::String("MAX_VALUE".to_string()) + } + 0x20..=0x30 => { + // Time types + let (time_val, consumed_offset) = self.decode_time_value(data, new_offset, flag)?; + new_offset = consumed_offset; + Value::String(time_val) + } + _ => { + // For unknown flags, try to skip 1 byte and return NULL + // This allows decoding to continue despite unknown flags + info!("Encountered unknown encoding flag: 0x{:02x} at offset {}, treating as NULL and skipping 1 byte", flag, offset); + Value::Null + } + }; + + Ok((value, new_offset)) + } + + /// Decode value from bytes with awareness of MySQL column type, enabling packed time decode at byte stage + fn decode_value_from_bytes_with_type( + &self, + data: &[u8], + offset: usize, + mysql_tp: i32, + ) -> Result<(Value, usize), CollectionError> { + if offset >= data.len() { + return Err(CollectionError::ParseError("Insufficient data".to_string())); + } + let flag = data[offset]; + let mut new_offset = offset + 1; + + // If TIMESTAMP/DATETIME and encoded as uvarint, decode as TiDB packed time here + if mysql_tp == TYPE_TIMESTAMP || mysql_tp == TYPE_DATETIME { + // uvarintFlag + if flag == FLAG_UVARINT { + let (u, consumed_offset) = self.decode_uvarint(data, new_offset)?; + new_offset = consumed_offset; + // For TIMESTAMP, try to decode as microseconds for direct TIMESTAMP support + if mysql_tp == TYPE_TIMESTAMP { + if let Some(microseconds) = self.decode_packed_time_to_microseconds(u) { + return 
Ok((Value::Number(microseconds.into()), new_offset)); + } + } + // Fallback to string format for DATETIME or invalid TIMESTAMP + let s = self.decode_packed_time_to_string(u); + return Ok((Value::String(s), new_offset)); + } + // uintFlag (0x04): next 8 bytes unsigned, big-endian + if flag == FLAG_UINT { + if new_offset + 8 > data.len() { + return Err(CollectionError::ParseError( + "Insufficient bytes for uintFlag time".to_string(), + )); + } + let mut buf = [0u8; 8]; + buf.copy_from_slice(&data[new_offset..new_offset + 8]); + let u = u64::from_be_bytes(buf); + new_offset += 8; + // For TIMESTAMP, try to decode as microseconds for direct TIMESTAMP support + if mysql_tp == TYPE_TIMESTAMP { + if let Some(microseconds) = self.decode_packed_time_to_microseconds(u) { + return Ok((Value::Number(microseconds.into()), new_offset)); + } + } + // Fallback to string format for DATETIME or invalid TIMESTAMP + let s = self.decode_packed_time_to_string(u); + return Ok((Value::String(s), new_offset)); + } + // compactBytesFlag: inner buffer holds encoded time (usually uvarint/uint packed time) + if flag == FLAG_COMPACT_BYTES { + // compact bytes + let (inner, consumed_offset) = self.decode_compact_bytes(data, new_offset)?; + // decode inner by reading its flag + if !inner.is_empty() { + let inner_flag = inner[0]; + let inner_off = 1usize; + if inner_flag == FLAG_UVARINT { + // uvarint + let (u, _) = self.decode_uvarint(&inner, inner_off)?; + // For TIMESTAMP, try to decode as microseconds for direct TIMESTAMP support + if mysql_tp == TYPE_TIMESTAMP { + if let Some(microseconds) = self.decode_packed_time_to_microseconds(u) { + return Ok((Value::Number(microseconds.into()), consumed_offset)); + } + } + // Fallback to string format for DATETIME or invalid TIMESTAMP + let s = self.decode_packed_time_to_string(u); + return Ok((Value::String(s), consumed_offset)); + } else if inner_flag == FLAG_UINT { + // uintFlag 8-byte + // ensure enough bytes + if inner.len() >= inner_off + 8 { + let mut buf = [0u8; 8]; + buf.copy_from_slice(&inner[inner_off..inner_off + 8]); + // TiDB DecodeUint uses big-endian + let u = u64::from_be_bytes(buf); + // For TIMESTAMP, try to decode as microseconds for direct TIMESTAMP support + if mysql_tp == TYPE_TIMESTAMP { + if let Some(microseconds) = + self.decode_packed_time_to_microseconds(u) + { + return Ok(( + Value::Number(microseconds.into()), + consumed_offset, + )); + } + } + // Fallback to string format for DATETIME or invalid TIMESTAMP + let s = self.decode_packed_time_to_string(u); + return Ok((Value::String(s), consumed_offset)); + } + } + // Fallback: return hex for debugging + let hex = inner + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "); + info!( + "TIME COMPACT inner unhandled flag=0x{:02x} bytes=[{}]", + inner_flag, hex + ); + } + return Ok((Value::Null, consumed_offset)); + } + } + // Fallback to generic decoder + self.decode_value_from_bytes(data, offset) + } + + /// Decode decimal value (matching Go decodeDecimal) + fn decode_decimal( + &self, + data: &[u8], + offset: usize, + ) -> Result<(String, usize), CollectionError> { + // For now, decode as bytes and convert to string + let (bytes, new_offset) = self.decode_bytes(data, offset)?; + Ok((String::from_utf8_lossy(&bytes).to_string(), new_offset)) + } + + /// Decode duration value (matching Go decodeDuration) + fn decode_duration( + &self, + data: &[u8], + offset: usize, + ) -> Result<(String, usize), CollectionError> { + // For now, decode as bytes and convert to string + let (bytes, 
new_offset) = self.decode_bytes(data, offset)?; + Ok((String::from_utf8_lossy(&bytes).to_string(), new_offset)) + } + + /// Decode JSON value (matching Go decodeJSON) + fn decode_json(&self, data: &[u8], offset: usize) -> Result<(String, usize), CollectionError> { + // For now, decode as bytes and convert to string + let (bytes, new_offset) = self.decode_bytes(data, offset)?; + Ok((String::from_utf8_lossy(&bytes).to_string(), new_offset)) + } + + /// Decode vector float32 value (matching Go decodeVectorFloat32) + fn decode_vector_float32( + &self, + data: &[u8], + offset: usize, + ) -> Result<(String, usize), CollectionError> { + // For now, decode as bytes and convert to string + let (bytes, new_offset) = self.decode_bytes(data, offset)?; + Ok((String::from_utf8_lossy(&bytes).to_string(), new_offset)) + } + + /// Helper function to safely convert Value to i64 + fn safe_int64_value(&self, value: &Value) -> Option { + match value { + Value::Number(n) => n.as_i64(), + Value::String(s) => s.parse::().ok(), + _ => None, + } + } + + /// Decode bytes with length prefix + fn decode_bytes( + &self, + data: &[u8], + offset: usize, + ) -> Result<(Vec, usize), CollectionError> { + if offset >= data.len() { + return Err(CollectionError::ParseError( + "Cannot decode bytes: insufficient data".to_string(), + )); + } + + // Read length as varint + let (length, length_consumed) = self.decode_varint_length(data, offset)?; + let new_offset = offset + length_consumed; + + if new_offset + length > data.len() { + return Err(CollectionError::ParseError(format!( + "Cannot decode bytes: need {} bytes, have {}", + length, + data.len() - new_offset + ))); + } + + let bytes = data[new_offset..new_offset + length].to_vec(); + Ok((bytes, new_offset + length)) + } + + /// Decode compact bytes (EXACTLY matching Go's decodeCompactBytes using binary.Varint) + fn decode_compact_bytes( + &self, + data: &[u8], + offset: usize, + ) -> Result<(Vec, usize), CollectionError> { + if offset >= data.len() { + return Err(CollectionError::ParseError( + "insufficient data, cannot decode compact byte array".to_string(), + )); + } + + // Read unsigned varint first (like Go's Uvarint) + let mut ux = 0u64; + let mut bytes_consumed = 0; + let mut shift = 0; + + for i in 0..10 { + // Max 10 bytes for varint + if offset + i >= data.len() { + return Err(CollectionError::ParseError( + "cannot decode compact byte array length".to_string(), + )); + } + + let b = data[offset + i]; + bytes_consumed += 1; + + if b < 0x80 { + // Last byte + if i == 9 && b > 1 { + return Err(CollectionError::ParseError( + "varint overflows a 64-bit integer".to_string(), + )); + } + ux |= (b as u64) << shift; + break; + } + ux |= ((b & 0x7F) as u64) << shift; + shift += 7; + } + + // Apply zigzag decoding exactly like Go's binary.Varint + let mut length = (ux >> 1) as i64; + if (ux & 1) != 0 { + length = !length; // ^x in Go + } + + if length < 0 { + return Err(CollectionError::ParseError( + "negative length in compact bytes".to_string(), + )); + } + + let length = length as usize; + let new_offset = offset + bytes_consumed; + + // Debug: Log the length and actual data for analysis + if offset < 50 { + let preview_len = std::cmp::min(length, 32); + if new_offset + preview_len <= data.len() { + let data_preview: String = data[new_offset..new_offset + preview_len] + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "); + info!("RUST decode_compact_bytes: offset={}, length={}, bytes_consumed={}, data_preview=[{}]", + offset, length, bytes_consumed, 
data_preview); + } + } + + if new_offset + length > data.len() { + return Err(CollectionError::ParseError( + "insufficient data, cannot decode compact byte array data".to_string(), + )); + } + + let bytes = data[new_offset..new_offset + length].to_vec(); + let final_offset = new_offset + length; + + Ok((bytes, final_offset)) + } + + /// Decode varint length + fn decode_varint_length( + &self, + data: &[u8], + offset: usize, + ) -> Result<(usize, usize), CollectionError> { + let mut result = 0; + let mut shift = 0; + let mut consumed = 0; + + for i in offset..data.len() { + let byte = data[i]; + consumed += 1; + + if (byte & 0x80) == 0 { + // Last byte + result |= (byte as usize) << shift; + break; + } else { + // More bytes to come + result |= ((byte & 0x7F) as usize) << shift; + shift += 7; + if shift >= 64 { + return Err(CollectionError::ParseError("Varint too long".to_string())); + } + } + } + + Ok((result, offset + consumed)) + } + + /// Decode int64 + fn decode_int(&self, data: &[u8], offset: usize) -> Result<(i64, usize), CollectionError> { + if offset + 8 > data.len() { + return Err(CollectionError::ParseError( + "Cannot decode int: insufficient data".to_string(), + )); + } + + let bytes = &data[offset..offset + 8]; + let value = i64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ]); + + Ok((value, offset + 8)) + } + + /// Decode uint64 + fn decode_uint(&self, data: &[u8], offset: usize) -> Result<(u64, usize), CollectionError> { + if offset + 8 > data.len() { + return Err(CollectionError::ParseError( + "Cannot decode uint: insufficient data".to_string(), + )); + } + + let bytes = &data[offset..offset + 8]; + let value = u64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ]); + + Ok((value, offset + 8)) + } + + /// Decode float64 + fn decode_float(&self, data: &[u8], offset: usize) -> Result<(f64, usize), CollectionError> { + if offset + 8 > data.len() { + return Err(CollectionError::ParseError( + "Cannot decode float: insufficient data".to_string(), + )); + } + + let bytes = &data[offset..offset + 8]; + // TiDB uses big-endian encoding for floats (matching DecodeUint -> binary.BigEndian.Uint64) + let u = u64::from_be_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ]); + + // TiDB's decodeCmpUintToFloat logic: + // 1. DecodeUint returns the encoded uint64 + // 2. 
decodeCmpUintToFloat converts it back to float64 + const SIGN_MASK: u64 = 0x8000000000000000; + let bits = if u & SIGN_MASK > 0 { + u & !SIGN_MASK + } else { + !u + }; + + let value = f64::from_bits(bits); + Ok((value, offset + 8)) + } + + /// Decode varint + fn decode_varint(&self, data: &[u8], offset: usize) -> Result<(i64, usize), CollectionError> { + let (unsigned, consumed) = self.decode_uvarint(data, offset)?; + let signed = (unsigned >> 1) as i64 ^ -((unsigned & 1) as i64); + Ok((signed, consumed)) + } + + /// Decode uvarint + fn decode_uvarint(&self, data: &[u8], offset: usize) -> Result<(u64, usize), CollectionError> { + let mut result = 0u64; + let mut shift = 0; + let mut consumed = 0; + + for i in offset..data.len() { + let byte = data[i]; + consumed += 1; + + if (byte & 0x80) == 0 { + // Last byte + result |= (byte as u64) << shift; + break; + } else { + // More bytes to come + result |= ((byte & 0x7F) as u64) << shift; + shift += 7; + if shift >= 64 { + return Err(CollectionError::ParseError("Uvarint too long".to_string())); + } + } + } + + Ok((result, offset + consumed)) + } + + /// Decode time value + fn decode_time_value( + &self, + data: &[u8], + offset: usize, + _flag: u8, + ) -> Result<(String, usize), CollectionError> { + // For now, just read 8 bytes and convert to timestamp string + if offset + 8 > data.len() { + return Err(CollectionError::ParseError( + "Cannot decode time: insufficient data".to_string(), + )); + } + + let bytes = &data[offset..offset + 8]; + let timestamp = u64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ]); + + // Convert to readable timestamp (this is a simplified conversion) + let time_str = format!("timestamp_{}", timestamp); + Ok((time_str, offset + 8)) + } + + /// Get column name from schema or generate one + fn get_column_name(&self, table_col: &TableColumn, col_idx: usize) -> String { + table_col + .name + .clone() + .unwrap_or_else(|| format!("col_{}", col_idx)) + } + + /// Fallback to HTTP API collection for tables that don't support coprocessor + async fn fallback_to_http_collection( + &self, + table: &TableConfig, + ) -> Result>, CollectionError> { + warn!( + "Falling back to HTTP API collection for table: {}", + table.source_table + ); + + // This could be implemented to use TiDB's HTTP API endpoints + // For now, return empty result + Ok(Vec::new()) + } + + /// Process column value based on data type and column semantics + /// This method provides flexible column value processing that adapts to different tables + fn process_column_value( + &self, + column_name: &str, + value: &Value, + table_col: &TableColumn, + ) -> Value { + // Handle special INSTANCE column for all system tables + if column_name == "INSTANCE" { + // Always use our configured instance name for consistency + return Value::String(self.config.instance.clone()); + } + + // Process values based on MySQL data types and column semantics + match table_col.tp { + // Integer types (BIGINT, INT, etc.) 
- ensure proper number conversion + TYPE_LONGLONG | TYPE_LONG | TYPE_TINY | TYPE_SHORT | TYPE_INT24 => { + // For numeric columns that might come as strings, convert to numbers + self.ensure_numeric_value(value) + } + // Float/Double types + TYPE_FLOAT | TYPE_DOUBLE => { + // Handle float values following TiDB codec standards + if let Value::Number(n) = value { + if let Some(f) = n.as_f64() { + // Only convert truly invalid floating point values + // TiDB codec supports all finite values including subnormal numbers + // Reference: TiDB TestFloatCodec includes math.SmallestNonzeroFloat64 + if f.is_nan() || f.is_infinite() { + info!( + "Converting invalid float (NaN/Inf) {} to 0.0 for column {}", + f, column_name + ); + return if let Some(zero_float) = serde_json::Number::from_f64(0.0) { + Value::Number(zero_float) + } else { + Value::Number(serde_json::Number::from(0)) + }; + } + // Note: All finite values including subnormal numbers (like 6.3e-322) are valid + // and should be preserved as-is according to TiDB codec implementation + } + } + self.ensure_float_value(value) + } + // Decimal types - treat as numeric values + TYPE_NEWDECIMAL => self.ensure_numeric_value(value), + // Date/Time types + // For TIMESTAMP: keep numeric microseconds if already decoded as number; otherwise fall back to string + TYPE_TIMESTAMP => match value { + Value::Number(_) => value.clone(), + _ => self.convert_packed_time_value(value), + }, + // For DATETIME: keep as string (no timezone semantics) + TYPE_DATETIME => self.convert_packed_time_value(value), + TYPE_DATE | TYPE_DURATION => value.clone(), + // String types (VARCHAR, TEXT, BLOB, etc.) - keep as-is + TYPE_VARCHAR | TYPE_STRING | TYPE_VAR_STRING | TYPE_BLOB | TYPE_TINY_BLOB + | TYPE_MEDIUM_BLOB | TYPE_LONG_BLOB => value.clone(), + // Enum and Set types - keep as-is (treated as strings) + TYPE_ENUM | TYPE_SET => value.clone(), + // Bit type - treat as numeric value + TYPE_BIT => self.ensure_numeric_value(value), + // All other types - keep as-is + _ => value.clone(), + } + } + + /// Decode TiDB packed time (per TiDB types.Time.FromPackedUint) and return formatted string + fn decode_packed_time_to_string(&self, packed: u64) -> String { + fn parse_fields(p: u64) -> (i32, i32, i32, i32, i32, i32) { + let ymdhms = p >> 24; + let ymd = ymdhms >> 17; + let day = (ymd & ((1u64 << 5) - 1)) as i32; + let ym = ymd >> 5; + let rem = (ym % 13) as i32; + let mut year = (ym / 13) as i32; + let mut month = rem; + if rem == 0 { + // TiDB packed uses base-13; remainder 0 means previous year December + month = 12; + year -= 1; + } + let hms = ymdhms & ((1u64 << 17) - 1); + let second = (hms & ((1u64 << 6) - 1)) as i32; + let minute = ((hms >> 6) & ((1u64 << 6) - 1)) as i32; + let hour = (hms >> 12) as i32; + (year, month, day, hour, minute, second) + } + + fn valid(y: i32, m: i32, d: i32, h: i32, mi: i32, s: i32) -> bool { + (0..=9999).contains(&y) + && (1..=12).contains(&m) + && (1..=31).contains(&d) + && (0..=23).contains(&h) + && (0..=59).contains(&mi) + && (0..=59).contains(&s) + } + + if packed == 0 { + return "0000-00-00 00:00:00".to_string(); + } + + // try native (little-endian constructed u64) + let (y, m, d, h, mi, s) = parse_fields(packed); + if valid(y, m, d, h, mi, s) { + // TODO: TIMESTAMP should convert UTC->session tz like TiDB; currently output as-is + return format!("{y:04}-{m:02}-{d:02} {h:02}:{mi:02}:{s:02}"); + } + // invalid packed yields zero-time string to match TiDB zero behavior + "0000-00-00 00:00:00".to_string() + } + + /// Decode TiDB packed 
time to microseconds since Unix epoch (for direct TIMESTAMP support) + fn decode_packed_time_to_microseconds(&self, packed: u64) -> Option { + fn parse_fields(p: u64) -> (i32, i32, i32, i32, i32, i32) { + let ymdhms = p >> 24; + let ymd = ymdhms >> 17; + let day = (ymd & ((1u64 << 5) - 1)) as i32; + let ym = ymd >> 5; + let rem = (ym % 13) as i32; + let mut year = (ym / 13) as i32; + let mut month = rem; + if rem == 0 { + // TiDB packed uses base-13; remainder 0 means previous year December + month = 12; + year -= 1; + } + let hms = ymdhms & ((1u64 << 17) - 1); + let second = (hms & ((1u64 << 6) - 1)) as i32; + let minute = ((hms >> 6) & ((1u64 << 6) - 1)) as i32; + let hour = (hms >> 12) as i32; + (year, month, day, hour, minute, second) + } + + fn valid(y: i32, m: i32, d: i32, h: i32, mi: i32, s: i32) -> bool { + (0..=9999).contains(&y) + && (1..=12).contains(&m) + && (1..=31).contains(&d) + && (0..=23).contains(&h) + && (0..=59).contains(&mi) + && (0..=59).contains(&s) + } + + if packed == 0 { + return None; // Zero time is not a valid timestamp + } + + let (y, m, d, h, mi, s) = parse_fields(packed); + if valid(y, m, d, h, mi, s) { + // Convert to chrono::NaiveDateTime and then to microseconds since Unix epoch + if let Some(date) = chrono::NaiveDate::from_ymd_opt(y, m as u32, d as u32) { + if let Some(naive_dt) = date.and_hms_opt(h as u32, mi as u32, s as u32) { + // Convert to UTC timestamp in microseconds + let timestamp_micros = naive_dt.and_utc().timestamp_micros(); + return Some(timestamp_micros); + } + } + } + None + } + + /// Convert JSON value that contains TiDB packed time into formatted string + fn convert_packed_time_value(&self, value: &Value) -> Value { + match value { + Value::Number(n) => { + if let Some(u) = n.as_u64() { + let s = self.decode_packed_time_to_string(u); + Value::String(s) + } else if let Some(i) = n.as_i64() { + if i >= 0 { + let s = self.decode_packed_time_to_string(i as u64); + Value::String(s) + } else { + // negative not expected; keep as string for visibility + Value::String(i.to_string()) + } + } else { + // fallback to string + Value::String(n.to_string()) + } + } + Value::String(s) => { + // try parse as integer packed time + if let Ok(u) = s.parse::() { + Value::String(self.decode_packed_time_to_string(u)) + } else if let Ok(i) = s.parse::() { + if i >= 0 { + Value::String(self.decode_packed_time_to_string(i as u64)) + } else { + Value::String(s.clone()) + } + } else { + Value::String(s.clone()) + } + } + _ => value.clone(), + } + } + + /// Ensure value is properly formatted as a number for numeric columns + fn ensure_numeric_value(&self, value: &Value) -> Value { + match self.safe_int64_value(value) { + Some(int_val) => Value::Number(int_val.into()), + None => value.clone(), // Keep original if conversion fails + } + } + + /// Convert MySQL type number to string representation + fn mysql_type_to_string(&self, mysql_type: i32) -> String { + match mysql_type { + TYPE_TINY => "tinyint".to_string(), + TYPE_SHORT => "smallint".to_string(), + TYPE_LONG => "int".to_string(), + TYPE_FLOAT => "float".to_string(), + TYPE_DOUBLE => "double".to_string(), + TYPE_TIMESTAMP => "timestamp".to_string(), + TYPE_LONGLONG => "bigint".to_string(), + TYPE_INT24 => "mediumint".to_string(), + TYPE_DATE => "date".to_string(), + TYPE_DURATION => "time".to_string(), + TYPE_DATETIME => "datetime".to_string(), + TYPE_VARCHAR => "varchar".to_string(), + TYPE_BIT => "bit".to_string(), + TYPE_NEWDECIMAL => "decimal".to_string(), + TYPE_ENUM => "enum".to_string(), + TYPE_SET => 
"set".to_string(), + TYPE_TINY_BLOB => "tinyblob".to_string(), + TYPE_MEDIUM_BLOB => "mediumblob".to_string(), + TYPE_LONG_BLOB => "longblob".to_string(), + TYPE_BLOB => "blob".to_string(), + TYPE_VAR_STRING => "varchar".to_string(), + TYPE_STRING => "char".to_string(), + _ => format!("unknown_type_{}", mysql_type), + } + } + + /// Ensure value is properly formatted as a float following TiDB codec standards + fn ensure_float_value(&self, value: &Value) -> Value { + match value { + Value::Number(n) => { + if n.is_f64() { + // Already a float, preserve as-is (including subnormal numbers like 6.3e-322) + // TiDB codec supports all finite values including subnormal numbers + value.clone() + } else if let Some(i) = n.as_i64() { + // Convert integer to float + if let Some(json_num) = serde_json::Number::from_f64(i as f64) { + Value::Number(json_num) + } else { + Value::Number(serde_json::Number::from(0)) + } + } else { + Value::Number(serde_json::Number::from(0)) + } + } + Value::String(s) => { + if let Ok(float_val) = s.parse::() { + // Only convert if parsing succeeded and result is finite + // TiDB codec supports all finite values including subnormal numbers + if float_val.is_finite() { + if let Some(json_num) = serde_json::Number::from_f64(float_val) { + Value::Number(json_num) + } else { + Value::Number(serde_json::Number::from(0)) + } + } else { + // Invalid float string (NaN/Inf), convert to 0 + Value::Number(serde_json::Number::from(0)) + } + } else { + Value::Number(serde_json::Number::from(0)) + } + } + _ => Value::Number(serde_json::Number::from(0)), + } + } +} + +/// Table schema information +#[derive(Debug, Clone)] +pub struct TableSchema { + pub id: i64, + pub columns: Vec, +} + +#[derive(Debug, Clone)] +pub struct TableColumn { + pub id: i64, + pub tp: i32, + pub name: Option, // Add column name for easier extraction +} + +/// MySQL type constants (matching Go implementation) +const TYPE_TINY: i32 = 1; +const TYPE_SHORT: i32 = 2; +const TYPE_LONG: i32 = 3; +const TYPE_FLOAT: i32 = 4; +const TYPE_DOUBLE: i32 = 5; +const TYPE_TIMESTAMP: i32 = 7; +const TYPE_LONGLONG: i32 = 8; +const TYPE_INT24: i32 = 9; +const TYPE_DATE: i32 = 10; +const TYPE_DURATION: i32 = 11; +const TYPE_DATETIME: i32 = 12; +const TYPE_VARCHAR: i32 = 15; +const TYPE_BIT: i32 = 16; +const TYPE_NEWDECIMAL: i32 = 246; +const TYPE_ENUM: i32 = 247; +const TYPE_SET: i32 = 248; +const TYPE_TINY_BLOB: i32 = 249; +const TYPE_MEDIUM_BLOB: i32 = 250; +const TYPE_LONG_BLOB: i32 = 251; +const TYPE_BLOB: i32 = 252; +const TYPE_VAR_STRING: i32 = 253; +const TYPE_STRING: i32 = 254; + +// TiDB row/codec flag constants (aligned with pkg/util/codec/codec.go) +#[allow(dead_code)] +const FLAG_NIL: u8 = 0x00; // NilFlag +#[allow(dead_code)] +const FLAG_BYTES: u8 = 0x01; // bytesFlag +const FLAG_COMPACT_BYTES: u8 = 0x02; // compactBytesFlag +#[allow(dead_code)] +const FLAG_INT: u8 = 0x03; // intFlag +const FLAG_UINT: u8 = 0x04; // uintFlag +#[allow(dead_code)] +const FLAG_FLOAT: u8 = 0x05; // floatFlag +#[allow(dead_code)] +const FLAG_DECIMAL: u8 = 0x06; // decimalFlag +#[allow(dead_code)] +const FLAG_DURATION: u8 = 0x07; // durationFlag +#[allow(dead_code)] +const FLAG_VARINT: u8 = 0x08; // varintFlag +const FLAG_UVARINT: u8 = 0x09; // uvarintFlag + +#[async_trait] +impl DataCollector for CoprocessorCollector { + fn collection_method(&self) -> CollectionMethod { + CollectionMethod::Coprocessor + } + + fn can_collect_table(&self, table: &TableConfig) -> bool { + // Coprocessor method works best with CLUSTER_ tables + 
table.source_table.starts_with("CLUSTER_") + || table.source_table.contains("STATEMENTS_SUMMARY") + || table.source_table.contains("SLOW_QUERY") + } + + async fn initialize(&mut self) -> Result<(), CollectionError> { + info!( + "Initializing coprocessor collector for instance: {}", + self.config.instance + ); + + let channel = self.create_grpc_connection().await?; + self.client_channel = Some(channel); + + info!("Coprocessor collector initialized successfully"); + Ok(()) + } + + async fn collect_table_data( + &self, + table: &TableConfig, + ) -> Result { + let start_time = Instant::now(); + let timestamp = chrono::Utc::now(); + + let _channel = self.client_channel.as_ref().ok_or_else(|| { + CollectionError::ConfigurationError("gRPC channel not initialized".to_string()) + })?; + + // Try to get table schema + let table_schema = match self.get_table_schema_via_http(table).await { + Ok(schema) => schema, + Err(e) => { + warn!( + "Failed to get schema for table {}: {}. Using fallback.", + table.source_table, e + ); + // Create a basic schema for fallback + TableSchema { + id: 0, + columns: Vec::new(), + } + } + }; + + // Try coprocessor collection, fallback to HTTP if needed + let data = match self.build_coprocessor_request(&table_schema) { + Ok(request) => { + // Perform actual coprocessor collection via gRPC + self.perform_coprocessor_collection(&request, table).await? + } + Err(e) => { + warn!( + "Failed to build coprocessor request: {}. Using fallback.", + e + ); + self.fallback_to_http_collection(table).await? + } + }; + + let duration = start_time.elapsed(); + let row_count = data.len(); + + // Create metadata + let mut extra = HashMap::new(); + extra.insert( + "schema_columns".to_string(), + Value::Number(table_schema.columns.len().into()), + ); + extra.insert( + "grpc_endpoint".to_string(), + Value::String(self.grpc_endpoint.clone()), + ); + extra.insert("fallback_used".to_string(), Value::Bool(false)); // Now using actual gRPC + + // Add schema metadata for DeltaLake writer + let mut schema_metadata = serde_json::Map::new(); + for col in &table_schema.columns { + if let Some(name) = &col.name { + let mysql_type_str = self.mysql_type_to_string(col.tp); + let mut obj = serde_json::Map::new(); + obj.insert("mysql_type".to_string(), Value::String(mysql_type_str)); + schema_metadata.insert(name.clone(), Value::Object(obj)); + } + } + extra.insert( + "schema_metadata".to_string(), + Value::Object(schema_metadata), + ); + + let metadata = CollectionMetadata { + instance: self.config.instance.clone(), + table_config: table.clone(), + collection_method: CollectionMethod::Coprocessor, + timestamp, + row_count, + duration_ms: duration.as_millis() as u64, + extra, + }; + + info!( + "Coprocessor collection completed for table {}: {} rows in {}ms", + table.source_table, + row_count, + duration.as_millis() + ); + + Ok(CollectionResult { data, metadata }) + } + + async fn health_check(&self) -> Result<(), CollectionError> { + if self.client_channel.is_none() { + return Err(CollectionError::ConfigurationError( + "gRPC channel not initialized".to_string(), + )); + } + + // For a real health check, we could send a simple coprocessor request + // or check the gRPC connection status + Ok(()) + } +} diff --git a/src/sources/system_tables/collectors/mod.rs b/src/sources/system_tables/collectors/mod.rs new file mode 100644 index 0000000..e337065 --- /dev/null +++ b/src/sources/system_tables/collectors/mod.rs @@ -0,0 +1,5 @@ +pub mod coprocessor_collector; +pub mod sql_collector; + +pub use 
coprocessor_collector::CoprocessorCollector; +pub use sql_collector::SqlCollector; diff --git a/src/sources/system_tables/collectors/sql_collector.rs b/src/sources/system_tables/collectors/sql_collector.rs new file mode 100644 index 0000000..586b76c --- /dev/null +++ b/src/sources/system_tables/collectors/sql_collector.rs @@ -0,0 +1,440 @@ +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use async_trait::async_trait; +use serde_json::Value; +use sqlx::{Column, Row}; +use tracing::{debug, info, warn}; + +use crate::sources::system_tables::data_collector::{ + CollectionError, CollectionMetadata, CollectionMethod, CollectionResult, CollectorConfig, + CollectorConfigType, DataCollector, +}; +use crate::sources::system_tables::TableConfig; + +/// SQL-based data collector using MySQL protocol +pub struct SqlCollector { + config: CollectorConfig, + pool: Option, +} + +impl SqlCollector { + /// Create a new SQL collector + pub fn new(config: CollectorConfig) -> Result { + // Validate that we have SQL config + match &config.config_type { + CollectorConfigType::Sql { .. } => (), + _ => { + return Err(CollectionError::ConfigurationError( + "Invalid config type for SqlCollector".to_string(), + )) + } + } + + Ok(Self { config, pool: None }) + } + + /// Build MySQL connection pool + async fn create_connection_pool(&self) -> Result { + let database_config = match &self.config.config_type { + CollectorConfigType::Sql { database_config } => database_config, + _ => { + return Err(CollectionError::ConfigurationError( + "SQL collector requires SQL configuration".to_string(), + )) + } + }; + + let mut url = format!( + "mysql://{}:{}@{}:{}/{}", + database_config.username, + database_config.password, + database_config.host, + database_config.port, + database_config.database + ); + + // Add TLS parameters if database TLS is configured + if let Some(ref tls_config) = database_config.tls { + let mut tls_params = Vec::new(); + + // Set SSL mode based on verification settings + if tls_config.verify_certificate.unwrap_or(true) { + if tls_config.verify_hostname.unwrap_or(true) { + tls_params.push("ssl-mode=VERIFY_IDENTITY".to_string()); + } else { + tls_params.push("ssl-mode=VERIFY_CA".to_string()); + } + } else { + tls_params.push("ssl-mode=REQUIRED".to_string()); + } + + // Add CA certificate if provided + if let Some(ref ca_file) = tls_config.ca_file { + tls_params.push(format!("ssl-ca={}", ca_file.display())); + } + + // Add client certificate if provided + if let Some(ref crt_file) = tls_config.crt_file { + tls_params.push(format!("ssl-cert={}", crt_file.display())); + } + + // Add client key if provided + if let Some(ref key_file) = tls_config.key_file { + tls_params.push(format!("ssl-key={}", key_file.display())); + } + + if !tls_params.is_empty() { + url.push('?'); + url.push_str(&tls_params.join("&")); + } + + info!("Creating SQL connection pool with TLS enabled"); + } else { + info!("Creating SQL connection pool without TLS"); + } + + let pool = sqlx::mysql::MySqlPoolOptions::new() + .max_connections(database_config.max_connections.unwrap_or(10)) + .acquire_timeout(Duration::from_secs( + database_config.connect_timeout.unwrap_or(30), + )) + .connect(&url) + .await + .map_err(|e| { + CollectionError::ConnectionError(format!("Failed to create pool: {}", e)) + })?; + + Ok(pool) + } + + /// Get table schema information + async fn get_table_schema( + &self, + table: &TableConfig, + pool: &sqlx::mysql::MySqlPool, + ) -> Result, CollectionError> { + let schema_sql = format!( + "SHOW COLUMNS 
FROM {}.{}", + table.source_schema, table.source_table + ); + + debug!("Getting table schema: {}", schema_sql); + + let schema_rows = sqlx::query(&schema_sql) + .fetch_all(pool) + .await + .map_err(|e| CollectionError::QueryError(format!("Schema query failed: {}", e)))?; + + let mut column_types = HashMap::new(); + + for row in schema_rows { + let field_name: String = row.try_get("Field").map_err(|e| { + CollectionError::ParseError(format!("Failed to get field name: {}", e)) + })?; + let field_type: String = row.try_get("Type").map_err(|e| { + CollectionError::ParseError(format!("Failed to get field type: {}", e)) + })?; + let is_nullable: String = row.try_get("Null").map_err(|e| { + CollectionError::ParseError(format!("Failed to get nullable info: {}", e)) + })?; + + debug!( + "Column schema: {} -> {} (nullable: {})", + field_name, field_type, is_nullable + ); + column_types.insert(field_name, (field_type, is_nullable == "YES")); + } + + Ok(column_types) + } + + /// Query data from a TiDB table + async fn query_table_data( + &self, + table: &TableConfig, + pool: &sqlx::mysql::MySqlPool, + column_types: &HashMap, + ) -> Result>, CollectionError> { + // Build SQL query + let sql = if let Some(where_clause) = &table.where_clause { + format!( + "SELECT * FROM {}.{} WHERE {}", + table.source_schema, table.source_table, where_clause + ) + } else { + format!( + "SELECT * FROM {}.{}", + table.source_schema, table.source_table + ) + }; + + debug!("Executing query: {}", sql); + + // Execute query + let rows = sqlx::query(&sql) + .fetch_all(pool) + .await + .map_err(|e| CollectionError::QueryError(format!("Data query failed: {}", e)))?; + + debug!( + "Query returned {} rows for {}.{}", + rows.len(), + table.source_schema, + table.source_table + ); + + // Convert rows to HashMap format using schema information + let mut result = Vec::new(); + for row in rows.iter() { + let mut map = HashMap::new(); + + for (i, column) in row.columns().iter().enumerate() { + let column_name = column.name().to_string(); + + // Convert value based on MySQL schema + let value = self.convert_mysql_value(&row, i, &column_name, column_types)?; + map.insert(column_name, value); + } + result.push(map); + } + + Ok(result) + } + + /// Convert MySQL row value to JSON Value using schema information + fn convert_mysql_value( + &self, + row: &sqlx::mysql::MySqlRow, + column_index: usize, + column_name: &str, + column_types: &HashMap, + ) -> Result { + if let Some((mysql_type, _is_nullable)) = column_types.get(column_name) { + let mysql_type_lower = mysql_type.to_lowercase(); + + // Integer types + if mysql_type_lower.contains("int") || mysql_type_lower.contains("bigint") { + if mysql_type_lower.contains("unsigned") { + // Unsigned integer + match row.try_get::(column_index) { + Ok(v) => Ok(Value::Number((v as i64).into())), + Err(_) => self.try_parse_string_as_number(row, column_index), + } + } else { + // Signed integer + match row.try_get::(column_index) { + Ok(v) => Ok(Value::Number(v.into())), + Err(_) => self.try_parse_string_as_number(row, column_index), + } + } + } + // Float types + else if mysql_type_lower.contains("decimal") + || mysql_type_lower.contains("float") + || mysql_type_lower.contains("double") + || mysql_type_lower.contains("real") + { + match row.try_get::(column_index) { + Ok(v) => Ok(Value::Number( + serde_json::Number::from_f64(v) + .unwrap_or_else(|| serde_json::Number::from(0)), + )), + Err(_) => self.try_parse_string_as_number(row, column_index), + } + } + // Timestamp and datetime types + else if 
mysql_type_lower.contains("timestamp") + || mysql_type_lower.contains("datetime") + { + // Try NaiveDateTime first (proper type for MySQL TIMESTAMP) + match row.try_get::(column_index) { + Ok(dt) => { + let timestamp_str = dt.format("%Y-%m-%d %H:%M:%S").to_string(); + Ok(Value::String(timestamp_str)) + } + Err(_) => { + // Try as optional NaiveDateTime for nullable fields + match row.try_get::, _>(column_index) { + Ok(Some(dt)) => { + let timestamp_str = dt.format("%Y-%m-%d %H:%M:%S").to_string(); + Ok(Value::String(timestamp_str)) + } + Ok(None) => Ok(Value::Null), + Err(_) => { + // Try DateTime for UTC timestamps + match row.try_get::, _>(column_index) + { + Ok(dt) => { + let timestamp_str = + dt.format("%Y-%m-%d %H:%M:%S").to_string(); + Ok(Value::String(timestamp_str)) + } + Err(_) => { + // Final fallback: try as string + match row.try_get::(column_index) { + Ok(v) => Ok(Value::String(v)), + Err(_) => { + warn!("All timestamp retrieval methods failed for column '{}'", column_name); + Ok(Value::Null) + } + } + } + } + } + } + } + } + } + // String and other types + else { + self.try_get_as_string_first(row, column_index) + } + } else { + // Fallback if schema not found + self.try_simple_conversion(row, column_index) + } + } + + /// Try to parse string as number, fallback to string + fn try_parse_string_as_number( + &self, + row: &sqlx::mysql::MySqlRow, + column_index: usize, + ) -> Result { + match row.try_get::(column_index) { + Ok(s) => { + if let Ok(int_val) = s.parse::() { + Ok(Value::Number(int_val.into())) + } else if let Ok(uint_val) = s.parse::() { + Ok(Value::Number((uint_val as i64).into())) + } else if let Ok(float_val) = s.parse::() { + Ok(Value::Number( + serde_json::Number::from_f64(float_val) + .unwrap_or_else(|| serde_json::Number::from(0)), + )) + } else { + Ok(Value::String(s)) + } + } + Err(_) => Ok(Value::Null), + } + } + + /// Try to get as string first, with numeric fallback + fn try_get_as_string_first( + &self, + row: &sqlx::mysql::MySqlRow, + column_index: usize, + ) -> Result { + match row.try_get::(column_index) { + Ok(v) => Ok(Value::String(v)), + Err(_) => self.try_simple_conversion(row, column_index), + } + } + + /// Simple type conversion fallback + fn try_simple_conversion( + &self, + row: &sqlx::mysql::MySqlRow, + column_index: usize, + ) -> Result { + match row.try_get::(column_index) { + Ok(v) => Ok(Value::Number(v.into())), + Err(_) => match row.try_get::(column_index) { + Ok(v) => Ok(Value::Number( + serde_json::Number::from_f64(v).unwrap_or_else(|| serde_json::Number::from(0)), + )), + Err(_) => match row.try_get::(column_index) { + Ok(v) => Ok(Value::String(v)), + Err(_) => Ok(Value::Null), + }, + }, + } + } +} + +#[async_trait] +impl DataCollector for SqlCollector { + fn collection_method(&self) -> CollectionMethod { + CollectionMethod::Sql + } + + fn can_collect_table(&self, _table: &TableConfig) -> bool { + // SQL collector can handle any table + true + } + + async fn initialize(&mut self) -> Result<(), CollectionError> { + info!( + "Initializing SQL collector for instance: {}", + self.config.instance + ); + + let pool = self.create_connection_pool().await?; + self.pool = Some(pool); + + info!("SQL collector initialized successfully"); + Ok(()) + } + + async fn collect_table_data( + &self, + table: &TableConfig, + ) -> Result { + let start_time = Instant::now(); + let timestamp = chrono::Utc::now(); + + let pool = self.pool.as_ref().ok_or_else(|| { + CollectionError::ConfigurationError("Pool not initialized".to_string()) + })?; + + // Get 
table schema + let column_types = self.get_table_schema(table, pool).await?; + + // Query table data + let data = self.query_table_data(table, pool, &column_types).await?; + + let duration = start_time.elapsed(); + let row_count = data.len(); + + // Create metadata + let mut extra = HashMap::new(); + extra.insert( + "schema_columns".to_string(), + Value::Number(column_types.len().into()), + ); + + let metadata = CollectionMetadata { + instance: self.config.instance.clone(), + table_config: table.clone(), + collection_method: CollectionMethod::Sql, + timestamp, + row_count, + duration_ms: duration.as_millis() as u64, + extra, + }; + + info!( + "SQL collection completed for table {}: {} rows in {}ms", + table.source_table, + row_count, + duration.as_millis() + ); + + Ok(CollectionResult { data, metadata }) + } + + async fn health_check(&self) -> Result<(), CollectionError> { + if let Some(pool) = &self.pool { + sqlx::query("SELECT 1").fetch_one(pool).await.map_err(|e| { + CollectionError::ConnectionError(format!("Health check failed: {}", e)) + })?; + Ok(()) + } else { + Err(CollectionError::ConfigurationError( + "Pool not initialized".to_string(), + )) + } + } +} diff --git a/src/sources/system_tables/controller.rs b/src/sources/system_tables/controller.rs new file mode 100644 index 0000000..5b837b8 --- /dev/null +++ b/src/sources/system_tables/controller.rs @@ -0,0 +1,521 @@ +use std::collections::{HashMap, HashSet}; +use std::time::Duration; + +use tokio::time::interval; +use tracing::{debug, error, info, warn}; +use vector::shutdown::ShutdownSignal; +use vector::SourceSender; +use vector_lib::config::proxy::ProxyConfig; +use vector_lib::tls::TlsConfig; + +use crate::common::features::is_nextgen_mode; +use crate::common::topology::{Component, FetchError, InstanceType, TopologyFetcher}; +use crate::sources::system_tables::{CollectionConfig, DatabaseConfig, TableConfig}; + +use crate::sources::system_tables::collector_factory::CollectorFactory; +use crate::sources::system_tables::data_collector::{ + CollectionMethod, CollectorConfig, DataCollector, +}; + +/// Main controller using abstracted data collectors +#[allow(dead_code)] +pub struct Controller { + topology_fetch_interval: Duration, + topology_fetcher: TopologyFetcher, + tidb_components: HashSet, + running_collectors: HashMap, + database_config: DatabaseConfig, + collection_config: CollectionConfig, + tables: Vec, + collection_method: CollectionMethod, + proxy_config: ProxyConfig, + out: SourceSender, +} + +/// Task information for a running collector +struct CollectorTask { + handle: tokio::task::JoinHandle<()>, + collector_type: CollectionMethod, + table_count: usize, +} + +impl Controller { + /// Create a new controller with abstracted collectors + pub async fn new( + pd_address: Option, + tidb_group: Option, + label_k8s_instance: Option, + topology_fetch_interval: Duration, + database_config: DatabaseConfig, + collection_config: CollectionConfig, + tables: Vec, + pd_tls: Option, + proxy_config: &ProxyConfig, + out: SourceSender, + collection_method: String, + ) -> vector::Result { + // Parse collection method + let collection_method = CollectionMethod::from_string(&collection_method) + .map_err(|e| format!("Invalid collection method: {}", e))?; + + // Create topology fetcher + let topology_fetcher = if is_nextgen_mode() { + info!("Using nextgen mode for topology discovery"); + if tidb_group.is_none() && label_k8s_instance.is_none() { + return Err( + "In nextgen mode, either tidb_group or label_k8s_instance must be specified" + 
.into(), + ); + } + TopologyFetcher::new( + Some(String::new()), + None, + proxy_config, + tidb_group.clone(), + label_k8s_instance.clone(), + ) + .await + .map_err(|e| format!("Failed to create nextgen topology fetcher: {}", e))? + } else { + info!("Using legacy mode for topology discovery"); + let pd_addr = pd_address.ok_or("In legacy mode, pd_address must be specified")?; + + if let Some(ref tls_config) = pd_tls { + info!("Legacy mode using TLS configuration for PD/etcd connections"); + if tls_config.ca_file.is_some() { + info!(" CA file configured: {:?}", tls_config.ca_file); + } + if tls_config.crt_file.is_some() && tls_config.key_file.is_some() { + info!(" Client certificate and key configured"); + } + } else { + info!("Legacy mode using insecure connections to PD/etcd"); + } + + TopologyFetcher::new( + Some(pd_addr), + pd_tls.clone(), + proxy_config, + tidb_group.clone(), + label_k8s_instance.clone(), + ) + .await + .map_err(|e| format!("Failed to create legacy topology fetcher: {}", e))? + }; + + Ok(Self { + topology_fetch_interval, + topology_fetcher, + tidb_components: HashSet::new(), + running_collectors: HashMap::new(), + database_config, + collection_config, + tables, + collection_method, + proxy_config: proxy_config.clone(), + out, + }) + } + + /// Run the main controller loop + pub async fn run(mut self, mut shutdown: ShutdownSignal) { + info!("System Tables Controller starting..."); + + tokio::select! { + _ = self.run_loop() => {}, + _ = &mut shutdown => {}, + } + + info!("System Tables Controller shutting down..."); + self.shutdown_all_collectors().await; + } + + /// Main control loop + async fn run_loop(&mut self) { + let mut topology_interval = interval(self.topology_fetch_interval); + + loop { + topology_interval.tick().await; + + // Fetch TiDB instances and update collectors + if let Err(e) = self.fetch_and_update_tidb_instances().await { + error!("Failed to fetch TiDB instances: {}", e); + } + } + } + + /// Fetch TiDB instances and update collectors + async fn fetch_and_update_tidb_instances(&mut self) -> Result<(), FetchError> { + let mut new_components = HashSet::new(); + + // Fetch topology from PD/etcd or K8s + self.topology_fetcher + .get_up_components(&mut new_components) + .await?; + + // Filter only TiDB components + let tidb_components: HashSet = new_components + .into_iter() + .filter(|c| c.instance_type == InstanceType::TiDB) + .collect(); + + // Only log if there are changes in TiDB components + if tidb_components != self.tidb_components { + info!( + "TiDB topology changed: {} components discovered", + tidb_components.len() + ); + for component in &tidb_components { + info!( + " TiDB instance: {}:{}", + component.host, component.primary_port + ); + } + } else { + debug!( + "TiDB topology unchanged: {} components", + tidb_components.len() + ); + } + + // Update collectors based on component changes + self.update_collectors(tidb_components).await; + + Ok(()) + } + + /// Update collectors based on new TiDB components + async fn update_collectors(&mut self, new_components: HashSet) { + let tables = self.tables.clone(); + + // Separate tables into cluster-level and instance-level + let (cluster_tables, instance_tables): (Vec<_>, Vec<_>) = tables + .iter() + .partition(|table| table.source_table.starts_with("CLUSTER_")); + + debug!( + "Table classification: {} cluster tables, {} instance tables", + cluster_tables.len(), + instance_tables.len() + ); + + // For cluster-level tables, only start one collector on the primary instance + if !cluster_tables.is_empty() { 
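+            // Cluster-level tables only need to be read once per cluster, so a single
+            // collector is started and keyed with a "_cluster" suffix. The "primary"
+            // instance below is simply whichever component the HashSet yields first,
+            // and the contains_key check keeps the collector from being restarted on
+            // every topology tick.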
+ let primary_component = new_components.iter().next().cloned(); + if let Some(primary_component) = primary_component { + let cluster_collector_key = format!( + "{}:{}_cluster", + primary_component.host, primary_component.primary_port + ); + if !self.running_collectors.contains_key(&cluster_collector_key) { + let cluster_tables_owned: Vec = + cluster_tables.into_iter().cloned().collect(); + self.start_collector_with_tables( + &primary_component, + cluster_tables_owned, + &cluster_collector_key, + ) + .await; + } + } + } + + // For instance-level tables, start collectors on all instances + if !instance_tables.is_empty() { + for component in &new_components { + let instance_collector_key = + format!("{}:{}_instance", component.host, component.primary_port); + if !self + .running_collectors + .contains_key(&instance_collector_key) + { + let instance_tables_owned: Vec = + instance_tables.iter().map(|t| (*t).clone()).collect(); + self.start_collector_with_tables( + component, + instance_tables_owned, + &instance_collector_key, + ) + .await; + } + } + } + + // Stop collectors for removed instances + let current_component_keys: HashSet<_> = self + .tidb_components + .iter() + .map(|c| format!("{}:{}", c.host, c.primary_port)) + .collect(); + let new_component_keys: HashSet<_> = new_components + .iter() + .map(|c| format!("{}:{}", c.host, c.primary_port)) + .collect(); + + for removed_key in current_component_keys.difference(&new_component_keys) { + self.stop_collector_by_instance(removed_key).await; + } + + // Update the component set + self.tidb_components = new_components; + } + + /// Start a collector for a specific TiDB component using abstracted interface + async fn start_collector_with_tables( + &mut self, + component: &Component, + tables: Vec, + collector_key: &str, + ) { + // Validate table compatibility with collection method + if self.collection_method == CollectionMethod::Coprocessor { + for table in &tables { + if !table.source_table.starts_with("CLUSTER_") { + error!( + "Table {} is not a cluster table and cannot be collected using coprocessor method. 
Only CLUSTER_* tables are supported for coprocessor collection.", + table.source_table + ); + return; + } + } + } + + let table_names: Vec<&str> = tables.iter().map(|t| t.source_table.as_str()).collect(); + info!( + "Starting {} collector for {}:{} with {} tables: [{}]", + self.collection_method, + component.host, + component.primary_port, + tables.len(), + table_names.join(", ") + ); + + // Create collector config based on collection method + let instance = format!("{}:{}", component.host, component.primary_port); + let collector_config = match self.collection_method { + CollectionMethod::Coprocessor => { + // For coprocessor method, use coprocessor-specific config + // Pass database TLS config for HTTP schema fetching + CollectorConfig::for_coprocessor( + instance, + component.host.clone(), + component.primary_port, + Some(30), // grpc_timeout_secs + Some(3), // max_retries + self.database_config.tls.clone(), + ) + } + CollectionMethod::Sql => { + // For SQL method, use database config + let mut instance_db_config = self.database_config.clone(); + instance_db_config.host = component.host.clone(); + instance_db_config.port = component.primary_port; + + CollectorConfig::for_sql(instance, instance_db_config) + } + CollectionMethod::HttpApi => { + // For HTTP API method, use HTTP-specific config + CollectorConfig::for_http_api( + instance, + component.host.clone(), + component.primary_port, + Some(30), // timeout_secs + Some(3), // max_retries + ) + } + CollectionMethod::CustomGrpc => { + // For custom gRPC, fallback to coprocessor config for now + CollectorConfig::for_coprocessor( + instance, + component.host.clone(), + component.primary_port, + Some(30), + Some(3), + self.database_config.tls.clone(), + ) + } + }; + + // Create collector using simplified factory + match CollectorFactory::create_collector(self.collection_method.clone(), collector_config) { + Ok(mut collector) => { + // Initialize the collector + if let Err(e) = collector.initialize().await { + error!( + "Failed to initialize collector for {}: {}", + collector_key, e + ); + return; + } + + info!( + "Successfully initialized {} collector for {}", + collector.collection_method(), + collector_key + ); + + // Store table count before moving tables + let table_count = tables.len(); + + // Start the collector task + let out_clone = self.out.clone(); + let collection_config_clone = self.collection_config.clone(); + let handle = tokio::spawn(async move { + Self::run_collector_task(collector, tables, out_clone, collection_config_clone) + .await; + }); + let task = CollectorTask { + handle, + collector_type: self.collection_method.clone(), + table_count, + }; + + self.running_collectors + .insert(collector_key.to_string(), task); + } + Err(e) => { + error!("Failed to create collector for {}: {}", collector_key, e); + } + } + } + + /// Run a collector task for multiple tables + async fn run_collector_task( + collector: Box, + tables: Vec, + mut out: SourceSender, + collection_config: CollectionConfig, + ) { + use crate::sources::system_tables::data_collector::utils::{ + create_event_from_result, parse_collection_interval, + }; + + let table_config = &tables[0]; // Use first table's config as reference + let interval_seconds = + parse_collection_interval(&table_config.collection_interval, &collection_config); + let interval_duration = Duration::from_secs(interval_seconds); + + let table_names: Vec = tables.iter().map(|t| t.source_table.clone()).collect(); + + info!( + "📊 Starting collection loop for tables: [{}] with interval: {}s ({}) 
[config: short={}s, long={}s]", + table_names.join(", "), + interval_seconds, + &table_config.collection_interval, + collection_config.short_interval, + collection_config.long_interval + ); + + let mut collection_interval = interval(interval_duration); + + loop { + collection_interval.tick().await; + + info!( + "🔄 Collection cycle starting - interval: {}s, tables: [{}]", + interval_seconds, + table_names.join(", ") + ); + + // Collect data from each table + for table in &tables { + if !table.enabled { + continue; + } + + // Check if collector can handle this table + if !collector.can_collect_table(table) { + warn!( + "Collector {} cannot handle table {}.{}", + collector.collection_method(), + table.source_schema, + table.source_table + ); + continue; + } + + match collector.collect_table_data(table).await { + Ok(result) => { + let row_count = result.data.len(); + info!( + "Collected {} rows from table {} using {}", + row_count, + table.source_table, + collector.collection_method() + ); + + // Convert data to events and send + for row_data in &result.data { + let event = create_event_from_result(&result, row_data.clone()); + + // Send event to sinks + if let Err(e) = out.send_event(event).await { + error!( + "Failed to send event for table {}: {}", + table.source_table, e + ); + } else { + debug!("Successfully sent event for table {}", table.source_table); + } + } + } + Err(e) => { + error!( + "Failed to collect data from table {} using {}: {}", + table.source_table, + collector.collection_method(), + e + ); + } + } + } + + // Perform periodic health check + if let Err(e) = collector.health_check().await { + warn!( + "Health check failed for {} collector: {}", + collector.collection_method(), + e + ); + } + } + } + + /// Stop a collector by its key + async fn stop_collector(&mut self, collector_key: &str) { + if let Some(task) = self.running_collectors.remove(collector_key) { + info!( + "Stopping {} collector with key: {} ({} tables)", + task.collector_type, collector_key, task.table_count + ); + task.handle.abort(); + info!("Stopped collector with key: {}", collector_key); + } + } + + /// Stop all collectors for a specific instance + async fn stop_collector_by_instance(&mut self, instance: &str) { + let keys_to_remove: Vec = self + .running_collectors + .keys() + .filter(|key| key.starts_with(instance)) + .cloned() + .collect(); + + for key in keys_to_remove { + self.stop_collector(&key).await; + } + } + + /// Shutdown all collectors + async fn shutdown_all_collectors(&mut self) { + for (collector_key, task) in self.running_collectors.drain() { + info!( + "Shutting down {} collector with key: {} ({} tables)", + task.collector_type, collector_key, task.table_count + ); + task.handle.abort(); + } + info!("All collectors shut down"); + } +} diff --git a/src/sources/system_tables/data_collector.rs b/src/sources/system_tables/data_collector.rs new file mode 100644 index 0000000..c6221e7 --- /dev/null +++ b/src/sources/system_tables/data_collector.rs @@ -0,0 +1,308 @@ +use std::collections::HashMap; +use std::fmt; +use std::sync::atomic::{AtomicU64, Ordering}; + +use async_trait::async_trait; +use serde_json::Value; + +use crate::sources::system_tables::{CollectionConfig, DatabaseConfig, TableConfig}; + +/// Global counter for generating unique incremental IDs +static GLOBAL_ID_COUNTER: AtomicU64 = AtomicU64::new(1); + +/// Error types for data collection +#[derive(Debug)] +pub enum CollectionError { + ConnectionError(String), + QueryError(String), + ParseError(String), + 
ConfigurationError(String), + NetworkError(String), +} + +impl fmt::Display for CollectionError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + CollectionError::ConnectionError(msg) => write!(f, "Connection error: {}", msg), + CollectionError::QueryError(msg) => write!(f, "Query error: {}", msg), + CollectionError::ParseError(msg) => write!(f, "Parse error: {}", msg), + CollectionError::ConfigurationError(msg) => write!(f, "Configuration error: {}", msg), + CollectionError::NetworkError(msg) => write!(f, "Network error: {}", msg), + } + } +} + +impl std::error::Error for CollectionError {} + +/// Collection method type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum CollectionMethod { + /// Traditional SQL-based collection via MySQL protocol + Sql, + /// gRPC coprocessor-based collection + Coprocessor, + /// HTTP API-based collection + HttpApi, + /// Custom gRPC service collection + CustomGrpc, +} + +impl fmt::Display for CollectionMethod { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + CollectionMethod::Sql => write!(f, "sql"), + CollectionMethod::Coprocessor => write!(f, "coprocessor"), + CollectionMethod::HttpApi => write!(f, "http_api"), + CollectionMethod::CustomGrpc => write!(f, "custom_grpc"), + } + } +} + +impl CollectionMethod { + pub fn from_string(s: &str) -> Result { + match s.to_lowercase().as_str() { + "sql" => Ok(CollectionMethod::Sql), + "coprocessor" => Ok(CollectionMethod::Coprocessor), + "http_api" | "http" => Ok(CollectionMethod::HttpApi), + "custom_grpc" | "grpc" => Ok(CollectionMethod::CustomGrpc), + _ => Err(CollectionError::ConfigurationError(format!( + "Unknown collection method: {}. Supported: sql, coprocessor, http_api, custom_grpc", + s + ))), + } + } +} + +/// Metadata about the collection process +#[derive(Debug, Clone)] +pub struct CollectionMetadata { + /// Instance identifier + pub instance: String, + /// Table configuration + pub table_config: TableConfig, + /// Collection method used + pub collection_method: CollectionMethod, + /// Collection timestamp + pub timestamp: chrono::DateTime, + /// Number of rows collected + pub row_count: usize, + /// Collection duration in milliseconds + pub duration_ms: u64, + /// Additional metadata + pub extra: HashMap, +} + +/// Result of a data collection operation +#[derive(Debug)] +pub struct CollectionResult { + /// Collected data rows + pub data: Vec>, + /// Collection metadata + pub metadata: CollectionMetadata, +} + +/// Configuration for collection process +#[derive(Debug, Clone)] +pub struct CollectorConfig { + /// Instance identifier + pub instance: String, + /// Collector-specific configuration + pub config_type: CollectorConfigType, +} + +/// Collector-specific configuration variants +#[derive(Debug, Clone)] +pub enum CollectorConfigType { + /// SQL collector configuration + Sql { database_config: DatabaseConfig }, + /// Coprocessor collector configuration + Coprocessor { + host: String, + port: u16, + #[allow(dead_code)] + grpc_timeout_secs: u64, + #[allow(dead_code)] + max_retries: u32, + /// TLS configuration for HTTP schema fetching + tls: Option, + }, + /// HTTP API collector configuration + HttpApi { + #[allow(dead_code)] + host: String, + #[allow(dead_code)] + port: u16, + #[allow(dead_code)] + timeout_secs: u64, + #[allow(dead_code)] + max_retries: u32, + }, +} + +impl CollectorConfig { + /// Create configuration for SQL collector + pub fn for_sql(instance: String, database_config: DatabaseConfig) -> Self { + Self { + instance, + 
config_type: CollectorConfigType::Sql { database_config }, + } + } + + /// Create configuration for Coprocessor collector + pub fn for_coprocessor( + instance: String, + host: String, + port: u16, + grpc_timeout_secs: Option, + max_retries: Option, + tls: Option, + ) -> Self { + Self { + instance, + config_type: CollectorConfigType::Coprocessor { + host, + port, + grpc_timeout_secs: grpc_timeout_secs.unwrap_or(30), + max_retries: max_retries.unwrap_or(3), + tls, + }, + } + } + + /// Create configuration for HTTP API collector + pub fn for_http_api( + instance: String, + host: String, + port: u16, + timeout_secs: Option, + max_retries: Option, + ) -> Self { + Self { + instance, + config_type: CollectorConfigType::HttpApi { + host, + port, + timeout_secs: timeout_secs.unwrap_or(30), + max_retries: max_retries.unwrap_or(3), + }, + } + } +} + +/// Abstract trait for data collectors +#[async_trait] +pub trait DataCollector: Send + Sync + 'static { + /// Get the collection method this collector supports + fn collection_method(&self) -> CollectionMethod; + + /// Check if this collector can handle the given table + fn can_collect_table(&self, table: &TableConfig) -> bool; + + /// Initialize the collector (e.g., establish connections, verify config) + async fn initialize(&mut self) -> Result<(), CollectionError>; + + /// Collect data from a single table + async fn collect_table_data( + &self, + table: &TableConfig, + ) -> Result; + + /// Get collector health status + async fn health_check(&self) -> Result<(), CollectionError>; +} + +/// Utility functions for collection +pub mod utils { + use super::*; + use vector_lib::event::{Event, LogEvent}; + + /// Create a Vector event from collection result + pub fn create_event_from_result( + result: &CollectionResult, + row_data: HashMap, + ) -> Event { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Generate unique incremental ID + let unique_id = GLOBAL_ID_COUNTER.fetch_add(1, Ordering::SeqCst); + log.insert( + "_vector_id", + Value::Number(serde_json::Number::from(unique_id)), + ); + + // Add standard metadata + log.insert( + "_vector_table", + result.metadata.table_config.dest_table.clone(), + ); + log.insert( + "_vector_source_table", + result.metadata.table_config.source_table.clone(), + ); + log.insert( + "_vector_source_schema", + result.metadata.table_config.source_schema.clone(), + ); + log.insert("_vector_instance", result.metadata.instance.clone()); + log.insert("_vector_timestamp", result.metadata.timestamp.to_rfc3339()); + log.insert( + "_vector_collection_method", + result.metadata.collection_method.to_string(), + ); + + // Add performance metadata + log.insert( + "_vector_collection_duration_ms", + result.metadata.duration_ms as i64, + ); + log.insert("_vector_row_count", result.metadata.row_count as i64); + + // Add extra metadata + for (key, value) in &result.metadata.extra { + if key == "schema_metadata" { + // Add schema metadata directly as _schema_metadata for DeltaLake writer + log.insert("_schema_metadata", value.clone()); + } + // Intentionally skip writing generic _vector_meta_* fields + } + + // Add the actual row data + for (key, value) in row_data { + log.insert(key.as_str(), value); + } + + // For non-cluster tables, add instance column to the actual data + if !result + .metadata + .table_config + .source_table + .starts_with("CLUSTER_") + { + log.insert("instance", result.metadata.instance.clone()); + } + + event + } + + /// Parse collection interval + pub fn parse_collection_interval( 
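+        // interval_str accepts "short", "long", or "custom=<seconds>" (for example
+        // "custom=120"); any other or unparsable value falls back to
+        // collection_config.short_interval.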
+ interval_str: &str, + collection_config: &CollectionConfig, + ) -> u64 { + match interval_str { + "short" => collection_config.short_interval, + "long" => collection_config.long_interval, + custom if custom.starts_with("custom=") => { + if let Some(seconds) = custom.strip_prefix("custom=") { + seconds + .parse::() + .unwrap_or(collection_config.short_interval) + } else { + collection_config.short_interval + } + } + _ => collection_config.short_interval, + } + } +} diff --git a/src/sources/system_tables/mod.rs b/src/sources/system_tables/mod.rs new file mode 100644 index 0000000..2328d60 --- /dev/null +++ b/src/sources/system_tables/mod.rs @@ -0,0 +1,497 @@ +use std::env; +use std::time::Duration; + +use serde::{Deserialize, Serialize}; +use vector::config::{GenerateConfig, SourceConfig, SourceContext}; +use vector_lib::{ + config::{DataType, LogNamespace, SourceOutput}, + configurable::configurable_component, + source::Source, + tls::TlsConfig, +}; + +use crate::sources::system_tables::controller::Controller; + +// New abstracted collectors +mod collector_factory; +mod collectors; +mod data_collector; + +// Main controller +mod controller; + +/// Environment variable names for database configuration +pub struct DatabaseEnvVars; + +impl DatabaseEnvVars { + pub const USERNAME: &'static str = "TIDB_USERNAME"; + pub const PASSWORD: &'static str = "TIDB_PASSWORD"; + pub const HOST: &'static str = "TIDB_HOST"; + pub const PORT: &'static str = "TIDB_PORT"; + pub const DATABASE: &'static str = "TIDB_DATABASE"; + pub const MAX_CONNECTIONS: &'static str = "TIDB_MAX_CONNECTIONS"; + pub const CONNECT_TIMEOUT: &'static str = "TIDB_CONNECT_TIMEOUT"; + + // TLS related environment variables + pub const TLS_CA_FILE: &'static str = "TIDB_TLS_CA_FILE"; + pub const TLS_CERT_FILE: &'static str = "TIDB_TLS_CERT_FILE"; + pub const TLS_KEY_FILE: &'static str = "TIDB_TLS_KEY_FILE"; + pub const TLS_VERIFY_CERTIFICATE: &'static str = "TIDB_TLS_VERIFY_CERTIFICATE"; + pub const TLS_VERIFY_HOSTNAME: &'static str = "TIDB_TLS_VERIFY_HOSTNAME"; + + // PD/Topology related environment variables + pub const PD_ADDRESS: &'static str = "PD_ADDRESS"; + pub const TIDB_GROUP: &'static str = "TIDB_GROUP"; + pub const LABEL_K8S_INSTANCE: &'static str = "LABEL_K8S_INSTANCE"; + + // PD TLS environment variables + pub const PD_TLS_CA_FILE: &'static str = "PD_TLS_CA_FILE"; + pub const PD_TLS_CERT_FILE: &'static str = "PD_TLS_CERT_FILE"; + pub const PD_TLS_KEY_FILE: &'static str = "PD_TLS_KEY_FILE"; + pub const PD_TLS_VERIFY_CERTIFICATE: &'static str = "PD_TLS_VERIFY_CERTIFICATE"; + pub const PD_TLS_VERIFY_HOSTNAME: &'static str = "PD_TLS_VERIFY_HOSTNAME"; + + // Collection configuration environment variables + pub const SHORT_INTERVAL: &'static str = "SYSTEM_TABLES_SHORT_INTERVAL"; + pub const LONG_INTERVAL: &'static str = "SYSTEM_TABLES_LONG_INTERVAL"; + pub const RETENTION_DAYS: &'static str = "SYSTEM_TABLES_RETENTION_DAYS"; + pub const TOPOLOGY_FETCH_INTERVAL: &'static str = "TOPOLOGY_FETCH_INTERVAL_SECONDS"; + pub const COLLECTION_METHOD: &'static str = "SYSTEM_TABLES_COLLECTION_METHOD"; +} + +/// Configuration for the system_tables source +#[configurable_component(source("system_tables"))] +#[derive(Debug, Clone)] +pub struct SystemTablesConfig { + /// PD address for legacy mode (to discover TiDB instances) + pub pd_address: Option, + + /// TiDB group name for nextgen mode + pub tidb_group: Option, + + /// Kubernetes instance label for nextgen mode + pub label_k8s_instance: Option, + + /// Database username (required for SQL 
collection method, optional for coprocessor) + pub database_username: Option, + /// Database password (required for SQL collection method, optional for coprocessor) + pub database_password: Option, + /// Database host (required for SQL collection method, optional for coprocessor) + pub database_host: Option, + /// Database port (required for SQL collection method, optional for coprocessor) + pub database_port: Option, + /// Database name (required for SQL collection method, optional for coprocessor) + pub database_name: Option, + /// Database max connections + pub database_max_connections: Option, + /// Database connect timeout + pub database_connect_timeout: Option, + + /// Short interval for high-frequency tables (seconds) + pub short_interval: u64, + /// Long interval for low-frequency tables (seconds) + pub long_interval: u64, + /// Data retention days + pub retention_days: u32, + + /// Tables to collect data from (array of table configurations) + pub tables: Vec, + + /// TLS configuration for PD/etcd connections + pub pd_tls: Option, + + /// TLS configuration for database connections + pub database_tls: Option, + + /// TiDB topology fetch interval in seconds + #[serde(default = "default_topology_fetch_interval")] + pub topology_fetch_interval_seconds: f64, + + /// Collection method: "coprocessor" for gRPC coprocessor-based collection (default), "sql" for SQL-based collection + #[serde(default = "default_collection_method")] + pub collection_method: String, +} + +/// Database connection configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DatabaseConfig { + pub username: String, + pub password: String, + pub host: String, + pub port: u16, + pub database: String, + pub max_connections: Option, + pub connect_timeout: Option, + pub tls: Option, +} + +/// Collection interval configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CollectionConfig { + /// Short interval for high-frequency tables (seconds) + pub short_interval: u64, + /// Long interval for low-frequency tables (seconds) + pub long_interval: u64, + /// Data retention days + pub retention_days: u32, +} + +/// Table configuration for data collection +#[derive(Debug, Clone, Serialize, Deserialize, ::vector_config::Configurable)] +pub struct TableConfig { + /// Source schema name + #[configurable(derived)] + pub source_schema: String, + /// Source table name + #[configurable(derived)] + pub source_table: String, + /// Destination table name + #[configurable(derived)] + pub dest_table: String, + /// Collection interval (short/long) + #[configurable(derived)] + pub collection_interval: String, + /// Optional WHERE clause + #[configurable(derived)] + pub where_clause: Option, + /// Whether this table is enabled + #[configurable(derived)] + pub enabled: bool, +} + +/// Collection interval type +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum CollectionInterval { + /// Use short interval + Short, + /// Use long interval + Long, + /// Custom interval in seconds + Custom(u64), +} + +pub const fn default_topology_fetch_interval() -> f64 { + 30.0 +} + +pub fn default_collection_method() -> String { + "coprocessor".to_string() +} + +/// Helper functions for reading environment variables +impl SystemTablesConfig { + /// Validate configuration based on collection method + pub fn validate(&self) -> vector::Result<()> { + match self.collection_method.to_lowercase().as_str() { + "sql" => { + // For SQL collection method, database fields are required + if self.database_username.is_none() { + 
return Err("missing field `database_username` in `sources.tidb_system_tables` (required for SQL collection method)".into()); + } + if self.database_password.is_none() { + return Err("missing field `database_password` in `sources.tidb_system_tables` (required for SQL collection method)".into()); + } + if self.database_host.is_none() { + return Err("missing field `database_host` in `sources.tidb_system_tables` (required for SQL collection method)".into()); + } + if self.database_port.is_none() { + return Err("missing field `database_port` in `sources.tidb_system_tables` (required for SQL collection method)".into()); + } + if self.database_name.is_none() { + return Err("missing field `database_name` in `sources.tidb_system_tables` (required for SQL collection method)".into()); + } + } + "coprocessor" | "http_api" | "custom_grpc" => { + // For coprocessor and other methods, database fields are optional + // These methods use gRPC/HTTP to communicate directly with TiKV/PD + info!( + "Using {} collection method - database connection fields are optional", + self.collection_method + ); + } + _ => { + return Err(format!("unsupported collection method: {}. Supported methods: sql, coprocessor, http_api, custom_grpc", self.collection_method).into()); + } + } + Ok(()) + } + /// Helper function to build TLS configuration from environment variables + fn build_tls_config_from_env( + ca_file_env: &str, + cert_file_env: &str, + key_file_env: &str, + verify_cert_env: &str, + verify_hostname_env: &str, + ) -> Option { + let ca_file = env::var(ca_file_env).ok().map(|p| p.into()); + let crt_file = env::var(cert_file_env).ok().map(|p| p.into()); + let key_file = env::var(key_file_env).ok().map(|p| p.into()); + let verify_certificate = env::var(verify_cert_env).ok().and_then(|s| s.parse().ok()); + let verify_hostname = env::var(verify_hostname_env) + .ok() + .and_then(|s| s.parse().ok()); + + // Only create TLS config if at least one TLS-related env var is set + if ca_file.is_some() || crt_file.is_some() || key_file.is_some() { + Some(TlsConfig { + ca_file, + crt_file, + key_file, + verify_certificate, + verify_hostname, + ..Default::default() + }) + } else { + None + } + } + + /// Merge configuration with values from environment variables + /// Environment variables take precedence over configuration file values + pub fn merge_with_env(&mut self) { + // Override with environment variables if they exist + if let Ok(val) = env::var(DatabaseEnvVars::PD_ADDRESS) { + self.pd_address = Some(val); + } + if let Ok(val) = env::var(DatabaseEnvVars::TIDB_GROUP) { + self.tidb_group = Some(val); + } + if let Ok(val) = env::var(DatabaseEnvVars::LABEL_K8S_INSTANCE) { + self.label_k8s_instance = Some(val); + } + if let Ok(val) = env::var(DatabaseEnvVars::USERNAME) { + self.database_username = Some(val); + } + if let Ok(val) = env::var(DatabaseEnvVars::PASSWORD) { + self.database_password = Some(val); + } + if let Ok(val) = env::var(DatabaseEnvVars::HOST) { + self.database_host = Some(val); + } + if let Ok(val) = env::var(DatabaseEnvVars::PORT) { + if let Ok(port) = val.parse() { + self.database_port = Some(port); + } + } + if let Ok(val) = env::var(DatabaseEnvVars::DATABASE) { + self.database_name = Some(val); + } + if let Ok(val) = env::var(DatabaseEnvVars::MAX_CONNECTIONS) { + if let Ok(connections) = val.parse() { + self.database_max_connections = Some(connections); + } + } + if let Ok(val) = env::var(DatabaseEnvVars::CONNECT_TIMEOUT) { + if let Ok(timeout) = val.parse() { + self.database_connect_timeout = Some(timeout); + } 
+ } + if let Ok(val) = env::var(DatabaseEnvVars::SHORT_INTERVAL) { + if let Ok(interval) = val.parse() { + self.short_interval = interval; + } + } + if let Ok(val) = env::var(DatabaseEnvVars::LONG_INTERVAL) { + if let Ok(interval) = val.parse() { + self.long_interval = interval; + } + } + if let Ok(val) = env::var(DatabaseEnvVars::RETENTION_DAYS) { + if let Ok(days) = val.parse() { + self.retention_days = days; + } + } + if let Ok(val) = env::var(DatabaseEnvVars::TOPOLOGY_FETCH_INTERVAL) { + if let Ok(interval) = val.parse() { + self.topology_fetch_interval_seconds = interval; + } + } + if let Ok(val) = env::var(DatabaseEnvVars::COLLECTION_METHOD) { + self.collection_method = val; + } + + // Merge TLS configurations + if let Some(env_tls) = Self::build_tls_config_from_env( + DatabaseEnvVars::TLS_CA_FILE, + DatabaseEnvVars::TLS_CERT_FILE, + DatabaseEnvVars::TLS_KEY_FILE, + DatabaseEnvVars::TLS_VERIFY_CERTIFICATE, + DatabaseEnvVars::TLS_VERIFY_HOSTNAME, + ) { + self.database_tls = Some(env_tls); + } + + if let Some(env_pd_tls) = Self::build_tls_config_from_env( + DatabaseEnvVars::PD_TLS_CA_FILE, + DatabaseEnvVars::PD_TLS_CERT_FILE, + DatabaseEnvVars::PD_TLS_KEY_FILE, + DatabaseEnvVars::PD_TLS_VERIFY_CERTIFICATE, + DatabaseEnvVars::PD_TLS_VERIFY_HOSTNAME, + ) { + self.pd_tls = Some(env_pd_tls); + } + } +} + +impl GenerateConfig for SystemTablesConfig { + fn generate_config() -> toml::Value { + toml::Value::try_from(Self { + pd_address: Some("127.0.0.1:2379".to_owned()), + tidb_group: None, + label_k8s_instance: None, + database_username: Some("root".to_owned()), + database_password: Some("".to_owned()), + database_host: Some("127.0.0.1".to_owned()), + database_port: Some(4000), + database_name: Some("test".to_owned()), + database_max_connections: Some(10), + database_connect_timeout: Some(30), + short_interval: 5, + long_interval: 1800, + retention_days: 7, + tables: vec![TableConfig { + source_schema: "information_schema".to_owned(), + source_table: "PROCESSLIST".to_owned(), + dest_table: "hist_processlist".to_owned(), + collection_interval: "short".to_owned(), + where_clause: Some("command != 'Sleep'".to_owned()), + enabled: true, + }], + pd_tls: None, + database_tls: None, + topology_fetch_interval_seconds: default_topology_fetch_interval(), + collection_method: default_collection_method(), + }) + .unwrap() + } +} + +#[async_trait::async_trait] +#[typetag::serde(name = "system_tables")] +impl SourceConfig for SystemTablesConfig { + async fn build(&self, cx: SourceContext) -> vector::Result { + // Clone configuration and merge with environment variables + // Environment variables take precedence over config file values + let mut config = self.clone(); + config.merge_with_env(); + + // Validate configuration based on collection method + config.validate()?; + + info!("Building system_tables source with configuration:"); + if let (Some(ref host), Some(port), Some(ref database)) = ( + &config.database_host, + config.database_port, + &config.database_name, + ) { + info!(" Database: {}:{}/{}", host, port, database); + } else { + info!(" Database: Not configured (using coprocessor method)"); + } + if let Some(ref username) = config.database_username { + info!(" Username: {}", username); + } else { + info!(" Username: Not configured (using coprocessor method)"); + } + info!(" Max connections: {:?}", config.database_max_connections); + info!(" Connect timeout: {:?}", config.database_connect_timeout); + info!(" Database TLS enabled: {}", config.database_tls.is_some()); + if let Some(ref pd_addr) = 
config.pd_address { + info!(" PD address: {}", pd_addr); + } + info!(" PD TLS enabled: {}", config.pd_tls.is_some()); + info!(" Tables configured: {}", config.tables.len()); + + let topology_fetch_interval = + Duration::from_secs_f64(config.topology_fetch_interval_seconds); + let pd_address = config.pd_address.clone(); + let tidb_group = config.tidb_group.clone(); + let label_k8s_instance = config.label_k8s_instance.clone(); + + // Create DatabaseConfig from merged configuration only if using SQL collection method + let database_config = if config.collection_method.to_lowercase() == "sql" { + DatabaseConfig { + username: config.database_username.clone().unwrap_or_default(), + password: config.database_password.clone().unwrap_or_default(), + host: config.database_host.clone().unwrap_or_default(), + port: config.database_port.unwrap_or(4000), + database: config.database_name.clone().unwrap_or_default(), + max_connections: config.database_max_connections, + connect_timeout: config.database_connect_timeout, + tls: config.database_tls.clone(), + } + } else { + // For non-SQL collection methods (coprocessor, etc.), use dummy database config + // This config won't be used but is required by the Controller constructor + DatabaseConfig { + username: "unused".to_string(), + password: "unused".to_string(), + host: "unused".to_string(), + port: 0, + database: "unused".to_string(), + max_connections: None, + connect_timeout: None, + // Preserve database_tls here so coprocessor collectors can use it + // for HTTPS schema fetching via controller -> for_coprocessor(tls) + tls: config.database_tls.clone(), + } + }; + + // Create CollectionConfig from merged configuration + let collection_config = CollectionConfig { + short_interval: config.short_interval, + long_interval: config.long_interval, + retention_days: config.retention_days, + }; + + // Use tables from merged configuration + let tables = config.tables.clone(); + + let pd_tls = config.pd_tls.clone(); + let collection_method = config.collection_method.clone(); + + Ok(Box::pin(async move { + info!("Using system tables controller with abstracted collectors"); + let controller = Controller::new( + pd_address, + tidb_group, + label_k8s_instance, + topology_fetch_interval, + database_config, + collection_config, + tables, + pd_tls, + &cx.proxy, + cx.out, + collection_method, + ) + .await + .map_err(|error| error!(message = "Source failed to initialize.", %error))?; + + controller.run(cx.shutdown).await; + Ok(()) + })) + } + + fn outputs(&self, _: LogNamespace) -> Vec { + vec![SourceOutput { + port: None, + ty: DataType::Log, + schema_definition: None, + }] + } + + fn can_acknowledge(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generate_config() { + vector::test_util::test_generate_config::(); + } +} diff --git a/src/sources/topsql/controller.rs b/src/sources/topsql/controller.rs index aa194b3..601b782 100644 --- a/src/sources/topsql/controller.rs +++ b/src/sources/topsql/controller.rs @@ -8,9 +8,10 @@ use vector::shutdown::ShutdownSignal; use vector::SourceSender; use vector_lib::{config::proxy::ProxyConfig, tls::TlsConfig}; +use crate::common::features::is_nextgen_mode; +use crate::common::topology::{Component, FetchError, InstanceType, TopologyFetcher}; use crate::sources::topsql::schema_cache::{SchemaCache, SchemaManager}; use crate::sources::topsql::shutdown::{pair, ShutdownNotifier, ShutdownSubscriber}; -use crate::sources::topsql::topology::{Component, FetchError, InstanceType, TopologyFetcher}; use 
crate::sources::topsql::upstream::TopSQLSource; pub struct Controller { @@ -28,6 +29,8 @@ pub struct Controller { schema_update_interval: Duration, active_schema_manager: Option, keyspace_to_vmtenants: HashMap, + enable_row_format: bool, + partition_number: u32, out: SourceSender, } @@ -49,6 +52,8 @@ impl Controller { tidb_group: Option, label_k8s_instance: Option, keyspace_to_vmtenants: HashMap, + enable_row_format: bool, + partition_number: u32, out: SourceSender, ) -> vector::Result { let topo_fetcher = TopologyFetcher::new( @@ -80,6 +85,8 @@ impl Controller { schema_update_interval, active_schema_manager: None, keyspace_to_vmtenants, + enable_row_format, + partition_number, out, }) } @@ -142,7 +149,7 @@ impl Controller { }; // If we need to update the schema manager, find an available TiDB instance - if need_update_schema_manager { + if need_update_schema_manager && !is_nextgen_mode() { self.update_schema_manager(&latest_components).await; } @@ -170,7 +177,6 @@ impl Controller { let mut shuffled_components = tidb_components.clone(); shuffled_components.shuffle(&mut rand::rng()); - // Use the method to update schema_manager self.update_schema_manager_with_components(&shuffled_components) .await; } @@ -184,8 +190,6 @@ impl Controller { // Try each TiDB instance until one succeeds for tidb in tidb_components { - info!(message = "Trying schema manager with TiDB instance", instance = %tidb); - let tidb_address = format!("{}:{}", tidb.host, tidb.secondary_port); // Use async constructor with TLS configuration and pass existing schema_cache @@ -210,12 +214,11 @@ impl Controller { // Convert ShutdownSubscriber to broadcast::Receiver<()> let shutdown = self.shutdown_subscriber.subscribe(); - use crate::common::features::is_nextgen_mode; - if is_nextgen_mode() { // Schema manager is not supported in nextgen mode info!(message = "Schema manager is not supported in nextgen mode"); } else { + info!(message = "Trying schema manager with TiDB instance", instance = %tidb); // Clone the etcd client for the schema manager if let Some(etcd_client) = self.topo_fetcher.etcd_client() { let etcd_client = etcd_client.clone(); @@ -235,16 +238,16 @@ impl Controller { } else { error!(message = "Etcd client not available for schema manager"); } - } - info!( - message = "Started schema manager successfully", - instance = %tidb, - entries = cache.entry_count(), - schema_version = cache.schema_version(), - memory_usage_bytes = cache.memory_usage(), - memory_usage_kb = cache.memory_usage() / 1024 - ); + info!( + message = "Started schema manager successfully", + instance = %tidb, + entries = cache.entry_count(), + schema_version = cache.schema_version(), + memory_usage_bytes = cache.memory_usage(), + memory_usage_kb = cache.memory_usage() / 1024 + ); + } // Successfully started, exit the loop return; @@ -264,6 +267,8 @@ impl Controller { self.downsampling_interval, self.schema_cache.clone(), self.keyspace_to_vmtenants.clone(), + self.enable_row_format, + self.partition_number, ); let source = match source { Some(source) => source, diff --git a/src/sources/topsql/mod.rs b/src/sources/topsql/mod.rs index 844a50e..0b5ec82 100644 --- a/src/sources/topsql/mod.rs +++ b/src/sources/topsql/mod.rs @@ -16,7 +16,6 @@ pub use upstream::parser; mod controller; mod schema_cache; pub mod shutdown; -pub mod topology; pub mod upstream; /// PLACEHOLDER @@ -54,6 +53,14 @@ pub struct TopSQLConfig { /// Keyspace to VM tenants mapping for nextgen mode #[serde(skip)] pub keyspace_to_vmtenants: Option>, + + /// enable_row_format + 
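// Sketch of the "shuffle, then try each candidate until one works" pattern the controller
// uses when picking a TiDB instance for the schema manager. Uses std only; `connect` is a
// hypothetical fallible constructor standing in for the schema manager setup.
fn pick_working_instance<T>(
    candidates: &[String],
    mut connect: impl FnMut(&str) -> Result<T, String>,
) -> Option<T> {
    for addr in candidates {
        match connect(addr) {
            Ok(handle) => return Some(handle),
            Err(err) => eprintln!("schema manager init failed on {addr}: {err}"),
        }
    }
    None // caller retries on the next topology refresh
}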
#[serde(default = "default_enable_row_format")] + pub enable_row_format: bool, + + /// Partition number + #[serde(default = "default_partition_number")] + pub partition_number: u32, } pub const fn default_init_retry_delay() -> f64 { @@ -72,6 +79,14 @@ pub const fn default_downsampling_interval() -> u32 { 0 } +pub const fn default_enable_row_format() -> bool { + false +} + +pub const fn default_partition_number() -> u32 { + 1 +} + impl GenerateConfig for TopSQLConfig { fn generate_config() -> toml::Value { toml::Value::try_from(Self { @@ -84,6 +99,8 @@ impl GenerateConfig for TopSQLConfig { tidb_group: None, label_k8s_instance: None, keyspace_to_vmtenants: None, + enable_row_format: default_enable_row_format(), + partition_number: default_partition_number(), }) .unwrap() } @@ -106,6 +123,9 @@ impl SourceConfig for TopSQLConfig { let tidb_group = self.tidb_group.clone(); let label_k8s_instance = self.label_k8s_instance.clone(); let keyspace_to_vmtenants = self.keyspace_to_vmtenants.clone().unwrap_or_default(); + let enable_row_format = self.enable_row_format; + let partition_number = self.partition_number; + info!("TopSql source enable_row_format: {}", enable_row_format); Ok(Box::pin(async move { let controller = Controller::new( @@ -120,6 +140,8 @@ impl SourceConfig for TopSQLConfig { tidb_group, label_k8s_instance, keyspace_to_vmtenants, + enable_row_format, + partition_number, cx.out, ) .await diff --git a/src/sources/topsql/upstream/consts.rs b/src/sources/topsql/upstream/consts.rs index f3db51c..fe73c68 100644 --- a/src/sources/topsql/upstream/consts.rs +++ b/src/sources/topsql/upstream/consts.rs @@ -6,6 +6,7 @@ pub const LABEL_INSTANCE: &str = "instance"; pub const LABEL_DB_NAME: &str = "db"; pub const LABEL_TABLE_NAME: &str = "table"; pub const LABEL_TABLE_ID: &str = "table_id"; +pub const LABEL_REGION_ID: &str = "region_id"; pub const LABEL_INSTANCE_TYPE: &str = "instance_type"; pub const LABEL_SQL_DIGEST: &str = "sql_digest"; pub const LABEL_PLAN_DIGEST: &str = "plan_digest"; @@ -21,6 +22,12 @@ pub const METRIC_NAME_WRITE_KEYS: &str = "topsql_write_keys"; pub const METRIC_NAME_STMT_EXEC_COUNT: &str = "topsql_stmt_exec_count"; pub const METRIC_NAME_STMT_DURATION_SUM_NS: &str = "topsql_stmt_duration_sum_ns"; pub const METRIC_NAME_STMT_DURATION_COUNT: &str = "topsql_stmt_duration_count"; +pub const METRIC_NAME_NETWORK_BYTES: &str = "topsql_network_bytes"; +pub const METRIC_NAME_NETWORK_IN_BYTES: &str = "topsql_network_in_bytes"; +pub const METRIC_NAME_NETWORK_OUT_BYTES: &str = "topsql_network_out_bytes"; +pub const METRIC_NAME_LOGICAL_BYTES: &str = "topsql_logical_bytes"; +pub const METRIC_NAME_LOGICAL_READ_BYTES: &str = "topsql_logical_read_bytes"; +pub const METRIC_NAME_LOGICAL_WRITE_BYTES: &str = "topsql_logical_write_bytes"; pub const METRIC_NAME_SQL_META: &str = "topsql_sql_meta"; pub const METRIC_NAME_PLAN_META: &str = "topsql_plan_meta"; pub const METRIC_NAME_INSTANCE: &str = "topsql_instance"; diff --git a/src/sources/topsql/upstream/mod.rs b/src/sources/topsql/upstream/mod.rs index 7d1812c..e13b196 100644 --- a/src/sources/topsql/upstream/mod.rs +++ b/src/sources/topsql/upstream/mod.rs @@ -2,7 +2,7 @@ pub mod parser; pub mod tidb; pub mod tikv; -mod consts; +pub(crate) mod consts; mod tls_proxy; mod utils; @@ -17,15 +17,20 @@ use tonic::transport::{Channel, Endpoint}; use vector::{internal_events::StreamClosedError, SourceSender}; use vector_lib::{ byte_size_of::ByteSizeOf, - internal_event::{ByteSize, BytesReceived, CountByteSize, EventsReceived, InternalEvent, 
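// Minimal sketch of how these serde defaults behave (a stripped-down stand-in, not the real
// TopSQLConfig; assumes serde's derive feature): fields omitted from an existing config fall
// back to the default_* functions, so row formatting stays off and a single partition is used.
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct RowFormatOpts {
    #[serde(default = "default_enable_row_format")]
    enable_row_format: bool,
    #[serde(default = "default_partition_number")]
    partition_number: u32,
}

fn default_enable_row_format() -> bool { false }
fn default_partition_number() -> u32 { 1 }

// e.g. toml::from_str::<RowFormatOpts>("") yields enable_row_format = false, partition_number = 1;
// toml::from_str::<RowFormatOpts>("enable_row_format = true") still gets partition_number = 1.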
InternalEventHandle}, + internal_event::{ + ByteSize, BytesReceived, CountByteSize, EventsReceived, InternalEvent, InternalEventHandle, + }, register, tls::TlsConfig, + event::{Event, LogEvent, Value as LogValue}, }; +use chrono::Utc; +use crc32fast::Hasher as Crc32Hasher; +use crate::common::topology::{Component, InstanceType}; use crate::sources::topsql::{ schema_cache::SchemaCache, shutdown::ShutdownSubscriber, - topology::{Component, InstanceType}, upstream::{ parser::UpstreamEventParser, tidb::TiDBUpstream, @@ -51,12 +56,20 @@ pub trait Upstream: Send { async fn build_stream( client: Self::Client, ) -> Result, tonic::Status>; + + fn get_wait_seconds() -> u64; } // Common trait for TopSQL source behavior #[async_trait::async_trait] trait TopSQLSourceBehavior { - async fn handle_instance_event(&self, instance: &str, instance_type: &str, out: &mut SourceSender); + async fn handle_instance_event( + &self, + instance: &str, + instance_type: &str, + enable_row_format: bool, + out: &mut SourceSender, + ); } // Base TopSQL source with common functionality @@ -72,6 +85,9 @@ struct BaseTopSQLSource { top_n: usize, downsampling_interval: u32, schema_cache: Arc, + enable_row_format: bool, + partition_number: u32, // Only used when enable_row_format is true + instance_partition_id: u32, // Only used when enable_row_format is true and partition_number > 0 } impl BaseTopSQLSource { @@ -83,6 +99,8 @@ impl BaseTopSQLSource { top_n: usize, downsampling_interval: u32, schema_cache: Arc, + enable_row_format: bool, + partition_number: u32, ) -> Option { let protocal = if tls.is_none() { "http".into() @@ -106,6 +124,9 @@ impl BaseTopSQLSource { top_n, downsampling_interval, schema_cache, + enable_row_format, + partition_number, + instance_partition_id: 0, }), None => None, } @@ -119,12 +140,22 @@ impl BaseTopSQLSource { } } - async fn run_loop(&mut self, shutdown_subscriber: ShutdownSubscriber, behavior: &B) { + async fn run_loop( + &mut self, + shutdown_subscriber: ShutdownSubscriber, + behavior: &B, + ) { loop { let shutdown_subscriber = shutdown_subscriber.clone(); let state = match self.instance_type { - InstanceType::TiDB => self.run_once::(shutdown_subscriber, behavior).await, - InstanceType::TiKV => self.run_once::(shutdown_subscriber, behavior).await, + InstanceType::TiDB => { + self.run_once::(shutdown_subscriber, behavior) + .await + } + InstanceType::TiKV => { + self.run_once::(shutdown_subscriber, behavior) + .await + } _ => unreachable!(), }; @@ -145,19 +176,24 @@ impl BaseTopSQLSource { } } - async fn run_once(&mut self, shutdown_subscriber: ShutdownSubscriber, behavior: &B) -> State { + async fn run_once( + &mut self, + shutdown_subscriber: ShutdownSubscriber, + behavior: &B, + ) -> State { let response_stream = self.build_stream::(shutdown_subscriber).await; let mut response_stream = match response_stream { Ok(stream) => stream, Err(state) => return state, }; - self.on_connected(); + self.on_connected().await; let mut tick_stream = IntervalStream::new(time::interval(Duration::from_secs(1))); let mut instance_stream = IntervalStream::new(time::interval(Duration::from_secs(30))); let mut responses = vec![]; - let mut last_event_recv_ts = chrono::Local::now().timestamp(); - loop { + let mut responses_recv_ts_vec = vec![]; + info!(message = "Starting TopSQL source loop", instance = %self.instance, instance_type = %self.instance_type); + let exit_state = loop { tokio::select! 
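// Sketch of the flush rule used by the loop below: responses are buffered and only handed
// off once the oldest buffered response is older than the upstream's wait window
// (get_wait_seconds() is 0 for TiDB and 59 for TiKV, presumably because TiKV's per-minute
// records arrive spread out over the minute). Plain unix timestamps, std only.
fn should_flush(first_recv_ts: i64, now_ts: i64, wait_seconds: u64) -> bool {
    // mirrors `now > responses_recv_ts_vec[0] + U::get_wait_seconds()`
    now_ts > first_recv_ts + wait_seconds as i64
}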
{ response = response_stream.next() => { match response { @@ -167,7 +203,7 @@ impl BaseTopSQLSource { }) .emit(ByteSize(response.size_of())); responses.push(response); - last_event_recv_ts = chrono::Local::now().timestamp(); + responses_recv_ts_vec.push(chrono::Local::now().timestamp()); }, Some(Err(error)) => { error!(message = "Failed to fetch events.", error = %error); @@ -177,16 +213,20 @@ impl BaseTopSQLSource { } } _ = tick_stream.next() => { - if chrono::Local::now().timestamp() > last_event_recv_ts + 10 { - if !responses.is_empty() { + if !responses.is_empty() { + if chrono::Local::now().timestamp() > responses_recv_ts_vec[0] + U::get_wait_seconds() as i64 { self.handle_responses::(responses).await; + responses_recv_ts_vec.clear(); responses = vec![]; } } } _ = instance_stream.next() => self.handle_instance(behavior).await, } - } + }; + + info!(message = "TopSQL source loop ended", instance = %self.instance, instance_type = %self.instance_type, exit_state = ?exit_state); + exit_state } async fn build_stream( @@ -242,6 +282,8 @@ impl BaseTopSQLSource { response, self.instance.clone(), self.schema_cache.clone(), + self.enable_row_format, + self.instance_partition_id, ); batch.append(&mut events); } @@ -254,12 +296,46 @@ impl BaseTopSQLSource { } async fn handle_instance(&mut self, behavior: &B) { - behavior.handle_instance_event(&self.instance, &self.instance_type.to_string(), &mut self.out).await; + behavior + .handle_instance_event( + &self.instance, + &self.instance_type.to_string(), + self.enable_row_format, + &mut self.out, + ) + .await; } - fn on_connected(&mut self) { + async fn on_connected(&mut self) { self.retry_delay = self.init_retry_delay; info!("Connected to the upstream."); + + if self.enable_row_format && self.partition_number > 0 { + // Calculate CRC32 for (instance, instance_type) + let instance_key = format!("{}_{}", self.instance, self.instance_type); + let mut hasher = Crc32Hasher::new(); + hasher.update(instance_key.as_bytes()); + let crc_value = hasher.finalize(); + + // Calculate partition by taking modulo + // Use max(1, partition_number) to avoid division by zero + let partition_number = if self.partition_number == 0 { 1 } else { self.partition_number }; + let instance_partition_id = (crc_value % partition_number as u32) as u32; + self.instance_partition_id = instance_partition_id; + + // Create and send LogEvent with (instance, instance_type, partition_number) + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + log.insert("source_table", "instance_partition"); + log.insert("timestamps", LogValue::from(Utc::now().timestamp())); + log.insert("instance", self.instance.clone()); + log.insert("instance_type", self.instance_type.to_string()); + log.insert("instance_partition_id", LogValue::from(instance_partition_id as i64)); + + if self.out.send_event(event).await.is_err() { + StreamClosedError { count: 1 }.emit(); + } + } } } @@ -268,10 +344,31 @@ struct LegacyTopSQLBehavior; #[async_trait::async_trait] impl TopSQLSourceBehavior for LegacyTopSQLBehavior { - async fn handle_instance_event(&self, instance: &str, instance_type: &str, out: &mut SourceSender) { - let event = instance_event(instance.to_string(), instance_type.to_string()); - if out.send_event(event).await.is_err() { - StreamClosedError { count: 1 }.emit(); + async fn handle_instance_event( + &self, + instance: &str, + instance_type: &str, + enable_row_format: bool, + out: &mut SourceSender, + ) { + if !enable_row_format { + let event = 
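// Standalone sketch of the partition assignment performed in on_connected above: hash
// "instance_instance_type" with CRC32 and take it modulo the configured partition count,
// clamped to at least 1. Uses the crc32fast crate, as in the source.
fn instance_partition(instance: &str, instance_type: &str, partition_number: u32) -> u32 {
    let mut hasher = crc32fast::Hasher::new();
    hasher.update(format!("{}_{}", instance, instance_type).as_bytes());
    let crc = hasher.finalize();
    crc % partition_number.max(1)
}

// e.g. instance_partition("tikv-0:20180", "tikv", 4) is stable across restarts, so the same
// instance always lands in the same partition id.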
instance_event(instance.to_string(), instance_type.to_string()); + if out.send_event(event).await.is_err() { + StreamClosedError { count: 1 }.emit(); + } + } else { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "instance"); + log.insert("timestamps", LogValue::from(Utc::now().timestamp())); + log.insert("instance_type", instance_type.to_string()); + log.insert("instance", instance.to_string()); + + if out.send_event(event).await.is_err() { + StreamClosedError { count: 1 }.emit(); + } } } } @@ -290,6 +387,8 @@ impl LegacyTopSQLSource { top_n: usize, downsampling_interval: u32, schema_cache: Arc, + enable_row_format: bool, + partition_number: u32, ) -> Option { let base = BaseTopSQLSource::new( component, @@ -299,6 +398,8 @@ impl LegacyTopSQLSource { top_n, downsampling_interval, schema_cache, + enable_row_format, + partition_number, )?; Some(LegacyTopSQLSource { base }) } @@ -316,23 +417,60 @@ struct NextgenTopSQLBehavior { #[async_trait::async_trait] impl TopSQLSourceBehavior for NextgenTopSQLBehavior { - async fn handle_instance_event(&self, instance: &str, instance_type: &str, out: &mut SourceSender) { + async fn handle_instance_event( + &self, + instance: &str, + instance_type: &str, + enable_row_format: bool, + out: &mut SourceSender, + ) { let mut batch = vec![]; - let event = instance_event_metric(instance.to_string(), instance_type.to_string()); - batch.push(event); - for (cluster_id, (vm_account_id, vm_project_id)) in &self.keyspace_to_vmtenants { - let event = instance_event_with_tags( - instance.to_string(), - instance_type.to_string(), - cluster_id.clone(), - vm_account_id.clone(), - vm_project_id.clone(), - ); + if !enable_row_format { + let event = instance_event_metric(instance.to_string(), instance_type.to_string()); batch.push(event); - } - let count = batch.len(); - if out.send_batch(batch).await.is_err() { - StreamClosedError { count }.emit() + for (cluster_id, (vm_account_id, vm_project_id)) in &self.keyspace_to_vmtenants { + let event = instance_event_with_tags( + instance.to_string(), + instance_type.to_string(), + cluster_id.clone(), + vm_account_id.clone(), + vm_project_id.clone(), + ); + batch.push(event); + } + let count = batch.len(); + if out.send_batch(batch).await.is_err() { + StreamClosedError { count }.emit() + } + } else { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "instance"); + log.insert("timestamps", LogValue::from(Utc::now().timestamp())); + log.insert("instance_type", instance_type.to_string()); + log.insert("instance", instance.to_string()); + batch.push(event); + + for (cluster_id, (vm_account_id, vm_project_id)) in &self.keyspace_to_vmtenants { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + log.insert("source_table", "instance"); + log.insert("timestamps", LogValue::from(Utc::now().timestamp())); + log.insert("instance_type", instance_type.to_string()); + log.insert("instance", instance.to_string()); + log.insert("tidb_cluster_id", cluster_id.clone()); + log.insert("keyspace_name", cluster_id.clone()); + log.insert("vm_account_id", vm_account_id.clone()); + log.insert("vm_project_id", vm_project_id.clone()); + batch.push(event); + } + + let count = batch.len(); + if out.send_batch(batch).await.is_err() { + StreamClosedError { count }.emit() 
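// Sketch of the per-tenant fan-out above: one base instance record plus one record per
// keyspace -> (vm_account_id, vm_project_id) entry. Plain maps stand in for LogEvent here.
use std::collections::{BTreeMap, HashMap};

fn instance_rows(
    instance: &str,
    instance_type: &str,
    tenants: &HashMap<String, (String, String)>,
) -> Vec<BTreeMap<&'static str, String>> {
    let base: BTreeMap<&'static str, String> = BTreeMap::from([
        ("source_table", "instance".to_string()),
        ("instance", instance.to_string()),
        ("instance_type", instance_type.to_string()),
    ]);
    let mut rows = vec![base.clone()];
    for (cluster_id, (account, project)) in tenants {
        let mut row = base.clone();
        row.insert("tidb_cluster_id", cluster_id.clone());
        row.insert("keyspace_name", cluster_id.clone());
        row.insert("vm_account_id", account.clone());
        row.insert("vm_project_id", project.clone());
        rows.push(row);
    }
    rows
}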
+ } } } } @@ -353,6 +491,8 @@ impl NextgenTopSQLSource { downsampling_interval: u32, schema_cache: Arc, keyspace_to_vmtenants: HashMap, + enable_row_format: bool, + partition_number: u32, ) -> Option { let base = BaseTopSQLSource::new( component, @@ -362,8 +502,12 @@ impl NextgenTopSQLSource { top_n, downsampling_interval, schema_cache, + enable_row_format, + partition_number, )?; - let behavior = NextgenTopSQLBehavior { keyspace_to_vmtenants }; + let behavior = NextgenTopSQLBehavior { + keyspace_to_vmtenants, + }; Some(NextgenTopSQLSource { base, behavior }) } @@ -388,6 +532,8 @@ impl TopSQLSource { downsampling_interval: u32, schema_cache: Arc, keyspace_to_vmtenants: HashMap, + enable_row_format: bool, + partition_number: u32, ) -> Option { use crate::common::features::is_nextgen_mode; @@ -401,6 +547,8 @@ impl TopSQLSource { downsampling_interval, schema_cache, keyspace_to_vmtenants, + enable_row_format, + partition_number, )?; Some(TopSQLSource::Nextgen(source)) } else { @@ -412,6 +560,8 @@ impl TopSQLSource { top_n, downsampling_interval, schema_cache, + enable_row_format, + partition_number, )?; Some(TopSQLSource::Legacy(source)) } @@ -425,6 +575,7 @@ impl TopSQLSource { } } +#[derive(Debug)] enum State { RetryNow, RetryDelay, diff --git a/src/sources/topsql/upstream/parser.rs b/src/sources/topsql/upstream/parser.rs index e35ad00..fc06baf 100644 --- a/src/sources/topsql/upstream/parser.rs +++ b/src/sources/topsql/upstream/parser.rs @@ -6,7 +6,7 @@ use crate::sources::topsql::schema_cache::SchemaCache; use crate::sources::topsql::upstream::{ consts::{ LABEL_DB_NAME, LABEL_INSTANCE, LABEL_INSTANCE_TYPE, LABEL_NAME, LABEL_PLAN_DIGEST, - LABEL_SQL_DIGEST, LABEL_TABLE_ID, LABEL_TABLE_NAME, LABEL_TAG_LABEL, + LABEL_REGION_ID, LABEL_SQL_DIGEST, LABEL_TABLE_ID, LABEL_TABLE_NAME, LABEL_TAG_LABEL, }, utils::make_metric_like_log_event, }; @@ -18,6 +18,8 @@ pub trait UpstreamEventParser { event: Self::UpstreamEvent, instance: String, schema_cache: Arc, + enable_row_format: bool, + instance_partition_id: u32, ) -> Vec; fn keep_top_n(responses: Vec, top_n: usize) -> Vec; @@ -44,6 +46,7 @@ impl Default for Buf { (LABEL_DB_NAME, String::new()), (LABEL_TABLE_NAME, String::new()), (LABEL_TABLE_ID, String::new()), + (LABEL_REGION_ID, String::new()), ], timestamps: vec![], values: vec![], @@ -97,6 +100,11 @@ impl Buf { self } + pub fn region_id(&mut self, region_id: impl Into) -> &mut Self { + self.labels[9].1 = region_id.into(); + self + } + pub fn points(&mut self, points: impl Iterator) -> &mut Self { for (timestamp_sec, value) in points { self.timestamps.push( diff --git a/src/sources/topsql/upstream/tidb/mock_upstream.rs b/src/sources/topsql/upstream/tidb/mock_upstream.rs index 9a25552..13235b7 100644 --- a/src/sources/topsql/upstream/tidb/mock_upstream.rs +++ b/src/sources/topsql/upstream/tidb/mock_upstream.rs @@ -50,6 +50,8 @@ impl TopSqlPubSub for MockTopSqlPubSubServer { .collect(), stmt_duration_sum_ns: 30, stmt_duration_count: 20, + stmt_network_in_bytes: 40, + stmt_network_out_bytes: 50, }], keyspace_name: vec![], }; diff --git a/src/sources/topsql/upstream/tidb/mod.rs b/src/sources/topsql/upstream/tidb/mod.rs index 50ba7a3..3aa9cfd 100644 --- a/src/sources/topsql/upstream/tidb/mod.rs +++ b/src/sources/topsql/upstream/tidb/mod.rs @@ -55,4 +55,8 @@ impl Upstream for TiDBUpstream { .await .map(|r| r.into_inner()) } + + fn get_wait_seconds() -> u64 { + 0 + } } diff --git a/src/sources/topsql/upstream/tidb/parser.rs b/src/sources/topsql/upstream/tidb/parser.rs index a2b7ed1..e6fecbf 100644 --- 
a/src/sources/topsql/upstream/tidb/parser.rs +++ b/src/sources/topsql/upstream/tidb/parser.rs @@ -2,13 +2,14 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use chrono::Utc; -use vector::event::LogEvent; +use vector_lib::event::{Event, KeyString, LogEvent, Value as LogValue}; use crate::sources::topsql::schema_cache::SchemaCache; use crate::sources::topsql::upstream::consts::{ INSTANCE_TYPE_TIDB, INSTANCE_TYPE_TIKV, LABEL_ENCODED_NORMALIZED_PLAN, LABEL_IS_INTERNAL_SQL, LABEL_NAME, LABEL_NORMALIZED_PLAN, LABEL_NORMALIZED_SQL, LABEL_PLAN_DIGEST, LABEL_SQL_DIGEST, - METRIC_NAME_CPU_TIME_MS, METRIC_NAME_PLAN_META, METRIC_NAME_SQL_META, + METRIC_NAME_CPU_TIME_MS, METRIC_NAME_NETWORK_BYTES, METRIC_NAME_NETWORK_IN_BYTES, + METRIC_NAME_NETWORK_OUT_BYTES, METRIC_NAME_PLAN_META, METRIC_NAME_SQL_META, METRIC_NAME_STMT_DURATION_COUNT, METRIC_NAME_STMT_DURATION_SUM_NS, METRIC_NAME_STMT_EXEC_COUNT, }; use crate::sources::topsql::upstream::parser::{Buf, UpstreamEventParser}; @@ -27,12 +28,29 @@ impl UpstreamEventParser for TopSqlSubResponseParser { response: Self::UpstreamEvent, instance: String, _schema_cache: Arc, + enable_row_format: bool, + instance_partition_id: u32, ) -> Vec { - match response.resp_oneof { - Some(RespOneof::Record(record)) => Self::parse_tidb_record(record, instance), - Some(RespOneof::SqlMeta(sql_meta)) => Self::parse_tidb_sql_meta(sql_meta), - Some(RespOneof::PlanMeta(plan_meta)) => Self::parse_tidb_plan_meta(plan_meta), - None => vec![], + if !enable_row_format { + match response.resp_oneof { + Some(RespOneof::Record(record)) => Self::parse_tidb_record(record, instance), + Some(RespOneof::SqlMeta(sql_meta)) => Self::parse_tidb_sql_meta(sql_meta), + Some(RespOneof::PlanMeta(plan_meta)) => Self::parse_tidb_plan_meta(plan_meta), + None => vec![], + } + } else { + match response.resp_oneof { + Some(RespOneof::Record(record)) => { + Self::parse_tidb_record_to_row_format(record, instance, instance_partition_id) + } + Some(RespOneof::SqlMeta(sql_meta)) => { + Self::parse_tidb_sql_meta_to_row_format(sql_meta) + } + Some(RespOneof::PlanMeta(plan_meta)) => { + Self::parse_tidb_plan_meta_to_row_format(plan_meta) + } + None => vec![], + } } } @@ -45,6 +63,8 @@ impl UpstreamEventParser for TopSqlSubResponseParser { stmt_kv_exec_count: BTreeMap, stmt_duration_sum_ns: u64, stmt_duration_count: u64, + stmt_network_in_bytes: u64, + stmt_network_out_bytes: u64, } let mut new_responses = vec![]; @@ -66,6 +86,8 @@ impl UpstreamEventParser for TopSqlSubResponseParser { stmt_kv_exec_count: item.stmt_kv_exec_count.clone(), stmt_duration_sum_ns: item.stmt_duration_sum_ns, stmt_duration_count: item.stmt_duration_count, + stmt_network_in_bytes: item.stmt_network_in_bytes, + stmt_network_out_bytes: item.stmt_network_out_bytes, }; match ts_digests.get_mut(&item.timestamp_sec) { None => { @@ -92,9 +114,9 @@ impl UpstreamEventParser for TopSqlSubResponseParser { for e in evicted { others.timestamp_sec = *ts; others.cpu_time_ms += e.cpu_time_ms; - others.stmt_exec_count = e.stmt_exec_count; - others.stmt_duration_sum_ns = e.stmt_duration_sum_ns; - others.stmt_duration_count = e.stmt_duration_count; + others.stmt_exec_count += e.stmt_exec_count; + others.stmt_duration_sum_ns += e.stmt_duration_sum_ns; + others.stmt_duration_count += e.stmt_duration_count; for (k, v) in &e.stmt_kv_exec_count { match others.stmt_kv_exec_count.get(k) { None => { @@ -105,6 +127,8 @@ impl UpstreamEventParser for TopSqlSubResponseParser { } } } + others.stmt_network_in_bytes += e.stmt_network_in_bytes; + 
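// Sketch of the keep-top-n rule used in this parser: per timestamp, keep the n heaviest
// digests by CPU time and fold everything evicted into a single "others" entry whose
// counters are summed (the `=` -> `+=` fixes in this hunk matter here: plain assignment
// would silently keep only the last evicted item). Simplified to one counter, std only.
fn keep_top_n(mut items: Vec<(String, u64)>, n: usize) -> Vec<(String, u64)> {
    items.sort_by(|a, b| b.1.cmp(&a.1)); // heaviest first
    if items.len() <= n {
        return items;
    }
    let evicted: u64 = items[n..].iter().map(|(_, v)| v).sum();
    items.truncate(n);
    items.push(("others".to_string(), evicted));
    items
}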
others.stmt_network_out_bytes += e.stmt_network_out_bytes; } v.truncate(top_n); match ts_others.get_mut(&ts) { @@ -128,6 +152,8 @@ impl UpstreamEventParser for TopSqlSubResponseParser { } } } + existed_others.stmt_network_in_bytes += others.stmt_network_in_bytes; + existed_others.stmt_network_out_bytes += others.stmt_network_out_bytes; } } } @@ -143,6 +169,8 @@ impl UpstreamEventParser for TopSqlSubResponseParser { stmt_kv_exec_count: psd.stmt_kv_exec_count.clone(), stmt_duration_sum_ns: psd.stmt_duration_sum_ns, stmt_duration_count: psd.stmt_duration_count, + stmt_network_in_bytes: psd.stmt_network_in_bytes, + stmt_network_out_bytes: psd.stmt_network_out_bytes, }; match digest_items.get_mut(&k) { None => { @@ -280,6 +308,8 @@ impl UpstreamEventParser for TopSqlSubResponseParser { } } } + new_item.stmt_network_in_bytes += item.stmt_network_in_bytes; + new_item.stmt_network_out_bytes += item.stmt_network_out_bytes; new_items.insert(new_ts, new_item); } } @@ -328,6 +358,22 @@ impl TopSqlSubResponseParser { (METRIC_NAME_STMT_DURATION_COUNT, stmt_duration_count), ); + // stmt_network_in_bytes + stmt_network_out_bytes + buf.label_name(METRIC_NAME_NETWORK_BYTES) + .points(record.items.iter().filter_map(|item| { + if item.stmt_network_in_bytes > 0 || item.stmt_network_out_bytes > 0 { + Some(( + item.timestamp_sec, + (item.stmt_network_in_bytes + item.stmt_network_out_bytes) as f64, + )) + } else { + None + } + })); + if let Some(event) = buf.build_event() { + logs.push(event); + } + // stmt_kv_exec_count buf.label_name(METRIC_NAME_STMT_EXEC_COUNT) .instance_type(INSTANCE_TYPE_TIKV); @@ -388,6 +434,98 @@ impl TopSqlSubResponseParser { &[1.0], )] } + + // TODO: consider apply chunk style LogEvent for better performance + fn parse_tidb_record_to_row_format(record: TopSqlRecord, instance: String, instance_partition_id: u32) -> Vec { + let mut events = vec![]; + for item in &record.items { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tidb_topsql"); + log.insert("timestamps", LogValue::from(item.timestamp_sec)); + log.insert("instance_type", INSTANCE_TYPE_TIDB.to_string()); + log.insert("instance", instance.clone()); + log.insert("instance_partition_id", LogValue::from(instance_partition_id as i64)); + log.insert( + LABEL_SQL_DIGEST, + hex::encode_upper(record.sql_digest.clone()), + ); + log.insert( + LABEL_PLAN_DIGEST, + hex::encode_upper(record.plan_digest.clone()), + ); + log.insert(METRIC_NAME_CPU_TIME_MS, LogValue::from(item.cpu_time_ms)); + log.insert( + METRIC_NAME_STMT_EXEC_COUNT, + LogValue::from(item.stmt_exec_count), + ); + log.insert( + METRIC_NAME_STMT_DURATION_SUM_NS, + LogValue::from(item.stmt_duration_sum_ns), + ); + log.insert( + METRIC_NAME_STMT_DURATION_COUNT, + LogValue::from(item.stmt_duration_count), + ); + log.insert( + METRIC_NAME_NETWORK_IN_BYTES, + LogValue::from(item.stmt_network_in_bytes), + ); + log.insert( + METRIC_NAME_NETWORK_OUT_BYTES, + LogValue::from(item.stmt_network_out_bytes), + ); + let mut tikv_exec_count = BTreeMap::::new(); + for (tikv_instance, exec_count) in item.stmt_kv_exec_count.iter() { + tikv_exec_count.insert( + KeyString::from(tikv_instance.as_str()), + LogValue::from(*exec_count), + ); + } + log.insert( + "topsql_tikv_stmt_exec_count", + LogValue::Object(tikv_exec_count), + ); + events.push(event.into_log()); + } + events + } + + fn parse_tidb_sql_meta_to_row_format(sql_meta: SqlMeta) -> Vec { + let mut events = 
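// Sketch of the row-format shape produced by parse_tidb_record_to_row_format: one flat
// record per (digest pair, timestamp) with counters as columns and the per-TiKV exec counts
// kept as a nested map. The struct is a hypothetical stand-in for LogEvent; hex::encode_upper
// is the same helper used in the source.
use std::collections::BTreeMap;

struct TidbRow {
    source_table: &'static str,
    timestamp_sec: u64,
    sql_digest: String,
    plan_digest: String,
    cpu_time_ms: u32,
    tikv_stmt_exec_count: BTreeMap<String, u64>,
}

fn tidb_row(
    ts: u64,
    sql_digest: &[u8],
    plan_digest: &[u8],
    cpu_time_ms: u32,
    tikv_stmt_exec_count: BTreeMap<String, u64>,
) -> TidbRow {
    TidbRow {
        source_table: "tidb_topsql",
        timestamp_sec: ts,
        sql_digest: hex::encode_upper(sql_digest),
        plan_digest: hex::encode_upper(plan_digest),
        cpu_time_ms,
        tikv_stmt_exec_count,
    }
}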
vec![]; + let sql_digest = hex::encode_upper(sql_meta.sql_digest); + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + log.insert("source_table", "tidb_sql_meta"); + log.insert(LABEL_SQL_DIGEST, sql_digest); + log.insert(LABEL_NORMALIZED_SQL, sql_meta.normalized_sql); + log.insert(LABEL_IS_INTERNAL_SQL, sql_meta.is_internal_sql.to_string()); + events.push(event.into_log()); + events + } + + fn parse_tidb_plan_meta_to_row_format(plan_meta: PlanMeta) -> Vec { + let mut events = vec![]; + let plan_digest = hex::encode_upper(plan_meta.plan_digest); + let encoded_normalized_plan = + hex::encode_upper(plan_meta.encoded_normalized_plan); + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tidb_plan_meta"); + log.insert(LABEL_PLAN_DIGEST, plan_digest); + log.insert(LABEL_NORMALIZED_PLAN, plan_meta.normalized_plan); + log.insert( + LABEL_ENCODED_NORMALIZED_PLAN, + encoded_normalized_plan.clone(), + ); + events.push(event.into_log()); + events + } } #[cfg(test)] @@ -432,6 +570,8 @@ mod tests { stmt_kv_exec_count: i.stmt_kv_exec_count, stmt_duration_sum_ns: i.stmt_duration_sum_ns, stmt_duration_count: i.stmt_duration_count, + stmt_network_in_bytes: 0, + stmt_network_out_bytes: 0, }) .collect(), keyspace_name: vec![], diff --git a/src/sources/topsql/upstream/tikv/mock_upstream.rs b/src/sources/topsql/upstream/tikv/mock_upstream.rs index 65f721b..f565f3e 100644 --- a/src/sources/topsql/upstream/tikv/mock_upstream.rs +++ b/src/sources/topsql/upstream/tikv/mock_upstream.rs @@ -56,6 +56,10 @@ impl ResourceMeteringPubSub for MockResourceMeteringPubSubServer { cpu_time_ms: 10, read_keys: 20, write_keys: 30, + logical_read_bytes: 100, + logical_write_bytes: 200, + network_in_bytes: 300, + network_out_bytes: 400, }], })), })])) as Self::SubscribeStream, diff --git a/src/sources/topsql/upstream/tikv/mod.rs b/src/sources/topsql/upstream/tikv/mod.rs index fafd05f..72f74cc 100644 --- a/src/sources/topsql/upstream/tikv/mod.rs +++ b/src/sources/topsql/upstream/tikv/mod.rs @@ -55,4 +55,8 @@ impl Upstream for TiKVUpstream { .await .map(|r| r.into_inner()) } + + fn get_wait_seconds() -> u64 { + 59 + } } diff --git a/src/sources/topsql/upstream/tikv/parser.rs b/src/sources/topsql/upstream/tikv/parser.rs index f0872d2..8b54c42 100644 --- a/src/sources/topsql/upstream/tikv/parser.rs +++ b/src/sources/topsql/upstream/tikv/parser.rs @@ -1,20 +1,22 @@ -use std::collections::{BTreeMap, HashMap}; -use std::sync::Arc; - -use prost::Message; -use vector::event::LogEvent; - use crate::sources::topsql::schema_cache::SchemaCache; use crate::sources::topsql::upstream::consts::{ INSTANCE_TYPE_TIKV, KV_TAG_LABEL_INDEX, KV_TAG_LABEL_ROW, KV_TAG_LABEL_UNKNOWN, - METRIC_NAME_CPU_TIME_MS, METRIC_NAME_READ_KEYS, METRIC_NAME_WRITE_KEYS, + LABEL_PLAN_DIGEST, LABEL_REGION_ID, LABEL_SQL_DIGEST, METRIC_NAME_CPU_TIME_MS, + METRIC_NAME_LOGICAL_BYTES, METRIC_NAME_LOGICAL_READ_BYTES, METRIC_NAME_LOGICAL_WRITE_BYTES, + METRIC_NAME_NETWORK_BYTES, METRIC_NAME_NETWORK_IN_BYTES, METRIC_NAME_NETWORK_OUT_BYTES, + METRIC_NAME_READ_KEYS, METRIC_NAME_WRITE_KEYS, }; use crate::sources::topsql::upstream::parser::{Buf, UpstreamEventParser}; use crate::sources::topsql::upstream::tidb::proto::ResourceGroupTag; use crate::sources::topsql::upstream::tikv::proto::resource_usage_record::RecordOneof; use crate::sources::topsql::upstream::tikv::proto::{ - GroupTagRecord, GroupTagRecordItem, 
ResourceUsageRecord, + GroupTagRecord, GroupTagRecordItem, RegionRecord, ResourceUsageRecord, }; +use chrono::{DateTime, Timelike}; +use prost::Message; +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; +use vector_lib::event::{Event, LogEvent, Value as LogValue}; pub struct ResourceUsageRecordParser; @@ -25,12 +27,30 @@ impl UpstreamEventParser for ResourceUsageRecordParser { response: Self::UpstreamEvent, instance: String, schema_cache: Arc, + enable_row_format: bool, + instance_partition_id: u32, ) -> Vec { - match response.record_oneof { - Some(RecordOneof::Record(record)) => { - Self::parse_tikv_record(record, instance, schema_cache) + info!(message = "parse tikv record", instance__A = %instance, instance_partition_BB = %instance_partition_id); + if !enable_row_format { + match response.record_oneof { + Some(RecordOneof::Record(record)) => { + Self::parse_tikv_record(record, instance, schema_cache) + } + Some(RecordOneof::RegionRecord(record)) => { + Self::parse_tikv_region_record(record, instance, schema_cache) + } + None => vec![], + } + } else { + match response.record_oneof { + Some(RecordOneof::Record(record)) => { + Self::parse_tikv_record_for_row_format(record, instance, schema_cache, instance_partition_id) + } + Some(RecordOneof::RegionRecord(record)) => { + Self::parse_tikv_region_record_for_row_format(record, instance, schema_cache, instance_partition_id) + } + None => vec![], } - None => vec![], } } @@ -40,6 +60,10 @@ impl UpstreamEventParser for ResourceUsageRecordParser { cpu_time_ms: u32, read_keys: u32, write_keys: u32, + network_in_bytes: u64, + network_out_bytes: u64, + logical_read_bytes: u64, + logical_write_bytes: u64, } let mut new_responses = vec![]; @@ -62,6 +86,10 @@ impl UpstreamEventParser for ResourceUsageRecordParser { cpu_time_ms: item.cpu_time_ms, read_keys: item.read_keys, write_keys: item.write_keys, + network_in_bytes: item.network_in_bytes, + network_out_bytes: item.network_out_bytes, + logical_read_bytes: item.logical_read_bytes, + logical_write_bytes: item.logical_write_bytes, }; match ts_digests.get_mut(&item.timestamp_sec) { None => { @@ -90,6 +118,10 @@ impl UpstreamEventParser for ResourceUsageRecordParser { others.cpu_time_ms += e.cpu_time_ms; others.read_keys += e.read_keys; others.write_keys += e.write_keys; + others.network_in_bytes += e.network_in_bytes; + others.network_out_bytes += e.network_out_bytes; + others.logical_read_bytes += e.logical_read_bytes; + others.logical_write_bytes += e.logical_write_bytes; } v.truncate(top_n); match ts_others.get_mut(&ts) { @@ -100,6 +132,10 @@ impl UpstreamEventParser for ResourceUsageRecordParser { existed_others.cpu_time_ms += others.cpu_time_ms; existed_others.read_keys += others.read_keys; existed_others.write_keys += others.write_keys; + existed_others.network_in_bytes += others.network_in_bytes; + existed_others.network_out_bytes += others.network_out_bytes; + existed_others.logical_read_bytes += others.logical_read_bytes; + existed_others.logical_write_bytes += others.logical_write_bytes; } } } @@ -112,6 +148,10 @@ impl UpstreamEventParser for ResourceUsageRecordParser { cpu_time_ms: psd.cpu_time_ms, read_keys: psd.read_keys, write_keys: psd.write_keys, + network_in_bytes: psd.network_in_bytes, + network_out_bytes: psd.network_out_bytes, + logical_read_bytes: psd.logical_read_bytes, + logical_write_bytes: psd.logical_write_bytes, }; match digest_items.get_mut(&psd.resource_group_tag) { None => { @@ -161,6 +201,35 @@ impl UpstreamEventParser for ResourceUsageRecordParser { 
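// Sketch of the roll-up bucket used by the downsampling in this file: timestamps are moved
// up to the next interval boundary (a timestamp already on a boundary moves up a full
// interval), and all items landing in the same bucket have their counters summed.
fn downsample_bucket(timestamp_sec: u64, interval_sec: u64) -> u64 {
    timestamp_sec + (interval_sec - timestamp_sec % interval_sec)
}

// e.g. with interval_sec = 60: downsample_bucket(61, 60) == 120, downsample_bucket(120, 60) == 180.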
new_item.cpu_time_ms += item.cpu_time_ms; new_item.read_keys += item.read_keys; new_item.write_keys += item.write_keys; + new_item.network_in_bytes += item.network_in_bytes; + new_item.network_out_bytes += item.network_out_bytes; + new_item.logical_read_bytes += item.logical_read_bytes; + new_item.logical_write_bytes += item.logical_write_bytes; + new_items.insert(new_ts, new_item); + } + } + } + record.items = new_items.into_values().collect(); + } else if let Some(RecordOneof::RegionRecord(record)) = &mut response.record_oneof { + let mut new_items = BTreeMap::new(); + for item in &record.items { + let new_ts = + item.timestamp_sec + (interval_sec - item.timestamp_sec % interval_sec); + match new_items.get(&new_ts) { + None => { + let mut new_item = item.clone(); + new_item.timestamp_sec = new_ts; + new_items.insert(new_ts, new_item); + } + Some(existed_item) => { + let mut new_item = existed_item.clone(); + new_item.cpu_time_ms += item.cpu_time_ms; + new_item.read_keys += item.read_keys; + new_item.write_keys += item.write_keys; + new_item.network_in_bytes += item.network_in_bytes; + new_item.network_out_bytes += item.network_out_bytes; + new_item.logical_read_bytes += item.logical_read_bytes; + new_item.logical_write_bytes += item.logical_write_bytes; new_items.insert(new_ts, new_item); } } @@ -241,9 +310,263 @@ impl ResourceUsageRecordParser { (METRIC_NAME_WRITE_KEYS, write_keys), ); + // network_in_bytes + network_out_bytes + buf.label_name(METRIC_NAME_NETWORK_BYTES) + .points(record.items.iter().filter_map(|item| { + if item.network_in_bytes > 0 || item.network_out_bytes > 0 { + Some(( + item.timestamp_sec, + (item.network_in_bytes + item.network_out_bytes) as f64, + )) + } else { + None + } + })); + if let Some(event) = buf.build_event() { + logs.push(event); + } + + // logical_read_bytes + logical_write_bytes + buf.label_name(METRIC_NAME_LOGICAL_BYTES) + .points(record.items.iter().filter_map(|item| { + if item.logical_read_bytes > 0 || item.logical_write_bytes > 0 { + Some(( + item.timestamp_sec, + (item.logical_read_bytes + item.logical_write_bytes) as f64, + )) + } else { + None + } + })); + if let Some(event) = buf.build_event() { + logs.push(event); + } + + logs + } + + fn parse_tikv_region_record( + record: RegionRecord, + instance: String, + schema_cache: Arc, + ) -> Vec { + // Log schema cache info + debug!( + message = "Schema cache available in parse_tikv_record", + entries = schema_cache.entry_count(), + schema_version = schema_cache.schema_version() + ); + let mut logs = vec![]; + let mut buf = Buf::default(); + buf.instance(instance) + .instance_type(INSTANCE_TYPE_TIKV) + .region_id(record.region_id.to_string()); + + macro_rules! 
append { + ($( ($label_name:expr, $item_name:tt), )* ) => { + $( + buf.label_name($label_name) + .points(record.items.iter().filter_map(|item| { + if item.$item_name > 0 { + Some((item.timestamp_sec, item.$item_name as f64)) + } else { + None + } + })); + if let Some(event) = buf.build_event() { + logs.push(event); + } + )* + }; + } + append!( + // cpu_time_ms + (METRIC_NAME_CPU_TIME_MS, cpu_time_ms), + // read_keys + (METRIC_NAME_READ_KEYS, read_keys), + // write_keys + (METRIC_NAME_WRITE_KEYS, write_keys), + ); + + // network_in_bytes + network_out_bytes + buf.label_name(METRIC_NAME_NETWORK_BYTES) + .points(record.items.iter().filter_map(|item| { + if item.network_in_bytes > 0 || item.network_out_bytes > 0 { + Some(( + item.timestamp_sec, + (item.network_in_bytes + item.network_out_bytes) as f64, + )) + } else { + None + } + })); + if let Some(event) = buf.build_event() { + logs.push(event); + } + + // logical_read_bytes + logical_write_bytes + buf.label_name(METRIC_NAME_LOGICAL_BYTES) + .points(record.items.iter().filter_map(|item| { + if item.logical_read_bytes > 0 || item.logical_write_bytes > 0 { + Some(( + item.timestamp_sec, + (item.logical_read_bytes + item.logical_write_bytes) as f64, + )) + } else { + None + } + })); + if let Some(event) = buf.build_event() { + logs.push(event); + } logs } + fn parse_tikv_record_for_row_format( + record: GroupTagRecord, + instance: String, + schema_cache: Arc, + instance_partition_id: u32, + ) -> Vec { + // Log schema cache info + debug!( + message = "Schema cache available in parse_tikv_record", + entries = schema_cache.entry_count(), + schema_version = schema_cache.schema_version() + ); + + let decoded = Self::decode_tag(record.resource_group_tag.as_slice()); + if decoded.is_none() { + return vec![]; + } + let (sql_digest, plan_digest, tag_label, table_id) = decoded.unwrap(); + + let mut db_name = "".to_string(); + let mut table_name = "".to_string(); + let mut table_id_str = "".to_string(); + + if let Some(tid) = table_id { + table_id_str = tid.to_string(); + if let Some(table_detail) = schema_cache.get(tid) { + db_name = table_detail.db.clone(); + table_name = table_detail.name; + } + } + let mut events = vec![]; + for item in &record.items { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tikv_topsql"); + log.insert("timestamps", LogValue::from(item.timestamp_sec)); + // Calculate datetime string: %Y-%m-%d %H where %H is time slot index (0-3) + // Skip current item if timestamp conversion fails + let dt = match DateTime::from_timestamp(item.timestamp_sec as i64, 0) { + Some(dt) => dt, + None => continue, + }; + let naive_dt = dt.naive_utc(); + let date = naive_dt.date(); + let hour = naive_dt.hour(); + // Calculate time slot index: 0-6=0, 6-12=1, 12-18=2, 18-24=3 + let time_slot = (hour / 6) as u32; + let datetime_str = format!("{} {}", date.format("%Y-%m-%d"), time_slot); + log.insert("datetime", LogValue::from(datetime_str)); + log.insert("instance_type", INSTANCE_TYPE_TIKV.to_string()); + log.insert("instance", instance.clone()); + log.insert("instance_partition_id", LogValue::from(instance_partition_id as i64)); + log.insert(LABEL_SQL_DIGEST, sql_digest.clone()); + log.insert(LABEL_PLAN_DIGEST, plan_digest.clone()); + log.insert("tag_label", tag_label.clone()); + log.insert("db_name", db_name.clone()); + log.insert("table_name", table_name.clone()); + log.insert("table_id", table_id_str.clone()); + 
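// Sketch of the `datetime` partition key computed in the row-format parsers below:
// "%Y-%m-%d S" where S is a 6-hour slot index (0 for 00:00-05:59, 1 for 06:00-11:59,
// 2 for 12:00-17:59, 3 for 18:00-23:59). Uses chrono, as in the source; returns None for
// out-of-range timestamps, which the caller skips.
use chrono::{DateTime, Timelike};

fn datetime_slot(timestamp_sec: i64) -> Option<String> {
    let dt = DateTime::from_timestamp(timestamp_sec, 0)?;
    let naive = dt.naive_utc();
    let slot = naive.hour() / 6; // 0..=3
    Some(format!("{} {}", naive.date().format("%Y-%m-%d"), slot))
}

// e.g. datetime_slot(0) == Some("1970-01-01 0".to_string())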
log.insert(METRIC_NAME_CPU_TIME_MS, LogValue::from(item.cpu_time_ms)); + log.insert(METRIC_NAME_READ_KEYS, LogValue::from(item.read_keys)); + log.insert(METRIC_NAME_WRITE_KEYS, LogValue::from(item.write_keys)); + log.insert( + METRIC_NAME_NETWORK_IN_BYTES, + LogValue::from(item.network_in_bytes), + ); + log.insert( + METRIC_NAME_NETWORK_OUT_BYTES, + LogValue::from(item.network_out_bytes), + ); + log.insert( + METRIC_NAME_LOGICAL_READ_BYTES, + LogValue::from(item.logical_read_bytes), + ); + log.insert( + METRIC_NAME_LOGICAL_WRITE_BYTES, + LogValue::from(item.logical_write_bytes), + ); + events.push(event.into_log()); + } + events + } + + fn parse_tikv_region_record_for_row_format( + record: RegionRecord, + instance: String, + schema_cache: Arc, + instance_partition_id: u32, + ) -> Vec { + // Log schema cache info + debug!( + message = "Schema cache available in parse_tikv_record", + entries = schema_cache.entry_count(), + schema_version = schema_cache.schema_version() + ); + let mut events = vec![]; + for item in &record.items { + let mut event = Event::Log(LogEvent::default()); + let log = event.as_mut_log(); + + // Add metadata with Vector prefix (ensure all fields have values) + log.insert("source_table", "tikv_topregion"); + log.insert("timestamps", LogValue::from(item.timestamp_sec as i64)); + // Calculate datetime string: %Y-%m-%d %H where %H is time slot index (0-3) + // Skip current item if timestamp conversion fails + let dt = match DateTime::from_timestamp(item.timestamp_sec as i64, 0) { + Some(dt) => dt, + None => continue, + }; + let naive_dt = dt.naive_utc(); + let date = naive_dt.date(); + let hour = naive_dt.hour(); + // Calculate time slot index: 0-6=0, 6-12=1, 12-18=2, 18-24=3 + let time_slot = (hour / 6) as u32; + let datetime_str = format!("{} {}", date.format("%Y-%m-%d"), time_slot); + log.insert("datetime", LogValue::from(datetime_str)); + log.insert("instance_type", INSTANCE_TYPE_TIKV.to_string()); + log.insert("instance", instance.clone()); + log.insert("instance_partition_id", LogValue::from(instance_partition_id as i64)); + log.insert(LABEL_REGION_ID, record.region_id.to_string()); + log.insert(METRIC_NAME_CPU_TIME_MS, LogValue::from(item.cpu_time_ms)); + log.insert(METRIC_NAME_READ_KEYS, LogValue::from(item.read_keys)); + log.insert(METRIC_NAME_WRITE_KEYS, LogValue::from(item.write_keys)); + log.insert( + METRIC_NAME_NETWORK_IN_BYTES, + LogValue::from(item.network_in_bytes), + ); + log.insert( + METRIC_NAME_NETWORK_OUT_BYTES, + LogValue::from(item.network_out_bytes), + ); + log.insert( + METRIC_NAME_LOGICAL_READ_BYTES, + LogValue::from(item.logical_read_bytes), + ); + log.insert( + METRIC_NAME_LOGICAL_WRITE_BYTES, + LogValue::from(item.logical_write_bytes), + ); + events.push(event.into_log()); + } + events + } + fn decode_tag(tag: &[u8]) -> Option<(String, String, String, Option)> { match ResourceGroupTag::decode(tag) { Ok(resource_tag) => { @@ -331,6 +654,10 @@ mod tests { cpu_time_ms: i.cpu_time_ms, read_keys: i.read_keys, write_keys: i.write_keys, + network_in_bytes: 0, + network_out_bytes: 0, + logical_read_bytes: 0, + logical_write_bytes: 0, }) .collect(), })), diff --git a/src/sources/topsql/upstream/tikv/proto.rs b/src/sources/topsql/upstream/tikv/proto.rs index 26239cd..d125dcf 100644 --- a/src/sources/topsql/upstream/tikv/proto.rs +++ b/src/sources/topsql/upstream/tikv/proto.rs @@ -16,6 +16,9 @@ impl ByteSizeOf for RecordOneof { fn allocated_bytes(&self) -> usize { match self { RecordOneof::Record(record) => record.resource_group_tag.len() + 
record.items.size_of(), + RecordOneof::RegionRecord(record) => { + record.region_id.size_of() + record.items.size_of() + } } } } diff --git a/system_tables_example.yaml b/system_tables_example.yaml new file mode 100644 index 0000000..0afb7ce --- /dev/null +++ b/system_tables_example.yaml @@ -0,0 +1,134 @@ +sources: + # Legacy collector system (backward compatibility) + system_tables_legacy: + type: system_tables + use_abstracted_collectors: false + collection_method: "sql" + database_username: "root" + database_password: "" + database_host: "127.0.0.1" + database_port: 4000 + database_name: "test" + short_interval: 5 + long_interval: 1800 + retention_days: 7 + tables: + - source_schema: "information_schema" + source_table: "PROCESSLIST" + dest_table: "hist_processlist" + collection_interval: "short" + where_clause: "command != 'Sleep'" + enabled: true + + # New abstracted collector system - SQL method + system_tables_sql_v2: + type: system_tables + use_abstracted_collectors: true + collection_method: "sql" + database_username: "root" + database_password: "" + database_host: "127.0.0.1" + database_port: 4000 + database_name: "test" + short_interval: 5 + long_interval: 1800 + retention_days: 7 + tables: + - source_schema: "information_schema" + source_table: "PROCESSLIST" + dest_table: "hist_processlist_v2" + collection_interval: "short" + where_clause: "command != 'Sleep'" + enabled: true + - source_schema: "information_schema" + source_table: "USER_PRIVILEGES" + dest_table: "hist_user_privileges" + collection_interval: "long" + enabled: true + + # New abstracted collector system - Coprocessor method + system_tables_coprocessor_v2: + type: system_tables + use_abstracted_collectors: true + collection_method: "coprocessor" + database_username: "root" + database_password: "" + database_host: "127.0.0.1" + database_port: 4000 + database_name: "test" + short_interval: 10 + long_interval: 1800 + retention_days: 7 + tables: + - source_schema: "information_schema" + source_table: "CLUSTER_STATEMENTS_SUMMARY" + dest_table: "hist_cluster_statements_summary_v2" + collection_interval: "short" + enabled: true + - source_schema: "information_schema" + source_table: "CLUSTER_SLOW_QUERY" + dest_table: "hist_cluster_slow_query_v2" + collection_interval: "long" + enabled: true + + # New abstracted collector system - HTTP API method + system_tables_http_api: + type: system_tables + use_abstracted_collectors: true + collection_method: "http_api" + database_username: "root" + database_password: "" + database_host: "127.0.0.1" + database_port: 4000 + database_name: "test" + short_interval: 15 + long_interval: 1800 + retention_days: 7 + tables: + - source_schema: "information_schema" + source_table: "CLUSTER_CONFIG" + dest_table: "hist_cluster_config" + collection_interval: "long" + enabled: true + - source_schema: "metrics_schema" + source_table: "CLUSTER_HARDWARE" + dest_table: "hist_cluster_hardware" + collection_interval: "short" + enabled: true + + # New abstracted collector system - Custom gRPC method + system_tables_custom_grpc: + type: system_tables + use_abstracted_collectors: true + collection_method: "custom_grpc" + database_username: "root" + database_password: "" + database_host: "127.0.0.1" + database_port: 4000 + database_name: "test" + short_interval: 20 + long_interval: 1800 + retention_days: 7 + tables: + - source_schema: "custom_schema" + source_table: "CLUSTER_CUSTOM_METRICS" + dest_table: "hist_custom_metrics" + collection_interval: "short" + enabled: true + - source_schema: "custom_schema" + 
source_table: "CLUSTER_CUSTOM_EVENTS" + dest_table: "hist_custom_events" + collection_interval: "long" + enabled: true + +sinks: + stdout: + inputs: + - system_tables_legacy + - system_tables_sql_v2 + - system_tables_coprocessor_v2 + - system_tables_http_api + - system_tables_custom_grpc + type: console + encoding: + codec: json \ No newline at end of file diff --git a/test_config_cluster_tables.toml b/test_config_cluster_tables.toml new file mode 100644 index 0000000..02edbbd --- /dev/null +++ b/test_config_cluster_tables.toml @@ -0,0 +1,70 @@ +[sources.tidb_system_tables] +type = "system_tables" + +# Database connection configuration +database_username = "root" +database_password = "" +database_host = "127.0.0.1" +database_port = 4000 +database_name = "information_schema" +database_max_connections = 5 +database_connect_timeout = 30 + +# PD configuration (for topology discovery) +pd_address = "127.0.0.1:2379" + +# Database TLS configuration (for schema fetching and HTTP API calls) +database_tls.verify_certificate = false +database_tls.verify_hostname = false +# database_tls.ca_file = "/path/to/ca.pem" # Optional: CA certificate file +# database_tls.crt_file = "/path/to/client.pem" # Optional: Client certificate file +# database_tls.key_file = "/path/to/client-key.pem" # Optional: Client private key file + +# PD TLS configuration (for topology discovery) +pd_tls.verify_certificate = false +pd_tls.verify_hostname = false +# pd_tls.ca_file = "/path/to/pd-ca.pem" # Optional: PD CA certificate file +# pd_tls.crt_file = "/path/to/pd-client.pem" # Optional: PD client certificate file +# pd_tls.key_file = "/path/to/pd-client-key.pem" # Optional: PD client private key file + +# Nextgen mode configuration +tidb_group = "default" # Required for nextgen mode + +# Collection configuration +short_interval = 5 +long_interval = 30 +retention_days = 1 +topology_fetch_interval_seconds = 10.0 + +# Collection method configuration +collection_method = "coprocessor" # Use coprocessor for cluster tables + +# Test CLUSTER_STATEMENTS_SUMMARY table +[[sources.tidb_system_tables.tables]] +source_schema = "information_schema" +source_table = "CLUSTER_STATEMENTS_SUMMARY" +dest_table = "cluster_statements_summary" +collection_interval = "short" +enabled = true + +# Test CLUSTER_TIDB_STATEMENTS_STATS table +[[sources.tidb_system_tables.tables]] +source_schema = "information_schema" +source_table = "CLUSTER_TIDB_STATEMENTS_STATS" +dest_table = "cluster_tidb_statements_stats" +collection_interval = "short" +enabled = true + +# Output to Delta Lake (local storage, using default partitioning) +[sinks.deltalake] +type = "deltalake" +inputs = ["tidb_system_tables"] +base_path = "./test_data/deltalake" +batch_size = 100 +timeout_secs = 30 +compression = "snappy" + +# Local storage options +[sinks.deltalake.storage_options] +# Use local file system +"file.enable_move" = "true" diff --git a/test_config_coprocessor_deltalake.toml b/test_config_coprocessor_deltalake.toml new file mode 100644 index 0000000..c68c5de --- /dev/null +++ b/test_config_coprocessor_deltalake.toml @@ -0,0 +1,48 @@ +# Vector Configuration File - System Tables Source Test +# Collection method: coprocessor, Output: deltalake + +[sources.tidb_system_tables] +type = "system_tables" + +# Database connection configuration +database_username = "root" +database_password = "" +database_host = "127.0.0.1" +database_port = 4000 +database_name = "test_db" +database_max_connections = 5 +database_connect_timeout = 30 + +# PD configuration (for topology discovery) 
+pd_address = "127.0.0.1:2379" + +# Collection configuration +short_interval = 5 +long_interval = 30 +retention_days = 1 +topology_fetch_interval_seconds = 10.0 + +# Collection method configuration +collection_method = "coprocessor" + +# Tables to collect configuration - only CLUSTER_STATEMENTS_SUMMARY for testing +[[sources.tidb_system_tables.tables]] +source_schema = "information_schema" +source_table = "CLUSTER_STATEMENTS_SUMMARY" +dest_table = "statements_summary" +collection_interval = "short" +enabled = true + +# Output to Delta Lake (local storage, using default partitioning) +[sinks.deltalake] +type = "deltalake" +inputs = ["tidb_system_tables"] +base_path = "./test_data/deltalake" +batch_size = 100 +timeout_secs = 30 +compression = "snappy" + +# Local storage options +[sinks.deltalake.storage_options] +# Use local file system +"file.enable_move" = "true" diff --git a/test_config_coprocessor_minimal.toml b/test_config_coprocessor_minimal.toml new file mode 100644 index 0000000..dda6cab --- /dev/null +++ b/test_config_coprocessor_minimal.toml @@ -0,0 +1,39 @@ +# Minimal Vector Configuration File - Coprocessor Collection Method +# Database connection fields are optional for coprocessor method + +[sources.tidb_system_tables] +type = "system_tables" + +# PD configuration (required for topology discovery) +pd_address = "127.0.0.1:2379" + +# Collection configuration +short_interval = 5 +long_interval = 30 +retention_days = 1 +topology_fetch_interval_seconds = 10.0 + +# Collection method configuration +collection_method = "coprocessor" + +# Tables to collect configuration - only CLUSTER_STATEMENTS_SUMMARY for testing +[[sources.tidb_system_tables.tables]] +source_schema = "information_schema" +source_table = "CLUSTER_STATEMENTS_SUMMARY" +dest_table = "statements_summary" +collection_interval = "short" +enabled = true + +# Output to Delta Lake (local storage, using default partitioning) +[sinks.deltalake] +type = "deltalake" +inputs = ["tidb_system_tables"] +base_path = "./test_data/deltalake" +batch_size = 100 +timeout_secs = 30 +compression = "snappy" + +# Local storage options +[sinks.deltalake.storage_options] +# Use local file system +"file.enable_move" = "true" diff --git a/test_delta_manual.toml b/test_delta_manual.toml new file mode 100644 index 0000000..5cf70d6 --- /dev/null +++ b/test_delta_manual.toml @@ -0,0 +1,44 @@ +# Test configuration for Delta Lake writing +[sources.tidb_system_tables] +type = "system_tables" +collection_method = "coprocessor" +collection_interval = "short" +short_interval = 5 +medium_interval = 30 +long_interval = 300 +retention_days = 30 + +database_host = "127.0.0.1" +database_port = 4000 +database_username = "root" +database_password = "" +database_name = "information_schema" +pd_address = "127.0.0.1:2379" + +[[sources.tidb_system_tables.tables]] +source_schema = "information_schema" +source_table = "CLUSTER_STATEMENTS_SUMMARY" +dest_table = "statements_summary" +collection_interval = "short" +enabled = true + +# Output to Delta Lake with small batch size for testing +[sinks.deltalake] +type = "deltalake" +inputs = ["tidb_system_tables"] +base_path = "./test_data/deltalake" +batch_size = 1 # Very small batch size to trigger writes quickly +timeout_secs = 5 # Short timeout to force writes +compression = "snappy" + +# Local storage options +[sinks.deltalake.storage_options] +"file.enable_move" = "true" + +# Add console sink for debugging data flow +[sinks.console_debug] +type = "console" +inputs = ["tidb_system_tables"] + +[sinks.console_debug.encoding] 
+codec = "json" diff --git a/test_system_tables_new.sh b/test_system_tables_new.sh new file mode 100755 index 0000000..6c1af76 --- /dev/null +++ b/test_system_tables_new.sh @@ -0,0 +1,625 @@ +#!/bin/bash + +# System Tables Source Local Testing Script +# For testing system_tables source with various collection methods + +set -e + +# Color definitions +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Log functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Check dependencies +check_dependencies() { + log_info "Checking dependencies..." + + # Check Rust + if ! command -v cargo &> /dev/null; then + log_error "cargo not found, please install Rust" + exit 1 + fi + + # Check mysql client (for connecting to local TiDB) + if ! command -v mysql &> /dev/null; then + log_warning "mysql client not found, some test data creation operations may not work" + fi + + log_success "Dependencies check completed" +} + +# Check if local TiDB is available +check_local_tidb() { + log_info "Checking local TiDB connection..." + + # Try to connect to TiDB + if command -v mysql &> /dev/null; then + if mysql -h127.0.0.1 -P4000 -uroot -e "SELECT 1" >/dev/null 2>&1; then + log_success "Local TiDB connection successful" + return 0 + else + log_error "Cannot connect to local TiDB (127.0.0.1:4000)" + log_info "Please ensure TiDB is started and listening on port 4000" + return 1 + fi + else + log_warning "No mysql client available, skipping TiDB connection check" + return 0 + fi +} + +# Check if local PD is available +check_local_pd() { + log_info "Checking local PD connection..." + + # Try to connect to PD + if command -v curl &> /dev/null; then + if curl -s "http://127.0.0.1:2379/pd/api/v1/health" >/dev/null 2>&1; then + log_success "Local PD connection successful" + return 0 + else + log_error "Cannot connect to local PD (127.0.0.1:2379)" + log_info "Please ensure PD is started and listening on port 2379" + return 1 + fi + else + log_warning "No curl command available, skipping PD connection check" + return 0 + fi +} + +# Build project +build_project() { + log_info "Building project..." + cargo build --release + if [ $? -eq 0 ]; then + log_success "Project build successful" + else + log_error "Project build failed" + exit 1 + fi +} + +# Check local test environment +check_local_env() { + log_info "Checking local test environment..." + + local tidb_ok=0 + local pd_ok=0 + + # Check TiDB + if check_local_tidb; then + tidb_ok=1 + fi + + # Check PD + if check_local_pd; then + pd_ok=1 + fi + + if [ $tidb_ok -eq 1 ] && [ $pd_ok -eq 1 ]; then + log_success "Local test environment check passed" + return 0 + else + log_error "Local test environment check failed" + log_info "Please ensure local TiDB (port 4000) and PD (port 2379) are properly started" + return 1 + fi +} + +# Create test data +create_test_data() { + log_info "Creating test data..." + + if ! command -v mysql &> /dev/null; then + log_warning "mysql client not available, skipping test data creation" + return 0 + fi + + # Check TiDB connection + if ! 
mysql -h127.0.0.1 -P4000 -uroot -e "SELECT 1" >/dev/null 2>&1; then + log_error "Cannot connect to local TiDB, skipping test data creation" + return 1 + fi + + # Create test database and tables + mysql -h127.0.0.1 -P4000 -uroot < "$config_file" <> "$config_file" <> "$config_file" </dev/null; then + log_info "Stopping Vector process (PID: $vector_pid)..." + + # First try graceful shutdown (SIGTERM) + kill -TERM "$vector_pid" 2>/dev/null + local count=0 + while [ $count -lt 5 ] && kill -0 "$vector_pid" 2>/dev/null; do + sleep 1 + count=$((count + 1)) + done + + # If still running, force stop (SIGKILL) + if kill -0 "$vector_pid" 2>/dev/null; then + log_warning "Graceful shutdown failed, force killing Vector process..." + kill -KILL "$vector_pid" 2>/dev/null + sleep 2 + fi + + log_success "Vector process stopped" + fi + } + + # Set signal trap + trap cleanup_vector SIGINT SIGTERM + + # Start Vector (run in background) + log_info "Starting Vector..." + ./target/release/vector --config "$config_file" & + vector_pid=$! + + log_info "Vector process started (PID: $vector_pid)" + log_info "Tip: Press Ctrl+C to stop the test anytime" + + # Wait for specified duration or user interruption + local countdown=$duration + while [ $countdown -gt 0 ] && kill -0 "$vector_pid" 2>/dev/null; do + sleep 1 + countdown=$((countdown - 1)) + + # Show progress every 10 seconds + if [ $((duration - countdown)) -gt 0 ] && [ $(((duration - countdown) % 10)) -eq 0 ]; then + log_info "Running... elapsed $((duration - countdown))s / ${duration}s" + fi + done + + # Check result + local exit_code=0 + if kill -0 "$vector_pid" 2>/dev/null; then + log_info "Test duration reached, stopping Vector..." + cleanup_vector + else + wait "$vector_pid" + exit_code=$? + if [ $exit_code -ne 0 ]; then + log_error "Vector exited abnormally with code: $exit_code" + fi + fi + + # Clean up signal trap + trap - SIGINT SIGTERM + + return $exit_code +} + +# Run Vector test +run_vector_test() { + local collection_method=$1 + local sink_type=${2:-"console"} + local duration=${3:-30} + local config_file="test_config_${collection_method}_${sink_type}.toml" + + log_info "Running Vector test (collection: $collection_method, output: $sink_type, duration: ${duration}s)" + + # Create configuration file + create_vector_config "$collection_method" "$sink_type" + + # Use improved process management to run Vector + if manage_vector_process "$config_file" "$duration"; then + log_success "Vector test completed (${duration}s)" + + # If deltalake, show generated files + if [ "$sink_type" = "deltalake" ]; then + show_deltalake_output + fi + else + log_error "Vector test failed" + return 1 + fi +} + +# Show Delta Lake output results +show_deltalake_output() { + local data_dir="./test_data/deltalake" + + if [ -d "$data_dir" ]; then + log_info "Delta Lake output results:" + + # Show partition directory structure + log_info "Partition directory structure:" + if command -v tree &> /dev/null; then + tree "$data_dir" -d | head -20 + else + find "$data_dir" -type d | head -15 | while read dir; do + log_info " Directory: $dir" + done + fi + + # Show data files + log_info "Data files (Parquet):" + find "$data_dir" -name "*.parquet" | head -10 | while read file; do + local size=$(ls -lh "$file" | awk '{print $5}') + log_info " File: $file (size: $size)" + done + + # Show Delta Log files + log_info "Delta Log files:" + find "$data_dir" -path "*/_delta_log/*" -name "*.json" | head -5 | while read file; do + log_info " Log: $file" + done + + # Show statistics + local 
parquet_count=$(find "$data_dir" -name "*.parquet" | wc -l) + local partition_count=$(find "$data_dir" -type d -name "_vector_table=*" | wc -l) + log_info "Statistics: $parquet_count data files, $partition_count table partitions" + else + log_warning "Delta Lake data directory does not exist: $data_dir" + fi +} + +# Clean up all Vector processes +cleanup_vector_processes() { + local vector_pids=$(pgrep -f "vector.*--config.*test_config_" 2>/dev/null || true) + if [ -n "$vector_pids" ]; then + log_warning "Found lingering Vector processes, cleaning up..." + echo "$vector_pids" | while read pid; do + if [ -n "$pid" ]; then + log_info "Stopping Vector process (PID: $pid)..." + kill -TERM "$pid" 2>/dev/null || true + sleep 2 + if kill -0 "$pid" 2>/dev/null; then + kill -KILL "$pid" 2>/dev/null || true + fi + fi + done + sleep 1 + fi +} + +# Clean up test files +cleanup_test_files() { + log_info "Cleaning up test files..." + + # First clean up any existing Vector processes + cleanup_vector_processes + + # Clean up configuration files + rm -f test_config_*.toml + + # Clean up test data directory + if [ -d "./test_data" ]; then + log_info "Cleaning up test data directory..." + rm -rf "./test_data" + fi + + log_success "Test files cleanup completed" +} + +# Show help +show_help() { + cat < create data -> test -> cleanup) + +Options: + -d, --duration SECONDS Test duration (default: 30s) + --no-cleanup Do not clean up test files on exit + -h, --help Show help information + +Examples: + $0 build # Build project + $0 check-env # Check local environment + $0 full-test # Run full test (console output) + $0 test-sql -d 60 # Test SQL method for 60 seconds (console output) + $0 test-sql-delta -d 60 # Test SQL method for 60 seconds (Delta Lake output) + $0 test-copr-delta --no-cleanup -d 60 # Test coprocessor method without cleanup + $0 test-all-delta -d 120 # Test all methods for 120 seconds (Delta Lake output) + $0 cleanup # Clean up test files + +Notes: + - This script assumes TiDB is listening on 127.0.0.1:4000 + - This script assumes PD is listening on 127.0.0.1:2379 + - mysql client is required for test data creation + - curl is required for PD health check + - Vector process supports graceful Ctrl+C shutdown with automatic cleanup + - Script will automatically clean up all lingering Vector processes on exit + +EOF +} + +# Main function +main() { + local duration=30 + local command="" + + # Parse arguments + local no_cleanup=false + while [[ $# -gt 0 ]]; do + case $1 in + -d|--duration) + duration="$2" + shift 2 + ;; + --no-cleanup) + no_cleanup=true + shift + ;; + -h|--help) + show_help + exit 0 + ;; + build|check-env|create-data|test-sql|test-copr|test-sql-delta|test-copr-delta|test-all|test-all-delta|cleanup|full-test) + command="$1" + shift + ;; + *) + log_error "Unknown parameter: $1" + show_help + exit 1 + ;; + esac + done + + if [ -z "$command" ]; then + log_error "Please specify a command" + show_help + exit 1 + fi + + # Set up cleanup trap unless --no-cleanup is specified + if [ "$no_cleanup" = false ]; then + trap cleanup_test_files EXIT + log_info "Cleanup enabled (will cleanup on exit)" + else + log_info "Cleanup disabled (--no-cleanup specified)" + fi + + case $command in + build) + check_dependencies + build_project + ;; + check-env) + check_dependencies + check_local_env + ;; + create-data) + create_test_data + ;; + test-sql) + if ! check_local_env; then + exit 1 + fi + run_vector_test "sql" "console" "$duration" + ;; + test-copr) + if ! 
check_local_env; then + exit 1 + fi + run_vector_test "coprocessor" "console" "$duration" + ;; + test-sql-delta) + if ! check_local_env; then + exit 1 + fi + run_vector_test "sql" "deltalake" "$duration" + ;; + test-copr-delta) + if ! check_local_env; then + exit 1 + fi + run_vector_test "coprocessor" "deltalake" "$duration" + ;; + test-all) + if ! check_local_env; then + exit 1 + fi + log_info "Testing all collection methods (console output)..." + run_vector_test "sql" "console" "$duration" + sleep 5 + run_vector_test "coprocessor" "console" "$duration" + ;; + test-all-delta) + if ! check_local_env; then + exit 1 + fi + log_info "Testing all collection methods (Delta Lake output)..." + run_vector_test "sql" "deltalake" "$duration" + sleep 5 + run_vector_test "coprocessor" "deltalake" "$duration" + ;; + cleanup) + cleanup_test_files + ;; + full-test) + log_info "Starting full test process..." + check_dependencies + build_project + + if ! check_local_env; then + log_error "Local environment check failed, please start TiDB and PD first" + exit 1 + fi + + cleanup_test_files # Clean up any old files + create_test_data + + log_info "=== Console Output Tests ===" + log_info "Testing SQL collection method (console)..." + run_vector_test "sql" "console" "$duration" + sleep 5 + + log_info "Testing Coprocessor collection method (console)..." + run_vector_test "coprocessor" "console" "$duration" + sleep 5 + + log_info "=== Delta Lake Output Tests ===" + log_info "Testing SQL collection method (Delta Lake)..." + run_vector_test "sql" "deltalake" "$duration" + sleep 5 + + log_info "Testing Coprocessor collection method (Delta Lake)..." + run_vector_test "coprocessor" "deltalake" "$duration" + + cleanup_test_files + log_success "Full test process completed" + ;; + *) + log_error "Unknown command: $command" + show_help + exit 1 + ;; + esac +} + +# Signal handling - conditionally cleanup (will be set in main function) +# trap cleanup_test_files EXIT + +# Run main function +main "$@" diff --git a/test_system_tables_s3.sh b/test_system_tables_s3.sh new file mode 100755 index 0000000..0898fb2 --- /dev/null +++ b/test_system_tables_s3.sh @@ -0,0 +1,512 @@ +#!/bin/bash + +# System Tables Source S3 Testing Script +# For testing system_tables source with S3 Delta Lake output + +set -e + +# Color definitions +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Log functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Default configuration values +HOST="127.0.0.1" +PORT="4000" +USER="root" +PASSWORD="" +DATABASE="test" +PD="127.0.0.1:2379" +BUCKET="" +REGION="us-west-2" +DURATION="30" +PROFILE="release" +COLLECTION_METHOD="coprocessor" # Default to coprocessor, can be changed to sql +AWS_ACCESS_KEY_ID="" +AWS_SECRET_ACCESS_KEY="" +AWS_SESSION_TOKEN="" +ASSUME_ROLE="" +EXTERNAL_ID="" +ROLE_SESSION_NAME="vector-deltalake" + +# Check dependencies +check_dependencies() { + log_info "Checking dependencies..." + + # Check Rust + if ! command -v cargo &> /dev/null; then + log_error "cargo not found, please install Rust" + exit 1 + fi + + # Check AWS CLI for verification + if ! 
command -v aws &> /dev/null; then + log_warning "aws CLI not found, S3 verification will be skipped" + fi + + log_success "Dependencies check completed" +} + +# Parse command line arguments +parse_arguments() { + while [[ $# -gt 0 ]]; do + case "$1" in + --host) HOST="$2"; shift 2 ;; + --port) PORT="$2"; shift 2 ;; + --user) USER="$2"; shift 2 ;; + --password) PASSWORD="$2"; shift 2 ;; + --database) DATABASE="$2"; shift 2 ;; + --pd) PD="$2"; shift 2 ;; + --bucket) BUCKET="$2"; shift 2 ;; + --region) REGION="$2"; shift 2 ;; + --duration) DURATION="$2"; shift 2 ;; + --collection-method) COLLECTION_METHOD="$2"; shift 2 ;; + --aws-access-key-id) AWS_ACCESS_KEY_ID="$2"; shift 2 ;; + --aws-secret-access-key) AWS_SECRET_ACCESS_KEY="$2"; shift 2 ;; + --aws-session-token) AWS_SESSION_TOKEN="$2"; shift 2 ;; + --assume-role) ASSUME_ROLE="$2"; shift 2 ;; + --external-id) EXTERNAL_ID="$2"; shift 2 ;; + --role-session-name) ROLE_SESSION_NAME="$2"; shift 2 ;; + --release) PROFILE="release"; shift 1 ;; + -h|--help) show_help; exit 0 ;; + *) log_error "Unknown argument: $1"; show_help; exit 1 ;; + esac + done +} + +# Validate configuration +validate_config() { + log_info "Validating configuration..." + + # Validate required S3 parameters + if [[ -z "${BUCKET}" ]]; then + log_error "--bucket is required for S3 storage" + exit 1 + fi + + # Validate collection method + if [[ "${COLLECTION_METHOD}" != "sql" && "${COLLECTION_METHOD}" != "coprocessor" ]]; then + log_error "collection-method must be 'sql' or 'coprocessor'" + exit 1 + fi + + # Check if AWS credentials are provided (either via CLI args or environment) + if [[ -z "${AWS_ACCESS_KEY_ID}" ]] && [[ -z "${AWS_ACCESS_KEY_ID:-}" ]] && [[ -z "${ASSUME_ROLE}" ]]; then + log_warning "No AWS credentials specified. Will use default AWS credential chain." + log_info "Make sure AWS credentials are configured via environment variables, ~/.aws/config, or IAM role." + fi + + log_success "Configuration validation completed" +} + +# Build project +build_project() { + log_info "Building project..." + if [[ "${PROFILE}" == "release" ]]; then + cargo build --release + else + cargo build + fi + + if [ $? -eq 0 ]; then + log_success "Project build successful" + else + log_error "Project build failed" + exit 1 + fi +} + +# Generate Vector configuration file +create_vector_config() { + local config_file="test_config_s3_${COLLECTION_METHOD}.toml" + + echo "Creating Vector configuration file: $config_file (collection: $COLLECTION_METHOD, S3: $BUCKET)" >&2 + + cat > "$config_file" <> "$config_file" <> "$config_file" <> "$config_file" <> "$config_file" <> "$config_file" <> "$config_file" <> "$config_file" <> "$config_file" <&2 + echo "$config_file" +} + +# Vector process management function +manage_vector_process() { + local config_file=$1 + local duration=$2 + local vector_pid="" + + # Define cleanup function + cleanup_vector() { + if [ -n "$vector_pid" ] && kill -0 "$vector_pid" 2>/dev/null; then + log_info "Stopping Vector process (PID: $vector_pid)..." + + # First try graceful shutdown (SIGTERM) + kill -TERM "$vector_pid" 2>/dev/null + local count=0 + while [ $count -lt 5 ] && kill -0 "$vector_pid" 2>/dev/null; do + sleep 1 + count=$((count + 1)) + done + + # If still running, force stop (SIGKILL) + if kill -0 "$vector_pid" 2>/dev/null; then + log_warning "Graceful shutdown failed, force killing Vector process..." 
+ kill -KILL "$vector_pid" 2>/dev/null + sleep 2 + fi + + log_success "Vector process stopped" + fi + } + + # Set signal trap + trap cleanup_vector SIGINT SIGTERM + + # Start Vector (run in background) + echo "Starting Vector..." + ./target/${PROFILE}/vector --config "$config_file" & + vector_pid=$! + + echo "Vector process started (PID: $vector_pid)" + echo "Tip: Press Ctrl+C to stop the test anytime" + + # Wait for specified duration or user interruption + local countdown=$duration + while [ $countdown -gt 0 ] && kill -0 "$vector_pid" 2>/dev/null; do + sleep 1 + countdown=$((countdown - 1)) + + # Show progress every 10 seconds + if [ $((duration - countdown)) -gt 0 ] && [ $(((duration - countdown) % 10)) -eq 0 ]; then + log_info "Running... elapsed $((duration - countdown))s / ${duration}s" + fi + done + + # Check result + local exit_code=0 + if kill -0 "$vector_pid" 2>/dev/null; then + log_info "Test duration reached, stopping Vector..." + cleanup_vector + else + wait "$vector_pid" + exit_code=$? + if [ $exit_code -ne 0 ]; then + log_error "Vector exited abnormally with code: $exit_code" + fi + fi + + # Clean up signal trap + trap - SIGINT SIGTERM + + return $exit_code +} + +# Run Vector test +run_vector_test() { + log_info "Running Vector S3 test (collection: $COLLECTION_METHOD, S3: $BUCKET, duration: ${DURATION}s)" + + # Create configuration file + local config_file=$(create_vector_config) + + # Use improved process management to run Vector + if manage_vector_process "$config_file" "$DURATION"; then + log_success "Vector test completed (${DURATION}s)" + + # Show S3 verification results + verify_s3_output + else + log_error "Vector test failed" + return 1 + fi +} + +# Verify S3 output results +verify_s3_output() { + log_info "Verifying S3 bucket contents: s3://${BUCKET}/deltalake-tables" + + # Check if AWS CLI is available for verification + if command -v aws >/dev/null 2>&1; then + log_info "Using AWS CLI to check S3 bucket contents..." + + # Set AWS environment variables if provided + if [[ -n "${AWS_ACCESS_KEY_ID}" ]]; then + export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID}" + fi + if [[ -n "${AWS_SECRET_ACCESS_KEY}" ]]; then + export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY}" + fi + if [[ -n "${AWS_SESSION_TOKEN}" ]]; then + export AWS_SESSION_TOKEN="${AWS_SESSION_TOKEN}" + fi + + # Check for Delta table directories in S3 + for table_name in "hist_processlist" "hist_cluster_statements_summary"; do + table_path="s3://${BUCKET}/deltalake-tables/${table_name}/" + log_info "Checking table: ${table_path}" + + # List objects in the table directory + if aws s3 ls "${table_path}" --region "${REGION}" >/dev/null 2>&1; then + object_count=$(aws s3 ls "${table_path}" --recursive --region "${REGION}" | wc -l) + if [[ ${object_count} -gt 0 ]]; then + log_success "Found ${object_count} objects in ${table_path}" + + # Count parquet files specifically + parquet_count=$(aws s3 ls "${table_path}" --recursive --region "${REGION}" | grep -c "\.parquet$" || echo "0") + if [[ ${parquet_count} -gt 0 ]]; then + log_success "Found ${parquet_count} parquet files in ${table_path}" + else + log_warning "No parquet files found in ${table_path}" + fi + else + log_warning "No objects found in ${table_path}" + fi + else + log_warning "Unable to access ${table_path} or directory doesn't exist" + fi + done + else + log_warning "AWS CLI not found. Cannot verify S3 bucket contents." + log_info "Please install AWS CLI to verify data was written to S3." 
+ log_info "Expected S3 locations:" + log_info " - s3://${BUCKET}/deltalake-tables/hist_processlist/" + log_info " - s3://${BUCKET}/deltalake-tables/hist_cluster_statements_summary/" + fi +} + +# Clean up test files +cleanup_test_files() { + log_info "Cleaning up test files..." + + # Clean up configuration files + rm -f test_config_s3_*.toml + + log_success "Test files cleanup completed" +} + +# Show help +show_help() { + cat <