diff --git a/Cargo.lock b/Cargo.lock index 56ba0f14c..d6628322b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,7 +130,7 @@ dependencies = [ "aws-http", "aws-runtime", "aws-smithy-async", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -181,9 +181,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.19" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" +checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" dependencies = [ "anstyle", "anstyle-parse", @@ -211,22 +211,22 @@ dependencies = [ [[package]] name = "anstyle-query" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "anstyle-wincon" -version = "3.0.9" +version = "3.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -246,9 +246,9 @@ dependencies = [ [[package]] name = "arboard" -version = "3.5.0" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1df21f715862ede32a0c525ce2ca4d52626bb0007f8c18b87a384503ac33e70" +checksum = "55f533f8e0af236ffe5eb979b99381df3258853f00ba2e44b6e1955292c75227" dependencies = [ "clipboard-win", "log", @@ -295,9 +295,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.25" +version = "0.4.27" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f6024f3f856663b45fd0c9b6f2024034a702f453549449e0d84a305900dad4" +checksum = "ddb939d66e4ae03cee6091612804ba446b12878410cfa17f785f4dd67d4014e8" dependencies = [ "flate2", "futures-core", @@ -331,9 +331,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-config" -version = "1.8.0" +version = "1.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "455e9fb7743c6f6267eb2830ccc08686fbb3d13c9a689369562fd4d4ef9ea462" +checksum = "483020b893cdef3d89637e428d588650c71cfae7ea2e6ecbaee4de4ff99fb2dd" dependencies = [ "aws-credential-types", "aws-runtime", @@ -341,7 +341,7 @@ dependencies = [ "aws-sdk-ssooidc", "aws-sdk-sts", "aws-smithy-async", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -361,9 +361,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.3" +version = "1.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "687bc16bc431a8533fe0097c7f0182874767f920989d7260950172ae8e3c4465" +checksum = "1541072f81945fa1251f8795ef6c92c4282d74d59f88498ae7d4bf00f0ebdad9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -382,9 +382,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.13.1" +version = "1.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fcc8f365936c834db5514fc45aee5b1202d677e6b40e48468aaaa8183ca8c7" +checksum = "5c953fe1ba023e6b7730c0d4b031d06f267f23a46167dcbd40316644b10a17ba" dependencies = [ "aws-lc-sys", "zeroize", @@ -392,9 +392,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.29.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61b1d86e7705efe1be1b569bab41d4fa1e14e220b60a160f78de2db687add079" +checksum = 
"dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff" dependencies = [ "bindgen 0.69.5", "cc", @@ -405,14 +405,14 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.8" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f6c68419d8ba16d9a7463671593c54f81ba58cab466e9b759418da606dcc2e2" +checksum = "c034a1bc1d70e16e7f4e4caf7e9f7693e4c9c24cd91cf17c2a0b21abaebc7c8b" dependencies = [ "aws-credential-types", "aws-sigv4", "aws-smithy-async", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -429,14 +429,14 @@ dependencies = [ [[package]] name = "aws-sdk-cognitoidentity" -version = "1.74.0" +version = "1.80.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a14e1e7fbff8dccd5ad97c52478c888bd8ccc70206d06b027db2eb4904c4a24" +checksum = "606ea1307a8caf6ecddf1d8bbe55b7d1d967c7b6c217d13d5e81c2c8e24e0cf8" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -451,14 +451,14 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.73.0" +version = "1.79.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ac1674cba7872061a29baaf02209fefe499ff034dfd91bd4cc59e4d7741489" +checksum = "0a847168f15b46329fa32c7aca4e4f1a2e072f9b422f0adb19756f2e1457f111" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -473,14 +473,14 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.74.0" +version = "1.80.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a6a22f077f5fd3e3c0270d4e1a110346cddf6769e9433eb9e6daceb4ca3b149" +checksum = 
"b654dd24d65568738593e8239aef279a86a15374ec926ae8714e2d7245f34149" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-json", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -495,14 +495,14 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.75.0" +version = "1.81.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3258fa707f2f585ee3049d9550954b959002abd59176975150a01d5cf38ae3f" +checksum = "c92ea8a7602321c83615c82b408820ad54280fb026e92de0eeea937342fafa24" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-json", "aws-smithy-query", "aws-smithy-runtime", @@ -518,12 +518,12 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.3.3" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfb9021f581b71870a17eac25b52335b82211cdc092e02b6876b2bcefa61666" +checksum = "084c34162187d39e3740cb635acd73c4e3a551a36146ad6fe8883c929c9f876c" dependencies = [ "aws-credential-types", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-runtime-api", "aws-smithy-types", "bytes", @@ -551,9 +551,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.9" +version = "0.60.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "338a3642c399c0a5d157648426110e199ca7fd1c689cc395676b81aa563700c4" +checksum = "604c7aec361252b8f1c871a7641d5e0ba3a7f5a586e51b66bc9510a5519594d9" dependencies = [ "aws-smithy-types", "bytes", @@ -583,9 +583,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.62.1" +version = "0.62.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99335bec6cdc50a346fda1437f9fefe33abf8c99060739a546a16457f2862ca9" +checksum = 
"7c4dacf2d38996cf729f55e7a762b30918229917eca115de45dfa8dfb97796c9" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", @@ -603,17 +603,17 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f491388e741b7ca73b24130ff464c1478acc34d5b331b7dd0a2ee4643595a15" +checksum = "f108f1ca850f3feef3009bdcc977be201bca9a91058864d9de0684e64514bee0" dependencies = [ "aws-smithy-async", "aws-smithy-protocol-test", "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "h2 0.3.26", - "h2 0.4.10", + "h2 0.3.27", + "h2 0.4.12", "http 0.2.12", "http 1.3.1", "http-body 0.4.6", @@ -626,7 +626,7 @@ dependencies = [ "indexmap", "pin-project-lite", "rustls 0.21.12", - "rustls 0.23.28", + "rustls 0.23.31", "rustls-native-certs 0.8.1", "rustls-pki-types", "serde", @@ -685,12 +685,12 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.8.3" +version = "1.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14302f06d1d5b7d333fd819943075b13d27c7700b414f574c3c35859bfb55d5e" +checksum = "9e107ce0783019dbff59b3a244aa0c114e4a8c9d93498af9162608cd5474e796" dependencies = [ "aws-smithy-async", - "aws-smithy-http 0.62.1", + "aws-smithy-http 0.62.3", "aws-smithy-http-client", "aws-smithy-observability", "aws-smithy-runtime-api", @@ -710,9 +710,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.8.1" +version = "1.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8531b6d8882fd8f48f82a9754e682e29dd44cff27154af51fa3eb730f59efb" +checksum = "75d52251ed4b9776a3e8487b2a01ac915f73b2da3af8fc1e77e0fce697a550d4" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -775,9 +775,9 @@ dependencies = [ [[package]] name = "aws-types" -version = "1.3.7" +version = "1.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8a322fec39e4df22777ed3ad8ea868ac2f94cd15e1a55f6ee8d8d6305057689a" +checksum = "b069d19bf01e46298eaedd7c6f283fe565a59263e53eebec945f3e6398f42390" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -948,9 +948,9 @@ dependencies = [ [[package]] name = "bm25" -version = "2.2.1" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9874599901ae2aaa19b1485145be2fa4e9af42d1b127672a03a7099ab6350bac" +checksum = "b84ff0d57042bc263e2ebadb3703424b59b65870902649a2b3d0f4d7ab863244" dependencies = [ "cached", "deunicode", @@ -989,9 +989,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.18.1" +version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" [[package]] name = "bytecount" @@ -1001,18 +1001,18 @@ checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" [[package]] name = "bytemuck" -version = "1.23.1" +version = "1.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422" +checksum = "3995eaeebcdf32f91f980d360f78732ddc061097ab4e39991ae7a6ace9194677" dependencies = [ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.9.3" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ecc273b49b3205b83d648f0690daa588925572cc5063745bfe547fe7ec8e1a1" +checksum = "4f154e572231cb6ba2bd1176980827e3d5dc04cc183a75dea38109fbdd672d29" dependencies = [ "proc-macro2", "quote", @@ -1043,14 +1043,14 @@ dependencies = [ [[package]] name = "cached" -version = "0.55.1" +version = "0.56.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0839c297f8783316fcca9d90344424e968395413f0662a5481f79c6648bbc14" +checksum = 
"801927ee168e17809ab8901d9f01f700cd7d8d6a6527997fee44e4b0327a253c" dependencies = [ "ahash", "cached_proc_macro", "cached_proc_macro_types", - "hashbrown 0.14.5", + "hashbrown 0.15.5", "once_cell", "thiserror 2.0.12", "web-time", @@ -1058,9 +1058,9 @@ dependencies = [ [[package]] name = "cached_proc_macro" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "673992d934f0711b68ebb3e1b79cdc4be31634b37c98f26867ced0438ca5c603" +checksum = "9225bdcf4e4a9a4c08bf16607908eb2fbf746828d5e0b5e019726dbf6571f201" dependencies = [ "darling", "proc-macro2", @@ -1076,9 +1076,9 @@ checksum = "ade8366b8bd5ba243f0a58f036cc0ca8a2f069cff1a2351ef1cac6b083e16fc0" [[package]] name = "camino" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0da45bc31171d8d6960122e222a67740df867c1dd53b4d51caa297084c185cab" +checksum = "5d07aa9a93b00c76f71bc35d598bed923f6d4f3a9ca5c24b7737ae1a292841c0" dependencies = [ "serde", ] @@ -1095,7 +1095,7 @@ dependencies = [ "memmap2", "num-traits", "num_cpus", - "rand 0.9.1", + "rand 0.9.2", "rand_distr", "rayon", "safetensors", @@ -1131,7 +1131,7 @@ dependencies = [ "candle-nn", "fancy-regex 0.13.0", "num-traits", - "rand 0.9.1", + "rand 0.9.2", "rayon", "serde", "serde_json", @@ -1145,6 +1145,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cbor-diag" version = "0.1.12" @@ -1166,9 +1175,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.27" +version = "1.2.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" +checksum = "2352e5597e9c544d5e6d9c95190d5d27738ade584fa8db0a16e130e5c2b5296e" dependencies = [ "jobserver", "libc", @@ -1267,17 +1276,17 @@ dependencies = [ "quote", "r2d2", "r2d2_sqlite", - "rand 0.9.1", + "rand 0.9.2", "regex", "reqwest", "ring", "rusqlite", - "rustls 0.23.28", + "rustls 0.23.31", "rustls-native-certs 0.8.1", "rustls-pemfile 2.2.0", "rustyline", "schemars", - "security-framework 3.2.0", + "security-framework 3.3.0", "semantic_search_client", "semver", "serde", @@ -1291,7 +1300,7 @@ dependencies = [ "skim", "spinners", "strip-ansi-escapes", - "strum 0.27.1", + "strum 0.27.2", "syn 2.0.104", "syntect", "sysinfo", @@ -1383,9 +1392,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.40" +version = "4.5.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" +checksum = "50fd97c9dc2399518aa331917ac6f274280ec5eb34e555dd291899745c48ec6f" dependencies = [ "clap_builder", "clap_derive", @@ -1393,9 +1402,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.40" +version = "4.5.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" +checksum = "c35b5830294e1fa0462034af85cc95225a4cb07092c088c55bda3147cfcd8f65" dependencies = [ "anstream", "anstyle", @@ -1408,9 +1417,9 @@ dependencies = [ [[package]] name = "clap_complete" -version = "4.5.54" +version = "4.5.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aad5b1b4de04fead402672b48897030eec1f3bfe1550776322f59f6d6e6a5677" +checksum = "67e4efcbb5da11a92e8a609233aa1e8a7d91e38de0be865f016d14700d45a7fd" dependencies = [ "clap", ] @@ -1427,9 +1436,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.40" +version = "4.5.41" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" +checksum = "ef4f52386a59ca4c860f7393bcf8abd8dfd91ecccc0f774635ff68e92eeef491" dependencies = [ "anstyle", "heck 0.5.0", @@ -1447,9 +1456,9 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "clipboard-win" -version = "5.4.0" +version = "5.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15efe7a882b08f34e38556b14f2fb3daa98769d06c7f0c1b076dfd0d983bc892" +checksum = "bde03770d3df201d4fb868f2c9c59e66a3e4e2bd06692a0fe701e7103c7e84d4" dependencies = [ "error-code", ] @@ -1536,6 +1545,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "compact_str" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "console" version = "0.15.11" @@ -1634,9 +1658,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.4.2" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ "cfg-if", ] @@ -1758,9 +1782,9 @@ dependencies = [ [[package]] name = "crunchy" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-common" @@ -1817,6 +1841,15 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "dary_heap" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" +dependencies = [ + "serde", +] + [[package]] name = "data-encoding" version = "2.9.0" @@ -1976,7 +2009,7 @@ checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" dependencies = [ "libc", "option-ext", - "redox_users 0.5.0", + "redox_users 0.5.2", "windows-sys 0.60.2", ] @@ -2041,9 +2074,9 @@ checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" [[package]] name = "dyn-clone" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" [[package]] name = "dyn-stack" @@ -2252,7 +2285,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", - "rustix 1.0.7", + "rustix 1.0.8", "windows-sys 0.59.0", ] @@ -2748,9 +2781,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" dependencies = [ "bytes", "fnv", @@ -2767,9 +2800,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.10" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9421a676d1b147b16b82c9225157dc629087ef8ec4d5e2960f9437a90dac0a5" +checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" dependencies = [ "atomic-waker", "bytes", @@ -2794,7 +2827,7 @@ dependencies = [ "cfg-if", "crunchy", "num-traits", - "rand 0.9.1", + "rand 0.9.2", "rand_distr", ] @@ -2810,9 +2843,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.4" +version = "0.15.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", @@ -2979,14 +3012,14 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2 0.3.26", + "h2 0.3.27", "http 0.2.12", "http-body 0.4.6", "httparse", "httpdate", "itoa", "pin-project-lite", - "socket2", + "socket2 0.5.10", "tokio", "tower-service", "tracing", @@ -3002,7 +3035,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.10", + "h2 0.4.12", "http 1.3.1", "http-body 1.0.1", "httparse", @@ -3039,20 +3072,20 @@ dependencies = [ "http 1.3.1", "hyper 1.6.0", "hyper-util", - "rustls 0.23.28", + "rustls 0.23.31", "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", "tokio-rustls 0.26.2", "tower-service", - "webpki-roots 1.0.1", + "webpki-roots 1.0.2", ] [[package]] name = "hyper-util" -version = "0.1.14" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc2fdfdbff08affe55bb779f33b053aa1fe5dd5b54c257343c17edfa55711bdb" +checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e" dependencies = [ "base64 0.22.1", "bytes", @@ -3066,7 +3099,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2", + "socket2 0.6.0", "tokio", "tower-service", "tracing", @@ -3211,18 +3244,18 @@ dependencies = [ [[package]] name = "indenter" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" +checksum = "964de6e86d545b246d84badc0fef527924ace5134f30641c203ef52ba83f58d5" [[package]] name = "indexmap" -version = "2.9.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" dependencies = [ "equivalent", - "hashbrown 0.15.4", + "hashbrown 0.15.5", "serde", ] @@ -3265,6 +3298,17 @@ dependencies = [ "rustversion", ] +[[package]] +name = "io-uring" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" +dependencies = [ + "bitflags 2.9.1", + "cfg-if", + "libc", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -3315,27 +3359,27 @@ dependencies = [ [[package]] name = "itertools" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ "either", ] [[package]] name = "itertools" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] [[package]] name = "itertools" -version = "0.13.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" dependencies = [ "either", ] @@ -3442,7 +3486,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.53.2", + "windows-targets 0.53.3", ] [[package]] @@ -3474,12 +3518,13 @@ dependencies = [ [[package]] name = "libredox" -version = "0.1.3" +version = "0.1.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +checksum = "391290121bad3d37fbddad76d8f5d1c1c314cfc646d143d7e07a3086ddff0ce3" dependencies = [ "bitflags 2.9.1", "libc", + "redox_syscall", ] [[package]] @@ -3531,9 +3576,9 @@ checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" [[package]] name = "litrs" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ce301924b7887e9d637144fdade93f9dfff9b60981d4ac161db09720d39aa5" +checksum = "f5e54036fe321fd421e10d732f155734c4e4afd610dd556d9a82833ab3ee0bed" [[package]] name = "lock_api" @@ -3557,7 +3602,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.15.4", + "hashbrown 0.15.5", ] [[package]] @@ -3623,9 +3668,9 @@ checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "memmap2" -version = "0.9.5" +version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +checksum = "483758ad303d734cec05e5c12b41d7e93e6a6390c5e9dae6bdeb7c1259012d28" dependencies = [ "libc", "stable_deref_trait", @@ -3743,7 +3788,7 @@ dependencies = [ "hyper 1.6.0", "hyper-util", "log", - "rand 0.9.1", + "rand 0.9.2", "regex", "serde_json", "serde_urlencoded", @@ -4417,9 +4462,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "owo-colors" -version = "4.2.1" +version = "4.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26995317201fa17f3656c36716aed4a7c81743a9634ac4c99c0eeda495db0cec" +checksum = "48dd4f4a2c8405440fd0462561f0e5806bd0f77e86f51c761481bdd4018b545e" [[package]] name = "parking_lot" @@ -4486,13 
+4531,13 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "plist" -version = "1.7.2" +version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d77244ce2d584cd84f6a15f86195b8c9b2a0dfbfd817c09e0464244091a58ed" +checksum = "3af6b589e163c5a788fab00ce0c0366f6efbb9959c2f9874b224936af7fce7e1" dependencies = [ "base64 0.22.1", "indexmap", - "quick-xml", + "quick-xml 0.38.1", "serde", "time", ] @@ -4606,9 +4651,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.35" +version = "0.2.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061c1221631e079b26479d25bbf2275bfe5917ae8419cd7e34f13bfc2aa7539a" +checksum = "ff24dfcda44452b9816fff4cd4227e1bb73ff5a2f1bc1105aa92fb8565ce44d2" dependencies = [ "proc-macro2", "syn 2.0.104", @@ -4647,9 +4692,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.95" +version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +checksum = "beef09f85ae72cea1ef96ba6870c51e6382ebfa4f0e85b643459331f3daa5be0" dependencies = [ "unicode-ident", ] @@ -4757,6 +4802,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "quick-xml" +version = "0.38.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9845d9dccf565065824e69f9f235fafba1587031eda353c1f1561cd6a6be78f4" +dependencies = [ + "memchr", +] + [[package]] name = "quinn" version = "0.11.8" @@ -4769,8 +4823,8 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash 2.1.1", - "rustls 0.23.28", - "socket2", + "rustls 0.23.31", + "socket2 0.5.10", "thiserror 2.0.12", "tokio", "tracing", @@ -4786,10 +4840,10 @@ dependencies = [ "bytes", "getrandom 0.3.3", "lru-slab", - "rand 0.9.1", + "rand 0.9.2", "ring", "rustc-hash 2.1.1", - "rustls 0.23.28", + "rustls 0.23.31", "rustls-pki-types", "slab", "thiserror 2.0.12", @@ 
-4807,7 +4861,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2", + "socket2 0.5.10", "tracing", "windows-sys 0.59.0", ] @@ -4872,9 +4926,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.3", @@ -4925,7 +4979,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" dependencies = [ "num-traits", - "rand 0.9.1", + "rand 0.9.2", ] [[package]] @@ -4958,12 +5012,12 @@ dependencies = [ [[package]] name = "rayon-cond" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" dependencies = [ "either", - "itertools 0.11.0", + "itertools 0.14.0", "rayon", ] @@ -4985,9 +5039,9 @@ checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430" [[package]] name = "redox_syscall" -version = "0.5.13" +version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" +checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ "bitflags 2.9.1", ] @@ -5005,9 +5059,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.5.0" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.16", "libredox", 
@@ -5100,9 +5154,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "reqwest" -version = "0.12.20" +version = "0.12.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabf4c97d9130e2bf606614eb937e86edac8292eaa6f422f995d7e8de1eb1813" +checksum = "cbc931937e6ca3a06e3b6c0aa7841849b160a90351d6ab467a8b9b9959767531" dependencies = [ "async-compression", "base64 0.22.1", @@ -5113,7 +5167,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2 0.4.10", + "h2 0.4.12", "http 1.3.1", "http-body 1.0.1", "http-body-util", @@ -5126,7 +5180,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.28", + "rustls 0.23.31", "rustls-native-certs 0.8.1", "rustls-pki-types", "serde", @@ -5144,7 +5198,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.1", + "webpki-roots 1.0.2", ] [[package]] @@ -5197,9 +5251,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" [[package]] name = "rustc-hash" @@ -5237,15 +5291,15 @@ dependencies = [ [[package]] name = "rustix" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" dependencies = [ "bitflags 2.9.1", "errno", "libc", "linux-raw-sys 0.9.4", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -5262,16 +5316,16 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.28" +version = "0.23.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7160e3e10bf4535308537f3c4e1641468cd0e485175d6163087c0393c7d46643" +checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" dependencies = [ "aws-lc-rs", "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.3", + "rustls-webpki 0.103.4", "subtle", "zeroize", ] @@ -5297,7 +5351,7 @@ dependencies = [ "openssl-probe", "rustls-pki-types", "schannel", - "security-framework 3.2.0", + "security-framework 3.3.0", ] [[package]] @@ -5340,9 +5394,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.3" +version = "0.103.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" dependencies = [ "aws-lc-rs", "ring", @@ -5352,9 +5406,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rustyline" @@ -5488,9 +5542,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "3.2.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" +checksum = "80fb1d92c5028aa318b4b8bd7302a5bfcf48be96a37fc6fc790f806b0004ee0c" dependencies = [ "bitflags 2.9.1", "core-foundation 0.10.1", @@ -5520,6 +5574,7 @@ dependencies = [ "candle-transformers", "chrono", "dirs 5.0.1", + "glob", "hnsw_rs", "indicatif", "rayon", @@ -5593,9 +5648,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.140" +version = "1.0.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" 
+checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7" dependencies = [ "indexmap", "itoa", @@ -5729,9 +5784,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.5" +version = "1.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" dependencies = [ "libc", ] @@ -5766,7 +5821,7 @@ dependencies = [ "indexmap", "log", "nix 0.29.0", - "rand 0.9.1", + "rand 0.9.2", "rayon", "regex", "shell-quote", @@ -5781,9 +5836,9 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "smallvec" @@ -5801,6 +5856,16 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "socket2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + [[package]] name = "spinners" version = "4.1.1" @@ -5830,6 +5895,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "stop-words" version = "0.8.1" @@ -5871,11 +5942,11 @@ checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" [[package]] name = "strum" -version = "0.27.1" +version = "0.27.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f64def088c51c9510a8579e3c5d67c65349dcf755e5479ad3d010aa6454e2c32" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" dependencies = [ - "strum_macros 0.27.1", + "strum_macros 0.27.2", ] [[package]] @@ -5906,14 +5977,13 @@ dependencies = [ [[package]] name = "strum_macros" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c77a8c5abcaf0f9ce05d62342b7d298c346515365c36b673df4ebe3ced01fde8" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "rustversion", "syn 2.0.104", ] @@ -6068,7 +6138,7 @@ dependencies = [ "fastrand", "getrandom 0.3.3", "once_cell", - "rustix 1.0.7", + "rustix 1.0.8", "windows-sys 0.59.0", ] @@ -6098,7 +6168,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "45c6481c4829e4cc63825e62c49186a34538b7b2750b73b266581ffb612fb5ed" dependencies = [ - "rustix 1.0.7", + "rustix 1.0.8", "windows-sys 0.59.0", ] @@ -6246,23 +6316,25 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokenizers" -version = "0.21.1" +version = "0.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3169b3195f925496c895caee7978a335d49218488ef22375267fba5a46a40bd7" +checksum = "a620b996116a59e184c2fa2dfd8251ea34a36d0a514758c6f966386bd2e03476" dependencies = [ + "ahash", "aho-corasick", + "compact_str", + "dary_heap", "derive_builder", "esaxx-rs", - "getrandom 0.2.16", + "getrandom 0.3.3", "indicatif", - "itertools 0.13.0", - "lazy_static", + "itertools 0.14.0", "log", "macro_rules_attribute", "monostate", "onig", "paste", - "rand 0.8.5", + "rand 0.9.2", "rayon", "rayon-cond", "regex", @@ -6278,20 +6350,22 @@ dependencies = [ [[package]] name = "tokio" -version = "1.45.1" +version = "1.47.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "75ef51a33ef1da925cea3e4eb122833cb377c61439ca401b770f54902b806779" +checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" dependencies = [ "backtrace", "bytes", + "io-uring", "libc", "mio", "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2", + "slab", + "socket2 0.6.0", "tokio-macros", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -6321,7 +6395,7 @@ version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" dependencies = [ - "rustls 0.23.28", + "rustls 0.23.31", "tokio", ] @@ -6350,9 +6424,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.15" +version = "0.7.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" +checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" dependencies = [ "bytes", "futures-core", @@ -6394,7 +6468,7 @@ dependencies = [ "serde_spanned", "toml_datetime", "toml_write", - "winnow 0.7.11", + "winnow 0.7.12", ] [[package]] @@ -6570,11 +6644,10 @@ dependencies = [ [[package]] name = "tree_magic_mini" -version = "3.1.6" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aac5e8971f245c3389a5a76e648bfc80803ae066a1243a75db0064d7c1129d63" +checksum = "f943391d896cdfe8eec03a04d7110332d445be7df856db382dd96a730667562c" dependencies = [ - "fnv", "memchr", "nom", "once_cell", @@ -6612,7 +6685,7 @@ dependencies = [ "http 1.3.1", "httparse", "log", - "rand 0.9.1", + "rand 0.9.2", "sha1", "thiserror 2.0.12", "utf-8", @@ -6781,7 +6854,7 @@ checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" dependencies = [ "getrandom 0.3.3", "js-sys", - "rand 0.9.1", + "rand 0.9.2", "serde", "wasm-bindgen", ] @@ -6975,34 +7048,34 @@ 
dependencies = [ [[package]] name = "wayland-backend" -version = "0.3.10" +version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe770181423e5fc79d3e2a7f4410b7799d5aab1de4372853de3c6aa13ca24121" +checksum = "673a33c33048a5ade91a6b139580fa174e19fb0d23f396dca9fa15f2e1e49b35" dependencies = [ "cc", "downcast-rs", - "rustix 0.38.44", + "rustix 1.0.8", "smallvec", "wayland-sys", ] [[package]] name = "wayland-client" -version = "0.31.10" +version = "0.31.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978fa7c67b0847dbd6a9f350ca2569174974cd4082737054dbb7fbb79d7d9a61" +checksum = "c66a47e840dc20793f2264eb4b3e4ecb4b75d91c0dd4af04b456128e0bdd449d" dependencies = [ "bitflags 2.9.1", - "rustix 0.38.44", + "rustix 1.0.8", "wayland-backend", "wayland-scanner", ] [[package]] name = "wayland-protocols" -version = "0.32.8" +version = "0.32.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "779075454e1e9a521794fed15886323ea0feda3f8b0fc1390f5398141310422a" +checksum = "efa790ed75fbfd71283bd2521a1cfdc022aabcc28bdcff00851f9e4ae88d9901" dependencies = [ "bitflags 2.9.1", "wayland-backend", @@ -7012,9 +7085,9 @@ dependencies = [ [[package]] name = "wayland-protocols-wlr" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cb6cdc73399c0e06504c437fe3cf886f25568dd5454473d565085b36d6a8bbf" +checksum = "efd94963ed43cf9938a090ca4f7da58eb55325ec8200c3848963e98dc25b78ec" dependencies = [ "bitflags 2.9.1", "wayland-backend", @@ -7025,20 +7098,20 @@ dependencies = [ [[package]] name = "wayland-scanner" -version = "0.31.6" +version = "0.31.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "896fdafd5d28145fce7958917d69f2fd44469b1d4e861cb5961bcbeebc6d1484" +checksum = "54cb1e9dc49da91950bdfd8b848c49330536d9d1fb03d4bfec8cae50caa50ae3" dependencies = [ "proc-macro2", - "quick-xml", + "quick-xml 0.37.5", 
"quote", ] [[package]] name = "wayland-sys" -version = "0.31.6" +version = "0.31.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbcebb399c77d5aa9fa5db874806ee7b4eba4e73650948e8f93963f128896615" +checksum = "34949b42822155826b41db8e5d0c1be3a2bd296c747577a43a3e6daefc296142" dependencies = [ "pkg-config", ] @@ -7074,9 +7147,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8782dd5a41a24eed3a4f40b606249b3e236ca61adf1f25ea4d45c73de122b502" +checksum = "7e8983c3ab33d6fb807cfcdad2491c4ea8cbc8ed839181c7dfd9c67c83e261b2" dependencies = [ "rustls-pki-types", ] @@ -7101,7 +7174,7 @@ checksum = "24d643ce3fd3e5b54854602a080f34fb10ab75e0b813ee32d00ca2b44fa74762" dependencies = [ "either", "env_home", - "rustix 1.0.7", + "rustix 1.0.8", "winsafe", ] @@ -7113,11 +7186,11 @@ checksum = "0b9aa3ad29c3d08283ac6b769e3ec15ad1ddb88af7d2e9bc402c574973b937e7" [[package]] name = "whoami" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6994d13118ab492c3c80c1f81928718159254c53c472bf9ce36f8dae4add02a7" +checksum = "5d4a4db5077702ca3015d3d02d74974948aba2ad9e12ab7df718ee64ccd7e97d" dependencies = [ - "redox_syscall", + "libredox", "wasite", "web-sys", ] @@ -7400,7 +7473,7 @@ version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.2", + "windows-targets 0.53.3", ] [[package]] @@ -7436,10 +7509,11 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.53.2" +version = "0.53.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" dependencies = [ + 
"windows-link", "windows_aarch64_gnullvm 0.53.0", "windows_aarch64_msvc 0.53.0", "windows_i686_gnu 0.53.0", @@ -7608,9 +7682,9 @@ dependencies = [ [[package]] name = "winnow" -version = "0.7.11" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74c7b26e3480b707944fc872477815d29a8e429d2f93a1ce000f5fa84a15cbcd" +checksum = "f3edebf492c8125044983378ecb5766203ad3b4c2f7a922bd7dd207f6d443e95" dependencies = [ "memchr", ] @@ -7811,9 +7885,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.2" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" dependencies = [ "yoke 0.8.0", "zerofrom", diff --git a/crates/chat-cli/src/cli/chat/cli/knowledge.rs b/crates/chat-cli/src/cli/chat/cli/knowledge.rs index 987eda059..7312330ad 100644 --- a/crates/chat-cli/src/cli/chat/cli/knowledge.rs +++ b/crates/chat-cli/src/cli/chat/cli/knowledge.rs @@ -29,13 +29,21 @@ pub enum KnowledgeSubcommand { /// Display the knowledge base contents Show, /// Add a file or directory to knowledge base - Add { path: String }, - /// Remove specified knowledge context by path + Add { + path: String, + /// Include patterns (e.g., `**/*.ts`, `**/*.md`) + #[arg(long, action = clap::ArgAction::Append)] + include: Vec, + /// Exclude patterns (e.g., `node_modules/**`, `target/**`) + #[arg(long, action = clap::ArgAction::Append)] + exclude: Vec, + }, + /// Remove specified knowledge base entry by path #[command(alias = "rm")] Remove { path: String }, /// Update a file or directory in knowledge base Update { path: String }, - /// Remove all knowledge contexts + /// Remove all knowledge base entries Clear, /// Show background operation status Status, @@ -79,7 +87,9 @@ impl KnowledgeSubcommand { queue!( session.stderr, style::SetForegroundColor(Color::Red), - 
style::Print("\nKnowledge tool is disabled. Enable it with: q settings chat.enableKnowledge true\n\n"), + style::Print("\nKnowledge tool is disabled. Enable it with: q settings chat.enableKnowledge true\n"), + style::SetForegroundColor(Color::Yellow), + style::Print("πŸ’‘ Your knowledge base data is preserved and will be available when re-enabled.\n\n"), style::SetForegroundColor(Color::Reset) ) } @@ -93,56 +103,67 @@ impl KnowledgeSubcommand { async fn execute_operation(&self, os: &Os, session: &mut ChatSession) -> OperationResult { match self { KnowledgeSubcommand::Show => { - match Self::handle_show(session).await { + match Self::handle_show(os, session).await { Ok(_) => OperationResult::Info("".to_string()), // Empty Info, formatting already done - Err(e) => OperationResult::Error(format!("Failed to show contexts: {}", e)), + Err(e) => OperationResult::Error(format!("Failed to show knowledge base entries: {}", e)), } }, - KnowledgeSubcommand::Add { path } => Self::handle_add(os, path).await, + KnowledgeSubcommand::Add { path, include, exclude } => Self::handle_add(os, path, include, exclude).await, KnowledgeSubcommand::Remove { path } => Self::handle_remove(os, path).await, KnowledgeSubcommand::Update { path } => Self::handle_update(os, path).await, - KnowledgeSubcommand::Clear => Self::handle_clear(session).await, - KnowledgeSubcommand::Status => Self::handle_status().await, - KnowledgeSubcommand::Cancel { operation_id } => Self::handle_cancel(operation_id.as_deref()).await, + KnowledgeSubcommand::Clear => Self::handle_clear(os, session).await, + KnowledgeSubcommand::Status => Self::handle_status(os).await, + KnowledgeSubcommand::Cancel { operation_id } => Self::handle_cancel(os, operation_id.as_deref()).await, } } - async fn handle_show(session: &mut ChatSession) -> Result<(), std::io::Error> { - let async_knowledge_store = KnowledgeStore::get_async_instance().await; - let store = async_knowledge_store.lock().await; - - // Use the async get_all method which 
is concurrent with indexing - let contexts = store.get_all().await.unwrap_or_else(|e| { - // Write error to output using queue system - let _ = queue!( - session.stderr, - style::SetForegroundColor(Color::Red), - style::Print(&format!("Error getting contexts: {}\n", e)), - style::ResetColor - ); - Vec::new() - }); - - Self::format_contexts(session, &contexts) + async fn handle_show(os: &Os, session: &mut ChatSession) -> Result<(), std::io::Error> { + match KnowledgeStore::get_async_instance_with_os(os).await { + Ok(store) => { + let store = store.lock().await; + let entries = store.get_all().await.unwrap_or_else(|e| { + let _ = queue!( + session.stderr, + style::SetForegroundColor(Color::Red), + style::Print(&format!("Error getting knowledge base entries: {}\n", e)), + style::ResetColor + ); + Vec::new() + }); + let _ = Self::format_knowledge_entries(session, &entries); + }, + Err(e) => { + queue!( + session.stderr, + style::SetForegroundColor(Color::Red), + style::Print(&format!("Error accessing knowledge base: {}\n", e)), + style::SetForegroundColor(Color::Reset) + )?; + }, + } + Ok(()) } - fn format_contexts(session: &mut ChatSession, contexts: &[KnowledgeContext]) -> Result<(), std::io::Error> { - if contexts.is_empty() { + fn format_knowledge_entries( + session: &mut ChatSession, + knowledge_entries: &[KnowledgeContext], + ) -> Result<(), std::io::Error> { + if knowledge_entries.is_empty() { queue!( session.stderr, style::Print("\nNo knowledge base entries found.\n"), - style::Print("πŸ’‘ Tip: If indexing is in progress, contexts may not appear until indexing completes.\n"), + style::Print("πŸ’‘ Tip: If indexing is in progress, entries may not appear until indexing completes.\n"), style::Print(" Use 'knowledge status' to check active operations.\n\n") )?; } else { queue!( session.stderr, - style::Print("\nπŸ“š Knowledge Base Contexts:\n"), + style::Print("\nπŸ“š Knowledge Base Entries:\n"), style::Print(format!("{}\n", "━".repeat(80))) )?; - for context in 
contexts { - Self::format_single_context(session, &context)?; + for entry in knowledge_entries { + Self::format_single_entry(session, &entry)?; queue!(session.stderr, style::Print(format!("{}\n", "━".repeat(80))))?; } // Add final newline to match original formatting exactly @@ -151,32 +172,32 @@ impl KnowledgeSubcommand { Ok(()) } - fn format_single_context(session: &mut ChatSession, context: &&KnowledgeContext) -> Result<(), std::io::Error> { + fn format_single_entry(session: &mut ChatSession, entry: &&KnowledgeContext) -> Result<(), std::io::Error> { queue!( session.stderr, style::SetAttribute(style::Attribute::Bold), style::SetForegroundColor(Color::Cyan), - style::Print(format!("πŸ“‚ {}: ", context.id)), + style::Print(format!("πŸ“‚ {}: ", entry.id)), style::SetForegroundColor(Color::Green), - style::Print(&context.name), + style::Print(&entry.name), style::SetAttribute(style::Attribute::Reset), style::Print("\n") )?; queue!( session.stderr, - style::Print(format!(" Description: {}\n", context.description)), + style::Print(format!(" Description: {}\n", entry.description)), style::Print(format!( " Created: {}\n", - context.created_at.format("%Y-%m-%d %H:%M:%S") + entry.created_at.format("%Y-%m-%d %H:%M:%S") )), style::Print(format!( " Updated: {}\n", - context.updated_at.format("%Y-%m-%d %H:%M:%S") + entry.updated_at.format("%Y-%m-%d %H:%M:%S") )) )?; - if let Some(path) = &context.source_path { + if let Some(path) = &entry.source_path { queue!(session.stderr, style::Print(format!(" Source: {}\n", path)))?; } @@ -184,12 +205,12 @@ impl KnowledgeSubcommand { session.stderr, style::Print(" Items: "), style::SetForegroundColor(Color::Yellow), - style::Print(format!("{}", context.item_count)), + style::Print(format!("{}", entry.item_count)), style::SetForegroundColor(Color::Reset), style::Print(" | Persistent: ") )?; - if context.persistent { + if entry.persistent { queue!( session.stderr, style::SetForegroundColor(Color::Green), @@ -210,33 +231,58 @@ impl 
KnowledgeSubcommand { } /// Handle add operation - async fn handle_add(os: &Os, path: &str) -> OperationResult { + async fn handle_add( + os: &Os, + path: &str, + include_patterns: &[String], + exclude_patterns: &[String], + ) -> OperationResult { match Self::validate_and_sanitize_path(os, path) { Ok(sanitized_path) => { - let async_knowledge_store = KnowledgeStore::get_async_instance().await; + let async_knowledge_store = match KnowledgeStore::get_async_instance_with_os(os).await { + Ok(store) => store, + Err(e) => return OperationResult::Error(format!("Error accessing knowledge base: {}", e)), + }; let mut store = async_knowledge_store.lock().await; - // Use the async add method which is fire-and-forget - match store.add(path, &sanitized_path).await { + let options = if include_patterns.is_empty() && exclude_patterns.is_empty() { + crate::util::knowledge_store::AddOptions::with_db_defaults(os) + } else { + crate::util::knowledge_store::AddOptions::new() + .with_include_patterns(include_patterns.to_vec()) + .with_exclude_patterns(exclude_patterns.to_vec()) + }; + + match store.add(path, &sanitized_path.clone(), options).await { Ok(message) => OperationResult::Info(message), - Err(e) => OperationResult::Error(format!("Failed to add to knowledge base: {}", e)), + Err(e) => { + if e.contains("Invalid include pattern") || e.contains("Invalid exclude pattern") { + OperationResult::Error(e) + } else { + OperationResult::Error(format!("Failed to add: {}", e)) + } + }, } }, - Err(e) => OperationResult::Error(e), + Err(e) => OperationResult::Error(format!("Invalid path: {}", e)), } } /// Handle remove operation async fn handle_remove(os: &Os, path: &str) -> OperationResult { let sanitized_path = sanitize_path_tool_arg(os, path); - let async_knowledge_store = KnowledgeStore::get_async_instance().await; + + let async_knowledge_store = match KnowledgeStore::get_async_instance_with_os(os).await { + Ok(store) => store, + Err(e) => return OperationResult::Error(format!("Error 
accessing knowledge base: {}", e)), + }; let mut store = async_knowledge_store.lock().await; // Try path first, then name if store.remove_by_path(&sanitized_path.to_string_lossy()).await.is_ok() { - OperationResult::Success(format!("Removed context with path '{}'", path)) + OperationResult::Success(format!("Removed knowledge base entry with path '{}'", path)) } else if store.remove_by_name(path).await.is_ok() { - OperationResult::Success(format!("Removed context with name '{}'", path)) + OperationResult::Success(format!("Removed knowledge base entry with name '{}'", path)) } else { OperationResult::Warning(format!("Entry not found in knowledge base: {}", path)) } @@ -246,7 +292,12 @@ impl KnowledgeSubcommand { async fn handle_update(os: &Os, path: &str) -> OperationResult { match Self::validate_and_sanitize_path(os, path) { Ok(sanitized_path) => { - let async_knowledge_store = KnowledgeStore::get_async_instance().await; + let async_knowledge_store = match KnowledgeStore::get_async_instance_with_os(os).await { + Ok(store) => store, + Err(e) => { + return OperationResult::Error(format!("Error accessing knowledge base directory: {}", e)); + }, + }; let mut store = async_knowledge_store.lock().await; match store.update_by_path(&sanitized_path).await { @@ -259,11 +310,12 @@ impl KnowledgeSubcommand { } /// Handle clear operation - async fn handle_clear(session: &mut ChatSession) -> OperationResult { + async fn handle_clear(os: &Os, session: &mut ChatSession) -> OperationResult { // Require confirmation queue!( session.stderr, - style::Print("⚠️ This will remove ALL knowledge base entries. Are you sure? (y/N): ") + style::Print("⚠️ This action will remove all knowledge base entries.\n"), + style::Print("Clear the knowledge base? 
(y/N): ") ) .unwrap(); session.stderr.flush().unwrap(); @@ -278,7 +330,10 @@ impl KnowledgeSubcommand { return OperationResult::Info("Clear operation cancelled".to_string()); } - let async_knowledge_store = KnowledgeStore::get_async_instance().await; + let async_knowledge_store = match KnowledgeStore::get_async_instance_with_os(os).await { + Ok(store) => store, + Err(e) => return OperationResult::Error(format!("Error accessing knowledge base directory: {}", e)), + }; let mut store = async_knowledge_store.lock().await; // First, cancel any pending operations @@ -308,8 +363,11 @@ impl KnowledgeSubcommand { } /// Handle status operation - async fn handle_status() -> OperationResult { - let async_knowledge_store = KnowledgeStore::get_async_instance().await; + async fn handle_status(os: &Os) -> OperationResult { + let async_knowledge_store = match KnowledgeStore::get_async_instance_with_os(os).await { + Ok(store) => store, + Err(e) => return OperationResult::Error(format!("Error accessing knowledge base directory: {}", e)), + }; let store = async_knowledge_store.lock().await; match store.get_status_data().await { @@ -325,9 +383,9 @@ impl KnowledgeSubcommand { fn format_status_display(status: &SystemStatus) -> String { let mut status_lines = Vec::new(); - // Show context summary + // Show knowledge base summary status_lines.push(format!( - "πŸ“š Total contexts: {} ({} persistent, {} volatile)", + "πŸ“š Total knowledge base entries: {} ({} persistent, {} volatile)", status.total_contexts, status.persistent_contexts, status.volatile_contexts )); @@ -416,8 +474,11 @@ impl KnowledgeSubcommand { } /// Handle cancel operation - async fn handle_cancel(operation_id: Option<&str>) -> OperationResult { - let async_knowledge_store = KnowledgeStore::get_async_instance().await; + async fn handle_cancel(os: &Os, operation_id: Option<&str>) -> OperationResult { + let async_knowledge_store = match KnowledgeStore::get_async_instance_with_os(os).await { + Ok(store) => store, + Err(e) => 
return OperationResult::Error(format!("Error accessing knowledge base directory: {}", e)), + }; let mut store = async_knowledge_store.lock().await; match store.cancel_operation(operation_id).await { @@ -491,3 +552,125 @@ impl KnowledgeSubcommand { } } } + +#[cfg(test)] +mod tests { + use clap::Parser; + + use super::*; + + #[derive(Parser)] + #[command(name = "test")] + struct TestCli { + #[command(subcommand)] + knowledge: KnowledgeSubcommand, + } + + #[test] + fn test_include_exclude_patterns_parsing() { + // Test that include and exclude patterns are parsed correctly + let result = TestCli::try_parse_from(&[ + "test", + "add", + "/some/path", + "--include", + "*.rs", + "--include", + "**/*.md", + "--exclude", + "node_modules/**", + "--exclude", + "target/**", + ]); + + assert!(result.is_ok()); + let cli = result.unwrap(); + + if let KnowledgeSubcommand::Add { path, include, exclude } = cli.knowledge { + assert_eq!(path, "/some/path"); + assert_eq!(include, vec!["*.rs", "**/*.md"]); + assert_eq!(exclude, vec!["node_modules/**", "target/**"]); + } else { + panic!("Expected Add subcommand"); + } + } + + #[test] + fn test_clap_markdown_parsing_issue() { + let help_result = TestCli::try_parse_from(&["test", "add", "--help"]); + match help_result { + Err(err) if err.kind() == clap::error::ErrorKind::DisplayHelp => { + // This is expected for --help + // The actual issue would be visible in the help text formatting + // We can't easily test the exact formatting here, but this documents the issue + }, + _ => panic!("Expected help output"), + } + } + + #[test] + fn test_empty_patterns_allowed() { + // Test that commands work without any patterns + let result = TestCli::try_parse_from(&["test", "add", "/some/path"]); + assert!(result.is_ok()); + + let cli = result.unwrap(); + if let KnowledgeSubcommand::Add { path, include, exclude } = cli.knowledge { + assert_eq!(path, "/some/path"); + assert!(include.is_empty()); + assert!(exclude.is_empty()); + } else { + 
panic!("Expected Add subcommand"); + } + } + + #[test] + fn test_multiple_include_patterns() { + // Test multiple include patterns + let result = TestCli::try_parse_from(&[ + "test", + "add", + "/some/path", + "--include", + "*.rs", + "--include", + "*.md", + "--include", + "*.txt", + ]); + + assert!(result.is_ok()); + let cli = result.unwrap(); + + if let KnowledgeSubcommand::Add { include, .. } = cli.knowledge { + assert_eq!(include, vec!["*.rs", "*.md", "*.txt"]); + } else { + panic!("Expected Add subcommand"); + } + } + + #[test] + fn test_multiple_exclude_patterns() { + // Test multiple exclude patterns + let result = TestCli::try_parse_from(&[ + "test", + "add", + "/some/path", + "--exclude", + "node_modules/**", + "--exclude", + "target/**", + "--exclude", + ".git/**", + ]); + + assert!(result.is_ok()); + let cli = result.unwrap(); + + if let KnowledgeSubcommand::Add { exclude, .. } = cli.knowledge { + assert_eq!(exclude, vec!["node_modules/**", "target/**", ".git/**"]); + } else { + panic!("Expected Add subcommand"); + } + } +} diff --git a/crates/chat-cli/src/cli/chat/tools/knowledge.rs b/crates/chat-cli/src/cli/chat/tools/knowledge.rs index 7e9bcef21..191fbb86d 100644 --- a/crates/chat-cli/src/cli/chat/tools/knowledge.rs +++ b/crates/chat-cli/src/cli/chat/tools/knowledge.rs @@ -124,7 +124,9 @@ impl Knowledge { Knowledge::Update(update) => { // Require at least one identifier (context_id or name) if update.context_id.is_empty() && update.name.is_empty() && update.path.is_empty() { - eyre::bail!("Please provide either context_id or name or path to identify the context to update"); + eyre::bail!( + "Please provide either context_id, name, or path to identify the knowledge base entry to update" + ); } // Validate the path exists @@ -310,8 +312,10 @@ impl Knowledge { } pub async fn invoke(&self, os: &Os, _updates: &mut impl Write) -> Result { - // Get the async knowledge store singleton - let async_knowledge_store = KnowledgeStore::get_async_instance().await; 
+ // Get the async knowledge store singleton with OS-aware directory + let async_knowledge_store = KnowledgeStore::get_async_instance_with_os(os) + .await + .map_err(|e| eyre::eyre!("Failed to access knowledge base: {}", e))?; let mut store = async_knowledge_store.lock().await; let result = match self { @@ -325,7 +329,14 @@ impl Knowledge { add.value.clone() }; - match store.add(&add.name, &value_to_use).await { + match store + .add( + &add.name, + &value_to_use, + crate::util::knowledge_store::AddOptions::with_db_defaults(os), + ) + .await + { Ok(context_id) => format!( "Added '{}' to knowledge base with ID: {}. Track active jobs in '/knowledge status' with provided id.", add.name, context_id @@ -412,23 +423,24 @@ impl Knowledge { .await .unwrap_or_else(|e| format!("Failed to clear knowledge base: {}", e)), Knowledge::Search(search) => { - // Only use a spinner for search, not a full progress bar let results = store.search(&search.query, search.context_id.as_deref()).await; match results { Ok(results) => { if results.is_empty() { - "No matching entries found in knowledge base".to_string() + format!("No matching entries found for query: \"{}\"", search.query) } else { - let mut output = String::from("Search results:\n"); + let mut output = format!("Search results for \"{}\":\n\n", search.query); for result in results { if let Some(text) = result.text() { - output.push_str(&format!("- {}\n", text)); + output.push_str(&format!("{}\n\n", text)); } } output } }, - Err(e) => format!("Search failed: {}", e), + Err(e) => { + format!("Search failed: {}", e) + }, } }, Knowledge::Show => { diff --git a/crates/chat-cli/src/database/settings.rs b/crates/chat-cli/src/database/settings.rs index 8cfba1f07..519ac445b 100644 --- a/crates/chat-cli/src/database/settings.rs +++ b/crates/chat-cli/src/database/settings.rs @@ -22,6 +22,11 @@ pub enum Setting { ShareCodeWhispererContent, EnabledThinking, EnabledKnowledge, + KnowledgeDefaultIncludePatterns, + 
KnowledgeDefaultExcludePatterns, + KnowledgeMaxFiles, + KnowledgeChunkSize, + KnowledgeChunkOverlap, SkimCommandKey, ChatGreetingEnabled, ApiTimeout, @@ -47,6 +52,11 @@ impl AsRef for Setting { Self::ShareCodeWhispererContent => "codeWhisperer.shareCodeWhispererContentWithAWS", Self::EnabledThinking => "chat.enableThinking", Self::EnabledKnowledge => "chat.enableKnowledge", + Self::KnowledgeDefaultIncludePatterns => "knowledge.defaultIncludePatterns", + Self::KnowledgeDefaultExcludePatterns => "knowledge.defaultExcludePatterns", + Self::KnowledgeMaxFiles => "knowledge.maxFiles", + Self::KnowledgeChunkSize => "knowledge.chunkSize", + Self::KnowledgeChunkOverlap => "knowledge.chunkOverlap", Self::SkimCommandKey => "chat.skimCommandKey", Self::ChatGreetingEnabled => "chat.greeting.enabled", Self::ApiTimeout => "api.timeout", @@ -82,6 +92,11 @@ impl TryFrom<&str> for Setting { "codeWhisperer.shareCodeWhispererContentWithAWS" => Ok(Self::ShareCodeWhispererContent), "chat.enableThinking" => Ok(Self::EnabledThinking), "chat.enableKnowledge" => Ok(Self::EnabledKnowledge), + "knowledge.defaultIncludePatterns" => Ok(Self::KnowledgeDefaultIncludePatterns), + "knowledge.defaultExcludePatterns" => Ok(Self::KnowledgeDefaultExcludePatterns), + "knowledge.maxFiles" => Ok(Self::KnowledgeMaxFiles), + "knowledge.chunkSize" => Ok(Self::KnowledgeChunkSize), + "knowledge.chunkOverlap" => Ok(Self::KnowledgeChunkOverlap), "chat.skimCommandKey" => Ok(Self::SkimCommandKey), "chat.greeting.enabled" => Ok(Self::ChatGreetingEnabled), "api.timeout" => Ok(Self::ApiTimeout), @@ -166,6 +181,10 @@ impl Settings { self.get(key).and_then(|value| value.as_i64()) } + pub fn get_int_or(&self, key: Setting, default: usize) -> usize { + self.get_int(key).map_or(default, |v| v as usize) + } + pub async fn save_to_file(&self) -> Result<(), DatabaseError> { if cfg!(test) { return Ok(()); diff --git a/crates/chat-cli/src/util/directories.rs b/crates/chat-cli/src/util/directories.rs index d5fb2ed04..a726cf376 
100644 --- a/crates/chat-cli/src/util/directories.rs +++ b/crates/chat-cli/src/util/directories.rs @@ -185,6 +185,11 @@ pub fn chat_profiles_dir(os: &Os) -> Result { Ok(home_dir(os)?.join(".aws").join("amazonq").join("profiles")) } +/// The directory for knowledge base storage +pub fn knowledge_bases_dir(os: &Os) -> Result { + Ok(home_dir(os)?.join(".aws").join("amazonq").join("knowledge_bases")) +} + /// The path to the fig settings file pub fn settings_path() -> Result { Ok(fig_data_dir()?.join("settings.json")) diff --git a/crates/chat-cli/src/util/knowledge_store.rs b/crates/chat-cli/src/util/knowledge_store.rs index 23ef1345e..92370f7a8 100644 --- a/crates/chat-cli/src/util/knowledge_store.rs +++ b/crates/chat-cli/src/util/knowledge_store.rs @@ -1,3 +1,4 @@ +use std::path::PathBuf; use std::sync::{ Arc, LazyLock as Lazy, @@ -6,10 +7,74 @@ use std::sync::{ use eyre::Result; use semantic_search_client::KnowledgeContext; use semantic_search_client::client::AsyncSemanticSearchClient; -use semantic_search_client::types::SearchResult; +use semantic_search_client::types::{ + AddContextRequest, + SearchResult, +}; use tokio::sync::Mutex; +use tracing::debug; use uuid::Uuid; +use crate::os::Os; +use crate::util::directories; + +/// Configuration for adding knowledge contexts +#[derive(Default)] +pub struct AddOptions { + pub description: Option, + pub include_patterns: Vec, + pub exclude_patterns: Vec, +} + +impl AddOptions { + pub fn new() -> Self { + Self::default() + } + + /// Create AddOptions with DB default patterns + pub fn with_db_defaults(os: &crate::os::Os) -> Self { + let default_include = os + .database + .settings + .get(crate::database::settings::Setting::KnowledgeDefaultIncludePatterns) + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect::>() + }) + .unwrap_or_default(); + + let default_exclude = os + .database + .settings + 
.get(crate::database::settings::Setting::KnowledgeDefaultExcludePatterns) + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect::>() + }) + .unwrap_or_default(); + + Self { + description: None, + include_patterns: default_include, + exclude_patterns: default_exclude, + } + } + + pub fn with_include_patterns(mut self, patterns: Vec) -> Self { + self.include_patterns = patterns; + self + } + + pub fn with_exclude_patterns(mut self, patterns: Vec) -> Self { + self.exclude_patterns = patterns; + self + } +} + #[derive(Debug)] pub enum KnowledgeError { ClientError(String), @@ -31,14 +96,57 @@ pub struct KnowledgeStore { } impl KnowledgeStore { - /// Get singleton instance - pub async fn get_async_instance() -> Arc> { + /// Get singleton instance with directory from OS (includes migration) + pub async fn get_async_instance_with_os(os: &Os) -> Result>, directories::DirectoryError> { + let knowledge_dir = crate::util::directories::knowledge_bases_dir(os)?; + Self::migrate_legacy_knowledge_base(&knowledge_dir).await; + Ok(Self::get_async_instance_with_os_settings(os, knowledge_dir).await) + } + + /// Migrate legacy knowledge base from old location if needed + async fn migrate_legacy_knowledge_base(knowledge_dir: &PathBuf) { + let old_flat_dir = dirs::home_dir() + .unwrap_or_else(|| PathBuf::from(".")) + .join(".semantic_search"); + + if old_flat_dir.exists() && !knowledge_dir.exists() { + // Create parent directories first + if let Some(parent) = knowledge_dir.parent() { + if let Err(e) = std::fs::create_dir_all(parent) { + debug!( + "Warning: Failed to create parent directories for knowledge base migration: {}", + e + ); + return; + } + } + + // Attempt migration + if let Err(e) = std::fs::rename(&old_flat_dir, knowledge_dir) { + debug!( + "Warning: Failed to migrate legacy knowledge base from {} to {}: {}", + old_flat_dir.display(), + knowledge_dir.display(), + e + ); + } else { + println!( + "βœ… 
Migrated knowledge base from {} to {}", + old_flat_dir.display(), + knowledge_dir.display() + ); + } + } + } + + /// Get singleton instance with OS settings (primary method) + pub async fn get_async_instance_with_os_settings(os: &crate::os::Os, base_dir: PathBuf) -> Arc> { static ASYNC_INSTANCE: Lazy>>> = Lazy::new(tokio::sync::OnceCell::new); if cfg!(test) { Arc::new(Mutex::new( - KnowledgeStore::new() + KnowledgeStore::new_with_os_settings(os, base_dir) .await .expect("Failed to create test async knowledge store"), )) @@ -46,7 +154,7 @@ impl KnowledgeStore { ASYNC_INSTANCE .get_or_init(|| async { Arc::new(Mutex::new( - KnowledgeStore::new() + KnowledgeStore::new_with_os_settings(os, base_dir) .await .expect("Failed to create async knowledge store"), )) @@ -56,37 +164,126 @@ impl KnowledgeStore { } } - pub async fn new() -> Result { - let client = AsyncSemanticSearchClient::new_with_default_dir() + /// Create SemanticSearchConfig from database settings with fallbacks to defaults + fn create_config_from_db_settings( + os: &crate::os::Os, + base_dir: PathBuf, + ) -> semantic_search_client::config::SemanticSearchConfig { + use semantic_search_client::config::SemanticSearchConfig; + + use crate::database::settings::Setting; + + // Create default config first + let default_config = SemanticSearchConfig { + base_dir: base_dir.clone(), + ..Default::default() + }; + + // Override with DB settings if provided, otherwise use defaults + let chunk_size = os + .database + .settings + .get_int_or(Setting::KnowledgeChunkSize, default_config.chunk_size); + let chunk_overlap = os + .database + .settings + .get_int_or(Setting::KnowledgeChunkOverlap, default_config.chunk_overlap); + let max_files = os + .database + .settings + .get_int_or(Setting::KnowledgeMaxFiles, default_config.max_files); + + SemanticSearchConfig { + chunk_size, + chunk_overlap, + max_files, + base_dir, + ..default_config + } + } + + /// Create instance with database settings from OS + pub async fn 
new_with_os_settings(os: &crate::os::Os, base_dir: PathBuf) -> Result { + let config = Self::create_config_from_db_settings(os, base_dir.clone()); + let client = AsyncSemanticSearchClient::with_config(&base_dir, config) .await .map_err(|e| eyre::eyre!("Failed to create client: {}", e))?; Ok(Self { client }) } - /// Add context - delegates to async client - pub async fn add(&mut self, name: &str, path_str: &str) -> Result { + /// Add context with flexible options + pub async fn add(&mut self, name: &str, path_str: &str, options: AddOptions) -> Result { let path_buf = std::path::PathBuf::from(path_str); let canonical_path = path_buf .canonicalize() .map_err(|_io_error| format!("❌ Path does not exist: {}", path_str))?; - match self - .client - .add_context_from_path(&canonical_path, name, &format!("Knowledge context for {}", name), true) - .await - { - Ok((operation_id, _)) => Ok(format!( - "πŸš€ Started indexing '{}'\nπŸ“ Path: {}\nπŸ†” Operation ID: {}.", - name, - canonical_path.display(), - &operation_id.to_string()[..8] - )), - Err(e) => Err(format!("Failed to start indexing: {}", e)), + // Use provided description or generate default + let description = options + .description + .unwrap_or_else(|| format!("Knowledge context for {}", name)); + + // Create AddContextRequest with all options + let request = AddContextRequest { + path: canonical_path.clone(), + name: name.to_string(), + description: if !options.include_patterns.is_empty() || !options.exclude_patterns.is_empty() { + let mut full_description = description; + if !options.include_patterns.is_empty() { + full_description.push_str(&format!(" [Include: {}]", options.include_patterns.join(", "))); + } + if !options.exclude_patterns.is_empty() { + full_description.push_str(&format!(" [Exclude: {}]", options.exclude_patterns.join(", "))); + } + full_description + } else { + description + }, + persistent: true, + include_patterns: if options.include_patterns.is_empty() { + None + } else { + 
Some(options.include_patterns.clone()) + }, + exclude_patterns: if options.exclude_patterns.is_empty() { + None + } else { + Some(options.exclude_patterns.clone()) + }, + }; + + match self.client.add_context(request).await { + Ok((operation_id, _)) => { + let mut message = format!( + "πŸš€ Started indexing '{}'\nπŸ“ Path: {}\nπŸ†” Operation ID: {}", + name, + canonical_path.display(), + &operation_id.to_string()[..8] + ); + if !options.include_patterns.is_empty() || !options.exclude_patterns.is_empty() { + message.push_str("\nπŸ“‹ Pattern filtering applied:"); + if !options.include_patterns.is_empty() { + message.push_str(&format!("\n Include: {}", options.include_patterns.join(", "))); + } + if !options.exclude_patterns.is_empty() { + message.push_str(&format!("\n Exclude: {}", options.exclude_patterns.join(", "))); + } + message.push_str("\nβœ… Only matching files will be indexed"); + } + Ok(message) + }, + Err(e) => { + let error_msg = e.to_string(); + if error_msg.contains("Invalid include pattern") || error_msg.contains("Invalid exclude pattern") { + Err(error_msg) + } else { + Err(format!("Failed to start indexing: {}", e)) + } + }, } } - /// Get all contexts - delegates to async client pub async fn get_all(&self) -> Result, KnowledgeError> { Ok(self.client.get_contexts().await) } @@ -142,8 +339,11 @@ impl KnowledgeStore { } } } else { - // Cancel all operations - self.client.cancel_all_operations().await.map_err(|e| e.to_string()) + // Cancel most recent operation (not all operations) + self.client + .cancel_most_recent_operation() + .await + .map_err(|e| e.to_string()) } } @@ -207,8 +407,13 @@ impl KnowledgeStore { .await .map_err(|e| e.to_string())?; - // Then add it back with the same name - self.add(&context.name, path_str).await + // Then add it back with the same name and original patterns + let options = AddOptions { + description: None, + include_patterns: context.include_patterns.clone(), + exclude_patterns: context.exclude_patterns.clone(), + }; + 
self.add(&context.name, path_str, options).await } else { // Debug: List all available contexts let available_paths = self.client.list_context_paths().await; @@ -240,8 +445,13 @@ impl KnowledgeStore { .await .map_err(|e| e.to_string())?; - // Then add it back with the same name - self.add(&context_name, path_str).await + // Then add it back with the same name and original patterns + let options = AddOptions { + description: None, + include_patterns: context.include_patterns.clone(), + exclude_patterns: context.exclude_patterns.clone(), + }; + self.add(&context_name, path_str, options).await } /// Update context by name @@ -253,10 +463,59 @@ impl KnowledgeStore { .await .map_err(|e| e.to_string())?; - // Then add it back with the same name - self.add(name, path_str).await + // Then add it back with the same name and original patterns + let options = AddOptions { + description: None, + include_patterns: context.include_patterns.clone(), + exclude_patterns: context.exclude_patterns.clone(), + }; + self.add(name, path_str, options).await } else { Err(format!("Context with name '{}' not found", name)) } } } + +#[cfg(test)] +mod tests { + use tempfile::TempDir; + + use super::*; + use crate::os::Os; + + async fn create_test_os(temp_dir: &TempDir) -> Os { + let os = Os::new().await.unwrap(); + // Override home directory to use temp directory + unsafe { + os.env.set_var("HOME", temp_dir.path().to_str().unwrap()); + } + os + } + + #[tokio::test] + async fn test_create_config_from_db_settings() { + let temp_dir = TempDir::new().unwrap(); + let os = create_test_os(&temp_dir).await; + let base_dir = temp_dir.path().join("test_kb"); + + // Test config creation with default settings + let config = KnowledgeStore::create_config_from_db_settings(&os, base_dir.clone()); + + // Should use defaults when no database settings exist + assert_eq!(config.chunk_size, 512); // Default chunk size + assert_eq!(config.chunk_overlap, 128); // Default chunk overlap + assert_eq!(config.max_files, 
10000); // Default max files + assert_eq!(config.base_dir, base_dir); + } + + #[tokio::test] + async fn test_knowledge_bases_dir_structure() { + let temp_dir = TempDir::new().unwrap(); + let os = create_test_os(&temp_dir).await; + + let base_dir = crate::util::directories::knowledge_bases_dir(&os).unwrap(); + + // Verify directory structure + assert!(base_dir.to_string_lossy().contains("knowledge_bases")); + } +} diff --git a/crates/semantic-search-client/Cargo.toml b/crates/semantic-search-client/Cargo.toml index 9f6b09551..5024a5f4c 100644 --- a/crates/semantic-search-client/Cargo.toml +++ b/crates/semantic-search-client/Cargo.toml @@ -24,6 +24,7 @@ rayon.workspace = true tempfile.workspace = true tokio.workspace = true tokio-util.workspace = true +glob.workspace = true # Vector search library - pin to avoid edition2024 requirement hnsw_rs = "=0.3.1" diff --git a/crates/semantic-search-client/README.md b/crates/semantic-search-client/README.md index 3a8e87ba8..c4d94f9ac 100644 --- a/crates/semantic-search-client/README.md +++ b/crates/semantic-search-client/README.md @@ -7,17 +7,20 @@ Rust library for managing semantic memory contexts with vector embeddings, enabl ## Features +- **Async-First Design**: Built for modern async Rust applications with tokio - **Semantic Memory Management**: Create, store, and search through semantic memory contexts +- **Pattern Filtering**: Include/exclude files using glob-style patterns during indexing - **Vector Embeddings**: Generate high-quality text embeddings for semantic similarity search - **Multi-Platform Support**: Works on macOS, Windows, and Linux with optimized backends - **Hardware Acceleration**: Uses Metal on macOS and optimized backends on other platforms - **File Processing**: Process various file types including text, markdown, JSON, and code - **Persistent Storage**: Save contexts to disk for long-term storage and retrieval -- **Progress Tracking**: Detailed progress reporting for long-running operations +- 
**Background Processing**: Non-blocking indexing with progress tracking and cancellation - **Parallel Processing**: Efficiently process large directories with parallel execution - **Memory Efficient**: Stream large files and directories without excessive memory usage - **Cross-Platform Compatibility**: Fallback mechanisms for all platforms and architectures - **🆕 Configurable File Limits**: Built-in protection against indexing too many files (default: 5,000 files) +- **🆕 Database Settings Integration**: Configurable chunk sizes, overlap, and limits ## Installation @@ -31,30 +34,38 @@ semantic_search_client = "0.1.0" ## Quick Start ```rust -use semantic_search_client::{SemanticSearchClient, SemanticSearchConfig, Result}; -use std::path::Path; +use semantic_search_client::{AsyncSemanticSearchClient, AddContextRequest, Result}; +use std::path::PathBuf; -fn main() -> Result<()> { - // Create a new client with default settings (5,000 file limit) - let mut client = SemanticSearchClient::new_with_default_dir()?; +#[tokio::main] +async fn main() -> Result<()> { + // Create a new async client with default settings + let client = AsyncSemanticSearchClient::new_with_default_dir().await?; - // Add a context from a directory - let context_id = client.add_context_from_path( - Path::new("/path/to/project"), - "My Project", - "Code and documentation for my project", - true, // make it persistent - None, // no progress callback - )?; + // Add a context using the new structured request API + let request = AddContextRequest { + path: PathBuf::from("/path/to/project"), + name: "My Project".to_string(), + description: "Code and documentation for my project".to_string(), + persistent: true, + include_patterns: Some(vec!["**/*.rs".to_string(), "**/*.md".to_string()]), + exclude_patterns: Some(vec!["target/**".to_string(), "**/.git/**".to_string()]), + }; - // Search within the context - let results = client.search_context(&context_id, "implement authentication", 5)?; + let
(operation_id, _cancel_token) = client.add_context(request).await?; + println!("Started indexing with operation ID: {}", operation_id); + + // Search across all contexts + let results = client.search_all("implement authentication", None).await?; // Print the results - for result in results { - println!("Score: {}", result.distance); - if let Some(text) = result.text() { - println!("Text: {}", text); + for (context_id, context_results) in results { + println!("Results from context {}", context_id); + for result in context_results { + println!("Score: {}", result.distance); + if let Some(text) = result.text() { + println!("Text: {}", text); + } } } @@ -126,29 +137,18 @@ The default selection logic prioritizes performance where possible: ### Creating a Client ```rust -// With default settings (5,000 file limit) -let client = SemanticSearchClient::new_with_default_dir()?; +// Create async client with default settings +let client = AsyncSemanticSearchClient::new_with_default_dir().await?; // With custom directory -let client = SemanticSearchClient::new("/path/to/storage")?; +let client = AsyncSemanticSearchClient::new("/path/to/storage").await?; // With custom configuration let config = SemanticSearchConfig::default() .set_max_files(10000) // Allow up to 10,000 files .set_chunk_size(1024); // Custom chunk size -let client = SemanticSearchClient::with_config("/path/to/storage", config)?; - -// With specific embedding type -use semantic_search_client::embedding::EmbeddingType; -let client = SemanticSearchClient::new_with_embedding_type(EmbeddingType::Candle)?; - -// With both custom config and embedding type -let config = SemanticSearchConfig::with_max_files(15000); // 15,000 file limit -let client = SemanticSearchClient::with_config_and_embedding_type( - "/path/to/storage", - config, - EmbeddingType::Candle -)?; +let client = AsyncSemanticSearchClient::with_config("/path/to/storage", config).await?; +``` ``` ### Configuration Options @@ -210,52 +210,53 @@ let client = 
SemanticSearchClient::with_config(path, config)?; ### Adding Contexts ```rust -// From a file -let file_context_id = client.add_context_from_file( - "/path/to/document.md", - "Documentation", - "Project documentation", - true, // persistent - None, // no progress callback -)?; +use semantic_search_client::AddContextRequest; +use std::path::PathBuf; + +// Add context with pattern filtering +let request = AddContextRequest { + path: PathBuf::from("/path/to/codebase"), + name: "Rust Codebase".to_string(), + description: "Main Rust project source code".to_string(), + persistent: true, + include_patterns: Some(vec![ + "**/*.rs".to_string(), + "**/*.toml".to_string(), + "**/*.md".to_string(), + ]), + exclude_patterns: Some(vec![ + "target/**".to_string(), + "**/.git/**".to_string(), + "**/node_modules/**".to_string(), + ]), +}; -// From a directory with progress reporting -let dir_context_id = client.add_context_from_directory( - "/path/to/codebase", - "Codebase", - "Project source code", - true, // persistent - Some(|status| { - match status { - ProgressStatus::CountingFiles => println!("Counting files..."), - ProgressStatus::StartingIndexing(count) => println!("Starting indexing {} files", count), - ProgressStatus::Indexing(current, total) => - println!("Indexing file {}/{}", current, total), - ProgressStatus::CreatingSemanticContext => - println!("Creating semantic context..."), - ProgressStatus::GeneratingEmbeddings(current, total) => - println!("Generating embeddings {}/{}", current, total), - ProgressStatus::BuildingIndex => println!("Building index..."), - ProgressStatus::Finalizing => println!("Finalizing..."), - ProgressStatus::Complete => println!("Indexing complete!"), - } - }), -)?; +let (operation_id, cancel_token) = client.add_context(request).await?; +println!("Started indexing with operation ID: {}", operation_id); + +// Add context without pattern filtering +let simple_request = AddContextRequest { + path: PathBuf::from("/path/to/docs"), + name: 
"Documentation".to_string(), + description: "Project documentation".to_string(), + persistent: true, + include_patterns: None, + exclude_patterns: None, +}; -// From raw text -let text_context_id = client.add_context_from_text( - "This is some text to remember", - "Note", - "Important information", - false, // volatile -)?; +let (operation_id, _) = client.add_context(simple_request).await?; + +// Monitor progress (the client runs indexing in background) +let status = client.get_status_data().await?; +println!("Active operations: {}", status.active_count); +``` ``` ### Searching ```rust // Search across all contexts -let all_results = client.search_all("authentication implementation", 5)?; +let all_results = client.search_all("authentication implementation", None).await?; for (context_id, results) in all_results { println!("Results from context {}", context_id); for result in results { @@ -266,39 +267,102 @@ for (context_id, results) in all_results { } } -// Search in a specific context -let context_results = client.search_context( - &context_id, - "authentication implementation", - 5, -)?; +// Get all contexts first, then search specific ones +let contexts = client.get_contexts().await; +for context in &contexts { + println!("Available context: {} ({})", context.name, context.id); +} +``` ``` ### Managing Contexts ```rust // Get all contexts -let contexts = client.get_all_contexts(); +let contexts = client.get_contexts().await; for context in contexts { println!("Context: {} ({})", context.name, context.id); println!(" Description: {}", context.description); println!(" Created: {}", context.created_at); println!(" Items: {}", context.item_count); + println!(" Include patterns: {:?}", context.include_patterns); + println!(" Exclude patterns: {:?}", context.exclude_patterns); } -// Make a volatile context persistent -client.make_persistent( - &context_id, - "Saved Context", - "Important information saved for later", -)?; +// Remove contexts 
+client.remove_context_by_id("context-id").await?; +client.remove_context_by_name("Context Name").await?; +client.remove_context_by_path("/path/to/indexed/directory").await?; + +// Cancel ongoing operations +client.cancel_operation(operation_id).await?; +client.cancel_all_operations().await?; + +// Get system status +let status = client.get_status_data().await?; +println!("Total contexts: {}", status.total_contexts); +println!("Active operations: {}", status.active_count); +``` +``` -// Remove a context -client.remove_context_by_id(&context_id, true)?; // true to delete persistent storage -client.remove_context_by_name("My Context", true)?; -client.remove_context_by_path("/path/to/indexed/directory", true)?; +## Pattern Filtering + +The library supports glob-style pattern filtering to control which files are indexed: + +### Include Patterns +Only files matching these patterns will be indexed: +```rust +let request = AddContextRequest { + // ... other fields + include_patterns: Some(vec![ + "**/*.rs".to_string(), // All Rust files + "**/*.md".to_string(), // All Markdown files + "src/**/*.toml".to_string(), // TOML files in src directory + ]), + exclude_patterns: None, +}; ``` +### Exclude Patterns +Files matching these patterns will be skipped: +```rust +let request = AddContextRequest { + // ... other fields + include_patterns: None, + exclude_patterns: Some(vec![ + "target/**".to_string(), // Build artifacts + "**/.git/**".to_string(), // Git metadata + "**/node_modules/**".to_string(), // Node.js dependencies + "**/*.log".to_string(), // Log files + ]), +}; +``` + +### Combined Filtering +Use both include and exclude patterns for precise control: +```rust +let request = AddContextRequest { + // ... 
other fields + include_patterns: Some(vec![ + "**/*.rs".to_string(), + "**/*.toml".to_string(), + "**/*.md".to_string(), + ]), + exclude_patterns: Some(vec![ + "target/**".to_string(), + "**/tests/**".to_string(), // Skip test files + "**/*_test.rs".to_string(), // Skip test files + ]), +}; +``` + +### Pattern Syntax +- `**` matches any number of directories +- `*` matches any characters within a single path segment +- `?` matches a single character +- `[abc]` matches any character in the set +- `{a,b,c}` matches any of the alternatives + ## Advanced Features ### Custom Embedding Models @@ -327,6 +391,7 @@ let client = SemanticSearchClient::with_embedding_type( EmbeddingType::BM25, )?; ``` +``` ### Parallel Processing diff --git a/crates/semantic-search-client/src/client/async_implementation.rs b/crates/semantic-search-client/src/client/async_implementation.rs index a5de69c1e..2efdf6a69 100644 --- a/crates/semantic-search-client/src/client/async_implementation.rs +++ b/crates/semantic-search-client/src/client/async_implementation.rs @@ -32,9 +32,11 @@ use crate::error::{ SemanticSearchError, }; use crate::types::{ + AddContextRequest, ContextId, DataPoint, IndexingJob, + IndexingParams, KnowledgeContext, OperationHandle, OperationStatus, @@ -84,9 +86,68 @@ struct BackgroundWorker { const MAX_CONCURRENT_OPERATIONS: usize = 3; impl AsyncSemanticSearchClient { - /// Create a new async semantic search client + /// Create a new async semantic search client with custom config + pub async fn with_config(base_dir: impl AsRef, config: SemanticSearchConfig) -> Result { + let base_dir = base_dir.as_ref().to_path_buf(); + + tokio::fs::create_dir_all(&base_dir).await?; + + // Create models directory + crate::config::ensure_models_dir(&base_dir)?; + + // Pre-download models if the embedding type requires them + Self::ensure_models_downloaded(&config.embedding_type).await?; + + let embedder = embedder_factory::create_embedder(config.embedding_type)?; + + // Load metadata for 
persistent contexts + let contexts_file = base_dir.join("contexts.json"); + let persistent_contexts: HashMap = utils::load_json_from_file(&contexts_file)?; + + let contexts = Arc::new(RwLock::new(persistent_contexts)); + let volatile_contexts = Arc::new(RwLock::new(HashMap::new())); + let active_operations = Arc::new(RwLock::new(HashMap::new())); + let (job_tx, job_rx) = mpsc::unbounded_channel(); + + // Start background worker + let worker_embedder = embedder_factory::create_embedder(config.embedding_type)?; + let worker = BackgroundWorker { + job_rx, + contexts: contexts.clone(), + volatile_contexts: volatile_contexts.clone(), + active_operations: active_operations.clone(), + embedder: worker_embedder, + config: config.clone(), + base_dir: base_dir.clone(), + indexing_semaphore: Arc::new(Semaphore::new(MAX_CONCURRENT_OPERATIONS)), + }; + + tokio::spawn(worker.run()); + + let mut client = Self { + base_dir, + contexts, + volatile_contexts, + embedder, + config, + job_tx, + active_operations, + }; + + // Load all persistent contexts + client.load_persistent_contexts().await?; + + Ok(client) + } + + /// Create a new async semantic search client with default config pub async fn new(base_dir: impl AsRef) -> Result { - Self::with_embedding_type(base_dir, EmbeddingType::default()).await + let base_dir_path = base_dir.as_ref().to_path_buf(); + let config = SemanticSearchConfig { + base_dir: base_dir_path, + ..Default::default() + }; + Self::with_config(base_dir, config).await } /// Create a new semantic search client with the default base directory @@ -157,83 +218,28 @@ impl AsyncSemanticSearchClient { config::get_default_base_dir() } - /// Create a new async semantic search client with custom configuration and embedding type - pub async fn with_embedding_type(base_dir: impl AsRef, embedding_type: EmbeddingType) -> Result { - let base_dir = base_dir.as_ref().to_path_buf(); - tokio::fs::create_dir_all(&base_dir).await?; - - // Create models directory - 
config::ensure_models_dir(&base_dir)?; - - // Initialize the configuration - if let Err(e) = config::init_config(&base_dir) { - tracing::error!("Failed to initialize semantic search configuration: {}", e); - } - - // Pre-download models if the embedding type requires them - Self::ensure_models_downloaded(&embedding_type).await?; - - let embedder = embedder_factory::create_embedder(embedding_type)?; - - // Load metadata for persistent contexts - let contexts_file = base_dir.join("contexts.json"); - let persistent_contexts: HashMap = utils::load_json_from_file(&contexts_file)?; - - let contexts = Arc::new(RwLock::new(persistent_contexts)); - let volatile_contexts = Arc::new(RwLock::new(HashMap::new())); - let active_operations = Arc::new(RwLock::new(HashMap::new())); - let (job_tx, job_rx) = mpsc::unbounded_channel(); - - // Start background worker - we'll need to create a new embedder for the worker - // Models should already be downloaded by now - let worker_embedder = embedder_factory::create_embedder(embedding_type)?; - // Makes sure it respects configuration even if tweaked by user. 
- let loaded_config = config::get_config().clone(); - let worker = BackgroundWorker { - job_rx, - contexts: contexts.clone(), - volatile_contexts: volatile_contexts.clone(), - active_operations: active_operations.clone(), - embedder: worker_embedder, - config: loaded_config.clone(), - base_dir: base_dir.clone(), - indexing_semaphore: Arc::new(Semaphore::new(MAX_CONCURRENT_OPERATIONS)), - }; - - tokio::spawn(worker.run()); - - let mut client = Self { - base_dir, - contexts, - volatile_contexts, - embedder, - config: loaded_config, - job_tx, - active_operations, - }; - - // Load all persistent contexts - client.load_persistent_contexts().await?; - - Ok(client) - } - - /// Add a context from a path (async, cancellable) - pub async fn add_context_from_path( - &self, - path: impl AsRef, - name: &str, - description: &str, - persistent: bool, - ) -> Result<(Uuid, CancellationToken)> { - let path = path.as_ref(); - let canonical_path = path.canonicalize().map_err(|_e| { - SemanticSearchError::InvalidPath(format!("Path does not exist or is not accessible: {}", path.display())) + /// Add context using structured request + pub async fn add_context(&self, request: AddContextRequest) -> Result<(Uuid, CancellationToken)> { + let canonical_path = request.path.canonicalize().map_err(|_e| { + SemanticSearchError::InvalidPath(format!( + "Path does not exist or is not accessible: {}", + request.path.display() + )) })?; // Check for conflicts self.check_path_exists(&canonical_path).await?; + // Validate patterns early to fail fast + if let Some(ref include_patterns) = request.include_patterns { + crate::pattern_filter::PatternFilter::new(include_patterns, &[]) + .map_err(|e| SemanticSearchError::InvalidArgument(format!("Invalid include pattern: {}", e)))?; + } + if let Some(ref exclude_patterns) = request.exclude_patterns { + crate::pattern_filter::PatternFilter::new(&[], exclude_patterns) + .map_err(|e| SemanticSearchError::InvalidArgument(format!("Invalid exclude pattern: {}", 
e)))?; + } + let operation_id = Uuid::new_v4(); let cancel_token = CancellationToken::new(); @@ -241,7 +247,7 @@ impl AsyncSemanticSearchClient { self.register_operation( operation_id, OperationType::Indexing { - name: name.to_string(), + name: request.name.clone(), path: canonical_path.to_string_lossy().to_string(), }, cancel_token.clone(), @@ -253,9 +259,11 @@ impl AsyncSemanticSearchClient { id: operation_id, cancel: cancel_token.clone(), path: canonical_path, - name: name.to_string(), - description: description.to_string(), - persistent, + name: request.name, + description: request.description, + persistent: request.persistent, + include_patterns: request.include_patterns, + exclude_patterns: request.exclude_patterns, }; self.job_tx @@ -372,6 +380,33 @@ impl AsyncSemanticSearchClient { } } + /// Cancel the most recent operation + pub async fn cancel_most_recent_operation(&self) -> Result { + let operations = self.active_operations.read().await; + + if operations.is_empty() { + return Err(SemanticSearchError::OperationFailed( + "No active operations to cancel".to_string(), + )); + } + + // Find the most recent operation (highest started_at time) + let most_recent = operations + .iter() + .max_by_key(|(_, handle)| handle.started_at) + .map(|(id, _)| *id); + + drop(operations); // Release the read lock + + if let Some(operation_id) = most_recent { + self.cancel_operation(operation_id).await + } else { + Err(SemanticSearchError::OperationFailed( + "No active operations found".to_string(), + )) + } + } + /// Cancel all active operations pub async fn cancel_all_operations(&self) -> Result { let mut operations = self.active_operations.write().await; @@ -652,7 +687,8 @@ impl AsyncSemanticSearchClient { } // Create a new semantic context - let semantic_context = SemanticContext::new(context_dir.join("data.json"))?; + let data_file = context_dir.join("data.json"); + let semantic_context = SemanticContext::new(data_file)?; // Store the semantic context let mut 
volatile_contexts = self.volatile_contexts.write().await; @@ -770,6 +806,22 @@ impl AsyncSemanticSearchClient { // Background Worker Implementation impl BackgroundWorker { + /// Create pattern filter from include/exclude patterns + fn create_pattern_filter( + include_patterns: &Option>, + exclude_patterns: &Option>, + ) -> std::result::Result, String> { + if include_patterns.is_some() || exclude_patterns.is_some() { + let inc = include_patterns.as_deref().unwrap_or(&[]); + let exc = exclude_patterns.as_deref().unwrap_or(&[]); + Ok(Some( + crate::pattern_filter::PatternFilter::new(inc, exc).map_err(|e| format!("Invalid patterns: {}", e))?, + )) + } else { + Ok(None) + } + } + async fn run(mut self) { debug!("Background worker started for async semantic search client"); @@ -782,9 +834,18 @@ impl BackgroundWorker { name, description, persistent, + include_patterns, + exclude_patterns, } => { - self.process_add_directory(id, path, name, description, persistent, cancel) - .await; + let params = IndexingParams { + path, + name, + description, + persistent, + include_patterns, + exclude_patterns, + }; + self.process_add_directory(id, params, cancel).await; }, IndexingJob::Clear { id, cancel } => { self.process_clear(id, cancel).await; @@ -795,16 +856,12 @@ impl BackgroundWorker { debug!("Background worker stopped"); } - async fn process_add_directory( - &self, - operation_id: Uuid, - path: PathBuf, - name: String, - description: String, - persistent: bool, - cancel_token: CancellationToken, - ) { - debug!("Processing AddDirectory job: {} -> {}", name, path.display()); + async fn process_add_directory(&self, operation_id: Uuid, params: IndexingParams, cancel_token: CancellationToken) { + debug!( + "Processing AddDirectory job: {} -> {}", + params.name, + params.path.display() + ); if cancel_token.is_cancelled() { self.mark_operation_cancelled(operation_id).await; @@ -846,9 +903,15 @@ impl BackgroundWorker { }; // Perform actual indexing - let result = self - 
.perform_indexing(operation_id, path, name, description, persistent, cancel_token) - .await; + let indexing_params = IndexingParams { + path: params.path, + name: params.name, + description: params.description, + persistent: params.persistent, + include_patterns: params.include_patterns, + exclude_patterns: params.exclude_patterns, + }; + let result = self.perform_indexing(operation_id, indexing_params, cancel_token).await; match result { Ok(context_id) => { @@ -865,14 +928,11 @@ impl BackgroundWorker { async fn perform_indexing( &self, operation_id: Uuid, - path: PathBuf, - name: String, - description: String, - persistent: bool, + params: IndexingParams, cancel_token: CancellationToken, ) -> std::result::Result { - if !path.exists() { - return Err(format!("Path '{}' does not exist", path.display())); + if !params.path.exists() { + return Err(format!("Path '{}' does not exist", params.path.display())); } // Check for cancellation before starting @@ -893,7 +953,7 @@ impl BackgroundWorker { let cancel_token_clone = cancel_token.clone(); // Create context directory - let context_dir = if persistent { + let context_dir = if params.persistent { base_dir.join(&context_id) } else { std::env::temp_dir().join("semantic_search").join(&context_id) @@ -909,7 +969,14 @@ impl BackgroundWorker { } // Count files and notify progress - let file_count = self.count_files_in_directory(&path, operation_id).await?; + let file_count = self + .count_files_in_directory( + &params.path, + operation_id, + &params.include_patterns, + &params.exclude_patterns, + ) + .await?; // Check if file count exceeds the configured limit if file_count > config.max_files { @@ -941,7 +1008,14 @@ impl BackgroundWorker { // Process files with cancellation checks let items = self - .process_directory_files(&path, file_count, operation_id, &cancel_token_clone) + .process_directory_files( + &params.path, + file_count, + operation_id, + &cancel_token_clone, + &params.include_patterns, + &params.exclude_patterns, + ) .await?; // Check 
cancellation before creating semantic context @@ -972,7 +1046,7 @@ impl BackgroundWorker { } // Save context if persistent - if persistent { + if params.persistent { semantic_context .save() .map_err(|e| format!("Failed to save context: {}", e))?; @@ -981,10 +1055,12 @@ impl BackgroundWorker { // Store the context self.store_context( &context_id, - &name, - &description, - persistent, - Some(path.to_string_lossy().to_string()), + &params.name, + &params.description, + params.persistent, + Some(params.path.to_string_lossy().to_string()), + &params.include_patterns, + &params.exclude_patterns, semantic_context, file_count, ) @@ -1228,6 +1304,8 @@ impl BackgroundWorker { description: &str, persistent: bool, source_path: Option, + include_patterns: &Option>, + exclude_patterns: &Option>, semantic_context: SemanticContext, item_count: usize, ) -> std::result::Result<(), String> { @@ -1238,6 +1316,10 @@ impl BackgroundWorker { description, persistent, source_path, + ( + include_patterns.as_deref().unwrap_or(&[]).to_vec(), + exclude_patterns.as_deref().unwrap_or(&[]).to_vec(), + ), item_count, ); @@ -1266,6 +1348,8 @@ impl BackgroundWorker { &self, dir_path: &Path, operation_id: Uuid, + include_patterns: &Option>, + exclude_patterns: &Option>, ) -> std::result::Result { self.update_operation_status(operation_id, "Counting files...".to_string()) .await; @@ -1274,6 +1358,9 @@ impl BackgroundWorker { let dir_path = dir_path.to_path_buf(); let active_operations = self.active_operations.clone(); + // Create pattern filter if patterns are provided + let pattern_filter = Self::create_pattern_filter(include_patterns, exclude_patterns)?; + let count_result = tokio::task::spawn_blocking(move || { let mut count = 0; let mut checked = 0; @@ -1290,6 +1377,12 @@ impl BackgroundWorker { .and_then(|n| n.to_str()) .is_some_and(|s| s.starts_with('.')) }) + .filter(|e| { + // Apply pattern filter if present + pattern_filter + .as_ref() + .is_none_or(|filter| filter.should_include(e.path())) + }) { count += 
1; checked += 1; @@ -1334,12 +1427,17 @@ impl BackgroundWorker { file_count: usize, operation_id: Uuid, cancel_token: &CancellationToken, + include_patterns: &Option>, + exclude_patterns: &Option>, ) -> std::result::Result, String> { - use crate::processing::process_file; + use crate::processing::process_file_with_config; self.update_operation_status(operation_id, format!("Starting indexing ({} files)", file_count)) .await; + // Create pattern filter if patterns are provided + let pattern_filter = Self::create_pattern_filter(include_patterns, exclude_patterns)?; + let mut processed_files = 0; let mut items = Vec::new(); @@ -1348,6 +1446,12 @@ impl BackgroundWorker { .into_iter() .filter_map(|e| e.ok()) .filter(|e| e.file_type().is_file()) + .filter(|e| { + // Apply pattern filter if present + pattern_filter + .as_ref() + .is_none_or(|filter| filter.should_include(e.path())) + }) { // Check for cancellation frequently if cancel_token.is_cancelled() { @@ -1366,7 +1470,7 @@ impl BackgroundWorker { } // Process the file - match process_file(path) { + match process_file_with_config(path, Some(self.config.chunk_size), Some(self.config.chunk_overlap)) { Ok(mut file_items) => items.append(&mut file_items), Err(_) => continue, // Skip files that fail to process } diff --git a/crates/semantic-search-client/src/client/implementation.rs b/crates/semantic-search-client/src/client/implementation.rs index aec38c059..2120782c5 100644 --- a/crates/semantic-search-client/src/client/implementation.rs +++ b/crates/semantic-search-client/src/client/implementation.rs @@ -17,15 +17,12 @@ use crate::client::{ utils, }; use crate::config; -use crate::embedding::{ - EmbeddingType, - TextEmbedderTrait, -}; +use crate::embedding::TextEmbedderTrait; use crate::error::{ Result, SemanticSearchError, }; -use crate::processing::process_file; +use crate::processing::process_file_with_config; use crate::types::{ ContextId, ContextMap, @@ -84,39 +81,25 @@ impl SemanticSearchClient { /// /// A new 
SemanticSearchClient instance pub fn new(base_dir: impl AsRef) -> Result { - Self::with_embedding_type(base_dir, EmbeddingType::default()) - } - - /// Create a new semantic search client with a specific embedding type - /// - /// # Arguments - /// - /// * `base_dir` - Base directory for storing persistent contexts - /// * `embedding_type` - Type of embedding engine to use - /// - /// # Returns - /// - /// A new SemanticSearchClient instance - pub fn with_embedding_type(base_dir: impl AsRef, embedding_type: EmbeddingType) -> Result { - Self::with_config_and_embedding_type(base_dir, crate::config::SemanticSearchConfig::default(), embedding_type) + let base_dir_path = base_dir.as_ref().to_path_buf(); + let config = crate::config::SemanticSearchConfig { + base_dir: base_dir_path, + ..Default::default() + }; + Self::with_config(base_dir, config) } - /// Create a new semantic search client with custom configuration and embedding type + /// Create a new semantic search client with custom configuration /// /// # Arguments /// /// * `base_dir` - Base directory for storing persistent contexts /// * `config` - Configuration for the client - /// * `embedding_type` - Type of embedding engine to use /// /// # Returns /// /// A new SemanticSearchClient instance - pub fn with_config_and_embedding_type( - base_dir: impl AsRef, - config: crate::config::SemanticSearchConfig, - embedding_type: EmbeddingType, - ) -> Result { + pub fn with_config(base_dir: impl AsRef, config: crate::config::SemanticSearchConfig) -> Result { let base_dir = base_dir.as_ref().to_path_buf(); fs::create_dir_all(&base_dir)?; @@ -129,7 +112,7 @@ impl SemanticSearchClient { // Continue with default config if initialization fails } - let embedder = embedder_factory::create_embedder(embedding_type)?; + let embedder = embedder_factory::create_embedder(config.embedding_type)?; // Load metadata for persistent contexts let contexts_file = base_dir.join("contexts.json"); @@ -155,20 +138,6 @@ impl SemanticSearchClient 
{ Ok(client) } - /// Create a new semantic search client with custom configuration - /// - /// # Arguments - /// - /// * `base_dir` - Base directory for storing persistent contexts - /// * `config` - Configuration for the client - /// - /// # Returns - /// - /// A new SemanticSearchClient instance - pub fn with_config(base_dir: impl AsRef, config: crate::config::SemanticSearchConfig) -> Result { - Self::with_config_and_embedding_type(base_dir, config, EmbeddingType::default()) - } - /// Get the default base directory for memory bank /// /// # Returns @@ -201,21 +170,6 @@ impl SemanticSearchClient { Self::new(base_dir) } - /// Create a new semantic search client with the default base directory and specific embedding - /// type - /// - /// # Arguments - /// - /// * `embedding_type` - Type of embedding engine to use - /// - /// # Returns - /// - /// A new SemanticSearchClient instance - pub fn new_with_embedding_type(embedding_type: EmbeddingType) -> Result { - let base_dir = Self::get_default_base_dir(); - Self::with_embedding_type(base_dir, embedding_type) - } - /// Get the current semantic search configuration /// /// # Returns @@ -341,7 +295,7 @@ impl SemanticSearchClient { } // Process the file - let items = process_file(file_path)?; + let items = process_file_with_config(file_path, Some(self.config.chunk_size), Some(self.config.chunk_overlap))?; // Notify progress: Indexing if let Some(ref callback) = progress_callback { @@ -418,7 +372,7 @@ impl SemanticSearchClient { } // Process files - let items = Self::process_directory_files(dir_path, file_count, &progress_callback)?; + let items = Self::process_directory_files(dir_path, file_count, &progress_callback, &self.config)?; // Create and populate semantic context let semantic_context = self.create_semantic_context(&context_dir, &items, &progress_callback)?; @@ -454,6 +408,7 @@ impl SemanticSearchClient { dir_path: &Path, file_count: usize, progress_callback: &Option, + config: 
&crate::config::SemanticSearchConfig, ) -> Result> where F: Fn(ProgressStatus) + Send + 'static, @@ -485,7 +440,7 @@ impl SemanticSearchClient { } // Process the file - match process_file(path) { + match process_file_with_config(path, Some(config.chunk_size), Some(config.chunk_overlap)) { Ok(mut file_items) => items.append(&mut file_items), Err(_) => continue, // Skip files that fail to process } @@ -576,7 +531,15 @@ impl SemanticSearchClient { } // Create the context metadata - let context = KnowledgeContext::new(id.to_string(), name, description, persistent, source_path, item_count); + let context = KnowledgeContext::new( + id.to_string(), + name, + description, + persistent, + source_path, + (vec![], vec![]), + item_count, + ); // Store the context if persistent { @@ -729,6 +692,7 @@ impl SemanticSearchClient { "Temporary memory context", false, None, + (vec![], vec![]), 0, ); contexts.push(context); @@ -908,6 +872,7 @@ impl SemanticSearchClient { context_description, true, None, + (vec![], vec![]), context_guard.get_data_points().len(), ); diff --git a/crates/semantic-search-client/src/config.rs b/crates/semantic-search-client/src/config.rs index 03ce5b2e8..030566bbf 100644 --- a/crates/semantic-search-client/src/config.rs +++ b/crates/semantic-search-client/src/config.rs @@ -16,6 +16,8 @@ use serde::{ Serialize, }; +use crate::embedding::EmbeddingType; + /// Main configuration structure for the semantic search client. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct SemanticSearchConfig { @@ -42,6 +44,9 @@ pub struct SemanticSearchConfig { /// Base URL for hosted models pub hosted_models_base_url: String, + + /// Embedding engine type to use + pub embedding_type: EmbeddingType, } impl SemanticSearchConfig { @@ -87,6 +92,7 @@ impl Default for SemanticSearchConfig { base_dir: get_default_base_dir(), max_files: 10000, // Default limit of 10000 files hosted_models_base_url: "https://desktop-release.q.us-east-1.amazonaws.com/models".to_string(), + embedding_type: EmbeddingType::default(), } } } @@ -187,13 +193,9 @@ pub fn init_config(base_dir: &Path) -> std::io::Result<()> { /// /// # Returns /// -/// A reference to the global configuration -/// -/// # Panics -/// -/// Panics if the configuration has not been initialized +/// A reference to the global configuration, or default if not initialized pub fn get_config() -> &'static SemanticSearchConfig { - CONFIG.get().expect("Semantic search configuration not initialized") + CONFIG.get_or_init(SemanticSearchConfig::default) } /// Loads the configuration from a file or creates a new one with default values. 
@@ -338,6 +340,7 @@ mod tests { base_dir: temp_dir.path().to_path_buf(), max_files: 10000, hosted_models_base_url: "http://test.example.com/models".to_string(), + embedding_type: EmbeddingType::default(), }; // Update the config diff --git a/crates/semantic-search-client/src/embedding/trait_def.rs b/crates/semantic-search-client/src/embedding/trait_def.rs index 1be80b675..80e694381 100644 --- a/crates/semantic-search-client/src/embedding/trait_def.rs +++ b/crates/semantic-search-client/src/embedding/trait_def.rs @@ -1,7 +1,12 @@ +use serde::{ + Deserialize, + Serialize, +}; + use crate::error::Result; /// Embedding engine type to use -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub enum EmbeddingType { /// Use Candle embedding engine (not available on Linux ARM) #[cfg(not(all(target_os = "linux", target_arch = "aarch64")))] diff --git a/crates/semantic-search-client/src/lib.rs b/crates/semantic-search-client/src/lib.rs index cc4a6f764..c99a147b9 100644 --- a/crates/semantic-search-client/src/lib.rs +++ b/crates/semantic-search-client/src/lib.rs @@ -16,6 +16,8 @@ pub mod error; pub mod index; /// Model validation for SHA verification pub mod model_validator; +/// Pattern filtering for file selection +pub mod pattern_filter; /// File processing utilities pub mod processing; /// Data types for semantic search operations diff --git a/crates/semantic-search-client/src/pattern_filter.rs b/crates/semantic-search-client/src/pattern_filter.rs new file mode 100644 index 000000000..1c6821af3 --- /dev/null +++ b/crates/semantic-search-client/src/pattern_filter.rs @@ -0,0 +1,309 @@ +use std::path::Path; + +use glob::Pattern; + +/// Pattern-based file filtering for semantic search indexing +#[derive(Debug, Clone)] +pub struct PatternFilter { + include_patterns: Vec, + exclude_patterns: Vec, +} + +impl PatternFilter { + /// Create a new pattern filter + pub fn new(include_patterns: &[String], exclude_patterns: &[String]) -> Result { + let 
include_patterns = include_patterns + .iter() + .map(|p| Pattern::new(p).map_err(|e| format!("Invalid include pattern '{}': {}", p, e))) + .collect::, _>>()?; + + let exclude_patterns = exclude_patterns + .iter() + .map(|p| Pattern::new(p).map_err(|e| format!("Invalid exclude pattern '{}': {}", p, e))) + .collect::, _>>()?; + + Ok(Self { + include_patterns, + exclude_patterns, + }) + } + + /// Check if a file should be included based on patterns + /// Handles both absolute and relative paths automatically + pub fn should_include(&self, file_path: &Path) -> bool { + // Check include patterns (if any) + if !self.include_patterns.is_empty() { + let matches_include = self + .include_patterns + .iter() + .any(|pattern| Self::matches_pattern(pattern, file_path)); + if !matches_include { + return false; + } + } + + // Check exclude patterns (if any) + if !self.exclude_patterns.is_empty() { + let matches_exclude = self + .exclude_patterns + .iter() + .any(|pattern| Self::matches_pattern(pattern, file_path)); + if matches_exclude { + return false; + } + } + + true + } + + /// Match a pattern against a path, handling both absolute and relative paths + fn matches_pattern(pattern: &Pattern, file_path: &Path) -> bool { + let path_str = file_path.to_string_lossy(); + + // Try direct match first (for relative paths) + if pattern.matches(&path_str) { + return true; + } + + // For absolute paths, try matching against path components + // This handles cases where pattern is "node_modules/**" but path is + // "/full/path/to/node_modules/file" + let components: Vec<_> = file_path + .components() + .map(|c| c.as_os_str().to_string_lossy()) + .collect(); + + // Try to find a suffix of the path that matches the pattern + for i in 0..components.len() { + let suffix_path = components[i..].join("/"); + if pattern.matches(&suffix_path) { + return true; + } + } + + false + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use super::*; + + #[test] + fn 
test_pattern_filter_creation() { + let filter = PatternFilter::new(&["*.rs".to_string()], &["target/**".to_string()]); + assert!(filter.is_ok()); + + let invalid_filter = PatternFilter::new(&["[".to_string()], &[]); + assert!(invalid_filter.is_err()); + assert!(invalid_filter.unwrap_err().contains("Invalid include pattern")); + } + + #[test] + fn test_include_patterns_work() { + // Test that include patterns work correctly + let include_patterns = vec!["*.rs".to_string()]; + let exclude_patterns = vec![]; + + let filter = PatternFilter::new(&include_patterns, &exclude_patterns).unwrap(); + + // Should include .rs files + assert!(filter.should_include(&PathBuf::from("main.rs"))); + assert!(filter.should_include(&PathBuf::from("lib.rs"))); + + // Should not include other files + assert!(!filter.should_include(&PathBuf::from("main.py"))); + assert!(!filter.should_include(&PathBuf::from("README.md"))); + } + + #[test] + fn test_exclude_patterns_work() { + // Test that exclude patterns work correctly + let include_patterns = vec![]; + let exclude_patterns = vec!["node_modules/**".to_string()]; + + let filter = PatternFilter::new(&include_patterns, &exclude_patterns).unwrap(); + + // Should exclude files in node_modules + assert!(!filter.should_include(&PathBuf::from("node_modules/package/index.js"))); + assert!(!filter.should_include(&PathBuf::from("node_modules/lib.rs"))); + + // Should include other files + assert!(filter.should_include(&PathBuf::from("src/main.rs"))); + assert!(filter.should_include(&PathBuf::from("README.md"))); + } + + #[test] + fn test_recursive_patterns() { + // Test that recursive patterns (**) work correctly + let include_patterns = vec!["**/*.rs".to_string()]; + let exclude_patterns = vec![]; + + let filter = PatternFilter::new(&include_patterns, &exclude_patterns).unwrap(); + + // Should include .rs files at any depth + assert!(filter.should_include(&PathBuf::from("main.rs"))); + assert!(filter.should_include(&PathBuf::from("src/main.rs"))); 
+ assert!(filter.should_include(&PathBuf::from("src/lib/mod.rs"))); + assert!(filter.should_include(&PathBuf::from("deep/nested/path/file.rs"))); + + // Should not include non-.rs files + assert!(!filter.should_include(&PathBuf::from("src/main.py"))); + assert!(!filter.should_include(&PathBuf::from("deep/nested/README.md"))); + } + + #[test] + fn test_combined_include_exclude() { + // Test that include and exclude patterns work together + let include_patterns = vec!["**/*.rs".to_string()]; + let exclude_patterns = vec!["target/**".to_string()]; + + let filter = PatternFilter::new(&include_patterns, &exclude_patterns).unwrap(); + + // Should include .rs files not in target + assert!(filter.should_include(&PathBuf::from("src/main.rs"))); + assert!(filter.should_include(&PathBuf::from("tests/test.rs"))); + + // Should exclude .rs files in target + assert!(!filter.should_include(&PathBuf::from("target/debug/main.rs"))); + assert!(!filter.should_include(&PathBuf::from("target/release/lib.rs"))); + + // Should exclude non-.rs files everywhere + assert!(!filter.should_include(&PathBuf::from("src/main.py"))); + assert!(!filter.should_include(&PathBuf::from("README.md"))); + } + + #[test] + fn test_node_modules_exclusion_issue() { + // Test the specific issue mentioned in PR: node_modules exclusion not working + let include_patterns = vec![]; + let exclude_patterns = vec!["node_modules/**".to_string()]; + + let filter = PatternFilter::new(&include_patterns, &exclude_patterns).unwrap(); + + // These should be excluded (the reported bug) + assert!(!filter.should_include(&PathBuf::from("node_modules/package.json"))); + assert!(!filter.should_include(&PathBuf::from("node_modules/lib/index.js"))); + assert!(!filter.should_include(&PathBuf::from("node_modules/deep/nested/file.txt"))); + + // These should be included + assert!(filter.should_include(&PathBuf::from("src/index.js"))); + assert!(filter.should_include(&PathBuf::from("package.json"))); + } + + #[test] + fn 
test_node_modules_exclusion_with_temp_dir() { + use std::fs; + + use tempfile::TempDir; + + let temp_dir = TempDir::new().unwrap(); + let temp_path = temp_dir.path(); + + // Create the directory structure + fs::create_dir_all(temp_path.join("node_modules/some-package")).unwrap(); + fs::create_dir_all(temp_path.join("src")).unwrap(); + + // Create files + fs::write(temp_path.join("node_modules/package.json"), "{}").unwrap(); + fs::write(temp_path.join("node_modules/some-package/index.js"), "// code").unwrap(); + fs::write(temp_path.join("src/main.js"), "// main code").unwrap(); + fs::write(temp_path.join("package.json"), "{}").unwrap(); + + // Test the filter + let include_patterns = vec![]; + let exclude_patterns = vec!["node_modules/**".to_string()]; + let filter = PatternFilter::new(&include_patterns, &exclude_patterns).unwrap(); + + // Test relative to temp directory + let node_modules_file = PathBuf::from("node_modules/package.json"); + let node_modules_nested = PathBuf::from("node_modules/some-package/index.js"); + let src_file = PathBuf::from("src/main.js"); + let root_file = PathBuf::from("package.json"); + + // These should be excluded (the reported bug) + assert!( + !filter.should_include(&node_modules_file), + "node_modules/package.json should be excluded" + ); + assert!( + !filter.should_include(&node_modules_nested), + "node_modules/some-package/index.js should be excluded" + ); + + // These should be included + assert!(filter.should_include(&src_file), "src/main.js should be included"); + assert!(filter.should_include(&root_file), "package.json should be included"); + } + + #[test] + fn test_pattern_documentation_accuracy() { + let filter = PatternFilter::new(&["*.rs".to_string()], &[]).unwrap(); + + assert!(filter.should_include(&PathBuf::from("main.rs"))); // Current dir - should match + + // Nested paths are expected to match `*.rs` as well: `matches_pattern` also tries every + // suffix of the path (here `main.rs`), so a bare `*.rs` is effectively recursive. + let _nested_matches = 
filter.should_include(&PathBuf::from("src/main.rs")); + assert!(_nested_matches, "*.rs should match nested files recursively"); + } + + #[test] + fn test_empty_patterns() { + // Test behavior with no patterns (should include everything) + let include_patterns = vec![]; + let exclude_patterns = vec![]; + + let filter = PatternFilter::new(&include_patterns, &exclude_patterns).unwrap(); + + // Should include everything when no patterns are specified + assert!(filter.should_include(&PathBuf::from("main.rs"))); + assert!(filter.should_include(&PathBuf::from("src/main.rs"))); + assert!(filter.should_include(&PathBuf::from("node_modules/package.json"))); + assert!(filter.should_include(&PathBuf::from("target/debug/main"))); + } + + #[test] + fn test_absolute_vs_relative_path_handling() { + use std::fs; + + use tempfile::TempDir; + + let temp_dir = TempDir::new().unwrap(); + let temp_path = temp_dir.path(); + + // Create directory structure + fs::create_dir_all(temp_path.join("node_modules")).unwrap(); + fs::create_dir_all(temp_path.join("src")).unwrap(); + + let filter = PatternFilter::new(&[], &["node_modules/**".to_string()]).unwrap(); + + // Test relative paths (should work) + let relative_excluded = PathBuf::from("node_modules/package.json"); + let relative_included = PathBuf::from("src/main.js"); + + assert!( + !filter.should_include(&relative_excluded), + "Relative node_modules path should be excluded" + ); + assert!( + filter.should_include(&relative_included), + "Relative src path should be included" + ); + + // Test absolute paths (the fix - should also work now) + let absolute_excluded = temp_path.join("node_modules/package.json"); + let absolute_included = temp_path.join("src/main.js"); + + assert!( + !filter.should_include(&absolute_excluded), + "Absolute node_modules path should be excluded" + ); + assert!( + filter.should_include(&absolute_included), + "Absolute src path should be included" + ); + } +} diff --git 
a/crates/semantic-search-client/src/processing/file_processor.rs b/crates/semantic-search-client/src/processing/file_processor.rs index df4d23fa5..c56e50d88 100644 --- a/crates/semantic-search-client/src/processing/file_processor.rs +++ b/crates/semantic-search-client/src/processing/file_processor.rs @@ -12,10 +12,36 @@ use crate::types::FileType; /// Determine the file type based on extension pub fn get_file_type(path: &Path) -> FileType { - match path.extension().and_then(|ext| ext.to_str()) { + match path + .extension() + .and_then(|ext| ext.to_str()) + .map(|s| s.to_lowercase()) + .as_deref() + { + // Plain text files Some("txt") => FileType::Text, - Some("md" | "markdown") => FileType::Markdown, - Some("json") => FileType::Json, + + // Markdown files (including MDX) + Some("md" | "markdown" | "mdx") => FileType::Markdown, + + // JSON files - now treated as text for better searchability + Some("json") => FileType::Text, + + // Configuration files + Some("ini" | "conf" | "cfg" | "properties" | "env") => FileType::Text, + + // Data files + Some("csv" | "tsv") => FileType::Text, + + // Log files + Some("log") => FileType::Text, + + // Documentation formats + Some("rtf" | "tex" | "rst") => FileType::Text, + + // Web and markup formats (text-based) + Some("svg") => FileType::Text, + // Code file extensions Some("rs") => FileType::Code, Some("py") => FileType::Code, @@ -34,12 +60,23 @@ pub fn get_file_type(path: &Path) -> FileType { Some("sql") => FileType::Code, Some("yaml" | "yml") => FileType::Code, Some("toml") => FileType::Code, - // Default to unknown + + // Handle files without extensions (common project files) + None => match path.file_name().and_then(|name| name.to_str()) { + Some("Dockerfile" | "Makefile" | "LICENSE" | "CHANGELOG" | "README") => FileType::Text, + Some(name) if name.starts_with('.') => match name { + ".gitignore" | ".env" | ".dockerignore" => FileType::Text, + _ => FileType::Unknown, + }, + _ => FileType::Unknown, + }, + + // Default to 
unknown (includes office docs, PDFs, etc.) _ => FileType::Unknown, } } -/// Process a file and extract its content +/// Process a file and extract its content (backward compatible version) /// /// # Arguments /// @@ -49,6 +86,25 @@ pub fn get_file_type(path: &Path) -> FileType { /// /// A vector of JSON objects representing the file content pub fn process_file(path: &Path) -> Result> { + process_file_with_config(path, None, None) +} + +/// Process a file with custom chunk configuration +/// +/// # Arguments +/// +/// * `path` - Path to the file +/// * `chunk_size` - Optional chunk size (uses default if None) +/// * `chunk_overlap` - Optional chunk overlap (uses default if None) +/// +/// # Returns +/// +/// A vector of JSON objects representing the file content +pub fn process_file_with_config( + path: &Path, + chunk_size: Option, + chunk_overlap: Option, +) -> Result> { if !path.exists() { return Err(SemanticSearchError::InvalidPath(format!( "File does not exist: {}", @@ -65,10 +121,10 @@ pub fn process_file(path: &Path) -> Result> { })?; match file_type { - FileType::Text | FileType::Markdown | FileType::Code => { - // For text-based files, chunk the content and create multiple data points + FileType::Text | FileType::Markdown | FileType::Code | FileType::Json => { + // For text-based files (including JSON), chunk the content and create multiple data points // Use the configured chunk size and overlap - let chunks = chunk_text(&content, None, None); + let chunks = chunk_text(&content, chunk_size, chunk_overlap); let path_str = path.to_string_lossy().to_string(); let file_type_str = format!("{:?}", file_type); @@ -112,22 +168,6 @@ pub fn process_file(path: &Path) -> Result> { Ok(results) }, - FileType::Json => { - // For JSON files, parse the content - let json: Value = - serde_json::from_str(&content).map_err(|e| SemanticSearchError::SerializationError(e.to_string()))?; - - match json { - Value::Array(items) => { - // If it's an array, return each item - 
Ok(items) - }, - _ => { - // Otherwise, return the whole object - Ok(vec![json]) - }, - } - }, FileType::Unknown => { // For unknown file types, just store the path let mut metadata = serde_json::Map::new(); @@ -144,11 +184,17 @@ pub fn process_file(path: &Path) -> Result> { /// # Arguments /// /// * `dir_path` - Path to the directory +/// * `chunk_size` - Optional chunk size (uses default if None) +/// * `chunk_overlap` - Optional chunk overlap (uses default if None) /// /// # Returns /// /// A vector of JSON objects representing the content of all files -pub fn process_directory(dir_path: &Path) -> Result> { +pub fn process_directory( + dir_path: &Path, + chunk_size: Option, + chunk_overlap: Option, +) -> Result> { let mut results = Vec::new(); for entry in walkdir::WalkDir::new(dir_path) @@ -169,10 +215,75 @@ pub fn process_directory(dir_path: &Path) -> Result> { } // Process the file - if let Ok(mut items) = process_file(path) { + if let Ok(mut items) = process_file_with_config(path, chunk_size, chunk_overlap) { results.append(&mut items); } } Ok(results) } + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use super::*; + use crate::types::FileType; + + #[test] + fn test_file_type_detection() { + let test_cases = [ + // Code files + ("main.rs", FileType::Code), + ("script.py", FileType::Code), + ("app.js", FileType::Code), + ("component.tsx", FileType::Code), + ("Main.java", FileType::Code), + ("main.c", FileType::Code), + ("index.html", FileType::Code), + ("styles.css", FileType::Code), + ("config.yaml", FileType::Code), + ("Cargo.toml", FileType::Code), + // Markdown files + ("README.md", FileType::Markdown), + ("doc.markdown", FileType::Markdown), + ("component.mdx", FileType::Markdown), + // Text files + ("notes.txt", FileType::Text), + ("data.json", FileType::Text), + ("config.ini", FileType::Text), + ("data.csv", FileType::Text), + ("Dockerfile", FileType::Text), + ("LICENSE", FileType::Text), + (".gitignore", FileType::Text), + // Case 
insensitive + ("Main.RS", FileType::Code), + ("README.MD", FileType::Markdown), + ("notes.TXT", FileType::Text), + // Unknown files + ("image.png", FileType::Unknown), + ("document.pdf", FileType::Unknown), + ("binary.exe", FileType::Unknown), + ("unknown_file", FileType::Unknown), + ]; + + for (filename, expected) in test_cases { + assert_eq!( + get_file_type(&PathBuf::from(filename)), + expected, + "Failed for {}", + filename + ); + } + } + + #[test] + fn test_unknown_file_types() { + // Binary files and unsupported formats + assert_eq!(get_file_type(&PathBuf::from("image.png")), FileType::Unknown); + assert_eq!(get_file_type(&PathBuf::from("document.pdf")), FileType::Unknown); + assert_eq!(get_file_type(&PathBuf::from("archive.zip")), FileType::Unknown); + assert_eq!(get_file_type(&PathBuf::from("binary.exe")), FileType::Unknown); + assert_eq!(get_file_type(&PathBuf::from("data.db")), FileType::Unknown); + } +} diff --git a/crates/semantic-search-client/src/processing/mod.rs b/crates/semantic-search-client/src/processing/mod.rs index 393f82700..87d8b5dc5 100644 --- a/crates/semantic-search-client/src/processing/mod.rs +++ b/crates/semantic-search-client/src/processing/mod.rs @@ -7,5 +7,6 @@ pub use file_processor::{ get_file_type, process_directory, process_file, + process_file_with_config, }; pub use text_chunker::chunk_text; diff --git a/crates/semantic-search-client/src/processing/text_chunker.rs b/crates/semantic-search-client/src/processing/text_chunker.rs index 369128a3c..4d28af567 100644 --- a/crates/semantic-search-client/src/processing/text_chunker.rs +++ b/crates/semantic-search-client/src/processing/text_chunker.rs @@ -61,6 +61,7 @@ mod tests { base_dir: std::path::PathBuf::from("."), max_files: 1000, // Add missing max_files field hosted_models_base_url: "http://test.example.com/models".to_string(), + embedding_type: crate::embedding::EmbeddingType::default(), }; // Use a different approach that doesn't access private static let _ = 
crate::config::init_config(&std::env::temp_dir()); diff --git a/crates/semantic-search-client/src/types.rs b/crates/semantic-search-client/src/types.rs index 1789db216..4ebcf50c3 100644 --- a/crates/semantic-search-client/src/types.rs +++ b/crates/semantic-search-client/src/types.rs @@ -17,6 +17,40 @@ use serde::{ use tokio_util::sync::CancellationToken; use uuid::Uuid; +/// Request for adding a new context to the knowledge base +#[derive(Debug, Clone)] +pub struct AddContextRequest { + /// Path to the directory or file to index + pub path: PathBuf, + /// Human-readable name for the context + pub name: String, + /// Description of the context + pub description: String, + /// Whether this context should be persistent + pub persistent: bool, + /// Optional patterns to include during indexing + pub include_patterns: Option>, + /// Optional patterns to exclude during indexing + pub exclude_patterns: Option>, +} + +/// Parameters for indexing operations (internal use) +#[derive(Debug, Clone)] +pub struct IndexingParams { + /// Path to the directory or file to index + pub path: PathBuf, + /// Human-readable name for the context + pub name: String, + /// Description of the context + pub description: String, + /// Whether this context should be persistent + pub persistent: bool, + /// Optional patterns to include during indexing + pub include_patterns: Option>, + /// Optional patterns to exclude during indexing + pub exclude_patterns: Option>, +} + use crate::client::SemanticContext; /// Type alias for context ID @@ -52,6 +86,14 @@ pub struct KnowledgeContext { /// Original source path if created from a directory pub source_path: Option, + /// Include patterns used during indexing + #[serde(default)] + pub include_patterns: Vec, + + /// Exclude patterns used during indexing + #[serde(default)] + pub exclude_patterns: Vec, + /// Number of items in the context pub item_count: usize, } @@ -64,6 +106,7 @@ impl KnowledgeContext { description: &str, persistent: bool, 
source_path: Option, + patterns: (Vec, Vec), item_count: usize, ) -> Self { let now = Utc::now(); @@ -74,6 +117,8 @@ impl KnowledgeContext { created_at: now, updated_at: now, source_path, + include_patterns: patterns.0, + exclude_patterns: patterns.1, persistent, item_count, } @@ -304,6 +349,8 @@ pub(crate) enum IndexingJob { name: String, description: String, persistent: bool, + include_patterns: Option>, + exclude_patterns: Option>, }, Clear { id: Uuid, diff --git a/docs/knowledge-management.md b/docs/knowledge-management.md index 3c4cc084e..816e85a8b 100644 --- a/docs/knowledge-management.md +++ b/docs/knowledge-management.md @@ -25,19 +25,57 @@ Once enabled, you can use `/knowledge` commands within your chat session: Display all entries in your knowledge base with detailed information including creation dates, item counts, and persistence status. -#### `/knowledge add ` +#### `/knowledge add [--include pattern] [--exclude pattern]` Add files or directories to your knowledge base. The system will recursively index all supported files in directories. 
`/knowledge add "project-docs" /path/to/documentation` `/knowledge add "config-files" /path/to/config.json` -Supported file types: +**Default Pattern Behavior** -- Text files: .txt -- Markdown: .md, .markdown -- JSON: .json +When you don't specify `--include` or `--exclude` patterns, the system uses your configured default patterns: + +- If no patterns are specified and no defaults are configured, all supported files are indexed +- Default include patterns apply when no `--include` is specified +- Default exclude patterns apply when no `--exclude` is specified +- Explicit patterns always override defaults + +Example with defaults configured: +```bash +q settings knowledge.defaultIncludePatterns '["**/*.rs", "**/*.py"]' +q settings knowledge.defaultExcludePatterns '["target/**", "__pycache__/**"]' + +# This will use the default patterns +/knowledge add "my-project" /path/to/project + +# This will override defaults with explicit patterns +/knowledge add "docs-only" /path/to/project --include "**/*.md" +``` + +**New: Pattern Filtering** + +You can now control which files are indexed using include and exclude patterns: + +`/knowledge add "rust-code" /path/to/project --include "*.rs" --exclude "target/**"` +`/knowledge add "docs" /path/to/project --include "**/*.md" --include "**/*.txt" --exclude "node_modules/**"` + +Pattern examples: +- `*.rs` - All Rust files in all directories recursively (equivalent to `**/*.rs`) +- `**/*.py` - All Python files recursively +- `target/**` - Everything in target directory +- `node_modules/**` - Everything in node_modules directory + +Supported file types (expanded): + +- Text files: .txt, .log, .rtf, .tex, .rst +- Markdown: .md, .markdown, .mdx (now supported!) 
+- JSON: .json (now treated as text for better searchability) +- Configuration: .ini, .conf, .cfg, .properties, .env +- Data files: .csv, .tsv +- Web formats: .svg (text-based) - Code files: .rs, .py, .js, .jsx, .ts, .tsx, .java, .c, .cpp, .h, .hpp, .go, .rb, .php, .swift, .kt, .kts, .cs, .sh, .bash, .zsh, .html, .htm, .xml, .css, .scss, .sass, .less, .sql, .yaml, .yml, .toml +- Special files: Dockerfile, Makefile, LICENSE, CHANGELOG, README (files without extensions) > Important: Unsupported files are indexed without text content extraction. @@ -50,7 +88,7 @@ Remove entries from your knowledge base. You can remove by name, path, or contex #### `/knowledge update ` -Update an existing knowledge base entry with new content from the specified path. +Update an existing knowledge base entry with new content from the specified path. The original include/exclude patterns are preserved during updates. `/knowledge update /path/to/updated/project` @@ -73,17 +111,28 @@ Cancel background operations. You can cancel a specific operation by ID or all o `/knowledge cancel abc12345 # Cancel specific operation` `/knowledge cancel all # Cancel all operations` +## Configuration + +Configure knowledge base behavior: + +`q settings knowledge.maxFiles 10000` # Maximum files per knowledge base +`q settings knowledge.chunkSize 1024` # Text chunk size for processing +`q settings knowledge.chunkOverlap 256` # Overlap between chunks +`q settings knowledge.defaultIncludePatterns '["**/*.rs", "**/*.md"]'` # Default include patterns +`q settings knowledge.defaultExcludePatterns '["target/**", "node_modules/**"]'` # Default exclude patterns + ## How It Works #### Indexing Process When you add content to the knowledge base: -1. File Discovery: The system recursively scans directories for supported file types -2. Content Extraction: Text content is extracted from each supported file -3. Chunking: Large files are split into smaller, searchable chunks -4. 
Background Processing: Indexing happens asynchronously in the background -5. Semantic Embedding: Content is processed for semantic search capabilities +1. **Pattern Filtering**: Files are filtered based on include/exclude patterns (if specified) +2. **File Discovery**: The system recursively scans directories for supported file types +3. **Content Extraction**: Text content is extracted from each supported file +4. **Chunking**: Large files are split into smaller, searchable chunks +5. **Background Processing**: Indexing happens asynchronously in the background +6. **Semantic Embedding**: Content is processed for semantic search capabilities #### Search Capabilities @@ -97,6 +146,7 @@ The knowledge base uses semantic search, which means: - Persistent contexts: Survive across chat sessions and CLI restarts - Context persistence is determined automatically based on usage patterns +- Include/exclude patterns are stored with each context and reused during updates #### Best Practices @@ -104,6 +154,7 @@ Organizing Your Knowledge Base - Use descriptive names when adding contexts: "api-documentation" instead of "docs" - Group related files in directories before adding them +- Use include/exclude patterns to focus on relevant files - Regularly review and update outdated contexts #### Effective Searching @@ -116,23 +167,31 @@ Organizing Your Knowledge Base #### Managing Large Projects - Add project directories rather than individual files when possible +- Use include/exclude patterns to avoid indexing build artifacts: `--exclude "target/**" --exclude "node_modules/**"` - Use /knowledge status to monitor indexing progress for large directories - Consider breaking very large projects into logical sub-directories +#### Pattern Filtering Best Practices + +- **Be specific**: Use precise patterns to avoid over-inclusion +- **Exclude build artifacts**: Always exclude directories like `target/**`, `node_modules/**`, `.git/**` +- **Include relevant extensions**: Focus on file types 
you actually need to search +- **Test patterns**: Verify patterns match expected files before large indexing operations + ## Limitations #### File Type Support -- .mdx files are not currently supported for content extraction - Binary files are ignored during indexing -- Very large files may be chunked, potentially splitting related content. +- Very large files may be chunked, potentially splitting related content +- Some specialized file formats may not extract content optimally #### Performance Considerations - Large directories may take significant time to index - Background operations are limited by concurrent processing limits -- Search performance may vary based on knowledge base size -- Currently there’s a hard limit of 5k files per knowledge base (getting removed soon as on Jul 12th, 2025). +- Search performance may vary based on knowledge base size and embedding engine +- Pattern filtering happens during file walking, improving performance for large directories #### Storage and Persistence @@ -146,24 +205,37 @@ Organizing Your Knowledge Base If your files aren't appearing in search results: -1. Check file types: Ensure your files have supported extensions -2. Monitor status: Use /knowledge status to check if indexing is still in progress -3. Verify paths: Ensure the paths you added actually exist and are accessible -4. Check for errors: Look for error messages in the CLI output +1. **Check patterns**: Ensure your include patterns match the files you want +2. **Verify exclude patterns**: Make sure exclude patterns aren't filtering out desired files +3. **Check file types**: Ensure your files have supported extensions +4. **Monitor status**: Use /knowledge status to check if indexing is still in progress +5. **Verify paths**: Ensure the paths you added actually exist and are accessible +6. **Check for errors**: Look for error messages in the CLI output #### Search Not Finding Expected Results If searches aren't returning expected results: -1. 
Wait for indexing: Use /knowledge status to ensure indexing is complete -2. Try different queries: Use various phrasings and keywords -3. Verify content: Use /knowledge show to confirm your content was added -4. Check file types: Unsupported file types won't have searchable content +1. **Wait for indexing**: Use /knowledge status to ensure indexing is complete +2. **Try different queries**: Use various phrasings and keywords +3. **Verify content**: Use /knowledge show to confirm your content was added +4. **Check file types**: Unsupported file types won't have searchable content #### Performance Issues If operations are slow: -1. Check queue status: Use /knowledge status to see operation queue -2. Cancel if needed: Use /knowledge cancel to stop problematic operations -3. Add smaller chunks: Consider adding subdirectories instead of entire large projects +1. **Check queue status**: Use /knowledge status to see operation queue +2. **Cancel if needed**: Use /knowledge cancel to stop problematic operations +3. **Add smaller chunks**: Consider adding subdirectories instead of entire large projects +4. **Use better patterns**: Exclude unnecessary files with exclude patterns +5. **Adjust settings**: Consider lowering maxFiles or chunkSize for better performance + +#### Pattern Issues + +If patterns aren't working as expected: + +1. **Test patterns**: Use simple patterns first, then add complexity +2. **Check syntax**: Ensure glob patterns use correct syntax (`**` spans nested directories; a bare pattern such as `*.rs` is also matched recursively, like `**/*.rs`) +3. **Verify paths**: Make sure patterns match actual file paths in your project +4. **Use absolute patterns**: Consider using full paths in patterns for precision