@@ -55,6 +55,25 @@ public struct TestDataGenerator: Sendable {
5555 " financial services "
5656 ]
5757
/// Operational and infrastructure jargon used as lexical noise.
///
/// `generateRealisticCorpus` appends these entries to its word vocabulary, and
/// `generateRealisticQueries` draws query terms from them.
private static let noisyTokens: [String] = [
    " latency ", " throughput ", " p95 ", " p99 ",
    " cache-hit ", " cache-miss ", " retry ", " timeout ",
    " batch ", " vector-db ", " k8s ", " grpc ",
    " ssd ", " cold-start ", " hot-path ", " rollback "
]
76+
/// Creates a test data generator. Takes no arguments and performs no setup.
public init() {
}
5978
6079 /// Generate a collection of test documents.
@@ -116,6 +135,115 @@ public struct TestDataGenerator: Sendable {
116135 return " Doc \( index) : " + docWords. joined ( separator: " " )
117136 }
118137 }
138+
/// Builds a synthetic corpus whose documents vary in length and carry lexical noise.
///
/// Each document is one of two kinds:
/// - a fresh record: a header built from a topic, context and domain, a body of
///   randomly chosen vocabulary words, a closing punctuation mark and an `err=` code;
/// - a near-duplicate: a copy of an earlier document with a variant/incident/status
///   suffix appended. The first document is always a fresh record.
///
/// The word bounds apply to the body of fresh records only; the header, the trailer
/// and any near-duplicate suffixes add further text on top.
///
/// - Parameters:
///   - count: Number of documents to generate. Values of zero or less yield an empty array.
///   - minWords: Minimum body words per fresh document. Raised to 5 if smaller.
///   - maxWords: Maximum body words per fresh document. Raised to the effective minimum if smaller.
///   - duplicateRate: Chance that a document is a near-duplicate, clamped to [0, 1].
///   - seed: Seed for reproducibility. When `nil`, the fixed seed 424242 is used.
/// - Returns: The generated documents, in generation order.
public func generateRealisticCorpus(
    count: Int,
    minWords: Int = 20,
    maxWords: Int = 260,
    duplicateRate: Double = 0.08,
    seed: UInt64? = nil
) -> [String] {
    guard count > 0 else { return [] }

    // Normalise the caller-supplied limits before any random draw happens.
    let lowerWordBound = max(5, minWords)
    let upperWordBound = max(lowerWordBound, maxWords)
    let wordSpan = upperWordBound - lowerWordBound + 1
    let nearDuplicateRate = min(max(duplicateRate, 0), 1)

    var rng = SeededRandomGenerator(seed: seed ?? 424242)

    // Break the multi-word phrases into single words so bodies mix them freely.
    func splitIntoWords(_ phrases: [String]) -> [String] {
        phrases.flatMap { $0.split(separator: " ").map(String.init) }
    }
    let vocabulary = splitIntoWords(Self.topics)
        + splitIntoWords(Self.contexts)
        + splitIntoWords(Self.domains)
        + Self.noisyTokens

    var documents: [String] = []
    documents.reserveCapacity(count)

    for index in 0..<count {
        // The duplicate draw is skipped entirely while there is nothing to copy,
        // which keeps the random sequence aligned with the document index.
        if !documents.isEmpty, rng.nextDouble() < nearDuplicateRate {
            let source = documents[rng.nextInt(upperBound: documents.count)]
            let incident = 1000 + rng.nextInt(upperBound: 9000)
            let status = rng.pick(from: [" ok ", " warn ", " error "])
            documents.append(source + " Variant- \(index) incident= \(incident) " + " status= \(status) ")
            continue
        }

        let topic = rng.pick(from: Self.topics)
        let context = rng.pick(from: Self.contexts)
        let domain = rng.pick(from: Self.domains)
        let wordCount = lowerWordBound + rng.nextInt(upperBound: wordSpan)

        let body = (0..<wordCount).map { position -> String in
            let word = vocabulary[rng.nextInt(upperBound: vocabulary.count)]
            // Every 37th word, starting with the first, gets a numeric suffix.
            guard position.isMultiple(of: 37) else { return word }
            return word + " - \(rng.nextInt(upperBound: 500)) "
        }.joined(separator: " ")

        // The period is listed three times so it is picked more often than the rest.
        let terminator = rng.pick(from: [" . ", " . ", " . ", " ; ", " ! ", " ? "])
        let errorCode = rng.nextInt(upperBound: 12)
        documents.append(
            " Document \(index) : \(topic) \(context) \(domain) . \(body) \(terminator) err= \(errorCode) "
        )
    }

    return documents
}
210+
/// Produces a mix of short, medium and long search queries.
///
/// For each query a value in `0..<100` is drawn and selects the shape:
/// - below 40: a single topic;
/// - 40 to 84: three entries drawn from topics, domains and noisy tokens combined;
/// - 85 and above: a "how to optimize … for … with …" sentence built from one
///   topic, one domain and one noisy token.
///
/// - Parameters:
///   - count: Number of queries. Values of zero or less yield an empty array.
///   - seed: Seed for reproducibility. When `nil`, the fixed seed 898989 is used.
/// - Returns: Array of query strings, in generation order.
public func generateRealisticQueries(
    count: Int,
    seed: UInt64? = nil
) -> [String] {
    guard count > 0 else { return [] }

    var rng = SeededRandomGenerator(seed: seed ?? 898989)
    let mixedPool = Self.topics + Self.domains + Self.noisyTokens

    return (0..<count).map { _ -> String in
        switch rng.nextInt(upperBound: 100) {
        case ..<40:
            return rng.pick(from: Self.topics)
        case ..<85:
            let head = rng.pick(from: mixedPool)
            let middle = rng.pick(from: mixedPool)
            let tail = rng.pick(from: mixedPool)
            return " \(head) \(middle) \(tail) "
        default:
            let subject = rng.pick(from: Self.topics)
            let area = rng.pick(from: Self.domains)
            let noise = rng.pick(from: Self.noisyTokens)
            return " how to optimize \(subject) for \(area) with \(noise) "
        }
    }
}
119247}
120248
121249/// Simple seeded random number generator for reproducible tests.
@@ -132,4 +260,18 @@ private struct SeededRandomGenerator {
132260 state = state &* 6364136223846793005 &+ 1442695040888963407
133261 return Int ( state >> 32 )
134262 }
263+
/// Draws the next value and reduces it modulo `upperBound`.
///
/// - Parameter upperBound: Exclusive limit for the result.
/// - Returns: `next() % upperBound`, or 0 when `upperBound` is zero or negative.
///   In that case the generator is not advanced.
mutating func nextInt(upperBound: Int) -> Int {
    upperBound > 0 ? next() % upperBound : 0
}
268+
/// Draws a fractional value in `[0, 1)`.
///
/// A single bounded draw below `UInt32.max` is divided by `UInt32.max`, so the
/// result never reaches 1.
mutating func nextDouble() -> Double {
    let ceiling = UInt32.max
    let draw = nextInt(upperBound: Int(ceiling))
    return Double(UInt64(draw)) / Double(ceiling)
}
273+
/// Returns one element of `values`, selected by drawing an index from the generator.
///
/// - Parameter values: Candidates to choose from. Must not be empty.
/// - Returns: The element at the drawn index.
mutating func pick<T>(from values: [T]) -> T {
    // `nextInt(upperBound: 0)` returns 0, so an empty array would otherwise fail
    // on an out-of-range subscript with no hint about the actual cause.
    precondition(!values.isEmpty, "pick(from:) requires a non-empty array")
    return values[nextInt(upperBound: values.count)]
}
135277}
0 commit comments