Complete API documentation for Agentic-Synth.
SynthEngine is the main entry point for synthetic data generation.
new SynthEngine(config: SynthEngineConfig)
interface SynthEngineConfig {
// LLM Provider Configuration
provider?: 'openai' | 'anthropic' | 'cohere' | 'custom';
model?: string;
apiKey?: string;
temperature?: number; // 0.0 - 1.0
maxTokens?: number;
// Vector Database Configuration
vectorDB?: 'ruvector' | 'agenticdb' | VectorDBInstance;
embeddingModel?: string;
embeddingDimensions?: number;
// Generation Configuration
batchSize?: number; // Default: 100
maxWorkers?: number; // Default: 4
streaming?: boolean; // Default: false
cacheEnabled?: boolean; // Default: true
// Quality Configuration
minQuality?: number; // 0.0 - 1.0, default: 0.85
validationEnabled?: boolean; // Default: true
retryOnLowQuality?: boolean; // Default: true
}
import { SynthEngine } from 'agentic-synth';
const synth = new SynthEngine({
provider: 'openai',
model: 'gpt-4',
temperature: 0.8,
vectorDB: 'ruvector',
batchSize: 1000,
streaming: true,
});
Generate synthetic data based on a schema.
async generate<T>(options: GenerateOptions): Promise<GeneratedData<T>>
Parameters:
interface GenerateOptions {
schema: Schema;
count: number;
streaming?: boolean;
progressCallback?: (progress: Progress) => void;
abortSignal?: AbortSignal;
}
interface Progress {
current: number;
total: number;
rate: number; // Items per second
estimatedTimeRemaining: number; // Seconds
}
Returns:
interface GeneratedData<T> {
data: T[];
metadata: {
count: number;
schema: Schema;
quality: QualityMetrics;
duration: number;
};
// Methods
export(options: ExportOptions): Promise<void>;
filter(predicate: (item: T) => boolean): GeneratedData<T>;
map<U>(mapper: (item: T) => U): GeneratedData<U>;
toJSON(): string;
toCSV(): string;
toParquet(): Buffer;
}
Example:
const result = await synth.generate({
schema: customerSupportSchema,
count: 1000,
streaming: true,
progressCallback: (progress) => {
console.log(`Progress: ${progress.current}/${progress.total}`);
},
});
console.log('Quality:', result.metadata.quality);
await result.export({ format: 'jsonl', outputPath: './data.jsonl' });
Generate data as an async iterator for real-time processing.
async *generateStream<T>(options: GenerateOptions): AsyncGenerator<T>
Example:
for await (const item of synth.generateStream({ schema, count: 10000 })) {
// Process item in real-time
await processItem(item);
}
Generate and directly insert into vector database.
async generateAndInsert(options: GenerateAndInsertOptions): Promise<InsertResult>
Parameters:
interface GenerateAndInsertOptions extends GenerateOptions {
collection: string;
batchSize?: number;
includeEmbeddings?: boolean;
}
interface InsertResult {
inserted: number;
failed: number;
duration: number;
errors: Error[];
}
Example:
const result = await synth.generateAndInsert({
schema: productSchema,
count: 10000,
collection: 'products',
batchSize: 1000,
includeEmbeddings: true,
});
console.log(`Inserted ${result.inserted} items`);
Schema definition system for structured data generation.
Define a custom schema.
Schema.define(definition: SchemaDefinition): Schema
Parameters:
interface SchemaDefinition {
name: string;
description?: string;
type: 'object' | 'array' | 'conversation' | 'embedding';
// For object types
properties?: Record<string, PropertyDefinition>;
required?: string[];
// For array types
items?: SchemaDefinition;
minItems?: number;
maxItems?: number;
// For conversation types
personas?: PersonaDefinition[];
turns?: { min: number; max: number };
// Additional constraints
constraints?: Constraint[];
distribution?: DistributionSpec;
}
interface PropertyDefinition {
type: 'string' | 'number' | 'boolean' | 'date' | 'email' | 'url' | 'embedding';
description?: string;
format?: string;
enum?: any[];
minimum?: number;
maximum?: number;
pattern?: string;
dimensions?: number; // Vector length for 'embedding' properties
default?: any;
}
interface PersonaDefinition {
name: string;
traits: string[];
temperature?: number;
examples?: string[];
}
Example:
const userSchema = Schema.define({
name: 'User',
type: 'object',
properties: {
id: { type: 'string', format: 'uuid' },
name: { type: 'string' },
email: { type: 'email' },
age: { type: 'number', minimum: 18, maximum: 100 },
role: { type: 'string', enum: ['admin', 'user', 'guest'] },
createdAt: { type: 'date' },
bio: { type: 'string' },
embedding: { type: 'embedding', dimensions: 384 },
},
required: ['id', 'name', 'email'],
});
Schema.conversation(options: ConversationOptions): Schema
interface ConversationOptions {
domain: string;
personas: string[] | PersonaDefinition[];
topics?: string[];
turns: { min: number; max: number };
includeEmbeddings?: boolean;
}
Example:
const supportSchema = Schema.conversation({
domain: 'customer-support',
personas: [
{ name: 'customer', traits: ['frustrated', 'confused'] },
{ name: 'agent', traits: ['helpful', 'professional'] },
],
topics: ['billing', 'technical', 'shipping'],
turns: { min: 4, max: 12 },
});
Schema.embedding(options: EmbeddingOptions): Schema
interface EmbeddingOptions {
dimensions: number;
domain: string;
clusters?: number;
distribution?: 'gaussian' | 'uniform' | 'clustered';
}
Specialized generators for common use cases.
Generate question-answer pairs for RAG systems.
class RAGDataGenerator {
static async create(options: RAGOptions): Promise<GeneratedData<RAGPair>>
}
Parameters:
interface RAGOptions {
domain: string;
sources?: string[]; // File paths, glob patterns, or URLs
questionsPerSource?: number;
includeNegatives?: boolean; // For contrastive learning
difficulty?: 'easy' | 'medium' | 'hard' | 'mixed';
}
interface RAGPair {
question: string;
answer: string;
context: string;
embedding?: number[];
metadata: {
source: string;
difficulty: string;
type: 'positive' | 'negative';
};
}
Example:
const ragData = await RAGDataGenerator.create({
domain: 'technical-documentation',
sources: ['./docs/**/*.md'],
questionsPerSource: 10,
includeNegatives: true,
difficulty: 'mixed',
});
Generate agent interaction histories.
class AgentMemoryGenerator {
static async synthesize(options: MemoryOptions): Promise<GeneratedData<Memory>>
}
Parameters:
interface MemoryOptions {
agentType: string;
interactions: number;
userPersonas?: string[];
taskDistribution?: Record<string, number>;
includeEmbeddings?: boolean;
}
interface Memory {
id: string;
timestamp: Date;
userInput: string;
agentResponse: string;
taskType: string;
persona: string;
embedding?: number[];
metadata: Record<string, any>;
}
Generate edge cases for testing.
class EdgeCaseGenerator {
static async create(options: EdgeCaseOptions): Promise<GeneratedData<any>>
}
Parameters:
interface EdgeCaseOptions {
schema: Schema;
categories: EdgeCaseCategory[];
coverage?: 'minimal' | 'standard' | 'exhaustive';
}
type EdgeCaseCategory =
| 'boundary-values'
| 'null-handling'
| 'type-mismatches'
| 'malicious-input'
| 'unicode-edge-cases'
| 'race-conditions'
| 'overflow'
| 'underflow';
Generate vector embeddings datasets.
class EmbeddingDatasetGenerator {
static async create(options: EmbeddingDatasetOptions): Promise<GeneratedData<EmbeddingItem>>
}
Parameters:
interface EmbeddingDatasetOptions {
domain: string;
clusters: number;
itemsPerCluster: number;
vectorDim: number;
distribution?: 'gaussian' | 'uniform' | 'clustered';
}
interface EmbeddingItem {
id: string;
text: string;
embedding: number[];
cluster: number;
metadata: Record<string, any>;
}
Pre-built templates for common domains.
Templates.customerSupport.generate(count: number): Promise<GeneratedData<Conversation>>
Templates.codeReviews.generate(count: number): Promise<GeneratedData<Review>>
Templates.ecommerce.generate(count: number): Promise<GeneratedData<Product>>
Templates.medicalQA.generate(count: number): Promise<GeneratedData<MedicalQA>>
Templates.legalContracts.generate(count: number): Promise<GeneratedData<Contract>>
Example:
import { Templates } from 'agentic-synth';
const products = await Templates.ecommerce.generate(10000);
await products.export({ format: 'parquet', outputPath: './products.parquet' });
Evaluate synthetic data quality.
QualityMetrics.evaluate(data: any[], options: EvaluationOptions): Promise<QualityReport>
Parameters:
interface EvaluationOptions {
realism?: boolean; // Human-like quality
diversity?: boolean; // Unique examples ratio
coverage?: boolean; // Schema satisfaction
coherence?: boolean; // Logical consistency
bias?: boolean; // Detect biases
}
interface QualityReport {
realism: number; // 0-1
diversity: number; // 0-1
coverage: number; // 0-1
coherence: number; // 0-1
bias: {
gender: number;
age: number;
ethnicity: number;
[key: string]: number;
};
overall: number; // Weighted average
}
Example:
const metrics = await QualityMetrics.evaluate(syntheticData, {
realism: true,
diversity: true,
coverage: true,
bias: true,
});
if (metrics.overall < 0.85) {
console.warn('Low quality data detected');
}
class RuvectorAdapter {
constructor(synthEngine: SynthEngine, vectorDB: VectorDB)
async generateAndInsert(options: GenerateOptions): Promise<InsertResult>
async augmentCollection(collection: string, count: number): Promise<void>
}
class AgenticDBAdapter {
constructor(synthEngine: SynthEngine)
async generateMemory(options: MemoryOptions): Promise<Memory[]>
async generateSkills(count: number): Promise<Skill[]>
}
class LangChainAdapter {
constructor(synthEngine: SynthEngine)
async generateDocuments(options: GenerateOptions): Promise<Document[]>
async createVectorStore(options: VectorStoreOptions): Promise<VectorStore>
}
// Schema types
type Schema = { /* ... */ };
type PropertyDefinition = { /* ... */ };
type SchemaDefinition = { /* ... */ };
// Generation types
type GenerateOptions = { /* ... */ };
type GeneratedData<T> = { /* ... */ };
type Progress = { /* ... */ };
// Quality types
type QualityMetrics = { /* ... */ };
type QualityReport = { /* ... */ };
// Export types
type ExportFormat = 'json' | 'jsonl' | 'csv' | 'parquet' | 'sql';
type ExportOptions = {
format: ExportFormat;
outputPath: string;
includeVectors?: boolean;
compress?: boolean;
};
# Generate data
agentic-synth generate --schema <schema> --count <n> --output <file>
# Augment existing data
agentic-synth augment --input <file> --variations <n> --output <file>
# Validate quality
agentic-synth validate --input <file> --metrics <metrics>
# Export/convert
agentic-synth export --input <file> --format <format> --output <file>
# List templates
agentic-synth templates list
# Generate from template
agentic-synth templates use <name> --count <n> --output <file>
--schema <file> # Schema file (YAML/JSON)
--count <number> # Number of examples
--output <path> # Output file path
--format <format> # json|jsonl|csv|parquet
--embeddings # Include vector embeddings
--quality <threshold> # Minimum quality (0-1)
--streaming # Enable streaming mode
--workers <number> # Number of parallel workers
--verbose # Detailed logging
import { SynthError, ValidationError, GenerationError } from 'agentic-synth';
try {
const data = await synth.generate({ schema, count });
} catch (error) {
if (error instanceof ValidationError) {
console.error('Schema validation failed:', error.issues);
} else if (error instanceof GenerationError) {
console.error('Generation failed:', error.message);
} else if (error instanceof SynthError) {
console.error('Synth error:', error.message);
}
}
- Start Small: Test with 100 examples before scaling to millions
- Validate Schemas: Use TypeScript types for compile-time safety
- Monitor Quality: Always evaluate quality metrics
- Use Streaming: For large datasets (>10K), enable streaming
- Cache Results: Enable caching for repeated generations
- Tune Temperature: Lower (0.5-0.7) for consistency, higher (0.8-1.0) for diversity
- Batch Operations: Use batching for vector DB insertions
- Handle Errors: Implement retry logic for API failures
See EXAMPLES.md for comprehensive usage examples.
- GitHub Issues: https://github.com/ruvnet/ruvector/issues
- Discord: https://discord.gg/ruvnet
- Email: support@ruv.io