@@ -563,6 +563,168 @@ logging:
563563# 6. Exponential backoff for failing endpoints
564564```
565565
566+ ## Filtering Examples
567+
568+ Examples showing profile and model filtering capabilities. See [Filter Concepts](filters.md) for detailed pattern syntax.
569+
570+ ### Specialized Embedding Service
571+
572+ Configure endpoints to serve only embedding models:
573+
574+ ```yaml
575+ server:
576+   port: 40114
577+
578+ proxy:
579+   engine: "sherpa"
580+   load_balancer: "priority"
581+   # Only load profiles that support embeddings
582+   profile_filter:
583+     include:
584+       - "ollama"
585+       - "openai*"
586+     exclude:
587+       - "lm-studio"  # Doesn't have good embedding support
588+
589+ discovery:
590+   static:
591+     endpoints:
592+       - url: "http://localhost:11434"
593+         name: "embedding-server"
594+         type: "ollama"
595+         priority: 100
596+         model_filter:
597+           include:
598+             - "*embed*"  # Embedding models
599+             - "bge-*"    # BGE models
600+             - "e5-*"     # E5 models
601+             - "nomic-*"  # Nomic models
602+           exclude:
603+             - "*test*"   # No test models
604+ ```
605+
606+ ### Production Chat Service
607+
608+ Filter out experimental and inappropriate models:
609+
610+ ```yaml
611+ proxy:
612+   engine: "olla"
613+   load_balancer: "least-connections"
614+   # Exclude test/debug profiles
615+   profile_filter:
616+     exclude:
617+       - "*test*"
618+       - "*debug*"
619+
620+ discovery:
621+   static:
622+     endpoints:
623+       - url: "http://prod-gpu-1:11434"
624+         name: "prod-chat-1"
625+         type: "ollama"
626+         priority: 100
627+         model_filter:
628+           include:
629+             - "llama*"    # Llama family
630+             - "mistral*"  # Mistral family
631+             - "qwen*"     # Qwen family
632+           exclude:
633+             - "*uncensored*"  # No uncensored models
634+             - "*test*"        # No test models
635+             - "*debug*"       # No debug models
636+             - "*embed*"       # No embedding models
637+
638+       - url: "http://prod-gpu-2:11434"
639+         name: "prod-chat-2"
640+         type: "ollama"
641+         priority: 100
642+         model_filter:
643+           # Same filters for consistency
644+           include: ["llama*", "mistral*", "qwen*"]
645+           exclude: ["*uncensored*", "*test*", "*debug*", "*embed*"]
646+ ```
647+
648+ ### Mixed Workload with Different Endpoints
649+
650+ Different model types on different endpoints:
651+
652+ ```yaml
653+ discovery:
654+   static:
655+     endpoints:
656+       # Code generation endpoint
657+       - url: "http://code-server:11434"
658+         name: "code-gen"
659+         type: "ollama"
660+         priority: 100
661+         model_filter:
662+           include:
663+             - "*code*"  # Code models
664+             - "deepseek-coder*"
665+             - "codellama*"
666+             - "starcoder*"
667+
668+       # General chat endpoint
669+       - url: "http://chat-server:11434"
670+         name: "chat"
671+         type: "ollama"
672+         priority: 90
673+         model_filter:
674+           include:
675+             - "*chat*"      # Chat models
676+             - "*instruct*"  # Instruction models
677+           exclude:
678+             - "*code*"   # No code models
679+             - "*embed*"  # No embeddings
680+
681+       # Vision endpoint
682+       - url: "http://vision-server:11434"
683+         name: "vision"
684+         type: "ollama"
685+         priority: 80
686+         model_filter:
687+           include:
688+             - "*vision*"  # Vision models
689+             - "llava*"    # LLaVA models
690+             - "*clip*"    # CLIP models
691+ ```
692+
693+ ### Resource-Constrained Environment
694+
695+ Filter by model size:
696+
697+ ```yaml
698+ discovery:
699+   static:
700+     endpoints:
701+       # Small GPU - only small models
702+       - url: "http://small-gpu:11434"
703+         name: "small-models"
704+         type: "ollama"
705+         priority: 100
706+         model_filter:
707+           include:
708+             - "*-3b*"  # 3B models
709+             - "*-7b*"  # 7B models
710+             - "*-8b*"  # 8B models
711+           exclude:
712+             - "*-13b*"  # Nothing larger
713+             - "*-34b*"
714+             - "*-70b*"
715+
716+       # Large GPU - only large models
717+       - url: "http://large-gpu:11434"
718+         name: "large-models"
719+         type: "ollama"
720+         priority: 50
721+         model_filter:
722+           include:
723+             - "*-34b*"  # 34B+ models
724+             - "*-70b*"
725+             - "*-72b*"
726+ ```
727+
566728## Environment Variables Override
567729
568730Example showing environment variable overrides:
0 commit comments