-using System.Diagnostics.CodeAnalysis;
using System.Text;
using LLama.Batched;
using LLama.Common;
@@ -34,6 +33,7 @@ public static async Task Run()
        var name = model.Metadata.GetValueOrDefault("general.name", "unknown model name");
        Console.WriteLine($"Created executor with model: {name}");

+       // A set of questions to evaluate all at once
        var messages = new[]
        {
            "What's 2+2?",
@@ -46,8 +46,10 @@ public static async Task Run()
4646 "I have two sons, Bert and Ernie. What should I name my daughter?" ,
4747 "What day comes after Friday?" ,
4848 "What color shoes should I wear with dark blue pants?" ,
49+ "Wy ae cts btr tn dgs?"
4950 } ;
5051
52+ // Create a "Conversation" for each question
5153 var conversations = new List < ConversationData > ( ) ;
5254 foreach ( var message in messages )
5355 {
@@ -57,11 +59,14 @@ public static async Task Run()
            template.Add("user", message);
            template.AddAssistant = true;
            var templatedMessage = Encoding.UTF8.GetString(template.Apply());
-
+
            // create a new conversation and prompt it. include special and bos because we are using the template
+           // - BOS is the "Beginning of Sequence" token and should be included at the start of any prompt
+           // - Special tokens are non-text tokens which an LLM is trained to understand (e.g. BOS). The templated text may contain special tokens.
            var conversation = executor.Create();
            conversation.Prompt(executor.Context.Tokenize(templatedMessage, addBos: true, special: true));

+           // Store everything we need to process this conversation
            conversations.Add(new ConversationData {
                Prompt = message,
                Conversation = conversation,
@@ -73,50 +78,64 @@ public static async Task Run()
        var table = BuildTable(conversations);
        await AnsiConsole.Live(table).StartAsync(async ctx =>
        {
+           // Enter a loop generating tokens
            for (var i = 0; i < TokenCount; i++)
            {
                // Run inference for all conversations in the batch which have pending tokens.
                var decodeResult = await executor.Infer();
+
+               // Inference can fail, always check the return value!
+               // NoKvSlot is not a fatal error, it just means that there's not enough memory available in the KV cache to process everything. You can force
+               // this to happen by setting a small value for ContextSize in the ModelParams at the top of this file (e.g. 512).
+               // In this case it's handled by ending a conversation (which will free up some space) and trying again. You could also handle this by
+               // saving the conversation to disk and loading it up again later once some other conversations have finished.
                if (decodeResult == DecodeResult.NoKvSlot)
-                   throw new Exception("Could not find a KV slot for the batch. Try reducing the size of the batch or increase the context.");
+               {
+                   conversations.FirstOrDefault(a => !a.IsComplete)?.MarkComplete(failed: true);
+                   continue;
+               }
+
+               // A generic error, this is fatal and the batch can no longer be used. This should never occur and generally indicates
+               // a bug in LLamaSharp, llama.cpp or a hardware error.
                if (decodeResult == DecodeResult.Error)
                    throw new Exception("Unknown error occurred while inferring.");

-               foreach (var conversationData in conversations.Where(c => c.IsComplete == false))
+               // After inference all of the conversations must be sampled before running inference again.
+               foreach (var conversationData in conversations)
                {
-                   if (conversationData.Conversation.RequiresSampling == false)
+                   // Completed conversations don't need sampling.
+                   if (conversationData.IsComplete)
                        continue;
-
-                   // sample a single token for the executor, passing the sample index of the conversation
-                   var sampleIndex = conversationData.Conversation.GetSampleIndex();
-                   var token = conversationData.Sampler.Sample(
-                       executor.Context,
-                       sampleIndex
-                   );
-
+
+                   // If the conversation wasn't prompted before the last call to Infer then it won't need sampling.
+                   if (!conversationData.Conversation.RequiresSampling)
+                       continue;
+
+                   // Use the sampling pipeline to choose a single token for this conversation.
+                   var token = conversationData.Conversation.Sample(conversationData.Sampler);
+
+                   // Some special tokens indicate that this sequence has ended. Check if that's what has been chosen by the sampling pipeline.
                    if (modelTokens.IsEndOfGeneration(token))
                    {
                        conversationData.MarkComplete();
                    }
                    else
                    {
-                       // it isn't the end of generation, so add this token to the decoder and then add that to our tracked data
+                       // It isn't the end of generation, so add this token to the decoder and then add that to our tracked data
                        conversationData.Decoder.Add(token);
-                       todo: conversationData.AppendAnswer(conversationData.Decoder.Read().ReplaceLineEndings(" "));
+                       conversationData.AppendAnswer(conversationData.Decoder.Read().ReplaceLineEndings(" "));

-                       // add the token to the conversation
+                       // Prompt the conversation with this token, ready for the next round of inference to generate another token
                        conversationData.Conversation.Prompt(token);
                    }
                }

-               // render the current state
+               // Render the current state
                table = BuildTable(conversations);
                ctx.UpdateTarget(table);

                if (conversations.All(c => c.IsComplete))
-               {
                    break;
-               }
            }

            // if we ran out of tokens before completing just mark them as complete for rendering purposes.
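
As a rough illustration of the NoKvSlot comment above (not part of this commit): the failure path can be forced by giving the executor a deliberately small context when it is created at the top of the file. The snippet below is a sketch based on the usual LLamaSharp batched examples; "modelPath" is a placeholder for however the example resolves its model file.

    // Sketch only: a context this small cannot hold ~10 parallel conversations,
    // so executor.Infer() will eventually return DecodeResult.NoKvSlot.
    var parameters = new ModelParams(modelPath)
    {
        ContextSize = 512
    };
    using var model = LLamaWeights.LoadFromFile(parameters);
    using var executor = new BatchedExecutor(model, parameters);
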
@@ -155,20 +174,23 @@ public class ConversationData
    public required BaseSamplingPipeline Sampler { get; init; }
    public required StreamingTokenDecoder Decoder { get; init; }

-   public string AnswerMarkdown => IsComplete
-       ? $"[green]{_inProgressAnswer.Message.EscapeMarkup()}{_inProgressAnswer.LatestToken.EscapeMarkup()}[/]"
-       : $"[grey]{_inProgressAnswer.Message.EscapeMarkup()}[/][white]{_inProgressAnswer.LatestToken.EscapeMarkup()}[/]";
+   public string AnswerMarkdown =>
+       IsComplete
+           ? $"[{(IsFailed ? "red" : "green")}]{_inProgressAnswer.Message.EscapeMarkup()}{_inProgressAnswer.LatestToken.EscapeMarkup()}[/]"
+           : $"[grey]{_inProgressAnswer.Message.EscapeMarkup()}[/][white]{_inProgressAnswer.LatestToken.EscapeMarkup()}[/]";

    public bool IsComplete { get; private set; }
+   public bool IsFailed { get; private set; }

    // we are only keeping track of the answer in two parts to render them differently.
    private (string Message, string LatestToken) _inProgressAnswer = (string.Empty, string.Empty);

    public void AppendAnswer(string newText) => _inProgressAnswer = (_inProgressAnswer.Message + _inProgressAnswer.LatestToken, newText);

-   public void MarkComplete()
+   public void MarkComplete(bool failed = false)
    {
        IsComplete = true;
+       IsFailed = failed;
        if (Conversation.IsDisposed == false)
        {
            // clean up the conversation and sampler to release more memory for inference.