@@ -61,14 +61,22 @@ public void init() {
         }
     }
 
+    /**
+     * Generate response with default parameters.
+     */
     public String generateResponse(String message, String systemMessage) {
         return generateResponse(message, systemMessage, 150, 0.7, 0.9);
     }
 
     public String generateResponse(String message, String systemMessage, int maxTokens, double temperature, double topP) {
+        return generateResponse(message, systemMessage, maxTokens, temperature, topP, null);
+    }
+
+    public String generateResponse(String message, String systemMessage, int maxTokens, double temperature, double topP, Long seed) {
         try {
             // Create sampler and state like runInstructOnce
-            Sampler sampler = selectSampler(model.configuration().vocabularySize(), (float) temperature, (float) topP, System.currentTimeMillis());
+            long actualSeed = seed != null ? seed : System.currentTimeMillis();
+            Sampler sampler = selectSampler(model.configuration().vocabularySize(), (float) temperature, (float) topP, actualSeed);
             State state = model.createNewState();
 
             // Use model's ChatFormat
@@ -115,7 +123,6 @@ public String generateResponse(String message, String systemMessage, int maxToke
             System.out.printf("COMPLETED tokens=%d duration=%dms rate=%.1f tok/s%n",
                     generatedTokens.size(), duration, tokensPerSecond);
 
-
             String responseText = model.tokenizer().decode(generatedTokens);
 
             // Add reasoning prefix for non-streaming if needed
@@ -132,9 +139,20 @@ public String generateResponse(String message, String systemMessage, int maxToke
     }
 
     public void generateStreamingResponse(String message, String systemMessage, SseEmitter emitter) {
+        generateStreamingResponse(message, systemMessage, emitter, 150, 0.7, 0.9);
+    }
+
+    public void generateStreamingResponse(String message, String systemMessage, SseEmitter emitter,
+                                          int maxTokens, double temperature, double topP) {
+        generateStreamingResponse(message, systemMessage, emitter, maxTokens, temperature, topP, null);
+    }
+
+    public void generateStreamingResponse(String message, String systemMessage, SseEmitter emitter,
+                                          int maxTokens, double temperature, double topP, Long seed) {
         CompletableFuture.runAsync(() -> {
             try {
-                Sampler sampler = selectSampler(model.configuration().vocabularySize(), 0.7f, 0.9f, System.currentTimeMillis());
+                long actualSeed = seed != null ? seed : System.currentTimeMillis();
+                Sampler sampler = selectSampler(model.configuration().vocabularySize(), (float) temperature, (float) topP, actualSeed);
                 State state = model.createNewState();
 
                 // Use proper chat format like in runInstructOnce
@@ -164,13 +182,14 @@ public void generateStreamingResponse(String message, String systemMessage, SseE
                 final int[] tokenCount = {0};
                 long startTime = System.currentTimeMillis();
                 List<Integer> generatedTokens = model.generateTokens(
-                        state, 0, promptTokens, stopTokens, 150, sampler, false,
+                        state, 0, promptTokens, stopTokens, maxTokens, sampler, false,
                         token -> {
                             try {
                                 // Only display tokens that should be displayed (like in your original)
                                 if (model.tokenizer().shouldDisplayToken(token)) {
                                     String tokenText = model.tokenizer().decode(List.of(token));
                                     emitter.send(SseEmitter.event().data(tokenText));
+                                    //emitter.send(SseEmitter.event().comment("flush"));
                                     tokenCount[0]++;
                                 }
                             } catch (Exception e) {
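
The overloads above thread an optional boxed seed (`Long seed`) down to `selectSampler`: a non-null value pins the sampler's RNG for reproducible output, while `null` falls back to the previous `System.currentTimeMillis()` behavior. A minimal caller sketch of the new signatures follows; the `ChatController`/`ChatService` names and the endpoint mappings are assumptions for illustration, not part of this change.

import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;

@RestController
public class ChatController {

    private final ChatService chatService; // hypothetical name for the service patched above

    public ChatController(ChatService chatService) {
        this.chatService = chatService;
    }

    @GetMapping("/chat")
    public String chat(@RequestParam String message) {
        // Fixed seed: the same prompt and parameters reproduce the same completion.
        return chatService.generateResponse(message, "You are a helpful assistant.",
                150, 0.7, 0.9, 42L);
    }

    @GetMapping("/chat/stream")
    public SseEmitter chatStream(@RequestParam String message) {
        SseEmitter emitter = new SseEmitter(0L); // 0 = no timeout
        // Null seed: falls back to System.currentTimeMillis(), as before this change.
        chatService.generateStreamingResponse(message, "You are a helpful assistant.",
                emitter, 150, 0.7, 0.9, null);
        return emitter;
    }
}

Using a boxed `Long` rather than a primitive `long` keeps "no seed" expressible as `null`, so existing callers retain time-based sampling without needing yet another overload per call site.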