@@ -167,7 +167,7 @@ public int generate(
167
167
}
168
168
169
169
/**
170
- * Prefill an LLaVA Module with the given images input.
170
+ * Prefill a multimodal Module with the given images input.
171
171
*
172
172
* @param image Input image as a byte array
173
173
* @param width Input image width
@@ -189,9 +189,9 @@ public long prefillImages(int[] image, int width, int height, int channels) {
189
189
private native int appendImagesInput (int [] image , int width , int height , int channels );
190
190
191
191
/**
192
- * Prefill an LLaVA Module with the given text input.
192
+ * Prefill a multimodal Module with the given text input.
193
193
*
194
- * @param prompt The text prompt to LLaVA .
194
+ * @param prompt The text prompt to the multimodal model.
195
195
* @return 0, as the updated starting position in KV cache of the input in the LLM is no longer
196
196
* exposed to user.
197
197
* @throws RuntimeException if the prefill failed
@@ -208,6 +208,35 @@ public long prefillPrompt(String prompt) {
208
208
// returns status
209
209
private native int appendTextInput (String prompt );
210
210
211
+ /**
212
+ * Prefill a multimodal Module with the given text input.
213
+ *
214
+ * @param prompt The text prompt to multimodal model.
215
+ * @return 0, as the updated starting position in KV cache of the input in the LLM is no longer
216
+ * exposed to user.
217
+ * @throws RuntimeException if the prefill failed
218
+ */
219
+ public int prefillAudio (String filePath ) {
220
+ java .io .File file = new java .io .File (filePath );
221
+ try (java .io .FileInputStream fis = new java .io .FileInputStream (file )) {
222
+ byte [] fileBytes = new byte [(int ) file .length ()];
223
+ int bytesRead = fis .read (fileBytes );
224
+ if (bytesRead != fileBytes .length ) {
225
+ throw new RuntimeException ("Could not completely read file " + file .getName ());
226
+ }
227
+ int nFloats = fileBytes .length / 4 ;
228
+ int batchSize = nFloats / (128 * 3000 );
229
+ return appendAudioInput (fileBytes , batchSize , 128 , 3000 );
230
+ } catch (java .io .IOException e ) {
231
+ throw new RuntimeException ("Failed to read file: " + e );
232
+ }
233
+ }
234
+
235
+ // For Audio (option B), not RawAudio
236
+ // Use batch_size = ceil(n_floats / (n_bins * n_frames)), n_bins = 128, n_frames = 3000
237
+ // returns status
238
+ private native int appendAudioInput (byte [] audio , int batch_size , int n_bins , int n_frames );
239
+
211
240
/**
212
241
* Reset the context of the LLM. This will clear the KV cache and reset the state of the LLM.
213
242
*
0 commit comments