Skip to content

Commit 614b3a7

Browse files
committed
feat(model/ocr): support new task type
1 parent 97997ac commit 614b3a7

File tree

3 files changed

+16
-7
lines changed

3 files changed

+16
-7
lines changed

samples/MultiModalConversationQwenVLOcr.java

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import com.alibaba.dashscope.exception.ApiException;
1010
import com.alibaba.dashscope.exception.NoApiKeyException;
1111
import com.alibaba.dashscope.exception.UploadFileException;
12+
import com.alibaba.dashscope.utils.JsonUtils;
1213
import com.google.gson.JsonObject;
1314
import io.reactivex.Flowable;
1415

@@ -18,7 +19,7 @@
1819
import java.util.Map;
1920

2021
public class MultiModalConversationQwenVLOcr {
21-
private static final String modelName = "qwen-vl-ocr-2025-02-18";
22+
private static final String modelName = "qwen-vl-ocr-2025-08-28";
2223
public static void videoImageListSample() throws ApiException, NoApiKeyException, UploadFileException {
2324
MultiModalConversation conv = new MultiModalConversation();
2425
MultiModalMessage systemMessage = MultiModalMessage.builder()
@@ -28,20 +29,20 @@ public static void videoImageListSample() throws ApiException, NoApiKeyException
2829

2930
Map<String, Object> imageContent = new HashMap<>();
3031
imageContent.put("type", "image");
31-
imageContent.put("image", "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/public_data/POIE/test_subset/nf0986.jpg");
32-
imageContent.put("min_pixels", "3136");
33-
imageContent.put("max_pixels", "2007040");
32+
imageContent.put("image", "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg");
33+
imageContent.put("min_pixels", "401408");
34+
imageContent.put("max_pixels", "6422528");
3435
imageContent.put("enable_rotate", false);
3536

3637
Map<String, Object> textContent = new HashMap<>();
3738
textContent.put("type", "text");
38-
textContent.put("text", "提取图像中的文字。");
39+
textContent.put("text", "定位所有的文字行,并且返回旋转矩形([cx, cy, width, height, angle])的坐标结果。");
3940

4041
JsonObject resultSchema = new JsonObject();
4142
resultSchema.addProperty("Calories", "");
4243

4344
OcrOptions ocrOptions = OcrOptions.builder()
44-
.task(OcrOptions.Task.KEY_INFORMATION_EXTRACTION)
45+
.task(OcrOptions.Task.ADVANCED_RECOGNITION)
4546
.taskConfig(OcrOptions.TaskConfig.builder()
4647
.resultSchema(resultSchema)
4748
.build())
@@ -65,6 +66,7 @@ public static void videoImageListSample() throws ApiException, NoApiKeyException
6566

6667
MultiModalConversationResult result = conv.call(param);
6768
System.out.println(result);
69+
System.out.println(JsonUtils.toJson(result));
6870
// Flowable<MultiModalConversationResult> result = conv.streamCall(param);
6971
// result.blockingForEach(System.out::println);
7072
}

src/main/java/com/alibaba/dashscope/aigc/multimodalconversation/OcrOptions.java

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,10 @@ public enum Task {
3434
FORMULA_RECOGNITION,
3535

3636
@SerializedName("multi_lan")
37-
MULTI_LAN
37+
MULTI_LAN,
38+
39+
@SerializedName("advanced_recognition")
40+
ADVANCED_RECOGNITION
3841
}
3942

4043
@Data

src/main/java/com/alibaba/dashscope/common/MultiModalMessageAdapter.java

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,8 +37,12 @@ private void writeValue(JsonWriter out, Object value) throws IOException {
3737
out.value((String) value);
3838
} else if (value instanceof Integer) {
3939
out.value((Integer) value);
40+
} else if (value instanceof Long) {
41+
out.value((Long) value);
4042
} else if (value instanceof Double) {
4143
out.value((Double) value);
44+
} else if (value instanceof Float) {
45+
out.value((Float) value);
4246
} else if (value instanceof Boolean) {
4347
out.value((Boolean) value);
4448
} else if (value instanceof Character) {

0 commit comments

Comments
 (0)