Skip to content

Commit 5f880bd

Browse files
authored
New Components - ocrspace (#15311)
* ocrspace init * [Components] ocrspace #15148 Actions - Process Image - Process PDF * pnpm update * some adjusts * pnpm update * fix file field name
1 parent 320ace7 commit 5f880bd

File tree

8 files changed

+394
-4
lines changed

8 files changed

+394
-4
lines changed
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import { ConfigurationError } from "@pipedream/platform";
2+
import FormData from "form-data";
3+
import { getUrlOrFile } from "../../common/utils.mjs";
4+
import ocrspace from "../../ocrspace.app.mjs";
5+
6+
/**
 * Shared base for the OCR.space "process" actions.
 *
 * Declares the OCR option props common to every action and the `run`
 * implementation that builds the multipart request. An action that spreads
 * this object must add a `file` prop (read as `this.file`) and a
 * `getSummary()` method; `filetype` is optional and only sent when the
 * action defines it.
 */
export default {
  props: {
    ocrspace,
    language: {
      propDefinition: [
        ocrspace,
        "language",
      ],
    },
    isOverlayRequired: {
      propDefinition: [
        ocrspace,
        "isOverlayRequired",
      ],
    },
    detectOrientation: {
      propDefinition: [
        ocrspace,
        "detectOrientation",
      ],
    },
    scale: {
      propDefinition: [
        ocrspace,
        "scale",
      ],
    },
    isTable: {
      propDefinition: [
        ocrspace,
        "isTable",
      ],
    },
    ocrEngine: {
      propDefinition: [
        ocrspace,
        "ocrEngine",
      ],
    },
  },
  /**
   * Sends the configured file (remote URL or local file) to OCR.space.
   *
   * @param {object} context - Pipedream run context; `$` is forwarded to the
   *   app request and used to export the step summary.
   * @returns {Promise<object>} The response returned by `ocrspace.processImage`.
   * @throws {ConfigurationError} When the response carries an `ErrorMessage`.
   */
  async run({ $ }) {
    const data = new FormData();
    const {
      url, file,
    } = getUrlOrFile(this.file);

    // Exactly one of `url` / `file` is returned by getUrlOrFile.
    if (url) data.append("url", url);
    if (file) data.append("base64Image", file);

    // Form field name -> prop value. Only truthy values are sent, so unset
    // props fall back to the API defaults. The language prop is declared as
    // `language`, so it must be read from `this.language`.
    const optionalFields = {
      language: this.language,
      isOverlayRequired: this.isOverlayRequired,
      filetype: this.filetype,
      detectOrientation: this.detectOrientation,
      scale: this.scale,
      isTable: this.isTable,
      OCREngine: this.ocrEngine,
    };
    for (const [
      fieldName,
      value,
    ] of Object.entries(optionalFields)) {
      if (value) data.append(fieldName, `${value}`);
    }

    const response = await this.ocrspace.processImage({
      $,
      data,
      headers: data.getHeaders(),
    });

    // Check for an API-level error before reporting success, so a failed
    // request never exports a success summary.
    const errorMessage = response.ErrorMessage;
    if (errorMessage) {
      // Accept either a list of messages or a single string.
      const [
        firstMessage,
      ] = [].concat(errorMessage);
      throw new ConfigurationError(firstMessage);
    }

    $.export("$summary", this.getSummary());

    return response;
  },
};
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import common from "../common/process-base.mjs";
2+
3+
// Props shared by every OCR.space processing action, plus the app reference
// used to resolve prop definitions.
const { props: baseProps } = common;
const { ocrspace: app } = baseProps;

/**
 * "Process Image" action: the shared processing base plus the image-specific
 * `file` and `filetype` props.
 */
export default {
  ...common,
  key: "ocrspace-process-image",
  name: "Process Image",
  description: "Submits an image file for OCR processing using OCR.space. [See the documentation](https://ocr.space/ocrapi)",
  version: "0.0.1",
  type: "action",
  props: {
    ...baseProps,
    file: {
      propDefinition: [
        app,
        "file",
      ],
    },
    filetype: {
      propDefinition: [
        app,
        "filetype",
      ],
    },
  },
  methods: {
    // Summary text exported by the shared `run` implementation.
    getSummary() {
      return "Image submitted for OCR processing.";
    },
  },
};
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import common from "../common/process-base.mjs";
2+
3+
// Props shared by every OCR.space processing action, plus the app reference
// used to resolve prop definitions.
const { props: baseProps } = common;
const { ocrspace: app } = baseProps;

/**
 * "Process PDF for OCR" action: the shared processing base with the `file`
 * prop relabelled and re-described for PDF input.
 */
export default {
  ...common,
  key: "ocrspace-process-pdf",
  name: "Process PDF for OCR",
  description: "Submit a PDF for OCR processing. [See the documentation](https://ocr.space/ocrapi)",
  version: "0.0.1",
  type: "action",
  props: {
    ...baseProps,
    file: {
      propDefinition: [
        app,
        "file",
      ],
      label: "PDF File",
      description: "The URL of the PDF file or the path to the file saved to the `/tmp` directory (e.g. `/tmp/example.pdf`) to process. [See the documentation](https://pipedream.com/docs/workflows/steps/code/nodejs/working-with-files/#the-tmp-directory).",
    },
  },
  methods: {
    // Summary text exported by the shared `run` implementation.
    getSummary() {
      return "Submitted PDF for OCR processing.";
    },
  },
};
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/**
 * Options for the "Language" prop, as `{ label, value }` entries.
 * Kept as a compact [label, code] table and expanded once at module load.
 */
export const LANGUAGE_OPTIONS = [
  ["Arabic", "ara"],
  ["Bulgarian", "bul"],
  ["Chinese (Simplified)", "chs"],
  ["Chinese (Traditional)", "cht"],
  ["Croatian", "hrv"],
  ["Czech", "cze"],
  ["Danish", "dan"],
  ["Dutch", "dut"],
  ["English", "eng"],
  ["Finnish", "fin"],
  ["French", "fre"],
  ["German", "ger"],
  ["Greek", "gre"],
  ["Hungarian", "hun"],
  ["Korean", "kor"],
  ["Italian", "ita"],
  ["Japanese", "jpn"],
  ["Polish", "pol"],
  ["Portuguese", "por"],
  ["Russian", "rus"],
  ["Slovenian", "slv"],
  ["Spanish", "spa"],
  ["Swedish", "swe"],
  ["Turkish", "tur"],
].map(([label, value]) => ({
  label,
  value,
}));
99+
100+
/** Image formats offered by the "File Type" prop. */
export const IMAGE_FILETYPE_OPTIONS = ["GIF", "PNG", "JPG", "TIF", "BMP"];
107+
108+
/**
 * Options for the "OCR Engine" prop. Values are the engine numbers as
 * strings; labels are derived from them.
 */
export const OCR_ENGINE_OPTIONS = ["1", "2"].map((engine) => ({
  label: `OCR Engine ${engine}`,
  value: engine,
}));
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import fs from "fs";
2+
import mime from "mime";
3+
4+
/**
 * Tests whether a string looks like an http(s) URL.
 *
 * The scheme is optional in the pattern, so a bare host-like string such as
 * `example.jpg` is also accepted.
 *
 * @param {string} urlString - Candidate value.
 * @returns {boolean} `true` when the whole string matches the URL pattern.
 */
export const isValidUrl = (urlString) => {
  const patternSource = [
    "^(https?:\\/\\/)?", // protocol
    "((([a-z\\d]([a-z\\d-]*[a-z\\d])*)\\.)+[a-z]{2,}|", // domain name
    "((\\d{1,3}\\.){3}\\d{1,3}))", // OR ip (v4) address
    "(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*", // port and path
    "(\\?[;&a-z\\d%_.~+=-]*)?", // query string
    "(\\#[-a-z\\d_]*)?$", // fragment locator
  ].join("");
  const matcher = new RegExp(patternSource, "i");
  return matcher.test(urlString);
};
13+
14+
/**
 * Ensures a file path points into the `/tmp` directory.
 *
 * Only a path that already starts with `/tmp` is returned unchanged. A path
 * that merely contains `/tmp` somewhere else (e.g. `data/tmp/a.png`) is
 * still relative and gets the `/tmp/` prefix.
 *
 * @param {string} filename - File name or path supplied by the user.
 * @returns {string} The path rooted in `/tmp`.
 */
export const checkTmp = (filename) => {
  if (filename.startsWith("/tmp")) {
    return filename;
  }
  return `/tmp/${filename}`;
};
20+
21+
/**
 * Resolves the user-supplied `file` value to either a remote URL or an
 * inline base64 data URI.
 *
 * @param {string} url - A URL, or a file name/path resolved against `/tmp`.
 * @returns {{url: string}|{file: string}} `{ url }` when the value passes
 *   `isValidUrl`; otherwise `{ file }` holding a
 *   `data:<mime>;base64,<content>` URI built from the file on disk.
 */
export const getUrlOrFile = (url) => {
  if (isValidUrl(url)) {
    return {
      url,
    };
  }

  const localPath = checkTmp(url);
  const encodedContent = fs.readFileSync(localPath).toString("base64");
  const contentType = mime.getType(localPath);
  return {
    file: `data:${contentType};base64,${encodedContent}`,
  };
};
Lines changed: 83 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,90 @@
1+
import { axios } from "@pipedream/platform";
2+
import {
3+
IMAGE_FILETYPE_OPTIONS,
4+
LANGUAGE_OPTIONS,
5+
OCR_ENGINE_OPTIONS,
6+
} from "./common/constants.mjs";
7+
18
// Builds an optional boolean prop definition.
const optionalFlag = (label, description) => ({
  type: "boolean",
  label,
  description,
  optional: true,
});

// Builds an optional string prop definition restricted to a list of options.
const optionalChoice = (label, description, options) => ({
  type: "string",
  label,
  description,
  options,
  optional: true,
});

/**
 * OCR.space app definition: reusable prop definitions plus the HTTP helpers
 * the actions use to call the API.
 */
export default {
  type: "app",
  app: "ocrspace",
  propDefinitions: {
    file: {
      type: "string",
      label: "Image",
      description: "The URL of the image or the path to the file saved to the `/tmp` directory (e.g. `/tmp/example.jpg`) to process. [See the documentation](https://pipedream.com/docs/workflows/steps/code/nodejs/working-with-files/#the-tmp-directory).",
    },
    language: optionalChoice(
      "Language",
      "Language setting for image OCR processing.",
      LANGUAGE_OPTIONS,
    ),
    isOverlayRequired: optionalFlag(
      "Is Overlay Required",
      "If true, returns the coordinates of the bounding boxes for each word. If false, the OCR'ed text is returned only as a text block (this makes the JSON reponse smaller). Overlay data can be used, for example, to show [text over the image](https://ocr.space/english).",
    ),
    filetype: optionalChoice(
      "File Type",
      "Overwrites the automatic file type detection based on content-type. Supported image file formats are png, jpg (jpeg), gif, tif (tiff) and bmp. For document ocr, the api supports the Adobe PDF format. Multi-page TIFF files are supported.",
      IMAGE_FILETYPE_OPTIONS,
    ),
    detectOrientation: optionalFlag(
      "Detect Orientation",
      "If set to true, the api autorotates the image correctly and sets the TextOrientation parameter in the JSON response. If the image is not rotated, then TextOrientation=0, otherwise it is the degree of the rotation, e. g. \"270\".",
    ),
    scale: optionalFlag(
      "Scale",
      "If set to true, the api does some internal upscaling. This can improve the OCR result significantly, especially for low-resolution PDF scans. Note that the front page demo uses scale=true, but the API uses scale=false by default. See also this OCR forum post.",
    ),
    isTable: optionalFlag(
      "Is Table",
      "If set to true, the OCR logic makes sure that the parsed text result is always returned line by line. This switch is recommended for [table OCR](https://ocr.space/tablerecognition), [receipt OCR](https://ocr.space/receiptscanning), invoice processing and all other type of input documents that have a table like structure.",
    ),
    ocrEngine: optionalChoice(
      "OCR Engine",
      "Engine 1 is default. [See OCR Engines](https://ocr.space/OCRAPI#ocrengine).",
      OCR_ENGINE_OPTIONS,
    ),
  },
  methods: {
    // Root URL that every request path is appended to.
    _baseUrl() {
      return "https://api.ocr.space";
    },
    // Adds the connected account's API key; caller-supplied headers are
    // applied last, so they win on key collisions.
    _headers(headers = {}) {
      return Object.assign({
        apikey: this.$auth.apikey,
      }, headers);
    },
    // Issues a request through the platform axios wrapper. Any extra options
    // are passed through and may override `url` / `headers`.
    _makeRequest({
      $ = this, path, headers, ...opts
    }) {
      const url = `${this._baseUrl()}${path}`;
      return axios($, {
        url,
        headers: this._headers(headers),
        ...opts,
      });
    },
    // POSTs to the `/parse/image` endpoint; `opts` may override the defaults.
    processImage(opts = {}) {
      const request = {
        method: "POST",
        path: "/parse/image",
        ...opts,
      };
      return this._makeRequest(request);
    },
  },
};

0 commit comments

Comments
 (0)