diff --git a/components/ocrspace/actions/common/process-base.mjs b/components/ocrspace/actions/common/process-base.mjs
new file mode 100644
index 0000000000000..2a8fbc60500d1
--- /dev/null
+++ b/components/ocrspace/actions/common/process-base.mjs
@@ -0,0 +1,102 @@
+import { ConfigurationError } from "@pipedream/platform";
+// NOTE(review): `form-data` is imported below but is not declared in
+// components/ocrspace/package.json - confirm the dependency gets added.
+import FormData from "form-data";
+import { getUrlOrFile } from "../../common/utils.mjs";
+import ocrspace from "../../ocrspace.app.mjs";
+
+/**
+ * Shared definition for the OCR.space processing actions.
+ *
+ * `run` also reads `this.file`, `this.filetype` and `this.getSummary()`, none
+ * of which are declared here; an action that spreads this object has to
+ * supply the ones it needs.
+ */
+export default {
+  props: {
+    ocrspace,
+    language: {
+      propDefinition: [
+        ocrspace,
+        "language",
+      ],
+    },
+    isOverlayRequired: {
+      propDefinition: [
+        ocrspace,
+        "isOverlayRequired",
+      ],
+    },
+    detectOrientation: {
+      propDefinition: [
+        ocrspace,
+        "detectOrientation",
+      ],
+    },
+    scale: {
+      propDefinition: [
+        ocrspace,
+        "scale",
+      ],
+    },
+    isTable: {
+      propDefinition: [
+        ocrspace,
+        "isTable",
+      ],
+    },
+    ocrEngine: {
+      propDefinition: [
+        ocrspace,
+        "ocrEngine",
+      ],
+    },
+  },
+  /**
+   * Builds the form payload from the configured props, posts it through
+   * `ocrspace.processImage` and returns the response.
+   *
+   * @param {object} ctx
+   * @param {object} ctx.$ - passed on to the request and used to export the summary
+   * @returns {Promise} resolves to the response of `ocrspace.processImage`
+   * @throws {ConfigurationError} if the response has a truthy `ErrorMessage`
+   */
+  async run({ $ }) {
+    const data = new FormData();
+    // `getUrlOrFile` yields either `url` or `file`, never both.
+    const {
+      url, file,
+    } = getUrlOrFile(this.file);
+
+    if (url) data.append("url", url);
+    if (file) data.append("base64Image", file);
+    // The prop is declared as `language`, so it must be read under that name.
+    if (this.language) data.append("language", this.language);
+    if (this.isOverlayRequired) data.append("isOverlayRequired", `${this.isOverlayRequired}`);
+    if (this.filetype) data.append("filetype", this.filetype);
+    if (this.detectOrientation) data.append("detectOrientation", `${this.detectOrientation}`);
+    if (this.scale) data.append("scale", `${this.scale}`);
+    if (this.isTable) data.append("isTable", `${this.isTable}`);
+    if (this.ocrEngine) data.append("OCREngine", this.ocrEngine);
+
+    const response = await this.ocrspace.processImage({
+      $,
+      data,
+      headers: data.getHeaders(),
+    });
+
+    if (response.ErrorMessage) {
+      // `[].concat` normalizes both a list of messages and a bare string;
+      // only the first message is reported.
+      const [
+        message,
+      ] = [].concat(response.ErrorMessage);
+      throw new ConfigurationError(message);
+    }
+
+    // Export the success summary only after the error check has passed.
+    $.export("$summary", this.getSummary());
+
+    return response;
+  },
+};
diff --git a/components/ocrspace/actions/process-image/process-image.mjs
b/components/ocrspace/actions/process-image/process-image.mjs
new file mode 100644
index 0000000000000..dace35e1e2ca1
--- /dev/null
+++ b/components/ocrspace/actions/process-image/process-image.mjs
@@ -0,0 +1,38 @@
+import common from "../common/process-base.mjs";
+
+// The app definition is reused for every propDefinition in this action.
+const ocrspace = common.props.ocrspace;
+
+/**
+ * Action that sends an image (URL or file path) to OCR.space, building on
+ * the shared `run` and props from `process-base.mjs`.
+ */
+export default {
+  ...common,
+  key: "ocrspace-process-image",
+  name: "Process Image",
+  description: "Submits an image file for OCR processing using OCR.space. [See the documentation](https://ocr.space/ocrapi)",
+  version: "0.0.1",
+  type: "action",
+  props: {
+    ...common.props,
+    file: {
+      propDefinition: [
+        ocrspace,
+        "file",
+      ],
+    },
+    filetype: {
+      propDefinition: [
+        ocrspace,
+        "filetype",
+      ],
+    },
+  },
+  methods: {
+    // Summary text for the step; the shared `run` calls `this.getSummary()`.
+    getSummary() {
+      return "Image submitted for OCR processing.";
+    },
+  },
+};
diff --git a/components/ocrspace/actions/process-pdf/process-pdf.mjs b/components/ocrspace/actions/process-pdf/process-pdf.mjs
new file mode 100644
index 0000000000000..cda0790f5c620
--- /dev/null
+++ b/components/ocrspace/actions/process-pdf/process-pdf.mjs
@@ -0,0 +1,35 @@
+import common from "../common/process-base.mjs";
+
+// The app definition is reused for the propDefinition in this action.
+const ocrspace = common.props.ocrspace;
+
+/**
+ * Action that sends a PDF (URL or file path) to OCR.space, building on the
+ * shared `run` and props from `process-base.mjs`.
+ */
+export default {
+  ...common,
+  key: "ocrspace-process-pdf",
+  name: "Process PDF for OCR",
+  description: "Submit a PDF for OCR processing. [See the documentation](https://ocr.space/ocrapi)",
+  version: "0.0.1",
+  type: "action",
+  props: {
+    ...common.props,
+    file: {
+      propDefinition: [
+        ocrspace,
+        "file",
+      ],
+      // Override the generic image wording of the shared `file` definition.
+      label: "PDF File",
+      description: "The URL of the PDF file or the path to the file saved to the `/tmp` directory (e.g. `/tmp/example.pdf`) to process. [See the documentation](https://pipedream.com/docs/workflows/steps/code/nodejs/working-with-files/#the-tmp-directory).",
+    },
+  },
+  methods: {
+    // Summary text for the step; the shared `run` calls `this.getSummary()`.
+    getSummary() {
+      return "Submitted PDF for OCR processing.";
+    },
+  },
+};
diff --git a/components/ocrspace/common/constants.mjs b/components/ocrspace/common/constants.mjs
new file mode 100644
index 0000000000000..ea51ce5ad0337
--- /dev/null
+++ b/components/ocrspace/common/constants.mjs
@@ -0,0 +1,55 @@
+// Display name -> language code; insertion order is kept in LANGUAGE_OPTIONS.
+const LANGUAGE_CODES = {
+  "Arabic": "ara",
+  "Bulgarian": "bul",
+  "Chinese (Simplified)": "chs",
+  "Chinese (Traditional)": "cht",
+  "Croatian": "hrv",
+  "Czech": "cze",
+  "Danish": "dan",
+  "Dutch": "dut",
+  "English": "eng",
+  "Finnish": "fin",
+  "French": "fre",
+  "German": "ger",
+  "Greek": "gre",
+  "Hungarian": "hun",
+  "Korean": "kor",
+  "Italian": "ita",
+  "Japanese": "jpn",
+  "Polish": "pol",
+  "Portuguese": "por",
+  "Russian": "rus",
+  "Slovenian": "slv",
+  "Spanish": "spa",
+  "Swedish": "swe",
+  "Turkish": "tur",
+};
+
+// Engine identifiers turned into labelled options below.
+const OCR_ENGINES = [
+  "1",
+  "2",
+];
+
+export const LANGUAGE_OPTIONS = Object.entries(LANGUAGE_CODES)
+  .map(([
+    label,
+    value,
+  ]) => ({
+    label,
+    value,
+  }));
+
+export const IMAGE_FILETYPE_OPTIONS = [
+  "GIF",
+  "PNG",
+  "JPG",
+  "TIF",
+  "BMP",
+];
+
+export const OCR_ENGINE_OPTIONS = OCR_ENGINES.map((value) => ({
+  label: `OCR Engine ${value}`,
+  value,
+}));
diff --git a/components/ocrspace/common/utils.mjs b/components/ocrspace/common/utils.mjs
new file mode 100644
index 0000000000000..3384dcc350320
--- /dev/null
+++
b/components/ocrspace/common/utils.mjs
@@ -0,0 +1,68 @@
+import fs from "fs";
+import mime from "mime";
+
+// URL grammar: optional http(s) scheme, a domain name or IPv4 address, then
+// optional port, path, query string and fragment. Domain labels are spelled as
+// alphanumeric runs joined by hyphen runs, so a label can be split in only one
+// way and a long non-matching input cannot cause exponential backtracking.
+const URL_PATTERN = new RegExp([
+  "^(https?:\\/\\/)?", // protocol
+  "(([a-z\\d]+(-+[a-z\\d]+)*\\.)+[a-z]{2,}|", // domain name
+  "((\\d{1,3}\\.){3}\\d{1,3}))", // OR ip (v4) address
+  "(\\:\\d+)?(\\/[-a-z\\d%_.~+]*)*", // port and path
+  "(\\?[;&a-z\\d%_.~+=-]*)?", // query string
+  "(\\#[-a-z\\d_]*)?$", // fragment locator
+].join(""), "i");
+
+/**
+ * Tells whether `urlString` matches `URL_PATTERN`.
+ *
+ * The scheme is optional, so schemeless values such as `example.com/a.png`
+ * (and bare file names like `example.jpg`) are accepted too.
+ *
+ * @param {string} urlString
+ * @returns {boolean}
+ */
+export const isValidUrl = (urlString) => URL_PATTERN.test(urlString);
+
+/**
+ * Resolves `filename` to a path under `/tmp`.
+ *
+ * @param {string} filename
+ * @returns {string} `filename` unchanged when it already starts with `/tmp/`,
+ * otherwise `filename` prefixed with `/tmp/`.
+ */
+export const checkTmp = (filename) => {
+  // Only a leading `/tmp/` counts; `/tmp` elsewhere in the path does not.
+  if (filename.startsWith("/tmp/")) {
+    return filename;
+  }
+  return `/tmp/${filename}`;
+};
+
+/**
+ * Classifies `url` as a remote URL or a local file.
+ *
+ * @param {string} url - a URL or a file path/name
+ * @returns {{url: string}|{file: string}} `{ url }` for a URL, otherwise
+ * `{ file }` holding the file content as a base64 data URI.
+ * @throws if the file cannot be read (propagated from `fs.readFileSync`).
+ */
+export const getUrlOrFile = (url) => {
+  const filePath = checkTmp(url);
+  // A schemeless value like `example.jpg` also passes `isValidUrl`, so it is
+  // only treated as a URL when no file exists at the resolved path.
+  const hasScheme = /^https?:\/\//i.test(url);
+  if (isValidUrl(url) && (hasScheme || !fs.existsSync(filePath))) {
+    return {
+      url,
+    };
+  }
+  // `mime.getType` returns null for unrecognized extensions.
+  const mimeType = mime.getType(filePath);
+  // `readFileSync` already returns a Buffer, so it can be encoded directly.
+  const base64Image = fs.readFileSync(filePath).toString("base64");
+  return {
+    file: `data:${mimeType};base64,${base64Image}`,
+  };
+};
diff --git a/components/ocrspace/ocrspace.app.mjs b/components/ocrspace/ocrspace.app.mjs
index 915aa5782b482..ae96d7db00db7 100644
--- a/components/ocrspace/ocrspace.app.mjs
+++ b/components/ocrspace/ocrspace.app.mjs
@@ -1,11 +1,97 @@
+import { axios } from "@pipedream/platform";
+import {
+  IMAGE_FILETYPE_OPTIONS,
+  LANGUAGE_OPTIONS,
+  OCR_ENGINE_OPTIONS,
+} from "./common/constants.mjs";
+
 export default {
   type: "app",
   app: "ocrspace",
-  propDefinitions: {},
+  propDefinitions: {
+    file: {
+      type: "string",
+      label: "Image",
+      description: "The URL of the image or the path to the file saved to the `/tmp` directory (e.g. `/tmp/example.jpg`) to process. [See the documentation](https://pipedream.com/docs/workflows/steps/code/nodejs/working-with-files/#the-tmp-directory).",
+    },
+    language: {
+      type: "string",
+      label: "Language",
+      description: "Language setting for image OCR processing.",
+      options: LANGUAGE_OPTIONS,
+      optional: true,
+    },
+    isOverlayRequired: {
+      type: "boolean",
+      label: "Is Overlay Required",
+      description: "If true, returns the coordinates of the bounding boxes for each word. If false, the OCR'ed text is returned only as a text block (this makes the JSON response smaller). Overlay data can be used, for example, to show [text over the image](https://ocr.space/english).",
+      optional: true,
+    },
+    filetype: {
+      type: "string",
+      label: "File Type",
+      description: "Overwrites the automatic file type detection based on content-type. Supported image file formats are png, jpg (jpeg), gif, tif (tiff) and bmp. For document ocr, the api supports the Adobe PDF format. Multi-page TIFF files are supported.",
+      options: IMAGE_FILETYPE_OPTIONS,
+      optional: true,
+    },
+    detectOrientation: {
+      type: "boolean",
+      label: "Detect Orientation",
+      description: "If set to true, the api autorotates the image correctly and sets the TextOrientation parameter in the JSON response. If the image is not rotated, then TextOrientation=0, otherwise it is the degree of the rotation, e. g. \"270\".",
+      optional: true,
+    },
+    scale: {
+      type: "boolean",
+      label: "Scale",
+      description: "If set to true, the api does some internal upscaling. This can improve the OCR result significantly, especially for low-resolution PDF scans. Note that the front page demo uses scale=true, but the API uses scale=false by default. See also this OCR forum post.",
+      optional: true,
+    },
+    isTable: {
+      type: "boolean",
+      label: "Is Table",
+      description: "If set to true, the OCR logic makes sure that the parsed text result is always returned line by line. This switch is recommended for [table OCR](https://ocr.space/tablerecognition), [receipt OCR](https://ocr.space/receiptscanning), invoice processing and all other type of input documents that have a table like structure.",
+      optional: true,
+    },
+    ocrEngine: {
+      type: "string",
+      label: "OCR Engine",
+      description: "Engine 1 is default. [See OCR Engines](https://ocr.space/OCRAPI#ocrengine).",
+      options: OCR_ENGINE_OPTIONS,
+      optional: true,
+    },
+  },
   methods: {
-    // this.$auth contains connected account data
-    authKeys() {
-      console.log(Object.keys(this.$auth));
+    // Root URL every request path is appended to.
+    _baseUrl() {
+      return "https://api.ocr.space";
+    },
+    // Request headers: the account API key plus any caller-supplied headers,
+    // which are spread last and therefore take precedence.
+    _headers(headers = {}) {
+      return {
+        "apikey": this.$auth.apikey,
+        ...headers,
+      };
+    },
+    // Issues a request through the platform `axios`; `path` is appended to the
+    // base URL and all remaining options are passed through unchanged.
+    _makeRequest({
+      $ = this, path, headers, ...opts
+    }) {
+      return axios($, {
+        url: this._baseUrl() + path,
+        headers: this._headers(headers),
+        ...opts,
+      });
+    },
+    // POSTs to `/parse/image`; `opts` can override `method` and `path` because
+    // they are spread last.
+    processImage(opts = {}) {
+      return this._makeRequest({
+        method: "POST",
+        path: "/parse/image",
+        ...opts,
+      });
     },
   },
 };
diff --git a/components/ocrspace/package.json b/components/ocrspace/package.json
new file mode 100644
index 0000000000000..a0ab1ae330c10
--- /dev/null
+++ b/components/ocrspace/package.json
@@ -0,0 +1,19 @@
+{
+  "name": "@pipedream/ocrspace",
+  "version": "0.1.0",
+  "description": "Pipedream OCRSpace Components",
+  "main": "ocrspace.app.mjs",
+  "keywords": [
+    "pipedream",
+    "ocrspace"
+  ],
+  "homepage": "https://pipedream.com/apps/ocrspace",
+  "author": "Pipedream (https://pipedream.com/)",
+  "publishConfig": {
+    "access": "public"
+  },
+  "dependencies": {
+    "@pipedream/platform": "^3.0.3",
+    "mime": "^4.0.6"
+  }
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index ca391384a140c..b484508c2f1d9 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -7099,6 +7099,15 @@ importers:
 
   components/ocr_web_service: {}
 
+  components/ocrspace:
+    dependencies:
+      '@pipedream/platform':
+        specifier: ^3.0.3
+        version: 3.0.3
+      mime:
+        specifier: ^4.0.6
+        version: 4.0.6
+
   components/octoparse: {}
components/octopus_deploy: {} @@ -9592,8 +9601,7 @@ importers: components/showpad: {} - components/shutterstock: - specifiers: {} + components/shutterstock: {} components/sidetracker: {} @@ -22374,6 +22382,11 @@ packages: engines: {node: '>=16'} hasBin: true + mime@4.0.6: + resolution: {integrity: sha512-4rGt7rvQHBbaSOF9POGkk1ocRP16Md1x36Xma8sz8h8/vfCUI2OtEIeCqe4Ofes853x4xDoPiFLIT47J5fI/7A==} + engines: {node: '>=16'} + hasBin: true + mimer@2.0.2: resolution: {integrity: sha512-izxvjsB7Ur5HrTbPu6VKTrzxSMBFBqyZQc6dWlZNQ4/wAvf886fD4lrjtFd8IQ8/WmZKdxKjUtqFFNaj3hQ52g==} engines: {node: '>= 12'} @@ -31013,6 +31026,8 @@ snapshots: '@putout/operator-filesystem': 5.0.0(putout@36.13.1(eslint@8.57.1)(typescript@5.6.3)) '@putout/operator-json': 2.2.0 putout: 36.13.1(eslint@8.57.1)(typescript@5.6.3) + transitivePeerDependencies: + - supports-color '@putout/operator-regexp@1.0.0(putout@36.13.1(eslint@8.57.1)(typescript@5.6.3))': dependencies: @@ -39672,6 +39687,8 @@ snapshots: mime@4.0.4: {} + mime@4.0.6: {} + mimer@2.0.2: {} mimic-fn@2.1.0: {}