From 7a3e4cefa692252ca97aca676684a58415acb693 Mon Sep 17 00:00:00 2001 From: D-K-P <8297864+D-K-P@users.noreply.github.com> Date: Thu, 24 Apr 2025 12:27:51 +0100 Subject: [PATCH 1/4] Added Python MarkItDown docs --- docs/docs.json | 1 + docs/guides/introduction.mdx | 1 + docs/guides/python/python-doc-to-markdown.mdx | 274 ++++++++++++++++++ 3 files changed, 276 insertions(+) create mode 100644 docs/guides/python/python-doc-to-markdown.mdx diff --git a/docs/docs.json b/docs/docs.json index 061637c984..9b394980a3 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -312,6 +312,7 @@ "group": "Python guides", "pages": [ "guides/python/python-image-processing", + "guides/python/python-doc-to-markdown", "guides/python/python-crawl4ai", "guides/python/python-pdf-form-extractor" ] diff --git a/docs/guides/introduction.mdx b/docs/guides/introduction.mdx index add0c353c1..9a7e7761eb 100644 --- a/docs/guides/introduction.mdx +++ b/docs/guides/introduction.mdx @@ -29,6 +29,7 @@ Get set up fast using our detailed walk-through guides. 
| [Cursor rules](/guides/cursor-rules) | Use Cursor rules to help write Trigger.dev tasks | | [Prisma](/guides/frameworks/prisma) | How to setup Prisma with Trigger.dev | | [Python image processing](/guides/python/python-image-processing) | Use Python and Pillow to process images | +| [Python document to markdown](/guides/python/python-doc-to-markdown) | Use Python and MarkItDown to convert documents to markdown | | [Python PDF form extractor](/guides/python/python-pdf-form-extractor) | Use Python, PyMuPDF and Trigger.dev to extract data from a PDF form | | [Python web crawler](/guides/python/python-crawl4ai) | Use Python, Crawl4AI and Playwright to create a headless web crawler | | [Sequin database triggers](/guides/frameworks/sequin) | Trigger tasks from database changes using Sequin | diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx new file mode 100644 index 0000000000..aa718dca8d --- /dev/null +++ b/docs/guides/python/python-doc-to-markdown.mdx @@ -0,0 +1,274 @@ +--- +title: "Convert documents to markdown using Python and MarkItDown" +sidebarTitle: "Python document to markdown" +description: "Learn how to use Trigger.dev with Python to convert documents to markdown using MarkItDown." +--- + +import PythonLearnMore from "/snippets/python-learn-more.mdx"; + +## Overview + +Convert documents to markdown using Microsoft's [MarkItDown](https://github.com/microsoft/markitdown) library. This can be especially useful for preparing documents in a structured format for AI applications. + +## Prerequisites + +- A project with [Trigger.dev initialized](/quick-start) +- [Python](https://www.python.org/) installed on your local machine. 
_This example requires Python 3.10 or higher._ + +## Features + +- A Trigger.dev task which downloads a document from a URL and runs the Python script which converts it to markdown +- A Python script to convert documents to markdown using Microsoft's [MarkItDown](https://github.com/microsoft/markitdown) library +- Uses our [Python build extension](/config/extensions/pythonExtension) to install dependencies and run Python scripts + +## GitHub repo + + + Click here to view the full code for this project in our examples repository on GitHub. You can + fork it and use it as a starting point for your own project. + + +## The code + +### Build configuration + +After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file: + +```ts trigger.config.ts +import { pythonExtension } from "@trigger.dev/python/extension"; +import { defineConfig } from "@trigger.dev/sdk/v3"; + +export default defineConfig({ + runtime: "node", + project: "", + // Your other config settings... + build: { + extensions: [ + pythonExtension({ + // The path to your requirements.txt file + requirementsFile: "./requirements.txt", + // The path to your Python binary + devPythonBinaryPath: `venv/bin/python`, + // The paths to your Python scripts to run + scripts: ["src/python/**/*.py"], + }), + ], + }, +}); +``` + + + Learn more about executing scripts in your Trigger.dev project using our Python build extension + [here](/config/extensions/pythonExtension). + + +### Task code + +This task uses the `python.runScript` method to run the `markdown-converter.py` script with the given document URL as an argument. 
+ +```ts src/trigger/convertToMarkdown.ts +import { task } from "@trigger.dev/sdk/v3"; +import { python } from "@trigger.dev/python"; +import { z } from "zod"; +import * as fs from "fs"; +import * as path from "path"; +import * as os from "os"; +import * as https from "https"; +import * as http from "http"; + +export const convertToMarkdown = task({ + id: "convert-to-markdown", + run: async (payload: { url: string }) => { + try { + const { url } = payload; + + // STEP 1: Create temporary file with unique name + const tempDir = os.tmpdir(); + const fileName = `doc-${Date.now()}-${Math.random().toString(36).substring(2, 7)}`; + const urlPath = new URL(url).pathname; + // Detect file extension from URL or default to .docx + const extension = path.extname(urlPath) || ".docx"; + const tempFilePath = path.join(tempDir, `${fileName}${extension}`); + + // STEP 2: Download file from URL + await new Promise((resolve, reject) => { + const protocol = url.startsWith("https") ? https : http; + const file = fs.createWriteStream(tempFilePath); + + protocol + .get(url, (response) => { + if (response.statusCode !== 200) { + reject(new Error(`Download failed with status ${response.statusCode}`)); + return; + } + + response.pipe(file); + file.on("finish", () => { + file.close(); + resolve(); + }); + }) + .on("error", (err) => { + // Clean up on error + fs.unlink(tempFilePath, () => {}); + reject(err); + }); + }); + + // STEP 3: Run Python script to convert document to markdown + const pythonResult = await python.runScript("./src/python/markdown-converter.py", [ + JSON.stringify({ file_path: tempFilePath }), + ]); + + // STEP 4: Clean up temporary file + fs.unlink(tempFilePath, () => {}); + + // STEP 5: Process result - handle possible warnings + // Only treat stderr as error if we don't have stdout data + // This handles cases where non-critical warnings appear in stderr + if ( + pythonResult.stderr && + !pythonResult.stderr.includes("Couldn't find ffmpeg") && + !pythonResult.stdout + 
) { + throw new Error(`Python error: ${pythonResult.stderr}`); + } + + // If we got valid stdout data, parse and use it regardless of stderr warnings + // This ensures harmless warnings don't break the conversion + if (pythonResult.stdout) { + const result = JSON.parse(pythonResult.stdout); + + return { + url, + markdown: result.status === "success" ? result.markdown : null, + error: result.status === "error" ? result.error : null, + success: result.status === "success", + }; + } + + return { + url, + markdown: null, + error: "No output from Python script", + success: false, + }; + } catch (error) { + if (error instanceof z.ZodError) { + return { + url: payload.url, + markdown: null, + error: "Invalid URL format: " + error.errors[0].message, + success: false, + }; + } + + return { + url: payload.url, + markdown: null, + error: error instanceof Error ? error.message : String(error), + success: false, + }; + } + }, +}); +``` + +### Add a requirements.txt file + +Add the following to your `requirements.txt` file. This is required in Python projects to install the dependencies. + +```txt requirements.txt +markitdown[all] +``` + +### The Python script + +The Python script uses MarkItDown to convert documents to Markdown format. 
+ +```python src/python/markdown-converter.py +import json +import sys +import os +from markitdown import MarkItDown + +def convert_to_markdown(file_path): + """Convert a file to markdown format using MarkItDown""" + # Check if file exists + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + # Initialize MarkItDown + md = MarkItDown() + + # Convert the file + try: + result = md.convert(file_path) + return result.text_content + except Exception as e: + raise Exception(f"Error converting file: {str(e)}") + +def process_trigger_task(file_path): + """Process a file and convert to markdown""" + try: + markdown_result = convert_to_markdown(file_path) + return { + "status": "success", + "markdown": markdown_result + } + except Exception as e: + return { + "status": "error", + "error": str(e) + } + +if __name__ == "__main__": + # Get the file path from command line arguments + if len(sys.argv) < 2: + print(json.dumps({"status": "error", "error": "No file path provided"})) + sys.exit(1) + + try: + config = json.loads(sys.argv[1]) + file_path = config.get("file_path") + + if not file_path: + print(json.dumps({"status": "error", "error": "No file path specified in config"})) + sys.exit(1) + + result = process_trigger_task(file_path) + print(json.dumps(result)) + except Exception as e: + print(json.dumps({"status": "error", "error": str(e)})) + sys.exit(1) +``` + +## Testing your task + +1. Create a virtual environment `python -m venv venv` +2. Activate the virtual environment, depending on your OS: On Mac/Linux: `source venv/bin/activate`, on Windows: `venv\Scripts\activate` +3. Install the Python dependencies `pip install -r requirements.txt`. _Make sure you have Python 3.10 or higher installed._ +4. Copy the project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file. +5. Run the Trigger.dev CLI `dev` command (it may ask you to authorize the CLI if you haven't already). +6. 
Test the task in the dashboard by providing a valid document URL. +7. Deploy the task to production using the Trigger.dev CLI `deploy` command. + +## MarkItDown Conversion Capabilities + +- Convert various file formats to Markdown: + - Office formats (Word, PowerPoint, Excel) + - PDFs + - Images (with optional LLM-generated descriptions) + - HTML, CSV, JSON, XML + - Audio files (with optional transcription) + - ZIP archives + - And more +- Preserve document structure (headings, lists, tables, etc.) +- Handle multiple input methods (file paths, URLs, base64 data) +- Optional Azure Document Intelligence integration for better PDF and image conversion + + From 93cac410ada1dc4ab6a6ebe9258c989aae0cf82b Mon Sep 17 00:00:00 2001 From: D-K-P <8297864+D-K-P@users.noreply.github.com> Date: Thu, 24 Apr 2025 12:41:48 +0100 Subject: [PATCH 2/4] Simplified the task --- docs/guides/python/python-doc-to-markdown.mdx | 126 ++++++------------ 1 file changed, 41 insertions(+), 85 deletions(-) diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx index aa718dca8d..90fa7836cc 100644 --- a/docs/guides/python/python-doc-to-markdown.mdx +++ b/docs/guides/python/python-doc-to-markdown.mdx @@ -71,9 +71,8 @@ export default defineConfig({ This task uses the `python.runScript` method to run the `markdown-converter.py` script with the given document URL as an argument. 
```ts src/trigger/convertToMarkdown.ts -import { task } from "@trigger.dev/sdk/v3"; +import { task } from "@trigger.dev/sdk"; import { python } from "@trigger.dev/python"; -import { z } from "zod"; import * as fs from "fs"; import * as path from "path"; import * as os from "os"; @@ -83,97 +82,54 @@ import * as http from "http"; export const convertToMarkdown = task({ id: "convert-to-markdown", run: async (payload: { url: string }) => { - try { - const { url } = payload; - - // STEP 1: Create temporary file with unique name - const tempDir = os.tmpdir(); - const fileName = `doc-${Date.now()}-${Math.random().toString(36).substring(2, 7)}`; - const urlPath = new URL(url).pathname; - // Detect file extension from URL or default to .docx - const extension = path.extname(urlPath) || ".docx"; - const tempFilePath = path.join(tempDir, `${fileName}${extension}`); - - // STEP 2: Download file from URL - await new Promise((resolve, reject) => { - const protocol = url.startsWith("https") ? https : http; - const file = fs.createWriteStream(tempFilePath); - - protocol - .get(url, (response) => { - if (response.statusCode !== 200) { - reject(new Error(`Download failed with status ${response.statusCode}`)); - return; - } - - response.pipe(file); - file.on("finish", () => { - file.close(); - resolve(); - }); - }) - .on("error", (err) => { - // Clean up on error - fs.unlink(tempFilePath, () => {}); - reject(err); - }); + const { url } = payload; + + // STEP 1: Create temporary file with unique name + const tempDir = os.tmpdir(); + const fileName = `doc-${Date.now()}-${Math.random().toString(36).substring(2, 7)}`; + const urlPath = new URL(url).pathname; + const extension = path.extname(urlPath) || ".docx"; + const tempFilePath = path.join(tempDir, `${fileName}${extension}`); + + // STEP 2: Download file from URL + await new Promise((resolve, reject) => { + const protocol = url.startsWith("https") ? 
https : http; + const file = fs.createWriteStream(tempFilePath); + + protocol.get(url, (response) => { + response.pipe(file); + file.on("finish", () => { + file.close(); + resolve(); + }); }); + }); - // STEP 3: Run Python script to convert document to markdown - const pythonResult = await python.runScript("./src/python/markdown-converter.py", [ - JSON.stringify({ file_path: tempFilePath }), - ]); - - // STEP 4: Clean up temporary file - fs.unlink(tempFilePath, () => {}); - - // STEP 5: Process result - handle possible warnings - // Only treat stderr as error if we don't have stdout data - // This handles cases where non-critical warnings appear in stderr - if ( - pythonResult.stderr && - !pythonResult.stderr.includes("Couldn't find ffmpeg") && - !pythonResult.stdout - ) { - throw new Error(`Python error: ${pythonResult.stderr}`); - } - - // If we got valid stdout data, parse and use it regardless of stderr warnings - // This ensures harmless warnings don't break the conversion - if (pythonResult.stdout) { - const result = JSON.parse(pythonResult.stdout); + // STEP 3: Run Python script to convert document to markdown + const pythonResult = await python.runScript("./src/python/markdown-converter.py", [ + JSON.stringify({ file_path: tempFilePath }), + ]); - return { - url, - markdown: result.status === "success" ? result.markdown : null, - error: result.status === "error" ? 
result.error : null, - success: result.status === "success", - }; - } + // STEP 4: Clean up temporary file + fs.unlink(tempFilePath, () => {}); + // STEP 5: Process result + if (pythonResult.stdout) { + const result = JSON.parse(pythonResult.stdout); return { url, - markdown: null, - error: "No output from Python script", - success: false, - }; - } catch (error) { - if (error instanceof z.ZodError) { - return { - url: payload.url, - markdown: null, - error: "Invalid URL format: " + error.errors[0].message, - success: false, - }; - } - - return { - url: payload.url, - markdown: null, - error: error instanceof Error ? error.message : String(error), - success: false, + markdown: result.status === "success" ? result.markdown : null, + error: result.status === "error" ? result.error : null, + success: result.status === "success", }; } + + return { + url, + markdown: null, + error: "No output from Python script", + success: false, + }; }, }); ``` From 5eda05522d27e96d80d97ebca17c791b6c689305 Mon Sep 17 00:00:00 2001 From: D-K-P <8297864+D-K-P@users.noreply.github.com> Date: Mon, 28 Apr 2025 18:00:49 +0100 Subject: [PATCH 3/4] Updated example to use fetch and improved sidebar titles --- docs/guides/python/python-crawl4ai.mdx | 2 +- docs/guides/python/python-doc-to-markdown.mdx | 26 +++++++------------ .../guides/python/python-image-processing.mdx | 2 +- .../python/python-pdf-form-extractor.mdx | 2 +- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/docs/guides/python/python-crawl4ai.mdx b/docs/guides/python/python-crawl4ai.mdx index c821bf6c98..526d0eaf58 100644 --- a/docs/guides/python/python-crawl4ai.mdx +++ b/docs/guides/python/python-crawl4ai.mdx @@ -1,6 +1,6 @@ --- title: "Python headless browser web crawler example" -sidebarTitle: "Python headless web crawler" +sidebarTitle: "Headless web crawler" description: "Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev." 
--- diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx index 90fa7836cc..ff5dad29e1 100644 --- a/docs/guides/python/python-doc-to-markdown.mdx +++ b/docs/guides/python/python-doc-to-markdown.mdx @@ -1,11 +1,16 @@ --- title: "Convert documents to markdown using Python and MarkItDown" -sidebarTitle: "Python document to markdown" +sidebarTitle: "Convert docs to markdown" description: "Learn how to use Trigger.dev with Python to convert documents to markdown using MarkItDown." --- import PythonLearnMore from "/snippets/python-learn-more.mdx"; + + This project uses Trigger.dev v4 (which is currently in beta as of 28 April 2025). If you want to + run this project you will need to [upgrade to v4](/docs/upgrade-to-v4). + + ## Overview Convert documents to markdown using Microsoft's [MarkItDown](https://github.com/microsoft/markitdown) library. This can be especially useful for preparing documents in a structured format for AI applications. @@ -71,13 +76,11 @@ export default defineConfig({ This task uses the `python.runScript` method to run the `markdown-converter.py` script with the given document URL as an argument. ```ts src/trigger/convertToMarkdown.ts -import { task } from "@trigger.dev/sdk"; +import { task } from "@trigger.dev/sdk/v3"; import { python } from "@trigger.dev/python"; import * as fs from "fs"; import * as path from "path"; import * as os from "os"; -import * as https from "https"; -import * as http from "http"; export const convertToMarkdown = task({ id: "convert-to-markdown", @@ -92,18 +95,9 @@ export const convertToMarkdown = task({ const tempFilePath = path.join(tempDir, `${fileName}${extension}`); // STEP 2: Download file from URL - await new Promise((resolve, reject) => { - const protocol = url.startsWith("https") ? 
https : http; - const file = fs.createWriteStream(tempFilePath); - - protocol.get(url, (response) => { - response.pipe(file); - file.on("finish", () => { - file.close(); - resolve(); - }); - }); - }); + const response = await fetch(url); + const buffer = await response.arrayBuffer(); + await fs.promises.writeFile(tempFilePath, Buffer.from(buffer)); // STEP 3: Run Python script to convert document to markdown const pythonResult = await python.runScript("./src/python/markdown-converter.py", [ diff --git a/docs/guides/python/python-image-processing.mdx b/docs/guides/python/python-image-processing.mdx index 64e73ecdae..0f81d2b54f 100644 --- a/docs/guides/python/python-image-processing.mdx +++ b/docs/guides/python/python-image-processing.mdx @@ -1,6 +1,6 @@ --- title: "Python image processing example" -sidebarTitle: "Python image processing" +sidebarTitle: "Process images" description: "Learn how to use Trigger.dev with Python to process images from URLs and upload them to S3." --- diff --git a/docs/guides/python/python-pdf-form-extractor.mdx b/docs/guides/python/python-pdf-form-extractor.mdx index a62f0e8dc1..3367ea9baf 100644 --- a/docs/guides/python/python-pdf-form-extractor.mdx +++ b/docs/guides/python/python-pdf-form-extractor.mdx @@ -1,6 +1,6 @@ --- title: "Python PDF form extractor example" -sidebarTitle: "Python PDF form extractor" +sidebarTitle: "Extract form data from PDFs" description: "Learn how to use Trigger.dev with Python to extract form data from PDF files." 
--- From 3372859056a8134a3284ca27705fef995837d0d1 Mon Sep 17 00:00:00 2001 From: D-K-P <8297864+D-K-P@users.noreply.github.com> Date: Mon, 28 Apr 2025 19:37:22 +0100 Subject: [PATCH 4/4] Fixed link --- docs/guides/python/python-doc-to-markdown.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx index ff5dad29e1..a36a240cec 100644 --- a/docs/guides/python/python-doc-to-markdown.mdx +++ b/docs/guides/python/python-doc-to-markdown.mdx @@ -8,7 +8,7 @@ import PythonLearnMore from "/snippets/python-learn-more.mdx"; This project uses Trigger.dev v4 (which is currently in beta as of 28 April 2025). If you want to - run this project you will need to [upgrade to v4](/docs/upgrade-to-v4). + run this project you will need to [upgrade to v4](/upgrade-to-v4). ## Overview