From 7a3e4cefa692252ca97aca676684a58415acb693 Mon Sep 17 00:00:00 2001
From: D-K-P <8297864+D-K-P@users.noreply.github.com>
Date: Thu, 24 Apr 2025 12:27:51 +0100
Subject: [PATCH 1/4] Added Python MarkItDown docs

---
 docs/docs.json                                |   1 +
 docs/guides/introduction.mdx                  |   1 +
 docs/guides/python/python-doc-to-markdown.mdx | 274 ++++++++++++++++++
 3 files changed, 276 insertions(+)
 create mode 100644 docs/guides/python/python-doc-to-markdown.mdx
diff --git a/docs/docs.json b/docs/docs.json
index 061637c984..9b394980a3 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -312,6 +312,7 @@
             "group": "Python guides",
             "pages": [
               "guides/python/python-image-processing",
+              "guides/python/python-doc-to-markdown",
               "guides/python/python-crawl4ai",
               "guides/python/python-pdf-form-extractor"
             ]
diff --git a/docs/guides/introduction.mdx b/docs/guides/introduction.mdx
index add0c353c1..9a7e7761eb 100644
--- a/docs/guides/introduction.mdx
+++ b/docs/guides/introduction.mdx
@@ -29,6 +29,7 @@ Get set up fast using our detailed walk-through guides.
 | [Cursor rules](/guides/cursor-rules)                                                       | Use Cursor rules to help write Trigger.dev tasks                     |
 | [Prisma](/guides/frameworks/prisma)                                                        | How to setup Prisma with Trigger.dev                                 |
 | [Python image processing](/guides/python/python-image-processing)                          | Use Python and Pillow to process images                              |
+| [Python document to markdown](/guides/python/python-doc-to-markdown)                       | Use Python and MarkItDown to convert documents to markdown           |
 | [Python PDF form extractor](/guides/python/python-pdf-form-extractor)                      | Use Python, PyMuPDF and Trigger.dev to extract data from a PDF form  |
 | [Python web crawler](/guides/python/python-crawl4ai)                                       | Use Python, Crawl4AI and Playwright to create a headless web crawler |
 | [Sequin database triggers](/guides/frameworks/sequin)                                      | Trigger tasks from database changes using Sequin                     |
diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx
new file mode 100644
index 0000000000..aa718dca8d
--- /dev/null
+++ b/docs/guides/python/python-doc-to-markdown.mdx
@@ -0,0 +1,274 @@
+---
+title: "Convert documents to markdown using Python and MarkItDown"
+sidebarTitle: "Python document to markdown"
+description: "Learn how to use Trigger.dev with Python to convert documents to markdown using MarkItDown."
+---
+
+import PythonLearnMore from "/snippets/python-learn-more.mdx";
+
+## Overview
+
+Convert documents to markdown using Microsoft's [MarkItDown](https://github.com/microsoft/markitdown) library. This can be especially useful for preparing documents in a structured format for AI applications.
+
+## Prerequisites
+
+- A project with [Trigger.dev initialized](/quick-start)
+- [Python](https://www.python.org/) installed on your local machine. _This example requires Python 3.10 or higher._
+
+## Features
+
+- A Trigger.dev task which downloads a document from a URL and runs the Python script which converts it to markdown
+- A Python script to convert documents to markdown using Microsoft's [MarkItDown](https://github.com/microsoft/markitdown) library
+- Uses our [Python build extension](/config/extensions/pythonExtension) to install dependencies and run Python scripts
+
+## GitHub repo
+
+<Card
+  title="View the project on GitHub"
+  icon="GitHub"
+  href="https://github.com/triggerdotdev/examples/tree/main/python-doc-to-markdown-converter"
+>
+  Click here to view the full code for this project in our examples repository on GitHub. You can
+  fork it and use it as a starting point for your own project.
+</Card>
+
+## The code
+
+### Build configuration
+
+After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file:
+
+```ts trigger.config.ts
+import { pythonExtension } from "@trigger.dev/python/extension";
+import { defineConfig } from "@trigger.dev/sdk/v3";
+
+export default defineConfig({
+  runtime: "node",
+  project: "<your-project-ref>",
+  // Your other config settings...
+  build: {
+    extensions: [
+      pythonExtension({
+        // The path to your requirements.txt file
+        requirementsFile: "./requirements.txt",
+        // The path to your Python binary
+        devPythonBinaryPath: `venv/bin/python`,
+        // The paths to your Python scripts to run
+        scripts: ["src/python/**/*.py"],
+      }),
+    ],
+  },
+});
+```
+
+<Info>
+  Learn more about executing scripts in your Trigger.dev project using our Python build extension
+  [here](/config/extensions/pythonExtension).
+</Info>
+
+### Task code
+
+This task uses the `python.runScript` method to run the `markdown-converter.py` script with the given document URL as an argument.
+
+```ts src/trigger/convertToMarkdown.ts
+import { task } from "@trigger.dev/sdk/v3";
+import { python } from "@trigger.dev/python";
+import { z } from "zod";
+import * as fs from "fs";
+import * as path from "path";
+import * as os from "os";
+import * as https from "https";
+import * as http from "http";
+
+export const convertToMarkdown = task({
+  id: "convert-to-markdown",
+  run: async (payload: { url: string }) => {
+    try {
+      const { url } = payload;
+
+      // STEP 1: Create temporary file with unique name
+      const tempDir = os.tmpdir();
+      const fileName = `doc-${Date.now()}-${Math.random().toString(36).substring(2, 7)}`;
+      const urlPath = new URL(url).pathname;
+      // Detect file extension from URL or default to .docx
+      const extension = path.extname(urlPath) || ".docx";
+      const tempFilePath = path.join(tempDir, `${fileName}${extension}`);
+
+      // STEP 2: Download file from URL
+      await new Promise<void>((resolve, reject) => {
+        const protocol = url.startsWith("https") ? https : http;
+        const file = fs.createWriteStream(tempFilePath);
+
+        protocol
+          .get(url, (response) => {
+            if (response.statusCode !== 200) {
+              reject(new Error(`Download failed with status ${response.statusCode}`));
+              return;
+            }
+
+            response.pipe(file);
+            file.on("finish", () => {
+              file.close();
+              resolve();
+            });
+          })
+          .on("error", (err) => {
+            // Clean up on error
+            fs.unlink(tempFilePath, () => {});
+            reject(err);
+          });
+      });
+
+      // STEP 3: Run Python script to convert document to markdown
+      const pythonResult = await python.runScript("./src/python/markdown-converter.py", [
+        JSON.stringify({ file_path: tempFilePath }),
+      ]);
+
+      // STEP 4: Clean up temporary file
+      fs.unlink(tempFilePath, () => {});
+
+      // STEP 5: Process result - handle possible warnings
+      // Only treat stderr as error if we don't have stdout data
+      // This handles cases where non-critical warnings appear in stderr
+      if (
+        pythonResult.stderr &&
+        !pythonResult.stderr.includes("Couldn't find ffmpeg") &&
+        !pythonResult.stdout
+      ) {
+        throw new Error(`Python error: ${pythonResult.stderr}`);
+      }
+
+      // If we got valid stdout data, parse and use it regardless of stderr warnings
+      // This ensures harmless warnings don't break the conversion
+      if (pythonResult.stdout) {
+        const result = JSON.parse(pythonResult.stdout);
+
+        return {
+          url,
+          markdown: result.status === "success" ? result.markdown : null,
+          error: result.status === "error" ? result.error : null,
+          success: result.status === "success",
+        };
+      }
+
+      return {
+        url,
+        markdown: null,
+        error: "No output from Python script",
+        success: false,
+      };
+    } catch (error) {
+      if (error instanceof z.ZodError) {
+        return {
+          url: payload.url,
+          markdown: null,
+          error: "Invalid URL format: " + error.errors[0].message,
+          success: false,
+        };
+      }
+
+      return {
+        url: payload.url,
+        markdown: null,
+        error: error instanceof Error ? error.message : String(error),
+        success: false,
+      };
+    }
+  },
+});
+```
+
+### Add a requirements.txt file
+
+Add the following to your `requirements.txt` file. The Python build extension reads this file (via `requirementsFile` in `trigger.config.ts`) to install the script's dependencies.
+
+```txt requirements.txt
+markitdown[all]
+```
+
+### The Python script
+
+The Python script uses MarkItDown to convert documents to Markdown format.
+
+```python src/python/markdown-converter.py
+import json
+import sys
+import os
+from markitdown import MarkItDown
+
+def convert_to_markdown(file_path):
+    """Convert a file to markdown format using MarkItDown"""
+    # Check if file exists
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"File not found: {file_path}")
+
+    # Initialize MarkItDown
+    md = MarkItDown()
+
+    # Convert the file
+    try:
+        result = md.convert(file_path)
+        return result.text_content
+    except Exception as e:
+        raise Exception(f"Error converting file: {str(e)}")
+
+def process_trigger_task(file_path):
+    """Process a file and convert to markdown"""
+    try:
+        markdown_result = convert_to_markdown(file_path)
+        return {
+            "status": "success",
+            "markdown": markdown_result
+        }
+    except Exception as e:
+        return {
+            "status": "error",
+            "error": str(e)
+        }
+
+if __name__ == "__main__":
+    # Get the file path from command line arguments
+    if len(sys.argv) < 2:
+        print(json.dumps({"status": "error", "error": "No file path provided"}))
+        sys.exit(1)
+
+    try:
+        config = json.loads(sys.argv[1])
+        file_path = config.get("file_path")
+
+        if not file_path:
+            print(json.dumps({"status": "error", "error": "No file path specified in config"}))
+            sys.exit(1)
+
+        result = process_trigger_task(file_path)
+        print(json.dumps(result))
+    except Exception as e:
+        print(json.dumps({"status": "error", "error": str(e)}))
+        sys.exit(1)
+```
+
+## Testing your task
+
+1. Create a virtual environment: `python -m venv venv`
+2. Activate the virtual environment. On Mac/Linux run `source venv/bin/activate`; on Windows run `venv\Scripts\activate`
+3. Install the Python dependencies: `pip install -r requirements.txt`. _Make sure you have Python 3.10 or higher installed._
+4. Copy the project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file.
+5. Run the Trigger.dev CLI `dev` command (it may ask you to authorize the CLI if you haven't already).
+6. Test the task in the dashboard by providing a valid document URL in the payload, for example `{ "url": "https://example.com/report.docx" }`.
+7. Deploy the task to production using the Trigger.dev CLI `deploy` command.
+
+## MarkItDown conversion capabilities
+
+- Convert various file formats to Markdown:
+  - Office formats (Word, PowerPoint, Excel)
+  - PDFs
+  - Images (with optional LLM-generated descriptions)
+  - HTML, CSV, JSON, XML
+  - Audio files (with optional transcription)
+  - ZIP archives
+  - And more
+- Preserve document structure (headings, lists, tables, etc.)
+- Handle multiple input methods (file paths, URLs, base64 data)
+- Optional Azure Document Intelligence integration for better PDF and image conversion
+
+<PythonLearnMore />

From 93cac410ada1dc4ab6a6ebe9258c989aae0cf82b Mon Sep 17 00:00:00 2001
From: D-K-P <8297864+D-K-P@users.noreply.github.com>
Date: Thu, 24 Apr 2025 12:41:48 +0100
Subject: [PATCH 2/4] Simplified the task

---
 docs/guides/python/python-doc-to-markdown.mdx | 126 ++++++------------
 1 file changed, 41 insertions(+), 85 deletions(-)

diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx
index aa718dca8d..90fa7836cc 100644
--- a/docs/guides/python/python-doc-to-markdown.mdx
+++ b/docs/guides/python/python-doc-to-markdown.mdx
@@ -71,9 +71,8 @@ export default defineConfig({
 This task uses the `python.runScript` method to run the `markdown-converter.py` script with the given document URL as an argument.
 
 ```ts src/trigger/convertToMarkdown.ts
-import { task } from "@trigger.dev/sdk/v3";
+import { task } from "@trigger.dev/sdk";
 import { python } from "@trigger.dev/python";
-import { z } from "zod";
 import * as fs from "fs";
 import * as path from "path";
 import * as os from "os";
@@ -83,97 +82,54 @@ import * as http from "http";
 export const convertToMarkdown = task({
   id: "convert-to-markdown",
   run: async (payload: { url: string }) => {
-    try {
-      const { url } = payload;
-
-      // STEP 1: Create temporary file with unique name
-      const tempDir = os.tmpdir();
-      const fileName = `doc-${Date.now()}-${Math.random().toString(36).substring(2, 7)}`;
-      const urlPath = new URL(url).pathname;
-      // Detect file extension from URL or default to .docx
-      const extension = path.extname(urlPath) || ".docx";
-      const tempFilePath = path.join(tempDir, `${fileName}${extension}`);
-
-      // STEP 2: Download file from URL
-      await new Promise<void>((resolve, reject) => {
-        const protocol = url.startsWith("https") ? https : http;
-        const file = fs.createWriteStream(tempFilePath);
-
-        protocol
-          .get(url, (response) => {
-            if (response.statusCode !== 200) {
-              reject(new Error(`Download failed with status ${response.statusCode}`));
-              return;
-            }
-
-            response.pipe(file);
-            file.on("finish", () => {
-              file.close();
-              resolve();
-            });
-          })
-          .on("error", (err) => {
-            // Clean up on error
-            fs.unlink(tempFilePath, () => {});
-            reject(err);
-          });
+    const { url } = payload;
+
+    // STEP 1: Create temporary file with unique name
+    const tempDir = os.tmpdir();
+    const fileName = `doc-${Date.now()}-${Math.random().toString(36).substring(2, 7)}`;
+    const urlPath = new URL(url).pathname;
+    const extension = path.extname(urlPath) || ".docx";
+    const tempFilePath = path.join(tempDir, `${fileName}${extension}`);
+
+    // STEP 2: Download file from URL
+    await new Promise<void>((resolve, reject) => {
+      const protocol = url.startsWith("https") ? https : http;
+      const file = fs.createWriteStream(tempFilePath);
+
+      protocol.get(url, (response) => {
+        response.pipe(file);
+        file.on("finish", () => {
+          file.close();
+          resolve();
+        });
       });
+    });
 
-      // STEP 3: Run Python script to convert document to markdown
-      const pythonResult = await python.runScript("./src/python/markdown-converter.py", [
-        JSON.stringify({ file_path: tempFilePath }),
-      ]);
-
-      // STEP 4: Clean up temporary file
-      fs.unlink(tempFilePath, () => {});
-
-      // STEP 5: Process result - handle possible warnings
-      // Only treat stderr as error if we don't have stdout data
-      // This handles cases where non-critical warnings appear in stderr
-      if (
-        pythonResult.stderr &&
-        !pythonResult.stderr.includes("Couldn't find ffmpeg") &&
-        !pythonResult.stdout
-      ) {
-        throw new Error(`Python error: ${pythonResult.stderr}`);
-      }
-
-      // If we got valid stdout data, parse and use it regardless of stderr warnings
-      // This ensures harmless warnings don't break the conversion
-      if (pythonResult.stdout) {
-        const result = JSON.parse(pythonResult.stdout);
+    // STEP 3: Run Python script to convert document to markdown
+    const pythonResult = await python.runScript("./src/python/markdown-converter.py", [
+      JSON.stringify({ file_path: tempFilePath }),
+    ]);
 
-        return {
-          url,
-          markdown: result.status === "success" ? result.markdown : null,
-          error: result.status === "error" ? result.error : null,
-          success: result.status === "success",
-        };
-      }
+    // STEP 4: Clean up temporary file
+    fs.unlink(tempFilePath, () => {});
 
+    // STEP 5: Process result
+    if (pythonResult.stdout) {
+      const result = JSON.parse(pythonResult.stdout);
       return {
         url,
-        markdown: null,
-        error: "No output from Python script",
-        success: false,
-      };
-    } catch (error) {
-      if (error instanceof z.ZodError) {
-        return {
-          url: payload.url,
-          markdown: null,
-          error: "Invalid URL format: " + error.errors[0].message,
-          success: false,
-        };
-      }
-
-      return {
-        url: payload.url,
-        markdown: null,
-        error: error instanceof Error ? error.message : String(error),
-        success: false,
+        markdown: result.status === "success" ? result.markdown : null,
+        error: result.status === "error" ? result.error : null,
+        success: result.status === "success",
       };
     }
+
+    return {
+      url,
+      markdown: null,
+      error: "No output from Python script",
+      success: false,
+    };
   },
 });
 ```

From 5eda05522d27e96d80d97ebca17c791b6c689305 Mon Sep 17 00:00:00 2001
From: D-K-P <8297864+D-K-P@users.noreply.github.com>
Date: Mon, 28 Apr 2025 18:00:49 +0100
Subject: [PATCH 3/4] Update example to use fetch and improved sidebar titles

---
 docs/guides/python/python-crawl4ai.mdx        |  2 +-
 docs/guides/python/python-doc-to-markdown.mdx | 26 +++++++------------
 .../guides/python/python-image-processing.mdx |  2 +-
 .../python/python-pdf-form-extractor.mdx      |  2 +-
 4 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/docs/guides/python/python-crawl4ai.mdx b/docs/guides/python/python-crawl4ai.mdx
index c821bf6c98..526d0eaf58 100644
--- a/docs/guides/python/python-crawl4ai.mdx
+++ b/docs/guides/python/python-crawl4ai.mdx
@@ -1,6 +1,6 @@
 ---
 title: "Python headless browser web crawler example"
-sidebarTitle: "Python headless web crawler"
+sidebarTitle: "Headless web crawler"
 description: "Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev."
 ---
 
diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx
index 90fa7836cc..ff5dad29e1 100644
--- a/docs/guides/python/python-doc-to-markdown.mdx
+++ b/docs/guides/python/python-doc-to-markdown.mdx
@@ -1,11 +1,16 @@
 ---
 title: "Convert documents to markdown using Python and MarkItDown"
-sidebarTitle: "Python document to markdown"
+sidebarTitle: "Convert docs to markdown"
 description: "Learn how to use Trigger.dev with Python to convert documents to markdown using MarkItDown."
 ---
 
 import PythonLearnMore from "/snippets/python-learn-more.mdx";
 
+<Note>
+  This project uses Trigger.dev v4 (which is currently in beta as of 28 April 2025). If you want to
+  run this project you will need to [upgrade to v4](/docs/upgrade-to-v4).
+</Note>
+
 ## Overview
 
 Convert documents to markdown using Microsoft's [MarkItDown](https://github.com/microsoft/markitdown) library. This can be especially useful for preparing documents in a structured format for AI applications.
@@ -71,13 +76,11 @@ export default defineConfig({
 This task uses the `python.runScript` method to run the `markdown-converter.py` script with the given document URL as an argument.
 
 ```ts src/trigger/convertToMarkdown.ts
-import { task } from "@trigger.dev/sdk";
+import { task } from "@trigger.dev/sdk/v3";
 import { python } from "@trigger.dev/python";
 import * as fs from "fs";
 import * as path from "path";
 import * as os from "os";
-import * as https from "https";
-import * as http from "http";
 
 export const convertToMarkdown = task({
   id: "convert-to-markdown",
@@ -92,18 +95,9 @@ export const convertToMarkdown = task({
     const tempFilePath = path.join(tempDir, `${fileName}${extension}`);
 
     // STEP 2: Download file from URL
-    await new Promise<void>((resolve, reject) => {
-      const protocol = url.startsWith("https") ? https : http;
-      const file = fs.createWriteStream(tempFilePath);
-
-      protocol.get(url, (response) => {
-        response.pipe(file);
-        file.on("finish", () => {
-          file.close();
-          resolve();
-        });
-      });
-    });
+    const response = await fetch(url);
+    const buffer = await response.arrayBuffer();
+    await fs.promises.writeFile(tempFilePath, Buffer.from(buffer));
 
     // STEP 3: Run Python script to convert document to markdown
     const pythonResult = await python.runScript("./src/python/markdown-converter.py", [
diff --git a/docs/guides/python/python-image-processing.mdx b/docs/guides/python/python-image-processing.mdx
index 64e73ecdae..0f81d2b54f 100644
--- a/docs/guides/python/python-image-processing.mdx
+++ b/docs/guides/python/python-image-processing.mdx
@@ -1,6 +1,6 @@
 ---
 title: "Python image processing example"
-sidebarTitle: "Python image processing"
+sidebarTitle: "Process images"
 description: "Learn how to use Trigger.dev with Python to process images from URLs and upload them to S3."
 ---
 
diff --git a/docs/guides/python/python-pdf-form-extractor.mdx b/docs/guides/python/python-pdf-form-extractor.mdx
index a62f0e8dc1..3367ea9baf 100644
--- a/docs/guides/python/python-pdf-form-extractor.mdx
+++ b/docs/guides/python/python-pdf-form-extractor.mdx
@@ -1,6 +1,6 @@
 ---
 title: "Python PDF form extractor example"
-sidebarTitle: "Python PDF form extractor"
+sidebarTitle: "Extract form data from PDFs"
 description: "Learn how to use Trigger.dev with Python to extract form data from PDF files."
 ---
 

From 3372859056a8134a3284ca27705fef995837d0d1 Mon Sep 17 00:00:00 2001
From: D-K-P <8297864+D-K-P@users.noreply.github.com>
Date: Mon, 28 Apr 2025 19:37:22 +0100
Subject: [PATCH 4/4] Fixed link

---
 docs/guides/python/python-doc-to-markdown.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/guides/python/python-doc-to-markdown.mdx b/docs/guides/python/python-doc-to-markdown.mdx
index ff5dad29e1..a36a240cec 100644
--- a/docs/guides/python/python-doc-to-markdown.mdx
+++ b/docs/guides/python/python-doc-to-markdown.mdx
@@ -8,7 +8,7 @@ import PythonLearnMore from "/snippets/python-learn-more.mdx";
 
 <Note>
   This project uses Trigger.dev v4 (which is currently in beta as of 28 April 2025). If you want to
-  run this project you will need to [upgrade to v4](/docs/upgrade-to-v4).
+  run this project you will need to [upgrade to v4](/upgrade-to-v4).
 </Note>
 
 ## Overview