Skip to content

Commit f2a92aa

Browse files
feat: markitdown implementation (#486)
* feat: markitdown implementation * fix: code review and docker file: * fix: add markitdown PATH in container * fix: feedback changes * en: readme changed
1 parent 8af8e59 commit f2a92aa

File tree

4 files changed

+52
-0
lines changed

4 files changed

+52
-0
lines changed

Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,16 @@ RUN apt-get update && apt-get install -y \
7474
texlive-latex-extra \
7575
texlive-latex-recommended \
7676
texlive-xetex \
77+
python3 \
78+
python3-pip \
79+
pipx \
7780
--no-install-recommends \
81+
&& pipx install "markitdown[all]" \
7882
&& rm -rf /var/lib/apt/lists/*
7983

84+
# Add pipx bin directory to PATH
85+
ENV PATH="/root/.local/bin:${PATH}"
86+
8087
# Install VTracer binary
8188
RUN ARCH=$(uname -m) && \
8289
if [ "$ARCH" = "aarch64" ]; then \

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ A self-hosted online file converter. Supports over a thousand different formats.
4545
| [FFmpeg](https://ffmpeg.org/) | Video | ~472 | ~199 |
4646
| [Potrace](https://potrace.sourceforge.net/) | Raster to vector | 4 | 11 |
4747
| [VTracer](https://github.com/visioncortex/vtracer) | Raster to vector | 8 | 1 |
48+
| [MarkItDown](https://github.com/microsoft/markitdown) | Documents | 6 | 1 |
4849

4950
<!-- many ffmpeg fileformats are duplicates -->
5051

src/converters/main.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import { convert as convertresvg, properties as propertiesresvg } from "./resvg"
2323
import { convert as convertImage, properties as propertiesImage } from "./vips";
2424
import { convert as convertVtracer, properties as propertiesVtracer } from "./vtracer";
2525
import { convert as convertxelatex, properties as propertiesxelatex } from "./xelatex";
26+
import { convert as convertMarkitdown, properties as propertiesMarkitdown } from "./markitdown";
2627

2728
// This should probably be restructured so that, instead of importing each converter's functions here, the converters hook into this module, which would make them more modular
2829

@@ -127,6 +128,10 @@ const properties: Record<
127128
properties: propertiesVtracer,
128129
converter: convertVtracer,
129130
},
131+
markitDown: {
132+
properties: propertiesMarkitdown,
133+
converter: convertMarkitdown,
134+
},
130135
};
131136

132137
function chunks<T>(arr: T[], size: number): T[][] {

src/converters/markitdown.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import { execFile as execFileOriginal } from "node:child_process";
2+
import { ExecFileFn } from "./types";
3+
4+
/**
 * Conversion capabilities advertised by the markitdown converter.
 *
 * `from.document` lists the input format names this converter accepts and
 * `to.document` lists the output formats it produces (Markdown only).
 *
 * NOTE(review): "powerpoint" and "excel" do not look like file extensions,
 * unlike the other entries — confirm how callers match these names.
 */
export const properties = {
  from: {
    document: [
      "pdf",
      "powerpoint",
      "excel",
      "docx",
      "pptx",
      "html",
    ],
  },
  to: {
    document: [
      "md",
    ],
  },
};
12+
13+
/**
 * Converts a document to Markdown by invoking the `markitdown` executable.
 *
 * The tool is run as `markitdown <filePath> -o <targetPath>`.
 *
 * @param filePath - Path of the input file handed to markitdown.
 * @param fileType - Not used by this converter.
 * @param convertTo - Not used by this converter.
 * @param targetPath - Path passed to markitdown's `-o` flag.
 * @param options - Not used by this converter.
 * @param execFile - Function used to spawn the process; defaults to Node's
 *   `execFile` and can be replaced (e.g. with a stub).
 * @returns A promise that resolves to `"Done"` when the callback reports no
 *   error, and rejects with a `markitdown error: …` string otherwise.
 */
export async function convert(
  filePath: string,
  fileType: string,
  convertTo: string,
  targetPath: string,
  options?: unknown,
  execFile: ExecFileFn = execFileOriginal,
): Promise<string> {
  const cliArgs = [filePath, "-o", targetPath];

  return new Promise<string>((onSuccess, onFailure) => {
    execFile("markitdown", cliArgs, (failure, out, diagnostics) => {
      if (!failure) {
        // Forward whatever the tool printed before reporting success.
        if (out) console.log(`stdout: ${out}`);
        if (diagnostics) console.error(`stderr: ${diagnostics}`);

        onSuccess("Done");
      } else {
        // The rejection value is a plain string, not an Error instance.
        onFailure(`markitdown error: ${failure}`);
      }
    });
  });
}

0 commit comments

Comments
 (0)