def main(
    input_dir,
    output_dir="output",
    target_formats=(".docx", ".xlsx", ".pdf"),
):
    """Convert every supported document in *input_dir* to Markdown.

    Only the direct children of *input_dir* are considered; subdirectories
    are not descended into. Each converted document is written to
    *output_dir* as ``<original file name>.md``.

    Args:
        input_dir: Directory containing the documents to convert.
        output_dir: Directory that receives the Markdown files. It is
            created, together with any missing parents, if it does not exist.
        target_formats: File extensions (with the leading dot) to convert.
            Matching is case-insensitive. A single string is treated as one
            extension.

    Returns:
        None. Progress and per-file conversion errors are printed to stdout.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # A bare string would otherwise be iterated character by character.
    if isinstance(target_formats, str):
        target_formats = (target_formats,)
    wanted = {fmt.lower() for fmt in target_formats}

    # Sort for a reproducible processing order, skip directories whose name
    # merely ends in a target extension, and compare extensions
    # case-insensitively so "REPORT.PDF" is picked up as well.
    sources = sorted(
        candidate
        for candidate in input_path.glob("*")
        if candidate.is_file() and candidate.suffix.lower() in wanted
    )
    if not sources:
        return

    # Build the converter only once there is something to convert.
    md = MarkItDown()

    for file_path in sources:
        try:
            result = md.convert(file_path)
        except Exception as e:
            # Deliberate best-effort boundary: one unreadable document must
            # not abort the rest of the batch.
            print(f"✗ Error converting {file_path.name}: {e}")
            continue

        # Keeping the original extension in the output name means that
        # "report.pdf" and "report.docx" map to different Markdown files.
        output_file = output_path / f"{file_path.name}.md"
        output_file.write_text(result.markdown, encoding="utf-8")
        print(f"✓ Converted {file_path.name} → {output_file.name}")
Date +Alice,Johnson,Marketing,Marketing Coordinator,1/15/2022 +Bob,Williams,Human Resources,HR Generalist,6/1/2021 +Carol,Davis,Engineering,Software Engineer,3/20/2023 +David,Brown,Sales,Sales Representative,9/10/2022 +Eve,Miller,Finance,Financial Analyst,11/5/2021 +Frank,Garcia,Customer Service,Customer Support Specialist,7/1/2023 +Grace,Rodriguez,Research & Development,Research Scientist,4/25/2022 +Henry,Martinez,Operations,Operations Manager,2/14/2021 diff --git a/python-markitdown/data/employees.xlsx b/python-markitdown/data/employees.xlsx new file mode 100644 index 0000000000..d319e5414c Binary files /dev/null and b/python-markitdown/data/employees.xlsx differ diff --git a/python-markitdown/data/markdown_syntax.docx b/python-markitdown/data/markdown_syntax.docx new file mode 100644 index 0000000000..547d9d3981 Binary files /dev/null and b/python-markitdown/data/markdown_syntax.docx differ diff --git a/python-markitdown/data/markdown_syntax.pdf b/python-markitdown/data/markdown_syntax.pdf new file mode 100644 index 0000000000..16ee9c2592 Binary files /dev/null and b/python-markitdown/data/markdown_syntax.pdf differ diff --git a/python-markitdown/data/pep8.docx b/python-markitdown/data/pep8.docx new file mode 100644 index 0000000000..c65da105a3 Binary files /dev/null and b/python-markitdown/data/pep8.docx differ diff --git a/python-markitdown/data/real-python.png b/python-markitdown/data/real-python.png new file mode 100644 index 0000000000..67571011ee Binary files /dev/null and b/python-markitdown/data/real-python.png differ diff --git a/python-markitdown/data/zen-of-python.png b/python-markitdown/data/zen-of-python.png new file mode 100644 index 0000000000..c72a9d368d Binary files /dev/null and b/python-markitdown/data/zen-of-python.png differ diff --git a/python-markitdown/data/zen-of-python.txt b/python-markitdown/data/zen-of-python.txt new file mode 100644 index 0000000000..a5b0d97316 --- /dev/null +++ b/python-markitdown/data/zen-of-python.txt @@ -0,0 
+1,22 @@ +