Skip to content

Commit 431ce64

Browse files
adding pdf-tex summarizer example
1 parent 41e8317 commit 431ce64

File tree

7 files changed

+6765
-0
lines changed

7 files changed

+6765
-0
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
MODEL=gpt-4o-mini
2+
ENVIRONMENT=dev
3+
OPENAI_API_KEY=
4+
OPENROUTER_MODEL=gryphe/mythomax-l2-13b
5+
SWARMZERO_LOG_LEVEL=INFO
6+
LANGTRACE_API_KEY=
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.venv
2+
swarmzero-data
3+
.env
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# PDF Summarizer to Tex Workflow
2+
3+
A sequential workflow built with the SwarmZero framework that produces concise summaries of large PDFs and writes them to a `.tex` file.
4+
5+
## Description
6+
7+
This workflow uses PyMuPDF to extract text from a PDF and is built on top of the SwarmZero framework, providing enhanced summarization capabilities with AI-powered processing.
8+
9+
## Prerequisites
10+
11+
- Python 3.11 or higher
12+
- Poetry package manager
13+
- OpenAI API Key
14+
- Langtrace API Key
15+
16+
## Installation
17+
18+
1. Clone the repository:
19+
```bash
20+
git clone https://github.com/swarmzero/examples.git
21+
cd examples/workflows/pdf_summarizer_tex_workflow
22+
```
23+
24+
2. Install dependencies using Poetry:
25+
```bash
26+
poetry install --no-root
27+
```
28+
29+
3. Set up environment variables:
30+
Create a `.env` file in the root directory and add your API keys based on the .env.example file
31+
## Usage
32+
33+
Run the workflow on a pdf like so:
34+
``` bash
35+
python main.py path/to/pdf/text.pdf
36+
```
37+
38+
## Learn more
39+
Visit [SwarmZero](https://swarmzero.ai) to learn more about the SwarmZero framework.
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
import asyncio
2+
import os
3+
import sys
4+
import uuid
5+
from typing import List
6+
7+
import fitz # PyMuPDF
8+
9+
from swarmzero.agent import Agent
10+
from swarmzero.sdk_context import SDKContext
11+
from swarmzero.workflow import Workflow, WorkflowStep, StepMode
12+
13+
14+
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at ``pdf_path``.

    The text of each page is obtained with ``page.get_text()`` and the pages
    are joined with newlines, in document order.

    Args:
        pdf_path: Path passed to ``fitz.open``.

    Returns:
        The concatenated page text.
    """
    doc = fitz.open(pdf_path)
    try:
        return "\n".join(page.get_text() for page in doc)
    finally:
        # Close the document even when extracting a page raises, so it is
        # never left open on the error path.
        doc.close()
19+
20+
21+
def chunk_text(text: str, max_chars: int = 2000) -> List[str]:
    """Group the lines of ``text`` into chunks of roughly ``max_chars`` characters.

    The text is split on newlines and lines are accumulated until adding the
    next one would reach ``max_chars``; the accumulated lines are then emitted
    as one chunk (joined by newlines, surrounding whitespace stripped) and a
    new chunk is started. A single line longer than ``max_chars`` is not
    split; it becomes a chunk of its own.

    Chunks that are empty after stripping are dropped, so empty or
    whitespace-only input yields ``[]`` and an oversized first line no longer
    produces a leading ``""`` chunk.

    Args:
        text: The text to split.
        max_chars: Size threshold for a chunk, counting one newline per line.

    Returns:
        The non-empty chunks, in input order.
    """
    chunks: List[str] = []
    pending: List[str] = []
    pending_len = 0  # length of the pending lines, each counted with its newline

    for para in text.split("\n"):
        if pending_len + len(para) >= max_chars:
            chunk = "\n".join(pending).strip()
            if chunk:
                chunks.append(chunk)
            pending, pending_len = [], 0
        pending.append(para)
        pending_len += len(para) + 1

    tail = "\n".join(pending).strip()
    if tail:
        chunks.append(tail)
    return chunks
33+
34+
35+
def summarize_bullets(prompt: str, **kwargs) -> str:
    """Turn the sentences of ``prompt`` into a dash-prefixed bullet list.

    Sentences are split on one or more periods that are followed by
    whitespace or the end of the input, so a period inside a token such as
    ``3.14`` does not start a new bullet. Empty fragments are skipped.

    Args:
        prompt: The text to convert.
        **kwargs: Accepted and ignored.

    Returns:
        ``"[Bullet Point Summary]:\\n"`` followed by one ``- sentence`` line
        per sentence (nothing after the header when there are no sentences).
    """
    import re  # local import: ``re`` is not in the module's import block

    fragments = re.split(r"\.+(?:\s+|$)", prompt)
    sentences = [fragment.strip() for fragment in fragments]
    bullets = "\n".join(f"- {sentence}" for sentence in sentences if sentence)
    return f"[Bullet Point Summary]:\n{bullets}"
39+
40+
def format_latex(prompt: str, **kwargs) -> str:
    """Wrap the bullet lines of ``prompt`` in a standalone LaTeX document.

    A bullet line is any line whose first non-blank character is ``-``. Only
    that single marker is removed, so content that itself begins with a dash
    (for example ``-5 degrees``) is kept intact. LaTeX special characters in
    the content are escaped so the text is typeset literally. Lines that are
    not bullets, and bullets with no content, are ignored.

    Args:
        prompt: Text containing ``- item`` lines.
        **kwargs: Accepted and ignored.

    Returns:
        The source of an ``article`` document with a "Summary Notes" section.
        The ``itemize`` list is omitted when there are no items, because an
        ``itemize`` environment without any ``\\item`` does not compile.
    """
    # One translate() pass, so replacement text is never escaped a second time.
    escapes = str.maketrans({
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    })

    items = []
    for raw_line in prompt.split("\n"):
        line = raw_line.strip()
        if not line.startswith("-"):
            continue
        content = line[1:].strip()  # drop exactly one bullet marker
        if content:
            items.append("\\item " + content.translate(escapes))

    body = ""
    if items:
        body = (
            "\\begin{itemize}[leftmargin=*, label=--]\n"
            + "\n".join(items)
            + "\n\\end{itemize}\n"
        )

    return (
        "\\documentclass{article}\n"
        "\\usepackage[utf8]{inputenc}\n"
        "\\usepackage{enumitem}\n"
        "\\begin{document}\n"
        "\\section*{Summary Notes}\n"
        f"{body}"
        "\\end{document}"
    )
56+
57+
58+
59+
# Path to the SwarmZero config file located next to this script.
CONFIG_PATH = os.path.join(os.path.dirname(__file__), "swarmzero_config.toml")

# One SDK context built from that config; it is passed to both agents and to
# the workflow defined in this file.
sdk_context = SDKContext(CONFIG_PATH)

# Agent that run_agent() chats with to turn a chunk of text into bullet points.
summarizer_agent = Agent(
    name="BulletSummarizerAgent",
    functions=[summarize_bullets],
    instruction="Summarize input in bullet points.",
    agent_id=str(uuid.uuid4()),  # new random id each time the module is loaded
    config_path=CONFIG_PATH,
    sdk_context=sdk_context,
    chat_only_mode=True
)
72+
73+
74+
async def run_agent(prompt, **kwargs):
    """Send ``prompt`` to the bullet summarizer agent and return its reply.

    Extra keyword arguments are accepted and ignored.
    """
    reply = await summarizer_agent.chat(prompt)
    return reply
76+
77+
# Agent that run_latex_agent() chats with to format bullet points as LaTeX.
# NOTE(review): main() in this file builds the LaTeX itself and never calls
# this agent directly; it is only reachable through the workflow's
# "FormatLatex" step.
latex_agent = Agent(
    name="LatexFormatterAgent",
    functions=[format_latex],
    instruction="Format input bullet points into a LaTeX document.",
    agent_id=str(uuid.uuid4()),  # new random id each time the module is loaded
    config_path=CONFIG_PATH,
    sdk_context=sdk_context,
    chat_only_mode=True
)
86+
87+
async def run_latex_agent(prompt, **kwargs):
    """Send ``prompt`` to the LaTeX formatter agent and return its reply.

    Extra keyword arguments are accepted and ignored.
    """
    reply = await latex_agent.chat(prompt)
    return reply
89+
90+
# Two-step sequential workflow: summarize into bullets, then format as LaTeX.
# NOTE(review): main() in this file calls run_agent() directly and does not
# reference this object, so the workflow is declared but not run by the script.
workflow = Workflow(
    name="pdf_latex_summary",
    instruction="Summarize PDF and convert to LaTeX notes.",
    description="A workflow that summarizes and formats PDF text as LaTeX.",
    steps=[
        WorkflowStep(name="SummarizeBullets", runner=run_agent, mode=StepMode.SEQUENTIAL),
        WorkflowStep(name="FormatLatex", runner=run_latex_agent, mode=StepMode.SEQUENTIAL),
    ],
    sdk_context=sdk_context,
)
100+
101+
async def main():
    """Summarize a PDF into LaTeX bullet notes and write them to a ``.tex`` file.

    The PDF path is the first command-line argument and defaults to
    ``sample.pdf``. The text is extracted, split into chunks, and each chunk
    is summarized by ``run_agent``. The bullet lines from all summaries are
    wrapped into one LaTeX document by ``format_latex`` and written to
    ``<pdf base name>_summary.tex`` in the current working directory.

    Exits with an error message when the PDF does not exist or contains no
    extractable text.
    """
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "sample.pdf"
    if not os.path.isfile(pdf_path):
        sys.exit(f"PDF not found: {pdf_path}")

    text = extract_text_from_pdf(pdf_path)
    # Drop empty chunks so the agent is never asked to summarize nothing.
    chunks = [chunk for chunk in chunk_text(text) if chunk]
    if not chunks:
        sys.exit(f"No extractable text found in: {pdf_path}")

    # Step 1: Summarize each chunk into bullets, one request at a time.
    bullet_summaries = []
    for chunk in chunks:
        bullet_summaries.append(await run_agent(chunk))

    # Step 2: Keep only the bullet lines of every summary, in order.
    bullet_lines = []
    for summary in bullet_summaries:
        for line in summary.split("\n"):
            stripped = line.strip()
            if stripped.startswith("-"):
                bullet_lines.append(stripped)

    # Step 3: Wrap the bullets with format_latex rather than keeping a second
    # copy of the document template here.
    final_latex = format_latex("\n".join(bullet_lines))

    # Step 4: Write the final LaTeX doc to file
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = f"{base_name}_summary.tex"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_latex)

    print(f"Summary written to: {output_path}")
139+
140+
141+
142+
143+
# Run the async entry point only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

0 commit comments

Comments
 (0)