
feat(docs): prebuilt (not for merging) #103


Draft · wants to merge 24 commits into main
70 changes: 69 additions & 1 deletion pipeline/core/builder.py
@@ -9,6 +9,7 @@
from tqdm import tqdm

from pipeline.preprocessors import preprocess_markdown
from pipeline.tools.notebook.convert import convert_notebook

logger = logging.getLogger(__name__)

@@ -40,6 +41,7 @@ def __init__(self, src_dir: Path, build_dir: Path) -> None:
self.copy_extensions: set[str] = {
".mdx",
".md",
".ipynb",
".json",
".svg",
".png",
@@ -51,6 +53,35 @@ def __init__(self, src_dir: Path, build_dir: Path) -> None:
".css",
}

def _should_ignore_file(self, file_path: Path) -> bool:
"""Check if a file should be ignored during build.

This method filters out cached files, temporary files, and other
files that should not be included in the build process.

Args:
file_path: Path to the file to check.

Returns:
True if the file should be ignored, False otherwise.
"""
filename = file_path.name

# Ignore files starting with .~ (cached/temporary files)
if filename.startswith(".~"):
return True

# Ignore files starting with ~ (backup files)
if filename.startswith("~"):
return True

# Ignore hidden files starting with . (except specific ones we want)
if filename.startswith(".") and filename not in {".gitkeep"}:
return True

# Ignore common temporary file patterns
return bool(filename.endswith((".tmp", ".temp")))

def build_all(self) -> None:
"""Build all documentation files from source to build directory.

@@ -76,7 +107,8 @@ def build_all(self) -> None:

# Collect all files to process
all_files = [
file_path for file_path in self.src_dir.rglob("*") if file_path.is_file()
file_path for file_path in self.src_dir.rglob("*")
if file_path.is_file() and not self._should_ignore_file(file_path)
]

if not all_files:
@@ -188,6 +220,34 @@ def _process_markdown_file(self, input_path: Path, output_path: Path) -> None:
logger.exception("Failed to process markdown file %s", input_path)
raise

def _process_notebook_file(self, input_path: Path, output_path: Path) -> None:
"""Process a Jupyter notebook file and convert to markdown.

This method converts a Jupyter notebook to markdown, applies preprocessing,
and writes the processed content to the output path as an .mdx file.

Args:
input_path: Path to the source notebook file.
output_path: Path where the processed file should be written.
"""
try:
# Convert notebook to markdown
markdown_content = convert_notebook(input_path)

# Apply markdown preprocessing
processed_content = self._process_markdown_content(markdown_content, input_path)

# Convert .ipynb to .mdx
output_path = output_path.with_suffix(".mdx")

# Write the processed content
with output_path.open("w", encoding="utf-8") as f:
f.write(processed_content)

except Exception:
logger.exception("Failed to process notebook file %s", input_path)
raise

def build_file(self, file_path: Path) -> None:
"""Build a single file by copying it to the build directory.

@@ -227,6 +287,10 @@ def build_file(self, file_path: Path) -> None:
if file_path.suffix.lower() in {".md", ".mdx"}:
self._process_markdown_file(file_path, output_path)
logger.info("Processed markdown: %s", relative_path)
# Handle notebook files with conversion to markdown
elif file_path.suffix.lower() == ".ipynb":
self._process_notebook_file(file_path, output_path)
logger.info("Converted notebook: %s", relative_path)
else:
shutil.copy2(file_path, output_path)
logger.info("Copied: %s", relative_path)
@@ -269,6 +333,10 @@ def _build_file_with_progress(self, file_path: Path, pbar: tqdm) -> bool:
if file_path.suffix.lower() in {".md", ".mdx"}:
self._process_markdown_file(file_path, output_path)
return True
# Handle notebook files with conversion to markdown
if file_path.suffix.lower() == ".ipynb":
self._process_notebook_file(file_path, output_path)
return True
shutil.copy2(file_path, output_path)
return True
return False
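
Note: this diff imports `convert_notebook` from `pipeline.tools.notebook.convert`, but that module's diff isn't shown above. A minimal sketch of what the converter might look like, assuming it wraps nbconvert's `MarkdownExporter` (the actual implementation in this PR may differ):

```python
from pathlib import Path

import nbformat
from nbconvert import MarkdownExporter


def convert_notebook(input_path: Path) -> str:
    """Convert a Jupyter notebook to a markdown string (hypothetical sketch)."""
    # Read the notebook in the current (v4) schema.
    notebook = nbformat.read(str(input_path), as_version=4)
    # Export the notebook body as markdown; extracted resources are ignored here.
    body, _resources = MarkdownExporter().from_notebook_node(notebook)
    return body
```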
3 changes: 2 additions & 1 deletion pipeline/core/watcher.py
@@ -67,7 +67,8 @@ def on_modified(self, event: FileSystemEvent) -> None:
src_path = event.src_path

file_path = Path(src_path)
if file_path.suffix.lower() in self.builder.copy_extensions:
if (file_path.suffix.lower() in self.builder.copy_extensions and
not self.builder._should_ignore_file(file_path)):
logger.info("File changed: %s", file_path)
# Put file change event in queue for async processing
self.loop.call_soon_threadsafe(self.event_queue.put_nowait, file_path)
20 changes: 19 additions & 1 deletion src/docs.json
@@ -52,6 +52,24 @@
},
"navigation": {
"dropdowns": [
{
"dropdown": "LangChain v1",
"icon": "/images/brand/langchain-pill.svg",
"description": "LangChain v1 documentation and guides",
"tabs": [
{
"tab": "Prebuilts",
"pages": [
"langchain_v1/stuff",
"langchain_v1/map_reduce",
"langchain_v1/recursive",
"langchain_v1/rag_agent",
"langchain_v1/data_analysis",
"langchain_v1/supervisor"
]
}
]
},
{
"dropdown": "LangGraph Platform",
"icon": "/images/brand/langgraph-platform-pill.svg",
@@ -316,4 +334,4 @@
}
]
}
}
}
269 changes: 269 additions & 0 deletions src/langchain_v1/data_analysis.ipynb

Large diffs are not rendered by default.

170 changes: 170 additions & 0 deletions src/langchain_v1/extraction.md
@@ -0,0 +1,170 @@
# Document extraction

This guide shows you how to extract information from documents using LangChain's **prebuilt** extraction functionality. The extraction chain can produce either text summaries or structured data from one or more documents.

## Prerequisites

Before you start this guide, ensure you have the following:

- An [Anthropic](https://console.anthropic.com/settings/keys) API key

## 1. Install dependencies

If you haven't already, install LangGraph and LangChain:

```bash
pip install -U langgraph "langchain[anthropic]"
```

<Tip>
LangChain is required so the extractor can call a chat [model](https://python.langchain.com/docs/integrations/chat/).
</Tip>
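
You'll also need to make your API key available to the model client. One common pattern (an example, not the only option) is to set it as an environment variable:

```python
import getpass
import os

# Prompt for the key only if it isn't already set in the environment.
if not os.environ.get("ANTHROPIC_API_KEY"):
    os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Anthropic API key: ")
```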

## 2. Set up documents

First, create some documents to extract information from:

```python
from langchain_core.documents import Document

documents = [
Document(
id="1",
page_content="""Bobby Luka was 10 years old.
Synthetic fuels—produced from captured carbon and green hydrogen—are gaining traction in aviation. The EU's "ReFuelEU" mandate requires increasing blends of sustainable aviation fuel (SAF) starting in 2025. Airbus and Rolls-Royce have completed long-haul test flights powered entirely by synthetic kerosene.""",
metadata={"source": "synthetic_fuel_aviation"},
),
Document(
id="2",
page_content="""
AI is accelerating early-stage drug discovery, especially in target identification and molecule generation. Platforms like BenevolentAI and Insilico Medicine have generated preclinical candidates using generative models trained on biological and chemical data.""",
metadata={"source": "ai_drug_discovery"},
),
Document(
id="3",
page_content="""Jack Johnson was 23 years old and blonde.
Bobby Luka's hair is brown.""",
metadata={"source": "people_info"},
),
]
```

## 3. Configure a model

Configure an LLM for extraction using [init_chat_model](https://python.langchain.com/api_reference/langchain/chat_models/langchain.chat_models.base.init_chat_model.html):

```python
from langchain.chat_models import init_chat_model

model = init_chat_model(
"anthropic:claude-3-5-sonnet-latest",
temperature=0
)
```

## 4. Extract a basic summary

Create an extractor to produce text summaries from documents:

```python
from langchain.chains.summarization import create_summarizer

# Create a basic summarizer
summarizer = create_summarizer(
model,
initial_prompt="Produce a concise summary of the following document in 2-3 sentences."
).compile(name="TextSummarizer")

# Extract summary
result = summarizer.invoke({"documents": documents})
print(result["result"])
```

## 5. Extract structured summaries

To produce structured responses with a specific format, use the `response_format` parameter with a Pydantic model:

```python
from pydantic import BaseModel
from langchain.chains.summarization import create_summarizer

class Summary(BaseModel):
"""Structured summary with title and key points."""

title: str
key_points: list[str]

# Create structured summarizer
structured_summarizer = create_summarizer(
model,
initial_prompt="Extract the main topics and create a structured summary with a title and up to 3 key points.",
response_format=Summary
).compile(name="StructuredSummarizer")

# Extract structured summary
result = structured_summarizer.invoke({"documents": documents})

# Access structured fields
print(f"Title: {result['result'].title}")
print("Key points:")
for point in result['result'].key_points:
print(f" • {point}")
```

## 6. Extract entities with source tracking

Extract specific entities while tracking which documents they came from:

```python
from typing import Optional
from pydantic import BaseModel, Field

class Person(BaseModel):
"""Person entity with source tracking."""

name: str
age: Optional[str] = None
hair_color: Optional[str] = None
source_doc_ids: list[str] = Field(
default_factory=list,
description="The IDs of the documents where the information was found.",
)

class PeopleExtraction(BaseModel):
"""Collection of extracted people."""

people: list[Person]

# Create entity extractor
entity_extractor = create_summarizer(
model,
initial_prompt="Extract information about people mentioned in the documents. Include the document IDs where each piece of information was found.",
response_format=PeopleExtraction
).compile(name="EntityExtractor")

# Extract entities
result = entity_extractor.invoke({"documents": documents})

# Display extracted people with sources
for person in result['result'].people:
print(f"Name: {person.name}")
if person.age:
print(f" Age: {person.age}")
if person.hair_color:
print(f" Hair: {person.hair_color}")
print(f" Sources: {', '.join(person.source_doc_ids)}")
print()
```

## Custom prompts

Customize extraction behavior with specific prompts:

```python
custom_extractor = create_summarizer(
model,
initial_prompt="Focus on extracting technical information and key innovations mentioned in the documents."
).compile()
```
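
As with the earlier extractors, the compiled chain can then be invoked with a `documents` key (assuming the same invocation API as the examples above):

```python
result = custom_extractor.invoke({"documents": documents})
print(result["result"])
```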

For more advanced extraction patterns and customization, see the [extraction how-to guides](../how-tos/extraction/).