
feat(docs): prebuilt (not for merging) #103


Draft · wants to merge 24 commits into main
70 changes: 69 additions & 1 deletion pipeline/core/builder.py
@@ -9,6 +9,7 @@
from tqdm import tqdm

from pipeline.preprocessors import preprocess_markdown
from pipeline.tools.notebook.convert import convert_notebook

logger = logging.getLogger(__name__)

@@ -40,6 +41,7 @@ def __init__(self, src_dir: Path, build_dir: Path) -> None:
self.copy_extensions: set[str] = {
".mdx",
".md",
".ipynb",
".json",
".svg",
".png",
@@ -51,6 +53,35 @@ def __init__(self, src_dir: Path, build_dir: Path) -> None:
".css",
}

def _should_ignore_file(self, file_path: Path) -> bool:
"""Check if a file should be ignored during build.

This method filters out cached files, temporary files, and other
files that should not be included in the build process.

Args:
file_path: Path to the file to check.

Returns:
True if the file should be ignored, False otherwise.
"""
filename = file_path.name

# Ignore files starting with .~ (cached/temporary files)
if filename.startswith(".~"):
return True

# Ignore files starting with ~ (backup files)
if filename.startswith("~"):
return True

# Ignore hidden files starting with . (except specific ones we want)
if filename.startswith(".") and filename not in {".gitkeep"}:
return True

# Ignore common temporary file patterns
return bool(filename.endswith((".tmp", ".temp")))

def build_all(self) -> None:
"""Build all documentation files from source to build directory.

@@ -76,7 +107,8 @@ def build_all(self) -> None:

# Collect all files to process
all_files = [
file_path for file_path in self.src_dir.rglob("*") if file_path.is_file()
file_path for file_path in self.src_dir.rglob("*")
if file_path.is_file() and not self._should_ignore_file(file_path)
]

if not all_files:
@@ -188,6 +220,34 @@ def _process_markdown_file(self, input_path: Path, output_path: Path) -> None:
logger.exception("Failed to process markdown file %s", input_path)
raise

def _process_notebook_file(self, input_path: Path, output_path: Path) -> None:
"""Process a Jupyter notebook file and convert to markdown.

This method converts a Jupyter notebook to markdown, applies preprocessing,
and writes the processed content to the output path as an .mdx file.

Args:
input_path: Path to the source notebook file.
output_path: Path where the processed file should be written.
"""
try:
# Convert notebook to markdown
markdown_content = convert_notebook(input_path)

# Apply markdown preprocessing
processed_content = self._process_markdown_content(markdown_content, input_path)

# Convert .ipynb to .mdx
output_path = output_path.with_suffix(".mdx")

# Write the processed content
with output_path.open("w", encoding="utf-8") as f:
f.write(processed_content)

except Exception:
logger.exception("Failed to process notebook file %s", input_path)
raise

def build_file(self, file_path: Path) -> None:
"""Build a single file by copying it to the build directory.

@@ -227,6 +287,10 @@ def build_file(self, file_path: Path) -> None:
if file_path.suffix.lower() in {".md", ".mdx"}:
self._process_markdown_file(file_path, output_path)
logger.info("Processed markdown: %s", relative_path)
# Handle notebook files with conversion to markdown
elif file_path.suffix.lower() == ".ipynb":
self._process_notebook_file(file_path, output_path)
logger.info("Converted notebook: %s", relative_path)
else:
shutil.copy2(file_path, output_path)
logger.info("Copied: %s", relative_path)
@@ -269,6 +333,10 @@ def _build_file_with_progress(self, file_path: Path, pbar: tqdm) -> bool:
if file_path.suffix.lower() in {".md", ".mdx"}:
self._process_markdown_file(file_path, output_path)
return True
# Handle notebook files with conversion to markdown
if file_path.suffix.lower() == ".ipynb":
self._process_notebook_file(file_path, output_path)
return True
shutil.copy2(file_path, output_path)
return True
return False
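
Note: this diff imports `convert_notebook` from `pipeline.tools.notebook.convert`, but that module's diff isn't shown above. A minimal sketch of what the converter might look like, assuming it wraps nbconvert's `MarkdownExporter` (the actual implementation in this PR may differ):

```python
from pathlib import Path

import nbformat
from nbconvert import MarkdownExporter


def convert_notebook(input_path: Path) -> str:
    """Convert a Jupyter notebook to a markdown string (hypothetical sketch)."""
    # Read the notebook in the current (v4) schema.
    notebook = nbformat.read(str(input_path), as_version=4)
    # Export the notebook body as markdown; extracted resources are ignored here.
    body, _resources = MarkdownExporter().from_notebook_node(notebook)
    return body
```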
3 changes: 2 additions & 1 deletion pipeline/core/watcher.py
@@ -67,7 +67,8 @@ def on_modified(self, event: FileSystemEvent) -> None:
src_path = event.src_path

file_path = Path(src_path)
if file_path.suffix.lower() in self.builder.copy_extensions:
if (file_path.suffix.lower() in self.builder.copy_extensions and
not self.builder._should_ignore_file(file_path)):
logger.info("File changed: %s", file_path)
# Put file change event in queue for async processing
self.loop.call_soon_threadsafe(self.event_queue.put_nowait, file_path)
20 changes: 19 additions & 1 deletion src/docs.json
@@ -52,6 +52,24 @@
},
"navigation": {
"dropdowns": [
{
"dropdown": "LangChain v1",
"icon": "/images/brand/langchain-pill.svg",
"description": "LangChain v1 documentation and guides",
"tabs": [
{
"tab": "Prebuilts",
"pages": [
"langchain_v1/stuff",
"langchain_v1/map_reduce",
"langchain_v1/recursive",
"langchain_v1/rag_agent",
"langchain_v1/data_analysis",
"langchain_v1/supervisor"
]
}
]
},
{
"dropdown": "LangGraph Platform",
"icon": "/images/brand/langgraph-platform-pill.svg",
@@ -316,4 +334,4 @@
}
]
}
}
}
269 changes: 269 additions & 0 deletions src/langchain_v1/data_analysis.ipynb

Large diffs are not rendered by default.

170 changes: 170 additions & 0 deletions src/langchain_v1/extraction.md
@@ -0,0 +1,170 @@
# Document extraction

This guide shows you how to extract information from documents using LangChain's **prebuilt** extraction functionality. The extraction chain can produce either text summaries or structured data from one or more documents.

## Prerequisites

Before you start this guide, ensure you have the following:

- An [Anthropic](https://console.anthropic.com/settings/keys) API key

## 1. Install dependencies

If you haven't already, install LangGraph and LangChain:

```bash
pip install -U langgraph "langchain[anthropic]"
```

<Tip>
LangChain is required so the extractor can call a chat [model](https://python.langchain.com/docs/integrations/chat/).
</Tip>
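
You'll also need to make your API key available to the model client. One common pattern (an example, not the only option) is to set it as an environment variable:

```python
import getpass
import os

# Prompt for the key only if it isn't already set in the environment.
if not os.environ.get("ANTHROPIC_API_KEY"):
    os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Anthropic API key: ")
```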

## 2. Set up documents

First, create some documents to extract information from:

```python
from langchain_core.documents import Document

documents = [
Document(
id="1",
page_content="""Bobby Luka was 10 years old.
Synthetic fuels—produced from captured carbon and green hydrogen—are gaining traction in aviation. The EU's "ReFuelEU" mandate requires increasing blends of sustainable aviation fuel (SAF) starting in 2025. Airbus and Rolls-Royce have completed long-haul test flights powered entirely by synthetic kerosene.""",
metadata={"source": "synthetic_fuel_aviation"},
),
Document(
id="2",
page_content="""
AI is accelerating early-stage drug discovery, especially in target identification and molecule generation. Platforms like BenevolentAI and Insilico Medicine have generated preclinical candidates using generative models trained on biological and chemical data.""",
metadata={"source": "ai_drug_discovery"},
),
Document(
id="3",
page_content="""Jack Johnson was 23 years old and blonde.
Bobby Luka's hair is brown.""",
metadata={"source": "people_info"},
),
]
```

## 3. Configure a model

Configure an LLM for extraction using [init_chat_model](https://python.langchain.com/api_reference/langchain/chat_models/langchain.chat_models.base.init_chat_model.html):

```python
from langchain.chat_models import init_chat_model

model = init_chat_model(
"anthropic:claude-3-5-sonnet-latest",
temperature=0
)
```

## 4. Extract a basic summary

Create an extractor to produce text summaries from documents:

```python
from langchain.chains.summarization import create_summarizer

# Create a basic summarizer
summarizer = create_summarizer(
model,
initial_prompt="Produce a concise summary of the following document in 2-3 sentences."
).compile(name="TextSummarizer")

# Extract summary
result = summarizer.invoke({"documents": documents})
print(result["result"])
```

## 5. Extract structured summaries

To produce structured responses with a specific format, use the `response_format` parameter with a Pydantic model:

```python
from pydantic import BaseModel
from langchain.chains.summarization import create_summarizer

class Summary(BaseModel):
"""Structured summary with title and key points."""

title: str
key_points: list[str]

# Create structured summarizer
structured_summarizer = create_summarizer(
model,
initial_prompt="Extract the main topics and create a structured summary with a title and up to 3 key points.",
response_format=Summary
).compile(name="StructuredSummarizer")

# Extract structured summary
result = structured_summarizer.invoke({"documents": documents})

# Access structured fields
print(f"Title: {result['result'].title}")
print("Key points:")
for point in result['result'].key_points:
print(f" • {point}")
```

## 6. Extract entities with source tracking

Extract specific entities while tracking which documents they came from:

```python
from typing import Optional
from pydantic import BaseModel, Field

class Person(BaseModel):
"""Person entity with source tracking."""

name: str
age: Optional[str] = None
hair_color: Optional[str] = None
source_doc_ids: list[str] = Field(
default_factory=list,
description="The IDs of the documents where the information was found.",
)

class PeopleExtraction(BaseModel):
"""Collection of extracted people."""

people: list[Person]

# Create entity extractor
entity_extractor = create_summarizer(
model,
initial_prompt="Extract information about people mentioned in the documents. Include the document IDs where each piece of information was found.",
response_format=PeopleExtraction
).compile(name="EntityExtractor")

# Extract entities
result = entity_extractor.invoke({"documents": documents})

# Display extracted people with sources
for person in result['result'].people:
print(f"Name: {person.name}")
if person.age:
print(f" Age: {person.age}")
if person.hair_color:
print(f" Hair: {person.hair_color}")
print(f" Sources: {', '.join(person.source_doc_ids)}")
print()
```

## Custom prompts

Customize extraction behavior with specific prompts:

```python
custom_extractor = create_summarizer(
model,
initial_prompt="Focus on extracting technical information and key innovations mentioned in the documents."
).compile()
```
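
As with the earlier extractors, the compiled chain can then be invoked with a `documents` key (assuming the same invocation API as the examples above):

```python
result = custom_extractor.invoke({"documents": documents})
print(result["result"])
```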

For more advanced extraction patterns and customization, see the [extraction how-to guides](../how-tos/extraction/).