
Commit d6063ba

Merge pull request #3 from stride-nyc/api-limit-100
feat: Add automatic pagination for LangSmith API 100-record limit
2 parents: 2238e0e + 5756a93

File tree

4 files changed: +469, -15 lines

.env.example

Lines changed: 2 additions & 1 deletion
@@ -12,4 +12,5 @@ LANGSMITH_API_URL=https://api.smith.langchain.com
 LANGSMITH_PROJECT=your-project-name
 
 # Default trace limit (optional)
-LANGSMITH_LIMIT=150
+# Note: LangSmith API paginates results in chunks of 100
+LANGSMITH_LIMIT=100
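
For context on the new note: the README documents that `--limit` falls back to `LANGSMITH_LIMIT` when it is not passed on the command line. A minimal sketch of that fallback (a hypothetical parser for illustration only, not the script's actual argument setup; assumes `python-dotenv` is installed to load the `.env` file):

```python
import argparse
import os

from dotenv import load_dotenv  # assumption: python-dotenv provides .env loading

load_dotenv()  # makes LANGSMITH_LIMIT from .env visible via os.getenv

parser = argparse.ArgumentParser(description="Illustrative limit fallback")
# CLI value wins; otherwise fall back to LANGSMITH_LIMIT (100 in the .env above)
parser.add_argument(
    "--limit",
    type=int,
    default=int(os.getenv("LANGSMITH_LIMIT", "100")),
)
args = parser.parse_args()
print(f"Exporting up to {args.limit} traces")
```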

README.md

Lines changed: 46 additions & 3 deletions
@@ -9,13 +9,14 @@ This Python script exports trace data from LangSmith using the SDK API, designed
 ## Features
 
 - Export N most recent traces from any LangSmith project
+- **Automatic pagination** - Handles large exports (> 100 records) seamlessly with progress indication
 - **Environment variable support** - Configure once via `.env` file for simplified usage
 - Automatic rate limiting with exponential backoff
 - Progress indication for long-running exports
 - Comprehensive error handling (auth, network, rate limits)
 - Structured JSON output with metadata
 - Type-safe implementation with full type hints
-- Test-driven development with pytest suite (25 tests, 86% coverage)
+- Test-driven development with pytest suite (33 tests, high coverage)
 
 ## Requirements
 
@@ -114,8 +115,10 @@ python export_langsmith_traces.py \
 ### Parameters
 
 - `--api-key` (optional): LangSmith API key for authentication (default: `LANGSMITH_API_KEY` env var)
-- `--project` (optional): LangSmith project name or ID (default: `LANGSMITH_PROJECT` env var)
+- `--project` (optional): LangSmith project name or ID (default: `LANGSMITH_PROJECT` env var)
 - `--limit` (optional): Number of most recent traces to export (default: `LANGSMITH_LIMIT` env var)
+  - For limits > 100, the tool automatically handles pagination across multiple API calls
+  - If fewer records exist in the project, you'll receive a warning and all available records
 - `--output` (required): Output JSON file path
 
 **Note**: While the CLI arguments are now optional, the values must be provided either via command line or environment variables.
@@ -150,6 +153,45 @@ python export_langsmith_traces.py \
   --output "large_export.json"
 ```
 
+## Pagination for Large Exports
+
+The LangSmith API limits results to 100 records per call. This tool automatically handles pagination for larger exports with progress indication:
+
+**Example: Exporting 500 records**
+```bash
+python export_langsmith_traces.py --limit 500 --output large_export.json
+```
+
+**Output:**
+```
+🚀 Exporting 500 traces from project 'my-project'...
+✓ Connected to LangSmith API
+📥 Fetching traces...
+📄 Fetching 500 runs across 5 pages...
+✓ Page 1/5: 100 runs (Total: 100)
+✓ Page 2/5: 100 runs (Total: 200)
+✓ Page 3/5: 100 runs (Total: 300)
+✓ Page 4/5: 100 runs (Total: 400)
+✓ Page 5/5: 100 runs (Total: 500)
+✓ Fetched 500 traces
+🔄 Formatting trace data...
+✓ Data formatted
+💾 Exporting to large_export.json...
+✅ Export complete! Saved to large_export.json
+```
+
+**If Project Has Fewer Records:**
+```
+⚠️ Warning: Fetched 250 runs (requested 500)
+```
+
+**Pagination Features:**
+- Automatic chunking into 100-record pages
+- Progress indication for multi-page exports
+- Rate limiting between pages (500ms delay)
+- Retry logic per page for reliability
+- Warning when fewer records available than requested
+
 ## Output Format
 
 The script generates a JSON file with the following structure:
@@ -227,14 +269,15 @@ All core features implemented and tested:
 - ✅ Dependencies configuration with CI/CD quality gates
 - ✅ CLI argument parsing with validation
 - ✅ **Environment variable support** - Optional `.env` file configuration for simplified usage
+- ✅ **Automatic pagination** - Handles API 100-record limit with multi-page fetching and progress indication
 - ✅ LangSmith client initialization with authentication
 - ✅ Run fetching with exponential backoff rate limiting
 - ✅ Data formatting and transformation with safe field extraction
 - ✅ JSON export functionality with error handling
 - ✅ Comprehensive error scenario handling
 - ✅ Main orchestration with user-friendly progress feedback
 - ✅ End-to-end integration testing
-- ✅ Test suite: 25 tests, 86% coverage
+- ✅ Test suite: 33 tests (25 original + 8 pagination tests), high coverage
 - ✅ Code quality: Black, Ruff, mypy, Bandit, Safety checks passing
 
 ### Optional Features Not Implemented
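
As a rough illustration of the chunking and shortfall warning the new README section describes (a standalone sketch, not the repository's implementation), a requested limit splits into 100-record pages and a warning fires when fewer runs exist than requested:

```python
def plan_pages(limit: int, chunk_size: int = 100) -> list[int]:
    """Split a requested limit into per-page fetch sizes, e.g. 500 -> [100] * 5."""
    return [min(chunk_size, limit - start) for start in range(0, limit, chunk_size)]


print(plan_pages(500))  # [100, 100, 100, 100, 100] -> 5 pages
print(plan_pages(250))  # [100, 100, 50] -> 3 pages

# Shortfall check mirroring the documented warning
requested, fetched = 500, 250
if fetched < requested:
    print(f"⚠️ Warning: Fetched {fetched} runs (requested {requested})")
```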

export_langsmith_traces.py

Lines changed: 145 additions & 11 deletions
@@ -101,7 +101,10 @@ def _looks_like_uuid(self, value: str) -> bool:
 
     def fetch_runs(self, project_name: str, limit: int) -> List[Any]:
         """
-        Fetch runs from LangSmith with rate limiting.
+        Fetch runs from LangSmith with pagination support for large exports.
+
+        Due to LangSmith API limitations (max 100 records per call), this method
+        makes multiple API calls to fetch all requested runs.
 
         Args:
             project_name: Name or ID of the LangSmith project
@@ -114,16 +117,130 @@ def fetch_runs(self, project_name: str, limit: int) -> List[Any]:
             ProjectNotFoundError: If project doesn't exist
             RateLimitError: If rate limit exceeded after retries
         """
+        CHUNK_SIZE = 100  # LangSmith API limit per call
+
+        all_runs = []
+        fetched_count = 0
+
+        # Calculate number of pages needed
+        num_pages = (limit + CHUNK_SIZE - 1) // CHUNK_SIZE
+
+        # Only show pagination message if multiple pages needed
+        if num_pages > 1:
+            print(f" 📄 Fetching {limit} runs across {num_pages} pages...")
+
+        for page_num in range(num_pages):
+            # Calculate how many runs to fetch in this page
+            remaining = limit - fetched_count
+            page_size = min(CHUNK_SIZE, remaining)
+
+            # Fetch this page
+            page_runs = self._fetch_page_with_retry(
+                project_name=project_name,
+                limit=page_size,
+                fetched_so_far=fetched_count,
+                page_num=page_num + 1,
+                total_pages=num_pages,
+            )
+
+            # No more runs available
+            if len(page_runs) == 0:
+                if fetched_count == 0:
+                    # No runs at all - will be handled by caller
+                    break
+                else:
+                    # Got some runs but not all requested
+                    print(
+                        f" ℹ️ Reached end of available runs at {fetched_count} (requested {limit})"
+                    )
+                    break
+
+            all_runs.extend(page_runs)
+            fetched_count += len(page_runs)
+
+            # Progress update for multi-page fetches
+            if num_pages > 1:
+                print(
+                    f" ✓ Page {page_num + 1}/{num_pages}: {len(page_runs)} runs (Total: {fetched_count})"
+                )
+
+            # Check if we got fewer than requested - indicates no more runs available
+            if len(page_runs) < page_size:
+                if fetched_count < limit:
+                    print(
+                        f" ℹ️ Only {fetched_count} runs available (requested {limit})"
+                    )
+                break
+
+            # Reached our limit
+            if fetched_count >= limit:
+                break
+
+            # Add small delay between pages (not on last page)
+            if page_num < num_pages - 1 and fetched_count < limit:
+                time.sleep(0.5)  # 500ms delay between pages
+
+        # Final warning if significantly fewer runs than requested
+        if fetched_count < limit:
+            print(f" ⚠️ Warning: Fetched {fetched_count} runs (requested {limit})")
+
+        return all_runs
+
+    def _fetch_page_with_retry(
+        self,
+        project_name: str,
+        limit: int,
+        fetched_so_far: int,
+        page_num: int,
+        total_pages: int,
+    ) -> List[Any]:
+        """
+        Fetch a single page of runs with exponential backoff retry logic.
+
+        This method wraps the SDK's list_runs call with retry logic to handle
+        transient errors and rate limiting.
+
+        Since LangSmith SDK doesn't support offset parameter, we request all runs
+        up to our position + page size, then skip to our position using islice.
+
+        Args:
+            project_name: Name or ID of the LangSmith project
+            limit: Number of runs to fetch for this page
+            fetched_so_far: Number of runs already fetched (used for offset simulation)
+            page_num: Current page number (1-indexed, for logging)
+            total_pages: Total number of pages expected (for logging)
+
+        Returns:
+            List of Run objects from this page
+
+        Raises:
+            ProjectNotFoundError: If project doesn't exist
+            RateLimitError: If rate limit exceeded after retries
+        """
+        from itertools import islice
+
         attempt = 0
         last_exception = None
 
         while attempt < self.MAX_RETRIES:
             try:
-                # Try with project_name parameter first
-                runs = list(
-                    self.client.list_runs(project_name=project_name, limit=limit)
+                # Since LangSmith SDK doesn't support offset parameter,
+                # we request all runs up to our position + page size,
+                # then skip to our position using islice
+                total_to_request = fetched_so_far + limit
+
+                # Try with project_name first
+                runs_iterator = self.client.list_runs(
+                    project_name=project_name, limit=total_to_request
                 )
-                return runs
+
+                # Skip already-fetched runs and take the next page
+                page_runs = list(
+                    islice(runs_iterator, fetched_so_far, fetched_so_far + limit)
+                )
+
+                return page_runs
+
             except Exception as e:
                 last_exception = e
                 error_msg = str(e).lower()
@@ -137,12 +254,17 @@ def fetch_runs(self, project_name: str, limit: int) -> List[Any]:
                 if self._looks_like_uuid(project_name):
                     print("Trying project ID instead of name...")
                     try:
-                        runs = list(
-                            self.client.list_runs(
-                                project_id=project_name, limit=limit
+                        runs_iterator = self.client.list_runs(
+                            project_id=project_name, limit=total_to_request
+                        )
+                        page_runs = list(
+                            islice(
+                                runs_iterator,
+                                fetched_so_far,
+                                fetched_so_far + limit,
                             )
                         )
-                        return runs
+                        return page_runs
                     except Exception:  # nosec B110
                         pass  # Intentional: Fall through to retry logic if project_id also fails
 
@@ -158,15 +280,23 @@ def fetch_runs(self, project_name: str, limit: int) -> List[Any]:
                 attempt += 1
                 if attempt >= self.MAX_RETRIES:
                     break
+
                 # Exponential backoff
                 backoff_time = self.INITIAL_BACKOFF * (
                     self.BACKOFF_MULTIPLIER ** (attempt - 1)
                 )
+
+                # Only show retry message for multi-page fetches
+                if total_pages > 1:
+                    print(
+                        f" ⚠️ Page {page_num}/{total_pages} failed (attempt {attempt}/{self.MAX_RETRIES}), retrying in {backoff_time:.1f}s..."
+                    )
+
                 time.sleep(backoff_time)
 
         # If we get here, all retries failed
         raise RateLimitError(
-            f"Failed to fetch runs after {self.MAX_RETRIES} attempts. "
+            f"Failed to fetch page {page_num}/{total_pages} after {self.MAX_RETRIES} attempts. "
             f"Last error: {str(last_exception)}"
         ) from last_exception
 
@@ -423,7 +553,11 @@ def main() -> None:
     try:
         print("📥 Fetching traces...")
         runs = exporter.fetch_runs(project_name=args.project, limit=args.limit)
-        print(f"✓ Fetched {len(runs)} traces")
+        # fetch_runs now provides progress updates, so adjust final message
+        if len(runs) != args.limit:
+            print(f"✓ Fetched {len(runs)} traces (requested {args.limit})")
+        else:
+            print(f"✓ Fetched {len(runs)} traces")
 
         if len(runs) == 0:
             print("⚠️ No traces found in project")
