
Commit f0a4af2

Creates LeafyGreen AI RAG bot crawler (#2842)
* init crawler moved from private
* install
1 parent b239369 commit f0a4af2

29 files changed, +3304 −11 lines changed

pnpm-lock.yaml

Lines changed: 1991 additions & 11 deletions
Some generated files are not rendered by default.

tools/crawler/.env.example

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
MONGODB_USER=<YOUR_MONGODB_USER>
MONGODB_PASSWORD=<YOUR_MONGODB_PASSWORD>
MONGODB_PROJECT_URL=<YOUR_PROJECT_URL>
MONGODB_APP_NAME=LeafyGreenAI

# Used for vector embedding
AZURE_API_KEY1=<Key1>
AZURE_API_KEY2=<Key2>
AZURE_OPENAI_ENDPOINT=https://<your-env>.openai.azure.com/
AZURE_OPENAI_DEPLOYMENT=text-embedding-3-small
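
The Azure variables above feed the vector-embedding step. As a rough sketch of the idea only (the crawler's actual wiring lives elsewhere in the package and is not shown in this diff), the `openai` dependency from `package.json` could be configured like this; the `apiVersion` value and the sample input are assumptions:

```ts
import { AzureOpenAI } from 'openai';

// Sketch, not the crawler's real code: build an Azure OpenAI client from the
// .env values above and request an embedding from the configured deployment.
const client = new AzureOpenAI({
  endpoint: process.env.AZURE_OPENAI_ENDPOINT,
  apiKey: process.env.AZURE_API_KEY1,
  deployment: process.env.AZURE_OPENAI_DEPLOYMENT, // text-embedding-3-small
  apiVersion: '2024-02-01', // assumed API version
});

const { data } = await client.embeddings.create({
  model: process.env.AZURE_OPENAI_DEPLOYMENT ?? 'text-embedding-3-small',
  input: 'Some crawled page text to embed',
});

console.log(data[0].embedding.length); // vector dimensionality
```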

tools/crawler/CHANGELOG.md

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# @lg-tools/crawler

## 0.0.2

### Patch Changes

## Added

- Implemented prune command in CLI
- Added prune script in package.json
- Created CrawlerDocument interface for better type safety
- Implemented robots.txt checking functionality
- Added new utility function newURL for enhanced URL processing

## Changed

- Updated SOURCES in constants.ts to include additional URLs and collections
- Changed log color to green in processSingleUrl for better visibility
- Refactored crawler logic to improve URL processing
- Enhanced logging in recursive crawling functionality
- Improved URL processing with better logging

## Updated

- Refactored crawler constants for better organization
- Updated various log formats and display
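
The `CrawlerDocument` interface mentioned above is not part of this diff. A purely hypothetical shape, inferred only from the crawl, embedding, and prune behavior described elsewhere in this commit, might look like:

```ts
// Hypothetical sketch of a CrawlerDocument -- the real interface is defined in
// the crawler source and may differ. Fields are guessed from behavior in this
// commit: pages are embedded via Azure OpenAI, stored in MongoDB collections,
// and `prune` drops documents older than a given number of days.
interface CrawlerDocument {
  url: string; // page that was crawled
  content: string; // extracted text content
  embedding: number[]; // vector from text-embedding-3-small
  lastUpdated: Date; // timestamp that a prune step could compare against
}
```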

tools/crawler/README.md

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
# LeafyGreen Crawler Tool

A CLI tool for crawling and analyzing website content for LeafyGreen AI.

## Overview

This tool crawls websites and stores the content in MongoDB collections for use with LeafyGreen AI systems. The crawler can process either specific URLs or use pre-configured website sources.

## Prerequisites

- Node.js (v16 or higher)
- Yarn package manager
- MongoDB Atlas account with connection details
- Environment variables properly configured

## Installation

```bash
# From the root of the leafygreen-ui-private repository
cd tools/crawler
yarn install
```

## Configuration

Create a `.env` file in the `tools/crawler` directory with the following variables:

```
MONGODB_USER=your_mongodb_user
MONGODB_PASSWORD=your_mongodb_password
MONGODB_PROJECT_URL=your_project_url
MONGODB_APP_NAME=your_app_name
```

### Default Sources

The crawler comes with pre-configured sources in `src/constants.ts`:

- MongoDB Design (https://mongodb.design)
- React Documentation (https://react.dev)
- MDN Web Docs (https://developer.mozilla.org)

To add or modify sources, edit the `SOURCES` array in `src/constants.ts`.
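
An added entry follows the same `{ url, collection }` shape as the existing sources; for example (the URL and collection name below are placeholders, not part of this commit):

```ts
// src/constants.ts (sketch): appending a hypothetical source.
// `url` is the crawl entry point; `collection` is the MongoDB collection
// (inside the `rag-sources` database) that receives the crawled documents.
export const SOURCES = [
  // ...existing entries...
  {
    url: 'https://web.dev/learn',
    collection: 'web-dev',
  },
] as const;
```
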
## Usage

### Building the Tool

```bash
yarn build
```

### Basic Usage

```bash
# Use the built version
yarn lg-crawler

# Or use the development version
yarn crawl
```

### Command Line Options

- `-v, --verbose`: Enable verbose output
- `-d, --depth <number>`: Set maximum crawl depth (default: 3)
- `--url <url>`: Specify a single URL to crawl
- `--dry-run`: Run crawler without inserting documents into MongoDB

### Examples

```bash
# Crawl all pre-configured sources with verbose output
yarn crawl --verbose

# Crawl a specific URL with a depth of 2
yarn crawl --url https://example.com --depth 2

# Test crawling without saving to MongoDB
yarn crawl --dry-run --verbose
```

## Development

### Project Structure

- `src/index.ts`: Main entry point and command-line interface
- `src/crawler.ts`: Core crawler implementation
- `src/constants.ts`: Configuration constants and source definitions
- `src/utils/`: Helper utilities for crawling and data processing

### Adding New Features

1. Make your code changes
2. Build the project: `yarn build`
3. Test your changes: `yarn crawl --dry-run --verbose`

### Running Tests

```bash
yarn test
```

## Troubleshooting

- **MongoDB Connection Issues**: Verify your `.env` file has the correct credentials
- **Crawling Errors**: Use the `--verbose` flag to get detailed logs
- **Rate Limiting**: Some websites may block the crawler if too many requests are made

## License

Apache-2.0

tools/crawler/bin/cli.js

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
#!/usr/bin/env node
require('../dist/cli.js');

tools/crawler/package.json

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
{
  "name": "@lg-tools/crawler",
  "version": "0.0.2",
  "description": "Crawler for MongoDB documentation and other sites",
  "type": "module",
  "main": "./dist/index.js",
  "module": "./dist/esm/index.js",
  "types": "./dist/types/index.d.ts",
  "bin": {
    "lg-crawler": "./bin/cli.js"
  },
  "scripts": {
    "build": "lg build-package",
    "tsc": "lg build-ts",
    "postbuild": "zip -r dist/lambda.zip dist/lambda.js node_modules package.json",
    "crawl": "tsx src/cli.ts crawl",
    "prune": "tsx src/cli.ts prune",
    "deploy": "bash scripts/deploy.sh"
  },
  "publishConfig": {
    "access": "public"
  },
  "keywords": [
    "mongodb",
    "ui",
    "kit",
    "components",
    "react",
    "uikit",
    "leafygreen",
    "crawler",
    "ai"
  ],
  "author": "",
  "license": "Apache-2.0",
  "dependencies": {
    "@azure/identity": "^4.9.1",
    "@langchain/community": "^0.3.42",
    "@langchain/core": "^0.3.42",
    "chalk": "4.1.2",
    "cheerio": "^1.0.0",
    "commander": "^13.1.0",
    "dotenv": "^16.5.0",
    "langchain": "^0.3.24",
    "lodash": "^4.17.21",
    "mongodb": "^6.16.0",
    "openai": "^4.97.0",
    "ora": "^8.2.0"
  },
  "devDependencies": {
    "@lg-tools/build": "workspace:^",
    "@lg-tools/meta": "workspace:^",
    "tsx": "^4.19.4"
  }
}

tools/crawler/rollup.config.mjs

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
import { esmConfig, umdConfig } from '@lg-tools/build/config/rollup.config.mjs';

const cli = {
  ...umdConfig,
  input: ['./src/cli.ts'],
};

const lambda = {
  ...umdConfig,
  input: ['./src/lambda.ts'],
  external: [],
};

export default [esmConfig, umdConfig, cli, lambda];

tools/crawler/scripts/deploy.sh

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
#!/bin/bash

# Configuration
FUNCTION_NAME="ragCrawl"
ZIP_FILE="./dist/lambda.zip"

# Check if the zip file exists
if [ ! -f "$ZIP_FILE" ]; then
  echo "Error: $ZIP_FILE does not exist. Please build the Lambda package first."
  exit 1
fi

echo "Deploying $ZIP_FILE to Lambda function: $FUNCTION_NAME"


aws lambda update-function-code \
  --function-name $FUNCTION_NAME \
  --zip-file fileb://$ZIP_FILE


if [ $? -eq 0 ]; then
  echo "Successfully updated Lambda function: $FUNCTION_NAME"

  # Optional: Wait for function to be updated and then publish a new version
  echo "Waiting for function update to complete..."
  aws lambda wait function-updated --function-name $FUNCTION_NAME

  # Print the function details
  echo "Getting updated function details..."
  aws lambda get-function \
    --function-name $FUNCTION_NAME \
    --query 'Configuration.[FunctionName,Version,LastModified]'
else
  echo "Failed to update Lambda function"
  exit 1
fi

tools/crawler/src/cli.ts

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
import { Command } from 'commander';

import { crawl } from './crawler';
import { prune } from './prune';

const program = new Command();

// Initialize CLI program
program
  .name('lg-crawler')
  .description(
    'A CLI tool for crawling and analyzing website content for LeafyGreen AI',
  );

program
  .command('crawl')
  .description('Run the crawler')
  .option('-v, --verbose', 'Enable verbose output', false)
  .option('-d, --depth <number>', 'Maximum crawl depth', '3')
  .option(
    '--url <url>',
    'Specific URL to crawl. If not provided, the crawler will scan all URLs defined in the config.',
  )
  .option(
    '--dry-run',
    'Run crawler without inserting documents into MongoDB',
    false,
  )
  .action(crawl);

program
  .command('prune')
  .description(
    'Prune old documents from MongoDB collections used by LeafyGreen Crawler',
  )
  .option('-v, --verbose', 'Enable verbose output', false)
  .option(
    '--dry-run',
    'Run prune without deleting documents from MongoDB',
    false,
  )
  .option(
    '-d, --days <number>',
    'Keep documents newer than this many days',
    '7',
  )
  .action(prune);

// Parse the command line arguments
program.parse(process.argv);

export default program;
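
For reference, Commander passes each command's parsed options to its action handler; based on the flags registered above, the handlers receive roughly the following shape. This is a sketch only; the real option types belong to `./crawler` and `./prune`, which are not part of this diff:

```ts
// Sketch of the parsed-option shapes implied by the flags registered above.
// Commander camelCases multi-word flags (--dry-run -> dryRun) and keeps
// string-valued options such as --depth and --days as strings.
interface CrawlOptions {
  verbose: boolean;
  depth: string; // e.g. '3' (the default); convert with Number() before use
  url?: string; // absent when crawling all configured SOURCES
  dryRun: boolean;
}

interface PruneOptions {
  verbose: boolean;
  dryRun: boolean;
  days: string; // e.g. '7' (the default)
}
```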

tools/crawler/src/constants.ts

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
import dotenv from 'dotenv';
dotenv.config();

const {
  MONGODB_USER,
  MONGODB_PASSWORD,
  MONGODB_PROJECT_URL,
  MONGODB_APP_NAME,
} = process.env;

export const MDB_URI = `mongodb+srv://${MONGODB_USER}:${MONGODB_PASSWORD}@${MONGODB_PROJECT_URL}/?retryWrites=true&w=majority&appName=${MONGODB_APP_NAME}`;
export const MDB_DB = 'rag-sources' as const;

export const EMBEDDING_MODEL_NAME = 'text-embedding-3-small';

export const SOURCES = [
  {
    url: 'https://mongodb.design',
    collection: 'mongodb-dot-design',
  },
  {
    url: 'https://react.dev/reference/react',
    collection: 'react-dev',
  },
  {
    url: 'https://developer.mozilla.org/en-US/docs/Web',
    collection: 'mdn',
  },
  {
    url: 'https://css-tricks.com/category/articles',
    collection: 'css-tricks',
  },
  {
    url: 'https://www.nngroup.com/articles',
    collection: 'nn-group',
  },
  {
    url: 'https://www.w3.org/WAI/standards-guidelines/wcag',
    collection: 'wcag',
  },
  {
    url: 'https://atomicdesign.bradfrost.com/table-of-contents',
    collection: 'atomic-design',
  },
] as const;

/**
 * Allow the crawler to follow links to these domains
 * (with restricted depth)
 */
export const allowedDomains = [
  'https://www.mongodb.com',
  'https://github.com',
] as const;
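
The doc comment above describes `allowedDomains` as an allow-list for links that leave the configured sources. A minimal illustrative sketch of such a check (the crawler's actual link-filtering logic lives in its crawler source, which is not shown in this commit) might look like:

```ts
import { SOURCES, allowedDomains } from './constants';

// Illustrative sketch, not the crawler's actual code: a discovered link may be
// followed if it stays under one of the configured SOURCES entry points or if
// its origin is on the allow-list of additional domains.
export function mayFollow(href: string): boolean {
  const inSource = SOURCES.some(({ url }) => href.startsWith(url));
  const inAllowedDomain = allowedDomains.some(domain =>
    href.startsWith(domain),
  );
  return inSource || inAllowedDomain;
}
```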
