Jina tool (#185)

darielnoel · web-flow · commit 2994c39e9c7a · 2025-01-08T10:03:03.000-05:00
feat(tools): add jina-url-to-markdown tool integration
- Updated package.json to include jina-url-to-markdown with ESM and CJS
entry points.
- Modified rollup.config.mjs to include jina-url-to-markdown in the
toolFolders array.
- Exported jina-url-to-markdown in the main index.js file for tools.

This integration enhances the tools package by adding functionality for
converting Jina URLs to Markdown format.
diff --git a/.gitignore b/.gitignore
@@ -36,6 +36,7 @@ yarn-error.log*
 .DS_Store
 *.pem
 _todo.md
+.vscode
 
 *storybook.log
 _todo.md
diff --git a/packages/tools/package.json b/packages/tools/package.json
@@ -53,6 +53,10 @@
     "./make-webhook": {
       "import": "./dist/make-webhook/index.esm.js",
       "require": "./dist/make-webhook/index.cjs.js"
+    },
+    "./jina-url-to-markdown": {
+      "import": "./dist/jina-url-to-markdown/index.esm.js",
+      "require": "./dist/jina-url-to-markdown/index.cjs.js"
     }
   },
   "files": [
diff --git a/packages/tools/rollup.config.mjs b/packages/tools/rollup.config.mjs
@@ -20,6 +20,7 @@ const toolFolders = [
   'textfile-search',
   'zapier-webhook',
   'make-webhook',
+  'jina-url-to-markdown',
 ]; // Add more folder names as needed
 
 const toolConfigs = toolFolders.map((tool) => {
diff --git a/packages/tools/src/index.js b/packages/tools/src/index.js
@@ -10,3 +10,4 @@ export * from './pdf-search/index.js';
 export * from './textfile-search/index.js';
 export * from './zapier-webhook/index.js';
 export * from './make-webhook/index.js';
+export * from './jina-url-to-markdown/index.js';
diff --git a/packages/tools/src/jina-url-to-markdown/README.md b/packages/tools/src/jina-url-to-markdown/README.md
@@ -0,0 +1,81 @@
+# Jina URL to Markdown
+
+This tool integrates with Jina (https://jina.ai/), a web scraping and crawling service designed to turn websites into LLM-ready data. It enables the extraction of clean, well-formatted content from websites, making it ideal for AI applications, particularly those using Large Language Models (LLMs).
+
+## Components
+
+The tool uses the following components:
+
+- The Jina Reader HTTP endpoint (`https://r.jina.ai/`), called directly
+- An optional API Key for authentication
+- A custom HTTP client (ky) for making API requests
+- Input validation using Zod schema
+- Configurable request options (including output format), passed through to the Jina API
+
+## Key Features
+
+- Scrapes and crawls websites, even those with dynamic content
+- Converts web content into clean, LLM-ready markdown
+- Handles complex web scraping challenges:
+  - Rate limits
+  - JavaScript rendering
+  - Anti-bot mechanisms
+- Multiple output format options
+- Clean, structured data extraction
+- Support for dynamic content
+- Automatic content cleaning and formatting
+
+## Input
+
+The input should be a JSON object with a "url" field containing the URL to scrape and retrieve content from.
+
+## Output
+
+The output is the scraped content from the specified URL, formatted according to the configured format (default: markdown).
+
+## Configuration Options
+
+- `apiKey`: Your Jina API key (optional)
+- `options`: Options for the Jina API request (optional)
+
+## Example
+
+```javascript
+const tool = new JinaUrlToMarkdown();
+
+const result = await tool._call({
+  url: 'https://example.com',
+});
+```
+
+## Advanced Example with Custom Options and Error Handling
+
+```javascript
+const tool = new JinaUrlToMarkdown({
+  apiKey: process.env.JINA_API_KEY,
+  options: {
+    targetSelector: ['body', '.class', '#id'],
+    retainImages: 'none',
+  },
+});
+
+try {
+  const result = await tool._call({
+    url: 'https://example.com/blog/article',
+  });
+
+  // Process the scraped content
+  console.log('Markdown content:', result);
+
+  // Use the content with an LLM or other processing
+  // ...
+} catch (error) {
+  console.error('Error scraping website:', error);
+}
+```
+
+For more information about Jina, visit: https://jina.ai/, https://r.jina.ai/docs
+
+### Disclaimer
+
+Ensure you have proper API credentials and respect Jina's usage terms and rate limits. The service offers flexible pricing plans, including a free tier for small-scale use. When scraping websites, make sure to comply with the target website's terms of service and robots.txt directives.
diff --git a/packages/tools/src/jina-url-to-markdown/index.js b/packages/tools/src/jina-url-to-markdown/index.js
@@ -0,0 +1,81 @@
+/**
+ * Jina URL to Markdown
+ *
+ * This tool integrates with Jina (https://jina.ai/), a web scraping
+ * and crawling service designed to turn websites into LLM-ready data.
+ *
+ * Jina allows you to extract clean, well-formatted markdown or structured data
+ * from websites, making it ideal for AI applications, particularly those using
+ * Large Language Models (LLMs).
+ *
+ * Key features of Jina:
+ * - Scrapes and crawls websites, even those with dynamic content
+ * - Converts web content into clean, LLM-ready markdown
+ * - Handles challenges like rate limits, JavaScript rendering, and anti-bot mechanisms
+ * - Offers flexible pricing plans, including a free tier for small-scale use
+ *
+ * Usage:
+ * const tool = new JinaUrlToMarkdown();
+ * const result = await tool._call({ url: 'https://example.com' });
+ * or
+ * const tool = new JinaUrlToMarkdown({ apiKey: 'your-api-key', options: { 'targetSelector': ['body', '.class', '#id'], 'retainImages': 'none' } });
+ * const result = await tool._call({ url: 'https://example.com' });
+ *
+ * For more information about Jina, visit: https://jina.ai/, https://r.jina.ai/docs
+ */
+
+import { Tool } from '@langchain/core/tools';
+import { z } from 'zod';
+import ky from 'ky';
+import { HTTPError } from 'ky';
+
+/** LangChain tool that fetches a web page through Jina (https://r.jina.ai/). */
+export class JinaUrlToMarkdown extends Tool {
+  /** @param {object} [fields={}] Tool fields; optional `apiKey` and `options`. */
+  constructor(fields = {}) {
+    super(fields);
+    this.name = 'jina-url-to-markdown';
+    this.apiKey = fields.apiKey;
+    this.options = fields.options || {};
+    this.description = `Fetches web content from a specified URL and returns it in Markdown format. Input should be a JSON object with a "url".`;
+    this.headers = { 'Content-Type': 'application/json' };
+    if (this.apiKey) {
+      this.headers.Authorization = `Bearer ${this.apiKey}`;
+    }
+    // Define the input schema using Zod
+    this.schema = z.object({
+      url: z.string().describe('The URL to scrape and retrieve content from.'),
+    });
+    this.httpClient = ky;
+  }
+
+  /**
+   * POSTs `input.url` plus `this.options` to Jina and returns the `data` field.
+   * Failures are reported as message strings rather than thrown.
+   * @param {{ url: string }} input - Object holding the URL to fetch.
+   * @returns {Promise<*>} `data` from the JSON response, or a message string.
+   */
+  async _call(input) {
+    try {
+      const response = await this.httpClient
+        .post(`https://r.jina.ai/`, {
+          json: { url: input.url, ...this.options },
+          headers: this.headers,
+        })
+        .json();
+      return response?.data || 'The API returned an empty response.';
+    } catch (error) {
+      if (!(error instanceof HTTPError)) {
+        return `An unexpected error occurred: ${error.message}`;
+      }
+      const statusCode = error.response.status;
+      let errorType = 'Unknown';
+      if (statusCode >= 400 && statusCode < 500) {
+        errorType = 'Client Error';
+      } else if (statusCode >= 500) {
+        errorType = 'Server Error';
+      }
+      return `API request failed: ${errorType} (${statusCode})`;
+    }
+  }
+}
diff --git a/packages/tools/src/jina-url-to-markdown/tool.stories.jsx b/packages/tools/src/jina-url-to-markdown/tool.stories.jsx
@@ -0,0 +1,82 @@
+import { ToolPreviewer } from '../_utils/ToolPreviewer.jsx';
+import { AgentWithToolPreviewer } from '../_utils/AgentWithToolPreviewer.jsx';
+import { JinaUrlToMarkdown } from './index.js';
+import { Agent, Task, Team } from '../../../../src/index';
+import React from 'react';
+
+// Storybook metadata (default export) for the Jina URL to Markdown stories: https://storybook.js.org/docs/writing-stories#default-export
+export default {
+  title: 'Tools/Jina URL to Markdown',
+  parameters: {
+    // Optional parameter to center the component in the Canvas. More info: https://storybook.js.org/docs/configure/story-layout
+    layout: 'centered',
+  },
+  // This component will have an automatically generated Autodocs entry: https://storybook.js.org/docs/writing-docs/autodocs
+  tags: ['autodocs'],
+  // More on argTypes: https://storybook.js.org/docs/api/argtypes
+  argTypes: {
+    // backgroundColor: { control: 'color' },
+    // url: { control: 'text' },
+    // apiKey: { control: 'text' },
+    // format: { control: 'select', options: ['markdown', 'json']},
+    // initializationCode: { table: { disable: true } },
+    // executionCode: { table: { disable: true } }
+  },
+};
+
+const jinaUrlToMarkdownTool = new JinaUrlToMarkdown({ // Shared by both stories; apiKey/options left unset (examples below).
+  // apiKey: import.meta.env.VITE_JINA_API_KEY,
+  // options: {
+  //   targetSelector: ['body', '.class', '#id'],
+  //   retainImages: 'none',
+  // },
+});
+
+// Story that previews the tool on its own via ToolPreviewer, called with a sample URL. Args docs: https://storybook.js.org/docs/writing-stories/args
+export const Default = {
+  render: (args) => <ToolPreviewer {...args} />,
+  args: {
+    toolInstance: jinaUrlToMarkdownTool,
+    callParams: {
+      url: 'https://www.kaibanjs.com',
+    },
+  },
+};
+
+// Create an agent equipped with the Jina URL to Markdown tool
+const webResearcher = new Agent({
+  name: 'Web Researcher',
+  role: 'Web Content Analyzer',
+  goal: 'Extract and analyze content from specified websites',
+  tools: [jinaUrlToMarkdownTool],
+});
+
+// Create a research task assigned to the web researcher; {url} is a placeholder in the description
+const webAnalysisTask = new Task({
+  description:
+    'Fetches web content from the following URL: {url} and provides a structured summary',
+  agent: webResearcher,
+  expectedOutput: 'A well-formatted analysis of the website content',
+});
+
+// Create the team; `inputs.url` presumably fills the {url} placeholder in the task description (confirm against Team docs)
+const team = new Team({
+  name: 'Web Analysis Unit',
+  description: 'Specialized team for web content extraction and analysis',
+  agents: [webResearcher],
+  tasks: [webAnalysisTask],
+  inputs: {
+    url: 'https://www.kaibanjs.com',
+  },
+  env: {
+    OPENAI_API_KEY: import.meta.env.VITE_OPENAI_API_KEY,
+  },
+});
+
+// Story that renders the team above through AgentWithToolPreviewer. Args docs: https://storybook.js.org/docs/writing-stories/args
+export const withAgent = {
+  render: (args) => <AgentWithToolPreviewer {...args} />,
+  args: {
+    team: team,
+  },
+};
diff --git a/packages/tools/src/jina-url-to-markdown/tool.test.js b/packages/tools/src/jina-url-to-markdown/tool.test.js