Skip to content

Commit ba27103

Browse files
danielgines and autofix-ci[bot]
authored and committed
feat: Add GitLoader Component with advanced filtering options (langflow-ai#2850)
* feat: Add GitLoader Component with advanced filtering options This commit introduces the GitLoaderComponent, enabling users to load files from a Git repository with advanced filtering options. GitLoader Component: - Implementation of the GitLoaderComponent to load files from a Git repository using the `langchain_community.document_loaders.git.GitLoader` module. - Advanced filtering option using `file_filter` to include or exclude specific files based on their extensions or other criteria. Examples of `file_filter` usage: - Include only .py files: `lambda file_path: file_path.endswith('.py')` - Exclude .py files: `lambda file_path: not file_path.endswith('.py')` This component ensures a flexible and customizable approach for loading documents from Git repositories, enhancing the user experience with advanced filtering capabilities. Features: - Support for loading documents from Git repositories. - Advanced file filtering options to include or exclude specific files. * feat: Add GitLoader Component with advanced filtering options This commit introduces the GitLoaderComponent, enabling users to load files from a Git repository with advanced filtering options. GitLoader Component: - Implementation of the GitLoaderComponent to load files from a Git repository using the `langchain_community.document_loaders.git.GitLoader` module. - Advanced filtering option using `file_filter` to include or exclude specific files based on their extensions or other criteria. Examples of `file_filter` usage: - Include only .py files: `lambda file_path: file_path.endswith('.py')` - Exclude .py files: `lambda file_path: not file_path.endswith('.py')` This component ensures a flexible and customizable approach for loading documents from Git repositories, enhancing the user experience with advanced filtering capabilities. Features: - Support for loading documents from Git repositories. - Advanced file filtering options to include or exclude specific files. 
* fix: Ensure proper evaluation and validation of file_filter in GitLoaderComponent This commit fixes the issue where the GitLoaderComponent would fail if the file_filter input was not evaluated correctly. Changes include: - Added a check to ensure that file_filter is a valid string before calling eval. - Ensured that the evaluated file_filter is callable, otherwise it defaults to None. * [autofix.ci] apply automated fixes * feat: Enhance GitLoaderComponent with dynamic inputs, content filtering - Changed inputs from `StrInput` to `MessageTextInput` to enable dynamic use with agents. - Added `content_filter` field to allow additional content filtering using regex. - Updated `file_filter` to support glob format, simplifying usage for users. - Implemented binary file removal filter to exclude binary files from queries, aligning with the agent's purpose. * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> (cherry picked from commit d108ca1)
1 parent f256458 commit ba27103

File tree

6 files changed

+152
-1
lines changed

6 files changed

+152
-1
lines changed
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
from pathlib import Path
2+
from typing import List
3+
import re
4+
5+
from langchain_community.document_loaders.git import GitLoader
6+
from langflow.custom import Component
7+
from langflow.io import MessageTextInput, Output
8+
from langflow.schema import Data
9+
10+
11+
class GitLoaderComponent(Component):
    """Load files from a Git repository as ``Data`` records.

    Wraps ``GitLoader`` from ``langchain_community`` and layers three optional
    per-file checks on top of it: glob-style file-name patterns, a binary-file
    guard, and a regex search over the file content.
    """

    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=True,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    @staticmethod
    def is_binary(file_path: str) -> bool:
        """
        Check if a file is binary by looking for null bytes.

        Only the first 1024 bytes are inspected. This is necessary because
        when searches are performed using the content_filter, binary files
        need to be ignored.
        """
        with open(file_path, "rb") as file:
            return b"\x00" in file.read(1024)

    @staticmethod
    def _split_patterns(raw_patterns: str):
        """Split a comma-separated pattern string into (include, exclude) lists.

        Patterns prefixed with ``!`` are exclusions (the prefix is removed).
        Empty entries -- e.g. from a trailing comma or a bare ``!`` -- are
        dropped, because ``Path.match`` rejects an empty pattern.
        """
        include: List[str] = []
        exclude: List[str] = []
        for chunk in raw_patterns.split(","):
            pattern = chunk.strip()
            if pattern.startswith("!"):
                pattern = pattern[1:].strip()
                bucket = exclude
            else:
                bucket = include
            if pattern:
                bucket.append(pattern)
        return include, exclude

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` whose ``file_filter`` applies this component's filters.

        A file is kept only if it:

        1. matches none of the ``!`` exclusion patterns and, when inclusion
           patterns are given, matches at least one of them;
        2. is not binary (see ``is_binary``);
        3. contains a match for the ``content_filter`` regex, when one is set.

        Raises:
            re.error: if ``content_filter`` is not a valid regular expression.
        """
        file_filter_patterns = getattr(self, "file_filter", None)
        content_filter_pattern = getattr(self, "content_filter", None)

        include_patterns: List[str] = []
        exclude_patterns: List[str] = []
        if file_filter_patterns:
            include_patterns, exclude_patterns = self._split_patterns(file_filter_patterns)

        content_regex = re.compile(content_filter_pattern) if content_filter_pattern else None

        def name_matches(file_path: Path) -> bool:
            if any(file_path.match(pattern) for pattern in exclude_patterns):
                return False
            # With exclusion-only patterns there is nothing to "include", so
            # every non-excluded file passes (previously all files were dropped
            # when more than one exclusion pattern was given).
            return not include_patterns or any(file_path.match(pattern) for pattern in include_patterns)

        def content_matches(file_path: Path) -> bool:
            if content_regex is None:
                return True
            with file_path.open("r", encoding="utf-8", errors="ignore") as file:
                return bool(content_regex.search(file.read()))

        def combined_filter(file_path: str) -> bool:
            path = Path(file_path)
            # Cheap name check first so files that are filtered out by name
            # are never opened; the binary sniff still precedes any text read.
            if not name_matches(path):
                return False
            if self.is_binary(file_path):
                return False
            return content_matches(path)

        return GitLoader(
            repo_path=self.repo_path,
            # An empty text input means "no clone URL"; pass None so the loader
            # sees it as absent rather than as an empty string.
            clone_url=self.clone_url or None,
            branch=self.branch,
            file_filter=combined_filter,
        )

    def load_documents(self) -> List[Data]:
        """Load the filtered repository files and return them as ``Data`` objects.

        The result is also stored in ``self.status``.
        """
        gitloader = self.build_gitloader()
        data = [Data.from_document(doc) for doc in gitloader.lazy_load()]
        self.status = data
        return data
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .Confluence import ConfluenceComponent
2+
from .GitLoader import GitLoaderComponent
23

3-
__all__ = ["ConfluenceComponent"]
4+
__all__ = ["ConfluenceComponent", "GitLoaderComponent"]
Lines changed: 1 addition & 0 deletions
Loading
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
const GitLoaderIcon = (props) => (
2+
<svg
3+
xmlns="http://www.w3.org/2000/svg"
4+
width="32"
5+
height="32"
6+
viewBox="0 0 32 32"
7+
fill="none"
8+
{...props}
9+
>
10+
<path
11+
d="M31.349 14.191L17.451.293a1.938 1.938 0 0 0-2.738 0L11.618 3.39l3.47 3.47a2.311 2.311 0 0 1 2.377.554 2.31 2.31 0 0 1 .549 2.392l3.36 3.359a2.31 2.31 0 0 1 2.393.55 2.311 2.311 0 0 1 0 3.27 2.312 2.312 0 0 1-3.271 0 2.309 2.309 0 0 1-.501-2.511l-3.12-3.12V20.24a2.31 2.31 0 0 1 .611 3.701 2.31 2.31 0 0 1-3.27 0 2.31 2.31 0 0 1 0-3.27 2.324 2.324 0 0 1 .759-.509V11.925a2.35 2.35 0 0 1-1.27-3.082L9.747 4.741 1.73 12.758a1.938 1.938 0 0 0 0 2.737L14.628 28.393a1.938 1.938 0 0 0 2.737 0l13.372-13.371a1.938 1.938 0 0 0 0-2.738"
12+
style={{
13+
stroke: "none",
14+
fillRule: "nonzero",
15+
fill: "#f03c2e",
16+
fillOpacity: 1,
17+
}}
18+
/>
19+
</svg>
20+
);
21+
22+
export default GitLoaderIcon;
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import React, { forwardRef } from "react";
2+
import SvgGitLoader from "./GitLoader";
3+
4+
// Ref-accepting wrapper around the raw SVG component; passes the ref and all
// received props straight through to SvgGitLoader.
type GitLoaderIconProps = React.PropsWithChildren<{}>;

export const GitLoaderIcon = forwardRef<SVGSVGElement, GitLoaderIconProps>(
  (props, ref) => <SvgGitLoader ref={ref} {...props} />,
);

src/frontend/src/utils/styleUtils.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ import { EvernoteIcon } from "../icons/Evernote";
179179
import { FBIcon } from "../icons/FacebookMessenger";
180180
import { FirecrawlIcon } from "../icons/Firecrawl";
181181
import { GitBookIcon } from "../icons/GitBook";
182+
import { GitLoaderIcon } from "../icons/GitLoader";
182183
import { GoogleIcon } from "../icons/Google";
183184
import { GoogleGenerativeAIIcon } from "../icons/GoogleGenerativeAI";
184185
import {
@@ -588,4 +589,5 @@ export const nodeIconsLucide: iconsType = {
588589
Table: Table,
589590
AIML: AIMLIcon,
590591
"AI/ML": AIMLIcon,
592+
GitLoader: GitLoaderIcon,
591593
};

0 commit comments

Comments
 (0)