5 files changed: +16 −44 lines

@@ -32,6 +32,7 @@ dependencies = [
    "playwright==1.43.0",
    "google==3.0.0",
    "undetected-playwright==0.3.0",
+    "semchunk==1.0.1",
]

license = "MIT"
@@ -81,4 +82,4 @@ dev-dependencies = [
    "pytest-mock==3.14.0",
    "-e file:.[burr]",
    "-e file:.[docs]",
-]
+]
@@ -30,9 +30,6 @@ anyio==4.3.0
    # via openai
    # via starlette
    # via watchfiles
-async-timeout==4.0.3
-    # via aiohttp
-    # via langchain
attrs==23.2.0
    # via aiohttp
    # via jsonschema
@@ -51,7 +48,6 @@ botocore==1.34.113
    # via boto3
    # via s3transfer
burr==0.19.1
-    # via burr
    # via scrapegraphai
cachetools==5.3.3
    # via google-auth
@@ -67,13 +63,6 @@ click==8.1.7
    # via streamlit
    # via typer
    # via uvicorn
-colorama==0.4.6
-    # via click
-    # via loguru
-    # via pytest
-    # via sphinx
-    # via tqdm
-    # via uvicorn
contourpy==1.2.1
    # via matplotlib
cycler==0.12.1
@@ -93,9 +82,6 @@ docutils==0.19
    # via sphinx
email-validator==2.1.1
    # via fastapi
-exceptiongroup==1.2.1
-    # via anyio
-    # via pytest
faiss-cpu==1.8.0
    # via scrapegraphai
fastapi==0.111.0
@@ -150,7 +136,6 @@ graphviz==0.20.3
    # via scrapegraphai
greenlet==3.0.3
    # via playwright
-    # via sqlalchemy
groq==0.8.0
    # via langchain-groq
grpcio==1.64.0
@@ -388,6 +373,8 @@ rsa==4.9
    # via google-auth
s3transfer==0.10.1
    # via boto3
+semchunk==1.0.1
+    # via scrapegraphai
sf-hamilton==1.63.0
    # via burr
shellingham==1.5.4
@@ -443,8 +430,6 @@ tokenizers==0.19.1
    # via anthropic
toml==0.10.2
    # via streamlit
-tomli==2.0.1
-    # via pytest
toolz==0.12.1
    # via altair
tornado==6.4
@@ -454,12 +439,11 @@ tqdm==4.66.4
    # via huggingface-hub
    # via openai
    # via scrapegraphai
+    # via semchunk
typer==0.12.3
    # via fastapi-cli
typing-extensions==4.12.0
-    # via altair
    # via anthropic
-    # via anyio
    # via fastapi
    # via fastapi-pagination
    # via google-generativeai
@@ -474,7 +458,6 @@ typing-extensions==4.12.0
    # via streamlit
    # via typer
    # via typing-inspect
-    # via uvicorn
typing-inspect==0.9.0
    # via dataclasses-json
    # via sf-hamilton
@@ -492,13 +475,11 @@ urllib3==1.26.18
uvicorn==0.29.0
    # via burr
    # via fastapi
-watchdog==4.0.1
-    # via streamlit
+uvloop==0.19.0
+    # via uvicorn
watchfiles==0.21.0
    # via uvicorn
websockets==12.0
    # via uvicorn
-win32-setctime==1.1.0
-    # via loguru
yarl==1.9.4
    # via aiohttp
@@ -22,9 +22,6 @@ anyio==4.3.0
    # via groq
    # via httpx
    # via openai
-async-timeout==4.0.3
-    # via aiohttp
-    # via langchain
attrs==23.2.0
    # via aiohttp
beautifulsoup4==4.12.3
@@ -43,8 +40,6 @@ certifi==2024.2.2
    # via requests
charset-normalizer==3.3.2
    # via requests
-colorama==0.4.6
-    # via tqdm
dataclasses-json==0.6.6
    # via langchain
    # via langchain-community
@@ -54,8 +49,6 @@ distro==1.9.0
    # via anthropic
    # via groq
    # via openai
-exceptiongroup==1.2.1
-    # via anyio
faiss-cpu==1.8.0
    # via scrapegraphai
filelock==3.14.0
@@ -94,7 +87,6 @@ graphviz==0.20.3
    # via scrapegraphai
greenlet==3.0.3
    # via playwright
-    # via sqlalchemy
groq==0.8.0
    # via langchain-groq
grpcio==1.64.0
@@ -246,6 +238,8 @@ rsa==4.9
    # via google-auth
s3transfer==0.10.1
    # via boto3
+semchunk==1.0.1
+    # via scrapegraphai
six==1.16.0
    # via python-dateutil
sniffio==1.3.1
@@ -273,9 +267,9 @@ tqdm==4.66.4
    # via huggingface-hub
    # via openai
    # via scrapegraphai
+    # via semchunk
typing-extensions==4.12.0
    # via anthropic
-    # via anyio
    # via google-generativeai
    # via groq
    # via huggingface-hub
@@ -18,3 +18,4 @@ playwright==1.43.0
langchain-aws == 0.1.2
yahoo-search-py == 0.3
undetected-playwright == 0.3.0
+semchunk == 1.0.1
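
All four dependency files above pin the same new package, semchunk==1.0.1. As a quick, hypothetical sanity check (not part of this PR), one can confirm that the pin resolves and exposes the chunk() helper that the node change below switches to:

# Hypothetical smoke test (not from the PR): verify the newly pinned
# semchunk==1.0.1 imports and exposes the chunk() entry point used below.
import semchunk

assert callable(semchunk.chunk)
print("semchunk.chunk is available")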
"""

from typing import List, Optional
-
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from semchunk import chunk
from langchain_community.document_transformers import Html2TextTransformer
from ..utils.logging import get_logger
from .base_node import BaseNode
@@ -67,20 +66,16 @@ def execute(self, state: dict) -> dict:

        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]
-
-        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-            chunk_size=self.node_config.get("chunk_size", 4096),
-            chunk_overlap=0,
-        )
-
        # Parse the document
        docs_transformed = input_data[0]
        if self.parse_html:
            docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
            docs_transformed = docs_transformed[0]

-        chunks = text_splitter.split_text(docs_transformed.page_content)
-
+        chunks = chunk(text=docs_transformed.page_content,
+                       chunk_size=self.node_config.get("chunk_size", 4096),
+                       token_counter=lambda x: len(x.split()),
+                       memoize=False)
        state.update({self.output[0]: chunks})

        return state
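
For readers unfamiliar with semchunk, the standalone sketch below shows the same chunk() call pattern outside the node. The sample text and the chunk_size of 64 are illustrative only; it assumes semchunk==1.0.1, where chunk() takes a text, a chunk_size, a token_counter callable, and a memoize flag, and returns a list of string chunks.

# Minimal sketch of the semchunk-based splitting adopted above (illustrative values).
from semchunk import chunk

sample_text = "ScrapeGraphAI turns web pages into structured data. " * 50  # hypothetical input

chunks = chunk(text=sample_text,
               chunk_size=64,                           # the node defaults to 4096
               token_counter=lambda x: len(x.split()),  # same whitespace-based counter as the node
               memoize=False)

# Each chunk stays within chunk_size as measured by the whitespace token counter.
print(len(chunks), max(len(c.split()) for c in chunks))

One behavioral note on the swap: the removed RecursiveCharacterTextSplitter.from_tiktoken_encoder counted tokens with tiktoken, whereas the new call approximates token counts by splitting on whitespace, so chunk boundaries for the same chunk_size will not match the old ones exactly.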