Skip to content

Commit 57331c2

Browse files
committed
feat: add MySQL code text splitter with unit tests.
1 parent b7d1831 commit 57331c2

File tree

3 files changed

+116
-0
lines changed

3 files changed

+116
-0
lines changed

libs/text-splitters/langchain_text_splitters/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,7 @@ class Language(str, Enum):
330330
ELIXIR = "elixir"
331331
POWERSHELL = "powershell"
332332
VISUALBASIC6 = "visualbasic6"
333+
MYSQL = "mysql"
333334

334335

335336
@dataclass(frozen=True)

libs/text-splitters/langchain_text_splitters/character.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,82 @@ def get_separators_for_language(language: Language) -> list[str]:
760760
" ",
761761
"",
762762
]
763+
if language == Language.MYSQL:
764+
return [
765+
# Split along definitions
766+
"\ncreate ",
767+
"\nCREATE ",
768+
"\nalter ",
769+
"\nALTER ",
770+
"\ndrop ",
771+
"\nDROP ",
772+
"\ntruncate ",
773+
"\nTRUNCATE ",
774+
"\nrename ",
775+
"\nRENAME ",
776+
"\nuse ",
777+
"\nUSE ",
778+
"\ndesc ",
779+
"\nDESC ",
780+
"\ndescribe ",
781+
"\nDESCRIBE ",
782+
# Split along control-flow and procedural code
783+
"\nbegin",
784+
"\nBEGIN",
785+
"\nloop ",
786+
"\nLOOP ",
787+
"\nif ",
788+
"\nIF ",
789+
"\nwhile ",
790+
"\nWHILE ",
791+
"\nelse ",
792+
"\nELSE ",
793+
"\nelseif ",
794+
"\nELSEIF ",
795+
"\nrepeat ",
796+
"\nREPEAT ",
797+
"\nhandler ",
798+
"\nHANDLER ",
799+
# Split along data manipulation
800+
"\nselect ",
801+
"\nSELECT ",
802+
"\ninsert ",
803+
"\nINSERT ",
804+
"\nupdate ",
805+
"\nUPDATE ",
806+
"\ndelete ",
807+
"\nDELETE ",
808+
"\nreplace ",
809+
"\nREPLACE ",
810+
"\nwith ",
811+
"\nWITH ",
812+
"\nshow ",
813+
"\nSHOW ",
814+
"\nexplain ",
815+
"\nEXPLAIN ",
816+
"\ncall ",
817+
"\nCALL ",
818+
# Split along permissions and transactions
819+
"\ngrant ",
820+
"\nGRANT ",
821+
"\nrevoke ",
822+
"\nREVOKE ",
823+
"\ncommit ",
824+
"\nCOMMIT ",
825+
"\nrollback ",
826+
"\nROLLBACK ",
827+
"\nstart transaction",
828+
"\nSTART TRANSACTION",
829+
"\nset autocommit",
830+
"\nSET AUTOCOMMIT",
831+
"\nDELIMITER ",
832+
"\ndelimiter ",
833+
# Split by the normal type of lines
834+
"\n\n",
835+
"\n",
836+
" ",
837+
"",
838+
]
763839

764840
if language in Language._value2member_map_:
765841
msg = f"Language {language} is not implemented yet!"

libs/text-splitters/tests/unit_tests/test_text_splitters.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3277,6 +3277,45 @@ def test_visualbasic6_code_splitter() -> None:
32773277
]
32783278

32793279

3280+
def test_mysql_code_splitter() -> None:
# Exercises the Language.MYSQL separators via
# RecursiveCharacterTextSplitter.from_language and compares the resulting
# chunks against a fixed expected list.
3281+
splitter = RecursiveCharacterTextSplitter.from_language(
3282+
Language.MYSQL,
3283+
chunk_size=CHUNK_SIZE,
3284+
# chunk_overlap=0: consecutive chunks are not expected to share text.
chunk_overlap=0,
3285+
)
3286+
# Sample script containing CREATE TABLE, INSERT and SELECT statements.
code = """
3287+
CREATE TABLE products (
3288+
id INT PRIMARY KEY,
3289+
name VARCHAR(100)
3290+
);
3291+
INSERT INTO products VALUES (1, 'Keyboard'), (2, 'Mouse');
3292+
SELECT * FROM products WHERE id = 1;
3293+
SELECT name FROM products ORDER BY name DESC;
3294+
"""
3295+
chunks = splitter.split_text(code)
3296+
# NOTE(review): the expected pieces below depend on the value of CHUNK_SIZE,
# which is not defined in this view - confirm against the module constant.
assert chunks == [
3297+
"CREATE TABLE",
3298+
"products (",
3299+
"id INT",
3300+
"PRIMARY KEY,",
3301+
"name",
3302+
"VARCHAR(100)",
3303+
");",
3304+
"INSERT INTO",
3305+
"products VALUES",
3306+
"(1,",
3307+
"'Keyboard'),",
3308+
"(2, 'Mouse');",
3309+
"SELECT * FROM",
3310+
"products WHERE",
3311+
"id = 1;",
3312+
"SELECT name",
3313+
"FROM products",
3314+
"ORDER BY name",
3315+
"DESC;",
3316+
]
3317+
3318+
32803319
def custom_iframe_extractor(iframe_tag: Tag) -> str:
32813320
iframe_src = iframe_tag.get("src", "")
32823321
return f"[iframe:{iframe_src}]({iframe_src})"

0 commit comments

Comments
 (0)