11from dotenv import load_dotenv
22
33import cocoindex
4+ import os
5+
class ExtractExtension(cocoindex.op.FunctionSpec):
    """Function spec for extracting the file extension from a filename."""
8+
@cocoindex.op.executor_class()
class ExtractExtensionExecutor:
    """Executor that carries out the ExtractExtension function."""

    spec: ExtractExtension

    def __call__(self, filename: str) -> str:
        """Return the extension of ``filename``, leading dot included.

        The result is an empty string when the name has no extension.
        """
        _root, extension = os.path.splitext(filename)
        return extension
417
518def code_to_embedding (text : cocoindex .DataSlice ) -> cocoindex .DataSlice :
619 """
@@ -17,14 +30,15 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
1730 """
1831 data_scope ["files" ] = flow_builder .add_source (
1932 cocoindex .sources .LocalFile (path = "../.." ,
20- included_patterns = ["*.py" ],
21- excluded_patterns = [".*" ]))
33+ included_patterns = ["*.py" , "*.rs" , "*.toml" , "*.md" , "*.mdx" ],
34+ excluded_patterns = [".*" , "target" , "**/node_modules" ]))
2235 code_embeddings = data_scope .add_collector ()
2336
2437 with data_scope ["files" ].row () as file :
38+ file ["extension" ] = file ["filename" ].transform (ExtractExtension ())
2539 file ["chunks" ] = file ["content" ].transform (
2640 cocoindex .functions .SplitRecursively (),
27- language = "python" , chunk_size = 1000 , chunk_overlap = 300 )
41+ language = file [ "extension" ] , chunk_size = 1000 , chunk_overlap = 300 )
2842 with file ["chunks" ].row () as chunk :
2943 chunk ["embedding" ] = chunk ["text" ].call (code_to_embedding )
3044 code_embeddings .collect (filename = file ["filename" ], location = chunk ["location" ],
0 commit comments