|
1 | 1 | use anyhow::{Context, anyhow}; |
2 | | -use log::{error, trace}; |
3 | 2 | use regex::{Matches, Regex}; |
4 | | -use std::collections::HashSet; |
5 | 3 | use std::sync::LazyLock; |
6 | 4 | use std::{collections::HashMap, sync::Arc}; |
7 | 5 | use unicase::UniCase; |
8 | 6 |
|
9 | 7 | use crate::ops::sdk::RangeValue; |
| 8 | +use crate::ops::shared::program_langs; |
10 | 9 | use crate::ops::shared::split::{Position, set_output_positions}; |
11 | 10 | use crate::{fields_value, ops::sdk::*}; |
12 | | - |
13 | 11 | #[derive(Serialize, Deserialize)] |
14 | 12 | struct CustomLanguageSpec { |
15 | 13 | language_name: String, |
@@ -60,198 +58,9 @@ static DEFAULT_LANGUAGE_CONFIG: LazyLock<SimpleLanguageConfig> = |
60 | 58 | .collect(), |
61 | 59 | }); |
62 | 60 |
|
63 | | -struct TreesitterLanguageConfig { |
64 | | - name: String, |
65 | | - tree_sitter_lang: tree_sitter::Language, |
66 | | - terminal_node_kind_ids: HashSet<u16>, |
67 | | -} |
68 | | - |
69 | | -fn add_treesitter_language( |
70 | | - output: &mut HashMap<UniCase<String>, Arc<TreesitterLanguageConfig>>, |
71 | | - name: &'static str, |
72 | | - aliases: impl IntoIterator<Item = &'static str>, |
73 | | - lang_fn: impl Into<tree_sitter::Language>, |
74 | | - terminal_node_kinds: impl IntoIterator<Item = &'static str>, |
75 | | -) { |
76 | | - let tree_sitter_lang: tree_sitter::Language = lang_fn.into(); |
77 | | - let terminal_node_kind_ids = terminal_node_kinds |
78 | | - .into_iter() |
79 | | - .filter_map(|kind| { |
80 | | - let id = tree_sitter_lang.id_for_node_kind(kind, true); |
81 | | - if id != 0 { |
82 | | - trace!("Got id for node kind: `{kind}` -> {id}"); |
83 | | - Some(id) |
84 | | - } else { |
85 | | - error!("Failed in getting id for node kind: `{kind}`"); |
86 | | - None |
87 | | - } |
88 | | - }) |
89 | | - .collect(); |
90 | | - |
91 | | - let config = Arc::new(TreesitterLanguageConfig { |
92 | | - name: name.to_string(), |
93 | | - tree_sitter_lang, |
94 | | - terminal_node_kind_ids, |
95 | | - }); |
96 | | - for name in std::iter::once(name).chain(aliases.into_iter()) { |
97 | | - if output.insert(name.into(), config.clone()).is_some() { |
98 | | - panic!("Language `{name}` already exists"); |
99 | | - } |
100 | | - } |
101 | | -} |
102 | | - |
103 | | -static TREE_SITTER_LANGUAGE_BY_LANG: LazyLock< |
104 | | - HashMap<UniCase<String>, Arc<TreesitterLanguageConfig>>, |
105 | | -> = LazyLock::new(|| { |
106 | | - let mut map = HashMap::new(); |
107 | | - add_treesitter_language(&mut map, "C", [".c"], tree_sitter_c::LANGUAGE, []); |
108 | | - add_treesitter_language( |
109 | | - &mut map, |
110 | | - "C++", |
111 | | - [".cpp", ".cc", ".cxx", ".h", ".hpp", "cpp"], |
112 | | - tree_sitter_cpp::LANGUAGE, |
113 | | - [], |
114 | | - ); |
115 | | - add_treesitter_language( |
116 | | - &mut map, |
117 | | - "C#", |
118 | | - [".cs", "cs", "csharp"], |
119 | | - tree_sitter_c_sharp::LANGUAGE, |
120 | | - [], |
121 | | - ); |
122 | | - add_treesitter_language( |
123 | | - &mut map, |
124 | | - "CSS", |
125 | | - [".css", ".scss"], |
126 | | - tree_sitter_css::LANGUAGE, |
127 | | - [], |
128 | | - ); |
129 | | - add_treesitter_language( |
130 | | - &mut map, |
131 | | - "Fortran", |
132 | | - [".f", ".f90", ".f95", ".f03", "f", "f90", "f95", "f03"], |
133 | | - tree_sitter_fortran::LANGUAGE, |
134 | | - [], |
135 | | - ); |
136 | | - add_treesitter_language( |
137 | | - &mut map, |
138 | | - "Go", |
139 | | - [".go", "golang"], |
140 | | - tree_sitter_go::LANGUAGE, |
141 | | - [], |
142 | | - ); |
143 | | - add_treesitter_language( |
144 | | - &mut map, |
145 | | - "HTML", |
146 | | - [".html", ".htm"], |
147 | | - tree_sitter_html::LANGUAGE, |
148 | | - [], |
149 | | - ); |
150 | | - add_treesitter_language(&mut map, "Java", [".java"], tree_sitter_java::LANGUAGE, []); |
151 | | - add_treesitter_language( |
152 | | - &mut map, |
153 | | - "JavaScript", |
154 | | - [".js", "js"], |
155 | | - tree_sitter_javascript::LANGUAGE, |
156 | | - [], |
157 | | - ); |
158 | | - add_treesitter_language(&mut map, "JSON", [".json"], tree_sitter_json::LANGUAGE, []); |
159 | | - add_treesitter_language( |
160 | | - &mut map, |
161 | | - "Kotlin", |
162 | | - [".kt", ".kts"], |
163 | | - tree_sitter_kotlin_ng::LANGUAGE, |
164 | | - [], |
165 | | - ); |
166 | | - add_treesitter_language( |
167 | | - &mut map, |
168 | | - "Markdown", |
169 | | - [".md", ".mdx", "md"], |
170 | | - tree_sitter_md::LANGUAGE, |
171 | | - ["inline", "indented_code_block", "fenced_code_block"], |
172 | | - ); |
173 | | - add_treesitter_language( |
174 | | - &mut map, |
175 | | - "Pascal", |
176 | | - [".pas", "pas", ".dpr", "dpr", "Delphi"], |
177 | | - tree_sitter_pascal::LANGUAGE, |
178 | | - [], |
179 | | - ); |
180 | | - add_treesitter_language(&mut map, "PHP", [".php"], tree_sitter_php::LANGUAGE_PHP, []); |
181 | | - add_treesitter_language( |
182 | | - &mut map, |
183 | | - "Python", |
184 | | - [".py"], |
185 | | - tree_sitter_python::LANGUAGE, |
186 | | - [], |
187 | | - ); |
188 | | - add_treesitter_language(&mut map, "R", [".r"], tree_sitter_r::LANGUAGE, []); |
189 | | - add_treesitter_language(&mut map, "Ruby", [".rb"], tree_sitter_ruby::LANGUAGE, []); |
190 | | - add_treesitter_language( |
191 | | - &mut map, |
192 | | - "Rust", |
193 | | - [".rs", "rs"], |
194 | | - tree_sitter_rust::LANGUAGE, |
195 | | - [], |
196 | | - ); |
197 | | - add_treesitter_language( |
198 | | - &mut map, |
199 | | - "Scala", |
200 | | - [".scala"], |
201 | | - tree_sitter_scala::LANGUAGE, |
202 | | - [], |
203 | | - ); |
204 | | - add_treesitter_language(&mut map, "SQL", [".sql"], tree_sitter_sequel::LANGUAGE, []); |
205 | | - add_treesitter_language( |
206 | | - &mut map, |
207 | | - "Swift", |
208 | | - [".swift"], |
209 | | - tree_sitter_swift::LANGUAGE, |
210 | | - [], |
211 | | - ); |
212 | | - add_treesitter_language( |
213 | | - &mut map, |
214 | | - "TOML", |
215 | | - [".toml"], |
216 | | - tree_sitter_toml_ng::LANGUAGE, |
217 | | - [], |
218 | | - ); |
219 | | - add_treesitter_language( |
220 | | - &mut map, |
221 | | - "TSX", |
222 | | - [".tsx"], |
223 | | - tree_sitter_typescript::LANGUAGE_TSX, |
224 | | - [], |
225 | | - ); |
226 | | - add_treesitter_language( |
227 | | - &mut map, |
228 | | - "TypeScript", |
229 | | - [".ts", "ts"], |
230 | | - tree_sitter_typescript::LANGUAGE_TYPESCRIPT, |
231 | | - [], |
232 | | - ); |
233 | | - add_treesitter_language(&mut map, "XML", [".xml"], tree_sitter_xml::LANGUAGE_XML, []); |
234 | | - add_treesitter_language(&mut map, "DTD", [".dtd"], tree_sitter_xml::LANGUAGE_DTD, []); |
235 | | - add_treesitter_language( |
236 | | - &mut map, |
237 | | - "YAML", |
238 | | - [".yaml", ".yml"], |
239 | | - tree_sitter_yaml::LANGUAGE, |
240 | | - [], |
241 | | - ); |
242 | | - add_treesitter_language( |
243 | | - &mut map, |
244 | | - "Solidity", |
245 | | - [".sol"], |
246 | | - tree_sitter_solidity::LANGUAGE, |
247 | | - [], |
248 | | - ); |
249 | | - map |
250 | | -}); |
251 | | - |
252 | 61 | enum ChunkKind<'t> { |
253 | 62 | TreeSitterNode { |
254 | | - lang_config: &'t TreesitterLanguageConfig, |
| 63 | + tree_sitter_info: &'t program_langs::TreeSitterLanguageInfo, |
255 | 64 | node: tree_sitter::Node<'t>, |
256 | 65 | }, |
257 | 66 | RegexpSepChunk { |
@@ -325,7 +134,7 @@ impl<'t, 's: 't> Iterator for TextChunksIter<'t, 's> { |
325 | 134 | } |
326 | 135 |
|
327 | 136 | struct TreeSitterNodeIter<'t, 's: 't> { |
328 | | - lang_config: &'t TreesitterLanguageConfig, |
| 137 | + lang_config: &'t program_langs::TreeSitterLanguageInfo, |
329 | 138 | full_text: &'s str, |
330 | 139 | cursor: Option<tree_sitter::TreeCursor<'t>>, |
331 | 140 | next_start_pos: usize, |
@@ -378,7 +187,7 @@ impl<'t, 's: 't> Iterator for TreeSitterNodeIter<'t, 's> { |
378 | 187 | full_text: self.full_text, |
379 | 188 | range: RangeValue::new(node.start_byte(), node.end_byte()), |
380 | 189 | kind: ChunkKind::TreeSitterNode { |
381 | | - lang_config: self.lang_config, |
| 190 | + tree_sitter_info: self.lang_config, |
382 | 191 | node, |
383 | 192 | }, |
384 | 193 | }) |
@@ -531,7 +340,10 @@ impl<'t, 's: 't> RecursiveChunker<'s> { |
531 | 340 | atom_collector.collect(current_chunk.range); |
532 | 341 | } else { |
533 | 342 | match current_chunk.kind { |
534 | | - ChunkKind::TreeSitterNode { lang_config, node } => { |
| 343 | + ChunkKind::TreeSitterNode { |
| 344 | + tree_sitter_info: lang_config, |
| 345 | + node, |
| 346 | + } => { |
535 | 347 | if !lang_config.terminal_node_kind_ids.contains(&node.kind_id()) { |
536 | 348 | let mut cursor = node.walk(); |
537 | 349 | if cursor.goto_first_child() { |
@@ -858,14 +670,16 @@ impl SimpleFunctionExecutor for Executor { |
858 | 670 | lang_config, |
859 | 671 | next_regexp_sep_id: 0, |
860 | 672 | })? |
861 | | - } else if let Some(lang_config) = TREE_SITTER_LANGUAGE_BY_LANG.get(&language) { |
| 673 | + } else if let Some(lang_info) = program_langs::get_language_info(&language) |
| 674 | + && let Some(tree_sitter_info) = lang_info.treesitter_info.as_ref() |
| 675 | + { |
862 | 676 | let mut parser = tree_sitter::Parser::new(); |
863 | | - parser.set_language(&lang_config.tree_sitter_lang)?; |
864 | | - let tree = parser.parse(full_text.as_ref(), None).ok_or_else(|| { |
865 | | - anyhow!("failed in parsing text in language: {}", lang_config.name) |
866 | | - })?; |
| 677 | + parser.set_language(&tree_sitter_info.tree_sitter_lang)?; |
| 678 | + let tree = parser |
| 679 | + .parse(full_text.as_ref(), None) |
| 680 | + .ok_or_else(|| anyhow!("failed in parsing text in language: {}", lang_info.name))?; |
867 | 681 | recursive_chunker.split_root_chunk(ChunkKind::TreeSitterNode { |
868 | | - lang_config, |
| 682 | + tree_sitter_info, |
869 | 683 | node: tree.root_node(), |
870 | 684 | })? |
871 | 685 | } else { |
|
0 commit comments