Skip to content

Commit 4065ba5

Browse files
committed
refactor: extract the language list into a separate mod
1 parent 1d125c2 commit 4065ba5

File tree

3 files changed

+305
-202
lines changed

3 files changed

+305
-202
lines changed

src/ops/functions/split_recursively.rs

Lines changed: 16 additions & 202 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,13 @@
11
use anyhow::{Context, anyhow};
2-
use log::{error, trace};
32
use regex::{Matches, Regex};
4-
use std::collections::HashSet;
53
use std::sync::LazyLock;
64
use std::{collections::HashMap, sync::Arc};
75
use unicase::UniCase;
86

97
use crate::ops::sdk::RangeValue;
8+
use crate::ops::shared::program_langs;
109
use crate::ops::shared::split::{Position, set_output_positions};
1110
use crate::{fields_value, ops::sdk::*};
12-
1311
#[derive(Serialize, Deserialize)]
1412
struct CustomLanguageSpec {
1513
language_name: String,
@@ -60,198 +58,9 @@ static DEFAULT_LANGUAGE_CONFIG: LazyLock<SimpleLanguageConfig> =
6058
.collect(),
6159
});
6260

63-
struct TreesitterLanguageConfig {
64-
name: String,
65-
tree_sitter_lang: tree_sitter::Language,
66-
terminal_node_kind_ids: HashSet<u16>,
67-
}
68-
69-
fn add_treesitter_language(
70-
output: &mut HashMap<UniCase<String>, Arc<TreesitterLanguageConfig>>,
71-
name: &'static str,
72-
aliases: impl IntoIterator<Item = &'static str>,
73-
lang_fn: impl Into<tree_sitter::Language>,
74-
terminal_node_kinds: impl IntoIterator<Item = &'static str>,
75-
) {
76-
let tree_sitter_lang: tree_sitter::Language = lang_fn.into();
77-
let terminal_node_kind_ids = terminal_node_kinds
78-
.into_iter()
79-
.filter_map(|kind| {
80-
let id = tree_sitter_lang.id_for_node_kind(kind, true);
81-
if id != 0 {
82-
trace!("Got id for node kind: `{kind}` -> {id}");
83-
Some(id)
84-
} else {
85-
error!("Failed in getting id for node kind: `{kind}`");
86-
None
87-
}
88-
})
89-
.collect();
90-
91-
let config = Arc::new(TreesitterLanguageConfig {
92-
name: name.to_string(),
93-
tree_sitter_lang,
94-
terminal_node_kind_ids,
95-
});
96-
for name in std::iter::once(name).chain(aliases.into_iter()) {
97-
if output.insert(name.into(), config.clone()).is_some() {
98-
panic!("Language `{name}` already exists");
99-
}
100-
}
101-
}
102-
103-
static TREE_SITTER_LANGUAGE_BY_LANG: LazyLock<
104-
HashMap<UniCase<String>, Arc<TreesitterLanguageConfig>>,
105-
> = LazyLock::new(|| {
106-
let mut map = HashMap::new();
107-
add_treesitter_language(&mut map, "C", [".c"], tree_sitter_c::LANGUAGE, []);
108-
add_treesitter_language(
109-
&mut map,
110-
"C++",
111-
[".cpp", ".cc", ".cxx", ".h", ".hpp", "cpp"],
112-
tree_sitter_cpp::LANGUAGE,
113-
[],
114-
);
115-
add_treesitter_language(
116-
&mut map,
117-
"C#",
118-
[".cs", "cs", "csharp"],
119-
tree_sitter_c_sharp::LANGUAGE,
120-
[],
121-
);
122-
add_treesitter_language(
123-
&mut map,
124-
"CSS",
125-
[".css", ".scss"],
126-
tree_sitter_css::LANGUAGE,
127-
[],
128-
);
129-
add_treesitter_language(
130-
&mut map,
131-
"Fortran",
132-
[".f", ".f90", ".f95", ".f03", "f", "f90", "f95", "f03"],
133-
tree_sitter_fortran::LANGUAGE,
134-
[],
135-
);
136-
add_treesitter_language(
137-
&mut map,
138-
"Go",
139-
[".go", "golang"],
140-
tree_sitter_go::LANGUAGE,
141-
[],
142-
);
143-
add_treesitter_language(
144-
&mut map,
145-
"HTML",
146-
[".html", ".htm"],
147-
tree_sitter_html::LANGUAGE,
148-
[],
149-
);
150-
add_treesitter_language(&mut map, "Java", [".java"], tree_sitter_java::LANGUAGE, []);
151-
add_treesitter_language(
152-
&mut map,
153-
"JavaScript",
154-
[".js", "js"],
155-
tree_sitter_javascript::LANGUAGE,
156-
[],
157-
);
158-
add_treesitter_language(&mut map, "JSON", [".json"], tree_sitter_json::LANGUAGE, []);
159-
add_treesitter_language(
160-
&mut map,
161-
"Kotlin",
162-
[".kt", ".kts"],
163-
tree_sitter_kotlin_ng::LANGUAGE,
164-
[],
165-
);
166-
add_treesitter_language(
167-
&mut map,
168-
"Markdown",
169-
[".md", ".mdx", "md"],
170-
tree_sitter_md::LANGUAGE,
171-
["inline", "indented_code_block", "fenced_code_block"],
172-
);
173-
add_treesitter_language(
174-
&mut map,
175-
"Pascal",
176-
[".pas", "pas", ".dpr", "dpr", "Delphi"],
177-
tree_sitter_pascal::LANGUAGE,
178-
[],
179-
);
180-
add_treesitter_language(&mut map, "PHP", [".php"], tree_sitter_php::LANGUAGE_PHP, []);
181-
add_treesitter_language(
182-
&mut map,
183-
"Python",
184-
[".py"],
185-
tree_sitter_python::LANGUAGE,
186-
[],
187-
);
188-
add_treesitter_language(&mut map, "R", [".r"], tree_sitter_r::LANGUAGE, []);
189-
add_treesitter_language(&mut map, "Ruby", [".rb"], tree_sitter_ruby::LANGUAGE, []);
190-
add_treesitter_language(
191-
&mut map,
192-
"Rust",
193-
[".rs", "rs"],
194-
tree_sitter_rust::LANGUAGE,
195-
[],
196-
);
197-
add_treesitter_language(
198-
&mut map,
199-
"Scala",
200-
[".scala"],
201-
tree_sitter_scala::LANGUAGE,
202-
[],
203-
);
204-
add_treesitter_language(&mut map, "SQL", [".sql"], tree_sitter_sequel::LANGUAGE, []);
205-
add_treesitter_language(
206-
&mut map,
207-
"Swift",
208-
[".swift"],
209-
tree_sitter_swift::LANGUAGE,
210-
[],
211-
);
212-
add_treesitter_language(
213-
&mut map,
214-
"TOML",
215-
[".toml"],
216-
tree_sitter_toml_ng::LANGUAGE,
217-
[],
218-
);
219-
add_treesitter_language(
220-
&mut map,
221-
"TSX",
222-
[".tsx"],
223-
tree_sitter_typescript::LANGUAGE_TSX,
224-
[],
225-
);
226-
add_treesitter_language(
227-
&mut map,
228-
"TypeScript",
229-
[".ts", "ts"],
230-
tree_sitter_typescript::LANGUAGE_TYPESCRIPT,
231-
[],
232-
);
233-
add_treesitter_language(&mut map, "XML", [".xml"], tree_sitter_xml::LANGUAGE_XML, []);
234-
add_treesitter_language(&mut map, "DTD", [".dtd"], tree_sitter_xml::LANGUAGE_DTD, []);
235-
add_treesitter_language(
236-
&mut map,
237-
"YAML",
238-
[".yaml", ".yml"],
239-
tree_sitter_yaml::LANGUAGE,
240-
[],
241-
);
242-
add_treesitter_language(
243-
&mut map,
244-
"Solidity",
245-
[".sol"],
246-
tree_sitter_solidity::LANGUAGE,
247-
[],
248-
);
249-
map
250-
});
251-
25261
enum ChunkKind<'t> {
25362
TreeSitterNode {
254-
lang_config: &'t TreesitterLanguageConfig,
63+
tree_sitter_info: &'t program_langs::TreeSitterLanguageInfo,
25564
node: tree_sitter::Node<'t>,
25665
},
25766
RegexpSepChunk {
@@ -325,7 +134,7 @@ impl<'t, 's: 't> Iterator for TextChunksIter<'t, 's> {
325134
}
326135

327136
struct TreeSitterNodeIter<'t, 's: 't> {
328-
lang_config: &'t TreesitterLanguageConfig,
137+
lang_config: &'t program_langs::TreeSitterLanguageInfo,
329138
full_text: &'s str,
330139
cursor: Option<tree_sitter::TreeCursor<'t>>,
331140
next_start_pos: usize,
@@ -378,7 +187,7 @@ impl<'t, 's: 't> Iterator for TreeSitterNodeIter<'t, 's> {
378187
full_text: self.full_text,
379188
range: RangeValue::new(node.start_byte(), node.end_byte()),
380189
kind: ChunkKind::TreeSitterNode {
381-
lang_config: self.lang_config,
190+
tree_sitter_info: self.lang_config,
382191
node,
383192
},
384193
})
@@ -531,7 +340,10 @@ impl<'t, 's: 't> RecursiveChunker<'s> {
531340
atom_collector.collect(current_chunk.range);
532341
} else {
533342
match current_chunk.kind {
534-
ChunkKind::TreeSitterNode { lang_config, node } => {
343+
ChunkKind::TreeSitterNode {
344+
tree_sitter_info: lang_config,
345+
node,
346+
} => {
535347
if !lang_config.terminal_node_kind_ids.contains(&node.kind_id()) {
536348
let mut cursor = node.walk();
537349
if cursor.goto_first_child() {
@@ -858,14 +670,16 @@ impl SimpleFunctionExecutor for Executor {
858670
lang_config,
859671
next_regexp_sep_id: 0,
860672
})?
861-
} else if let Some(lang_config) = TREE_SITTER_LANGUAGE_BY_LANG.get(&language) {
673+
} else if let Some(lang_info) = program_langs::get_language_info(&language)
674+
&& let Some(tree_sitter_info) = lang_info.treesitter_info.as_ref()
675+
{
862676
let mut parser = tree_sitter::Parser::new();
863-
parser.set_language(&lang_config.tree_sitter_lang)?;
864-
let tree = parser.parse(full_text.as_ref(), None).ok_or_else(|| {
865-
anyhow!("failed in parsing text in language: {}", lang_config.name)
866-
})?;
677+
parser.set_language(&tree_sitter_info.tree_sitter_lang)?;
678+
let tree = parser
679+
.parse(full_text.as_ref(), None)
680+
.ok_or_else(|| anyhow!("failed in parsing text in language: {}", lang_info.name))?;
867681
recursive_chunker.split_root_chunk(ChunkKind::TreeSitterNode {
868-
lang_config,
682+
tree_sitter_info,
869683
node: tree.root_node(),
870684
})?
871685
} else {

src/ops/shared/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,3 @@
11
pub mod postgres;
2+
pub mod program_langs;
23
pub mod split;

0 commit comments

Comments
 (0)