Skip to content

Commit c4d7658

Browse files
committed
Shared: high level API for the shared extractor
This API makes it easy to create an extractor for simple use cases.
1 parent b6a7661 commit c4d7658

File tree

2 files changed

+157
-0
lines changed

2 files changed

+157
-0
lines changed

shared/tree-sitter-extractor/src/extractor.rs renamed to shared/tree-sitter-extractor/src/extractor/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ use std::path::Path;
99

1010
use tree_sitter::{Language, Node, Parser, Range, Tree};
1111

12+
pub mod simple;
13+
1214
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
1315
let (file_label, fresh) = writer.global_id(&trap::full_id_for_file(
1416
&file_paths::normalize_path(absolute_path),
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
use crate::trap;
2+
use rayon::prelude::*;
3+
use std::collections::HashMap;
4+
use std::ffi::{OsStr, OsString};
5+
use std::fs::File;
6+
use std::io::BufRead;
7+
use std::path::{Path, PathBuf};
8+
9+
use crate::diagnostics;
10+
use crate::node_types;
11+
12+
pub struct LanguageSpec {
13+
pub prefix: &'static str,
14+
pub ts_language: tree_sitter::Language,
15+
pub node_types: &'static str,
16+
pub file_extensions: Vec<OsString>,
17+
}
18+
19+
pub struct Extractor {
20+
pub prefix: String,
21+
pub languages: Vec<LanguageSpec>,
22+
pub trap_dir: PathBuf,
23+
pub source_archive_dir: PathBuf,
24+
pub file_list: PathBuf,
25+
}
26+
27+
impl Extractor {
28+
pub fn run(&self) -> std::io::Result<()> {
29+
let diagnostics = diagnostics::DiagnosticLoggers::new(&self.prefix);
30+
let mut main_thread_logger = diagnostics.logger();
31+
let num_threads = match crate::options::num_threads() {
32+
Ok(num) => num,
33+
Err(e) => {
34+
main_thread_logger.write(
35+
main_thread_logger
36+
.new_entry("configuration-error", "Configuration error")
37+
.message(
38+
"{}; defaulting to 1 thread.",
39+
&[diagnostics::MessageArg::Code(&e)],
40+
)
41+
.severity(diagnostics::Severity::Warning),
42+
);
43+
1
44+
}
45+
};
46+
tracing::info!(
47+
"Using {} {}",
48+
num_threads,
49+
if num_threads == 1 {
50+
"thread"
51+
} else {
52+
"threads"
53+
}
54+
);
55+
let trap_compression = match trap::Compression::from_env("CODEQL_QL_TRAP_COMPRESSION") {
56+
Ok(x) => x,
57+
Err(e) => {
58+
main_thread_logger.write(
59+
main_thread_logger
60+
.new_entry("configuration-error", "Configuration error")
61+
.message("{}; using gzip.", &[diagnostics::MessageArg::Code(&e)])
62+
.severity(diagnostics::Severity::Warning),
63+
);
64+
trap::Compression::Gzip
65+
}
66+
};
67+
drop(main_thread_logger);
68+
69+
rayon::ThreadPoolBuilder::new()
70+
.num_threads(num_threads)
71+
.build_global()
72+
.unwrap();
73+
74+
let file_list = File::open(&self.file_list)?;
75+
76+
let mut schemas = vec![];
77+
for lang in &self.languages {
78+
let schema = node_types::read_node_types_str(lang.prefix, lang.node_types)?;
79+
schemas.push(schema);
80+
}
81+
82+
// Construct a map from file extension -> LanguageSpec
83+
let mut file_extension_language_mapping: HashMap<&OsStr, Vec<usize>> = HashMap::new();
84+
for (i, lang) in self.languages.iter().enumerate() {
85+
for (j, _ext) in lang.file_extensions.iter().enumerate() {
86+
let indexes = file_extension_language_mapping
87+
.entry(&lang.file_extensions[j])
88+
.or_default();
89+
indexes.push(i);
90+
}
91+
}
92+
93+
let lines: std::io::Result<Vec<String>> =
94+
std::io::BufReader::new(file_list).lines().collect();
95+
let lines = lines?;
96+
97+
lines
98+
.par_iter()
99+
.try_for_each(|line| {
100+
let mut diagnostics_writer = diagnostics.logger();
101+
let path = PathBuf::from(line).canonicalize()?;
102+
let src_archive_file =
103+
crate::file_paths::path_for(&self.source_archive_dir, &path, "");
104+
let source = std::fs::read(&path)?;
105+
let mut trap_writer = trap::Writer::new();
106+
107+
match path.extension() {
108+
None => {
109+
tracing::error!(?path, "No extension found, skipping file.");
110+
}
111+
Some(ext) => {
112+
if let Some(indexes) = file_extension_language_mapping.get(ext) {
113+
for i in indexes {
114+
let lang = &self.languages[*i];
115+
crate::extractor::extract(
116+
lang.ts_language,
117+
"ruby",
118+
&schemas[*i],
119+
&mut diagnostics_writer,
120+
&mut trap_writer,
121+
&path,
122+
&source,
123+
&[],
124+
);
125+
std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
126+
std::fs::copy(&path, &src_archive_file)?;
127+
write_trap(&self.trap_dir, &path, &trap_writer, trap_compression)?;
128+
}
129+
} else {
130+
tracing::warn!(?path, "No language matches path, skipping file.");
131+
}
132+
}
133+
};
134+
Ok(()) as std::io::Result<()>
135+
})
136+
.expect("failed to extract files");
137+
138+
let path = PathBuf::from("extras");
139+
let mut trap_writer = trap::Writer::new();
140+
crate::extractor::populate_empty_location(&mut trap_writer);
141+
142+
write_trap(&self.trap_dir, &path, &trap_writer, trap_compression)
143+
}
144+
}
145+
146+
fn write_trap(
147+
trap_dir: &Path,
148+
path: &Path,
149+
trap_writer: &trap::Writer,
150+
trap_compression: trap::Compression,
151+
) -> std::io::Result<()> {
152+
let trap_file = crate::file_paths::path_for(trap_dir, path, trap_compression.extension());
153+
std::fs::create_dir_all(trap_file.parent().unwrap())?;
154+
trap_writer.write_to_file(&trap_file, trap_compression)
155+
}

0 commit comments

Comments
 (0)