Skip to content

Commit 149722a

Browse files
authored
Merge pull request github#12881 from hmac/extractor-high-level-api
Shared: High level extractor API
2 parents c4b2bce + 5688da1 commit 149722a

File tree

7 files changed

+333
-228
lines changed

7 files changed

+333
-228
lines changed

.devcontainer/devcontainer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"extensions": [
3-
"rust-lang.rust",
3+
"rust-lang.rust-analyzer",
44
"bungcip.better-toml",
55
"github.vscode-codeql",
66
"hbenl.vscode-test-explorer",
.github/workflows/tree-sitter-extractor-test.yml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
name: Test tree-sitter-extractor
2+
3+
on:
4+
push:
5+
paths:
6+
- "shared/tree-sitter-extractor/**"
7+
- .github/workflows/tree-sitter-extractor-test.yml
8+
branches:
9+
- main
10+
- "rc/*"
11+
pull_request:
12+
paths:
13+
- "shared/tree-sitter-extractor/**"
14+
- .github/workflows/tree-sitter-extractor-test.yml
15+
branches:
16+
- main
17+
- "rc/*"
18+
19+
env:
20+
CARGO_TERM_COLOR: always
21+
22+
defaults:
23+
run:
24+
working-directory: shared/tree-sitter-extractor
25+
26+
jobs:
27+
test:
28+
steps:
29+
- uses: actions/checkout@v3
30+
- name: Check formatting
31+
run: cargo fmt --all -- --check
32+
- name: Run tests
33+
run: cargo test --verbose
34+
- name: Run clippy
35+
fmt:
36+
steps:
37+
- uses: actions/checkout@v3
38+
- name: Check formatting
39+
run: cargo fmt --check
40+
clippy:
41+
steps:
42+
- uses: actions/checkout@v3
43+
- name: Run clippy
44+
run: cargo clippy -- --no-deps -D warnings

ql/extractor/src/extractor.rs

Lines changed: 42 additions & 216 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
use clap::Args;
2-
use rayon::prelude::*;
3-
use std::fs;
4-
use std::io::BufRead;
5-
use std::path::{Path, PathBuf};
2+
use std::path::PathBuf;
63

7-
use codeql_extractor::{diagnostics, extractor, node_types, trap};
4+
use codeql_extractor::extractor::simple;
5+
use codeql_extractor::trap;
86

97
#[derive(Args)]
108
pub struct Options {
@@ -29,217 +27,45 @@ pub fn run(options: Options) -> std::io::Result<()> {
2927
.with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
3028
.init();
3129

32-
let diagnostics = diagnostics::DiagnosticLoggers::new("ql");
33-
let mut main_thread_logger = diagnostics.logger();
34-
let num_threads = match codeql_extractor::options::num_threads() {
35-
Ok(num) => num,
36-
Err(e) => {
37-
main_thread_logger.write(
38-
main_thread_logger
39-
.new_entry("configuration-error", "Configuration error")
40-
.message(
41-
"{}; defaulting to 1 thread.",
42-
&[diagnostics::MessageArg::Code(&e)],
43-
)
44-
.severity(diagnostics::Severity::Warning),
45-
);
46-
1
47-
}
48-
};
49-
tracing::info!(
50-
"Using {} {}",
51-
num_threads,
52-
if num_threads == 1 {
53-
"thread"
54-
} else {
55-
"threads"
56-
}
57-
);
58-
let trap_compression = match trap::Compression::from_env("CODEQL_QL_TRAP_COMPRESSION") {
59-
Ok(x) => x,
60-
Err(e) => {
61-
main_thread_logger.write(
62-
main_thread_logger
63-
.new_entry("configuration-error", "Configuration error")
64-
.message("{}; using gzip.", &[diagnostics::MessageArg::Code(&e)])
65-
.severity(diagnostics::Severity::Warning),
66-
);
67-
trap::Compression::Gzip
68-
}
30+
let extractor = simple::Extractor {
31+
prefix: "ql".to_string(),
32+
languages: vec![
33+
simple::LanguageSpec {
34+
prefix: "ql",
35+
ts_language: tree_sitter_ql::language(),
36+
node_types: tree_sitter_ql::NODE_TYPES,
37+
file_extensions: vec!["ql".into(), "qll".into()],
38+
},
39+
simple::LanguageSpec {
40+
prefix: "dbscheme",
41+
ts_language: tree_sitter_ql_dbscheme::language(),
42+
node_types: tree_sitter_ql_dbscheme::NODE_TYPES,
43+
file_extensions: vec!["dbscheme".into()],
44+
},
45+
simple::LanguageSpec {
46+
prefix: "yaml",
47+
ts_language: tree_sitter_ql_yaml::language(),
48+
node_types: tree_sitter_ql_yaml::NODE_TYPES,
49+
file_extensions: vec!["yml".into()],
50+
},
51+
simple::LanguageSpec {
52+
prefix: "json",
53+
ts_language: tree_sitter_json::language(),
54+
node_types: tree_sitter_json::NODE_TYPES,
55+
file_extensions: vec!["json".into(), "jsonl".into(), "jsonc".into()],
56+
},
57+
simple::LanguageSpec {
58+
prefix: "blame",
59+
ts_language: tree_sitter_blame::language(),
60+
node_types: tree_sitter_blame::NODE_TYPES,
61+
file_extensions: vec!["blame".into()],
62+
},
63+
],
64+
trap_dir: options.output_dir,
65+
trap_compression: trap::Compression::from_env("CODEQL_QL_TRAP_COMPRESSION"),
66+
source_archive_dir: options.source_archive_dir,
67+
file_list: options.file_list,
6968
};
70-
drop(main_thread_logger);
71-
72-
rayon::ThreadPoolBuilder::new()
73-
.num_threads(num_threads)
74-
.build_global()
75-
.unwrap();
76-
77-
let trap_dir = options.output_dir;
78-
let file_list = fs::File::open(options.file_list)?;
79-
let source_archive_dir = options.source_archive_dir;
8069

81-
let language = tree_sitter_ql::language();
82-
let dbscheme = tree_sitter_ql_dbscheme::language();
83-
let yaml = tree_sitter_ql_yaml::language();
84-
let blame = tree_sitter_blame::language();
85-
let json = tree_sitter_json::language();
86-
let schema = node_types::read_node_types_str("ql", tree_sitter_ql::NODE_TYPES)?;
87-
let dbscheme_schema =
88-
node_types::read_node_types_str("dbscheme", tree_sitter_ql_dbscheme::NODE_TYPES)?;
89-
let yaml_schema = node_types::read_node_types_str("yaml", tree_sitter_ql_yaml::NODE_TYPES)?;
90-
let blame_schema = node_types::read_node_types_str("blame", tree_sitter_blame::NODE_TYPES)?;
91-
let json_schema = node_types::read_node_types_str("json", tree_sitter_json::NODE_TYPES)?;
92-
93-
let lines: std::io::Result<Vec<String>> = std::io::BufReader::new(file_list).lines().collect();
94-
let lines = lines?;
95-
lines
96-
.par_iter()
97-
.try_for_each(|line| {
98-
// only consider files that end with .ql/.qll/.dbscheme/qlpack.yml
99-
// TODO: This is a bad fix, wait for the post-merge discussion in https://github.com/github/codeql/pull/7444 to be resolved
100-
if !line.ends_with(".ql")
101-
&& !line.ends_with(".qll")
102-
&& !line.ends_with(".dbscheme")
103-
&& !line.ends_with("qlpack.yml")
104-
&& !line.ends_with(".blame")
105-
&& !line.ends_with(".json")
106-
&& !line.ends_with(".jsonl")
107-
&& !line.ends_with(".jsonc")
108-
{
109-
return Ok(());
110-
}
111-
let path = PathBuf::from(line).canonicalize()?;
112-
let src_archive_file = path_for(&source_archive_dir, &path, "");
113-
let source = std::fs::read(&path)?;
114-
let code_ranges = vec![];
115-
let mut trap_writer = trap::Writer::new();
116-
let mut diagnostics_writer = diagnostics.logger();
117-
if line.ends_with(".dbscheme") {
118-
extractor::extract(
119-
dbscheme,
120-
"dbscheme",
121-
&dbscheme_schema,
122-
&mut diagnostics_writer,
123-
&mut trap_writer,
124-
&path,
125-
&source,
126-
&code_ranges,
127-
)
128-
} else if line.ends_with("qlpack.yml") {
129-
extractor::extract(
130-
yaml,
131-
"yaml",
132-
&yaml_schema,
133-
&mut diagnostics_writer,
134-
&mut trap_writer,
135-
&path,
136-
&source,
137-
&code_ranges,
138-
)
139-
} else if line.ends_with(".json")
140-
|| line.ends_with(".jsonl")
141-
|| line.ends_with(".jsonc")
142-
{
143-
extractor::extract(
144-
json,
145-
"json",
146-
&json_schema,
147-
&mut diagnostics_writer,
148-
&mut trap_writer,
149-
&path,
150-
&source,
151-
&code_ranges,
152-
)
153-
} else if line.ends_with(".blame") {
154-
extractor::extract(
155-
blame,
156-
"blame",
157-
&blame_schema,
158-
&mut diagnostics_writer,
159-
&mut trap_writer,
160-
&path,
161-
&source,
162-
&code_ranges,
163-
)
164-
} else {
165-
extractor::extract(
166-
language,
167-
"ql",
168-
&schema,
169-
&mut diagnostics_writer,
170-
&mut trap_writer,
171-
&path,
172-
&source,
173-
&code_ranges,
174-
)
175-
}
176-
std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
177-
std::fs::copy(&path, &src_archive_file)?;
178-
write_trap(&trap_dir, path, &trap_writer, trap_compression)
179-
})
180-
.expect("failed to extract files");
181-
182-
let path = PathBuf::from("extras");
183-
let mut trap_writer = trap::Writer::new();
184-
extractor::populate_empty_location(&mut trap_writer);
185-
write_trap(&trap_dir, path, &trap_writer, trap_compression)
186-
}
187-
188-
fn write_trap(
189-
trap_dir: &Path,
190-
path: PathBuf,
191-
trap_writer: &trap::Writer,
192-
trap_compression: trap::Compression,
193-
) -> std::io::Result<()> {
194-
let trap_file = path_for(trap_dir, &path, trap_compression.extension());
195-
std::fs::create_dir_all(trap_file.parent().unwrap())?;
196-
trap_writer.write_to_file(&trap_file, trap_compression)
197-
}
198-
199-
fn path_for(dir: &Path, path: &Path, ext: &str) -> PathBuf {
200-
let mut result = PathBuf::from(dir);
201-
for component in path.components() {
202-
match component {
203-
std::path::Component::Prefix(prefix) => match prefix.kind() {
204-
std::path::Prefix::Disk(letter) | std::path::Prefix::VerbatimDisk(letter) => {
205-
result.push(format!("{}_", letter as char))
206-
}
207-
std::path::Prefix::Verbatim(x) | std::path::Prefix::DeviceNS(x) => {
208-
result.push(x);
209-
}
210-
std::path::Prefix::UNC(server, share)
211-
| std::path::Prefix::VerbatimUNC(server, share) => {
212-
result.push("unc");
213-
result.push(server);
214-
result.push(share);
215-
}
216-
},
217-
std::path::Component::RootDir => {
218-
// skip
219-
}
220-
std::path::Component::Normal(_) => {
221-
result.push(component);
222-
}
223-
std::path::Component::CurDir => {
224-
// skip
225-
}
226-
std::path::Component::ParentDir => {
227-
result.pop();
228-
}
229-
}
230-
}
231-
if !ext.is_empty() {
232-
match result.extension() {
233-
Some(x) => {
234-
let mut new_ext = x.to_os_string();
235-
new_ext.push(".");
236-
new_ext.push(ext);
237-
result.set_extension(new_ext);
238-
}
239-
None => {
240-
result.set_extension(ext);
241-
}
242-
}
243-
}
244-
result
70+
extractor.run()
24571
}

shared/tree-sitter-extractor/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,7 @@ serde = { version = "1.0", features = ["derive"] }
1616
serde_json = "1.0"
1717
chrono = { version = "0.4.19", features = ["serde"] }
1818
num_cpus = "1.14.0"
19+
20+
[dev-dependencies]
21+
tree-sitter-ql = { git = "https://github.com/tree-sitter/tree-sitter-ql" }
22+
rand = "0.8.5"

shared/tree-sitter-extractor/src/extractor.rs renamed to shared/tree-sitter-extractor/src/extractor/mod.rs

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@ use crate::node_types::{self, EntryKind, Field, NodeTypeMap, Storage, TypeName};
44
use crate::trap;
55
use std::collections::BTreeMap as Map;
66
use std::collections::BTreeSet as Set;
7-
use std::fmt;
87
use std::path::Path;
98

109
use tree_sitter::{Language, Node, Parser, Range, Tree};
1110

11+
pub mod simple;
12+
1213
pub fn populate_file(writer: &mut trap::Writer, absolute_path: &Path) -> trap::Label {
1314
let (file_label, fresh) = writer.global_id(&trap::full_id_for_file(
1415
&file_paths::normalize_path(absolute_path),
@@ -634,13 +635,3 @@ fn traverse(tree: &Tree, visitor: &mut Visitor) {
634635
}
635636
}
636637
}
637-
638-
// Numeric indices.
639-
#[derive(Debug, Copy, Clone)]
640-
struct Index(usize);
641-
642-
impl fmt::Display for Index {
643-
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
644-
write!(f, "{}", self.0)
645-
}
646-
}

0 commit comments

Comments
 (0)