Skip to content

Commit 96e9dfc

Browse files
authored
Merge pull request github#13969 from hmac/shared-extractor-globs
Shared extractor: support file path globs
2 parents 6cf9968 + b76842a commit 96e9dfc

File tree

7 files changed

+226
-84
lines changed

7 files changed

+226
-84
lines changed

ql/Cargo.lock

Lines changed: 42 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ql/extractor/src/extractor.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,25 +34,25 @@ pub fn run(options: Options) -> std::io::Result<()> {
3434
prefix: "ql",
3535
ts_language: tree_sitter_ql::language(),
3636
node_types: tree_sitter_ql::NODE_TYPES,
37-
file_extensions: vec!["ql".into(), "qll".into()],
37+
file_globs: vec!["*.ql".into(), "*.qll".into()],
3838
},
3939
simple::LanguageSpec {
4040
prefix: "dbscheme",
4141
ts_language: tree_sitter_ql_dbscheme::language(),
4242
node_types: tree_sitter_ql_dbscheme::NODE_TYPES,
43-
file_extensions: vec!["dbscheme".into()],
43+
file_globs: vec!["*.dbscheme".into()],
4444
},
4545
simple::LanguageSpec {
4646
prefix: "json",
4747
ts_language: tree_sitter_json::language(),
4848
node_types: tree_sitter_json::NODE_TYPES,
49-
file_extensions: vec!["json".into(), "jsonl".into(), "jsonc".into()],
49+
file_globs: vec!["*.json".into(), "*.jsonl".into(), "*.jsonc".into()],
5050
},
5151
simple::LanguageSpec {
5252
prefix: "blame",
5353
ts_language: tree_sitter_blame::language(),
5454
node_types: tree_sitter_blame::NODE_TYPES,
55-
file_extensions: vec!["blame".into()],
55+
file_globs: vec!["*.blame".into()],
5656
},
5757
],
5858
trap_dir: options.output_dir,

shared/tree-sitter-extractor/Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
[package]
22
name = "codeql-extractor"
3-
version = "0.1.0"
3+
version = "0.2.0"
44
edition = "2021"
55
authors = ["GitHub"]
66

77
[dependencies]
88
flate2 = "1.0"
9+
globset = "0.4"
910
tree-sitter = "0.20"
1011
tracing = "0.1"
1112
rayon = "1.5.0"
@@ -19,4 +20,5 @@ num_cpus = "1.14.0"
1920

2021
[dev-dependencies]
2122
tree-sitter-ql = { git = "https://github.com/tree-sitter/tree-sitter-ql" }
23+
tree-sitter-json = {git = "https://github.com/tausbn/tree-sitter-json" }
2224
rand = "0.8.5"

shared/tree-sitter-extractor/src/extractor/simple.rs

Lines changed: 40 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use crate::trap;
2+
use globset::{GlobBuilder, GlobSetBuilder};
23
use rayon::prelude::*;
3-
use std::collections::HashMap;
4-
use std::ffi::{OsStr, OsString};
54
use std::fs::File;
65
use std::io::BufRead;
76
use std::path::{Path, PathBuf};
@@ -13,7 +12,7 @@ pub struct LanguageSpec {
1312
pub prefix: &'static str,
1413
pub ts_language: tree_sitter::Language,
1514
pub node_types: &'static str,
16-
pub file_extensions: Vec<OsString>,
15+
pub file_globs: Vec<String>,
1716
}
1817

1918
pub struct Extractor {
@@ -83,16 +82,26 @@ impl Extractor {
8382
schemas.push(schema);
8483
}
8584

86-
// Construct a map from file extension -> LanguageSpec
87-
let mut file_extension_language_mapping: HashMap<&OsStr, Vec<usize>> = HashMap::new();
88-
for (i, lang) in self.languages.iter().enumerate() {
89-
for (j, _ext) in lang.file_extensions.iter().enumerate() {
90-
let indexes = file_extension_language_mapping
91-
.entry(&lang.file_extensions[j])
92-
.or_default();
93-
indexes.push(i);
85+
// Construct a single globset containing all language globs,
86+
// and a mapping from glob index to language index.
87+
let (globset, glob_language_mapping) = {
88+
let mut builder = GlobSetBuilder::new();
89+
let mut glob_lang_mapping = vec![];
90+
for (i, lang) in self.languages.iter().enumerate() {
91+
for glob_str in &lang.file_globs {
92+
let glob = GlobBuilder::new(glob_str)
93+
.literal_separator(true)
94+
.build()
95+
.expect("invalid glob");
96+
builder.add(glob);
97+
glob_lang_mapping.push(i);
98+
}
9499
}
95-
}
100+
(
101+
builder.build().expect("failed to build globset"),
102+
glob_lang_mapping,
103+
)
104+
};
96105

97106
let lines: std::io::Result<Vec<String>> =
98107
std::io::BufReader::new(file_list).lines().collect();
@@ -108,18 +117,29 @@ impl Extractor {
108117
let source = std::fs::read(&path)?;
109118
let mut trap_writer = trap::Writer::new();
110119

111-
match path.extension() {
120+
match path.file_name() {
112121
None => {
113-
tracing::error!(?path, "No extension found, skipping file.");
122+
tracing::error!(?path, "No file name found, skipping file.");
114123
}
115-
Some(ext) => {
116-
if let Some(indexes) = file_extension_language_mapping.get(ext) {
117-
for i in indexes {
118-
let lang = &self.languages[*i];
124+
Some(filename) => {
125+
let matches = globset.matches(filename);
126+
if matches.is_empty() {
127+
tracing::error!(?path, "No matching language found, skipping file.");
128+
} else {
129+
let mut languages_processed = vec![false; self.languages.len()];
130+
131+
for m in matches {
132+
let i = glob_language_mapping[m];
133+
if languages_processed[i] {
134+
continue;
135+
}
136+
languages_processed[i] = true;
137+
let lang = &self.languages[i];
138+
119139
crate::extractor::extract(
120140
lang.ts_language,
121141
lang.prefix,
122-
&schemas[*i],
142+
&schemas[i],
123143
&mut diagnostics_writer,
124144
&mut trap_writer,
125145
&path,
@@ -130,11 +150,9 @@ impl Extractor {
130150
std::fs::copy(&path, &src_archive_file)?;
131151
write_trap(&self.trap_dir, &path, &trap_writer, trap_compression)?;
132152
}
133-
} else {
134-
tracing::warn!(?path, "No language matches path, skipping file.");
135153
}
136154
}
137-
};
155+
}
138156
Ok(()) as std::io::Result<()>
139157
})
140158
.expect("failed to extract files");
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
use std::io::{Read, Write};
2+
use std::{
3+
fs::File,
4+
path::{Path, PathBuf},
5+
};
6+
7+
use flate2::read::GzDecoder;
8+
9+
/// Locations of the on-disk fixture produced by `create_source_dir`.
///
/// All paths are absolute: they are derived from the canonicalized
/// temporary root directory.
#[derive(Debug, Clone)]
pub struct SourceArchive {
    /// Canonicalized temporary directory that contains everything else.
    pub root_dir: PathBuf,
    /// Text file (`files.txt` under the root) listing one source path per line.
    pub file_list: PathBuf,
    /// Directory (`src` under the root) that the fixture source files are written into.
    pub source_archive_dir: PathBuf,
    /// Directory (`trap` under the root) created to receive TRAP output.
    pub trap_dir: PathBuf,
}
15+
16+
pub fn create_source_dir(files: Vec<(&'static str, &'static str)>) -> SourceArchive {
17+
let root_dir = std::env::temp_dir().join(format!("codeql-extractor-{}", rand::random::<u16>()));
18+
std::fs::create_dir_all(&root_dir).unwrap();
19+
let root_dir = root_dir
20+
.canonicalize()
21+
.expect("failed to canonicalize root directory");
22+
23+
let trap_dir = create_dir(&root_dir, "trap");
24+
let source_archive_dir = create_dir(&root_dir, "src");
25+
26+
let mut file_paths = vec![];
27+
for (filename, contents) in files {
28+
let path = source_archive_dir.join(filename);
29+
let mut file = File::create(&path).unwrap();
30+
file.write_all(contents.as_bytes()).unwrap();
31+
file_paths.push(PathBuf::from(path));
32+
}
33+
34+
let file_list = {
35+
let path = root_dir.join("files.txt");
36+
let mut file = File::create(&path).unwrap();
37+
for path in file_paths {
38+
file.write_all(path.as_path().display().to_string().as_bytes())
39+
.unwrap();
40+
file.write_all(b"\n").unwrap();
41+
}
42+
path
43+
};
44+
45+
SourceArchive {
46+
root_dir,
47+
file_list,
48+
source_archive_dir,
49+
trap_dir,
50+
}
51+
}
52+
53+
pub fn expect_trap_file(root_dir: &Path, filename: &str) {
54+
let root_dir_relative = {
55+
let r = root_dir.display().to_string();
56+
r.strip_prefix("/").unwrap().to_string()
57+
};
58+
let trap_gz = root_dir
59+
.join("trap")
60+
.join(root_dir_relative)
61+
.join("src")
62+
.join(format!("{filename}.trap.gz"));
63+
let mut decoder = GzDecoder::new(File::open(trap_gz).unwrap());
64+
let mut first_line = [0; 31];
65+
decoder.read_exact(&mut first_line).unwrap();
66+
assert_eq!(first_line.as_slice(), b"// Auto-generated TRAP file for");
67+
}
68+
69+
/// Creates the directory `root/path` (including any missing parents) and
/// returns its full path.
///
/// # Panics
///
/// Panics with "Failed to create directory" if the directory cannot be created.
fn create_dir(root: &Path, path: impl AsRef<Path>) -> PathBuf {
    let full_path = root.join(path);
    std::fs::create_dir_all(&full_path).expect("Failed to create directory");
    // `join` already yields an owned `PathBuf`; no conversion is needed.
    full_path
}

0 commit comments

Comments
 (0)