Skip to content

Commit 7e2abf2

Browse files
committed
Shared: Support glob patterns in shared extractor
Replace the `file_extensions` field with `file_globs`, which supports UNIX-style glob patterns powered by the `globset` crate. This allows files with no extension (e.g. Dockerfiles) to be extracted, by specifying a glob such as `*Dockerfile`. One surprising aspect of this change is that the globs match against the whole path, rather than just the file name. This is a breaking change.
1 parent 08d44c1 commit 7e2abf2

File tree

5 files changed

+191
-91
lines changed

5 files changed

+191
-91
lines changed

shared/tree-sitter-extractor/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ authors = ["GitHub"]
66

77
[dependencies]
88
flate2 = "1.0"
9+
globset = "0.4"
910
tree-sitter = "0.20"
1011
tracing = "0.1"
1112
rayon = "1.5.0"
@@ -19,4 +20,5 @@ num_cpus = "1.14.0"
1920

2021
[dev-dependencies]
2122
tree-sitter-ql = { git = "https://github.com/tree-sitter/tree-sitter-ql" }
23+
tree-sitter-json = {git = "https://github.com/tausbn/tree-sitter-json" }
2224
rand = "0.8.5"

shared/tree-sitter-extractor/src/extractor/simple.rs

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use crate::trap;
2+
use globset::{Glob, GlobSetBuilder};
23
use rayon::prelude::*;
3-
use std::collections::HashMap;
4-
use std::ffi::{OsStr, OsString};
54
use std::fs::File;
65
use std::io::BufRead;
76
use std::path::{Path, PathBuf};
@@ -13,7 +12,7 @@ pub struct LanguageSpec {
1312
pub prefix: &'static str,
1413
pub ts_language: tree_sitter::Language,
1514
pub node_types: &'static str,
16-
pub file_extensions: Vec<OsString>,
15+
pub file_globs: Vec<String>,
1716
}
1817

1918
pub struct Extractor {
@@ -83,16 +82,23 @@ impl Extractor {
8382
schemas.push(schema);
8483
}
8584

86-
// Construct a map from file extension -> LanguageSpec
87-
let mut file_extension_language_mapping: HashMap<&OsStr, Vec<usize>> = HashMap::new();
88-
for (i, lang) in self.languages.iter().enumerate() {
89-
for (j, _ext) in lang.file_extensions.iter().enumerate() {
90-
let indexes = file_extension_language_mapping
91-
.entry(&lang.file_extensions[j])
92-
.or_default();
93-
indexes.push(i);
85+
// Construct a single globset containing all language globs,
86+
// and a mapping from glob index to language index.
87+
let (globset, glob_language_mapping) = {
88+
let mut builder = GlobSetBuilder::new();
89+
let mut glob_lang_mapping = vec![];
90+
for (i, lang) in self.languages.iter().enumerate() {
91+
for glob_str in &lang.file_globs {
92+
let glob = Glob::new(glob_str).expect("invalid glob");
93+
builder.add(glob);
94+
glob_lang_mapping.push(i);
95+
}
9496
}
95-
}
97+
(
98+
builder.build().expect("failed to build globset"),
99+
glob_lang_mapping,
100+
)
101+
};
96102

97103
let lines: std::io::Result<Vec<String>> =
98104
std::io::BufReader::new(file_list).lines().collect();
@@ -108,33 +114,42 @@ impl Extractor {
108114
let source = std::fs::read(&path)?;
109115
let mut trap_writer = trap::Writer::new();
110116

111-
match path.extension() {
112-
None => {
113-
tracing::error!(?path, "No extension found, skipping file.");
114-
}
115-
Some(ext) => {
116-
if let Some(indexes) = file_extension_language_mapping.get(ext) {
117-
for i in indexes {
118-
let lang = &self.languages[*i];
119-
crate::extractor::extract(
120-
lang.ts_language,
121-
lang.prefix,
122-
&schemas[*i],
123-
&mut diagnostics_writer,
124-
&mut trap_writer,
125-
&path,
126-
&source,
127-
&[],
128-
);
129-
std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
130-
std::fs::copy(&path, &src_archive_file)?;
131-
write_trap(&self.trap_dir, &path, &trap_writer, trap_compression)?;
132-
}
133-
} else {
134-
tracing::warn!(?path, "No language matches path, skipping file.");
117+
let matches = globset.matches(&path);
118+
if matches.is_empty() {
119+
tracing::error!(?path, "No matching language found, skipping file.");
120+
} else {
121+
let mut languages_processed = {
122+
// No known extractor uses more than 8 languages.
123+
let mut v = Vec::with_capacity(8);
124+
for _ in &self.languages {
125+
v.push(false);
135126
}
127+
v
128+
};
129+
130+
for m in matches {
131+
let i = glob_language_mapping[m];
132+
if languages_processed[i] {
133+
continue;
134+
}
135+
languages_processed[i] = true;
136+
let lang = &self.languages[i];
137+
138+
crate::extractor::extract(
139+
lang.ts_language,
140+
lang.prefix,
141+
&schemas[i],
142+
&mut diagnostics_writer,
143+
&mut trap_writer,
144+
&path,
145+
&source,
146+
&[],
147+
);
148+
std::fs::create_dir_all(src_archive_file.parent().unwrap())?;
149+
std::fs::copy(&path, &src_archive_file)?;
150+
write_trap(&self.trap_dir, &path, &trap_writer, trap_compression)?;
136151
}
137-
};
152+
}
138153
Ok(()) as std::io::Result<()>
139154
})
140155
.expect("failed to extract files");
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
use std::io::{Read, Write};
2+
use std::{
3+
fs::File,
4+
path::{Path, PathBuf},
5+
};
6+
7+
use flate2::read::GzDecoder;
8+
9+
pub struct SourceArchive {
10+
pub root_dir: PathBuf,
11+
pub file_list: PathBuf,
12+
pub source_archive_dir: PathBuf,
13+
pub trap_dir: PathBuf,
14+
}
15+
16+
pub fn create_source_dir(files: Vec<(&'static str, &'static str)>) -> SourceArchive {
17+
let root_dir = std::env::temp_dir().join(format!("codeql-extractor-{}", rand::random::<u16>()));
18+
std::fs::create_dir_all(&root_dir).unwrap();
19+
let root_dir = root_dir
20+
.canonicalize()
21+
.expect("failed to canonicalize root directory");
22+
23+
let trap_dir = create_dir(&root_dir, "trap");
24+
let source_archive_dir = create_dir(&root_dir, "src");
25+
26+
let mut file_paths = vec![];
27+
for (filename, contents) in files {
28+
let path = source_archive_dir.join(filename);
29+
let mut file = File::create(&path).unwrap();
30+
file.write_all(contents.as_bytes()).unwrap();
31+
file_paths.push(PathBuf::from(path));
32+
}
33+
34+
let file_list = {
35+
let path = root_dir.join("files.txt");
36+
let mut file = File::create(&path).unwrap();
37+
for path in file_paths {
38+
file.write_all(path.as_path().display().to_string().as_bytes())
39+
.unwrap();
40+
file.write_all(b"\n").unwrap();
41+
}
42+
path
43+
};
44+
45+
SourceArchive {
46+
root_dir,
47+
file_list,
48+
source_archive_dir,
49+
trap_dir,
50+
}
51+
}
52+
53+
pub fn expect_trap_file(root_dir: &Path, filename: &str) {
54+
let root_dir_relative = {
55+
let r = root_dir.display().to_string();
56+
r.strip_prefix("/").unwrap().to_string()
57+
};
58+
let trap_gz = root_dir
59+
.join("trap")
60+
.join(root_dir_relative)
61+
.join("src")
62+
.join(format!("{filename}.trap.gz"));
63+
let mut decoder = GzDecoder::new(File::open(trap_gz).unwrap());
64+
let mut first_line = [0; 31];
65+
decoder.read_exact(&mut first_line).unwrap();
66+
assert_eq!(first_line.as_slice(), b"// Auto-generated TRAP file for");
67+
}
68+
69+
fn create_dir(root: &Path, path: impl AsRef<Path>) -> PathBuf {
70+
let full_path = root.join(path);
71+
std::fs::create_dir_all(&full_path).expect("Failed to create directory");
72+
full_path.into()
73+
}
Lines changed: 13 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
1-
use std::fs::File;
2-
use std::io::{Read, Write};
3-
use std::path::{Path, PathBuf};
4-
51
use codeql_extractor::extractor::simple;
62
use codeql_extractor::trap;
7-
use flate2::read::GzDecoder;
3+
84
use tree_sitter_ql;
95

10-
/// An very simple happy-path test.
6+
mod common;
7+
use common::{create_source_dir, expect_trap_file, SourceArchive};
8+
9+
/// A very simple happy-path test.
1110
/// We run the extractor using the tree-sitter-ql grammar and a single source file,
1211
/// and check that we get a reasonable-looking trap file in the expected location.
1312
#[test]
@@ -16,31 +15,15 @@ fn simple_extractor() {
1615
prefix: "ql",
1716
ts_language: tree_sitter_ql::language(),
1817
node_types: tree_sitter_ql::NODE_TYPES,
19-
file_extensions: vec!["qll".into()],
20-
};
21-
22-
let root_dir = std::env::temp_dir().join(format!("codeql-extractor-{}", rand::random::<u16>()));
23-
std::fs::create_dir_all(&root_dir).unwrap();
24-
25-
let trap_dir = create_dir(&root_dir, "trap");
26-
let source_archive_dir = create_dir(&root_dir, "src");
27-
28-
// Create foo.qll source file
29-
let foo_qll = {
30-
let path = source_archive_dir.join("foo.qll");
31-
let mut file = File::create(&path).expect("Failed to create src/foo.qll");
32-
file.write_all(b"predicate p(int a) { a = 1 }")
33-
.expect("Failed to write to foo.qll");
34-
PathBuf::from(path)
18+
file_globs: vec!["*.qll".into()],
3519
};
3620

37-
let file_list = {
38-
let path = root_dir.join("files.txt");
39-
let mut file = File::create(&path).expect("Failed to create files.txt");
40-
file.write_all(foo_qll.as_path().display().to_string().as_bytes())
41-
.expect("Failed to write to files.txt");
42-
path
43-
};
21+
let SourceArchive {
22+
root_dir,
23+
file_list,
24+
source_archive_dir,
25+
trap_dir,
26+
} = create_source_dir(vec![("foo.qll", "predicate p(int a) { a = 1 }")]);
4427

4528
let extractor = simple::Extractor {
4629
prefix: "ql".to_string(),
@@ -51,31 +34,7 @@ fn simple_extractor() {
5134
trap_compression: Ok(trap::Compression::Gzip),
5235
};
5336

54-
// The extractor should run successfully
5537
extractor.run().unwrap();
5638

57-
// Check for the presence of $root/trap/$root/src/foo.qll
58-
{
59-
let root_dir_relative = {
60-
let r = root_dir.as_path().display().to_string();
61-
r.strip_prefix("/").unwrap().to_string()
62-
};
63-
let foo_qll_trap_gz = root_dir
64-
.join("trap")
65-
.join(root_dir_relative)
66-
.join("src/foo.qll.trap.gz");
67-
let mut decoder =
68-
GzDecoder::new(File::open(foo_qll_trap_gz).expect("Failed to open foo.qll.trap.gz"));
69-
let mut first_line = [0; 31];
70-
decoder
71-
.read_exact(&mut first_line)
72-
.expect("Failed to read from foo.qll.trap.gz");
73-
assert_eq!(first_line.as_slice(), b"// Auto-generated TRAP file for");
74-
}
75-
}
76-
77-
fn create_dir(root: &Path, path: impl AsRef<Path>) -> PathBuf {
78-
let full_path = root.join(path);
79-
std::fs::create_dir_all(&full_path).expect("Failed to create directory");
80-
full_path.into()
39+
expect_trap_file(&root_dir, "foo.qll");
8140
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
use codeql_extractor::extractor::simple;
2+
use codeql_extractor::trap;
3+
use tree_sitter_ql;
4+
5+
mod common;
6+
use common::{create_source_dir, expect_trap_file, SourceArchive};
7+
8+
/// Like the `simple_extractor` test but with multiple languages.
9+
/// This is in a separate crate because the simple extractor API sets up a
10+
/// global thread pool, and therefore can't be called twice in the same process.
11+
#[test]
12+
fn multiple_language_extractor() {
13+
let lang_ql = simple::LanguageSpec {
14+
prefix: "ql",
15+
ts_language: tree_sitter_ql::language(),
16+
node_types: tree_sitter_ql::NODE_TYPES,
17+
file_globs: vec!["*.qll".into()],
18+
};
19+
let lang_json = simple::LanguageSpec {
20+
prefix: "json",
21+
ts_language: tree_sitter_json::language(),
22+
node_types: tree_sitter_json::NODE_TYPES,
23+
file_globs: vec!["*.json".into(), "*Jsonfile".into()],
24+
};
25+
26+
let SourceArchive {
27+
root_dir,
28+
file_list,
29+
source_archive_dir,
30+
trap_dir,
31+
} = create_source_dir(vec![
32+
("foo.qll", "predicate p(int a) { a = 1 }"),
33+
("bar.json", "{\"a\": 1}"),
34+
("Jsonfile", "{\"b\": 2}"),
35+
]);
36+
37+
let extractor = simple::Extractor {
38+
prefix: "ql".to_string(),
39+
languages: vec![lang_ql, lang_json],
40+
trap_dir,
41+
source_archive_dir,
42+
file_list,
43+
trap_compression: Ok(trap::Compression::Gzip),
44+
};
45+
46+
extractor.run().unwrap();
47+
48+
expect_trap_file(&root_dir, "foo.qll");
49+
expect_trap_file(&root_dir, "bar.json");
50+
expect_trap_file(&root_dir, "Jsonfile");
51+
}

0 commit comments

Comments
 (0)