Skip to content

Commit 96d6f65

Browse files
authored
Parse concrete syntax with a proper grammar (#741)
Concrete syntax has a hacky implementation based on string matching. If we want to expand the grammar to support constraints and other constructs, it's worthwhile to build a proper grammar. This will allow us to extend it in the future with things like:
- `where len(:[name]) > 2`, for example, to select only nodes with more than two children;
- `where :[name] matches regex`, to add regex constraints on template variables.

In this PR I essentially wrote the parser, but didn't touch the matching algorithm. In summary:
- Added a Pest grammar (concrete_syntax.pest)
- Added a parser that converts pattern strings into an actual AST
- Moved the matching algorithm to [interpreter.rs](https://github.com/danieltrt/piranha/blob/a993eb15fd9f2317f928c6a94c96577ffb73336d/src/models/concrete_syntax/interpreter.rs#L44)
1 parent 1149375 commit 96d6f65

File tree

12 files changed

+534
-107
lines changed

12 files changed

+534
-107
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ pyo3 = "0.20.0"
7070
pyo3-log = "0.9.0"
7171
glob = "0.3.1"
7272
lazy_static = "1.4.0"
73+
pest = "2.8.1"
74+
pest_derive = "2.8.1"
7375

7476
[features]
7577
extension-module = ["pyo3/extension-module"]

mod.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/*
2+
Copyright (c) 2023 Uber Technologies, Inc.
3+
4+
<p>Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
5+
except in compliance with the License. You may obtain a copy of the License at
6+
<p>http://www.apache.org/licenses/LICENSE-2.0
7+
8+
<p>Unless required by applicable law or agreed to in writing, software distributed under the
9+
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
10+
express or implied. See the License for the specific language governing permissions and
11+
limitations under the License.
12+
*/
13+
14+
//! Concrete Syntax Module
15+
//!
16+
//! This module provides concrete syntax pattern matching capabilities for Piranha.
17+
//! It includes a parser for concrete syntax patterns and an interpreter that can
18+
//! match these patterns against tree-sitter ASTs.

src/df/utils.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ pub fn get_capture_groups_from_matcher(node: &Rule) -> Vec<String> {
3434
match &node.query().pattern_type() {
3535
PatternType::Tsq => get_capture_groups_from_tsq(node.query().pattern()),
3636
PatternType::Regex => get_capture_groups_from_regex(node.query().extract_regex().unwrap()),
37+
PatternType::Cs => vec![],
3738
PatternType::Unknown => vec![],
3839
}
3940
}
@@ -47,6 +48,7 @@ pub fn get_capture_group_usage_from_matcher(node: &Rule) -> Vec<String> {
4748
match &node.query().pattern_type() {
4849
PatternType::Tsq => get_capture_group_usage_from_tsq(node.query().pattern()),
4950
PatternType::Regex => get_capture_group_usage_from_regex(node.query().pattern()),
51+
PatternType::Cs => vec![],
5052
PatternType::Unknown => vec![],
5153
}
5254
}

src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ use models::{
2121
extern crate lazy_static;
2222
pub mod df;
2323
pub mod models;
24-
2524
#[cfg(test)]
2625
mod tests;
2726
pub mod utilities;

src/models/capture_group_patterns.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ Copyright (c) 2023 Uber Technologies, Inc.
1111
limitations under the License.
1212
*/
1313

14-
use crate::models::concrete_syntax::get_all_matches_for_concrete_syntax;
14+
use crate::models::concrete_syntax::interpreter::get_all_matches_for_concrete_syntax;
15+
use crate::models::concrete_syntax::parser::ConcreteSyntax;
1516
use crate::{
1617
models::Validator,
1718
utilities::{
@@ -26,8 +27,6 @@ use serde_derive::Deserialize;
2627
use std::collections::HashMap;
2728
use tree_sitter::{Node, Query};
2829

29-
#[derive(Debug)]
30-
pub struct ConcreteSyntax(pub String);
3130
use super::{
3231
default_configs::{CONCRETE_SYNTAX_QUERY_PREFIX, REGEX_QUERY_PREFIX},
3332
matches::Match,
@@ -36,6 +35,7 @@ use super::{
3635
pub enum PatternType {
3736
Tsq,
3837
Regex,
38+
Cs,
3939
Unknown,
4040
}
4141

@@ -59,13 +59,14 @@ impl CGPattern {
5959

6060
pub(crate) fn extract_concrete_syntax(&self) -> ConcreteSyntax {
6161
let mut _val = &self.pattern()[CONCRETE_SYNTAX_QUERY_PREFIX.len()..];
62-
ConcreteSyntax(_val.to_string())
62+
ConcreteSyntax::parse(_val).unwrap()
6363
}
6464

6565
pub(crate) fn pattern_type(&self) -> PatternType {
6666
match self.0.as_str() {
6767
pattern if pattern.starts_with("rgx") => PatternType::Regex,
6868
pattern if pattern.trim().starts_with('(') => PatternType::Tsq,
69+
pattern if pattern.trim().starts_with("cs") => PatternType::Cs,
6970
_ => PatternType::Unknown,
7071
}
7172
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
// Concrete Syntax Grammar for Piranha
2+
concrete_syntax = { SOI ~ pattern ~ EOI }
3+
pattern = { (element)+ }
4+
5+
// An element is either a capture or literal text
6+
element = _{ capture | literal_text | whitespace }
7+
8+
// Captures: :[name], :[name+], :[name*], @name
9+
capture = { (":[" ~ identifier ~ capture_mode? ~ "]") | "@"~identifier } // FIXME: Should remove @ from the grammar, because literals may be parsed incorrectly
10+
capture_mode = { "+" | "*" }
11+
identifier = { (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")* }
12+
13+
// Literal text - single word/token without whitespace
14+
literal_text = { (!( ":[" | whitespace ) ~ ANY)+ }
15+
whitespace = _{ (" " | "\t" | "\r" | "\n")+ }

0 commit comments

Comments
 (0)