diff --git a/src/ast/comments.rs b/src/ast/comments.rs new file mode 100644 index 000000000..33cefb51d --- /dev/null +++ b/src/ast/comments.rs @@ -0,0 +1,280 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Provides a representation of source code comments in parsed SQL code. + +#[cfg(not(feature = "std"))] +use alloc::{string::String, vec::Vec}; + +use core::{ + ops::{Bound, Deref, RangeBounds}, + slice, +}; + +use crate::tokenizer::{Location, Span}; + +/// An opaque container for comments from parsed SQL source code. +#[derive(Default, Debug)] +pub struct Comments(Vec); + +impl Comments { + pub(crate) fn push(&mut self, comment: CommentWithSpan) { + debug_assert!(self + .0 + .last() + .map(|last| last.span < comment.span) + .unwrap_or(true)); + self.0.push(comment); + } + + /// Finds comments starting within the given location range. The order of + /// the iterator reflects the order of the comments as encountered in the parsed + /// source code. + pub fn find>(&self, range: R) -> Iter<'_> { + let (start, end) = ( + self.start_index(range.start_bound()), + self.end_index(range.end_bound()), + ); + // ~ in case the user specified a reversed range + Iter(if start <= end { + self.0[start..end].iter() + } else { + self.0[0..0].iter() + }) + } + + /// Find the index of the first comment starting "before" the given location. 
+ /// + /// The returned index is _inclusive._ + fn start_index(&self, location: Bound<&Location>) -> usize { + match location { + Bound::Included(location) => { + match self.0.binary_search_by(|c| c.span.start.cmp(location)) { + Ok(i) => i, + Err(i) => i, + } + } + Bound::Excluded(location) => { + match self.0.binary_search_by(|c| c.span.start.cmp(location)) { + Ok(i) => i + 1, + Err(i) => i, + } + } + Bound::Unbounded => 0, + } + } + + /// Find the index of the first comment starting "after" the given location. + /// + /// The returned index is _exclusive._ + fn end_index(&self, location: Bound<&Location>) -> usize { + match location { + Bound::Included(location) => { + match self.0.binary_search_by(|c| c.span.start.cmp(location)) { + Ok(i) => i + 1, + Err(i) => i, + } + } + Bound::Excluded(location) => { + match self.0.binary_search_by(|c| c.span.start.cmp(location)) { + Ok(i) => i, + Err(i) => i, + } + } + Bound::Unbounded => self.0.len(), + } + } +} + +impl From for Vec { + fn from(comments: Comments) -> Self { + comments.0 + } +} + +/// A source code comment with information of its entire span. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct CommentWithSpan { + /// The source code comment itself + pub comment: Comment, + /// The span of the comment including its markers + pub span: Span, +} + +impl Deref for CommentWithSpan { + type Target = Comment; + + fn deref(&self) -> &Self::Target { + &self.comment + } +} + +/// A unified type of the different source code comment formats. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Comment { + /// A single line comment, typically introduced with a prefix and spanning + /// until end-of-line or end-of-file in the source code. + /// + /// Note: `content` will include the terminating new-line character, if any. + SingleLine { content: String, prefix: String }, + + /// A multi-line comment, typically enclosed in `/* .. */` markers. The + /// string represents the content excluding the markers. 
+ MultiLine(String), +} + +impl Comment { + /// Retrieves the content of the comment as string slice. + pub fn as_str(&self) -> &str { + match self { + Comment::SingleLine { content, prefix: _ } => content.as_str(), + Comment::MultiLine(content) => content.as_str(), + } + } +} + +impl Deref for Comment { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + +/// An opaque iterator implementation over comments served by [Comments::find]. +pub struct Iter<'a>(slice::Iter<'a, CommentWithSpan>); + +impl<'a> Iterator for Iter<'a> { + type Item = &'a CommentWithSpan; + + fn next(&mut self) -> Option { + self.0.next() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_find() { + let comments = { + // ``` + // -- abc + // /* hello */--, world + // /* def + // ghi + // jkl + // */ + // ``` + let mut c = Comments(Vec::new()); + c.push(CommentWithSpan { + comment: Comment::SingleLine { + content: " abc".into(), + prefix: "--".into(), + }, + span: Span::new((1, 1).into(), (1, 7).into()), + }); + c.push(CommentWithSpan { + comment: Comment::MultiLine(" hello ".into()), + span: Span::new((2, 3).into(), (2, 14).into()), + }); + c.push(CommentWithSpan { + comment: Comment::SingleLine { + content: ", world".into(), + prefix: "--".into(), + }, + span: Span::new((2, 14).into(), (2, 21).into()), + }); + c.push(CommentWithSpan { + comment: Comment::MultiLine(" def\n ghi\n jkl\n".into()), + span: Span::new((3, 3).into(), (7, 1).into()), + }); + c + }; + + fn find>(comments: &Comments, range: R) -> Vec<&str> { + comments.find(range).map(|c| c.as_str()).collect::>() + } + + // ~ end-points only -------------------------------------------------- + assert_eq!(find(&comments, ..Location::new(0, 0)), Vec::<&str>::new()); + assert_eq!(find(&comments, ..Location::new(2, 1)), vec![" abc"]); + assert_eq!(find(&comments, ..Location::new(2, 3)), vec![" abc"]); + assert_eq!( + find(&comments, ..=Location::new(2, 3)), + vec![" abc", " hello "] + 
); + assert_eq!( + find(&comments, ..=Location::new(2, 3)), + vec![" abc", " hello "] + ); + assert_eq!( + find(&comments, ..Location::new(2, 15)), + vec![" abc", " hello ", ", world"] + ); + + // ~ start-points only ------------------------------------------------ + assert_eq!( + find(&comments, Location::new(1000, 1000)..), + Vec::<&str>::new() + ); + assert_eq!( + find(&comments, Location::new(2, 14)..), + vec![", world", " def\n ghi\n jkl\n"] + ); + assert_eq!( + find(&comments, Location::new(2, 15)..), + vec![" def\n ghi\n jkl\n"] + ); + assert_eq!( + find(&comments, Location::new(0, 0)..), + vec![" abc", " hello ", ", world", " def\n ghi\n jkl\n"] + ); + assert_eq!( + find(&comments, Location::new(1, 1)..), + vec![" abc", " hello ", ", world", " def\n ghi\n jkl\n"] + ); + + // ~ ranges ----------------------------------------------------------- + assert_eq!( + find(&comments, Location::new(2, 1)..Location::new(1, 1)), + Vec::<&str>::new() + ); + assert_eq!( + find(&comments, Location::new(1, 1)..Location::new(2, 3)), + vec![" abc"] + ); + assert_eq!( + find(&comments, Location::new(1, 1)..=Location::new(2, 3)), + vec![" abc", " hello "] + ); + assert_eq!( + find(&comments, Location::new(1, 1)..=Location::new(2, 10)), + vec![" abc", " hello "] + ); + assert_eq!( + find(&comments, Location::new(1, 1)..=Location::new(2, 14)), + vec![" abc", " hello ", ", world"] + ); + assert_eq!( + find(&comments, Location::new(1, 1)..Location::new(2, 15)), + vec![" abc", " hello ", ", world"] + ); + + // ~ find everything -------------------------------------------------- + assert_eq!( + find(&comments, ..), + vec![" abc", " hello ", ", world", " def\n ghi\n jkl\n"] + ); + } +} diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 63a7bebc7..eede3a14e 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -133,6 +133,7 @@ mod query; mod spans; pub use spans::Spanned; +pub mod comments; mod trigger; mod value; diff --git a/src/ast/spans.rs b/src/ast/spans.rs index 
994cee972..7a3953def 100644 --- a/src/ast/spans.rs +++ b/src/ast/spans.rs @@ -28,7 +28,7 @@ use core::iter; use crate::tokenizer::Span; use super::{ - dcl::SecondaryRoles, value::ValueWithSpan, AccessExpr, AlterColumnOperation, + comments, dcl::SecondaryRoles, value::ValueWithSpan, AccessExpr, AlterColumnOperation, AlterIndexOperation, AlterTableOperation, Analyze, Array, Assignment, AssignmentTarget, AttachedToken, BeginEndStatements, CaseStatement, CloseCursor, ClusteredIndex, ColumnDef, ColumnOption, ColumnOptionDef, ConditionalStatementBlock, ConditionalStatements, @@ -2469,6 +2469,12 @@ impl Spanned for OutputClause { } } +impl Spanned for comments::CommentWithSpan { + fn span(&self) -> Span { + self.span + } +} + #[cfg(test)] pub mod tests { use crate::dialect::{Dialect, GenericDialect, SnowflakeDialect}; diff --git a/src/parser/mod.rs b/src/parser/mod.rs index f3daf628a..36e3b8898 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -32,14 +32,17 @@ use recursion::RecursionCounter; use IsLateral::*; use IsOptional::*; -use crate::ast::helpers::{ - key_value_options::{ - KeyValueOption, KeyValueOptionKind, KeyValueOptions, KeyValueOptionsDelimiter, - }, - stmt_create_table::{CreateTableBuilder, CreateTableConfiguration}, -}; use crate::ast::Statement::CreatePolicy; use crate::ast::*; +use crate::ast::{ + comments, + helpers::{ + key_value_options::{ + KeyValueOption, KeyValueOptionKind, KeyValueOptions, KeyValueOptionsDelimiter, + }, + stmt_create_table::{CreateTableBuilder, CreateTableConfiguration}, + }, +}; use crate::dialect::*; use crate::keywords::{Keyword, ALL_KEYWORDS}; use crate::tokenizer::*; @@ -529,6 +532,44 @@ impl<'a> Parser<'a> { Parser::new(dialect).try_with_sql(sql)?.parse_statements() } + /// Parses the given `sql` into an Abstract Syntax Tree (AST), returning + /// also encountered source code comments. + /// + /// See [Parser::parse_sql]. 
+ pub fn parse_sql_with_comments( + dialect: &'a dyn Dialect, + sql: &str, + ) -> Result<(Vec, comments::Comments), ParserError> { + let mut p = Parser::new(dialect).try_with_sql(sql)?; + p.parse_statements().map(|stmts| (stmts, p.into_comments())) + } + + /// Consumes this parser returning comments from the parsed token stream. + fn into_comments(self) -> comments::Comments { + let mut comments = comments::Comments::default(); + for t in self.tokens.into_iter() { + match t.token { + Token::Whitespace(Whitespace::SingleLineComment { comment, prefix }) => { + comments.push(comments::CommentWithSpan { + comment: comments::Comment::SingleLine { + content: comment, + prefix, + }, + span: t.span, + }); + } + Token::Whitespace(Whitespace::MultiLineComment(comment)) => { + comments.push(comments::CommentWithSpan { + comment: comments::Comment::MultiLine(comment), + span: t.span, + }); + } + _ => {} + } + } + comments + } + /// Parse a single top-level statement (such as SELECT, INSERT, CREATE, etc.), /// stopping before the statement separator, if any. pub fn parse_statement(&mut self) -> Result { diff --git a/tests/sqlparser_comments.rs b/tests/sqlparser_comments.rs new file mode 100644 index 000000000..34442ca3e --- /dev/null +++ b/tests/sqlparser_comments.rs @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#![warn(clippy::all)] +//! Test comment extraction from SQL source code. + +#[cfg(test)] +use pretty_assertions::assert_eq; + +use sqlparser::{ + ast::comments::{Comment, CommentWithSpan}, + dialect::GenericDialect, + parser::Parser, + tokenizer::Span, +}; + +#[test] +fn parse_sql_with_comments() { + let sql = r#" +-- second line comment +select * from /* inline comment after `from` */ dual; + +/*select +some +more*/ + + -- end-of-script-with-no-newline"#; + + let comments = match Parser::parse_sql_with_comments(&GenericDialect, sql) { + Ok((_, comments)) => comments, + Err(e) => panic!("Invalid sql script: {e}"), + }; + + assert_eq!( + Vec::from(comments), + vec![ + CommentWithSpan { + comment: Comment::SingleLine { + content: " second line comment\n".into(), + prefix: "--".into() + }, + span: Span::new((2, 1).into(), (3, 1).into()), + }, + CommentWithSpan { + comment: Comment::MultiLine(" inline comment after `from` ".into()), + span: Span::new((3, 15).into(), (3, 48).into()), + }, + CommentWithSpan { + comment: Comment::MultiLine("select\nsome\nmore".into()), + span: Span::new((5, 1).into(), (7, 7).into()) + }, + CommentWithSpan { + comment: Comment::SingleLine { + content: " end-of-script-with-no-newline".into(), + prefix: "--".into() + }, + span: Span::new((9, 3).into(), (9, 35).into()), + } + ] + ); +}