diff --git a/code/.gitignore b/code/.gitignore
index a9d37c5..4ca7e1d 100644
--- a/code/.gitignore
+++ b/code/.gitignore
@@ -1,2 +1,3 @@
 target
 Cargo.lock
+risinglight.db/
diff --git a/code/03-00/src/executor/create.rs b/code/03-00/src/executor/create.rs
index c4ad484..22d3116 100644
--- a/code/03-00/src/executor/create.rs
+++ b/code/03-00/src/executor/create.rs
@@ -1,7 +1,6 @@
 use super::*;
 use crate::catalog::TableRefId;
 use crate::physical_planner::PhysicalCreateTable;
-use crate::storage::StorageRef;
 
 /// The executor of `CREATE TABLE` statement.
 pub struct CreateTableExecutor {
diff --git a/code/03-00/src/executor/insert.rs b/code/03-00/src/executor/insert.rs
index 0c76fc3..f169932 100644
--- a/code/03-00/src/executor/insert.rs
+++ b/code/03-00/src/executor/insert.rs
@@ -3,7 +3,6 @@ use itertools::Itertools;
 use super::*;
 use crate::array::{ArrayBuilderImpl, DataChunk};
 use crate::catalog::{ColumnId, TableRefId};
-use crate::storage::StorageRef;
 use crate::types::{DataType, DataValue};
 
 /// The executor of `INSERT` statement.
@@ -42,7 +41,7 @@ impl InsertExecutor {
         for chunk in self.child {
             let chunk = transform_chunk(chunk?, &output_columns);
             count += chunk.cardinality();
-            table.append(chunk)?;
+            table.append(chunk).await?;
         }
         yield DataChunk::single(count as i32);
     }
diff --git a/code/03-00/src/executor/seq_scan.rs b/code/03-00/src/executor/seq_scan.rs
index e6c0002..63770f6 100644
--- a/code/03-00/src/executor/seq_scan.rs
+++ b/code/03-00/src/executor/seq_scan.rs
@@ -13,7 +13,7 @@ impl SeqScanExecutor {
     #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
     pub async fn execute(self) {
         let table = self.storage.get_table(self.table_ref_id)?;
-        for chunk in table.all_chunks()? {
+        for chunk in table.all_chunks().await? {
             yield chunk;
         }
     }
diff --git a/code/03-00/src/storage/mod.rs b/code/03-00/src/storage/mod.rs
index f4fa7e3..d9cc032 100644
--- a/code/03-00/src/storage/mod.rs
+++ b/code/03-00/src/storage/mod.rs
@@ -1,31 +1,29 @@
-//! In-memory storage.
-//!
-//! RisingLight's in-memory representation of data is very simple. Currently,
-//! it is simple a vector of `DataChunk`. Upon insertion, users' data are
-//! simply appended to the end of the vector.
+//! On-disk storage.
 
-use std::collections::HashMap;
-use std::sync::{Arc, Mutex, RwLock};
+use std::sync::Arc;
 
 use crate::array::DataChunk;
 use crate::catalog::TableRefId;
 
 /// The error type of storage operations.
 #[derive(thiserror::Error, Debug)]
-pub enum StorageError {
-    #[error("table not found: {0:?}")]
-    NotFound(TableRefId),
-}
+#[error("{0:?}")]
+pub struct StorageError(#[from] anyhow::Error);
 
 /// A specialized `Result` type for storage operations.
 pub type StorageResult<T> = std::result::Result<T, StorageError>;
 
 pub type StorageRef = Arc<DiskStorage>;
-pub type DiskTableRef = Arc<DiskTable>;
+pub type StorageTableRef = Arc<DiskTable>;
+
+/// On-disk storage.
+#[derive(Clone)]
+pub struct DiskStorage;
 
-/// In-memory storage.
-pub struct DiskStorage {
-    tables: Mutex<HashMap<TableRefId, DiskTableRef>>,
+/// An on-disk table.
+pub struct DiskTable {
+    #[allow(dead_code)]
+    id: TableRefId,
 }
 
 impl Default for DiskStorage {
@@ -37,59 +35,28 @@ impl DiskStorage {
     /// Create a new in-memory storage.
     pub fn new() -> Self {
-        DiskStorage {
-            tables: Mutex::new(HashMap::new()),
-        }
+        DiskStorage
     }
 
     /// Add a table.
-    pub fn add_table(&self, id: TableRefId) -> StorageResult<()> {
-        let table = Arc::new(DiskTable::new(id));
-        self.tables.lock().unwrap().insert(id, table);
-        Ok(())
+    pub fn add_table(&self, _id: TableRefId) -> StorageResult<()> {
+        todo!()
     }
 
     /// Get a table.
-    pub fn get_table(&self, id: TableRefId) -> StorageResult<DiskTableRef> {
-        self.tables
-            .lock()
-            .unwrap()
-            .get(&id)
-            .cloned()
-            .ok_or(StorageError::NotFound(id))
+    pub fn get_table(&self, _id: TableRefId) -> StorageResult<StorageTableRef> {
+        todo!()
     }
 }
 
-/// A table in in-memory engine.
-pub struct DiskTable {
-    #[allow(dead_code)]
-    id: TableRefId,
-    inner: RwLock<DiskTableInner>,
-}
-
-#[derive(Default)]
-struct DiskTableInner {
-    chunks: Vec<DataChunk>,
-}
-
 impl DiskTable {
-    fn new(id: TableRefId) -> Self {
-        Self {
-            id,
-            inner: RwLock::new(DiskTableInner::default()),
-        }
-    }
-
     /// Append a chunk to the table.
-    pub fn append(&self, chunk: DataChunk) -> StorageResult<()> {
-        let mut inner = self.inner.write().unwrap();
-        inner.chunks.push(chunk);
-        Ok(())
+    pub async fn append(&self, _chunk: DataChunk) -> StorageResult<()> {
+        todo!()
     }
 
     /// Get all chunks of the table.
-    pub fn all_chunks(&self) -> StorageResult<Vec<DataChunk>> {
-        let inner = self.inner.read().unwrap();
-        Ok(inner.chunks.clone())
+    pub async fn all_chunks(&self) -> StorageResult<Vec<DataChunk>> {
+        todo!()
     }
 }
diff --git a/code/03-00/src/test.rs b/code/03-00/src/test.rs
index 495722f..f833c4d 100644
--- a/code/03-00/src/test.rs
+++ b/code/03-00/src/test.rs
@@ -1,3 +1,6 @@
+#![allow(unused_imports)]
+#![allow(dead_code)]
+
 use std::path::Path;
 
 use test_case::test_case;
@@ -6,16 +9,13 @@ use crate::array::DataChunk;
 use crate::types::DataValue;
 use crate::{Database, Error};
 
-#[test_case("01-01.slt")]
-#[test_case("01-03.slt")]
-#[test_case("01-05.slt")]
-#[test_case("01-06.slt")]
-#[test_case("01-07.slt")]
 fn test(name: &str) {
     init_logger();
 
     let script = std::fs::read_to_string(Path::new("../sql").join(name)).unwrap();
     let mut tester = sqllogictest::Runner::new(Database::new());
-    tester.run_script(&script).unwrap();
+    if let Err(err) = tester.run_script(&script) {
+        panic!("{}", err);
+    }
 }
 
 impl sqllogictest::DB for Database {
diff --git a/code/03-01/Cargo.toml b/code/03-01/Cargo.toml
index 5d1f778..7a8a5e9 100644
--- a/code/03-01/Cargo.toml
+++ b/code/03-01/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2021"
 [dependencies]
 anyhow = "1"
 bitvec = "1.0"
+bytes = "1"
 enum_dispatch = "0.3"
 env_logger = "0.9"
 futures = { version = "0.3", default-features = false, features = ["alloc"] }
@@ -17,7 +18,7 @@ prettytable-rs = { version = "0.8", default-features = false }
 rustyline = "9"
 sqlparser = "0.13"
 thiserror = "1"
-tokio = { version = "1", features = ["rt", "rt-multi-thread", "sync", "macros"] }
+tokio = { version = "1", features = ["rt", "rt-multi-thread", "sync", "macros", "fs"] }
 tokio-stream = "0.1"
 
 [dev-dependencies]
diff --git a/code/03-01/src/array b/code/03-01/src/array
deleted file mode 120000
index a30992f..0000000
--- a/code/03-01/src/array
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/array
\ No newline at end of file
diff --git a/code/03-01/src/array/data_chunk.rs b/code/03-01/src/array/data_chunk.rs
new file mode 100644
index 0000000..40a2df6
--- /dev/null
+++ b/code/03-01/src/array/data_chunk.rs
@@ -0,0 +1,85 @@
+use std::fmt;
+use std::sync::Arc;
+
+use itertools::Itertools;
+
+use super::*;
+
+/// A collection of arrays.
+///
+/// A chunk is a horizontal subset of a query result.
+#[derive(PartialEq, Clone)]
+pub struct DataChunk {
+    arrays: Arc<[ArrayImpl]>,
+}
+
+/// Create [`DataChunk`] from a list of column arrays.
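+///
+/// A usage sketch (illustrative): two equal-length column arrays collected into one chunk.
+///
+/// ```ignore
+/// let chunk: DataChunk = [
+///     ArrayImpl::Int32([1, 2, 3].into_iter().collect()),
+///     ArrayImpl::Utf8([Some("a"), Some("b"), Some("c")].into_iter().collect()),
+/// ]
+/// .into_iter()
+/// .collect();
+/// assert_eq!(chunk.cardinality(), 3);
+/// ```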
+impl FromIterator<ArrayImpl> for DataChunk {
+    fn from_iter<I: IntoIterator<Item = ArrayImpl>>(iter: I) -> Self {
+        let arrays = iter.into_iter().collect::<Arc<[ArrayImpl]>>();
+        assert!(!arrays.is_empty());
+        let cardinality = arrays[0].len();
+        assert!(
+            arrays.iter().map(|a| a.len()).all(|l| l == cardinality),
+            "all arrays must have the same length"
+        );
+        DataChunk { arrays }
+    }
+}
+
+impl DataChunk {
+    /// Return a [`DataChunk`] with 1 `item` in 1 array.
+    pub fn single(item: i32) -> Self {
+        DataChunk {
+            arrays: [ArrayImpl::Int32([item].into_iter().collect())]
+                .into_iter()
+                .collect(),
+        }
+    }
+
+    /// Return the number of rows in the chunk.
+    pub fn cardinality(&self) -> usize {
+        self.arrays[0].len()
+    }
+
+    /// Get all arrays.
+    pub fn arrays(&self) -> &[ArrayImpl] {
+        &self.arrays
+    }
+
+    /// Concatenate multiple chunks into one.
+    pub fn concat(chunks: &[DataChunk]) -> Self {
+        assert!(!chunks.is_empty(), "must concat at least one chunk");
+        let mut builders = chunks[0]
+            .arrays()
+            .iter()
+            .map(ArrayBuilderImpl::from_type_of_array)
+            .collect_vec();
+        for chunk in chunks {
+            for (array, builder) in chunk.arrays.iter().zip(builders.iter_mut()) {
+                builder.append(array);
+            }
+        }
+        builders.into_iter().map(|b| b.finish()).collect()
+    }
+}
+
+/// Print the chunk as a pretty table.
+impl fmt::Display for DataChunk {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use prettytable::{format, Table};
+        let mut table = Table::new();
+        table.set_format(*format::consts::FORMAT_NO_LINESEP_WITH_TITLE);
+        for i in 0..self.cardinality() {
+            let row = self.arrays.iter().map(|a| a.get(i).to_string()).collect();
+            table.add_row(row);
+        }
+        write!(f, "{}", table)
+    }
+}
+
+impl fmt::Debug for DataChunk {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self)
+    }
+}
diff --git a/code/03-01/src/array/iter.rs b/code/03-01/src/array/iter.rs
new file mode 100644
index 0000000..8871bda
--- /dev/null
+++ b/code/03-01/src/array/iter.rs
@@ -0,0 +1,37 @@
+use std::iter::Iterator;
+use std::marker::PhantomData;
+
+use super::Array;
+
+/// An iterator over the elements of an [`Array`].
+#[derive(Clone)]
+pub struct ArrayIter<'a, A: Array> {
+    array: &'a A,
+    index: usize,
+    _phantom: PhantomData<&'a usize>,
+}
+
+impl<'a, A: Array> ArrayIter<'a, A> {
+    /// Create an iterator over array.
+    pub fn new(array: &'a A) -> Self {
+        Self {
+            array,
+            index: 0,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<'a, A: Array> Iterator for ArrayIter<'a, A> {
+    type Item = Option<&'a A::Item>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.index >= self.array.len() {
+            None
+        } else {
+            let item = self.array.get(self.index);
+            self.index += 1;
+            Some(item)
+        }
+    }
+}
diff --git a/code/03-01/src/array/mod.rs b/code/03-01/src/array/mod.rs
new file mode 100644
index 0000000..3c29a6b
--- /dev/null
+++ b/code/03-01/src/array/mod.rs
@@ -0,0 +1,244 @@
+//! In-memory representations of column values.
+
+use std::convert::TryFrom;
+
+use crate::types::{DataType, DataTypeKind, DataValue};
+
+mod data_chunk;
+mod iter;
+mod primitive_array;
+mod utf8_array;
+
+pub use self::data_chunk::*;
+pub use self::iter::ArrayIter;
+pub use self::primitive_array::*;
+pub use self::utf8_array::*;
+
+/// A trait over all array builders.
+///
+/// [`ArrayBuilder`] is a trait over all builders. You can build an array with
+/// `push` with the help of the [`ArrayBuilder`] trait. The `push` function always
+/// accepts a reference to an element, e.g. for [`PrimitiveArray`],
+/// you must do `builder.push(Some(&1))`.
+/// For [`Utf8Array`], you must do
+/// `builder.push(Some("xxx"))`. Note that you don't need to construct a `String`.
+///
+/// The associated type `Array` is the type of the corresponding array. It is the
+/// return type of `finish`.
+pub trait ArrayBuilder: Send + Sync + 'static {
+    /// Corresponding `Array` of this builder.
+    type Array: Array<Builder = Self>;
+
+    /// Create a new builder with `capacity`.
+    fn with_capacity(capacity: usize) -> Self;
+
+    /// Append a value to builder.
+    fn push(&mut self, value: Option<&<Self::Array as Array>::Item>);
+
+    /// Append an array to builder.
+    fn append(&mut self, other: &Self::Array);
+
+    /// Finish build and return a new array.
+    fn finish(self) -> Self::Array;
+}
+
+/// A trait over all arrays.
+///
+/// [`Array`] must be built with an [`ArrayBuilder`]. The array trait provides several
+/// unified interfaces on an array, like `len`, `get` and `iter`.
+///
+/// The `Builder` associated type is the builder for this array.
+/// The `Item` is the item you could retrieve from this array.
+///
+/// For example, [`PrimitiveArray`] could return an `Option<&u32>`, and [`Utf8Array`] will
+/// return an `Option<&str>`.
+pub trait Array: Sized + Send + Sync + 'static {
+    /// Corresponding builder of this array.
+    type Builder: ArrayBuilder<Array = Self>;
+
+    /// Type of element in the array.
+    type Item: ToOwned + ?Sized;
+
+    /// Retrieve a reference to value.
+    fn get(&self, idx: usize) -> Option<&Self::Item>;
+
+    /// Number of items of array.
+    fn len(&self) -> usize;
+
+    /// Get iterator of current array.
+    fn iter(&self) -> ArrayIter<'_, Self> {
+        ArrayIter::new(self)
+    }
+
+    /// Check if the array has a length of 0.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+pub type BoolArray = PrimitiveArray<bool>;
+pub type I32Array = PrimitiveArray<i32>;
+pub type F64Array = PrimitiveArray<f64>;
+
+/// Embeds all types of arrays in the `array` module.
+#[derive(Clone, PartialEq)]
+pub enum ArrayImpl {
+    Bool(BoolArray),
+    Int32(I32Array),
+    Float64(F64Array),
+    Utf8(Utf8Array),
+}
+
+pub type BoolArrayBuilder = PrimitiveArrayBuilder<bool>;
+pub type I32ArrayBuilder = PrimitiveArrayBuilder<i32>;
+pub type F64ArrayBuilder = PrimitiveArrayBuilder<f64>;
+
+/// Embeds all types of array builders in the `array` module.
+pub enum ArrayBuilderImpl {
+    Bool(BoolArrayBuilder),
+    Int32(I32ArrayBuilder),
+    Float64(F64ArrayBuilder),
+    Utf8(Utf8ArrayBuilder),
+}
+
+/// An error which can be returned when downcasting an [`ArrayImpl`] into a concrete type array.
+#[derive(Debug, Clone)]
+pub struct TypeMismatch;
+
+macro_rules! impl_into {
+    ($x:ty, $y:ident) => {
+        impl From<$x> for ArrayImpl {
+            fn from(array: $x) -> Self {
+                Self::$y(array)
+            }
+        }
+
+        impl TryFrom<ArrayImpl> for $x {
+            type Error = TypeMismatch;
+
+            fn try_from(array: ArrayImpl) -> Result<Self, Self::Error> {
+                match array {
+                    ArrayImpl::$y(array) => Ok(array),
+                    _ => Err(TypeMismatch),
+                }
+            }
+        }
+
+        impl<'a> TryFrom<&'a ArrayImpl> for &'a $x {
+            type Error = TypeMismatch;
+
+            fn try_from(array: &'a ArrayImpl) -> Result<Self, Self::Error> {
+                match array {
+                    ArrayImpl::$y(array) => Ok(array),
+                    _ => Err(TypeMismatch),
+                }
+            }
+        }
+    };
+}
+
+impl_into! { PrimitiveArray<bool>, Bool }
+impl_into! { PrimitiveArray<i32>, Int32 }
+impl_into! { PrimitiveArray<f64>, Float64 }
+impl_into! { Utf8Array, Utf8 }
+
+impl ArrayBuilderImpl {
+    /// Create a new array builder from data type.
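+    ///
+    /// For example (sketch): an `Int(_)` type yields an `Int32` builder, and the three
+    /// string kinds (`Char`, `Varchar`, `String`) all yield a `Utf8` builder.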
+    pub fn with_capacity(capacity: usize, ty: &DataType) -> Self {
+        match ty.kind() {
+            DataTypeKind::Boolean => Self::Bool(BoolArrayBuilder::with_capacity(capacity)),
+            DataTypeKind::Int(_) => Self::Int32(I32ArrayBuilder::with_capacity(capacity)),
+            DataTypeKind::Float(_) | DataTypeKind::Double => {
+                Self::Float64(F64ArrayBuilder::with_capacity(capacity))
+            }
+            DataTypeKind::Char(_) | DataTypeKind::Varchar(_) | DataTypeKind::String => {
+                Self::Utf8(Utf8ArrayBuilder::with_capacity(capacity))
+            }
+            _ => panic!("unsupported data type"),
+        }
+    }
+
+    /// Create a new array builder with the same type of given array.
+    pub fn from_type_of_array(array: &ArrayImpl) -> Self {
+        match array {
+            ArrayImpl::Bool(_) => Self::Bool(BoolArrayBuilder::with_capacity(0)),
+            ArrayImpl::Int32(_) => Self::Int32(I32ArrayBuilder::with_capacity(0)),
+            ArrayImpl::Float64(_) => Self::Float64(F64ArrayBuilder::with_capacity(0)),
+            ArrayImpl::Utf8(_) => Self::Utf8(Utf8ArrayBuilder::with_capacity(0)),
+        }
+    }
+
+    /// Appends an element to the back of the array.
+    pub fn push(&mut self, v: &DataValue) {
+        match (self, v) {
+            (Self::Bool(a), DataValue::Bool(v)) => a.push(Some(v)),
+            (Self::Int32(a), DataValue::Int32(v)) => a.push(Some(v)),
+            (Self::Float64(a), DataValue::Float64(v)) => a.push(Some(v)),
+            (Self::Utf8(a), DataValue::String(v)) => a.push(Some(v)),
+            (Self::Bool(a), DataValue::Null) => a.push(None),
+            (Self::Int32(a), DataValue::Null) => a.push(None),
+            (Self::Float64(a), DataValue::Null) => a.push(None),
+            (Self::Utf8(a), DataValue::Null) => a.push(None),
+            _ => panic!("failed to push value: type mismatch"),
+        }
+    }
+
+    /// Appends an [`ArrayImpl`].
+    pub fn append(&mut self, array_impl: &ArrayImpl) {
+        match (self, array_impl) {
+            (Self::Bool(builder), ArrayImpl::Bool(arr)) => builder.append(arr),
+            (Self::Int32(builder), ArrayImpl::Int32(arr)) => builder.append(arr),
+            (Self::Float64(builder), ArrayImpl::Float64(arr)) => builder.append(arr),
+            (Self::Utf8(builder), ArrayImpl::Utf8(arr)) => builder.append(arr),
+            _ => panic!("failed to push value: type mismatch"),
+        }
+    }
+
+    /// Finish build and return a new array.
+    pub fn finish(self) -> ArrayImpl {
+        match self {
+            Self::Bool(a) => ArrayImpl::Bool(a.finish()),
+            Self::Int32(a) => ArrayImpl::Int32(a.finish()),
+            Self::Float64(a) => ArrayImpl::Float64(a.finish()),
+            Self::Utf8(a) => ArrayImpl::Utf8(a.finish()),
+        }
+    }
+}
+
+impl ArrayImpl {
+    /// Get the value at the given index.
+    pub fn get(&self, idx: usize) -> DataValue {
+        match self {
+            Self::Bool(a) => match a.get(idx) {
+                Some(val) => DataValue::Bool(*val),
+                None => DataValue::Null,
+            },
+            Self::Int32(a) => match a.get(idx) {
+                Some(val) => DataValue::Int32(*val),
+                None => DataValue::Null,
+            },
+            Self::Float64(a) => match a.get(idx) {
+                Some(val) => DataValue::Float64(*val),
+                None => DataValue::Null,
+            },
+            Self::Utf8(a) => match a.get(idx) {
+                Some(val) => DataValue::String(val.to_string()),
+                None => DataValue::Null,
+            },
+        }
+    }
+
+    /// Number of items of array.
+    pub fn len(&self) -> usize {
+        match self {
+            Self::Bool(a) => a.len(),
+            Self::Int32(a) => a.len(),
+            Self::Float64(a) => a.len(),
+            Self::Utf8(a) => a.len(),
+        }
+    }
+
+    /// Check if array is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
diff --git a/code/03-01/src/array/primitive_array.rs b/code/03-01/src/array/primitive_array.rs
new file mode 100644
index 0000000..0593655
--- /dev/null
+++ b/code/03-01/src/array/primitive_array.rs
@@ -0,0 +1,107 @@
+use std::fmt::Debug;
+use std::iter::FromIterator;
+
+use bitvec::vec::BitVec;
+
+use super::{Array, ArrayBuilder};
+
+/// A collection of primitive types, such as `i32`, `f32`.
+#[derive(Debug, Clone, PartialEq)]
+pub struct PrimitiveArray<T: Primitive> {
+    valid: BitVec,
+    data: Vec<T>,
+}
+
+/// A trait over primitive types.
+pub trait Primitive:
+    PartialOrd + PartialEq + Debug + Copy + Send + Sync + Sized + Default + 'static
+{
+}
+
+macro_rules! impl_primitive {
+    ($($t:ty),*) => {
+        $(impl Primitive for $t {})*
+    }
+}
+impl_primitive!(u8, u16, u32, u64, usize, i8, i16, i32, i64, isize, f32, f64, bool);
+
+/// Enable `collect()` an array from an iterator of `Option<T>`.
+impl<T: Primitive> FromIterator<Option<T>> for PrimitiveArray<T> {
+    fn from_iter<I: IntoIterator<Item = Option<T>>>(iter: I) -> Self {
+        let iter = iter.into_iter();
+        let mut builder = <Self as Array>::Builder::with_capacity(iter.size_hint().0);
+        for e in iter {
+            builder.push(e.as_ref());
+        }
+        builder.finish()
+    }
+}
+
+/// Enable `collect()` an array from an iterator of `T`.
+impl<T: Primitive> FromIterator<T> for PrimitiveArray<T> {
+    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
+        iter.into_iter().map(Some).collect()
+    }
+}
+
+impl<T: Primitive> Array for PrimitiveArray<T> {
+    type Item = T;
+    type Builder = PrimitiveArrayBuilder<T>;
+
+    fn get(&self, idx: usize) -> Option<&T> {
+        self.valid[idx].then(|| &self.data[idx])
+    }
+
+    fn len(&self) -> usize {
+        self.valid.len()
+    }
+}
+
+/// A builder that constructs a [`PrimitiveArray`] from `Option<T>`.
+pub struct PrimitiveArrayBuilder<T: Primitive> {
+    valid: BitVec,
+    data: Vec<T>,
+}
+
+impl<T: Primitive> ArrayBuilder for PrimitiveArrayBuilder<T> {
+    type Array = PrimitiveArray<T>;
+
+    fn with_capacity(capacity: usize) -> Self {
+        Self {
+            valid: BitVec::with_capacity(capacity),
+            data: Vec::with_capacity(capacity),
+        }
+    }
+
+    fn push(&mut self, value: Option<&T>) {
+        self.valid.push(value.is_some());
+        self.data.push(value.cloned().unwrap_or_default());
+    }
+
+    fn append(&mut self, other: &PrimitiveArray<T>) {
+        self.valid.extend_from_bitslice(&other.valid);
+        self.data.extend_from_slice(&other.data);
+    }
+
+    fn finish(self) -> PrimitiveArray<T> {
+        PrimitiveArray {
+            valid: self.valid,
+            data: self.data,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_collect() {
+        let iter = (0..1000).map(|x| if x % 2 == 0 { None } else { Some(x) });
+        let array = iter.clone().collect::<PrimitiveArray<i32>>();
+        assert_eq!(
+            array.iter().map(|x| x.cloned()).collect::<Vec<_>>(),
+            iter.collect::<Vec<_>>()
+        );
+    }
+}
diff --git a/code/03-01/src/array/utf8_array.rs b/code/03-01/src/array/utf8_array.rs
new file mode 100644
index 0000000..199bcdc
--- /dev/null
+++ b/code/03-01/src/array/utf8_array.rs
@@ -0,0 +1,105 @@
+use std::iter::FromIterator;
+
+use bitvec::vec::BitVec;
+
+use super::{Array, ArrayBuilder};
+
+/// A collection of Rust UTF-8 [`String`]s.
+#[derive(Clone, PartialEq)]
+pub struct Utf8Array {
+    offset: Vec<usize>,
+    valid: BitVec,
+    data: Vec<u8>,
+}
+
+impl Array for Utf8Array {
+    type Item = str;
+    type Builder = Utf8ArrayBuilder;
+
+    fn get(&self, idx: usize) -> Option<&str> {
+        if self.valid[idx] {
+            let data_slice = &self.data[self.offset[idx]..self.offset[idx + 1]];
+            Some(unsafe { std::str::from_utf8_unchecked(data_slice) })
+        } else {
+            None
+        }
+    }
+
+    fn len(&self) -> usize {
+        self.valid.len()
+    }
+}
+
+/// A builder that uses `&str` to build an [`Utf8Array`].
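+///
+/// Layout note: `data` stores all strings back to back; `offset[i]..offset[i + 1]` is the
+/// byte range of the i-th string, so `offset` always holds one more element than there are rows.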
+pub struct Utf8ArrayBuilder {
+    offset: Vec<usize>,
+    valid: BitVec,
+    data: Vec<u8>,
+}
+
+impl ArrayBuilder for Utf8ArrayBuilder {
+    type Array = Utf8Array;
+
+    fn with_capacity(capacity: usize) -> Self {
+        let mut offset = Vec::with_capacity(capacity + 1);
+        offset.push(0);
+        Self {
+            offset,
+            data: Vec::with_capacity(capacity),
+            valid: BitVec::with_capacity(capacity),
+        }
+    }
+
+    fn push(&mut self, value: Option<&str>) {
+        self.valid.push(value.is_some());
+        if let Some(x) = value {
+            self.data.extend_from_slice(x.as_bytes());
+        }
+        self.offset.push(self.data.len());
+    }
+
+    fn append(&mut self, other: &Utf8Array) {
+        self.valid.extend_from_bitslice(&other.valid);
+        self.data.extend_from_slice(&other.data);
+        let start = *self.offset.last().unwrap();
+        for other_offset in &other.offset[1..] {
+            self.offset.push(*other_offset + start);
+        }
+    }
+
+    fn finish(self) -> Utf8Array {
+        Utf8Array {
+            valid: self.valid,
+            data: self.data,
+            offset: self.offset,
+        }
+    }
+}
+
+/// Enable `collect()` an array from an iterator of `Option<&str>` or `Option<String>`.
+impl<Str: AsRef<str>> FromIterator<Option<Str>> for Utf8Array {
+    fn from_iter<I: IntoIterator<Item = Option<Str>>>(iter: I) -> Self {
+        let iter = iter.into_iter();
+        let mut builder = <Self as Array>::Builder::with_capacity(iter.size_hint().0);
+        for e in iter {
+            if let Some(s) = e {
+                builder.push(Some(s.as_ref()));
+            } else {
+                builder.push(None);
+            }
+        }
+        builder.finish()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_collect() {
+        let iter = [None, Some("1"), None, Some("3")].into_iter();
+        let array = iter.clone().collect::<Utf8Array>();
+        assert_eq!(array.iter().collect::<Vec<_>>(), iter.collect::<Vec<_>>());
+    }
+}
diff --git a/code/03-01/src/binder b/code/03-01/src/binder
deleted file mode 120000
index 936cb6b..0000000
--- a/code/03-01/src/binder
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/binder/
\ No newline at end of file
diff --git a/code/03-01/src/binder/expression/column_ref.rs b/code/03-01/src/binder/expression/column_ref.rs
new file mode 100644
index 0000000..9b8fd7c
--- /dev/null
+++ b/code/03-01/src/binder/expression/column_ref.rs
@@ -0,0 +1,72 @@
+use super::*;
+
+/// A bound column reference expression.
+#[derive(PartialEq, Clone)]
+pub struct BoundColumnRef {
+    pub column_ref_id: ColumnRefId,
+    pub return_type: DataType,
+}
+
+impl std::fmt::Debug for BoundColumnRef {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.column_ref_id)
+    }
+}
+
+impl Binder {
+    /// Expand wildcard into a list of column references.
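+    ///
+    /// e.g. the wildcard in `SELECT * FROM t` is expanded into one `BoundColumnRef`
+    /// per column of `t`, in column-id order.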
+    pub fn bind_all_column_refs(&mut self) -> Result<Vec<BoundExpr>, BindError> {
+        let mut exprs = vec![];
+        for &table_ref_id in self.tables.values() {
+            let table = self.catalog.get_table(table_ref_id).unwrap();
+            for (col_id, col) in table.all_columns() {
+                let expr = BoundExpr::ColumnRef(BoundColumnRef {
+                    column_ref_id: ColumnRefId::from_table(table_ref_id, col_id),
+                    return_type: col.datatype(),
+                });
+                exprs.push(expr);
+            }
+        }
+        Ok(exprs)
+    }
+
+    pub fn bind_column_ref(&mut self, idents: &[Ident]) -> Result<BoundExpr, BindError> {
+        let (_schema_name, table_name, column_name) = match idents {
+            [column] => (None, None, &column.value),
+            [table, column] => (None, Some(&table.value), &column.value),
+            [schema, table, column] => (Some(&schema.value), Some(&table.value), &column.value),
+            _ => return Err(BindError::InvalidTableName(idents.into())),
+        };
+        if let Some(name) = table_name {
+            let table_ref_id = *self
+                .tables
+                .get(name)
+                .ok_or_else(|| BindError::TableNotFound(name.clone()))?;
+            let table = self.catalog.get_table(table_ref_id).unwrap();
+            let col = table
+                .get_column_by_name(column_name)
+                .ok_or_else(|| BindError::ColumnNotFound(column_name.clone()))?;
+            Ok(BoundExpr::ColumnRef(BoundColumnRef {
+                column_ref_id: ColumnRefId::from_table(table_ref_id, col.id()),
+                return_type: col.datatype(),
+            }))
+        } else {
+            let mut column_ref = None;
+            for &table_ref_id in self.tables.values() {
+                let table = self.catalog.get_table(table_ref_id).unwrap();
+                if let Some(col) = table.get_column_by_name(column_name) {
+                    if column_ref.is_some() {
+                        return Err(BindError::AmbiguousColumnName(column_name.into()));
+                    }
+                    column_ref = Some(BoundColumnRef {
+                        column_ref_id: ColumnRefId::from_table(table_ref_id, col.id()),
+                        return_type: col.datatype(),
+                    });
+                }
+            }
+            Ok(BoundExpr::ColumnRef(column_ref.ok_or_else(|| {
+                BindError::ColumnNotFound(column_name.clone())
+            })?))
+        }
+    }
+}
diff --git a/code/03-01/src/binder/expression/mod.rs b/code/03-01/src/binder/expression/mod.rs
new file mode 100644
index 0000000..ed89891
--- /dev/null
+++ b/code/03-01/src/binder/expression/mod.rs
@@ -0,0 +1,59 @@
+use super::*;
+use crate::parser::{Expr, Value};
+use crate::types::{DataType, DataValue};
+
+mod column_ref;
+
+pub use self::column_ref::*;
+
+/// A bound expression.
+#[derive(Debug, PartialEq, Clone)]
+pub enum BoundExpr {
+    Constant(DataValue),
+    ColumnRef(BoundColumnRef),
+}
+
+impl BoundExpr {
+    /// Get return type of the expression.
+    ///
+    /// Returns `None` if the type cannot be decided.
+    pub fn return_type(&self) -> Option<DataType> {
+        match self {
+            Self::Constant(v) => v.datatype(),
+            Self::ColumnRef(c) => Some(c.return_type.clone()),
+        }
+    }
+}
+
+impl Binder {
+    /// Bind an expression.
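+    ///
+    /// Only constants (e.g. `1`, `'foo'`) and column references (e.g. `v1`,
+    /// `t.v1`) are supported so far; other expression kinds hit the `todo!`.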
+    pub fn bind_expr(&mut self, expr: &Expr) -> Result<BoundExpr, BindError> {
+        match expr {
+            Expr::Value(v) => Ok(BoundExpr::Constant(v.into())),
+            Expr::Identifier(ident) => self.bind_column_ref(std::slice::from_ref(ident)),
+            Expr::CompoundIdentifier(idents) => self.bind_column_ref(idents),
+            _ => todo!("bind expression: {:?}", expr),
+        }
+    }
+}
+
+impl From<&Value> for DataValue {
+    fn from(v: &Value) -> Self {
+        match v {
+            Value::Number(n, _) => {
+                if let Ok(int) = n.parse::<i32>() {
+                    Self::Int32(int)
+                } else if let Ok(float) = n.parse::<f64>() {
+                    Self::Float64(float)
+                } else {
+                    panic!("invalid digit: {}", n);
+                }
+            }
+            Value::SingleQuotedString(s) => Self::String(s.clone()),
+            Value::DoubleQuotedString(s) => Self::String(s.clone()),
+            Value::Boolean(b) => Self::Bool(*b),
+            Value::Null => Self::Null,
+            _ => todo!("parse value: {:?}", v),
+        }
+    }
+}
diff --git a/code/03-01/src/binder/mod.rs b/code/03-01/src/binder/mod.rs
new file mode 100644
index 0000000..98d0bdc
--- /dev/null
+++ b/code/03-01/src/binder/mod.rs
@@ -0,0 +1,96 @@
+//! Resolve all name references in expressions.
+
+use std::collections::HashMap;
+use std::vec::Vec;
+
+use crate::catalog::*;
+use crate::parser::{Ident, ObjectName, Statement};
+
+mod expression;
+mod statement;
+mod table_ref;
+
+pub use self::expression::*;
+pub use self::statement::*;
+pub use self::table_ref::*;
+
+/// A bound SQL statement generated by the [`Binder`].
+#[derive(Debug, PartialEq, Clone)]
+pub enum BoundStatement {
+    CreateTable(BoundCreateTable),
+    Insert(BoundInsert),
+    Explain(Box<BoundStatement>),
+    Select(BoundSelect),
+}
+
+/// The error type of bind operations.
+#[derive(thiserror::Error, Debug, PartialEq)]
+pub enum BindError {
+    #[error("table must have at least one column")]
+    EmptyColumns,
+    #[error("schema not found: {0}")]
+    SchemaNotFound(String),
+    #[error("table not found: {0}")]
+    TableNotFound(String),
+    #[error("column not found: {0}")]
+    ColumnNotFound(String),
+    #[error("duplicated table: {0}")]
+    DuplicatedTable(String),
+    #[error("duplicated column: {0}")]
+    DuplicatedColumn(String),
+    #[error("invalid table name: {0:?}")]
+    InvalidTableName(Vec<Ident>),
+    #[error("duplicated alias: {0}")]
+    DuplicatedAlias(String),
+    #[error("ambiguous column name: {0}")]
+    AmbiguousColumnName(String),
+    #[error("not nullable column: {0}")]
+    NotNullableColumn(String),
+    #[error("tuple length mismatch: expected {expected} but got {actual}")]
+    TupleLengthMismatch { expected: usize, actual: usize },
+    #[error("value should not be null in column: {0}")]
+    NullValueInColumn(String),
+}
+
+/// The binder resolves all expressions referring to schema objects such as
+/// tables or views with their column names and types.
+pub struct Binder {
+    catalog: CatalogRef,
+    tables: HashMap<TableName, TableRefId>,
+}
+
+type TableName = String;
+
+impl Binder {
+    /// Create a new [Binder].
+    pub fn new(catalog: CatalogRef) -> Self {
+        Binder {
+            catalog,
+            tables: HashMap::default(),
+        }
+    }
+
+    /// Bind a statement.
+    pub fn bind(&mut self, stmt: &Statement) -> Result<BoundStatement, BindError> {
+        match stmt {
+            Statement::CreateTable { .. } => {
+                Ok(BoundStatement::CreateTable(self.bind_create_table(stmt)?))
+            }
+            Statement::Insert { .. } => Ok(BoundStatement::Insert(self.bind_insert(stmt)?)),
+            Statement::Explain { statement, .. } => {
+                Ok(BoundStatement::Explain(self.bind(&*statement)?.into()))
+            }
+            Statement::Query(query) => Ok(BoundStatement::Select(self.bind_select(&*query)?)),
+            _ => todo!("bind statement: {:#?}", stmt),
+        }
+    }
+}
+
+/// Split an [ObjectName] into `(schema name, table name)`.
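+///
+/// e.g. `postgres.t` is split into `("postgres", "t")`, and a bare `t` falls back to
+/// [`DEFAULT_SCHEMA_NAME`].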
+fn split_name(name: &ObjectName) -> Result<(&str, &str), BindError> {
+    Ok(match name.0.as_slice() {
+        [table] => (DEFAULT_SCHEMA_NAME, &table.value),
+        [schema, table] => (&schema.value, &table.value),
+        _ => return Err(BindError::InvalidTableName(name.0.clone())),
+    })
+}
diff --git a/code/03-01/src/binder/statement/create_table.rs b/code/03-01/src/binder/statement/create_table.rs
new file mode 100644
index 0000000..97fdd6a
--- /dev/null
+++ b/code/03-01/src/binder/statement/create_table.rs
@@ -0,0 +1,117 @@
+use std::collections::HashSet;
+
+use super::*;
+use crate::catalog::ColumnDesc;
+use crate::parser::{ColumnDef, ColumnOption, Statement};
+use crate::types::DataType;
+
+/// A bound `CREATE TABLE` statement.
+#[derive(Debug, PartialEq, Clone)]
+pub struct BoundCreateTable {
+    pub schema_id: SchemaId,
+    pub table_name: String,
+    pub columns: Vec<(String, ColumnDesc)>,
+}
+
+impl Binder {
+    pub fn bind_create_table(&mut self, stmt: &Statement) -> Result<BoundCreateTable, BindError> {
+        match stmt {
+            Statement::CreateTable { name, columns, .. } => {
+                // check empty columns
+                if columns.is_empty() {
+                    return Err(BindError::EmptyColumns);
+                }
+                let (schema_name, table_name) = split_name(name)?;
+                let schema = self
+                    .catalog
+                    .get_schema_by_name(schema_name)
+                    .ok_or_else(|| BindError::SchemaNotFound(schema_name.into()))?;
+                // check duplicated table name
+                if schema.get_table_by_name(table_name).is_some() {
+                    return Err(BindError::DuplicatedTable(table_name.into()));
+                }
+                // check duplicated column names
+                let mut set = HashSet::new();
+                for col in columns.iter() {
+                    if !set.insert(col.name.value.clone()) {
+                        return Err(BindError::DuplicatedColumn(col.name.value.clone()));
+                    }
+                }
+                let columns = columns
+                    .iter()
+                    .map(|col| (col.name.value.clone(), ColumnDesc::from(col)))
+                    .collect();
+                Ok(BoundCreateTable {
+                    schema_id: schema.id(),
+                    table_name: table_name.into(),
+                    columns,
+                })
+            }
+            _ => panic!("mismatched statement type"),
+        }
+    }
+}
+
+impl From<&ColumnDef> for ColumnDesc {
+    fn from(cdef: &ColumnDef) -> Self {
+        let mut is_nullable = true;
+        let mut is_primary = false;
+        for opt in cdef.options.iter() {
+            match opt.option {
+                ColumnOption::Null => is_nullable = true,
+                ColumnOption::NotNull => is_nullable = false,
+                ColumnOption::Unique { is_primary: v } => is_primary = v,
+                _ => todo!("column options"),
+            }
+        }
+        ColumnDesc::new(
+            DataType::new(cdef.data_type.clone(), is_nullable),
+            is_primary,
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use super::*;
+    use crate::catalog::DatabaseCatalog;
+    use crate::parser::parse;
+    use crate::types::{DataTypeExt, DataTypeKind};
+
+    #[test]
+    fn bind_create_table() {
+        let catalog = Arc::new(DatabaseCatalog::new());
+        let mut binder = Binder::new(catalog.clone());
+        let sql = "
+            create table t1 (v1 int not null, v2 int);
+            create table t2 (a int not null, a int not null);
+            create table t3 (v1 int not null);";
+        let stmts = parse(sql).unwrap();
+
+        assert_eq!(
+            binder.bind_create_table(&stmts[0]).unwrap(),
+            BoundCreateTable {
+                schema_id: 0,
+                table_name: "t1".into(),
+                columns: vec![
+                    ("v1".into(), DataTypeKind::Int(None).not_null().to_column()),
+                    ("v2".into(), DataTypeKind::Int(None).nullable().to_column()),
+                ],
+            }
+        );
+
+        assert_eq!(
+            binder.bind_create_table(&stmts[1]),
+            Err(BindError::DuplicatedColumn("a".into()))
+        );
+
+        let schema = catalog.get_schema(0).unwrap();
+        schema.add_table("t3").unwrap();
+        assert_eq!(
+            binder.bind_create_table(&stmts[2]),
+            Err(BindError::DuplicatedTable("t3".into()))
+        );
+    }
+}
diff --git a/code/03-01/src/binder/statement/insert.rs b/code/03-01/src/binder/statement/insert.rs
new file mode 100644
index 0000000..3e7bfc1
--- /dev/null
+++ b/code/03-01/src/binder/statement/insert.rs
@@ -0,0 +1,131 @@
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use itertools::Itertools;
+
+use super::*;
+use crate::catalog::{ColumnCatalog, ColumnId, TableCatalog};
+use crate::parser::{SetExpr, Statement};
+use crate::types::{DataType, DataTypeKind};
+
+/// A bound `INSERT` statement.
+#[derive(Debug, PartialEq, Clone)]
+pub struct BoundInsert {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub column_types: Vec<DataType>,
+    pub values: Vec<Vec<BoundExpr>>,
+}
+
+impl Binder {
+    pub fn bind_insert(&mut self, stmt: &Statement) -> Result<BoundInsert, BindError> {
+        let (table_name, columns, source) = match stmt {
+            Statement::Insert {
+                table_name,
+                columns,
+                source,
+                ..
+            } => (table_name, columns, source),
+            _ => panic!("mismatched statement type"),
+        };
+        let (table_ref_id, table, columns) = self.bind_table_columns(table_name, columns)?;
+        let column_ids = columns.iter().map(|col| col.id()).collect_vec();
+        let column_types = columns.iter().map(|col| col.datatype()).collect_vec();
+
+        // Check columns after transforming.
+        let col_set: HashSet<ColumnId> = column_ids.iter().cloned().collect();
+        for (id, col) in table.all_columns() {
+            if !col_set.contains(&id) && !col.is_nullable() {
+                return Err(BindError::NotNullableColumn(col.name().into()));
+            }
+        }
+
+        let values = match &source.body {
+            SetExpr::Select(_) => todo!("handle 'insert into .. select .. from ..' case."),
+            SetExpr::Values(values) => &values.0,
+            _ => todo!("handle insert ???"),
+        };
+
+        // Handle 'insert into .. values ..' case.
+
+        // Check inserted values, we only support inserting values now.
+        let mut bound_values = Vec::with_capacity(values.len());
+        for row in values.iter() {
+            if row.len() > column_ids.len() {
+                return Err(BindError::TupleLengthMismatch {
+                    expected: columns.len(),
+                    actual: row.len(),
+                });
+            }
+            let mut bound_row = Vec::with_capacity(row.len());
+            for (idx, expr) in row.iter().enumerate() {
+                // Bind expression
+                let expr = self.bind_expr(expr)?;
+
+                if let Some(data_type) = &expr.return_type() {
+                    // TODO: support valid type cast
+                    // For example:
+                    //   CREATE TABLE t (a FLOAT, b FLOAT);
+                    //   INSERT INTO t VALUES (1, 1);
+                    // the integer 1 should be cast to float.
+                    let left_kind = data_type.kind();
+                    let right_kind = column_types[idx].kind();
+                    match (&left_kind, &right_kind) {
+                        _ if left_kind == right_kind => {}
+                        // For char types, no need to cast
+                        (DataTypeKind::Char(_), DataTypeKind::Varchar(_)) => {}
+                        (DataTypeKind::Varchar(_), DataTypeKind::Char(_)) => {}
+                        _ => todo!("type cast: {} -> {}", left_kind, right_kind),
+                    }
+                } else {
+                    // If the data value is null, the column must be nullable.
+                    if !column_types[idx].is_nullable() {
+                        return Err(BindError::NullValueInColumn(columns[idx].name().into()));
+                    }
+                }
+                bound_row.push(expr);
+            }
+            bound_values.push(bound_row);
+        }
+
+        Ok(BoundInsert {
+            table_ref_id,
+            column_ids,
+            column_types,
+            values: bound_values,
+        })
+    }
+
+    /// Bind `table_name [ (column_name [, ...] ) ]`
+    pub fn bind_table_columns(
+        &mut self,
+        table_name: &ObjectName,
+        columns: &[Ident],
+    ) -> Result<(TableRefId, Arc<TableCatalog>, Vec<ColumnCatalog>), BindError> {
+        let (schema_name, table_name) = split_name(table_name)?;
+        let schema = self
+            .catalog
+            .get_schema_by_name(schema_name)
+            .ok_or_else(|| BindError::SchemaNotFound(schema_name.into()))?;
+        let table = schema
+            .get_table_by_name(table_name)
+            .ok_or_else(|| BindError::TableNotFound(table_name.into()))?;
+        let table_ref_id = TableRefId::new(schema.id(), table.id());
+
+        let columns = if columns.is_empty() {
+            // If the query does not provide column information, get all columns info.
+            table.all_columns().values().cloned().collect_vec()
+        } else {
+            // Otherwise, we get columns info from the query.
+            let mut column_catalogs = vec![];
+            for col in columns.iter() {
+                let col = table
+                    .get_column_by_name(&col.value)
+                    .ok_or_else(|| BindError::ColumnNotFound(col.value.clone()))?;
+                column_catalogs.push(col);
+            }
+            column_catalogs
+        };
+        Ok((table_ref_id, table, columns))
+    }
+}
diff --git a/code/03-01/src/binder/statement/mod.rs b/code/03-01/src/binder/statement/mod.rs
new file mode 100644
index 0000000..03bf861
--- /dev/null
+++ b/code/03-01/src/binder/statement/mod.rs
@@ -0,0 +1,9 @@
+use super::*;
+
+mod create_table;
+mod insert;
+mod select;
+
+pub use self::create_table::*;
+pub use self::insert::*;
+pub use self::select::*;
diff --git a/code/03-01/src/binder/statement/select.rs b/code/03-01/src/binder/statement/select.rs
new file mode 100644
index 0000000..ba475cf
--- /dev/null
+++ b/code/03-01/src/binder/statement/select.rs
@@ -0,0 +1,61 @@
+use super::*;
+use crate::binder::BoundTableRef;
+use crate::parser::{Query, SelectItem, SetExpr};
+
+/// A bound `SELECT` statement.
+#[derive(Debug, PartialEq, Clone)]
+pub struct BoundSelect {
+    pub select_list: Vec<BoundExpr>,
+    pub from_list: Vec<BoundTableRef>,
+}
+
+impl Binder {
+    pub fn bind_select(&mut self, query: &Query) -> Result<BoundSelect, BindError> {
+        let select = match &query.body {
+            SetExpr::Select(select) => &**select,
+            _ => todo!("not select"),
+        };
+
+        let mut from_list = vec![];
+        assert!(select.from.len() <= 1, "multiple tables are not supported");
+        for table_with_join in select.from.iter() {
+            let table_ref = self.bind_table_with_joins(table_with_join)?;
+            from_list.push(table_ref);
+        }
+
+        assert!(select.selection.is_none(), "WHERE clause is not supported");
+        assert!(
+            query.order_by.is_empty(),
+            "ORDER BY clause is not supported"
+        );
+        assert!(query.limit.is_none(), "LIMIT clause is not supported");
+        assert!(query.offset.is_none(), "OFFSET clause is not supported");
+        assert!(
+            select.group_by.is_empty(),
+            "GROUP BY clause is not supported"
+        );
+        assert!(!select.distinct, "DISTINCT is not supported");
+
+        // Bind the select list.
+        let mut select_list = vec![];
+        for item in select.projection.iter() {
+            match item {
+                SelectItem::UnnamedExpr(expr) => {
+                    select_list.push(self.bind_expr(expr)?);
+                }
+                SelectItem::ExprWithAlias { expr, .. } => {
+                    select_list.push(self.bind_expr(expr)?);
+                }
+                SelectItem::Wildcard => {
+                    select_list.extend(self.bind_all_column_refs()?);
+                }
+                _ => todo!("not supported select item: {:?}", item),
+            }
+        }
+
+        Ok(BoundSelect {
+            select_list,
+            from_list,
+        })
+    }
+}
diff --git a/code/03-01/src/binder/table_ref/mod.rs b/code/03-01/src/binder/table_ref/mod.rs
new file mode 100644
index 0000000..ad9ead6
--- /dev/null
+++ b/code/03-01/src/binder/table_ref/mod.rs
@@ -0,0 +1,36 @@
+use super::*;
+use crate::parser::{TableFactor, TableWithJoins};
+
+/// A bound table reference.
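+///
+/// e.g. binding `FROM t` records the `TableRefId` of `t` together with all of its column ids.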
+#[derive(Debug, PartialEq, Clone)]
+pub struct BoundTableRef {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+}
+
+impl Binder {
+    pub fn bind_table_with_joins(
+        &mut self,
+        table: &TableWithJoins,
+    ) -> Result<BoundTableRef, BindError> {
+        assert!(table.joins.is_empty(), "JOIN is not supported");
+
+        let (name, alias) = match &table.relation {
+            TableFactor::Table { name, alias, .. } => (name, alias),
+            r => panic!("not supported table factor: {:?}", r),
+        };
+        let (table_ref_id, _, columns) = self.bind_table_columns(name, &[])?;
+        let alias = match alias {
+            Some(alias) => &alias.name.value,
+            None => split_name(name).unwrap().1,
+        };
+        if self.tables.contains_key(alias) {
+            return Err(BindError::DuplicatedAlias(alias.into()));
+        }
+        self.tables.insert(alias.into(), table_ref_id);
+        Ok(BoundTableRef {
+            table_ref_id,
+            column_ids: columns.iter().map(|col| col.id()).collect(),
+        })
+    }
+}
diff --git a/code/03-01/src/catalog b/code/03-01/src/catalog
deleted file mode 120000
index e98b018..0000000
--- a/code/03-01/src/catalog
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/catalog/
\ No newline at end of file
diff --git a/code/03-01/src/catalog/column.rs b/code/03-01/src/catalog/column.rs
new file mode 100644
index 0000000..abb6683
--- /dev/null
+++ b/code/03-01/src/catalog/column.rs
@@ -0,0 +1,94 @@
+use super::*;
+use crate::types::DataType;
+
+/// The descriptor of a column.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ColumnDesc {
+    datatype: DataType,
+    is_primary: bool,
+}
+
+impl ColumnDesc {
+    pub const fn new(datatype: DataType, is_primary: bool) -> Self {
+        ColumnDesc {
+            datatype,
+            is_primary,
+        }
+    }
+
+    pub fn is_primary(&self) -> bool {
+        self.is_primary
+    }
+
+    pub fn is_nullable(&self) -> bool {
+        self.datatype.is_nullable()
+    }
+
+    pub fn datatype(&self) -> &DataType {
+        &self.datatype
+    }
+}
+
+impl DataType {
+    pub const fn to_column(self) -> ColumnDesc {
+        ColumnDesc::new(self, false)
+    }
+
+    pub const fn to_column_primary_key(self) -> ColumnDesc {
+        ColumnDesc::new(self, true)
+    }
+}
+
+/// The catalog of a column.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ColumnCatalog {
+    id: ColumnId,
+    name: String,
+    desc: ColumnDesc,
+}
+
+impl ColumnCatalog {
+    pub(super) fn new(id: ColumnId, name: String, desc: ColumnDesc) -> ColumnCatalog {
+        ColumnCatalog { id, name, desc }
+    }
+
+    pub fn id(&self) -> ColumnId {
+        self.id
+    }
+
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    pub fn desc(&self) -> &ColumnDesc {
+        &self.desc
+    }
+
+    pub fn datatype(&self) -> DataType {
+        self.desc.datatype.clone()
+    }
+
+    pub fn is_primary(&self) -> bool {
+        self.desc.is_primary()
+    }
+
+    pub fn is_nullable(&self) -> bool {
+        self.desc.is_nullable()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{DataTypeExt, DataTypeKind};
+
+    #[test]
+    fn test_column_catalog() {
+        let col_desc = DataTypeKind::Int(None).not_null().to_column();
+        let col_catalog = ColumnCatalog::new(0, "grade".into(), col_desc);
+        assert_eq!(col_catalog.id(), 0);
+        assert!(!col_catalog.is_primary());
+        assert!(!col_catalog.is_nullable());
+        assert_eq!(col_catalog.name(), "grade");
+    }
+}
diff --git a/code/03-01/src/catalog/database.rs b/code/03-01/src/catalog/database.rs
new file mode 100644
index 0000000..e1fca34
--- /dev/null
+++ b/code/03-01/src/catalog/database.rs
@@ -0,0 +1,79 @@
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+use super::*;
+
+/// The catalog of a database.
+pub struct DatabaseCatalog {
+    inner: Mutex<Inner>,
+}
+
+#[derive(Default)]
+struct Inner {
+    schema_idxs: HashMap<String, SchemaId>,
+    schemas: HashMap<SchemaId, Arc<SchemaCatalog>>,
+    next_schema_id: SchemaId,
+}
+
+impl Default for DatabaseCatalog {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DatabaseCatalog {
+    pub fn new() -> Self {
+        let db_catalog = DatabaseCatalog {
+            inner: Mutex::new(Inner::default()),
+        };
+        db_catalog.add_schema(DEFAULT_SCHEMA_NAME).unwrap();
+        db_catalog
+    }
+
+    pub fn add_schema(&self, name: &str) -> Result<SchemaId, CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        if inner.schema_idxs.contains_key(name) {
+            return Err(CatalogError::Duplicated("schema", name.into()));
+        }
+        let id = inner.next_schema_id;
+        inner.next_schema_id += 1;
+        let schema_catalog = Arc::new(SchemaCatalog::new(id, name.into()));
+        inner.schema_idxs.insert(name.into(), id);
+        inner.schemas.insert(id, schema_catalog);
+        Ok(id)
+    }
+
+    pub fn del_schema(&self, name: &str) -> Result<(), CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        let id = inner
+            .schema_idxs
+            .remove(name)
+            .ok_or_else(|| CatalogError::NotFound("schema", name.into()))?;
+        inner.schemas.remove(&id);
+        Ok(())
+    }
+
+    pub fn all_schemas(&self) -> HashMap<SchemaId, Arc<SchemaCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner.schemas.clone()
+    }
+
+    pub fn get_schema(&self, schema_id: SchemaId) -> Option<Arc<SchemaCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner.schemas.get(&schema_id).cloned()
+    }
+
+    pub fn get_schema_by_name(&self, name: &str) -> Option<Arc<SchemaCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner
+            .schema_idxs
+            .get(name)
+            .and_then(|id| inner.schemas.get(id))
+            .cloned()
+    }
+
+    pub fn get_table(&self, table_ref_id: TableRefId) -> Option<Arc<TableCatalog>> {
+        let schema = self.get_schema(table_ref_id.schema_id)?;
+        schema.get_table(table_ref_id.table_id)
+    }
+}
diff --git a/code/03-01/src/catalog/mod.rs b/code/03-01/src/catalog/mod.rs
new file mode 100644
index 0000000..594a43f
--- /dev/null
+++ b/code/03-01/src/catalog/mod.rs
@@ -0,0 +1,85 @@
+//! The metadata of all database objects.
+//!
+//! The hierarchy of the catalog is: [Database] - [Schema] - [Table] - [Column].
+//!
+//! There is a default schema `postgres` in it.
+//!
+//! [Database]: DatabaseCatalog
+//! [Schema]: SchemaCatalog
+//! [Table]: TableCatalog
+//! [Column]: ColumnCatalog
+
+use std::sync::Arc;
+
+mod column;
+mod database;
+mod schema;
+mod table;
+
+pub use self::column::*;
+pub use self::database::*;
+pub use self::schema::*;
+pub use self::table::*;
+
+/// The type of catalog reference.
+pub type CatalogRef = Arc<DatabaseCatalog>;
+/// The type of schema ID.
+pub type SchemaId = u32;
+/// The type of table ID.
+pub type TableId = u32;
+/// The type of column ID.
+pub type ColumnId = u32;
+
+/// The name of the default schema: `postgres`.
+pub const DEFAULT_SCHEMA_NAME: &str = "postgres";
+
+/// The reference ID of a table.
+#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
+pub struct TableRefId {
+    pub schema_id: SchemaId,
+    pub table_id: TableId,
+}
+
+impl TableRefId {
+    pub const fn new(schema_id: SchemaId, table_id: TableId) -> Self {
+        TableRefId {
+            schema_id,
+            table_id,
+        }
+    }
+}
+
+/// The reference ID of a column.
+#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
+pub struct ColumnRefId {
+    pub schema_id: SchemaId,
+    pub table_id: TableId,
+    pub column_id: ColumnId,
+}
+
+impl ColumnRefId {
+    pub const fn from_table(table: TableRefId, column_id: ColumnId) -> Self {
+        ColumnRefId {
+            schema_id: table.schema_id,
+            table_id: table.table_id,
+            column_id,
+        }
+    }
+
+    pub const fn new(schema_id: SchemaId, table_id: TableId, column_id: ColumnId) -> Self {
+        ColumnRefId {
+            schema_id,
+            table_id,
+            column_id,
+        }
+    }
+}
+
+/// The error type of catalog operations.
+#[derive(thiserror::Error, Debug)]
+pub enum CatalogError {
+    #[error("{0} not found: {1}")]
+    NotFound(&'static str, String),
+    #[error("duplicated {0}: {1}")]
+    Duplicated(&'static str, String),
+}
diff --git a/code/03-01/src/catalog/schema.rs b/code/03-01/src/catalog/schema.rs
new file mode 100644
index 0000000..d30c0df
--- /dev/null
+++ b/code/03-01/src/catalog/schema.rs
@@ -0,0 +1,88 @@
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+use super::*;
+
+/// The catalog of a schema.
+pub struct SchemaCatalog {
+    id: SchemaId,
+    inner: Mutex<Inner>,
+}
+
+struct Inner {
+    name: String,
+    table_idxs: HashMap<String, TableId>,
+    tables: HashMap<TableId, Arc<TableCatalog>>,
+    next_table_id: TableId,
+}
+
+impl SchemaCatalog {
+    pub(super) fn new(id: SchemaId, name: String) -> SchemaCatalog {
+        SchemaCatalog {
+            id,
+            inner: Mutex::new(Inner {
+                name,
+                table_idxs: HashMap::new(),
+                tables: HashMap::new(),
+                next_table_id: 0,
+            }),
+        }
+    }
+
+    pub fn id(&self) -> SchemaId {
+        self.id
+    }
+
+    pub fn name(&self) -> String {
+        let inner = self.inner.lock().unwrap();
+        inner.name.clone()
+    }
+
+    pub fn add_table(&self, name: &str) -> Result<TableId, CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        if inner.table_idxs.contains_key(name) {
+            return Err(CatalogError::Duplicated("table", name.into()));
+        }
+        let id = inner.next_table_id;
+        inner.next_table_id += 1;
+        let table_catalog = Arc::new(TableCatalog::new(id, name.into()));
+        inner.table_idxs.insert(name.into(), id);
+        inner.tables.insert(id, table_catalog);
+        Ok(id)
+    }
+
+    pub fn del_table_by_name(&self, name: &str) -> Result<(), CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        let id = inner
+            .table_idxs
+            .remove(name)
+            .ok_or_else(|| CatalogError::NotFound("table", name.into()))?;
+        inner.tables.remove(&id);
+        Ok(())
+    }
+
+    pub fn del_table(&self, id: TableId) {
+        let mut inner = self.inner.lock().unwrap();
+        let catalog = inner.tables.remove(&id).unwrap();
+        inner.table_idxs.remove(&catalog.name()).unwrap();
+    }
+
+    pub fn all_tables(&self) -> HashMap<TableId, Arc<TableCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner.tables.clone()
+    }
+
+    pub fn get_table(&self, table_id: TableId) -> Option<Arc<TableCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner.tables.get(&table_id).cloned()
+    }
+
+    pub fn get_table_by_name(&self, name: &str) -> Option<Arc<TableCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner
+            .table_idxs
+            .get(name)
+            .and_then(|id| inner.tables.get(id))
+            .cloned()
+    }
+}
diff --git a/code/03-01/src/catalog/table.rs b/code/03-01/src/catalog/table.rs
new file mode 100644
index 0000000..2422457
--- /dev/null
+++ b/code/03-01/src/catalog/table.rs
@@ -0,0 +1,111 @@
+use std::collections::{BTreeMap, HashMap};
+use std::sync::Mutex;
+
+use super::*;
+
+/// The catalog of a table.
+pub struct TableCatalog {
+    id: TableId,
+    inner: Mutex<Inner>,
+}
+
+struct Inner {
+    name: String,
+    /// Mapping from column names to column ids
+    column_idxs: HashMap<String, ColumnId>,
+    columns: BTreeMap<ColumnId, ColumnCatalog>,
+    next_column_id: ColumnId,
+}
+
+impl TableCatalog {
+    pub(super) fn new(id: TableId, name: String) -> TableCatalog {
+        TableCatalog {
+            id,
+            inner: Mutex::new(Inner {
+                name,
+                column_idxs: HashMap::new(),
+                columns: BTreeMap::new(),
+                next_column_id: 0,
+            }),
+        }
+    }
+
+    pub fn id(&self) -> TableId {
+        self.id
+    }
+
+    pub fn name(&self) -> String {
+        let inner = self.inner.lock().unwrap();
+        inner.name.clone()
+    }
+
+    pub fn add_column(&self, name: &str, desc: ColumnDesc) -> Result<ColumnId, CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        if inner.column_idxs.contains_key(name) {
+            return Err(CatalogError::Duplicated("column", name.into()));
+        }
+        let id = inner.next_column_id;
+        inner.next_column_id += 1;
+        inner.column_idxs.insert(name.into(), id);
+        inner
+            .columns
+            .insert(id, ColumnCatalog::new(id, name.into(), desc));
+        Ok(id)
+    }
+
+    pub fn contains_column(&self, name: &str) -> bool {
+        let inner = self.inner.lock().unwrap();
+        inner.column_idxs.contains_key(name)
+    }
+
+    pub fn all_columns(&self) -> BTreeMap<ColumnId, ColumnCatalog> {
+        let inner = self.inner.lock().unwrap();
+        inner.columns.clone()
+    }
+
+    pub fn get_column(&self, id: ColumnId) -> Option<ColumnCatalog> {
+        let inner = self.inner.lock().unwrap();
+        inner.columns.get(&id).cloned()
+    }
+
+    pub fn get_column_by_name(&self, name: &str) -> Option<ColumnCatalog> {
+        let inner = self.inner.lock().unwrap();
+        inner
+            .column_idxs
+            .get(name)
+            .and_then(|id| inner.columns.get(id))
+            .cloned()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{DataTypeExt, DataTypeKind};
+
+    #[test]
+    fn test_table_catalog() {
+        let table_catalog = TableCatalog::new(0, "t".into());
+        table_catalog
+            .add_column("a", DataTypeKind::Int(None).not_null().to_column())
+            .unwrap();
+        table_catalog
+            .add_column("b", DataTypeKind::Boolean.not_null().to_column())
+            .unwrap();
+
+        assert!(!table_catalog.contains_column("c"));
+        assert!(table_catalog.contains_column("a"));
+        assert!(table_catalog.contains_column("b"));
+
+        assert_eq!(table_catalog.get_column_by_name("a").unwrap().id(), 0);
+        assert_eq!(table_catalog.get_column_by_name("b").unwrap().id(), 1);
+
+        let col0_catalog = table_catalog.get_column(0).unwrap();
+        assert_eq!(col0_catalog.name(), "a");
+        assert_eq!(col0_catalog.datatype().kind(), DataTypeKind::Int(None));
+
+        let col1_catalog = table_catalog.get_column(1).unwrap();
+        assert_eq!(col1_catalog.name(), "b");
+        assert_eq!(col1_catalog.datatype().kind(), DataTypeKind::Boolean);
+    }
+}
diff --git a/code/03-01/src/db.rs b/code/03-01/src/db.rs
deleted file mode 120000
index ef0ca75..0000000
--- a/code/03-01/src/db.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/db.rs
\ No newline at end of file
diff --git a/code/03-01/src/db.rs b/code/03-01/src/db.rs
new file mode 100644
index 0000000..5b29739
--- /dev/null
+++ b/code/03-01/src/db.rs
@@ -0,0 +1,93 @@
+//! Top-level structure of the database.
+
+use std::sync::Arc;
+
+use futures::TryStreamExt;
+use tokio::runtime::Runtime;
+
+use crate::array::DataChunk;
+use crate::binder::{BindError, Binder};
+use crate::catalog::{CatalogRef, DatabaseCatalog};
+use crate::executor::{ExecuteError, ExecutorBuilder};
+use crate::logical_planner::{LogicalPlanError, LogicalPlanner};
+use crate::parser::{parse, ParserError};
+use crate::physical_planner::{PhysicalPlanError, PhysicalPlanner};
+use crate::storage::DiskStorage;
+
+/// The database instance.
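+///
+/// A usage sketch (illustrative):
+///
+/// ```ignore
+/// let db = Database::new();
+/// let chunks = db.run("create table t (v1 int); insert into t values (1)")?;
+/// ```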
+pub struct Database {
+    catalog: CatalogRef,
+    executor_builder: ExecutorBuilder,
+    runtime: Runtime,
+}
+
+impl Default for Database {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Database {
+    /// Create a new database instance.
+    pub fn new() -> Self {
+        let catalog = Arc::new(DatabaseCatalog::new());
+        let storage = Arc::new(DiskStorage::new());
+        let parallel = matches!(std::env::var("LIGHT_PARALLEL"), Ok(s) if s == "1");
+        let runtime = if parallel {
+            tokio::runtime::Builder::new_multi_thread()
+        } else {
+            tokio::runtime::Builder::new_current_thread()
+        }
+        .build()
+        .expect("failed to create tokio runtime");
+        let handle = parallel.then(|| runtime.handle().clone());
+        Database {
+            catalog: catalog.clone(),
+            executor_builder: ExecutorBuilder::new(catalog, storage, handle),
+            runtime,
+        }
+    }
+
+    /// Run SQL queries and return the outputs.
+    pub fn run(&self, sql: &str) -> Result<Vec<DataChunk>, Error> {
+        // parse
+        let stmts = parse(sql)?;
+
+        let mut outputs = vec![];
+        for stmt in stmts {
+            let mut binder = Binder::new(self.catalog.clone());
+            let logical_planner = LogicalPlanner::default();
+            let physical_planner = PhysicalPlanner::default();
+
+            let bound_stmt = binder.bind(&stmt)?;
+            debug!("{:#?}", bound_stmt);
+            let logical_plan = logical_planner.plan(bound_stmt)?;
+            debug!("{:#?}", logical_plan);
+            let physical_plan = physical_planner.plan(&logical_plan)?;
+            debug!("{:#?}", physical_plan);
+            let mut executor = self.executor_builder.build(physical_plan);
+            self.runtime.block_on(async {
+                while let Some(chunk) = executor.try_next().await? {
+                    outputs.push(chunk);
+                }
+                Ok(()) as Result<(), Error>
+            })?;
+        }
+        Ok(outputs)
+    }
+}
+
+/// The error type of database operations.
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("parse error: {0}")]
+    Parse(#[from] ParserError),
+    #[error("bind error: {0}")]
+    Bind(#[from] BindError),
+    #[error("logical plan error: {0}")]
+    LogicalPlan(#[from] LogicalPlanError),
+    #[error("physical plan error: {0}")]
+    PhysicalPlan(#[from] PhysicalPlanError),
+    #[error("execute error: {0}")]
+    Execute(#[from] ExecuteError),
+}
diff --git a/code/03-01/src/executor b/code/03-01/src/executor
deleted file mode 120000
index 991a7a7..0000000
--- a/code/03-01/src/executor
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/executor/
\ No newline at end of file
diff --git a/code/03-01/src/executor/create.rs b/code/03-01/src/executor/create.rs
new file mode 100644
index 0000000..b06a81b
--- /dev/null
+++ b/code/03-01/src/executor/create.rs
@@ -0,0 +1,29 @@
+use super::*;
+use crate::catalog::TableRefId;
+use crate::physical_planner::PhysicalCreateTable;
+
+/// The executor of `CREATE TABLE` statement.
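+///
+/// It first registers the table and its columns in the catalog, then creates the
+/// corresponding table in the storage layer.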
+pub struct CreateTableExecutor {
+    pub plan: PhysicalCreateTable,
+    pub catalog: CatalogRef,
+    pub storage: StorageRef,
+}
+
+impl CreateTableExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        let schema = self.catalog.get_schema(self.plan.schema_id).unwrap();
+        let table_id = schema.add_table(&self.plan.table_name).unwrap();
+        let table = schema.get_table(table_id).unwrap();
+        let mut column_descs = vec![];
+        for (name, desc) in &self.plan.columns {
+            table.add_column(name, desc.clone()).unwrap();
+            column_descs.push(desc.clone());
+        }
+        self.storage.add_table(
+            TableRefId::new(self.plan.schema_id, table_id),
+            &column_descs,
+        )?;
+        yield DataChunk::single(1);
+    }
+}
diff --git a/code/03-01/src/executor/dummy.rs b/code/03-01/src/executor/dummy.rs
new file mode 100644
index 0000000..f6e5ed4
--- /dev/null
+++ b/code/03-01/src/executor/dummy.rs
@@ -0,0 +1,11 @@
+use super::*;
+
+/// A dummy executor that produces a single value.
+pub struct DummyExecutor;
+
+impl DummyExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        yield DataChunk::single(0);
+    }
+}
diff --git a/code/03-01/src/executor/evaluator.rs b/code/03-01/src/executor/evaluator.rs
new file mode 100644
index 0000000..52d10c8
--- /dev/null
+++ b/code/03-01/src/executor/evaluator.rs
@@ -0,0 +1,40 @@
+use crate::array::*;
+use crate::binder::BoundExpr;
+use crate::executor::ExecuteError;
+use crate::types::DataValue;
+
+impl BoundExpr {
+    /// Evaluate the given expression as a constant value.
+    ///
+    /// This method is used in the evaluation of `insert values` and the optimizer.
+    pub fn eval_const(&self) -> Result<DataValue, ExecuteError> {
+        match &self {
+            Self::Constant(v) => Ok(v.clone()),
+            Self::ColumnRef(_) => panic!("can not evaluate on ColumnRef"),
+        }
+    }
+
+    /// Evaluate the given expression as an array.
+    pub fn eval_array(&self, chunk: &DataChunk) -> Result<ArrayImpl, ExecuteError> {
+        match &self {
+            // NOTE:
+            // Currently we assume that the column id is equal to its physical index in the
+            // DataChunk. It is true in a simple `SELECT v FROM t` case, where the child plan of the
+            // Projection is Get. However, in a more complex case with join or aggregation, this
+            // assumption no longer holds. At that time we will convert the ColumnRef into an
+            // InputRef, and resolve the physical index from column id.
+            Self::ColumnRef(v) => Ok(chunk.arrays()[v.column_ref_id.column_id as usize].clone()),
+            Self::Constant(v) => {
+                let mut builder = ArrayBuilderImpl::with_capacity(
+                    chunk.cardinality(),
+                    &self.return_type().unwrap(),
+                );
+                // TODO: optimize this
+                for _ in 0..chunk.cardinality() {
+                    builder.push(v);
+                }
+                Ok(builder.finish())
+            }
+        }
+    }
+}
diff --git a/code/03-01/src/executor/explain.rs b/code/03-01/src/executor/explain.rs
new file mode 100644
index 0000000..3a2c6b5
--- /dev/null
+++ b/code/03-01/src/executor/explain.rs
@@ -0,0 +1,19 @@
+use super::*;
+use crate::array::ArrayImpl;
+use crate::physical_planner::PhysicalPlan;
+
+/// The executor of `EXPLAIN` statement.
diff --git a/code/03-01/src/executor/explain.rs b/code/03-01/src/executor/explain.rs
new file mode 100644
index 0000000..3a2c6b5
--- /dev/null
+++ b/code/03-01/src/executor/explain.rs
@@ -0,0 +1,19 @@
+use super::*;
+use crate::array::ArrayImpl;
+use crate::physical_planner::PhysicalPlan;
+
+/// The executor of `EXPLAIN` statement.
+pub struct ExplainExecutor {
+    pub plan: Box<PhysicalPlan>,
+}
+
+impl ExplainExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        let explain_result = format!("{}", *self.plan);
+        let chunk = DataChunk::from_iter([ArrayImpl::Utf8(
+            [Some(explain_result)].into_iter().collect(),
+        )]);
+        yield chunk;
+    }
+}
diff --git a/code/03-01/src/executor/insert.rs b/code/03-01/src/executor/insert.rs
new file mode 100644
index 0000000..f169932
--- /dev/null
+++ b/code/03-01/src/executor/insert.rs
@@ -0,0 +1,71 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::array::{ArrayBuilderImpl, DataChunk};
+use crate::catalog::{ColumnId, TableRefId};
+use crate::types::{DataType, DataValue};
+
+/// The executor of `INSERT` statement.
+pub struct InsertExecutor {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub catalog: CatalogRef,
+    pub storage: StorageRef,
+    pub child: BoxedExecutor,
+}
+
+impl InsertExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        let table = self.storage.get_table(self.table_ref_id)?;
+        let catalog = self.catalog.get_table(self.table_ref_id).unwrap();
+        // Describe each column of the output chunks.
+        // example:
+        //    columns = [0: Int, 1: Bool, 3: Float, 4: String]
+        //    column_ids = [4, 1]
+        // => output_columns = [Null(Int), Pick(1), Null(Float), Pick(0)]
+        let output_columns = catalog
+            .all_columns()
+            .values()
+            .map(
+                |col| match self.column_ids.iter().position(|&id| id == col.id()) {
+                    Some(index) => Column::Pick { index },
+                    None => Column::Null {
+                        type_: col.datatype(),
+                    },
+                },
+            )
+            .collect_vec();
+        let mut count = 0;
+        #[for_await]
+        for chunk in self.child {
+            let chunk = transform_chunk(chunk?, &output_columns);
+            count += chunk.cardinality();
+            table.append(chunk).await?;
+        }
+        yield DataChunk::single(count as i32);
+    }
+}
+
+enum Column {
+    /// Pick the column at `index` from child.
+    Pick { index: usize },
+    /// Null values with `type`.
+    Null { type_: DataType },
+}
+
+fn transform_chunk(chunk: DataChunk, output_columns: &[Column]) -> DataChunk {
+    output_columns
+        .iter()
+        .map(|col| match col {
+            Column::Pick { index } => chunk.arrays()[*index].clone(),
+            Column::Null { type_ } => {
+                let mut builder = ArrayBuilderImpl::with_capacity(chunk.cardinality(), type_);
+                for _ in 0..chunk.cardinality() {
+                    builder.push(&DataValue::Null);
+                }
+                builder.finish()
+            }
+        })
+        .collect()
+}
diff --git a/code/03-01/src/executor/mod.rs b/code/03-01/src/executor/mod.rs
new file mode 100644
index 0000000..9626fbc
--- /dev/null
+++ b/code/03-01/src/executor/mod.rs
@@ -0,0 +1,122 @@
+//! Execute the queries.
+
+use futures::stream::{BoxStream, StreamExt};
+use futures_async_stream::try_stream;
+
+use crate::array::DataChunk;
+use crate::catalog::CatalogRef;
+use crate::physical_planner::PhysicalPlan;
+use crate::storage::{StorageError, StorageRef};
+
+mod create;
+mod dummy;
+mod evaluator;
+mod explain;
+mod insert;
+mod projection;
+mod seq_scan;
+mod values;
+
+use self::create::*;
+use self::dummy::*;
+use self::explain::*;
+use self::insert::*;
+use self::projection::*;
+use self::seq_scan::*;
+use self::values::*;
+
+/// The maximum chunk length produced by executor at a time.
+const PROCESSING_WINDOW_SIZE: usize = 1024;
+
+/// The error type of execution.
+#[derive(thiserror::Error, Debug)]
+pub enum ExecuteError {
+    #[error("storage error: {0}")]
+    Storage(#[from] StorageError),
+}
+
+/// A type-erased executor object.
+///
+/// Logically an executor is a stream of data chunks.
+///
+/// It consumes one or more streams from its child executors,
+/// and produces a stream to its parent.
+pub type BoxedExecutor = BoxStream<'static, Result<DataChunk, ExecuteError>>;
+
+/// The builder of executor.
+pub struct ExecutorBuilder {
+    catalog: CatalogRef,
+    storage: StorageRef,
+    /// An optional runtime handle.
+    ///
+    /// If it is some, spawn the executor to runtime and return a channel receiver.
+    handle: Option<tokio::runtime::Handle>,
+}
+
+impl ExecutorBuilder {
+    /// Create a new executor builder.
+    pub fn new(
+        catalog: CatalogRef,
+        storage: StorageRef,
+        handle: Option<tokio::runtime::Handle>,
+    ) -> ExecutorBuilder {
+        ExecutorBuilder {
+            catalog,
+            storage,
+            handle,
+        }
+    }
+
+    /// Build executor from a [PhysicalPlan].
+    pub fn build(&self, plan: PhysicalPlan) -> BoxedExecutor {
+        use PhysicalPlan::*;
+        let mut executor: BoxedExecutor = match plan {
+            PhysicalCreateTable(plan) => CreateTableExecutor {
+                plan,
+                catalog: self.catalog.clone(),
+                storage: self.storage.clone(),
+            }
+            .execute(),
+            PhysicalInsert(plan) => InsertExecutor {
+                table_ref_id: plan.table_ref_id,
+                column_ids: plan.column_ids,
+                catalog: self.catalog.clone(),
+                storage: self.storage.clone(),
+                child: self.build(*plan.child),
+            }
+            .execute(),
+            PhysicalValues(plan) => ValuesExecutor {
+                column_types: plan.column_types,
+                values: plan.values,
+            }
+            .execute(),
+            PhysicalExplain(plan) => ExplainExecutor { plan: plan.child }.execute(),
+            PhysicalDummy(_) => DummyExecutor.execute(),
+            PhysicalSeqScan(plan) => SeqScanExecutor {
+                table_ref_id: plan.table_ref_id,
+                column_ids: plan.column_ids,
+                storage: self.storage.clone(),
+            }
+            .execute(),
+            PhysicalProjection(plan) => ProjectionExecutor {
+                exprs: plan.exprs,
+                child: self.build(*plan.child),
+            }
+            .execute(),
+        };
+        if let Some(handle) = &self.handle {
+            // In parallel mode, we spawn the executor into the current tokio runtime,
+            // connect it with a channel, and return the receiver as an executor.
+            // Therefore, when used with tokio multi-thread runtime, they can run in parallel.
+            let (tx, rx) = tokio::sync::mpsc::channel(1);
+            handle.spawn(async move {
+                while let Some(e) = executor.next().await {
+                    tx.send(e).await.unwrap();
+                }
+            });
+            tokio_stream::wrappers::ReceiverStream::new(rx).boxed()
+        } else {
+            executor
+        }
+    }
+}
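
The spawn-plus-channel trick in `build` is a general way to pipeline stream stages onto a runtime. A self-contained sketch with plain integers instead of `DataChunk`s (it assumes only the tokio, futures, and tokio-stream dependencies already in Cargo.toml):

    use futures::stream::{self, StreamExt};

    #[tokio::main(flavor = "multi_thread")]
    async fn main() {
        let mut source = stream::iter(0..4).boxed();
        let (tx, rx) = tokio::sync::mpsc::channel(1);
        // The producer runs as its own task; the receiver is itself a stream.
        tokio::spawn(async move {
            while let Some(v) = source.next().await {
                tx.send(v).await.unwrap();
            }
        });
        let out: Vec<i32> = tokio_stream::wrappers::ReceiverStream::new(rx).collect().await;
        assert_eq!(out, vec![0, 1, 2, 3]);
    }
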
diff --git a/code/03-01/src/executor/projection.rs b/code/03-01/src/executor/projection.rs
new file mode 100644
index 0000000..038584d
--- /dev/null
+++ b/code/03-01/src/executor/projection.rs
@@ -0,0 +1,25 @@
+use super::*;
+use crate::array::DataChunk;
+use crate::binder::BoundExpr;
+
+/// The executor of project operation.
+pub struct ProjectionExecutor {
+    pub exprs: Vec<BoundExpr>,
+    pub child: BoxedExecutor,
+}
+
+impl ProjectionExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        #[for_await]
+        for batch in self.child {
+            let batch = batch?;
+            let chunk = self
+                .exprs
+                .iter()
+                .map(|expr| expr.eval_array(&batch))
+                .collect::<Result<DataChunk, _>>()?;
+            yield chunk;
+        }
+    }
+}
diff --git a/code/03-01/src/executor/seq_scan.rs b/code/03-01/src/executor/seq_scan.rs
new file mode 100644
index 0000000..63770f6
--- /dev/null
+++ b/code/03-01/src/executor/seq_scan.rs
@@ -0,0 +1,20 @@
+use super::*;
+use crate::array::DataChunk;
+use crate::catalog::{ColumnId, TableRefId};
+
+/// The executor of sequential scan operation.
+pub struct SeqScanExecutor {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub storage: StorageRef,
+}
+
+impl SeqScanExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        let table = self.storage.get_table(self.table_ref_id)?;
+        for chunk in table.all_chunks().await? {
+            yield chunk;
+        }
+    }
+}
diff --git a/code/03-01/src/executor/values.rs b/code/03-01/src/executor/values.rs
new file mode 100644
index 0000000..f0e3550
--- /dev/null
+++ b/code/03-01/src/executor/values.rs
@@ -0,0 +1,73 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::array::{ArrayBuilderImpl, DataChunk};
+use crate::binder::BoundExpr;
+use crate::types::DataType;
+
+/// The executor of `VALUES`.
+pub struct ValuesExecutor {
+    pub column_types: Vec<DataType>,
+    /// Each row is composed of multiple values, each value is represented by an expression.
+    pub values: Vec<Vec<BoundExpr>>,
+}
+
+impl ValuesExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        for chunk in self.values.chunks(PROCESSING_WINDOW_SIZE) {
+            // Create array builders.
+            let mut builders = self
+                .column_types
+                .iter()
+                .map(|ty| ArrayBuilderImpl::with_capacity(chunk.len(), ty))
+                .collect_vec();
+            // Push value into the builder.
+            for row in chunk {
+                for (expr, builder) in row.iter().zip(&mut builders) {
+                    let value = expr.eval_const()?;
+                    builder.push(&value);
+                }
+            }
+            // Finish build and yield chunk.
+            let chunk = builders
+                .into_iter()
+                .map(|builder| builder.finish())
+                .collect::<DataChunk>();
+            yield chunk;
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::array::ArrayImpl;
+    use crate::binder::BoundExpr;
+    use crate::types::{DataTypeExt, DataTypeKind, DataValue};
+
+    #[tokio::test]
+    async fn values() {
+        let values = [[0, 100], [1, 101], [2, 102], [3, 103]];
+        let mut executor = ValuesExecutor {
+            column_types: vec![DataTypeKind::Int(None).nullable(); 2],
+            values: values
+                .iter()
+                .map(|row| {
+                    row.iter()
+                        .map(|&v| BoundExpr::Constant(DataValue::Int32(v)))
+                        .collect::<Vec<BoundExpr>>()
+                })
+                .collect::<Vec<Vec<BoundExpr>>>(),
+        }
+        .execute();
+        let output = executor.next().await.unwrap().unwrap();
+        let expected = [
+            ArrayImpl::Int32((0..4).collect()),
+            ArrayImpl::Int32((100..104).collect()),
+        ]
+        .into_iter()
+        .collect::<DataChunk>();
+        assert_eq!(output, expected);
+    }
+}
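
Because `values` is sliced with `chunks(PROCESSING_WINDOW_SIZE)`, a long `VALUES` list is emitted as several bounded chunks rather than one huge allocation. The arithmetic, as a quick sketch:

    // 2500 rows with a 1024-row window produce chunks of 1024, 1024, and 452 rows.
    let rows: Vec<u32> = (0..2500).collect();
    let sizes: Vec<usize> = rows.chunks(1024).map(|c| c.len()).collect();
    assert_eq!(sizes, vec![1024, 1024, 452]);
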
diff --git a/code/03-01/src/lib.rs b/code/03-01/src/lib.rs
deleted file mode 120000
index 84f6551..0000000
--- a/code/03-01/src/lib.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/lib.rs
\ No newline at end of file
diff --git a/code/03-01/src/lib.rs b/code/03-01/src/lib.rs
new file mode 100644
index 0000000..f7704d4
--- /dev/null
+++ b/code/03-01/src/lib.rs
@@ -0,0 +1,36 @@
+//! RisingLight -- an educational OLAP database.
+
+#![deny(unused_must_use)]
+#![feature(generators)]
+
+// Enable macros for logging.
+#[macro_use]
+extern crate log;
+
+#[cfg(test)]
+mod test;
+
+// Top-level structure of the database.
+pub mod db;
+
+// Stage 1: Parse the SQL string into an Abstract Syntax Tree (AST).
+pub mod parser;
+
+// Stage 2: Resolve all expressions referring with their names.
+pub mod binder;
+
+// Stage 3: Transform the parse tree into a logical operations tree.
+pub mod logical_planner;
+
+// Stage 4: Transform the logical plan into the physical plan.
+pub mod physical_planner;
+
+// Stage 5: Execute the plans.
+pub mod executor;
+
+pub mod array;
+pub mod catalog;
+pub mod storage;
+pub mod types;
+
+pub use self::db::{Database, Error};
diff --git a/code/03-01/src/logical_planner b/code/03-01/src/logical_planner
deleted file mode 120000
index 80ac2b5..0000000
--- a/code/03-01/src/logical_planner
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/logical_planner
\ No newline at end of file
diff --git a/code/03-01/src/logical_planner/create.rs b/code/03-01/src/logical_planner/create.rs
new file mode 100644
index 0000000..beef26f
--- /dev/null
+++ b/code/03-01/src/logical_planner/create.rs
@@ -0,0 +1,41 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::binder::BoundCreateTable;
+use crate::catalog::{ColumnDesc, SchemaId};
+
+/// The logical plan of `CREATE TABLE`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalCreateTable {
+    pub schema_id: SchemaId,
+    pub table_name: String,
+    pub columns: Vec<(String, ColumnDesc)>,
+}
+
+impl LogicalPlanner {
+    pub fn plan_create_table(
+        &self,
+        stmt: BoundCreateTable,
+    ) -> Result<LogicalPlan, LogicalPlanError> {
+        Ok(LogicalCreateTable {
+            schema_id: stmt.schema_id,
+            table_name: stmt.table_name,
+            columns: stmt.columns,
+        }
+        .into())
+    }
+}
+
+impl Explain for LogicalCreateTable {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "CreateTable: name: {}, columns: [{}]",
+            self.table_name,
+            self.columns
+                .iter()
+                .map(|(name, col)| format!("{}: {:?}", name, col.datatype()))
+                .join(", ")
+        )
+    }
+}
diff --git a/code/03-01/src/logical_planner/explain.rs b/code/03-01/src/logical_planner/explain.rs
new file mode 100644
index 0000000..f44c4a4
--- /dev/null
+++ b/code/03-01/src/logical_planner/explain.rs
@@ -0,0 +1,22 @@
+use super::*;
+
+/// The logical plan of `EXPLAIN`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalExplain {
+    pub child: LogicalPlanRef,
+}
+
+impl LogicalPlanner {
+    pub fn plan_explain(&self, stmt: BoundStatement) -> Result<LogicalPlan, LogicalPlanError> {
+        Ok(LogicalExplain {
+            child: self.plan(stmt)?.into(),
+        }
+        .into())
+    }
+}
+
+impl Explain for LogicalExplain {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Huh, explain myself?")
+    }
+}
diff --git a/code/03-01/src/logical_planner/insert.rs b/code/03-01/src/logical_planner/insert.rs
new file mode 100644
index 0000000..41ed47f
--- /dev/null
+++ b/code/03-01/src/logical_planner/insert.rs
@@ -0,0 +1,56 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::binder::{BoundExpr, BoundInsert};
+use crate::catalog::{ColumnId, TableRefId};
+use crate::types::DataType;
+
+/// The logical plan of `INSERT`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalInsert {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub child: LogicalPlanRef,
+}
+
+/// The logical plan of `VALUES`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalValues {
+    pub column_types: Vec<DataType>,
+    pub values: Vec<Vec<BoundExpr>>,
+}
+
+impl LogicalPlanner {
+    pub fn plan_insert(&self, stmt: BoundInsert) -> Result<LogicalPlan, LogicalPlanError> {
+        Ok(LogicalInsert {
+            table_ref_id: stmt.table_ref_id,
+            column_ids: stmt.column_ids,
+            child: Rc::new(
+                LogicalValues {
+                    column_types: stmt.column_types,
+                    values: stmt.values,
+                }
+                .into(),
+            ),
+        }
+        .into())
+    }
+}
+
+impl Explain for LogicalInsert {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "Insert: table {}, columns [{}]",
+            self.table_ref_id.table_id,
+            self.column_ids.iter().map(ToString::to_string).join(", ")
+        )?;
+        self.child.explain(level + 1, f)
+    }
+}
+
+impl Explain for LogicalValues {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Values: {} rows", self.values.len())
+    }
+}
diff --git a/code/03-01/src/logical_planner/mod.rs b/code/03-01/src/logical_planner/mod.rs
new file mode 100644
index 0000000..0080fc1
--- /dev/null
+++ b/code/03-01/src/logical_planner/mod.rs
@@ -0,0 +1,68 @@
+use std::rc::Rc;
+
+use enum_dispatch::enum_dispatch;
+
+use crate::binder::BoundStatement;
+
+mod create;
+mod explain;
+mod insert;
+mod select;
+
+pub use self::create::*;
+pub use self::explain::*;
+pub use self::insert::*;
+pub use self::select::*;
+
+/// The logical plan.
+#[enum_dispatch(Explain)]
+#[derive(Debug, PartialEq, Clone)]
+pub enum LogicalPlan {
+    LogicalCreateTable,
+    LogicalInsert,
+    LogicalValues,
+    LogicalExplain,
+    LogicalDummy,
+    LogicalGet,
+    LogicalProjection,
+}
+
+/// The reference type of logical plan.
+pub type LogicalPlanRef = Rc<LogicalPlan>;
+
+impl std::fmt::Display for LogicalPlan {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.explain(0, f)
+    }
+}
+
+/// Logical planner transforms the AST into a logical operations tree.
+#[derive(Default)]
+pub struct LogicalPlanner;
+
+/// The error type of logical planner.
+#[derive(thiserror::Error, Debug, PartialEq)]
+pub enum LogicalPlanError {}
+
+impl LogicalPlanner {
+    /// Generate [`LogicalPlan`] from a [`BoundStatement`].
+    pub fn plan(&self, stmt: BoundStatement) -> Result<LogicalPlan, LogicalPlanError> {
+        match stmt {
+            BoundStatement::CreateTable(stmt) => self.plan_create_table(stmt),
+            BoundStatement::Insert(stmt) => self.plan_insert(stmt),
+            BoundStatement::Explain(stmt) => self.plan_explain(*stmt),
+            BoundStatement::Select(stmt) => self.plan_select(stmt),
+        }
+    }
+}
+
+/// Format a plan in `EXPLAIN` statement.
+#[enum_dispatch]
+pub trait Explain {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result;
+
+    fn explain(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", " ".repeat(level))?;
+        self.explain_inner(level, f)
+    }
+}
diff --git a/code/03-01/src/logical_planner/select.rs b/code/03-01/src/logical_planner/select.rs
new file mode 100644
index 0000000..0d9cb45
--- /dev/null
+++ b/code/03-01/src/logical_planner/select.rs
@@ -0,0 +1,73 @@
+//! Logical planner of `select` statement.
+//!
+//! A `select` statement will be planned to a compose of:
+//!
+//! - [`LogicalGet`] (from *) or [`LogicalDummy`] (no from)
+//! - [`LogicalProjection`] (select *)
+
+use super::*;
+use crate::binder::{BoundExpr, BoundSelect};
+use crate::catalog::{ColumnId, TableRefId};
+
+/// The logical plan of dummy get.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalDummy;
+
+/// The logical plan of get.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalGet {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+}
+
+/// The logical plan of projection.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalProjection {
+    pub exprs: Vec<BoundExpr>,
+    pub child: LogicalPlanRef,
+}
+
+impl LogicalPlanner {
+    pub fn plan_select(&self, stmt: BoundSelect) -> Result<LogicalPlan, LogicalPlanError> {
+        let mut plan: LogicalPlan = LogicalDummy.into();
+
+        if let Some(table_ref) = stmt.from_list.get(0) {
+            plan = LogicalGet {
+                table_ref_id: table_ref.table_ref_id,
+                column_ids: table_ref.column_ids.clone(),
+            }
+            .into();
+        }
+        if !stmt.select_list.is_empty() {
+            plan = LogicalProjection {
+                exprs: stmt.select_list,
+                child: plan.into(),
+            }
+            .into();
+        }
+        Ok(plan)
+    }
+}
+
+impl Explain for LogicalDummy {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Dummy:")
+    }
+}
+
+impl Explain for LogicalGet {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "Get: table: {:?}, columns: {:?}",
+            self.table_ref_id, self.column_ids
+        )
+    }
+}
+
+impl Explain for LogicalProjection {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Projection: exprs: {:?}", self.exprs)?;
+        self.child.explain(level + 1, f)
+    }
+}
diff --git a/code/03-01/src/parser.rs b/code/03-01/src/parser.rs
deleted file mode 120000
index 41306c1..0000000
--- a/code/03-01/src/parser.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/parser.rs
\ No newline at end of file
diff --git a/code/03-01/src/parser.rs b/code/03-01/src/parser.rs
new file mode 100644
index 0000000..4c3b971
--- /dev/null
+++ b/code/03-01/src/parser.rs
@@ -0,0 +1,15 @@
+//! Parse the SQL string into an Abstract Syntax Tree (AST).
+//!
+//! The parser module directly uses the [`sqlparser`] crate
+//! and re-exports its AST types.
+
+pub use sqlparser::ast::*;
+use sqlparser::dialect::PostgreSqlDialect;
+use sqlparser::parser::Parser;
+pub use sqlparser::parser::ParserError;
+
+/// Parse the SQL string into a list of ASTs.
+pub fn parse(sql: &str) -> Result<Vec<Statement>, ParserError> {
+    let dialect = PostgreSqlDialect {};
+    Parser::parse_sql(&dialect, sql)
+}
diff --git a/code/03-01/src/physical_planner b/code/03-01/src/physical_planner
deleted file mode 120000
index 844f2c0..0000000
--- a/code/03-01/src/physical_planner
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/physical_planner
\ No newline at end of file
diff --git a/code/03-01/src/physical_planner/create.rs b/code/03-01/src/physical_planner/create.rs
new file mode 100644
index 0000000..b45ef14
--- /dev/null
+++ b/code/03-01/src/physical_planner/create.rs
@@ -0,0 +1,41 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::catalog::{ColumnDesc, SchemaId};
+use crate::logical_planner::LogicalCreateTable;
+
+/// The physical plan of `CREATE TABLE`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalCreateTable {
+    pub schema_id: SchemaId,
+    pub table_name: String,
+    pub columns: Vec<(String, ColumnDesc)>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_create_table(
+        &self,
+        plan: &LogicalCreateTable,
+    ) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalCreateTable {
+            schema_id: plan.schema_id,
+            table_name: plan.table_name.clone(),
+            columns: plan.columns.clone(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalCreateTable {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "CreateTable: name: {}, columns: [{}]",
+            self.table_name,
+            self.columns
+                .iter()
+                .map(|(name, col)| format!("{}: {:?}", name, col.datatype()))
+                .join(", ")
+        )
+    }
+}
diff --git a/code/03-01/src/physical_planner/dummy.rs b/code/03-01/src/physical_planner/dummy.rs
new file mode 100644
index 0000000..5a52db2
--- /dev/null
+++ b/code/03-01/src/physical_planner/dummy.rs
@@ -0,0 +1,17 @@
+use super::*;
+use crate::logical_planner::LogicalDummy;
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalDummy;
+
+impl PhysicalPlanner {
+    pub fn plan_dummy(&self, _plan: &LogicalDummy) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalDummy.into())
+    }
+}
+
+impl Explain for PhysicalDummy {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Dummy:")
+    }
+}
diff --git a/code/03-01/src/physical_planner/explain.rs b/code/03-01/src/physical_planner/explain.rs
new file mode 100644
index 0000000..39c5388
--- /dev/null
+++ b/code/03-01/src/physical_planner/explain.rs
@@ -0,0 +1,23 @@
+use super::*;
+use crate::logical_planner::LogicalExplain;
+
+/// The physical plan of `EXPLAIN`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalExplain {
+    pub child: Box<PhysicalPlan>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_explain(&self, plan: &LogicalExplain) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalExplain {
+            child: self.plan(&plan.child)?.into(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalExplain {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Huh, explain myself?")
+    }
+}
diff --git a/code/03-01/src/physical_planner/insert.rs b/code/03-01/src/physical_planner/insert.rs
new file mode 100644
index 0000000..91a2463
--- /dev/null
+++ b/code/03-01/src/physical_planner/insert.rs
@@ -0,0 +1,59 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::binder::BoundExpr;
+use crate::catalog::{ColumnId, TableRefId};
+use crate::logical_planner::{LogicalInsert, LogicalValues};
+use crate::types::DataType;
+
+/// The physical plan of `INSERT`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalInsert {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub child: Box<PhysicalPlan>,
+}
+
+/// The physical plan of `VALUES`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalValues {
+    pub column_types: Vec<DataType>,
+    pub values: Vec<Vec<BoundExpr>>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_insert(&self, plan: &LogicalInsert) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalInsert {
+            table_ref_id: plan.table_ref_id,
+            column_ids: plan.column_ids.clone(),
+            child: self.plan(&plan.child)?.into(),
+        }
+        .into())
+    }
+
+    pub fn plan_values(&self, plan: &LogicalValues) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalValues {
+            column_types: plan.column_types.clone(),
+            values: plan.values.clone(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalInsert {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "Insert: table {}, columns [{}]",
+            self.table_ref_id.table_id,
+            self.column_ids.iter().map(ToString::to_string).join(", ")
+        )?;
+        self.child.explain(level + 1, f)
+    }
+}
+
+impl Explain for PhysicalValues {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Values: {} rows", self.values.len())
+    }
+}
diff --git a/code/03-01/src/physical_planner/mod.rs b/code/03-01/src/physical_planner/mod.rs
new file mode 100644
index 0000000..ee3535f
--- /dev/null
+++ b/code/03-01/src/physical_planner/mod.rs
@@ -0,0 +1,60 @@
+use enum_dispatch::enum_dispatch;
+
+use crate::logical_planner::{Explain, LogicalPlan};
+
+mod create;
+mod dummy;
+mod explain;
+mod insert;
+mod projection;
+mod seq_scan;
+
+pub use self::create::*;
+pub use self::dummy::*;
+pub use self::explain::*;
+pub use self::insert::*;
+pub use self::projection::*;
+pub use self::seq_scan::*;
+
+/// The physical plan.
+#[enum_dispatch(Explain)]
+#[derive(Debug, PartialEq, Clone)]
+pub enum PhysicalPlan {
+    PhysicalCreateTable,
+    PhysicalInsert,
+    PhysicalValues,
+    PhysicalExplain,
+    PhysicalDummy,
+    PhysicalSeqScan,
+    PhysicalProjection,
+}
+
+impl std::fmt::Display for PhysicalPlan {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.explain(0, f)
+    }
+}
+
+/// Physical planner transforms the logical plan tree into a physical plan tree.
+#[derive(Default)]
+pub struct PhysicalPlanner;
+
+/// The error type of physical planner.
+#[derive(thiserror::Error, Debug, PartialEq)]
+pub enum PhysicalPlanError {}
+
+impl PhysicalPlanner {
+    /// Generate [`PhysicalPlan`] from a [`LogicalPlan`].
+    pub fn plan(&self, plan: &LogicalPlan) -> Result<PhysicalPlan, PhysicalPlanError> {
+        use LogicalPlan::*;
+        match plan {
+            LogicalCreateTable(plan) => self.plan_create_table(plan),
+            LogicalInsert(plan) => self.plan_insert(plan),
+            LogicalValues(plan) => self.plan_values(plan),
+            LogicalExplain(plan) => self.plan_explain(plan),
+            LogicalDummy(plan) => self.plan_dummy(plan),
+            LogicalGet(plan) => self.plan_get(plan),
+            LogicalProjection(plan) => self.plan_projection(plan),
+        }
+    }
+}
diff --git a/code/03-01/src/physical_planner/projection.rs b/code/03-01/src/physical_planner/projection.rs
new file mode 100644
index 0000000..4f53a52
--- /dev/null
+++ b/code/03-01/src/physical_planner/projection.rs
@@ -0,0 +1,30 @@
+use super::*;
+use crate::binder::BoundExpr;
+use crate::logical_planner::LogicalProjection;
+
+/// The physical plan of project operation.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalProjection {
+    pub exprs: Vec<BoundExpr>,
+    pub child: Box<PhysicalPlan>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_projection(
+        &self,
+        plan: &LogicalProjection,
+    ) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalProjection {
+            exprs: plan.exprs.clone(),
+            child: self.plan(&plan.child)?.into(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalProjection {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Projection: exprs: {:?}", self.exprs)?;
+        self.child.explain(level + 1, f)
+    }
+}
diff --git a/code/03-01/src/physical_planner/seq_scan.rs b/code/03-01/src/physical_planner/seq_scan.rs
new file mode 100644
index 0000000..09800ef
--- /dev/null
+++ b/code/03-01/src/physical_planner/seq_scan.rs
@@ -0,0 +1,30 @@
+use super::*;
+use crate::catalog::{ColumnId, TableRefId};
+use crate::logical_planner::LogicalGet;
+
+/// The physical plan of sequential scan operation.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalSeqScan {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_get(&self, plan: &LogicalGet) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalSeqScan {
+            table_ref_id: plan.table_ref_id,
+            column_ids: plan.column_ids.clone(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalSeqScan {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "SeqScan: table #{}, columns: {:?}",
+            self.table_ref_id.table_id, self.column_ids,
+        )
+    }
+}
diff --git a/code/03-01/src/storage/mod.rs b/code/03-01/src/storage/mod.rs
index 7e380c1..aa649ad 100644
--- a/code/03-01/src/storage/mod.rs
+++ b/code/03-01/src/storage/mod.rs
@@ -1,35 +1,54 @@
-//! Persistent storage on disk.
-//!
-//! RisingLight's in-memory representation of data is very simple. Currently,
-//! it is simple a vector of `DataChunk`. Upon insertion, users' data are
-//! simply appended to the end of the vector.
-
-mod rowset;
-mod table_transaction;
+//! On-disk storage
 
 use std::collections::HashMap;
-use std::sync::{Arc, Mutex, RwLock};
+use std::path::PathBuf;
+use std::sync::{Arc, RwLock};
+
+use anyhow::anyhow;
+use bytes::{Buf, BufMut};
 
-use self::table_transaction::TableTransaction;
-use crate::array::DataChunk;
-use crate::catalog::TableRefId;
+use crate::array::{Array, ArrayBuilder, ArrayImpl, DataChunk, I32Array, I32ArrayBuilder};
+use crate::catalog::{ColumnDesc, TableRefId};
 
 /// The error type of storage operations.
 #[derive(thiserror::Error, Debug)]
-pub enum StorageError {
-    #[error("table not found: {0:?}")]
-    NotFound(TableRefId),
-}
+#[error("{0:?}")]
+pub struct StorageError(#[from] anyhow::Error);
 
 /// A specialized `Result` type for storage operations.
 pub type StorageResult<T> = std::result::Result<T, StorageError>;
 
 pub type StorageRef = Arc<DiskStorage>;
-pub type DiskTableRef = Arc<DiskTable>;
+pub type StorageTableRef = Arc<DiskTable>;
 
-/// Persistent storage on disk.
+/// On-disk storage.
 pub struct DiskStorage {
-    tables: Mutex<HashMap<TableRefId, DiskTableRef>>,
+    /// All tables in the current storage engine.
+    tables: RwLock<HashMap<TableRefId, StorageTableRef>>,
+
+    /// The storage options.
+    options: Arc<StorageOptions>,
+}
+
+pub struct StorageOptions {
+    /// The directory of the storage
+    base_path: PathBuf,
+}
+
+pub fn err(error: impl Into<anyhow::Error>) -> StorageError {
+    StorageError(error.into())
+}
+
+/// An on-disk table.
+pub struct DiskTable {
+    /// Id of the table.
+    id: TableRefId,
+
+    /// Columns of the current table.
+    column_descs: Arc<[ColumnDesc]>,
+
+    /// The storage options.
+    options: Arc<StorageOptions>,
 }
 
 impl Default for DiskStorage {
@@ -39,80 +58,96 @@
 }
 
 impl DiskStorage {
-    /// Create a new persistent storage on disk.
+    /// Create a new on-disk storage.
     pub fn new() -> Self {
         DiskStorage {
-            tables: Mutex::new(HashMap::new()),
+            tables: RwLock::new(HashMap::new()),
+            options: Arc::new(StorageOptions {
+                base_path: "risinglight.db".into(),
+            }),
         }
     }
 
     /// Add a table.
-    pub fn add_table(&self, id: TableRefId) -> StorageResult<()> {
-        let table = Arc::new(DiskTable::new(id));
-        self.tables.lock().unwrap().insert(id, table);
+    pub fn add_table(&self, id: TableRefId, column_descs: &[ColumnDesc]) -> StorageResult<()> {
+        let mut tables = self.tables.write().unwrap();
+        let table = DiskTable {
+            id,
+            options: self.options.clone(),
+            column_descs: column_descs.into(),
+        };
+        let res = tables.insert(id, table.into());
+        if res.is_some() {
+            return Err(anyhow!("table already exists: {:?}", id).into());
+        }
         Ok(())
     }
 
     /// Get a table.
-    pub fn get_table(&self, id: TableRefId) -> StorageResult<DiskTableRef> {
-        self.tables
-            .lock()
-            .unwrap()
+    pub fn get_table(&self, id: TableRefId) -> StorageResult<StorageTableRef> {
+        let tables = self.tables.read().unwrap();
+        tables
             .get(&id)
+            .ok_or_else(|| anyhow!("table not found: {:?}", id).into())
            .cloned()
-            .ok_or(StorageError::NotFound(id))
     }
 }
 
-/// A table in in-memory engine.
-pub struct DiskTable {
-    #[allow(dead_code)]
-    id: TableRefId,
-    inner: RwLock<DiskTableInner>,
-}
-
-#[derive(Default)]
-struct DiskTableInner {
-    chunks: Vec<DataChunk>,
-}
-
-impl DiskTable {
-    fn new(id: TableRefId) -> Self {
-        Self {
-            id,
-            inner: RwLock::new(DiskTableInner::default()),
+/// Encode an `I32Array` into a `Vec<u8>`.
+fn encode_int32_column(a: &I32Array) -> StorageResult<Vec<u8>> {
+    let mut buffer = Vec::with_capacity(a.len() * 4);
+    for item in a.iter() {
+        if let Some(item) = item {
+            buffer.put_i32_le(*item);
+        } else {
+            return Err(anyhow!("nullable encoding not supported!").into());
         }
     }
+    Ok(buffer)
+}
 
-    #[allow(dead_code)]
-    async fn write(self: &Arc<Self>) -> StorageResult<TableTransaction> {
-        Ok(TableTransaction::start(self.clone(), false, false).await?)
+fn decode_int32_column(mut data: &[u8]) -> StorageResult<I32Array> {
+    let mut builder = I32ArrayBuilder::with_capacity(data.len() / 4);
+    while data.has_remaining() {
+        builder.push(Some(&data.get_i32_le()));
     }
+    Ok(builder.finish())
+}
 
-    #[allow(dead_code)]
-    async fn read(self: &Arc<Self>) -> StorageResult<TableTransaction> {
-        Ok(TableTransaction::start(self.clone(), true, false).await?)
+impl DiskTable {
+    fn table_path(&self) -> PathBuf {
+        self.options.base_path.join(self.id.table_id.to_string())
     }
 
-    #[allow(dead_code)]
-    async fn update(self: &Arc<Self>) -> StorageResult<TableTransaction> {
-        Ok(TableTransaction::start(self.clone(), false, true).await?)
+    fn column_path(&self, column_id: usize) -> PathBuf {
+        self.table_path().join(format!("{}.col", column_id))
     }
 
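
A quick roundtrip through the two codecs above (the values are arbitrary): encoding is four little-endian bytes per `i32`, and decoding rebuilds the array:

    let array: I32Array = [1, 2, 3].into_iter().collect();
    let bytes = encode_int32_column(&array).unwrap();
    assert_eq!(bytes.len(), 12); // 3 values * 4 bytes each
    let decoded = decode_int32_column(&bytes).unwrap();
    assert_eq!(decoded, array);
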
     /// Append a chunk to the table.
-    ///
-    /// This interface will be deprecated soon in this tutorial.
-    pub fn append(&self, chunk: DataChunk) -> StorageResult<()> {
-        let mut inner = self.inner.write().unwrap();
-        inner.chunks.push(chunk);
+    pub async fn append(&self, chunk: DataChunk) -> StorageResult<()> {
+        for (idx, column) in chunk.arrays().iter().enumerate() {
+            if let ArrayImpl::Int32(column) = column {
+                let column_path = self.column_path(idx);
+                let data = encode_int32_column(column)?;
+                tokio::fs::create_dir_all(column_path.parent().unwrap())
+                    .await
+                    .map_err(err)?;
+                tokio::fs::write(column_path, data).await.map_err(err)?;
+            } else {
+                return Err(anyhow!("unsupported column type").into());
+            }
+        }
         Ok(())
     }
 
     /// Get all chunks of the table.
-    ///
-    /// This interface will be deprecated soon in this tutorial.
-    pub fn all_chunks(&self) -> StorageResult<Vec<DataChunk>> {
-        let inner = self.inner.read().unwrap();
-        Ok(inner.chunks.clone())
+    pub async fn all_chunks(&self) -> StorageResult<Vec<DataChunk>> {
+        let mut columns = vec![];
+        for (idx, _) in self.column_descs.iter().enumerate() {
+            let column_path = self.column_path(idx);
+            let data = tokio::fs::read(column_path).await.map_err(err)?;
+            columns.push(decode_int32_column(&data)?);
+        }
+        Ok(vec![columns.into_iter().map(ArrayImpl::Int32).collect()])
     }
 }
diff --git a/code/03-01/src/storage/rowset/mem_rowset.rs b/code/03-01/src/storage/rowset/mem_rowset.rs
deleted file mode 100644
index 3b904a8..0000000
--- a/code/03-01/src/storage/rowset/mem_rowset.rs
+++ /dev/null
@@ -1,39 +0,0 @@
-#![allow(dead_code)]
-
-use std::sync::Arc;
-
-use itertools::Itertools;
-
-use crate::array::{ArrayBuilderImpl, DataChunk};
-use crate::catalog::ColumnCatalog;
-use crate::storage::StorageResult;
-
-pub struct MemRowset {
-    builders: Vec<ArrayBuilderImpl>,
-}
-
-impl MemRowset {
-    pub fn new(columns: Arc<[ColumnCatalog]>) -> Self {
-        Self {
-            builders: columns
-                .iter()
-                .map(|column| ArrayBuilderImpl::with_capacity(0, column.desc().datatype()))
-                .collect_vec(),
-        }
-    }
-
-    fn append(&mut self, columns: DataChunk) -> StorageResult<()> {
-        for (idx, column) in columns.arrays().iter().enumerate() {
-            self.builders[idx].append(column);
-        }
-        Ok(())
-    }
-
-    fn flush(self) -> StorageResult<DataChunk> {
-        Ok(self
-            .builders
-            .into_iter()
-            .map(|builder| builder.finish())
-            .collect::<DataChunk>())
-    }
-}
diff --git a/code/03-01/src/storage/rowset/mod.rs b/code/03-01/src/storage/rowset/mod.rs
deleted file mode 100644
index 1675535..0000000
--- a/code/03-01/src/storage/rowset/mod.rs
+++ /dev/null
@@ -1,3 +0,0 @@
-mod mem_rowset;
-
-pub use mem_rowset::*;
diff --git a/code/03-01/src/storage/table_transaction.rs b/code/03-01/src/storage/table_transaction.rs
deleted file mode 100644
index 749bbbf..0000000
--- a/code/03-01/src/storage/table_transaction.rs
+++ /dev/null
@@ -1,56 +0,0 @@
-#![allow(dead_code)]
-
-use super::rowset::MemRowset;
-use super::{DiskTableRef, StorageResult};
-use crate::array::DataChunk;
-
-/// [`TableTransaction`] records the state of a single table. All operations (insert, update,
-/// delete) should go through [`TableTransaction`].
-pub struct TableTransaction {
-    mem_rowset: Option<MemRowset>,
-    read_only: bool,
-    update: bool,
-    table: DiskTableRef,
-}
-
-impl TableTransaction {
-    /// Start a [`WriteBatch`]
-    pub async fn start(table: DiskTableRef, read_only: bool, update: bool) -> StorageResult<Self> {
-        Ok(Self {
-            mem_rowset: None,
-            table,
-            update,
-            read_only,
-        })
-    }
-
-    /// Flush [`WriteBatch`] to some on-disk RowSets.
-    pub async fn flush(self) {
-        todo!()
-    }
-
-    /// Add a [`DataChunk`] to the mem rowset
-    pub fn append(&self, _chunk: DataChunk) -> StorageResult<()> {
-        todo!()
-    }
-
-    /// Delete a row from the table.
-    async fn delete(&mut self, _row_id: u64) -> StorageResult<()> {
-        todo!()
-    }
-
-    /// Commit all changes in this transaction.
-    pub fn commit(self) -> StorageResult<()> {
-        todo!()
-    }
-
-    /// Abort all changes in this transaction.
-    pub fn abort(self) -> StorageResult<()> {
-        todo!()
-    }
-
-    /// Create an iterator on this table.
-    pub async fn scan(&self) {
-        todo!()
-    }
-}
diff --git a/code/03-01/src/test.rs b/code/03-01/src/test.rs
deleted file mode 120000
index 05edac9..0000000
--- a/code/03-01/src/test.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/test.rs
\ No newline at end of file
diff --git a/code/03-01/src/test.rs b/code/03-01/src/test.rs
new file mode 100644
index 0000000..b57647a
--- /dev/null
+++ b/code/03-01/src/test.rs
@@ -0,0 +1,55 @@
+use std::path::Path;
+
+use test_case::test_case;
+
+use crate::array::DataChunk;
+use crate::types::DataValue;
+use crate::{Database, Error};
+
+#[test_case("03-01.slt")]
+fn test(name: &str) {
+    init_logger();
+    let script = std::fs::read_to_string(Path::new("../sql").join(name)).unwrap();
+    let mut tester = sqllogictest::Runner::new(Database::new());
+    if let Err(err) = tester.run_script(&script) {
+        panic!("{}", err);
+    }
+}
+
+impl sqllogictest::DB for Database {
+    type Error = Error;
+    fn run(&self, sql: &str) -> Result<String, Self::Error> {
+        let chunks = self.run(sql)?;
+        let strings = chunks.iter().map(datachunk_to_string).collect();
+        Ok(strings)
+    }
+}
+
+fn init_logger() {
+    use std::sync::Once;
+    static INIT: Once = Once::new();
+    INIT.call_once(env_logger::init);
+}
+
+fn datachunk_to_string(chunk: &DataChunk) -> String {
+    use std::fmt::Write;
+    let mut string = String::new();
+    for row in 0..chunk.cardinality() {
+        for (col, array) in chunk.arrays().iter().enumerate() {
+            if col != 0 {
+                write!(string, " ").unwrap();
+            }
+            match array.get(row) {
+                DataValue::Null => write!(string, "NULL"),
+                DataValue::Bool(v) => write!(string, "{}", v),
+                DataValue::Int32(v) => write!(string, "{}", v),
+                DataValue::Float64(v) => write!(string, "{}", v),
+                DataValue::String(s) if s.is_empty() => write!(string, "(empty)"),
+                DataValue::String(s) => write!(string, "{}", s),
+            }
+            .unwrap();
+        }
+        writeln!(string).unwrap();
+    }
+    string
+}
diff --git a/code/03-01/src/types.rs b/code/03-01/src/types.rs
deleted file mode 120000
index 88bb446..0000000
--- a/code/03-01/src/types.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/types.rs
\ No newline at end of file
diff --git a/code/03-01/src/types.rs b/code/03-01/src/types.rs
new file mode 100644
index 0000000..509d59a
--- /dev/null
+++ b/code/03-01/src/types.rs
@@ -0,0 +1,89 @@
+//! Definition of data types.
+
+pub use sqlparser::ast::DataType as DataTypeKind;
+
+/// Data type with nullability.
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct DataType {
+    kind: DataTypeKind,
+    nullable: bool,
+}
+
+impl DataType {
+    pub const fn new(kind: DataTypeKind, nullable: bool) -> Self {
+        DataType { kind, nullable }
+    }
+
+    pub fn is_nullable(&self) -> bool {
+        self.nullable
+    }
+
+    pub fn kind(&self) -> DataTypeKind {
+        self.kind.clone()
+    }
+}
+
+impl std::fmt::Debug for DataType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.kind)?;
+        if self.nullable {
+            write!(f, " (null)")?;
+        }
+        Ok(())
+    }
+}
+
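
`DataTypeKind` is re-exported straight from `sqlparser`; the extension trait introduced just below wraps it with nullability. A tiny usage sketch:

    use crate::types::{DataTypeExt, DataTypeKind};

    let ty = DataTypeKind::Int(None).nullable();
    assert!(ty.is_nullable());
    assert!(!DataTypeKind::Boolean.not_null().is_nullable());
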
+/// The extension methods for [`DataTypeKind`].
+pub trait DataTypeExt {
+    /// Create a nullable [`DataType`] from self.
+    fn nullable(self) -> DataType;
+    /// Create a non-nullable [`DataType`] from self.
+    fn not_null(self) -> DataType;
+}
+
+impl DataTypeExt for DataTypeKind {
+    fn nullable(self) -> DataType {
+        DataType::new(self, true)
+    }
+
+    fn not_null(self) -> DataType {
+        DataType::new(self, false)
+    }
+}
+
+/// Primitive SQL value.
+#[derive(Debug, Clone, PartialEq, PartialOrd)]
+pub enum DataValue {
+    // NOTE: Null comes first.
+    // => NULL is less than any non-NULL values
+    Null,
+    Bool(bool),
+    Int32(i32),
+    Float64(f64),
+    String(String),
+}
+
+impl ToString for DataValue {
+    fn to_string(&self) -> String {
+        match self {
+            Self::Null => String::from("NULL"),
+            Self::Bool(v) => v.to_string(),
+            Self::Int32(v) => v.to_string(),
+            Self::Float64(v) => v.to_string(),
+            Self::String(v) => v.to_string(),
+        }
+    }
+}
+
+impl DataValue {
+    /// Get the type of value. `None` means NULL.
+    pub fn datatype(&self) -> Option<DataType> {
+        match self {
+            Self::Bool(_) => Some(DataTypeKind::Boolean.not_null()),
+            Self::Int32(_) => Some(DataTypeKind::Int(None).not_null()),
+            Self::Float64(_) => Some(DataTypeKind::Double.not_null()),
+            Self::String(_) => Some(DataTypeKind::Varchar(None).not_null()),
+            Self::Null => None,
+        }
+    }
+}
diff --git a/code/03-02/Cargo.toml b/code/03-02/Cargo.toml
index d5f856f..fa1e6f2 100644
--- a/code/03-02/Cargo.toml
+++ b/code/03-02/Cargo.toml
@@ -17,8 +17,9 @@ log = "0.4"
 prettytable-rs = { version = "0.8", default-features = false }
 rustyline = "9"
 sqlparser = "0.13"
+tempfile = "3"
 thiserror = "1"
-tokio = { version = "1", features = ["rt", "rt-multi-thread", "sync", "macros"] }
+tokio = { version = "1", features = ["rt", "rt-multi-thread", "sync", "macros", "fs"] }
 tokio-stream = "0.1"
 
 [dev-dependencies]
diff --git a/code/03-02/src/array b/code/03-02/src/array
deleted file mode 120000
index a30992f..0000000
--- a/code/03-02/src/array
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/array
\ No newline at end of file
diff --git a/code/03-02/src/array/data_chunk.rs b/code/03-02/src/array/data_chunk.rs
new file mode 100644
index 0000000..40a2df6
--- /dev/null
+++ b/code/03-02/src/array/data_chunk.rs
@@ -0,0 +1,85 @@
+use std::fmt;
+use std::sync::Arc;
+
+use itertools::Itertools;
+
+use super::*;
+
+/// A collection of arrays.
+///
+/// A chunk is a horizontal subset of a query result.
+#[derive(PartialEq, Clone)]
+pub struct DataChunk {
+    arrays: Arc<[ArrayImpl]>,
+}
+
+/// Create [`DataChunk`] from a list of column arrays.
+impl FromIterator<ArrayImpl> for DataChunk {
+    fn from_iter<I: IntoIterator<Item = ArrayImpl>>(iter: I) -> Self {
+        let arrays = iter.into_iter().collect::<Arc<[ArrayImpl]>>();
+        assert!(!arrays.is_empty());
+        let cardinality = arrays[0].len();
+        assert!(
+            arrays.iter().map(|a| a.len()).all(|l| l == cardinality),
+            "all arrays must have the same length"
+        );
+        DataChunk { arrays }
+    }
+}
+
+impl DataChunk {
+    /// Return a [`DataChunk`] with 1 `item` in 1 array.
+    pub fn single(item: i32) -> Self {
+        DataChunk {
+            arrays: [ArrayImpl::Int32([item].into_iter().collect())]
+                .into_iter()
+                .collect(),
+        }
+    }
+
+    /// Return the number of rows in the chunk.
+    pub fn cardinality(&self) -> usize {
+        self.arrays[0].len()
+    }
+
+    /// Get all arrays.
+    pub fn arrays(&self) -> &[ArrayImpl] {
+        &self.arrays
+    }
+
+    /// Concatenate multiple chunks into one.
+    pub fn concat(chunks: &[DataChunk]) -> Self {
+        assert!(!chunks.is_empty(), "must concat at least one chunk");
+        let mut builders = chunks[0]
+            .arrays()
+            .iter()
+            .map(ArrayBuilderImpl::from_type_of_array)
+            .collect_vec();
+        for chunk in chunks {
+            for (array, builder) in chunk.arrays.iter().zip(builders.iter_mut()) {
+                builder.append(array);
+            }
+        }
+        builders.into_iter().map(|b| b.finish()).collect()
+    }
+}
+
+/// Print the chunk as a pretty table.
+impl fmt::Display for DataChunk {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use prettytable::{format, Table};
+        let mut table = Table::new();
+        table.set_format(*format::consts::FORMAT_NO_LINESEP_WITH_TITLE);
+        for i in 0..self.cardinality() {
+            let row = self.arrays.iter().map(|a| a.get(i).to_string()).collect();
+            table.add_row(row);
+        }
+        write!(f, "{}", table)
+    }
+}
+
+impl fmt::Debug for DataChunk {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self)
+    }
+}
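
`concat` rebuilds every column with a builder of the matching type, so chunks produced window-by-window can be merged back into one. A quick sketch:

    let merged = DataChunk::concat(&[DataChunk::single(1), DataChunk::single(2)]);
    assert_eq!(merged.cardinality(), 2); // one Int32 column holding [1, 2]
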
diff --git a/code/03-02/src/array/iter.rs b/code/03-02/src/array/iter.rs
new file mode 100644
index 0000000..8871bda
--- /dev/null
+++ b/code/03-02/src/array/iter.rs
@@ -0,0 +1,37 @@
+use std::iter::Iterator;
+use std::marker::PhantomData;
+
+use super::Array;
+
+/// An iterator over the elements of an [`Array`].
+#[derive(Clone)]
+pub struct ArrayIter<'a, A: Array> {
+    array: &'a A,
+    index: usize,
+    _phantom: PhantomData<&'a usize>,
+}
+
+impl<'a, A: Array> ArrayIter<'a, A> {
+    /// Create an iterator over array.
+    pub fn new(array: &'a A) -> Self {
+        Self {
+            array,
+            index: 0,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<'a, A: Array> Iterator for ArrayIter<'a, A> {
+    type Item = Option<&'a A::Item>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.index >= self.array.len() {
+            None
+        } else {
+            let item = self.array.get(self.index);
+            self.index += 1;
+            Some(item)
+        }
+    }
+}
diff --git a/code/03-02/src/array/mod.rs b/code/03-02/src/array/mod.rs
new file mode 100644
index 0000000..3c29a6b
--- /dev/null
+++ b/code/03-02/src/array/mod.rs
@@ -0,0 +1,244 @@
+//! In-memory representations of column values.
+
+use std::convert::TryFrom;
+
+use crate::types::{DataType, DataTypeKind, DataValue};
+
+mod data_chunk;
+mod iter;
+mod primitive_array;
+mod utf8_array;
+
+pub use self::data_chunk::*;
+pub use self::iter::ArrayIter;
+pub use self::primitive_array::*;
+pub use self::utf8_array::*;
+
+/// A trait over all array builders.
+///
+/// [`ArrayBuilder`] is a trait over all builders. You could build an array with
+/// `push` with the help of [`ArrayBuilder`] trait. The `push` function always
+/// accepts reference to an element. e.g. for [`PrimitiveArray`],
+/// you must do `builder.push(Some(&1))`. For [`Utf8Array`], you must do
+/// `builder.push(Some("xxx"))`. Note that you don't need to construct a `String`.
+///
+/// The associated type `Array` is the type of the corresponding array. It is the
+/// return type of `finish`.
+pub trait ArrayBuilder: Send + Sync + 'static {
+    /// Corresponding `Array` of this builder
+    type Array: Array<Builder = Self>;
+
+    /// Create a new builder with `capacity`.
+    fn with_capacity(capacity: usize) -> Self;
+
+    /// Append a value to builder.
+    fn push(&mut self, value: Option<&<Self::Array as Array>::Item>);
+
+    /// Append an array to builder.
+    fn append(&mut self, other: &Self::Array);
+
+    /// Finish build and return a new array.
+    fn finish(self) -> Self::Array;
+}
+
+/// A trait over all array.
+///
+/// [`Array`] must be built with an [`ArrayBuilder`]. The array trait provides several
+/// unified interface on an array, like `len`, `get` and `iter`.
+///
+/// The `Builder` associated type is the builder for this array.
+/// The `Item` is the item you could retrieve from this array.
+///
+/// For example, [`PrimitiveArray`] could return an `Option<&u32>`, and [`Utf8Array`] will
+/// return an `Option<&str>`.
+pub trait Array: Sized + Send + Sync + 'static {
+    /// Corresponding builder of this array.
+    type Builder: ArrayBuilder<Array = Self>;
+
+    /// Type of element in the array.
+    type Item: ToOwned + ?Sized;
+
+    /// Retrieve a reference to value.
+    fn get(&self, idx: usize) -> Option<&Self::Item>;
+
+    /// Number of items of array.
+    fn len(&self) -> usize;
+
+    /// Get iterator of current array.
+    fn iter(&self) -> ArrayIter<'_, Self> {
+        ArrayIter::new(self)
+    }
+
+    /// Check if the array has a length of 0.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+pub type BoolArray = PrimitiveArray<bool>;
+pub type I32Array = PrimitiveArray<i32>;
+pub type F64Array = PrimitiveArray<f64>;
+
+/// Embeds all types of arrays in `array` module.
+#[derive(Clone, PartialEq)]
+pub enum ArrayImpl {
+    Bool(BoolArray),
+    Int32(I32Array),
+    Float64(F64Array),
+    Utf8(Utf8Array),
+}
+
+pub type BoolArrayBuilder = PrimitiveArrayBuilder<bool>;
+pub type I32ArrayBuilder = PrimitiveArrayBuilder<i32>;
+pub type F64ArrayBuilder = PrimitiveArrayBuilder<f64>;
+
+/// Embeds all types of array builders in `array` module.
+pub enum ArrayBuilderImpl {
+    Bool(BoolArrayBuilder),
+    Int32(I32ArrayBuilder),
+    Float64(F64ArrayBuilder),
+    Utf8(Utf8ArrayBuilder),
+}
+
+/// An error which can be returned when downcasting an [`ArrayImpl`] into a concrete type array.
+#[derive(Debug, Clone)]
+pub struct TypeMismatch;
+
+macro_rules! impl_into {
+    ($x:ty, $y:ident) => {
+        impl From<$x> for ArrayImpl {
+            fn from(array: $x) -> Self {
+                Self::$y(array)
+            }
+        }
+
+        impl TryFrom<ArrayImpl> for $x {
+            type Error = TypeMismatch;
+
+            fn try_from(array: ArrayImpl) -> Result<Self, Self::Error> {
+                match array {
+                    ArrayImpl::$y(array) => Ok(array),
+                    _ => Err(TypeMismatch),
+                }
+            }
+        }
+
+        impl<'a> TryFrom<&'a ArrayImpl> for &'a $x {
+            type Error = TypeMismatch;
+
+            fn try_from(array: &'a ArrayImpl) -> Result<Self, Self::Error> {
+                match array {
+                    ArrayImpl::$y(array) => Ok(array),
+                    _ => Err(TypeMismatch),
+                }
+            }
+        }
+    };
+}
+
+impl_into! { PrimitiveArray<bool>, Bool }
+impl_into! { PrimitiveArray<i32>, Int32 }
+impl_into! { PrimitiveArray<f64>, Float64 }
+impl_into! { Utf8Array, Utf8 }
+
+impl ArrayBuilderImpl {
+    /// Create a new array builder from data type.
+    pub fn with_capacity(capacity: usize, ty: &DataType) -> Self {
+        match ty.kind() {
+            DataTypeKind::Boolean => Self::Bool(BoolArrayBuilder::with_capacity(capacity)),
+            DataTypeKind::Int(_) => Self::Int32(I32ArrayBuilder::with_capacity(capacity)),
+            DataTypeKind::Float(_) | DataTypeKind::Double => {
+                Self::Float64(F64ArrayBuilder::with_capacity(capacity))
+            }
+            DataTypeKind::Char(_) | DataTypeKind::Varchar(_) | DataTypeKind::String => {
+                Self::Utf8(Utf8ArrayBuilder::with_capacity(capacity))
+            }
+            _ => panic!("unsupported data type"),
+        }
+    }
+
+    /// Create a new array builder with the same type of given array.
+    pub fn from_type_of_array(array: &ArrayImpl) -> Self {
+        match array {
+            ArrayImpl::Bool(_) => Self::Bool(BoolArrayBuilder::with_capacity(0)),
+            ArrayImpl::Int32(_) => Self::Int32(I32ArrayBuilder::with_capacity(0)),
+            ArrayImpl::Float64(_) => Self::Float64(F64ArrayBuilder::with_capacity(0)),
+            ArrayImpl::Utf8(_) => Self::Utf8(Utf8ArrayBuilder::with_capacity(0)),
+        }
+    }
+
+    /// Appends an element to the back of array.
+    pub fn push(&mut self, v: &DataValue) {
+        match (self, v) {
+            (Self::Bool(a), DataValue::Bool(v)) => a.push(Some(v)),
+            (Self::Int32(a), DataValue::Int32(v)) => a.push(Some(v)),
+            (Self::Float64(a), DataValue::Float64(v)) => a.push(Some(v)),
+            (Self::Utf8(a), DataValue::String(v)) => a.push(Some(v)),
+            (Self::Bool(a), DataValue::Null) => a.push(None),
+            (Self::Int32(a), DataValue::Null) => a.push(None),
+            (Self::Float64(a), DataValue::Null) => a.push(None),
+            (Self::Utf8(a), DataValue::Null) => a.push(None),
+            _ => panic!("failed to push value: type mismatch"),
+        }
+    }
+
+    /// Appends a [`ArrayImpl`].
+    pub fn append(&mut self, array_impl: &ArrayImpl) {
+        match (self, array_impl) {
+            (Self::Bool(builder), ArrayImpl::Bool(arr)) => builder.append(arr),
+            (Self::Int32(builder), ArrayImpl::Int32(arr)) => builder.append(arr),
+            (Self::Float64(builder), ArrayImpl::Float64(arr)) => builder.append(arr),
+            (Self::Utf8(builder), ArrayImpl::Utf8(arr)) => builder.append(arr),
+            _ => panic!("failed to push value: type mismatch"),
+        }
+    }
+
+    /// Finish build and return a new array.
+    pub fn finish(self) -> ArrayImpl {
+        match self {
+            Self::Bool(a) => ArrayImpl::Bool(a.finish()),
+            Self::Int32(a) => ArrayImpl::Int32(a.finish()),
+            Self::Float64(a) => ArrayImpl::Float64(a.finish()),
+            Self::Utf8(a) => ArrayImpl::Utf8(a.finish()),
+        }
+    }
+}
+
+impl ArrayImpl {
+    /// Get the value at the given index.
+    pub fn get(&self, idx: usize) -> DataValue {
+        match self {
+            Self::Bool(a) => match a.get(idx) {
+                Some(val) => DataValue::Bool(*val),
+                None => DataValue::Null,
+            },
+            Self::Int32(a) => match a.get(idx) {
+                Some(val) => DataValue::Int32(*val),
+                None => DataValue::Null,
+            },
+            Self::Float64(a) => match a.get(idx) {
+                Some(val) => DataValue::Float64(*val),
+                None => DataValue::Null,
+            },
+            Self::Utf8(a) => match a.get(idx) {
+                Some(val) => DataValue::String(val.to_string()),
+                None => DataValue::Null,
+            },
+        }
+    }
+
+    /// Number of items of array.
+    pub fn len(&self) -> usize {
+        match self {
+            Self::Bool(a) => a.len(),
+            Self::Int32(a) => a.len(),
+            Self::Float64(a) => a.len(),
+            Self::Utf8(a) => a.len(),
+        }
+    }
+
+    /// Check if array is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
diff --git a/code/03-02/src/array/primitive_array.rs b/code/03-02/src/array/primitive_array.rs
new file mode 100644
index 0000000..0593655
--- /dev/null
+++ b/code/03-02/src/array/primitive_array.rs
@@ -0,0 +1,107 @@
+use std::fmt::Debug;
+use std::iter::FromIterator;
+
+use bitvec::vec::BitVec;
+
+use super::{Array, ArrayBuilder};
+
+/// A collection of primitive types, such as `i32`, `f32`.
+#[derive(Debug, Clone, PartialEq)]
+pub struct PrimitiveArray<T: Primitive> {
+    valid: BitVec,
+    data: Vec<T>,
+}
+
+/// A trait over primitive types.
+pub trait Primitive:
+    PartialOrd + PartialEq + Debug + Copy + Send + Sync + Sized + Default + 'static
+{
+}
+
+macro_rules! impl_primitive {
+    ($($t:ty),*) => {
+        $(impl Primitive for $t {})*
+    }
+}
+impl_primitive!(u8, u16, u32, u64, usize, i8, i16, i32, i64, isize, f32, f64, bool);
+
+/// Enable `collect()` an array from iterator of `Option<T>`.
+impl<T: Primitive> FromIterator<Option<T>> for PrimitiveArray<T> {
+    fn from_iter<I: IntoIterator<Item = Option<T>>>(iter: I) -> Self {
+        let iter = iter.into_iter();
+        let mut builder = <Self as Array>::Builder::with_capacity(iter.size_hint().0);
+        for e in iter {
+            builder.push(e.as_ref());
+        }
+        builder.finish()
+    }
+}
+
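
With this impl (and the plain-`T` one right below), arrays collect directly from iterators, which the unit tests in this file also exercise:

    let array: PrimitiveArray<i32> = vec![Some(1), None, Some(3)].into_iter().collect();
    assert_eq!(array.len(), 3);
    assert_eq!(array.get(1), None); // the null slot
    assert_eq!(array.get(2), Some(&3));
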
+/// Enable `collect()` an array from iterator of `T`.
+impl<T: Primitive> FromIterator<T> for PrimitiveArray<T> {
+    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
+        iter.into_iter().map(Some).collect()
+    }
+}
+
+impl<T: Primitive> Array for PrimitiveArray<T> {
+    type Item = T;
+    type Builder = PrimitiveArrayBuilder<T>;
+
+    fn get(&self, idx: usize) -> Option<&T> {
+        self.valid[idx].then(|| &self.data[idx])
+    }
+
+    fn len(&self) -> usize {
+        self.valid.len()
+    }
+}
+
+/// A builder that constructs a [`PrimitiveArray`] from `Option<T>`.
+pub struct PrimitiveArrayBuilder<T: Primitive> {
+    valid: BitVec,
+    data: Vec<T>,
+}
+
+impl<T: Primitive> ArrayBuilder for PrimitiveArrayBuilder<T> {
+    type Array = PrimitiveArray<T>;
+
+    fn with_capacity(capacity: usize) -> Self {
+        Self {
+            valid: BitVec::with_capacity(capacity),
+            data: Vec::with_capacity(capacity),
+        }
+    }
+
+    fn push(&mut self, value: Option<&T>) {
+        self.valid.push(value.is_some());
+        self.data.push(value.cloned().unwrap_or_default());
+    }
+
+    fn append(&mut self, other: &PrimitiveArray<T>) {
+        self.valid.extend_from_bitslice(&other.valid);
+        self.data.extend_from_slice(&other.data);
+    }
+
+    fn finish(self) -> PrimitiveArray<T> {
+        PrimitiveArray {
+            valid: self.valid,
+            data: self.data,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_collect() {
+        let iter = (0..1000).map(|x| if x % 2 == 0 { None } else { Some(x) });
+        let array = iter.clone().collect::<PrimitiveArray<i32>>();
+        assert_eq!(
+            array.iter().map(|x| x.cloned()).collect::<Vec<Option<i32>>>(),
+            iter.collect::<Vec<Option<i32>>>()
+        );
+    }
+}
diff --git a/code/03-02/src/array/utf8_array.rs b/code/03-02/src/array/utf8_array.rs
new file mode 100644
index 0000000..199bcdc
--- /dev/null
+++ b/code/03-02/src/array/utf8_array.rs
@@ -0,0 +1,105 @@
+use std::iter::FromIterator;
+
+use bitvec::vec::BitVec;
+
+use super::{Array, ArrayBuilder};
+
+/// A collection of Rust UTF-8 [`String`]s.
+#[derive(Clone, PartialEq)]
+pub struct Utf8Array {
+    offset: Vec<usize>,
+    valid: BitVec,
+    data: Vec<u8>,
+}
+
+impl Array for Utf8Array {
+    type Item = str;
+    type Builder = Utf8ArrayBuilder;
+
+    fn get(&self, idx: usize) -> Option<&str> {
+        if self.valid[idx] {
+            let data_slice = &self.data[self.offset[idx]..self.offset[idx + 1]];
+            Some(unsafe { std::str::from_utf8_unchecked(data_slice) })
+        } else {
+            None
+        }
+    }
+
+    fn len(&self) -> usize {
+        self.valid.len()
+    }
+}
+
+/// A builder that uses `&str` to build an [`Utf8Array`].
+pub struct Utf8ArrayBuilder {
+    offset: Vec<usize>,
+    valid: BitVec,
+    data: Vec<u8>,
+}
+
+impl ArrayBuilder for Utf8ArrayBuilder {
+    type Array = Utf8Array;
+
+    fn with_capacity(capacity: usize) -> Self {
+        let mut offset = Vec::with_capacity(capacity + 1);
+        offset.push(0);
+        Self {
+            offset,
+            data: Vec::with_capacity(capacity),
+            valid: BitVec::with_capacity(capacity),
+        }
+    }
+
+    fn push(&mut self, value: Option<&str>) {
+        self.valid.push(value.is_some());
+        if let Some(x) = value {
+            self.data.extend_from_slice(x.as_bytes());
+        }
+        self.offset.push(self.data.len());
+    }
+
+    fn append(&mut self, other: &Utf8Array) {
+        self.valid.extend_from_bitslice(&other.valid);
+        self.data.extend_from_slice(&other.data);
+        let start = *self.offset.last().unwrap();
+        for other_offset in &other.offset[1..] {
+            self.offset.push(*other_offset + start);
+        }
+    }
+
+    fn finish(self) -> Utf8Array {
+        Utf8Array {
+            valid: self.valid,
+            data: self.data,
+            offset: self.offset,
+        }
+    }
+}
+
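
The `offset` vector always holds one more entry than there are rows, so row `i`'s bytes live at `data[offset[i]..offset[i + 1]]`; a null simply repeats the previous offset. Worked out by hand:

    // Pushing Some("hi"), None, Some("db") produces:
    //   valid  = [true, false, true]
    //   data   = b"hidb"
    //   offset = [0, 2, 2, 4]
    let mut builder = Utf8ArrayBuilder::with_capacity(3);
    builder.push(Some("hi"));
    builder.push(None);
    builder.push(Some("db"));
    let array = builder.finish();
    assert_eq!(array.get(0), Some("hi"));
    assert_eq!(array.get(1), None);
    assert_eq!(array.get(2), Some("db"));
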
+/// Enable `collect()` an array from iterator of `Option<&str>` or `Option<String>`.
+impl<S: AsRef<str>> FromIterator<Option<S>> for Utf8Array {
+    fn from_iter<I: IntoIterator<Item = Option<S>>>(iter: I) -> Self {
+        let iter = iter.into_iter();
+        let mut builder = <Self as Array>::Builder::with_capacity(iter.size_hint().0);
+        for e in iter {
+            if let Some(s) = e {
+                builder.push(Some(s.as_ref()));
+            } else {
+                builder.push(None);
+            }
+        }
+        builder.finish()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_collect() {
+        let iter = [None, Some("1"), None, Some("3")].into_iter();
+        let array = iter.clone().collect::<Utf8Array>();
+        assert_eq!(array.iter().collect::<Vec<_>>(), iter.collect::<Vec<_>>());
+    }
+}
diff --git a/code/03-02/src/binder b/code/03-02/src/binder
deleted file mode 120000
index 936cb6b..0000000
--- a/code/03-02/src/binder
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/binder/
\ No newline at end of file
diff --git a/code/03-02/src/binder/expression/column_ref.rs b/code/03-02/src/binder/expression/column_ref.rs
new file mode 100644
index 0000000..9b8fd7c
--- /dev/null
+++ b/code/03-02/src/binder/expression/column_ref.rs
@@ -0,0 +1,72 @@
+use super::*;
+
+/// A bound column reference expression.
+#[derive(PartialEq, Clone)]
+pub struct BoundColumnRef {
+    pub column_ref_id: ColumnRefId,
+    pub return_type: DataType,
+}
+
+impl std::fmt::Debug for BoundColumnRef {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.column_ref_id)
+    }
+}
+
+impl Binder {
+    /// Expand wildcard into a list of column references.
+    pub fn bind_all_column_refs(&mut self) -> Result<Vec<BoundExpr>, BindError> {
+        let mut exprs = vec![];
+        for &table_ref_id in self.tables.values() {
+            let table = self.catalog.get_table(table_ref_id).unwrap();
+            for (col_id, col) in table.all_columns() {
+                let expr = BoundExpr::ColumnRef(BoundColumnRef {
+                    column_ref_id: ColumnRefId::from_table(table_ref_id, col_id),
+                    return_type: col.datatype(),
+                });
+                exprs.push(expr);
+            }
+        }
+        Ok(exprs)
+    }
+
+    pub fn bind_column_ref(&mut self, idents: &[Ident]) -> Result<BoundExpr, BindError> {
+        let (_schema_name, table_name, column_name) = match idents {
+            [column] => (None, None, &column.value),
+            [table, column] => (None, Some(&table.value), &column.value),
+            [schema, table, column] => (Some(&schema.value), Some(&table.value), &column.value),
+            _ => return Err(BindError::InvalidTableName(idents.into())),
+        };
+        if let Some(name) = table_name {
+            let table_ref_id = *self
+                .tables
+                .get(name)
+                .ok_or_else(|| BindError::TableNotFound(name.clone()))?;
+            let table = self.catalog.get_table(table_ref_id).unwrap();
+            let col = table
+                .get_column_by_name(column_name)
+                .ok_or_else(|| BindError::ColumnNotFound(column_name.clone()))?;
+            Ok(BoundExpr::ColumnRef(BoundColumnRef {
+                column_ref_id: ColumnRefId::from_table(table_ref_id, col.id()),
+                return_type: col.datatype(),
+            }))
+        } else {
+            let mut column_ref = None;
+            for &table_ref_id in self.tables.values() {
+                let table = self.catalog.get_table(table_ref_id).unwrap();
+                if let Some(col) = table.get_column_by_name(column_name) {
+                    if column_ref.is_some() {
+                        return Err(BindError::AmbiguousColumnName(column_name.into()));
+                    }
+                    column_ref = Some(BoundColumnRef {
+                        column_ref_id: ColumnRefId::from_table(table_ref_id, col.id()),
+                        return_type: col.datatype(),
+                    });
+                }
+            }
+            Ok(BoundExpr::ColumnRef(column_ref.ok_or_else(|| {
+                BindError::ColumnNotFound(column_name.clone())
+            })?))
+        }
+    }
+}
diff --git a/code/03-02/src/binder/expression/mod.rs b/code/03-02/src/binder/expression/mod.rs
new file mode 100644
index 0000000..ed89891
--- /dev/null
+++ b/code/03-02/src/binder/expression/mod.rs
@@ -0,0 +1,59 @@
+use super::*;
diff --git a/code/03-02/src/binder/mod.rs b/code/03-02/src/binder/mod.rs
new file mode 100644
index 0000000..98d0bdc
--- /dev/null
+++ b/code/03-02/src/binder/mod.rs
@@ -0,0 +1,96 @@
+//! Resolve all name references in expressions.
+
+use std::collections::HashMap;
+use std::vec::Vec;
+
+use crate::catalog::*;
+use crate::parser::{Ident, ObjectName, Statement};
+
+mod expression;
+mod statement;
+mod table_ref;
+
+pub use self::expression::*;
+pub use self::statement::*;
+pub use self::table_ref::*;
+
+/// A bound SQL statement generated by the [`Binder`].
+#[derive(Debug, PartialEq, Clone)]
+pub enum BoundStatement {
+    CreateTable(BoundCreateTable),
+    Insert(BoundInsert),
+    Explain(Box<BoundStatement>),
+    Select(BoundSelect),
+}
+
+/// The error type of bind operations.
+#[derive(thiserror::Error, Debug, PartialEq)]
+pub enum BindError {
+    #[error("table must have at least one column")]
+    EmptyColumns,
+    #[error("schema not found: {0}")]
+    SchemaNotFound(String),
+    #[error("table not found: {0}")]
+    TableNotFound(String),
+    #[error("column not found: {0}")]
+    ColumnNotFound(String),
+    #[error("duplicated table: {0}")]
+    DuplicatedTable(String),
+    #[error("duplicated column: {0}")]
+    DuplicatedColumn(String),
+    #[error("invalid table name: {0:?}")]
+    InvalidTableName(Vec<Ident>),
+    #[error("duplicated alias: {0}")]
+    DuplicatedAlias(String),
+    #[error("ambiguous column name: {0}")]
+    AmbiguousColumnName(String),
+    #[error("not nullable column: {0}")]
+    NotNullableColumn(String),
+    #[error("tuple length mismatch: expected {expected} but got {actual}")]
+    TupleLengthMismatch { expected: usize, actual: usize },
+    #[error("value should not be null in column: {0}")]
+    NullValueInColumn(String),
+}
+
+/// The binder resolves all expressions referring to schema objects such as
+/// tables or views with their column names and types.
+pub struct Binder {
+    catalog: CatalogRef,
+    tables: HashMap<TableName, TableRefId>,
+}
+
+type TableName = String;
+
+impl Binder {
+    /// Create a new [Binder].
+    pub fn new(catalog: CatalogRef) -> Self {
+        Binder {
+            catalog,
+            tables: HashMap::default(),
+        }
+    }
+
+    /// Bind a statement.
+    pub fn bind(&mut self, stmt: &Statement) -> Result<BoundStatement, BindError> {
+        match stmt {
+            Statement::CreateTable { .. } => {
+                Ok(BoundStatement::CreateTable(self.bind_create_table(stmt)?))
+            }
+            Statement::Insert { .. } => Ok(BoundStatement::Insert(self.bind_insert(stmt)?)),
+            Statement::Explain { statement, .. } => {
+                Ok(BoundStatement::Explain(self.bind(&*statement)?.into()))
+            }
+            Statement::Query(query) => Ok(BoundStatement::Select(self.bind_select(&*query)?)),
+            _ => todo!("bind statement: {:#?}", stmt),
+        }
+    }
+}
+
+/// Split an [ObjectName] into `(schema name, table name)`.
+fn split_name(name: &ObjectName) -> Result<(&str, &str), BindError> {
+    Ok(match name.0.as_slice() {
+        [table] => (DEFAULT_SCHEMA_NAME, &table.value),
+        [schema, table] => (&schema.value, &table.value),
+        _ => return Err(BindError::InvalidTableName(name.0.clone())),
+    })
+}
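+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // A minimal sketch of `split_name`: an unqualified table name falls back
+    // to the default schema `postgres`.
+    #[test]
+    fn split_table_name() {
+        let name = ObjectName(vec![Ident::new("t")]);
+        assert_eq!(split_name(&name), Ok((DEFAULT_SCHEMA_NAME, "t")));
+        let name = ObjectName(vec![Ident::new("s"), Ident::new("t")]);
+        assert_eq!(split_name(&name), Ok(("s", "t")));
+    }
+}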
diff --git a/code/03-02/src/binder/statement/create_table.rs b/code/03-02/src/binder/statement/create_table.rs
new file mode 100644
index 0000000..97fdd6a
--- /dev/null
+++ b/code/03-02/src/binder/statement/create_table.rs
@@ -0,0 +1,117 @@
+use std::collections::HashSet;
+
+use super::*;
+use crate::catalog::ColumnDesc;
+use crate::parser::{ColumnDef, ColumnOption, Statement};
+use crate::types::DataType;
+
+/// A bound `CREATE TABLE` statement.
+#[derive(Debug, PartialEq, Clone)]
+pub struct BoundCreateTable {
+    pub schema_id: SchemaId,
+    pub table_name: String,
+    pub columns: Vec<(String, ColumnDesc)>,
+}
+
+impl Binder {
+    pub fn bind_create_table(&mut self, stmt: &Statement) -> Result<BoundCreateTable, BindError> {
+        match stmt {
+            Statement::CreateTable { name, columns, .. } => {
+                // check empty columns
+                if columns.is_empty() {
+                    return Err(BindError::EmptyColumns);
+                }
+                let (schema_name, table_name) = split_name(name)?;
+                let schema = self
+                    .catalog
+                    .get_schema_by_name(schema_name)
+                    .ok_or_else(|| BindError::SchemaNotFound(schema_name.into()))?;
+                // check duplicated table name
+                if schema.get_table_by_name(table_name).is_some() {
+                    return Err(BindError::DuplicatedTable(table_name.into()));
+                }
+                // check duplicated column names
+                let mut set = HashSet::new();
+                for col in columns.iter() {
+                    if !set.insert(col.name.value.clone()) {
+                        return Err(BindError::DuplicatedColumn(col.name.value.clone()));
+                    }
+                }
+                let columns = columns
+                    .iter()
+                    .map(|col| (col.name.value.clone(), ColumnDesc::from(col)))
+                    .collect();
+                Ok(BoundCreateTable {
+                    schema_id: schema.id(),
+                    table_name: table_name.into(),
+                    columns,
+                })
+            }
+            _ => panic!("mismatched statement type"),
+        }
+    }
+}
+
+impl From<&ColumnDef> for ColumnDesc {
+    fn from(cdef: &ColumnDef) -> Self {
+        let mut is_nullable = true;
+        let mut is_primary = false;
+        for opt in cdef.options.iter() {
+            match opt.option {
+                ColumnOption::Null => is_nullable = true,
+                ColumnOption::NotNull => is_nullable = false,
+                ColumnOption::Unique { is_primary: v } => is_primary = v,
+                _ => todo!("column options"),
+            }
+        }
+        ColumnDesc::new(
+            DataType::new(cdef.data_type.clone(), is_nullable),
+            is_primary,
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use super::*;
+    use crate::catalog::DatabaseCatalog;
+    use crate::parser::parse;
+    use crate::types::{DataTypeExt, DataTypeKind};
+
+    #[test]
+    fn bind_create_table() {
+        let catalog = Arc::new(DatabaseCatalog::new());
+        let mut binder = Binder::new(catalog.clone());
+        let sql = "
+            create table t1 (v1 int not null, v2 int);
+            create table t2 (a int not null, a int not null);
+            create table t3 (v1 int not null);";
+        let stmts = parse(sql).unwrap();
+
+        assert_eq!(
+            binder.bind_create_table(&stmts[0]).unwrap(),
+            BoundCreateTable {
+                schema_id: 0,
+                table_name: "t1".into(),
+                columns: vec![
+                    ("v1".into(), DataTypeKind::Int(None).not_null().to_column()),
+                    ("v2".into(), DataTypeKind::Int(None).nullable().to_column()),
+                ],
+            }
+        );
+
+        assert_eq!(
+            binder.bind_create_table(&stmts[1]),
+            Err(BindError::DuplicatedColumn("a".into()))
+        );
+
+        let schema = catalog.get_schema(0).unwrap();
+        schema.add_table("t3").unwrap();
+        assert_eq!(
+            binder.bind_create_table(&stmts[2]),
+            Err(BindError::DuplicatedTable("t3".into()))
+        );
+    }
+}
diff --git a/code/03-02/src/binder/statement/insert.rs b/code/03-02/src/binder/statement/insert.rs
new file mode 100644
index 0000000..3e7bfc1
--- /dev/null
+++ b/code/03-02/src/binder/statement/insert.rs
@@ -0,0 +1,131 @@
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use itertools::Itertools;
+
+use super::*;
+use crate::catalog::{ColumnCatalog, ColumnId, TableCatalog};
+use crate::parser::{SetExpr, Statement};
+use crate::types::{DataType, DataTypeKind};
+
+/// A bound `INSERT` statement.
+#[derive(Debug, PartialEq, Clone)]
+pub struct BoundInsert {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub column_types: Vec<DataType>,
+    pub values: Vec<Vec<BoundExpr>>,
+}
+
+impl Binder {
+    pub fn bind_insert(&mut self, stmt: &Statement) -> Result<BoundInsert, BindError> {
+        let (table_name, columns, source) = match stmt {
+            Statement::Insert {
+                table_name,
+                columns,
+                source,
+                ..
+            } => (table_name, columns, source),
+            _ => panic!("mismatched statement type"),
+        };
+        let (table_ref_id, table, columns) = self.bind_table_columns(table_name, columns)?;
+        let column_ids = columns.iter().map(|col| col.id()).collect_vec();
+        let column_types = columns.iter().map(|col| col.datatype()).collect_vec();
+
+        // Check that all columns without an inserted value are nullable.
+        let col_set: HashSet<ColumnId> = column_ids.iter().cloned().collect();
+        for (id, col) in table.all_columns() {
+            if !col_set.contains(&id) && !col.is_nullable() {
+                return Err(BindError::NotNullableColumn(col.name().into()));
+            }
+        }
+
+        let values = match &source.body {
+            SetExpr::Select(_) => todo!("handle 'insert into .. select .. from ..' case."),
+            SetExpr::Values(values) => &values.0,
+            _ => todo!("handle insert ???"),
+        };
+
+        // Handle 'insert into .. values ..' case.
+
+        // Check the inserted values; we only support inserting literal values for now.
+        let mut bound_values = Vec::with_capacity(values.len());
+        for row in values.iter() {
+            if row.len() > column_ids.len() {
+                return Err(BindError::TupleLengthMismatch {
+                    expected: columns.len(),
+                    actual: row.len(),
+                });
+            }
+            let mut bound_row = Vec::with_capacity(row.len());
+            for (idx, expr) in row.iter().enumerate() {
+                // Bind expression
+                let expr = self.bind_expr(expr)?;
+
+                if let Some(data_type) = &expr.return_type() {
+                    // TODO: support valid type casts.
+                    // For example:
+                    //   CREATE TABLE t (a FLOAT, b FLOAT);
+                    //   INSERT INTO t VALUES (1, 1);
+                    // Here the integer 1 should be cast to float.
+                    let left_kind = data_type.kind();
+                    let right_kind = column_types[idx].kind();
+                    match (&left_kind, &right_kind) {
+                        _ if left_kind == right_kind => {}
+                        // For char types, no need to cast
+                        (DataTypeKind::Char(_), DataTypeKind::Varchar(_)) => {}
+                        (DataTypeKind::Varchar(_), DataTypeKind::Char(_)) => {}
+                        _ => todo!("type cast: {} -> {}", left_kind, right_kind),
+                    }
+                } else {
+                    // If the data value is null, the column must be nullable.
+                    if !column_types[idx].is_nullable() {
+                        return Err(BindError::NullValueInColumn(columns[idx].name().into()));
+                    }
+                }
+                bound_row.push(expr);
+            }
+            bound_values.push(bound_row);
+        }
+
+        Ok(BoundInsert {
+            table_ref_id,
+            column_ids,
+            column_types,
+            values: bound_values,
+        })
+    }
+
+    /// Bind `table_name [ (column_name [, ...] ) ]`
+    pub fn bind_table_columns(
+        &mut self,
+        table_name: &ObjectName,
+        columns: &[Ident],
+    ) -> Result<(TableRefId, Arc<TableCatalog>, Vec<ColumnCatalog>), BindError> {
+        let (schema_name, table_name) = split_name(table_name)?;
+        let schema = self
+            .catalog
+            .get_schema_by_name(schema_name)
+            .ok_or_else(|| BindError::SchemaNotFound(schema_name.into()))?;
+        let table = schema
+            .get_table_by_name(table_name)
+            .ok_or_else(|| BindError::TableNotFound(table_name.into()))?;
+        let table_ref_id = TableRefId::new(schema.id(), table.id());
+
+        let columns = if columns.is_empty() {
+            // If the query does not provide column information, get all columns info.
+            table.all_columns().values().cloned().collect_vec()
+        } else {
+            // Otherwise, we get columns info from the query.
+            let mut column_catalogs = vec![];
+            for col in columns.iter() {
+                let col = table
+                    .get_column_by_name(&col.value)
+                    .ok_or_else(|| BindError::ColumnNotFound(col.value.clone()))?;
+                column_catalogs.push(col);
+            }
+            column_catalogs
+        };
+        Ok((table_ref_id, table, columns))
+    }
+}
diff --git a/code/03-02/src/binder/statement/mod.rs b/code/03-02/src/binder/statement/mod.rs
new file mode 100644
index 0000000..03bf861
--- /dev/null
+++ b/code/03-02/src/binder/statement/mod.rs
@@ -0,0 +1,9 @@
+use super::*;
+
+mod create_table;
+mod insert;
+mod select;
+
+pub use self::create_table::*;
+pub use self::insert::*;
+pub use self::select::*;
diff --git a/code/03-02/src/binder/statement/select.rs b/code/03-02/src/binder/statement/select.rs
new file mode 100644
index 0000000..ba475cf
--- /dev/null
+++ b/code/03-02/src/binder/statement/select.rs
@@ -0,0 +1,61 @@
+use super::*;
+use crate::binder::BoundTableRef;
+use crate::parser::{Query, SelectItem, SetExpr};
+
+/// A bound `SELECT` statement.
+#[derive(Debug, PartialEq, Clone)]
+pub struct BoundSelect {
+    pub select_list: Vec<BoundExpr>,
+    pub from_list: Vec<BoundTableRef>,
+}
+
+impl Binder {
+    pub fn bind_select(&mut self, query: &Query) -> Result<BoundSelect, BindError> {
+        let select = match &query.body {
+            SetExpr::Select(select) => &**select,
+            _ => todo!("not select"),
+        };
+
+        let mut from_list = vec![];
+        assert!(select.from.len() <= 1, "multiple tables are not supported");
+        for table_with_join in select.from.iter() {
+            let table_ref = self.bind_table_with_joins(table_with_join)?;
+            from_list.push(table_ref);
+        }
+
+        assert!(select.selection.is_none(), "WHERE clause is not supported");
+        assert!(
+            query.order_by.is_empty(),
+            "ORDER BY clause is not supported"
+        );
+        assert!(query.limit.is_none(), "LIMIT clause is not supported");
+        assert!(query.offset.is_none(), "OFFSET clause is not supported");
+        assert!(
+            select.group_by.is_empty(),
+            "GROUP BY clause is not supported"
+        );
+        assert!(!select.distinct, "DISTINCT is not supported");
+
+        // Bind the select list.
+        let mut select_list = vec![];
+        for item in select.projection.iter() {
+            match item {
+                SelectItem::UnnamedExpr(expr) => {
+                    select_list.push(self.bind_expr(expr)?);
+                }
+                SelectItem::ExprWithAlias { expr, .. } => {
+                    select_list.push(self.bind_expr(expr)?);
+                }
+                SelectItem::Wildcard => {
+                    select_list.extend(self.bind_all_column_refs()?);
+                }
+                _ => todo!("not supported select item: {:?}", item),
+            }
+        }
+
+        Ok(BoundSelect {
+            select_list,
+            from_list,
+        })
+    }
+}
diff --git a/code/03-02/src/binder/table_ref/mod.rs b/code/03-02/src/binder/table_ref/mod.rs
new file mode 100644
index 0000000..ad9ead6
--- /dev/null
+++ b/code/03-02/src/binder/table_ref/mod.rs
@@ -0,0 +1,36 @@
+use super::*;
+use crate::parser::{TableFactor, TableWithJoins};
+
+/// A bound table reference.
+#[derive(Debug, PartialEq, Clone)]
+pub struct BoundTableRef {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+}
+
+impl Binder {
+    pub fn bind_table_with_joins(
+        &mut self,
+        table: &TableWithJoins,
+    ) -> Result<BoundTableRef, BindError> {
+        assert!(table.joins.is_empty(), "JOIN is not supported");
+
+        let (name, alias) = match &table.relation {
+            TableFactor::Table { name, alias, .. } => (name, alias),
+            r => panic!("not supported table factor: {:?}", r),
+        };
+        let (table_ref_id, _, columns) = self.bind_table_columns(name, &[])?;
+        let alias = match alias {
+            Some(alias) => &alias.name.value,
+            None => split_name(name).unwrap().1,
+        };
+        if self.tables.contains_key(alias) {
+            return Err(BindError::DuplicatedAlias(alias.into()));
+        }
+        self.tables.insert(alias.into(), table_ref_id);
+        Ok(BoundTableRef {
+            table_ref_id,
+            column_ids: columns.iter().map(|col| col.id()).collect(),
+        })
+    }
+}
diff --git a/code/03-02/src/catalog b/code/03-02/src/catalog
deleted file mode 120000
index e98b018..0000000
--- a/code/03-02/src/catalog
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/catalog/
\ No newline at end of file
diff --git a/code/03-02/src/catalog/column.rs b/code/03-02/src/catalog/column.rs
new file mode 100644
index 0000000..abb6683
--- /dev/null
+++ b/code/03-02/src/catalog/column.rs
@@ -0,0 +1,94 @@
+use super::*;
+use crate::types::DataType;
+
+/// The descriptor of a column.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ColumnDesc {
+    datatype: DataType,
+    is_primary: bool,
+}
+
+impl ColumnDesc {
+    pub const fn new(datatype: DataType, is_primary: bool) -> Self {
+        ColumnDesc {
+            datatype,
+            is_primary,
+        }
+    }
+
+    pub fn is_primary(&self) -> bool {
+        self.is_primary
+    }
+
+    pub fn is_nullable(&self) -> bool {
+        self.datatype.is_nullable()
+    }
+
+    pub fn datatype(&self) -> &DataType {
+        &self.datatype
+    }
+}
+
+impl DataType {
+    pub const fn to_column(self) -> ColumnDesc {
+        ColumnDesc::new(self, false)
+    }
+
+    pub const fn to_column_primary_key(self) -> ColumnDesc {
+        ColumnDesc::new(self, true)
+    }
+}
+
+/// The catalog of a column.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ColumnCatalog {
+    id: ColumnId,
+    name: String,
+    desc: ColumnDesc,
+}
+
+impl ColumnCatalog {
+    pub(super) fn new(id: ColumnId, name: String, desc: ColumnDesc) -> ColumnCatalog {
+        ColumnCatalog { id, name, desc }
+    }
+
+    pub fn id(&self) -> ColumnId {
+        self.id
+    }
+
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    pub fn desc(&self) -> &ColumnDesc {
+        &self.desc
+    }
+
+    pub fn datatype(&self) -> DataType {
+        self.desc.datatype.clone()
+    }
+
+    pub fn is_primary(&self) -> bool {
+        self.desc.is_primary()
+    }
+
+    pub fn is_nullable(&self) -> bool {
+        self.desc.is_nullable()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{DataTypeExt, DataTypeKind};
+
+    #[test]
+    fn test_column_catalog() {
+        let col_desc = DataTypeKind::Int(None).not_null().to_column();
+        let col_catalog = ColumnCatalog::new(0, "grade".into(), col_desc);
+        assert_eq!(col_catalog.id(), 0);
+        assert!(!col_catalog.is_primary());
+        assert!(!col_catalog.is_nullable());
+        assert_eq!(col_catalog.name(), "grade");
+    }
+}
diff --git a/code/03-02/src/catalog/database.rs b/code/03-02/src/catalog/database.rs
new file mode 100644
index 0000000..e1fca34
--- /dev/null
+++ b/code/03-02/src/catalog/database.rs
@@ -0,0 +1,79 @@
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+use super::*;
+
+/// The catalog of a database.
+pub struct DatabaseCatalog {
+    inner: Mutex<Inner>,
+}
+
+#[derive(Default)]
+struct Inner {
+    schema_idxs: HashMap<String, SchemaId>,
+    schemas: HashMap<SchemaId, Arc<SchemaCatalog>>,
+    next_schema_id: SchemaId,
+}
+
+impl Default for DatabaseCatalog {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DatabaseCatalog {
+    pub fn new() -> Self {
+        let db_catalog = DatabaseCatalog {
+            inner: Mutex::new(Inner::default()),
+        };
+        db_catalog.add_schema(DEFAULT_SCHEMA_NAME).unwrap();
+        db_catalog
+    }
+
+    pub fn add_schema(&self, name: &str) -> Result<SchemaId, CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        if inner.schema_idxs.contains_key(name) {
+            return Err(CatalogError::Duplicated("schema", name.into()));
+        }
+        let id = inner.next_schema_id;
+        inner.next_schema_id += 1;
+        let schema_catalog = Arc::new(SchemaCatalog::new(id, name.into()));
+        inner.schema_idxs.insert(name.into(), id);
+        inner.schemas.insert(id, schema_catalog);
+        Ok(id)
+    }
+
+    pub fn del_schema(&self, name: &str) -> Result<(), CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        let id = inner
+            .schema_idxs
+            .remove(name)
+            .ok_or_else(|| CatalogError::NotFound("schema", name.into()))?;
+        inner.schemas.remove(&id);
+        Ok(())
+    }
+
+    pub fn all_schemas(&self) -> HashMap<SchemaId, Arc<SchemaCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner.schemas.clone()
+    }
+
+    pub fn get_schema(&self, schema_id: SchemaId) -> Option<Arc<SchemaCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner.schemas.get(&schema_id).cloned()
+    }
+
+    pub fn get_schema_by_name(&self, name: &str) -> Option<Arc<SchemaCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner
+            .schema_idxs
+            .get(name)
+            .and_then(|id| inner.schemas.get(id))
+            .cloned()
+    }
+
+    pub fn get_table(&self, table_ref_id: TableRefId) -> Option<Arc<TableCatalog>> {
+        let schema = self.get_schema(table_ref_id.schema_id)?;
+        schema.get_table(table_ref_id.table_id)
+    }
+}
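+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{DataTypeExt, DataTypeKind};
+
+    // A minimal sketch of the catalog hierarchy: database -> schema -> table
+    // -> column, resolved here by name through the default schema.
+    #[test]
+    fn database_catalog() {
+        let catalog = DatabaseCatalog::new();
+        let schema = catalog.get_schema_by_name(DEFAULT_SCHEMA_NAME).unwrap();
+        let table_id = schema.add_table("t").unwrap();
+        let table = schema.get_table(table_id).unwrap();
+        table
+            .add_column("v1", DataTypeKind::Int(None).not_null().to_column())
+            .unwrap();
+        let table_ref_id = TableRefId::new(schema.id(), table_id);
+        assert!(catalog.get_table(table_ref_id).is_some());
+    }
+}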
diff --git a/code/03-02/src/catalog/mod.rs b/code/03-02/src/catalog/mod.rs
new file mode 100644
index 0000000..594a43f
--- /dev/null
+++ b/code/03-02/src/catalog/mod.rs
@@ -0,0 +1,85 @@
+//! The metadata of all database objects.
+//!
+//! The hierarchy of the catalog is: [Database] - [Schema] - [Table] - [Column].
+//!
+//! There is a default schema `postgres` in it.
+//!
+//! [Database]: DatabaseCatalog
+//! [Schema]: SchemaCatalog
+//! [Table]: TableCatalog
+//! [Column]: ColumnCatalog
+
+use std::sync::Arc;
+
+mod column;
+mod database;
+mod schema;
+mod table;
+
+pub use self::column::*;
+pub use self::database::*;
+pub use self::schema::*;
+pub use self::table::*;
+
+/// The type of catalog reference.
+pub type CatalogRef = Arc<DatabaseCatalog>;
+/// The type of schema ID.
+pub type SchemaId = u32;
+/// The type of table ID.
+pub type TableId = u32;
+/// The type of column ID.
+pub type ColumnId = u32;
+
+/// The name of default schema: `postgres`.
+pub const DEFAULT_SCHEMA_NAME: &str = "postgres";
+
+/// The reference ID of a table.
+#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
+pub struct TableRefId {
+    pub schema_id: SchemaId,
+    pub table_id: TableId,
+}
+
+impl TableRefId {
+    pub const fn new(schema_id: SchemaId, table_id: TableId) -> Self {
+        TableRefId {
+            schema_id,
+            table_id,
+        }
+    }
+}
+
+/// The reference ID of a column.
+#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
+pub struct ColumnRefId {
+    pub schema_id: SchemaId,
+    pub table_id: TableId,
+    pub column_id: ColumnId,
+}
+
+impl ColumnRefId {
+    pub const fn from_table(table: TableRefId, column_id: ColumnId) -> Self {
+        ColumnRefId {
+            schema_id: table.schema_id,
+            table_id: table.table_id,
+            column_id,
+        }
+    }
+
+    pub const fn new(schema_id: SchemaId, table_id: TableId, column_id: ColumnId) -> Self {
+        ColumnRefId {
+            schema_id,
+            table_id,
+            column_id,
+        }
+    }
+}
+
+/// The error type of catalog operations.
+#[derive(thiserror::Error, Debug)]
+pub enum CatalogError {
+    #[error("{0} not found: {1}")]
+    NotFound(&'static str, String),
+    #[error("duplicated {0}: {1}")]
+    Duplicated(&'static str, String),
+}
diff --git a/code/03-02/src/catalog/schema.rs b/code/03-02/src/catalog/schema.rs
new file mode 100644
index 0000000..d30c0df
--- /dev/null
+++ b/code/03-02/src/catalog/schema.rs
@@ -0,0 +1,88 @@
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+use super::*;
+
+/// The catalog of a schema.
+pub struct SchemaCatalog {
+    id: SchemaId,
+    inner: Mutex<Inner>,
+}
+
+struct Inner {
+    name: String,
+    table_idxs: HashMap<String, TableId>,
+    tables: HashMap<TableId, Arc<TableCatalog>>,
+    next_table_id: TableId,
+}
+
+impl SchemaCatalog {
+    pub(super) fn new(id: SchemaId, name: String) -> SchemaCatalog {
+        SchemaCatalog {
+            id,
+            inner: Mutex::new(Inner {
+                name,
+                table_idxs: HashMap::new(),
+                tables: HashMap::new(),
+                next_table_id: 0,
+            }),
+        }
+    }
+
+    pub fn id(&self) -> SchemaId {
+        self.id
+    }
+
+    pub fn name(&self) -> String {
+        let inner = self.inner.lock().unwrap();
+        inner.name.clone()
+    }
+
+    pub fn add_table(&self, name: &str) -> Result<TableId, CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        if inner.table_idxs.contains_key(name) {
+            return Err(CatalogError::Duplicated("table", name.into()));
+        }
+        let id = inner.next_table_id;
+        inner.next_table_id += 1;
+        let table_catalog = Arc::new(TableCatalog::new(id, name.into()));
+        inner.table_idxs.insert(name.into(), id);
+        inner.tables.insert(id, table_catalog);
+        Ok(id)
+    }
+
+    pub fn del_table_by_name(&self, name: &str) -> Result<(), CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        let id = inner
+            .table_idxs
+            .remove(name)
+            .ok_or_else(|| CatalogError::NotFound("table", name.into()))?;
+        inner.tables.remove(&id);
+        Ok(())
+    }
+
+    pub fn del_table(&self, id: TableId) {
+        let mut inner = self.inner.lock().unwrap();
+        let catalog = inner.tables.remove(&id).unwrap();
+        inner.table_idxs.remove(&catalog.name()).unwrap();
+    }
+
+    pub fn all_tables(&self) -> HashMap<TableId, Arc<TableCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner.tables.clone()
+    }
+
+    pub fn get_table(&self, table_id: TableId) -> Option<Arc<TableCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner.tables.get(&table_id).cloned()
+    }
+
+    pub fn get_table_by_name(&self, name: &str) -> Option<Arc<TableCatalog>> {
+        let inner = self.inner.lock().unwrap();
+        inner
+            .table_idxs
+            .get(name)
+            .and_then(|id| inner.tables.get(id))
+            .cloned()
+    }
+}
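+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // A minimal sketch of table registration: ids are assigned sequentially,
+    // duplicated names are rejected, and lookups stay consistent after deletion.
+    #[test]
+    fn schema_catalog() {
+        let schema = SchemaCatalog::new(0, "postgres".into());
+        let t0 = schema.add_table("t0").unwrap();
+        let t1 = schema.add_table("t1").unwrap();
+        assert_eq!((t0, t1), (0, 1));
+        assert!(schema.add_table("t0").is_err());
+        schema.del_table_by_name("t0").unwrap();
+        assert!(schema.get_table_by_name("t0").is_none());
+        assert!(schema.get_table(t1).is_some());
+    }
+}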
diff --git a/code/03-02/src/catalog/table.rs b/code/03-02/src/catalog/table.rs
new file mode 100644
index 0000000..2422457
--- /dev/null
+++ b/code/03-02/src/catalog/table.rs
@@ -0,0 +1,111 @@
+use std::collections::{BTreeMap, HashMap};
+use std::sync::Mutex;
+
+use super::*;
+
+/// The catalog of a table.
+pub struct TableCatalog {
+    id: TableId,
+    inner: Mutex<Inner>,
+}
+
+struct Inner {
+    name: String,
+    /// Mapping from column names to column ids
+    column_idxs: HashMap<String, ColumnId>,
+    columns: BTreeMap<ColumnId, ColumnCatalog>,
+    next_column_id: ColumnId,
+}
+
+impl TableCatalog {
+    pub(super) fn new(id: TableId, name: String) -> TableCatalog {
+        TableCatalog {
+            id,
+            inner: Mutex::new(Inner {
+                name,
+                column_idxs: HashMap::new(),
+                columns: BTreeMap::new(),
+                next_column_id: 0,
+            }),
+        }
+    }
+
+    pub fn id(&self) -> TableId {
+        self.id
+    }
+
+    pub fn name(&self) -> String {
+        let inner = self.inner.lock().unwrap();
+        inner.name.clone()
+    }
+
+    pub fn add_column(&self, name: &str, desc: ColumnDesc) -> Result<ColumnId, CatalogError> {
+        let mut inner = self.inner.lock().unwrap();
+        if inner.column_idxs.contains_key(name) {
+            return Err(CatalogError::Duplicated("column", name.into()));
+        }
+        let id = inner.next_column_id;
+        inner.next_column_id += 1;
+        inner.column_idxs.insert(name.into(), id);
+        inner
+            .columns
+            .insert(id, ColumnCatalog::new(id, name.into(), desc));
+        Ok(id)
+    }
+
+    pub fn contains_column(&self, name: &str) -> bool {
+        let inner = self.inner.lock().unwrap();
+        inner.column_idxs.contains_key(name)
+    }
+
+    pub fn all_columns(&self) -> BTreeMap<ColumnId, ColumnCatalog> {
+        let inner = self.inner.lock().unwrap();
+        inner.columns.clone()
+    }
+
+    pub fn get_column(&self, id: ColumnId) -> Option<ColumnCatalog> {
+        let inner = self.inner.lock().unwrap();
+        inner.columns.get(&id).cloned()
+    }
+
+    pub fn get_column_by_name(&self, name: &str) -> Option<ColumnCatalog> {
+        let inner = self.inner.lock().unwrap();
+        inner
+            .column_idxs
+            .get(name)
+            .and_then(|id| inner.columns.get(id))
+            .cloned()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{DataTypeExt, DataTypeKind};
+
+    #[test]
+    fn test_table_catalog() {
+        let table_catalog = TableCatalog::new(0, "t".into());
+        table_catalog
+            .add_column("a", DataTypeKind::Int(None).not_null().to_column())
+            .unwrap();
+        table_catalog
+            .add_column("b", DataTypeKind::Boolean.not_null().to_column())
+            .unwrap();
+
+        assert!(!table_catalog.contains_column("c"));
+        assert!(table_catalog.contains_column("a"));
+        assert!(table_catalog.contains_column("b"));
+
+        assert_eq!(table_catalog.get_column_by_name("a").unwrap().id(), 0);
+        assert_eq!(table_catalog.get_column_by_name("b").unwrap().id(), 1);
+
+        let col0_catalog = table_catalog.get_column(0).unwrap();
+        assert_eq!(col0_catalog.name(), "a");
+        assert_eq!(col0_catalog.datatype().kind(), DataTypeKind::Int(None));
+
+        let col1_catalog = table_catalog.get_column(1).unwrap();
+        assert_eq!(col1_catalog.name(), "b");
+        assert_eq!(col1_catalog.datatype().kind(), DataTypeKind::Boolean);
+    }
+}
diff --git a/code/03-02/src/db.rs b/code/03-02/src/db.rs
deleted file mode 120000
index ef0ca75..0000000
--- a/code/03-02/src/db.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/db.rs
\ No newline at end of file
diff --git a/code/03-02/src/db.rs b/code/03-02/src/db.rs
new file mode 100644
index 0000000..cae83a5
--- /dev/null
+++ b/code/03-02/src/db.rs
@@ -0,0 +1,87 @@
+//! Top-level structure of the database.
+
+use std::sync::Arc;
+
+use futures::TryStreamExt;
+use tokio::runtime::Runtime;
+
+use crate::array::DataChunk;
+use crate::binder::{BindError, Binder};
+use crate::catalog::{CatalogRef, DatabaseCatalog};
+use crate::executor::{ExecuteError, ExecutorBuilder};
+use crate::logical_planner::{LogicalPlanError, LogicalPlanner};
+use crate::parser::{parse, ParserError};
+use crate::physical_planner::{PhysicalPlanError, PhysicalPlanner};
+use crate::storage::{DiskStorage, StorageOptions};
+
+/// The database instance.
+pub struct Database {
+    catalog: CatalogRef,
+    executor_builder: ExecutorBuilder,
+    runtime: Runtime,
+}
+
+impl Database {
+    /// Create a new database instance.
+    pub fn new(options: StorageOptions) -> Self {
+        let catalog = Arc::new(DatabaseCatalog::new());
+        let storage = Arc::new(DiskStorage::new(options));
+        let parallel = matches!(std::env::var("LIGHT_PARALLEL"), Ok(s) if s == "1");
+        let runtime = if parallel {
+            tokio::runtime::Builder::new_multi_thread()
+        } else {
+            tokio::runtime::Builder::new_current_thread()
+        }
+        .build()
+        .expect("failed to create tokio runtime");
+        let handle = parallel.then(|| runtime.handle().clone());
+        Database {
+            catalog: catalog.clone(),
+            executor_builder: ExecutorBuilder::new(catalog, storage, handle),
+            runtime,
+        }
+    }
+
+    /// Run SQL queries and return the outputs.
+    pub fn run(&self, sql: &str) -> Result<Vec<DataChunk>, Error> {
+        // parse
+        let stmts = parse(sql)?;
+
+        let mut outputs = vec![];
+        for stmt in stmts {
+            let mut binder = Binder::new(self.catalog.clone());
+            let logical_planner = LogicalPlanner::default();
+            let physical_planner = PhysicalPlanner::default();
+
+            let bound_stmt = binder.bind(&stmt)?;
+            debug!("{:#?}", bound_stmt);
+            let logical_plan = logical_planner.plan(bound_stmt)?;
+            debug!("{:#?}", logical_plan);
+            let physical_plan = physical_planner.plan(&logical_plan)?;
+            debug!("{:#?}", physical_plan);
+            let mut executor = self.executor_builder.build(physical_plan);
+            self.runtime.block_on(async {
+                while let Some(chunk) = executor.try_next().await? {
+                    outputs.push(chunk);
+                }
+                Ok(()) as Result<(), Error>
+            })?;
+        }
+        Ok(outputs)
+    }
+}
+
+/// The error type of database operations.
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("parse error: {0}")]
+    Parse(#[from] ParserError),
+    #[error("bind error: {0}")]
+    Bind(#[from] BindError),
+    #[error("logical plan error: {0}")]
+    LogicalPlan(#[from] LogicalPlanError),
+    #[error("physical plan error: {0}")]
+    PhysicalPlan(#[from] PhysicalPlanError),
+    #[error("execute error: {0}")]
+    Execute(#[from] ExecuteError),
+}
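+
+// A minimal usage sketch (the path and SQL below are illustrative):
+//
+//     let db = Database::new(StorageOptions { base_path: "risinglight.db".into() });
+//     db.run("create table t (v int not null)")?;
+//     db.run("insert into t values (1), (2), (3)")?;
+//     let chunks = db.run("select v from t")?;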
diff --git a/code/03-02/src/executor b/code/03-02/src/executor
deleted file mode 120000
index 991a7a7..0000000
--- a/code/03-02/src/executor
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/executor/
\ No newline at end of file
diff --git a/code/03-02/src/executor/create.rs b/code/03-02/src/executor/create.rs
new file mode 100644
index 0000000..b06a81b
--- /dev/null
+++ b/code/03-02/src/executor/create.rs
@@ -0,0 +1,29 @@
+use super::*;
+use crate::catalog::TableRefId;
+use crate::physical_planner::PhysicalCreateTable;
+
+/// The executor of `CREATE TABLE` statement.
+pub struct CreateTableExecutor {
+    pub plan: PhysicalCreateTable,
+    pub catalog: CatalogRef,
+    pub storage: StorageRef,
+}
+
+impl CreateTableExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        let schema = self.catalog.get_schema(self.plan.schema_id).unwrap();
+        let table_id = schema.add_table(&self.plan.table_name).unwrap();
+        let table = schema.get_table(table_id).unwrap();
+        let mut column_descs = vec![];
+        for (name, desc) in &self.plan.columns {
+            table.add_column(name, desc.clone()).unwrap();
+            column_descs.push(desc.clone());
+        }
+        self.storage.add_table(
+            TableRefId::new(self.plan.schema_id, table_id),
+            &column_descs,
+        )?;
+        yield DataChunk::single(1);
+    }
+}
diff --git a/code/03-02/src/executor/dummy.rs b/code/03-02/src/executor/dummy.rs
new file mode 100644
index 0000000..f6e5ed4
--- /dev/null
+++ b/code/03-02/src/executor/dummy.rs
@@ -0,0 +1,11 @@
+use super::*;
+
+/// A dummy executor that produces a single value.
+pub struct DummyExecutor;
+
+impl DummyExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        yield DataChunk::single(0);
+    }
+}
diff --git a/code/03-02/src/executor/evaluator.rs b/code/03-02/src/executor/evaluator.rs
new file mode 100644
index 0000000..52d10c8
--- /dev/null
+++ b/code/03-02/src/executor/evaluator.rs
@@ -0,0 +1,40 @@
+use crate::array::*;
+use crate::binder::BoundExpr;
+use crate::executor::ExecuteError;
+use crate::types::DataValue;
+
+impl BoundExpr {
+    /// Evaluate the given expression as a constant value.
+    ///
+    /// This method is used to evaluate `INSERT ... VALUES` and in the optimizer.
+    pub fn eval_const(&self) -> Result<DataValue, ExecuteError> {
+        match &self {
+            Self::Constant(v) => Ok(v.clone()),
+            Self::ColumnRef(_) => panic!("can not evaluate on ColumnRef"),
+        }
+    }
+
+    /// Evaluate the given expression as an array.
+    pub fn eval_array(&self, chunk: &DataChunk) -> Result<ArrayImpl, ExecuteError> {
+        match &self {
+            // NOTE:
+            // Currently we assume that the column id is equal to its physical index in the
+            // DataChunk. It is true in a simple `SELECT v FROM t` case, where the child plan of the
+            // Projection is Get. However, in a more complex case with join or aggregation, this
+            // assumption no longer holds. At that time we will convert the ColumnRef into an
+            // InputRef, and resolve the physical index from column id.
+            Self::ColumnRef(v) => Ok(chunk.arrays()[v.column_ref_id.column_id as usize].clone()),
+            Self::Constant(v) => {
+                let mut builder = ArrayBuilderImpl::with_capacity(
+                    chunk.cardinality(),
+                    &self.return_type().unwrap(),
+                );
+                // TODO: optimize this
+                for _ in 0..chunk.cardinality() {
+                    builder.push(v);
+                }
+                Ok(builder.finish())
+            }
+        }
+    }
+}
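+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // A minimal sketch of constant evaluation: a `Constant` expression is
+    // broadcast to an array with the chunk's cardinality.
+    #[test]
+    fn eval_constant() {
+        let chunk = DataChunk::from_iter([ArrayImpl::Int32((0..4).collect())]);
+        let expr = BoundExpr::Constant(DataValue::Int32(7));
+        let array = expr.eval_array(&chunk).unwrap();
+        let expected = DataChunk::from_iter([ArrayImpl::Int32([7; 4].into_iter().collect())]);
+        assert_eq!(DataChunk::from_iter([array]), expected);
+    }
+}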
diff --git a/code/03-02/src/executor/explain.rs b/code/03-02/src/executor/explain.rs
new file mode 100644
index 0000000..3a2c6b5
--- /dev/null
+++ b/code/03-02/src/executor/explain.rs
@@ -0,0 +1,19 @@
+use super::*;
+use crate::array::ArrayImpl;
+use crate::physical_planner::PhysicalPlan;
+
+/// The executor of `EXPLAIN` statement.
+pub struct ExplainExecutor {
+    pub plan: Box<PhysicalPlan>,
+}
+
+impl ExplainExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        let explain_result = format!("{}", *self.plan);
+        let chunk = DataChunk::from_iter([ArrayImpl::Utf8(
+            [Some(explain_result)].into_iter().collect(),
+        )]);
+        yield chunk;
+    }
+}
diff --git a/code/03-02/src/executor/insert.rs b/code/03-02/src/executor/insert.rs
new file mode 100644
index 0000000..d40b61a
--- /dev/null
+++ b/code/03-02/src/executor/insert.rs
@@ -0,0 +1,77 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::array::{ArrayBuilderImpl, DataChunk};
+use crate::catalog::{ColumnId, TableRefId};
+use crate::types::{DataType, DataValue};
+
+/// The executor of `INSERT` statement.
+pub struct InsertExecutor {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub catalog: CatalogRef,
+    pub storage: StorageRef,
+    pub child: BoxedExecutor,
+}
+
+impl InsertExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        let table = self.storage.get_table(self.table_ref_id)?;
+        let catalog = self.catalog.get_table(self.table_ref_id).unwrap();
+        // Describe each column of the output chunks.
+        // example:
+        //   columns     = [0: Int, 1: Bool, 3: Float, 4: String]
+        //   column_ids  = [4, 1]
+        //   => output_columns = [Null(Int), Pick(1), Null(Float), Pick(0)]
+        let output_columns = catalog
+            .all_columns()
+            .values()
+            .map(
+                |col| match self.column_ids.iter().position(|&id| id == col.id()) {
+                    Some(index) => Column::Pick { index },
+                    None => Column::Null {
+                        type_: col.datatype(),
+                    },
+                },
+            )
+            .collect_vec();
+        let mut count = 0;
+
+        let mut txn = table.write().await?;
+
+        #[for_await]
+        for chunk in self.child {
+            let chunk = transform_chunk(chunk?, &output_columns);
+            count += chunk.cardinality();
+            txn.append(chunk).await?;
+        }
+
+        txn.commit().await?;
+
+        yield DataChunk::single(count as i32);
+    }
+}
+
+enum Column {
+    /// Pick the column at `index` from child.
+    Pick { index: usize },
+    /// Null values with `type`.
+    Null { type_: DataType },
+}
+
+fn transform_chunk(chunk: DataChunk, output_columns: &[Column]) -> DataChunk {
+    output_columns
+        .iter()
+        .map(|col| match col {
+            Column::Pick { index } => chunk.arrays()[*index].clone(),
+            Column::Null { type_ } => {
+                let mut builder = ArrayBuilderImpl::with_capacity(chunk.cardinality(), type_);
+                for _ in 0..chunk.cardinality() {
+                    builder.push(&DataValue::Null);
+                }
+                builder.finish()
+            }
+        })
+        .collect()
}
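+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::array::ArrayImpl;
+    use crate::types::{DataTypeExt, DataTypeKind};
+
+    // A minimal sketch of the mapping described above: pick child column 0
+    // into output column 1, and fill output column 0 with NULLs of the
+    // requested type.
+    #[test]
+    fn transform() {
+        let chunk = DataChunk::from_iter([ArrayImpl::Int32((0..4).collect())]);
+        let output_columns = [
+            Column::Null {
+                type_: DataTypeKind::Int(None).nullable(),
+            },
+            Column::Pick { index: 0 },
+        ];
+        let output = transform_chunk(chunk, &output_columns);
+        assert_eq!(output.cardinality(), 4);
+    }
+}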
diff --git a/code/03-02/src/executor/mod.rs b/code/03-02/src/executor/mod.rs
new file mode 100644
index 0000000..9626fbc
--- /dev/null
+++ b/code/03-02/src/executor/mod.rs
@@ -0,0 +1,122 @@
+//! Execute the queries.
+
+use futures::stream::{BoxStream, StreamExt};
+use futures_async_stream::try_stream;
+
+use crate::array::DataChunk;
+use crate::catalog::CatalogRef;
+use crate::physical_planner::PhysicalPlan;
+use crate::storage::{StorageError, StorageRef};
+
+mod create;
+mod dummy;
+mod evaluator;
+mod explain;
+mod insert;
+mod projection;
+mod seq_scan;
+mod values;
+
+use self::create::*;
+use self::dummy::*;
+use self::explain::*;
+use self::insert::*;
+use self::projection::*;
+use self::seq_scan::*;
+use self::values::*;
+
+/// The maximum chunk length produced by an executor at a time.
+const PROCESSING_WINDOW_SIZE: usize = 1024;
+
+/// The error type of execution.
+#[derive(thiserror::Error, Debug)]
+pub enum ExecuteError {
+    #[error("storage error: {0}")]
+    Storage(#[from] StorageError),
+}
+
+/// A type-erased executor object.
+///
+/// Logically an executor is a stream of data chunks.
+///
+/// It consumes one or more streams from its child executors,
+/// and produces a stream to its parent.
+pub type BoxedExecutor = BoxStream<'static, Result<DataChunk, ExecuteError>>;
+
+/// The builder of executor.
+pub struct ExecutorBuilder {
+    catalog: CatalogRef,
+    storage: StorageRef,
+    /// An optional runtime handle.
+    ///
+    /// If it is some, spawn the executor to runtime and return a channel receiver.
+    handle: Option<tokio::runtime::Handle>,
+}
+
+impl ExecutorBuilder {
+    /// Create a new executor builder.
+    pub fn new(
+        catalog: CatalogRef,
+        storage: StorageRef,
+        handle: Option<tokio::runtime::Handle>,
+    ) -> ExecutorBuilder {
+        ExecutorBuilder {
+            catalog,
+            storage,
+            handle,
+        }
+    }
+
+    /// Build executor from a [PhysicalPlan].
+    pub fn build(&self, plan: PhysicalPlan) -> BoxedExecutor {
+        use PhysicalPlan::*;
+        let mut executor: BoxedExecutor = match plan {
+            PhysicalCreateTable(plan) => CreateTableExecutor {
+                plan,
+                catalog: self.catalog.clone(),
+                storage: self.storage.clone(),
+            }
+            .execute(),
+            PhysicalInsert(plan) => InsertExecutor {
+                table_ref_id: plan.table_ref_id,
+                column_ids: plan.column_ids,
+                catalog: self.catalog.clone(),
+                storage: self.storage.clone(),
+                child: self.build(*plan.child),
+            }
+            .execute(),
+            PhysicalValues(plan) => ValuesExecutor {
+                column_types: plan.column_types,
+                values: plan.values,
+            }
+            .execute(),
+            PhysicalExplain(plan) => ExplainExecutor { plan: plan.child }.execute(),
+            PhysicalDummy(_) => DummyExecutor.execute(),
+            PhysicalSeqScan(plan) => SeqScanExecutor {
+                table_ref_id: plan.table_ref_id,
+                column_ids: plan.column_ids,
+                storage: self.storage.clone(),
+            }
+            .execute(),
+            PhysicalProjection(plan) => ProjectionExecutor {
+                exprs: plan.exprs,
+                child: self.build(*plan.child),
+            }
+            .execute(),
+        };
+        if let Some(handle) = &self.handle {
+            // In parallel mode, we spawn the executor into the current tokio runtime,
+            // connect it with a channel, and return the receiver as an executor.
+            // Therefore, when used with tokio multi-thread runtime, they can run in parallel.
+            let (tx, rx) = tokio::sync::mpsc::channel(1);
+            handle.spawn(async move {
+                while let Some(e) = executor.next().await {
+                    tx.send(e).await.unwrap();
+                }
+            });
+            tokio_stream::wrappers::ReceiverStream::new(rx).boxed()
+        } else {
+            executor
+        }
+    }
+}
diff --git a/code/03-02/src/executor/projection.rs b/code/03-02/src/executor/projection.rs
new file mode 100644
index 0000000..038584d
--- /dev/null
+++ b/code/03-02/src/executor/projection.rs
@@ -0,0 +1,25 @@
+use super::*;
+use crate::array::DataChunk;
+use crate::binder::BoundExpr;
+
+/// The executor of projection operation.
+pub struct ProjectionExecutor {
+    pub exprs: Vec<BoundExpr>,
+    pub child: BoxedExecutor,
+}
+
+impl ProjectionExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        #[for_await]
+        for batch in self.child {
+            let batch = batch?;
+            let chunk = self
+                .exprs
+                .iter()
+                .map(|expr| expr.eval_array(&batch))
+                .collect::<Result<DataChunk, _>>()?;
+            yield chunk;
+        }
+    }
+}
diff --git a/code/03-02/src/executor/seq_scan.rs b/code/03-02/src/executor/seq_scan.rs
new file mode 100644
index 0000000..d1f5df9
--- /dev/null
+++ b/code/03-02/src/executor/seq_scan.rs
@@ -0,0 +1,24 @@
+use super::*;
+use crate::array::DataChunk;
+use crate::catalog::{ColumnId, TableRefId};
+
+/// The executor of sequential scan operation.
+pub struct SeqScanExecutor {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub storage: StorageRef,
+}
+
+impl SeqScanExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        let table = self.storage.get_table(self.table_ref_id)?;
+        let txn = table.read().await?;
+
+        for chunk in txn.all_chunks().await? {
+            yield chunk;
+        }
+
+        txn.commit().await?;
+    }
+}
diff --git a/code/03-02/src/executor/values.rs b/code/03-02/src/executor/values.rs
new file mode 100644
index 0000000..f0e3550
--- /dev/null
+++ b/code/03-02/src/executor/values.rs
@@ -0,0 +1,73 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::array::{ArrayBuilderImpl, DataChunk};
+use crate::binder::BoundExpr;
+use crate::types::DataType;
+
+/// The executor of `VALUES`.
+pub struct ValuesExecutor {
+    pub column_types: Vec<DataType>,
+    /// Each row is composed of multiple values, each value is represented by an expression.
+    pub values: Vec<Vec<BoundExpr>>,
+}
+
+impl ValuesExecutor {
+    #[try_stream(boxed, ok = DataChunk, error = ExecuteError)]
+    pub async fn execute(self) {
+        for chunk in self.values.chunks(PROCESSING_WINDOW_SIZE) {
+            // Create array builders.
+            let mut builders = self
+                .column_types
+                .iter()
+                .map(|ty| ArrayBuilderImpl::with_capacity(chunk.len(), ty))
+                .collect_vec();
+            // Push values into the builders.
+            for row in chunk {
+                for (expr, builder) in row.iter().zip(&mut builders) {
+                    let value = expr.eval_const()?;
+                    builder.push(&value);
+                }
+            }
+            // Finish build and yield chunk.
+            let chunk = builders
+                .into_iter()
+                .map(|builder| builder.finish())
+                .collect::<DataChunk>();
+            yield chunk;
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::array::ArrayImpl;
+    use crate::binder::BoundExpr;
+    use crate::types::{DataTypeExt, DataTypeKind, DataValue};
+
+    #[tokio::test]
+    async fn values() {
+        let values = [[0, 100], [1, 101], [2, 102], [3, 103]];
+        let mut executor = ValuesExecutor {
+            column_types: vec![DataTypeKind::Int(None).nullable(); 2],
+            values: values
+                .iter()
+                .map(|row| {
+                    row.iter()
+                        .map(|&v| BoundExpr::Constant(DataValue::Int32(v)))
+                        .collect::<Vec<_>>()
+                })
+                .collect::<Vec<_>>(),
+        }
+        .execute();
+        let output = executor.next().await.unwrap().unwrap();
+        let expected = [
+            ArrayImpl::Int32((0..4).collect()),
+            ArrayImpl::Int32((100..104).collect()),
+        ]
+        .into_iter()
+        .collect::<DataChunk>();
+        assert_eq!(output, expected);
+    }
+}
diff --git a/code/03-02/src/lib.rs b/code/03-02/src/lib.rs
deleted file mode 120000
index 84f6551..0000000
--- a/code/03-02/src/lib.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/lib.rs
\ No newline at end of file
diff --git a/code/03-02/src/lib.rs b/code/03-02/src/lib.rs
new file mode 100644
index 0000000..f7704d4
--- /dev/null
+++ b/code/03-02/src/lib.rs
@@ -0,0 +1,36 @@
+//! RisingLight -- an educational OLAP database.
+
+#![deny(unused_must_use)]
+#![feature(generators)]
+
+// Enable macros for logging.
+#[macro_use]
+extern crate log;
+
+#[cfg(test)]
+mod test;
+
+// Top-level structure of the database.
+pub mod db;
+
+// Stage 1: Parse the SQL string into an Abstract Syntax Tree (AST).
+pub mod parser;
+
+// Stage 2: Resolve all name references in expressions.
+pub mod binder;
+
+// Stage 3: Transform the parse tree into a logical operations tree.
+pub mod logical_planner;
+
+// Stage 4: Transform the logical plan into the physical plan.
+pub mod physical_planner;
+
+// Stage 5: Execute the plans.
+pub mod executor;
+
+pub mod array;
+pub mod catalog;
+pub mod storage;
+pub mod types;
+
+pub use self::db::{Database, Error};
diff --git a/code/03-02/src/logical_planner b/code/03-02/src/logical_planner
deleted file mode 120000
index 80ac2b5..0000000
--- a/code/03-02/src/logical_planner
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/logical_planner
\ No newline at end of file
diff --git a/code/03-02/src/logical_planner/create.rs b/code/03-02/src/logical_planner/create.rs
new file mode 100644
index 0000000..beef26f
--- /dev/null
+++ b/code/03-02/src/logical_planner/create.rs
@@ -0,0 +1,41 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::binder::BoundCreateTable;
+use crate::catalog::{ColumnDesc, SchemaId};
+
+/// The logical plan of `CREATE TABLE`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalCreateTable {
+    pub schema_id: SchemaId,
+    pub table_name: String,
+    pub columns: Vec<(String, ColumnDesc)>,
+}
+
+impl LogicalPlanner {
+    pub fn plan_create_table(
+        &self,
+        stmt: BoundCreateTable,
+    ) -> Result<LogicalPlan, LogicalPlanError> {
+        Ok(LogicalCreateTable {
+            schema_id: stmt.schema_id,
+            table_name: stmt.table_name,
+            columns: stmt.columns,
+        }
+        .into())
+    }
+}
+
+impl Explain for LogicalCreateTable {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "CreateTable: name: {}, columns: [{}]",
+            self.table_name,
+            self.columns
+                .iter()
+                .map(|(name, col)| format!("{}: {:?}", name, col.datatype()))
+                .join(", ")
+        )
+    }
+}
diff --git a/code/03-02/src/logical_planner/explain.rs b/code/03-02/src/logical_planner/explain.rs
new file mode 100644
index 0000000..f44c4a4
--- /dev/null
+++ b/code/03-02/src/logical_planner/explain.rs
@@ -0,0 +1,22 @@
+use super::*;
+
+/// The logical plan of `EXPLAIN`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalExplain {
+    pub child: LogicalPlanRef,
+}
+
+impl LogicalPlanner {
+    pub fn plan_explain(&self, stmt: BoundStatement) -> Result<LogicalPlan, LogicalPlanError> {
+        Ok(LogicalExplain {
+            child: self.plan(stmt)?.into(),
+        }
+        .into())
+    }
+}
+
+impl Explain for LogicalExplain {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Huh, explain myself?")
+    }
+}
diff --git a/code/03-02/src/logical_planner/insert.rs b/code/03-02/src/logical_planner/insert.rs
new file mode 100644
index 0000000..41ed47f
--- /dev/null
+++ b/code/03-02/src/logical_planner/insert.rs
@@ -0,0 +1,56 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::binder::{BoundExpr, BoundInsert};
+use crate::catalog::{ColumnId, TableRefId};
+use crate::types::DataType;
+
+/// The logical plan of `INSERT`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalInsert {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub child: LogicalPlanRef,
+}
+
+/// The logical plan of `VALUES`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalValues {
+    pub column_types: Vec<DataType>,
+    pub values: Vec<Vec<BoundExpr>>,
+}
+
+impl LogicalPlanner {
+    pub fn plan_insert(&self, stmt: BoundInsert) -> Result<LogicalPlan, LogicalPlanError> {
+        Ok(LogicalInsert {
+            table_ref_id: stmt.table_ref_id,
+            column_ids: stmt.column_ids,
+            child: Rc::new(
+                LogicalValues {
+                    column_types: stmt.column_types,
+                    values: stmt.values,
+                }
+                .into(),
+            ),
+        }
+        .into())
+    }
+}
+
+impl Explain for LogicalInsert {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "Insert: table {}, columns [{}]",
+            self.table_ref_id.table_id,
+            self.column_ids.iter().map(ToString::to_string).join(", ")
+        )?;
+        self.child.explain(level + 1, f)
+    }
+}
+
+impl Explain for LogicalValues {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Values: {} rows", self.values.len())
+    }
+}
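+
+// As a sketch, `EXPLAIN INSERT INTO t VALUES (1), (2)` renders this plan as
+// (exact output depends on the `Explain` impls above):
+//
+//     Insert: table 0, columns [0]
+//       Values: 2 rows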
diff --git a/code/03-02/src/logical_planner/mod.rs b/code/03-02/src/logical_planner/mod.rs
new file mode 100644
index 0000000..0080fc1
--- /dev/null
+++ b/code/03-02/src/logical_planner/mod.rs
@@ -0,0 +1,68 @@
+use std::rc::Rc;
+
+use enum_dispatch::enum_dispatch;
+
+use crate::binder::BoundStatement;
+
+mod create;
+mod explain;
+mod insert;
+mod select;
+
+pub use self::create::*;
+pub use self::explain::*;
+pub use self::insert::*;
+pub use self::select::*;
+
+/// The logical plan.
+#[enum_dispatch(Explain)]
+#[derive(Debug, PartialEq, Clone)]
+pub enum LogicalPlan {
+    LogicalCreateTable,
+    LogicalInsert,
+    LogicalValues,
+    LogicalExplain,
+    LogicalDummy,
+    LogicalGet,
+    LogicalProjection,
+}
+
+/// The reference type of logical plan.
+pub type LogicalPlanRef = Rc<LogicalPlan>;
+
+impl std::fmt::Display for LogicalPlan {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.explain(0, f)
+    }
+}
+
+/// Logical planner transforms the AST into a logical operations tree.
+#[derive(Default)]
+pub struct LogicalPlanner;
+
+/// The error type of logical planner.
+#[derive(thiserror::Error, Debug, PartialEq)]
+pub enum LogicalPlanError {}
+
+impl LogicalPlanner {
+    /// Generate [`LogicalPlan`] from a [`BoundStatement`].
+    pub fn plan(&self, stmt: BoundStatement) -> Result<LogicalPlan, LogicalPlanError> {
+        match stmt {
+            BoundStatement::CreateTable(stmt) => self.plan_create_table(stmt),
+            BoundStatement::Insert(stmt) => self.plan_insert(stmt),
+            BoundStatement::Explain(stmt) => self.plan_explain(*stmt),
+            BoundStatement::Select(stmt) => self.plan_select(stmt),
+        }
+    }
+}
+
+/// Format a plan in `EXPLAIN` statement.
+#[enum_dispatch]
+pub trait Explain {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result;
+
+    fn explain(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", "  ".repeat(level))?;
+        self.explain_inner(level, f)
+    }
+}
diff --git a/code/03-02/src/logical_planner/select.rs b/code/03-02/src/logical_planner/select.rs
new file mode 100644
index 0000000..0d9cb45
--- /dev/null
+++ b/code/03-02/src/logical_planner/select.rs
@@ -0,0 +1,73 @@
+//! Logical planner of `select` statement.
+//!
+//! A `select` statement is planned as a composition of:
+//!
+//! - [`LogicalGet`] (from *) or [`LogicalDummy`] (no from)
+//! - [`LogicalProjection`] (select *)
+
+use super::*;
+use crate::binder::{BoundExpr, BoundSelect};
+use crate::catalog::{ColumnId, TableRefId};
+
+/// The logical plan of dummy get.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalDummy;
+
+/// The logical plan of get.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalGet {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+}
+
+/// The logical plan of projection.
+#[derive(Debug, PartialEq, Clone)]
+pub struct LogicalProjection {
+    pub exprs: Vec<BoundExpr>,
+    pub child: LogicalPlanRef,
+}
+
+impl LogicalPlanner {
+    pub fn plan_select(&self, stmt: BoundSelect) -> Result<LogicalPlan, LogicalPlanError> {
+        let mut plan: LogicalPlan = LogicalDummy.into();
+
+        if let Some(table_ref) = stmt.from_list.get(0) {
+            plan = LogicalGet {
+                table_ref_id: table_ref.table_ref_id,
+                column_ids: table_ref.column_ids.clone(),
+            }
+            .into();
+        }
+        if !stmt.select_list.is_empty() {
+            plan = LogicalProjection {
+                exprs: stmt.select_list,
+                child: plan.into(),
+            }
+            .into();
+        }
+        Ok(plan)
+    }
+}
+
+impl Explain for LogicalDummy {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Dummy:")
+    }
+}
+
+impl Explain for LogicalGet {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "Get: table: {:?}, columns: {:?}",
+            self.table_ref_id, self.column_ids
+        )
+    }
+}
+
+impl Explain for LogicalProjection {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Projection: exprs: {:?}", self.exprs)?;
+        self.child.explain(level + 1, f)
+    }
+}
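+
+// As a sketch, for a table `t` with a single column `v1`,
+// `SELECT v1 FROM t` is planned as a projection over a get
+// (exact `Debug` output may differ):
+//
+//     Projection: exprs: [ColumnRef(ColumnRefId { schema_id: 0, table_id: 0, column_id: 0 })]
+//       Get: table: TableRefId { schema_id: 0, table_id: 0 }, columns: [0]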
diff --git a/code/03-02/src/main.rs b/code/03-02/src/main.rs
index d8ff167..65a9c6d 100644
--- a/code/03-02/src/main.rs
+++ b/code/03-02/src/main.rs
@@ -1,5 +1,6 @@
 //! A simple interactive shell of the database.
 
+use risinglight_03_02::storage::StorageOptions;
 use risinglight_03_02::Database;
 use rustyline::error::ReadlineError;
 use rustyline::Editor;
@@ -7,7 +8,9 @@ fn main() {
     env_logger::init();
 
-    let db = Database::new();
+    let db = Database::new(StorageOptions {
+        base_path: "risinglight.db".into(),
+    });
     let mut rl = Editor::<()>::new();
 
     loop {
diff --git a/code/03-02/src/parser.rs b/code/03-02/src/parser.rs
deleted file mode 120000
index 41306c1..0000000
--- a/code/03-02/src/parser.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/parser.rs
\ No newline at end of file
diff --git a/code/03-02/src/parser.rs b/code/03-02/src/parser.rs
new file mode 100644
index 0000000..4c3b971
--- /dev/null
+++ b/code/03-02/src/parser.rs
@@ -0,0 +1,15 @@
+//! Parse the SQL string into an Abstract Syntax Tree (AST).
+//!
+//! The parser module directly uses the [`sqlparser`] crate
+//! and re-exports its AST types.
+
+pub use sqlparser::ast::*;
+use sqlparser::dialect::PostgreSqlDialect;
+use sqlparser::parser::Parser;
+pub use sqlparser::parser::ParserError;
+
+/// Parse the SQL string into a list of ASTs.
+pub fn parse(sql: &str) -> Result<Vec<Statement>, ParserError> {
+    let dialect = PostgreSqlDialect {};
+    Parser::parse_sql(&dialect, sql)
+}
diff --git a/code/03-02/src/physical_planner b/code/03-02/src/physical_planner
deleted file mode 120000
index 844f2c0..0000000
--- a/code/03-02/src/physical_planner
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/physical_planner
\ No newline at end of file
diff --git a/code/03-02/src/physical_planner/create.rs b/code/03-02/src/physical_planner/create.rs
new file mode 100644
index 0000000..b45ef14
--- /dev/null
+++ b/code/03-02/src/physical_planner/create.rs
@@ -0,0 +1,41 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::catalog::{ColumnDesc, SchemaId};
+use crate::logical_planner::LogicalCreateTable;
+
+/// The physical plan of `CREATE TABLE`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalCreateTable {
+    pub schema_id: SchemaId,
+    pub table_name: String,
+    pub columns: Vec<(String, ColumnDesc)>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_create_table(
+        &self,
+        plan: &LogicalCreateTable,
+    ) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalCreateTable {
+            schema_id: plan.schema_id,
+            table_name: plan.table_name.clone(),
+            columns: plan.columns.clone(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalCreateTable {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "CreateTable: name: {}, columns: [{}]",
+            self.table_name,
+            self.columns
+                .iter()
+                .map(|(name, col)| format!("{}: {:?}", name, col.datatype()))
+                .join(", ")
+        )
+    }
+}
diff --git a/code/03-02/src/physical_planner/dummy.rs b/code/03-02/src/physical_planner/dummy.rs
new file mode 100644
index 0000000..5a52db2
--- /dev/null
+++ b/code/03-02/src/physical_planner/dummy.rs
@@ -0,0 +1,17 @@
+use super::*;
+use crate::logical_planner::LogicalDummy;
+
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalDummy;
+
+impl PhysicalPlanner {
+    pub fn plan_dummy(&self, _plan: &LogicalDummy) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalDummy.into())
+    }
+}
+
+impl Explain for PhysicalDummy {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Dummy:")
+    }
+}
diff --git a/code/03-02/src/physical_planner/explain.rs b/code/03-02/src/physical_planner/explain.rs
new file mode 100644
index 0000000..39c5388
--- /dev/null
+++ b/code/03-02/src/physical_planner/explain.rs
@@ -0,0 +1,23 @@
+use super::*;
+use crate::logical_planner::LogicalExplain;
+
+/// The physical plan of `EXPLAIN`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalExplain {
+    pub child: Box<PhysicalPlan>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_explain(&self, plan: &LogicalExplain) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalExplain {
+            child: self.plan(&plan.child)?.into(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalExplain {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Huh, explain myself?")
+    }
+}
diff --git a/code/03-02/src/physical_planner/insert.rs b/code/03-02/src/physical_planner/insert.rs
new file mode 100644
index 0000000..91a2463
--- /dev/null
+++ b/code/03-02/src/physical_planner/insert.rs
@@ -0,0 +1,59 @@
+use itertools::Itertools;
+
+use super::*;
+use crate::binder::BoundExpr;
+use crate::catalog::{ColumnId, TableRefId};
+use crate::logical_planner::{LogicalInsert, LogicalValues};
+use crate::types::DataType;
+
+/// The physical plan of `INSERT`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalInsert {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+    pub child: Box<PhysicalPlan>,
+}
+
+/// The physical plan of `VALUES`.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalValues {
+    pub column_types: Vec<DataType>,
+    pub values: Vec<Vec<BoundExpr>>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_insert(&self, plan: &LogicalInsert) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalInsert {
+            table_ref_id: plan.table_ref_id,
+            column_ids: plan.column_ids.clone(),
+            child: self.plan(&plan.child)?.into(),
+        }
+        .into())
+    }
+
+    pub fn plan_values(&self, plan: &LogicalValues) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalValues {
+            column_types: plan.column_types.clone(),
+            values: plan.values.clone(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalInsert {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "Insert: table {}, columns [{}]",
+            self.table_ref_id.table_id,
+            self.column_ids.iter().map(ToString::to_string).join(", ")
+        )?;
+        self.child.explain(level + 1, f)
+    }
+}
+
+impl Explain for PhysicalValues {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Values: {} rows", self.values.len())
+    }
+}
diff --git a/code/03-02/src/physical_planner/mod.rs b/code/03-02/src/physical_planner/mod.rs
new file mode 100644
index 0000000..ee3535f
--- /dev/null
+++ b/code/03-02/src/physical_planner/mod.rs
@@ -0,0 +1,60 @@
+use enum_dispatch::enum_dispatch;
+
+use crate::logical_planner::{Explain, LogicalPlan};
+
+mod create;
+mod dummy;
+mod explain;
+mod insert;
+mod projection;
+mod seq_scan;
+
+pub use self::create::*;
+pub use self::dummy::*;
+pub use self::explain::*;
+pub use self::insert::*;
+pub use self::projection::*;
+pub use self::seq_scan::*;
+
+/// The physical plan.
+#[enum_dispatch(Explain)]
+#[derive(Debug, PartialEq, Clone)]
+pub enum PhysicalPlan {
+    PhysicalCreateTable,
+    PhysicalInsert,
+    PhysicalValues,
+    PhysicalExplain,
+    PhysicalDummy,
+    PhysicalSeqScan,
+    PhysicalProjection,
+}
+
+impl std::fmt::Display for PhysicalPlan {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.explain(0, f)
+    }
+}
+
+/// Physical planner transforms the logical plan tree into a physical plan tree.
+#[derive(Default)]
+pub struct PhysicalPlanner;
+
+/// The error type of physical planner.
+#[derive(thiserror::Error, Debug, PartialEq)]
+pub enum PhysicalPlanError {}
+
+impl PhysicalPlanner {
+    /// Generate [`PhysicalPlan`] from a [`LogicalPlan`].
+    pub fn plan(&self, plan: &LogicalPlan) -> Result<PhysicalPlan, PhysicalPlanError> {
+        use LogicalPlan::*;
+        match plan {
+            LogicalCreateTable(plan) => self.plan_create_table(plan),
+            LogicalInsert(plan) => self.plan_insert(plan),
+            LogicalValues(plan) => self.plan_values(plan),
+            LogicalExplain(plan) => self.plan_explain(plan),
+            LogicalDummy(plan) => self.plan_dummy(plan),
+            LogicalGet(plan) => self.plan_get(plan),
+            LogicalProjection(plan) => self.plan_projection(plan),
+        }
+    }
+}
diff --git a/code/03-02/src/physical_planner/projection.rs b/code/03-02/src/physical_planner/projection.rs
new file mode 100644
index 0000000..4f53a52
--- /dev/null
+++ b/code/03-02/src/physical_planner/projection.rs
@@ -0,0 +1,30 @@
+use super::*;
+use crate::binder::BoundExpr;
+use crate::logical_planner::LogicalProjection;
+
+/// The physical plan of projection operation.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalProjection {
+    pub exprs: Vec<BoundExpr>,
+    pub child: Box<PhysicalPlan>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_projection(
+        &self,
+        plan: &LogicalProjection,
+    ) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalProjection {
+            exprs: plan.exprs.clone(),
+            child: self.plan(&plan.child)?.into(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalProjection {
+    fn explain_inner(&self, level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(f, "Projection: exprs: {:?}", self.exprs)?;
+        self.child.explain(level + 1, f)
+    }
+}
diff --git a/code/03-02/src/physical_planner/seq_scan.rs b/code/03-02/src/physical_planner/seq_scan.rs
new file mode 100644
index 0000000..09800ef
--- /dev/null
+++ b/code/03-02/src/physical_planner/seq_scan.rs
@@ -0,0 +1,30 @@
+use super::*;
+use crate::catalog::{ColumnId, TableRefId};
+use crate::logical_planner::LogicalGet;
+
+/// The physical plan of the sequential scan operation.
+#[derive(Debug, PartialEq, Clone)]
+pub struct PhysicalSeqScan {
+    pub table_ref_id: TableRefId,
+    pub column_ids: Vec<ColumnId>,
+}
+
+impl PhysicalPlanner {
+    pub fn plan_get(&self, plan: &LogicalGet) -> Result<PhysicalPlan, PhysicalPlanError> {
+        Ok(PhysicalSeqScan {
+            table_ref_id: plan.table_ref_id,
+            column_ids: plan.column_ids.clone(),
+        }
+        .into())
+    }
+}
+
+impl Explain for PhysicalSeqScan {
+    fn explain_inner(&self, _level: usize, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        writeln!(
+            f,
+            "SeqScan: table #{}, columns: {:?}",
+            self.table_ref_id.table_id, self.column_ids,
+        )
+    }
+}
diff --git a/code/03-02/src/storage/block/mod.rs b/code/03-02/src/storage/block/mod.rs
deleted file mode 100644
index 558e2e9..0000000
--- a/code/03-02/src/storage/block/mod.rs
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright 2022 RisingLight Project Authors. Licensed under Apache-2.0.
-
-//! Secondary's Block builders and iterators
-//!
-//! [`Block`] is the minimum managing unit in the storage engine.
-
-mod primitive_block_builder;
-mod primitive_block_iterator;
-use anyhow::{anyhow, Context, Result};
-use bytes::{Buf, BufMut, Bytes};
-pub use primitive_block_builder::*;
-pub use primitive_block_iterator::*;
-
-use crate::array::Array;
-use crate::storage::proto::*;
-
-/// A block is simply a [`Bytes`] array.
-pub type Block = Bytes;
-
-/// Builds a block. All builders should implement the trait, while
-/// ensuring that the format follows the block encoding scheme.
-///
-/// In RisingLight, the block encoding scheme is as follows:
-///
-/// ```plain
-/// | block_type | cksum_type | cksum | data     |
-/// | 4B         | 4B         | 8B    | variable |
-/// ```
-pub trait BlockBuilder<A: Array> {
-    /// Append one data into the block.
-    fn append(&mut self, item: Option<&A::Item>);
-
-    /// Get estimated size of block. Will be useful on runlength or compression encoding.
-    fn estimated_size(&self) -> usize;
-
-    /// Check if we should finish the current block. If there is no item in the current
-    /// builder, this function must return `true`.
-    fn should_finish(&self, next_item: &Option<&A::Item>) -> bool;
-
-    /// Finish a block and return encoded data.
-    fn finish(self) -> Vec<u8>;
-}
-
-/// An iterator on a block. This iterator requires the block being pre-loaded in memory.
-pub trait BlockIterator<A: Array> {
-    /// Get a batch from the block. A `0` return value means that this batch contains no
-    /// element. Some iterators might support exact size output. By using `expected_size`,
-    /// developers can get an array of NO MORE THAN the `expected_size`.
-    fn next_batch(&mut self, expected_size: Option<usize>, builder: &mut A::Builder) -> usize;
-
-    /// Skip `cnt` items.
-    fn skip(&mut self, cnt: usize);
-
-    /// Number of items remaining in this block
-    fn remaining_items(&self) -> usize;
-}
-
-#[derive(Debug, Clone)]
-pub struct BlockHeader {
-    pub block_type: BlockType,
-    pub checksum_type: ChecksumType,
-    pub checksum: u64,
-}
-
-pub const BLOCK_HEADER_SIZE: usize = 4 + 4 + 8;
-
-impl BlockHeader {
-    pub fn encode(&self, buf: &mut impl BufMut) {
-        buf.put_i32(self.block_type.into());
-        buf.put_i32(self.checksum_type.into());
-        buf.put_u64(self.checksum);
-    }
-
-    pub fn decode(&mut self, buf: &mut impl Buf) -> Result<()> {
-        if buf.remaining() < 4 + 4 + 8 {
-            return Err(anyhow!("expected 16 bytes"));
-        }
-        self.block_type =
-            BlockType::from_i32(buf.get_i32()).context("expected valid checksum type")?;
-        self.checksum_type =
-            ChecksumType::from_i32(buf.get_i32()).context("expected valid checksum type")?;
-        self.checksum = buf.get_u64();
-        Ok(())
-    }
-}
diff --git a/code/03-02/src/storage/block/primitive_block_builder.rs b/code/03-02/src/storage/block/primitive_block_builder.rs
deleted file mode 100644
index 6dd6608..0000000
--- a/code/03-02/src/storage/block/primitive_block_builder.rs
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2022 RisingLight Project Authors. Licensed under Apache-2.0.
-
-use std::marker::PhantomData;
-
-use super::super::encode::PrimitiveFixedWidthEncode;
-use super::BlockBuilder;
-
-/// Encodes fixed-width data into a block. The layout is simply an array of
-/// little endian fixed-width data.
-pub struct PlainPrimitiveBlockBuilder<T: PrimitiveFixedWidthEncode> {
-    data: Vec<u8>,
-    target_size: usize,
-    _phantom: PhantomData<T>,
-}
-
-impl<T: PrimitiveFixedWidthEncode> PlainPrimitiveBlockBuilder<T> {
-    pub fn new(target_size: usize) -> Self {
-        let data = Vec::with_capacity(target_size);
-        Self {
-            data,
-            target_size,
-            _phantom: PhantomData,
-        }
-    }
-}
-
-impl<T: PrimitiveFixedWidthEncode> BlockBuilder<T::ArrayType> for PlainPrimitiveBlockBuilder<T> {
-    fn append(&mut self, item: Option<&T>) {
-        item.expect("nullable item found in non-nullable block builder")
-            .encode(&mut self.data);
-    }
-
-    fn estimated_size(&self) -> usize {
-        self.data.len()
-    }
-
-    fn should_finish(&self, _next_item: &Option<&T>) -> bool {
-        !self.data.is_empty() && self.estimated_size() + T::WIDTH > self.target_size
-    }
-
-    fn finish(self) -> Vec<u8> {
-        self.data
-    }
-}
diff --git a/code/03-02/src/storage/block/primitive_block_iterator.rs b/code/03-02/src/storage/block/primitive_block_iterator.rs
deleted file mode 100644
index 1a5d6fe..0000000
--- a/code/03-02/src/storage/block/primitive_block_iterator.rs
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright 2022 RisingLight Project Authors. Licensed under Apache-2.0.
-
-use std::marker::PhantomData;
-
-use super::super::PrimitiveFixedWidthEncode;
-use super::{Block, BlockIterator};
-use crate::array::{Array, ArrayBuilder};
-
-/// Scans one or several arrays from the block content.
-pub struct PlainPrimitiveBlockIterator<T: PrimitiveFixedWidthEncode> {
-    /// Block content
-    block: Block,
-
-    /// Total count of elements in block
-    row_count: usize,
-
-    /// Indicates the beginning row of the next batch
-    next_row: usize,
-
-    _phantom: PhantomData<T>,
-}
-
-impl<T: PrimitiveFixedWidthEncode> PlainPrimitiveBlockIterator<T> {
-    pub fn new(block: Block, row_count: usize) -> Self {
-        Self {
-            block,
-            row_count,
-            next_row: 0,
-            _phantom: PhantomData,
-        }
-    }
-}
-
-impl<T: PrimitiveFixedWidthEncode> BlockIterator<T::ArrayType> for PlainPrimitiveBlockIterator<T> {
-    fn next_batch(
-        &mut self,
-        expected_size: Option<usize>,
-        builder: &mut <T::ArrayType as Array>::Builder,
-    ) -> usize {
-        if self.next_row >= self.row_count {
-            return 0;
-        }
-
-        // TODO(chi): error handling on corrupted block
-
-        let mut cnt = 0;
-        let mut buffer = &self.block[self.next_row * T::WIDTH..];
-
-        loop {
-            if let Some(expected_size) = expected_size {
-                assert!(expected_size > 0);
-                if cnt >= expected_size {
-                    break;
-                }
-            }
-
-            if self.next_row >= self.row_count {
-                break;
-            }
-
-            builder.push(Some(&T::decode(&mut buffer)));
-            cnt += 1;
-            self.next_row += 1;
-        }
-
-        cnt
-    }
-
-    fn skip(&mut self, cnt: usize) {
-        self.next_row += cnt;
-    }
-
-    fn remaining_items(&self) -> usize {
-        self.row_count - self.next_row
-    }
-}
diff --git a/code/03-02/src/storage/column.rs b/code/03-02/src/storage/column.rs
new file mode 100644
index 0000000..46b3edf
--- /dev/null
+++ b/code/03-02/src/storage/column.rs
@@ -0,0 +1,25 @@
+use anyhow::anyhow;
+use bytes::{Buf, BufMut};
+
+use super::StorageResult;
+use crate::array::{Array, ArrayBuilder, I32Array, I32ArrayBuilder};
+
+/// Encode an `I32Array` into a `Vec<u8>`.
+pub fn encode_int32_column(a: &I32Array, mut buffer: impl BufMut) -> StorageResult<()> {
+    for item in a.iter() {
+        if let Some(item) = item {
+            buffer.put_i32_le(*item);
+        } else {
+            return Err(anyhow!("nullable encoding not supported!").into());
+        }
+    }
+    Ok(())
+}
+
+/// Decode an `I32Array` from a byte buffer.
+pub fn decode_int32_column(mut data: impl Buf) -> StorageResult<I32Array> {
+    let mut builder = I32ArrayBuilder::with_capacity(data.remaining() / 4);
+    while data.has_remaining() {
+        builder.push(Some(&data.get_i32_le()));
+    }
+    Ok(builder.finish())
+}
diff --git a/code/03-02/src/storage/encode.rs b/code/03-02/src/storage/encode.rs
deleted file mode 100644
index c6ed52a..0000000
--- a/code/03-02/src/storage/encode.rs
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2022 RisingLight Project Authors. Licensed under Apache-2.0.
-
-use bytes::{Buf, BufMut};
-
-use crate::array::{Array, BoolArray, F64Array, I32Array};
-
-/// Encode a primitive value into fixed-width buffer
-pub trait PrimitiveFixedWidthEncode: Copy + Clone + 'static + Send + Sync {
-    /// Width of each element
-    const WIDTH: usize;
-    const DEAFULT_VALUE: &'static Self;
-
-    type ArrayType: Array<Item = Self>;
-
-    /// Encode current primitive data to the end of an `Vec<u8>`.
-    fn encode(&self, buffer: &mut impl BufMut);
-
-    /// Decode a data from a bytes array.
-    fn decode(buffer: &mut impl Buf) -> Self;
-}
-
-impl PrimitiveFixedWidthEncode for bool {
-    const WIDTH: usize = std::mem::size_of::<bool>();
-    const DEAFULT_VALUE: &'static bool = &false;
-    type ArrayType = BoolArray;
-
-    fn encode(&self, buffer: &mut impl BufMut) {
-        buffer.put_u8(*self as u8)
-    }
-
-    fn decode(buffer: &mut impl Buf) -> Self {
-        buffer.get_u8() != 0
-    }
-}
-
-impl PrimitiveFixedWidthEncode for i32 {
-    const WIDTH: usize = std::mem::size_of::<i32>();
-    const DEAFULT_VALUE: &'static i32 = &0;
-
-    type ArrayType = I32Array;
-
-    fn encode(&self, buffer: &mut impl BufMut) {
-        buffer.put_i32_le(*self);
-    }
-
-    fn decode(buffer: &mut impl Buf) -> Self {
-        buffer.get_i32_le()
-    }
-}
-
-impl PrimitiveFixedWidthEncode for f64 {
-    const WIDTH: usize = std::mem::size_of::<f64>();
-    const DEAFULT_VALUE: &'static f64 = &0.0;
-
-    type ArrayType = F64Array;
-
-    fn encode(&self, buffer: &mut impl BufMut) {
-        buffer.put_f64_le(*self);
-    }
-
-    fn decode(buffer: &mut impl Buf) -> Self {
-        buffer.get_f64_le()
-    }
-}
diff --git a/code/03-02/src/storage/mod.rs b/code/03-02/src/storage/mod.rs
index bf8e2ff..58886eb 100644
--- a/code/03-02/src/storage/mod.rs
+++ b/code/03-02/src/storage/mod.rs
@@ -1,126 +1,208 @@
-//! Persistent storage on disk.
-//!
-//! RisingLight's in-memory representation of data is very simple. Currently,
-//! it is simple a vector of `DataChunk`. Upon insertion, users' data are
-//! simply appended to the end of the vector.
-
-mod block;
-mod encode;
-mod proto;
+//! On-disk storage
+
+mod column;
 mod rowset;
-mod table_transaction;
 
 use std::collections::HashMap;
-use std::sync::{Arc, Mutex, RwLock};
+use std::path::PathBuf;
+use std::sync::atomic::AtomicU32;
+use std::sync::{Arc, RwLock};
 
 use anyhow::anyhow;
-pub use block::*;
-pub use encode::*;
-pub use proto::*;
-pub use rowset::*;
-pub use table_transaction::*;
 
+use self::rowset::{DiskRowset, RowSetBuilder};
 use crate::array::DataChunk;
-use crate::catalog::TableRefId;
+use crate::catalog::{ColumnDesc, TableRefId};
 
 /// The error type of storage operations.
 #[derive(thiserror::Error, Debug)]
 #[error("{0:?}")]
-pub struct StorageError(anyhow::Error);
+pub struct StorageError(#[from] anyhow::Error);
 
 /// A specialized `Result` type for storage operations.
 pub type StorageResult<T> = std::result::Result<T, StorageError>;
 
 pub type StorageRef = Arc<DiskStorage>;
-pub type DiskTableRef = Arc<DiskTable>;
+pub type StorageTableRef = Arc<DiskTable>;
 
-/// Persistent storage on disk.
+/// On-disk storage.
 pub struct DiskStorage {
-    tables: Mutex<HashMap<TableRefId, DiskTableRef>>,
+    /// All tables in the current storage engine.
+    tables: RwLock<HashMap<TableRefId, StorageTableRef>>,
+
+    /// Generator for RowSet id.
+    rowset_id_generator: Arc<AtomicU32>,
+
+    /// The storage options.
+    options: Arc<StorageOptions>,
 }
 
-impl Default for DiskStorage {
-    fn default() -> Self {
-        Self::new()
-    }
+pub struct StorageOptions {
+    /// The base directory of the storage.
+    pub base_path: PathBuf,
+}
+
+pub fn err(error: impl Into<anyhow::Error>) -> StorageError {
+    StorageError(error.into())
+}
+
+/// An on-disk table.
+pub struct DiskTable {
+    /// Id of the table.
+    id: TableRefId,
+
+    /// Columns of the current table.
+    column_descs: Arc<[ColumnDesc]>,
+
+    /// The storage options.
+    options: Arc<StorageOptions>,
+
+    /// Generator for RowSet id.
+    rowset_id_generator: Arc<AtomicU32>,
+
+    /// RowSets in the table.
+    rowsets: RwLock<Vec<DiskRowset>>,
 }
 
 impl DiskStorage {
-    /// Create a new persistent storage on disk.
-    pub fn new() -> Self {
+    /// Create a new on-disk storage.
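+    ///
+    /// No files are created here: data directories appear lazily under
+    /// `options.base_path`, once the first RowSet of a table is flushed.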
+    pub fn new(options: StorageOptions) -> Self {
         DiskStorage {
-            tables: Mutex::new(HashMap::new()),
+            tables: RwLock::new(HashMap::new()),
+            options: Arc::new(options),
+            rowset_id_generator: Arc::new(AtomicU32::new(0)),
         }
     }
 
     /// Add a table.
-    pub fn add_table(&self, id: TableRefId) -> StorageResult<()> {
-        let table = Arc::new(DiskTable::new(id));
-        self.tables.lock().unwrap().insert(id, table);
+    pub fn add_table(&self, id: TableRefId, column_descs: &[ColumnDesc]) -> StorageResult<()> {
+        let mut tables = self.tables.write().unwrap();
+        let table = DiskTable {
+            id,
+            options: self.options.clone(),
+            column_descs: column_descs.into(),
+            rowsets: RwLock::new(Vec::new()),
+            rowset_id_generator: self.rowset_id_generator.clone(),
+        };
+        let res = tables.insert(id, table.into());
+        if res.is_some() {
+            return Err(anyhow!("table already exists: {:?}", id).into());
+        }
         Ok(())
     }
 
     /// Get a table.
-    pub fn get_table(&self, id: TableRefId) -> StorageResult<DiskTableRef> {
-        self.tables
-            .lock()
-            .unwrap()
+    pub fn get_table(&self, id: TableRefId) -> StorageResult<StorageTableRef> {
+        let tables = self.tables.read().unwrap();
+        tables
             .get(&id)
+            .ok_or_else(|| anyhow!("table not found: {:?}", id).into())
            .cloned()
-            .ok_or_else(|| anyhow!("table not found: {:?}", id))
-            .map_err(StorageError)
     }
 }
 
-/// A table in in-memory engine.
-pub struct DiskTable {
-    #[allow(dead_code)]
-    id: TableRefId,
-    inner: RwLock<DiskTableInner>,
+impl DiskTable {
+    /// Start a write-only transaction.
+    pub async fn write(self: &Arc<Self>) -> StorageResult<DiskTransaction> {
+        let rowsets = self.rowsets.read().unwrap();
+        Ok(DiskTransaction {
+            read_only: false,
+            table: self.clone(),
+            rowset_snapshot: rowsets.clone(),
+            builder: None,
+            finished: false,
+        })
+    }
+
+    /// Start a read-only transaction.
+    pub async fn read(self: &Arc<Self>) -> StorageResult<DiskTransaction> {
+        let rowsets = self.rowsets.read().unwrap();
+        Ok(DiskTransaction {
+            read_only: true,
+            table: self.clone(),
+            rowset_snapshot: rowsets.clone(),
+            builder: None,
+            finished: false,
+        })
+    }
+
+    pub fn table_path(&self) -> PathBuf {
+        self.options.base_path.join(self.id.table_id.to_string())
+    }
+
+    pub fn rowset_path_of(&self, rowset_id: u32) -> PathBuf {
+        self.table_path().join(rowset_id.to_string())
+    }
 }
 
-#[derive(Default)]
-struct DiskTableInner {
-    chunks: Vec<DataChunk>,
+pub struct DiskTransaction {
+    /// If this txn is read only.
+    read_only: bool,
+
+    /// Reference to the table object.
+    table: Arc<DiskTable>,
+
+    /// Current snapshot of RowSets.
+    rowset_snapshot: Vec<DiskRowset>,
+
+    /// Builder for the RowSet.
+    builder: Option<RowSetBuilder>,
+
+    /// Indicates whether the transaction is committed or aborted. If
+    /// the [`DiskTransaction`] object is dropped without finishing,
+    /// a warning will be logged.
+    finished: bool,
 }
 
-impl DiskTable {
-    fn new(id: TableRefId) -> Self {
-        Self {
-            id,
-            inner: RwLock::new(DiskTableInner::default()),
+impl Drop for DiskTransaction {
+    fn drop(&mut self) {
+        if !self.finished {
+            warn!("Transaction dropped without committing or aborting");
         }
     }
+}
 
-    #[allow(dead_code)]
-    async fn write(self: &Arc<Self>) -> StorageResult<TableTransaction> {
-        Ok(TableTransaction::start(self.clone(), false, false).await?)
-    }
+impl DiskTransaction {
+    /// Append a chunk to the table.
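+    ///
+    /// The chunk is only buffered in an in-memory [`RowSetBuilder`]; nothing
+    /// is written to disk until [`commit`](Self::commit) is called.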
+    pub async fn append(&mut self, chunk: DataChunk) -> StorageResult<()> {
+        if self.read_only {
+            return Err(anyhow!("cannot append chunks in read only txn!").into());
+        }
+        if self.builder.is_none() {
+            self.builder = Some(RowSetBuilder::new(self.table.column_descs.clone()));
+        }
+        let builder = self.builder.as_mut().unwrap();
 
-    #[allow(dead_code)]
-    async fn read(self: &Arc<Self>) -> StorageResult<TableTransaction> {
-        Ok(TableTransaction::start(self.clone(), true, false).await?)
-    }
+        builder.append(chunk)?;
 
-    #[allow(dead_code)]
-    async fn update(self: &Arc<Self>) -> StorageResult<TableTransaction> {
-        Ok(TableTransaction::start(self.clone(), false, true).await?)
+        Ok(())
     }
 
-    /// Append a chunk to the table.
-    ///
-    /// This interface will be deprecated soon in this tutorial.
-    pub fn append(&self, chunk: DataChunk) -> StorageResult<()> {
-        let mut inner = self.inner.write().unwrap();
-        inner.chunks.push(chunk);
+    /// Commit the transaction and flush the buffered data to an on-disk RowSet.
+    pub async fn commit(mut self) -> StorageResult<()> {
+        self.finished = true;
+
+        if let Some(builder) = self.builder.take() {
+            use std::sync::atomic::Ordering::SeqCst;
+            let rowset_id = self.table.rowset_id_generator.fetch_add(1, SeqCst);
+            // `rowset_path_of` already includes `options.base_path`, so it must
+            // not be joined onto the base path a second time.
+            let rowset_path = self.table.rowset_path_of(rowset_id);
+            let rowset = builder.flush(rowset_id, rowset_path).await?;
+            let mut rowsets = self.table.rowsets.write().unwrap();
+            rowsets.push(rowset);
+        }
+
         Ok(())
     }
 
     /// Get all chunks of the table.
-    ///
-    /// This interface will be deprecated soon in this tutorial.
-    pub fn all_chunks(&self) -> StorageResult<Vec<DataChunk>> {
-        let inner = self.inner.read().unwrap();
-        Ok(inner.chunks.clone())
+    pub async fn all_chunks(&self) -> StorageResult<Vec<DataChunk>> {
+        let mut chunks = vec![];
+        for rowset in &self.rowset_snapshot {
+            chunks.push(rowset.as_chunk().await?);
+        }
+        Ok(chunks)
     }
 }
diff --git a/code/03-02/src/storage/proto.rs b/code/03-02/src/storage/proto.rs
deleted file mode 100644
index be74640..0000000
--- a/code/03-02/src/storage/proto.rs
+++ /dev/null
@@ -1,47 +0,0 @@
-//! On-disk representation of some enums
-
-use anyhow::{anyhow, Result};
-
-#[derive(Debug, Clone, Copy)]
-pub enum BlockType {
-    PrimitiveNonNull,
-}
-
-impl BlockType {
-    pub fn from_i32(item: i32) -> Result<Self> {
-        match item {
-            1 => Ok(Self::PrimitiveNonNull),
-            other => Err(anyhow!("invlid block type {}", other)),
-        }
-    }
-}
-
-impl From<BlockType> for i32 {
-    fn from(ty: BlockType) -> Self {
-        match ty {
-            BlockType::PrimitiveNonNull => 1,
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-pub enum ChecksumType {
-    None,
-}
-
-impl ChecksumType {
-    pub fn from_i32(item: i32) -> Result<Self> {
-        match item {
-            1 => Ok(Self::None),
-            other => Err(anyhow!("invlid checksum type {}", other)),
-        }
-    }
-}
-
-impl From<ChecksumType> for i32 {
-    fn from(ty: ChecksumType) -> Self {
-        match ty {
-            ChecksumType::None => 1,
-        }
-    }
-}
diff --git a/code/03-02/src/storage/rowset.rs b/code/03-02/src/storage/rowset.rs
new file mode 100644
index 0000000..8961890
--- /dev/null
+++ b/code/03-02/src/storage/rowset.rs
@@ -0,0 +1,90 @@
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
+
+use anyhow::anyhow;
+use itertools::Itertools;
+
+use super::column::{decode_int32_column, encode_int32_column};
+use super::{err, StorageResult};
+use crate::array::{ArrayImpl, DataChunk};
+use crate::catalog::ColumnDesc;
+
+fn column_path(rowset_path: impl AsRef<Path>, column_id: usize) -> PathBuf {
+    rowset_path.as_ref().join(format!("{}.col", column_id))
+}
+
+#[derive(Clone)]
+pub struct DiskRowset {
+    /// Columns of the current RowSet.
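+    ///
+    /// Shared (via `Arc`) with the table that owns this RowSet.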
+    column_descs: Arc<[ColumnDesc]>,
+
+    /// Id of the current rowset within the table.
+    #[allow(dead_code)]
+    rowset_id: u32,
+
+    /// Base path of the RowSet.
+    rowset_path: PathBuf,
+}
+
+impl DiskRowset {
+    /// Read all columns back from disk and assemble them into a single [`DataChunk`].
+    pub async fn as_chunk(&self) -> StorageResult<DataChunk> {
+        let mut columns = vec![];
+        for (idx, _) in self.column_descs.iter().enumerate() {
+            let column_path = column_path(&self.rowset_path, idx);
+            let data = tokio::fs::read(column_path).await.map_err(err)?;
+            columns.push(decode_int32_column(&data[..])?);
+        }
+        Ok(columns.into_iter().map(ArrayImpl::Int32).collect())
+    }
+}
+
+pub struct RowSetBuilder {
+    /// Columns of the current RowSet.
+    column_descs: Arc<[ColumnDesc]>,
+
+    /// Buffer of all column data.
+    buffer: Vec<Vec<u8>>,
+}
+
+impl RowSetBuilder {
+    pub fn new(column_descs: Arc<[ColumnDesc]>) -> Self {
+        RowSetBuilder {
+            buffer: (0..column_descs.len()).map(|_| vec![]).collect_vec(),
+            column_descs,
+        }
+    }
+
+    pub fn append(&mut self, chunk: DataChunk) -> StorageResult<()> {
+        for (idx, column) in chunk.arrays().iter().enumerate() {
+            if let ArrayImpl::Int32(column) = column {
+                encode_int32_column(column, &mut self.buffer[idx])?;
+            } else {
+                return Err(anyhow!("unsupported column type").into());
+            }
+        }
+        Ok(())
+    }
+
+    pub async fn flush(
+        self,
+        rowset_id: u32,
+        rowset_path: impl AsRef<Path>,
+    ) -> StorageResult<DiskRowset> {
+        let rowset_path = rowset_path.as_ref();
+
+        tokio::fs::create_dir_all(rowset_path).await.map_err(err)?;
+
+        for (idx, _) in self.column_descs.iter().enumerate() {
+            let column_path = column_path(rowset_path, idx);
+            tokio::fs::write(column_path, &self.buffer[idx])
+                .await
+                .map_err(err)?;
+        }
+
+        Ok(DiskRowset {
+            column_descs: self.column_descs,
+            rowset_id,
+            rowset_path: rowset_path.into(),
+        })
+    }
+}
diff --git a/code/03-02/src/storage/rowset/mem_rowset.rs b/code/03-02/src/storage/rowset/mem_rowset.rs
deleted file mode 100644
index 3b904a8..0000000
--- a/code/03-02/src/storage/rowset/mem_rowset.rs
+++ /dev/null
@@ -1,39 +0,0 @@
-#![allow(dead_code)]
-
-use std::sync::Arc;
-
-use itertools::Itertools;
-
-use crate::array::{ArrayBuilderImpl, DataChunk};
-use crate::catalog::ColumnCatalog;
-use crate::storage::StorageResult;
-
-pub struct MemRowset {
-    builders: Vec<ArrayBuilderImpl>,
-}
-
-impl MemRowset {
-    pub fn new(columns: Arc<[ColumnCatalog]>) -> Self {
-        Self {
-            builders: columns
-                .iter()
-                .map(|column| ArrayBuilderImpl::with_capacity(0, column.desc().datatype()))
-                .collect_vec(),
-        }
-    }
-
-    fn append(&mut self, columns: DataChunk) -> StorageResult<()> {
-        for (idx, column) in columns.arrays().iter().enumerate() {
-            self.builders[idx].append(column);
-        }
-        Ok(())
-    }
-
-    fn flush(self) -> StorageResult<DataChunk> {
-        Ok(self
-            .builders
-            .into_iter()
-            .map(|builder| builder.finish())
-            .collect::<DataChunk>())
-    }
-}
diff --git a/code/03-02/src/storage/rowset/mod.rs b/code/03-02/src/storage/rowset/mod.rs
deleted file mode 100644
index 1675535..0000000
--- a/code/03-02/src/storage/rowset/mod.rs
+++ /dev/null
@@ -1,3 +0,0 @@
-mod mem_rowset;
-
-pub use mem_rowset::*;
diff --git a/code/03-02/src/storage/table_transaction.rs b/code/03-02/src/storage/table_transaction.rs
deleted file mode 100644
index 749bbbf..0000000
--- a/code/03-02/src/storage/table_transaction.rs
+++ /dev/null
@@ -1,56 +0,0 @@
-#![allow(dead_code)]
-
-use super::rowset::MemRowset;
-use super::{DiskTableRef, StorageResult};
-use crate::array::DataChunk;
-
-/// [`TableTransaction`] records the state of a single table. All operations (insert, update,
-/// delete) should go through [`TableTransaction`].
-pub struct TableTransaction {
-    mem_rowset: Option<MemRowset>,
-    read_only: bool,
-    update: bool,
-    table: DiskTableRef,
-}
-
-impl TableTransaction {
-    /// Start a [`WriteBatch`]
-    pub async fn start(table: DiskTableRef, read_only: bool, update: bool) -> StorageResult<Self> {
-        Ok(Self {
-            mem_rowset: None,
-            table,
-            update,
-            read_only,
-        })
-    }
-
-    /// Flush [`WriteBatch`] to some on-disk RowSets.
-    pub async fn flush(self) {
-        todo!()
-    }
-
-    /// Add a [`DataChunk`] to the mem rowset
-    pub fn append(&self, _chunk: DataChunk) -> StorageResult<()> {
-        todo!()
-    }
-
-    /// Delete a row from the table.
-    async fn delete(&mut self, _row_id: u64) -> StorageResult<()> {
-        todo!()
-    }
-
-    /// Commit all changes in this transaction.
-    pub fn commit(self) -> StorageResult<()> {
-        todo!()
-    }
-
-    /// Abort all changes in this transaction.
-    pub fn abort(self) -> StorageResult<()> {
-        todo!()
-    }
-
-    /// Create an iterator on this table.
-    pub async fn scan(&self) {
-        todo!()
-    }
-}
diff --git a/code/03-02/src/test.rs b/code/03-02/src/test.rs
deleted file mode 120000
index 05edac9..0000000
--- a/code/03-02/src/test.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/test.rs
\ No newline at end of file
diff --git a/code/03-02/src/test.rs b/code/03-02/src/test.rs
new file mode 100644
index 0000000..b55a2d3
--- /dev/null
+++ b/code/03-02/src/test.rs
@@ -0,0 +1,61 @@
+use std::path::Path;
+
+use tempfile::tempdir;
+use test_case::test_case;
+
+use crate::array::DataChunk;
+use crate::storage::StorageOptions;
+use crate::types::DataValue;
+use crate::{Database, Error};
+
+#[test_case("03-01.slt")]
+#[test_case("03-02.slt")]
+fn test(name: &str) {
+    init_logger();
+    let script = std::fs::read_to_string(Path::new("../sql").join(name)).unwrap();
+    let tempdir = tempdir().unwrap();
+    let mut tester = sqllogictest::Runner::new(Database::new(StorageOptions {
+        base_path: tempdir.path().into(),
+    }));
+    if let Err(err) = tester.run_script(&script) {
+        panic!("{}", err);
+    }
+}
+
+impl sqllogictest::DB for Database {
+    type Error = Error;
+    fn run(&self, sql: &str) -> Result<String, Self::Error> {
+        let chunks = self.run(sql)?;
+        let strings = chunks.iter().map(datachunk_to_string).collect();
+        Ok(strings)
+    }
+}
+
+fn init_logger() {
+    use std::sync::Once;
+    static INIT: Once = Once::new();
+    INIT.call_once(env_logger::init);
+}
+
+fn datachunk_to_string(chunk: &DataChunk) -> String {
+    use std::fmt::Write;
+    let mut string = String::new();
+    for row in 0..chunk.cardinality() {
+        for (col, array) in chunk.arrays().iter().enumerate() {
+            if col != 0 {
+                write!(string, " ").unwrap();
+            }
+            match array.get(row) {
+                DataValue::Null => write!(string, "NULL"),
+                DataValue::Bool(v) => write!(string, "{}", v),
+                DataValue::Int32(v) => write!(string, "{}", v),
+                DataValue::Float64(v) => write!(string, "{}", v),
+                DataValue::String(s) if s.is_empty() => write!(string, "(empty)"),
+                DataValue::String(s) => write!(string, "{}", s),
+            }
+            .unwrap();
+        }
+        writeln!(string).unwrap();
+    }
+    string
+}
diff --git a/code/03-02/src/types.rs b/code/03-02/src/types.rs
deleted file mode 120000
index 88bb446..0000000
--- a/code/03-02/src/types.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../03-00/src/types.rs
\ No newline at end of file
diff --git a/code/03-02/src/types.rs b/code/03-02/src/types.rs
new file mode 100644
index 0000000..509d59a
--- /dev/null
+++ b/code/03-02/src/types.rs
@@ -0,0 +1,89 @@
+//! Definition of data types.
+
+pub use sqlparser::ast::DataType as DataTypeKind;
+
+/// Data type with nullability.
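+///
+/// Wraps a [`DataTypeKind`] from the SQL parser together with a `nullable` flag.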
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct DataType {
+    kind: DataTypeKind,
+    nullable: bool,
+}
+
+impl DataType {
+    pub const fn new(kind: DataTypeKind, nullable: bool) -> Self {
+        DataType { kind, nullable }
+    }
+
+    pub fn is_nullable(&self) -> bool {
+        self.nullable
+    }
+
+    pub fn kind(&self) -> DataTypeKind {
+        self.kind.clone()
+    }
+}
+
+impl std::fmt::Debug for DataType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:?}", self.kind)?;
+        if self.nullable {
+            write!(f, " (null)")?;
+        }
+        Ok(())
+    }
+}
+
+/// The extension methods for [`DataTypeKind`].
+pub trait DataTypeExt {
+    /// Create a nullable [`DataType`] from self.
+    fn nullable(self) -> DataType;
+    /// Create a non-nullable [`DataType`] from self.
+    fn not_null(self) -> DataType;
+}
+
+impl DataTypeExt for DataTypeKind {
+    fn nullable(self) -> DataType {
+        DataType::new(self, true)
+    }
+
+    fn not_null(self) -> DataType {
+        DataType::new(self, false)
+    }
+}
+
+/// Primitive SQL value.
+#[derive(Debug, Clone, PartialEq, PartialOrd)]
+pub enum DataValue {
+    // NOTE: Null comes first.
+    // => NULL is less than any non-NULL values
+    Null,
+    Bool(bool),
+    Int32(i32),
+    Float64(f64),
+    String(String),
+}
+
+impl ToString for DataValue {
+    fn to_string(&self) -> String {
+        match self {
+            Self::Null => String::from("NULL"),
+            Self::Bool(v) => v.to_string(),
+            Self::Int32(v) => v.to_string(),
+            Self::Float64(v) => v.to_string(),
+            Self::String(v) => v.to_string(),
+        }
+    }
+}
+
+impl DataValue {
+    /// Get the type of value. `None` means NULL.
+    pub fn datatype(&self) -> Option<DataType> {
+        match self {
+            Self::Bool(_) => Some(DataTypeKind::Boolean.not_null()),
+            Self::Int32(_) => Some(DataTypeKind::Int(None).not_null()),
+            Self::Float64(_) => Some(DataTypeKind::Double.not_null()),
+            Self::String(_) => Some(DataTypeKind::Varchar(None).not_null()),
+            Self::Null => None,
+        }
+    }
+}
diff --git a/code/sql/03-01.slt b/code/sql/03-01.slt
new file mode 100644
index 0000000..fb7575d
--- /dev/null
+++ b/code/sql/03-01.slt
@@ -0,0 +1,14 @@
+# 03-01: Very simple storage test
+
+statement ok
+CREATE TABLE t (a INT NOT NULL, b INT NOT NULL, c INT NOT NULL)
+
+statement ok
+INSERT INTO t VALUES (1,10,100), (2,20,200), (3,30,300)
+
+query III
+SELECT * FROM t
+----
+1 10 100
+2 20 200
+3 30 300
diff --git a/code/sql/03-02.slt b/code/sql/03-02.slt
new file mode 100644
index 0000000..f9037c2
--- /dev/null
+++ b/code/sql/03-02.slt
@@ -0,0 +1,17 @@
+# 03-02: RowSet tests
+
+statement ok
+CREATE TABLE t (a INT NOT NULL, b INT NOT NULL, c INT NOT NULL)
+
+statement ok
+INSERT INTO t VALUES (1,10,100)
+
+statement ok
+INSERT INTO t VALUES (2,20,200), (3,30,300)
+
+query III rowsort
+SELECT * FROM t
+----
+1 10 100
+2 20 200
+3 30 300
diff --git a/docs/src/01-01-hello-sql.md b/docs/src/01-01-hello-sql.md
index bc36d4d..87efc86 100644
--- a/docs/src/01-01-hello-sql.md
+++ b/docs/src/01-01-hello-sql.md
@@ -320,6 +320,8 @@ fn test() {
 We provide a complete copy of the test harness in [test.rs]; you can copy it into your project directly.
 
+When running the tests, you can set the two environment variables `RUST_LOG=info` and `RUST_BACKTRACE=1`. Turning them on makes it much easier to locate errors that occur at runtime.
+
 [sqllogictest]: https://docs.rs/sqllogictest/0.1.0/sqllogictest/
 [`DB`]: https://docs.rs/sqllogictest/0.1.0/sqllogictest/trait.DB.html
 [`Runner`]: https://docs.rs/sqllogictest/0.1.0/sqllogictest/struct.Runner.html#method.run_script
diff --git a/scripts/update-storage-tutorial.sh b/scripts/update-storage-tutorial.sh
new file mode 100755
index 0000000..1e7cd6e
--- /dev/null
+++ b/scripts/update-storage-tutorial.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+set -e
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+TEMP_DIR=$(mktemp -d "/tmp/risinglight-tutorial.XXXXXX")
+CHECKOUT_AT="${TEMP_DIR}/checkout"
+
+cd "$DIR/.."
+
+rm -rf "code/03-00"
+rm -rf "code/03-01"
+rm -rf "code/03-02"
+
+git worktree prune
+
+git worktree add "${CHECKOUT_AT}" storage --detach
+cp -a "${CHECKOUT_AT}/code/03-00/" "code/03-02"
+sed -i ".bak" -e "s/risinglight-03-00/risinglight-03-02/g" "code/03-02/Cargo.toml"
+sed -i ".bak" -e "s/risinglight_03_00/risinglight_03_02/g" "code/03-02/src/main.rs"
+rm "code/03-02/Cargo.toml.bak"
+rm "code/03-02/src/main.rs.bak"
+
+cp -a "${CHECKOUT_AT}/code/sql/" "code/sql/"
+
+git worktree remove "${CHECKOUT_AT}"
+
+git worktree add "${CHECKOUT_AT}" storage~1 --detach
+cp -a "${CHECKOUT_AT}/code/03-00/" "code/03-01"
+sed -i ".bak" -e "s/risinglight-03-00/risinglight-03-01/g" "code/03-01/Cargo.toml"
+sed -i ".bak" -e "s/risinglight_03_00/risinglight_03_01/g" "code/03-01/src/main.rs"
+rm "code/03-01/Cargo.toml.bak"
+rm "code/03-01/src/main.rs.bak"
+git worktree remove "${CHECKOUT_AT}"
+
+git worktree add "${CHECKOUT_AT}" storage~2 --detach
+cp -a "${CHECKOUT_AT}/code/03-00/" "code/03-00"
+git worktree remove "${CHECKOUT_AT}"
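+
+# NOTE: this script assumes a local `storage` branch whose three most recent
+# commits correspond to sections 03-02, 03-01 (storage~1) and 03-00 (storage~2).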