1515use std:: sync:: Arc ;
1616
1717use databend_common_expression:: passthrough_nullable;
18+ use databend_common_expression:: types:: array:: ArrayColumnBuilder ;
1819use databend_common_expression:: types:: nullable:: NullableColumn ;
1920use databend_common_expression:: types:: number:: Int64Type ;
2021use databend_common_expression:: types:: number:: NumberScalar ;
@@ -32,6 +33,7 @@ use databend_common_expression::FunctionRegistry;
3233use databend_common_expression:: FunctionSignature ;
3334use databend_common_expression:: Scalar ;
3435use databend_common_expression:: Value ;
36+ use regex:: Match ;
3537use string:: StringColumnBuilder ;
3638
3739pub fn register ( registry : & mut FunctionRegistry ) {
@@ -323,6 +325,110 @@ pub fn register(registry: &mut FunctionRegistry) {
323325 }
324326 } ) ;
325327
328+ registry. register_passthrough_nullable_2_arg :: < StringType , StringType , StringType , _ , _ > (
329+ "regexp_extract" ,
330+ |_, _, _| FunctionDomain :: MayThrow ,
331+ |source_arg, pat_arg, ctx| {
332+ inner_regexp_extract ( & source_arg, & pat_arg, & Value :: Scalar ( 0 ) , ctx)
333+ } ,
334+ ) ;
335+
336+ registry. register_passthrough_nullable_3_arg :: < StringType , StringType , UInt32Type , StringType , _ , _ > (
337+ "regexp_extract" ,
338+ |_, _, _, _| FunctionDomain :: MayThrow ,
339+ |source_arg, pat_arg, group_arg, ctx| {
340+ inner_regexp_extract ( & source_arg, & pat_arg, & group_arg, ctx)
341+ }
342+ ) ;
343+
344+ registry. register_passthrough_nullable_3_arg :: < StringType , StringType , ArrayType < StringType > , MapType < StringType , StringType > , _ , _ > (
345+ "regexp_extract" ,
346+ |_, _, _, _| FunctionDomain :: MayThrow ,
347+ |source_arg, pat_arg, name_list_arg, ctx| {
348+ let len = [ & source_arg, & pat_arg] . iter ( ) . find_map ( |arg| match arg {
349+ Value :: Column ( col) => Some ( col. len ( ) ) ,
350+ _ => None ,
351+ } ) . or_else ( || match & name_list_arg {
352+ Value :: Column ( col) => Some ( col. len ( ) ) ,
353+ _ => None ,
354+ } ) ;
355+
356+ let cached_reg = match & pat_arg {
357+ Value :: Scalar ( pat) => {
358+ match regexp:: build_regexp_from_pattern ( "regexp_extract" , pat, None ) {
359+ Ok ( re) => Some ( re) ,
360+ _ => None ,
361+ }
362+ }
363+ _ => None ,
364+ } ;
365+
366+ let size = len. unwrap_or ( 1 ) ;
367+ let mut builder =
368+ MapType :: < StringType , StringType > :: create_builder ( size, ctx. generics ) ;
369+
370+ for idx in 0 ..size {
371+ let source = unsafe { source_arg. index_unchecked ( idx) } ;
372+ let pat = unsafe { pat_arg. index_unchecked ( idx) } ;
373+ let name_list = unsafe { name_list_arg. index_unchecked ( idx) } ;
374+ let mut local_re = None ;
375+ if cached_reg. is_none ( ) {
376+ match regexp:: build_regexp_from_pattern ( "regexp_extract" , pat, None ) {
377+ Ok ( re) => {
378+ local_re = Some ( re) ;
379+ }
380+ Err ( err) => {
381+ ctx. set_error ( builder. len ( ) , err) ;
382+ builder. push_default ( ) ;
383+ continue ;
384+ }
385+ }
386+ } ;
387+ let re = cached_reg
388+ . as_ref ( )
389+ . unwrap_or_else ( || local_re. as_ref ( ) . unwrap ( ) ) ;
390+ let captures = re. captures_iter ( source) . last ( ) ;
391+ if let Some ( captures) = & captures {
392+ if name_list. len ( ) + 1 > captures. len ( ) {
393+ ctx. set_error ( builder. len ( ) , "Not enough group names in regexp_extract" ) ;
394+ builder. push_default ( ) ;
395+ continue ;
396+ }
397+ }
398+ for ( i, name) in name_list. iter ( ) . enumerate ( ) {
399+ let value = captures
400+ . as_ref ( )
401+ . and_then ( |caps| caps. get ( i + 1 ) . as_ref ( ) . map ( Match :: as_str) )
402+ . unwrap_or ( "" ) ;
403+ builder. put_item ( ( name, value) )
404+ }
405+ builder. commit_row ( ) ;
406+ }
407+ if len. is_some ( ) {
408+ Value :: Column ( builder. build ( ) )
409+ } else {
410+ Value :: Scalar ( builder. build_scalar ( ) )
411+ }
412+ }
413+ ) ;
414+
415+ registry
416+ . register_passthrough_nullable_2_arg :: < StringType , StringType , ArrayType < StringType > , _ , _ > (
417+ "regexp_extract_all" ,
418+ |_, _, _| FunctionDomain :: MayThrow ,
419+ |source_arg, pat_arg, ctx| {
420+ regexp_extract_all ( & source_arg, & pat_arg, & Value :: Scalar ( 0 ) , ctx)
421+ } ,
422+ ) ;
423+
424+ registry. register_passthrough_nullable_3_arg :: < StringType , StringType , UInt32Type , ArrayType < StringType > , _ , _ > (
425+ "regexp_extract_all" ,
426+ |_, _, _, _| FunctionDomain :: MayThrow ,
427+ |source_arg, pat_arg, group_arg, ctx| {
428+ regexp_extract_all ( & source_arg, & pat_arg, & group_arg, ctx)
429+ }
430+ ) ;
431+
326432 // Notes: https://dev.mysql.com/doc/refman/8.0/en/regexp.html#function_regexp-replace
327433 registry. register_function_factory ( "regexp_replace" , |_, args_type| {
328434 let has_null = args_type. iter ( ) . any ( |t| t. is_nullable_or_null ( ) ) ;
@@ -418,6 +524,158 @@ pub fn register(registry: &mut FunctionRegistry) {
418524 } ) ;
419525}
420526
527+ fn regexp_extract_all (
528+ source_arg : & Value < StringType > ,
529+ pat_arg : & Value < StringType > ,
530+ group_arg : & Value < UInt32Type > ,
531+ ctx : & mut EvalContext ,
532+ ) -> Value < ArrayType < StringType > > {
533+ let len = [ & source_arg, & pat_arg]
534+ . iter ( )
535+ . find_map ( |arg| match arg {
536+ Value :: Column ( col) => Some ( col. len ( ) ) ,
537+ _ => None ,
538+ } )
539+ . or_else ( || match & group_arg {
540+ Value :: Column ( col) => Some ( col. len ( ) ) ,
541+ _ => None ,
542+ } ) ;
543+ let cached_reg = match & pat_arg {
544+ Value :: Scalar ( pat) => {
545+ match regexp:: build_regexp_from_pattern ( "regexp_extract" , pat, None ) {
546+ Ok ( re) => Some ( re) ,
547+ _ => None ,
548+ }
549+ }
550+ _ => None ,
551+ } ;
552+
553+ let size = len. unwrap_or ( 1 ) ;
554+ let mut builder = ArrayColumnBuilder :: < StringType > :: with_capacity ( size, 0 , ctx. generics ) ;
555+ for idx in 0 ..size {
556+ let source = unsafe { source_arg. index_unchecked ( idx) } ;
557+ let pat = unsafe { pat_arg. index_unchecked ( idx) } ;
558+ let group = unsafe { group_arg. index_unchecked ( idx) as usize } ;
559+
560+ let mut local_re = None ;
561+ if cached_reg. is_none ( ) {
562+ match regexp:: build_regexp_from_pattern ( "regexp_extract" , pat, None ) {
563+ Ok ( re) => {
564+ local_re = Some ( re) ;
565+ }
566+ Err ( err) => {
567+ ctx. set_error ( builder. len ( ) , err) ;
568+ builder. push_default ( ) ;
569+ continue ;
570+ }
571+ }
572+ } ;
573+
574+ let re = cached_reg
575+ . as_ref ( )
576+ . unwrap_or_else ( || local_re. as_ref ( ) . unwrap ( ) ) ;
577+ let mut row = StringColumnBuilder :: with_capacity ( 0 ) ;
578+ if group > 9 {
579+ ctx. set_error ( builder. len ( ) , "Group index must be between 0 and 9!" ) ;
580+ }
581+ for caps in re. captures_iter ( source) {
582+ if group >= caps. len ( ) {
583+ ctx. set_error (
584+ builder. len ( ) ,
585+ format ! (
586+ "Pattern has {} groups. Cannot access group {}" ,
587+ caps. len( ) ,
588+ group
589+ ) ,
590+ ) ;
591+ row. put_str ( "" ) ;
592+ row. commit_row ( ) ;
593+ continue ;
594+ }
595+ if let Some ( v) = caps. get ( group) . map ( |ma| ma. as_str ( ) ) {
596+ row. put_str ( v) ;
597+ } else {
598+ row. put_str ( "" ) ;
599+ }
600+ row. commit_row ( ) ;
601+ }
602+ builder. push ( row. build ( ) ) ;
603+ }
604+ if len. is_some ( ) {
605+ Value :: Column ( builder. build ( ) )
606+ } else {
607+ Value :: Scalar ( builder. build_scalar ( ) )
608+ }
609+ }
610+
611+ fn inner_regexp_extract (
612+ source_arg : & Value < StringType > ,
613+ pat_arg : & Value < StringType > ,
614+ group_arg : & Value < UInt32Type > ,
615+ ctx : & mut EvalContext ,
616+ ) -> Value < StringType > {
617+ let len = [ & source_arg, & pat_arg]
618+ . iter ( )
619+ . find_map ( |arg| match arg {
620+ Value :: Column ( col) => Some ( col. len ( ) ) ,
621+ _ => None ,
622+ } )
623+ . or_else ( || match & group_arg {
624+ Value :: Column ( col) => Some ( col. len ( ) ) ,
625+ _ => None ,
626+ } ) ;
627+
628+ let cached_reg = match & pat_arg {
629+ Value :: Scalar ( pat) => {
630+ match regexp:: build_regexp_from_pattern ( "regexp_extract" , pat, None ) {
631+ Ok ( re) => Some ( re) ,
632+ _ => None ,
633+ }
634+ }
635+ _ => None ,
636+ } ;
637+
638+ let size = len. unwrap_or ( 1 ) ;
639+ let mut builder = StringColumnBuilder :: with_capacity ( size) ;
640+ for idx in 0 ..size {
641+ let source = unsafe { source_arg. index_unchecked ( idx) } ;
642+ let pat = unsafe { pat_arg. index_unchecked ( idx) } ;
643+ let group = unsafe { group_arg. index_unchecked ( idx) as usize } ;
644+
645+ let mut local_re = None ;
646+ if cached_reg. is_none ( ) {
647+ match regexp:: build_regexp_from_pattern ( "regexp_extract" , pat, None ) {
648+ Ok ( re) => {
649+ local_re = Some ( re) ;
650+ }
651+ Err ( err) => {
652+ ctx. set_error ( builder. len ( ) , err) ;
653+ builder. put_str ( "" ) ;
654+ continue ;
655+ }
656+ }
657+ } ;
658+ let re = cached_reg
659+ . as_ref ( )
660+ . unwrap_or_else ( || local_re. as_ref ( ) . unwrap ( ) ) ;
661+ if let Some ( caps) = re. captures ( source) {
662+ if group > 9 {
663+ ctx. set_error ( builder. len ( ) , "Group index must be between 0 and 9!" ) ;
664+ builder. put_str ( "" ) ;
665+ } else if let Some ( ma) = caps. get ( group) {
666+ builder. put_str ( ma. as_str ( ) ) ;
667+ }
668+ }
669+ builder. put_str ( "" ) ;
670+ builder. commit_row ( ) ;
671+ }
672+ if len. is_some ( ) {
673+ Value :: Column ( builder. build ( ) )
674+ } else {
675+ Value :: Scalar ( builder. build_scalar ( ) )
676+ }
677+ }
678+
421679fn concat_fn ( args : & [ Value < AnyType > ] , _: & mut EvalContext ) -> Value < AnyType > {
422680 let len = args. iter ( ) . find_map ( |arg| match arg {
423681 Value :: Column ( col) => Some ( col. len ( ) ) ,
0 commit comments