@@ -863,6 +863,163 @@ Statistic DataStats::CountIfInColumnForWords(Pred pred, size_t index) const {
863863 return Statistic (res, &int_type, false );
864864}
865865
866+ Statistic DataStats::GetWhitespaceOnlyCount (size_t index) const {
867+ if (all_stats_[index].whitespace_only_count .HasValue ())
868+ return all_stats_[index].whitespace_only_count ;
869+
870+ mo::TypedColumnData const & col = col_data_[index];
871+ if (col.GetTypeId () != +mo::TypeId::kString ) return {};
872+
873+ size_t count = 0 ;
874+
875+ for (size_t i = 0 ; i < col.GetNumRows (); i++) {
876+ if (col.IsNullOrEmpty (i)) continue ;
877+
878+ auto const & str = mo::Type::GetValue<std::string>(col.GetValue (i));
879+ bool only_space_or_tab = true ;
880+
881+ for (char c : str) {
882+ if (c != ' ' && c != ' \t ' ) {
883+ only_space_or_tab = false ;
884+ break ;
885+ }
886+ }
887+
888+ if (only_space_or_tab && !str.empty ()) {
889+ count++;
890+ }
891+ }
892+
893+ mo::IntType int_type;
894+ std::byte const * res = int_type.MakeValue (count);
895+ return Statistic (res, &int_type, false );
896+ }
897+
898+ Statistic DataStats::GetLeadingWhitespaceCount (size_t index) const {
899+ if (all_stats_[index].leading_whitespace_count .HasValue ())
900+ return all_stats_[index].leading_whitespace_count ;
901+
902+ mo::TypedColumnData const & col = col_data_[index];
903+ if (col.GetTypeId () != +mo::TypeId::kString ) return {};
904+
905+ size_t count = 0 ;
906+
907+ for (size_t i = 0 ; i < col.GetNumRows (); i++) {
908+ if (col.IsNullOrEmpty (i)) continue ;
909+
910+ auto const & str = mo::Type::GetValue<std::string>(col.GetValue (i));
911+ if (!str.empty () && std::isspace (static_cast <unsigned char >(str[0 ]))) {
912+ count++;
913+ }
914+ }
915+
916+ mo::IntType int_type;
917+ std::byte const * res = int_type.MakeValue (count);
918+ return Statistic (res, &int_type, false );
919+ }
920+
921+ Statistic DataStats::GetTrailingWhitespaceCount (size_t index) const {
922+ if (all_stats_[index].trailing_whitespace_count .HasValue ())
923+ return all_stats_[index].trailing_whitespace_count ;
924+
925+ mo::TypedColumnData const & col = col_data_[index];
926+ if (col.GetTypeId () != +mo::TypeId::kString ) return {};
927+
928+ size_t count = 0 ;
929+
930+ for (size_t i = 0 ; i < col.GetNumRows (); i++) {
931+ if (col.IsNullOrEmpty (i)) continue ;
932+
933+ auto const & str = mo::Type::GetValue<std::string>(col.GetValue (i));
934+ if (!str.empty () && std::isspace (static_cast <unsigned char >(str.back ()))) {
935+ count++;
936+ }
937+ }
938+
939+ mo::IntType int_type;
940+ std::byte const * res = int_type.MakeValue (count);
941+ return Statistic (res, &int_type, false );
942+ }
943+
944+ Statistic DataStats::GetSpecialCharsCount (size_t index) const {
945+ if (all_stats_[index].special_chars_count .HasValue ())
946+ return all_stats_[index].special_chars_count ;
947+
948+ mo::TypedColumnData const & col = col_data_[index];
949+ if (col.GetTypeId () != +mo::TypeId::kString ) return {};
950+ static constexpr std::string_view const special_chars = " @#$%^&!?*_+=~'-\" " ;
951+
952+ size_t count = 0 ;
953+
954+ for (size_t i = 0 ; i < col.GetNumRows (); i++) {
955+ if (col.IsNullOrEmpty (i)) continue ;
956+
957+ auto const & str = mo::Type::GetValue<std::string>(col.GetValue (i));
958+ static std::array<bool , 256 > map = {0 };
959+ for (char c : special_chars) {
960+ map[static_cast <unsigned char >(c)] = true ;
961+ }
962+ for (char c : str) {
963+ if (map[static_cast <unsigned char >(c)]) {
964+ count++;
965+ break ;
966+ }
967+ }
968+ }
969+
970+ mo::IntType int_type;
971+ std::byte const * res = int_type.MakeValue (count);
972+ return Statistic (res, &int_type, false );
973+ }
974+
975+ Statistic DataStats::GetCharFrequency (size_t index, CharPosition pos) const {
976+ if ((pos == CharPosition::kFirst && all_stats_[index].first_char_freq .HasValue ()) ||
977+ (pos == CharPosition::kLast && all_stats_[index].last_char_freq .HasValue ())) {
978+ return pos == CharPosition::kFirst ? all_stats_[index].first_char_freq
979+ : all_stats_[index].last_char_freq ;
980+ }
981+
982+ mo::TypedColumnData const & col = col_data_[index];
983+ if (col.GetTypeId () != +mo::TypeId::kString ) return {};
984+
985+ std::unordered_map<char , size_t > freq_map;
986+
987+ for (size_t i = 0 ; i < col.GetNumRows (); i++) {
988+ if (col.IsNullOrEmpty (i)) continue ;
989+
990+ auto const & str = mo::Type::GetValue<std::string>(col.GetValue (i));
991+ if (str.empty ()) continue ;
992+
993+ char c = (pos == CharPosition::kFirst ) ? str.front () : str.back ();
994+ freq_map[c]++;
995+ }
996+
997+ char most_frequent = ' \0 ' ;
998+ size_t max_count = 0 ;
999+
1000+ for (auto const & [c, freq] : freq_map) {
1001+ if (freq > max_count) {
1002+ max_count = freq;
1003+ most_frequent = c;
1004+ }
1005+ }
1006+
1007+ if (max_count == 0 ) return {};
1008+
1009+ std::string result = std::string (1 , most_frequent) + " :" + std::to_string (max_count);
1010+ mo::StringType string_type;
1011+ std::byte const * res = string_type.MakeValue (result);
1012+ return Statistic (res, &string_type, false );
1013+ }
1014+
1015+ Statistic DataStats::GetFirstCharFrequency (size_t index) const {
1016+ return GetCharFrequency (index, CharPosition::kFirst );
1017+ }
1018+
1019+ Statistic DataStats::GetLastCharFrequency (size_t index) const {
1020+ return GetCharFrequency (index, CharPosition::kLast );
1021+ }
1022+
8661023unsigned long long DataStats::ExecuteInternal () {
8671024 if (all_stats_.empty ()) {
8681025 // Table has 0 columns, nothing to do
@@ -906,6 +1063,12 @@ unsigned long long DataStats::ExecuteInternal() {
9061063 all_stats_[index].num_words = GetNumberOfWords (index);
9071064 all_stats_[index].num_entirely_uppercase = GetNumberOfEntirelyUppercaseWords (index);
9081065 all_stats_[index].num_entirely_lowercase = GetNumberOfEntirelyLowercaseWords (index);
1066+ all_stats_[index].whitespace_only_count = GetWhitespaceOnlyCount (index);
1067+ all_stats_[index].leading_whitespace_count = GetLeadingWhitespaceCount (index);
1068+ all_stats_[index].trailing_whitespace_count = GetTrailingWhitespaceCount (index);
1069+ all_stats_[index].special_chars_count = GetSpecialCharsCount (index);
1070+ all_stats_[index].first_char_freq = GetFirstCharFrequency (index);
1071+ all_stats_[index].last_char_freq = GetLastCharFrequency (index);
9091072 }
9101073 // distinct for mixed type will be calculated here
9111074 all_stats_[index].is_categorical = IsCategorical (
0 commit comments