Skip to content

Commit 8ef7d19

Browse files
committed
Add new statistics
1 parent a8073e5 commit 8ef7d19

File tree

8 files changed

+356
-2
lines changed

8 files changed

+356
-2
lines changed

examples/test_examples/snapshots/snap_test_examples_pytest.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,20 +559,26 @@
559559
Word vocabulary: ['Analyst', 'Assistant', 'Client', 'Developer', 'Electrician', 'Engineer', 'Farm', 'Financial', 'Front-End', 'JavaScript', 'Junior', 'Loader', 'Manager', 'Medical', 'Operator', 'Physiotherapist', 'Planner', 'Project', 'Receptionist', 'Senior', 'Service', 'Site', 'Solution', 'Store', 'Supervisor', 'Technician', 'Workshop']
560560
561561
Column num = 0
562+
first_char_freq = e:77
563+
leading_whitespace_count = 0
562564
max_num_words = 1
563565
min_num_words = 1
564566
num_chars = 34020
565567
num_uppercase_chars = 0
566568
type = String
567569
isCategorical = 0
568570
num_lowercase_chars = 11108
571+
last_char_freq = 0:70
572+
special_chars_count = 945
569573
count = 945
570574
quantile50 = 81aabb56-808c-48a1-b2a3-5d3f2e1a752f
571575
num_entirely_lowercase = 945
572576
num_entirely_uppercase = 0
573577
max_num_chars = 1
574578
num_digit_chars = 19132
579+
trailing_whitespace_count = 0
575580
distinct = 945
581+
whitespace_only_count = 0
576582
avg_chars = 36.000000
577583
min = 0008f14d-e2a7-4582-bf5e-89ce32b55606
578584
num_words = 945
@@ -584,20 +590,26 @@
584590
vocab = -0123456789abcdef
585591
586592
Column num = 1
593+
first_char_freq = D:123
594+
leading_whitespace_count = 0
587595
max_num_words = 2
588596
min_num_words = 2
589597
num_chars = 12261
590598
num_uppercase_chars = 1890
591599
type = String
592600
isCategorical = 0
593601
num_lowercase_chars = 9426
602+
last_char_freq = n:211
603+
special_chars_count = 0
594604
count = 945
595605
quantile50 = Kenneth King
596606
num_entirely_lowercase = 0
597607
num_entirely_uppercase = 0
598608
max_num_chars = 2
599609
num_digit_chars = 0
610+
trailing_whitespace_count = 0
600611
distinct = 945
612+
whitespace_only_count = 0
601613
avg_chars = 12.974603
602614
min = Anthony Campbell
603615
num_words = 1890
@@ -609,20 +621,26 @@
609621
vocab = ABCDEGHJKLMNPRSTWYabcdefghiklmnoprstuvwyz
610622
611623
Column num = 2
624+
first_char_freq = S:204
625+
leading_whitespace_count = 0
612626
max_num_words = 2
613627
min_num_words = 2
614628
num_chars = 11843
615629
num_uppercase_chars = 1890
616630
type = String
617631
isCategorical = 1
618632
num_lowercase_chars = 9008
633+
last_char_freq = a:390
634+
special_chars_count = 0
619635
count = 945
620636
quantile50 = Galen Calla
621637
num_entirely_lowercase = 0
622638
num_entirely_uppercase = 0
623639
max_num_chars = 2
624640
num_digit_chars = 0
641+
trailing_whitespace_count = 0
625642
distinct = 6
643+
whitespace_only_count = 0
626644
avg_chars = 12.532275
627645
min = Addyson Aaliyah
628646
num_words = 1890
@@ -634,20 +652,26 @@
634652
vocab = ACDGJPSadefhilnorsuvy
635653
636654
Column num = 3
655+
first_char_freq = T:204
656+
leading_whitespace_count = 0
637657
max_num_words = 1
638658
min_num_words = 1
639659
num_chars = 10452
640660
num_uppercase_chars = 1300
641661
type = String
642662
isCategorical = 1
643663
num_lowercase_chars = 9152
664+
last_char_freq = n:391
665+
special_chars_count = 0
644666
count = 945
645667
quantile50 = Talkspiration
646668
num_entirely_lowercase = 0
647669
num_entirely_uppercase = 0
648670
max_num_chars = 1
649671
num_digit_chars = 0
672+
trailing_whitespace_count = 0
650673
distinct = 5
674+
whitespace_only_count = 0
651675
avg_chars = 11.060317
652676
min = MonsterWorq
653677
num_words = 945
@@ -682,20 +706,26 @@
682706
sum = 880984
683707
684708
Column num = 5
709+
first_char_freq = S:339
710+
leading_whitespace_count = 0
685711
max_num_words = 3
686712
min_num_words = 1
687713
num_chars = 17603
688714
num_uppercase_chars = 2226
689715
type = String
690716
isCategorical = 0
691717
num_lowercase_chars = 14152
718+
last_char_freq = r:534
719+
special_chars_count = 57
692720
count = 945
693721
quantile50 = Physiotherapist
694722
num_entirely_lowercase = 0
695723
num_entirely_uppercase = 0
696724
max_num_chars = 3
697725
num_digit_chars = 0
726+
trailing_whitespace_count = 0
698727
distinct = 15
728+
whitespace_only_count = 0
699729
avg_chars = 18.627513
700730
min = Client Solution Analyst
701731
num_words = 2113

src/core/algorithms/statistics/data_stats.cpp

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -863,6 +863,163 @@ Statistic DataStats::CountIfInColumnForWords(Pred pred, size_t index) const {
863863
return Statistic(res, &int_type, false);
864864
}
865865

866+
// Counts the rows of a string column whose value is non-empty and consists solely of
// spaces and tabs. Returns the already computed statistic when one is stored for the
// column, and an empty Statistic when the column is not of string type.
Statistic DataStats::GetWhitespaceOnlyCount(size_t index) const {
    auto const& cached = all_stats_[index].whitespace_only_count;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& column = col_data_[index];
    if (column.GetTypeId() != +mo::TypeId::kString) return {};

    size_t whitespace_only_rows = 0;
    size_t const num_rows = column.GetNumRows();

    for (size_t row = 0; row != num_rows; ++row) {
        if (column.IsNullOrEmpty(row)) continue;

        auto const& value = mo::Type::GetValue<std::string>(column.GetValue(row));
        // An empty string is not "whitespace only".
        if (value.empty()) continue;

        // No character other than ' ' or '\t' may appear anywhere in the value.
        if (value.find_first_not_of(" \t") == std::string::npos) {
            ++whitespace_only_rows;
        }
    }

    mo::IntType int_type;
    std::byte const* res = int_type.MakeValue(whitespace_only_rows);
    return Statistic(res, &int_type, false);
}
897+
898+
// Counts the rows of a string column whose first character is classified as whitespace
// by std::isspace. Returns the already computed statistic when one is stored for the
// column, and an empty Statistic when the column is not of string type.
Statistic DataStats::GetLeadingWhitespaceCount(size_t index) const {
    auto const& cached = all_stats_[index].leading_whitespace_count;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& column = col_data_[index];
    if (column.GetTypeId() != +mo::TypeId::kString) return {};

    size_t rows_with_leading_ws = 0;
    size_t const num_rows = column.GetNumRows();

    for (size_t row = 0; row != num_rows; ++row) {
        if (column.IsNullOrEmpty(row)) continue;

        auto const& value = mo::Type::GetValue<std::string>(column.GetValue(row));
        if (value.empty()) continue;

        // std::isspace requires a value representable as unsigned char.
        auto const first = static_cast<unsigned char>(value.front());
        if (std::isspace(first)) {
            ++rows_with_leading_ws;
        }
    }

    mo::IntType int_type;
    std::byte const* res = int_type.MakeValue(rows_with_leading_ws);
    return Statistic(res, &int_type, false);
}
920+
921+
// Counts the rows of a string column whose last character is classified as whitespace
// by std::isspace. Returns the already computed statistic when one is stored for the
// column, and an empty Statistic when the column is not of string type.
Statistic DataStats::GetTrailingWhitespaceCount(size_t index) const {
    auto const& cached = all_stats_[index].trailing_whitespace_count;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& column = col_data_[index];
    if (column.GetTypeId() != +mo::TypeId::kString) return {};

    size_t rows_with_trailing_ws = 0;
    size_t const num_rows = column.GetNumRows();

    for (size_t row = 0; row != num_rows; ++row) {
        if (column.IsNullOrEmpty(row)) continue;

        auto const& value = mo::Type::GetValue<std::string>(column.GetValue(row));
        if (value.empty()) continue;

        // std::isspace requires a value representable as unsigned char.
        auto const last = static_cast<unsigned char>(value.back());
        if (std::isspace(last)) {
            ++rows_with_trailing_ws;
        }
    }

    mo::IntType int_type;
    std::byte const* res = int_type.MakeValue(rows_with_trailing_ws);
    return Statistic(res, &int_type, false);
}
943+
944+
// Counts the rows of a string column that contain at least one character from the
// special-character set @#$%^&!?*_+=~'-" (each row is counted at most once).
// Returns the already computed statistic when one is stored for the column, and an
// empty Statistic when the column is not of string type.
Statistic DataStats::GetSpecialCharsCount(size_t index) const {
    auto const& cached = all_stats_[index].special_chars_count;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& col = col_data_[index];
    if (col.GetTypeId() != +mo::TypeId::kString) return {};

    static constexpr std::string_view kSpecialChars = "@#$%^&!?*_+=~'-\"";
    // Byte -> "is special" lookup table. It is built exactly once and never modified
    // afterwards, so it is not rewritten for every row and concurrent callers only
    // ever read it.
    static std::array<bool, 256> const kIsSpecial = [] {
        std::array<bool, 256> table{};
        for (char c : kSpecialChars) {
            table[static_cast<unsigned char>(c)] = true;
        }
        return table;
    }();

    size_t count = 0;

    for (size_t i = 0; i < col.GetNumRows(); i++) {
        if (col.IsNullOrEmpty(i)) continue;

        auto const& str = mo::Type::GetValue<std::string>(col.GetValue(i));
        for (char c : str) {
            if (kIsSpecial[static_cast<unsigned char>(c)]) {
                count++;
                break;  // one hit is enough: rows are counted, not characters
            }
        }
    }

    mo::IntType int_type;
    std::byte const* res = int_type.MakeValue(count);
    return Statistic(res, &int_type, false);
}
974+
975+
// Finds the character that most often appears at the requested position (first or
// last) of the non-empty values of a string column and returns it together with its
// number of occurrences as a string of the form "<char>:<count>".
// Returns the already computed statistic when one is stored for the column, and an
// empty Statistic when the column is not of string type or has no non-empty values.
// When several characters share the highest count, the one with the smallest byte
// value is reported, so the result does not depend on hash-table iteration order.
Statistic DataStats::GetCharFrequency(size_t index, CharPosition pos) const {
    bool const use_first = pos == CharPosition::kFirst;
    auto const& cached =
            use_first ? all_stats_[index].first_char_freq : all_stats_[index].last_char_freq;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& col = col_data_[index];
    if (col.GetTypeId() != +mo::TypeId::kString) return {};

    // One counter per possible byte value.
    std::array<size_t, 256> counts{};

    for (size_t i = 0; i < col.GetNumRows(); i++) {
        if (col.IsNullOrEmpty(i)) continue;

        auto const& str = mo::Type::GetValue<std::string>(col.GetValue(i));
        if (str.empty()) continue;

        char const c = use_first ? str.front() : str.back();
        ++counts[static_cast<unsigned char>(c)];
    }

    size_t most_frequent_byte = 0;
    size_t max_count = 0;

    // Ascending scan with a strict comparison: ties resolve to the smallest byte.
    for (size_t byte = 0; byte < counts.size(); ++byte) {
        if (counts[byte] > max_count) {
            max_count = counts[byte];
            most_frequent_byte = byte;
        }
    }

    if (max_count == 0) return {};

    char const most_frequent = static_cast<char>(static_cast<unsigned char>(most_frequent_byte));
    std::string result = std::string(1, most_frequent) + ":" + std::to_string(max_count);
    mo::StringType string_type;
    std::byte const* res = string_type.MakeValue(result);
    return Statistic(res, &string_type, false);
}
1014+
1015+
// Convenience wrapper: GetCharFrequency evaluated for the first character of each value.
Statistic DataStats::GetFirstCharFrequency(size_t index) const {
    constexpr CharPosition kPosition = CharPosition::kFirst;
    return GetCharFrequency(index, kPosition);
}
1018+
1019+
// Convenience wrapper: GetCharFrequency evaluated for the last character of each value.
Statistic DataStats::GetLastCharFrequency(size_t index) const {
    constexpr CharPosition kPosition = CharPosition::kLast;
    return GetCharFrequency(index, kPosition);
}
1022+
8661023
unsigned long long DataStats::ExecuteInternal() {
8671024
if (all_stats_.empty()) {
8681025
// Table has 0 columns, nothing to do
@@ -906,6 +1063,12 @@ unsigned long long DataStats::ExecuteInternal() {
9061063
all_stats_[index].num_words = GetNumberOfWords(index);
9071064
all_stats_[index].num_entirely_uppercase = GetNumberOfEntirelyUppercaseWords(index);
9081065
all_stats_[index].num_entirely_lowercase = GetNumberOfEntirelyLowercaseWords(index);
1066+
all_stats_[index].whitespace_only_count = GetWhitespaceOnlyCount(index);
1067+
all_stats_[index].leading_whitespace_count = GetLeadingWhitespaceCount(index);
1068+
all_stats_[index].trailing_whitespace_count = GetTrailingWhitespaceCount(index);
1069+
all_stats_[index].special_chars_count = GetSpecialCharsCount(index);
1070+
all_stats_[index].first_char_freq = GetFirstCharFrequency(index);
1071+
all_stats_[index].last_char_freq = GetLastCharFrequency(index);
9091072
}
9101073
// distinct for mixed type will be calculated here
9111074
all_stats_[index].is_categorical = IsCategorical(

src/core/algorithms/statistics/data_stats.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,21 @@ class DataStats : public Algorithm {
166166
Statistic GetNumberOfEntirelyUppercaseWords(size_t index) const;
167167
// Returns the amount of entirely lowercase words in a string column.
168168
Statistic GetNumberOfEntirelyLowercaseWords(size_t index) const;
169+
// Returns the number of rows that consist only of whitespace characters (spaces and tabs).
170+
Statistic GetWhitespaceOnlyCount(size_t index) const;
171+
// Returns the number of rows that have leading whitespace characters.
172+
Statistic GetLeadingWhitespaceCount(size_t index) const;
173+
// Returns the number of rows that have trailing whitespace characters.
174+
Statistic GetTrailingWhitespaceCount(size_t index) const;
175+
// Returns the number of rows that contain special characters.
176+
Statistic GetSpecialCharsCount(size_t index) const;
177+
// Returns the most frequent first character and its count, formatted as "<char>:<count>".
178+
Statistic GetFirstCharFrequency(size_t index) const;
179+
// Returns the most frequent last character and its count, formatted as "<char>:<count>".
180+
Statistic GetLastCharFrequency(size_t index) const;
181+
enum class CharPosition { kFirst, kLast };
182+
183+
Statistic GetCharFrequency(size_t index, CharPosition pos) const;
169184

170185
ColumnStats const& GetAllStats(size_t index) const;
171186
std::vector<ColumnStats> const& GetAllStats() const;

src/core/algorithms/statistics/statistic.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,12 @@ std::unordered_map<std::string, std::string> ColumnStats::ToKeyValueMap() const
125125
try_add_stat(num_words, "num_words");
126126
try_add_stat(num_entirely_uppercase, "num_entirely_uppercase");
127127
try_add_stat(num_entirely_lowercase, "num_entirely_lowercase");
128+
try_add_stat(whitespace_only_count, "whitespace_only_count");
129+
try_add_stat(leading_whitespace_count, "leading_whitespace_count");
130+
try_add_stat(trailing_whitespace_count, "trailing_whitespace_count");
131+
try_add_stat(special_chars_count, "special_chars_count");
132+
try_add_stat(first_char_freq, "first_char_freq");
133+
try_add_stat(last_char_freq, "last_char_freq");
128134

129135
return res;
130136
}

src/core/algorithms/statistics/statistic.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ struct ColumnStats {
3636
num_zeros, num_negatives, sum_of_squares, geometric_mean, mean_ad, median, median_ad,
3737
vocab, num_non_letter_chars, num_digit_chars, num_lowercase_chars, num_uppercase_chars,
3838
num_chars, num_avg_chars, min_num_chars, max_num_chars, min_num_words, max_num_words,
39-
num_words, num_entirely_uppercase, num_entirely_lowercase;
39+
num_words, num_entirely_uppercase, num_entirely_lowercase, whitespace_only_count,
40+
leading_whitespace_count, trailing_whitespace_count, special_chars_count,
41+
first_char_freq, last_char_freq;
4042

4143
std::string ToString() const;
4244
std::unordered_map<std::string, std::string> ToKeyValueMap() const;

src/python_bindings/statistics/bind_statistics.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,24 @@ void BindStatistics(pybind11::module_& main_module) {
166166
"Returns the amount of entirely lowercase words in a column.", py::arg("index"))
167167
.def("get_number_of_entirely_uppercase_words",
168168
&DataStats::GetNumberOfEntirelyUppercaseWords,
169-
"Returns the amount of entirely uppercase words in a column.", py::arg("index"));
169+
"Returns the amount of entirely uppercase words in a column.", py::arg("index"))
170+
.def("get_whitespace_only_count", &DataStats::GetWhitespaceOnlyCount,
171+
"Returns the number of rows that consist only of whitespace characters (spaces "
172+
"and tabs).",
173+
py::arg("index"))
174+
.def("get_leading_whitespace_count", &DataStats::GetLeadingWhitespaceCount,
175+
"Returns the number of rows that have leading whitespace characters.",
176+
py::arg("index"))
177+
.def("get_trailing_whitespace_count", &DataStats::GetTrailingWhitespaceCount,
178+
"Returns the number of rows that have trailing whitespace characters.",
179+
py::arg("index"))
180+
.def("get_special_chars_count", &DataStats::GetSpecialCharsCount,
181+
"Returns the number of rows that contain special characters.", py::arg("index"))
182+
.def("get_first_char_frequency", &DataStats::GetFirstCharFrequency,
183+
"Returns the most frequent first character and its count as a string in format.",
184+
py::arg("index"))
185+
.def("get_last_char_frequency", &DataStats::GetLastCharFrequency,
186+
"Returns the most frequent last character and its count as a string in format.",
187+
py::arg("index"));
170188
}
171189
} // namespace python_bindings

0 commit comments

Comments
 (0)