Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions examples/test_examples/snapshots/snap_test_examples_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,20 +559,26 @@
Word vocabulary: ['Analyst', 'Assistant', 'Client', 'Developer', 'Electrician', 'Engineer', 'Farm', 'Financial', 'Front-End', 'JavaScript', 'Junior', 'Loader', 'Manager', 'Medical', 'Operator', 'Physiotherapist', 'Planner', 'Project', 'Receptionist', 'Senior', 'Service', 'Site', 'Solution', 'Store', 'Supervisor', 'Technician', 'Workshop']

Column num = 0
first_char_freq = e:77
leading_whitespace_count = 0
max_num_words = 1
min_num_words = 1
num_chars = 34020
num_uppercase_chars = 0
type = String
isCategorical = 0
num_lowercase_chars = 11108
last_char_freq = 0:70
special_chars_count = 945
count = 945
quantile50 = 81aabb56-808c-48a1-b2a3-5d3f2e1a752f
num_entirely_lowercase = 945
num_entirely_uppercase = 0
max_num_chars = 1
num_digit_chars = 19132
trailing_whitespace_count = 0
distinct = 945
whitespace_only_count = 0
avg_chars = 36.000000
min = 0008f14d-e2a7-4582-bf5e-89ce32b55606
num_words = 945
Expand All @@ -584,20 +590,26 @@
vocab = -0123456789abcdef

Column num = 1
first_char_freq = D:123
leading_whitespace_count = 0
max_num_words = 2
min_num_words = 2
num_chars = 12261
num_uppercase_chars = 1890
type = String
isCategorical = 0
num_lowercase_chars = 9426
last_char_freq = n:211
special_chars_count = 0
count = 945
quantile50 = Kenneth King
num_entirely_lowercase = 0
num_entirely_uppercase = 0
max_num_chars = 2
num_digit_chars = 0
trailing_whitespace_count = 0
distinct = 945
whitespace_only_count = 0
avg_chars = 12.974603
min = Anthony Campbell
num_words = 1890
Expand All @@ -609,20 +621,26 @@
vocab = ABCDEGHJKLMNPRSTWYabcdefghiklmnoprstuvwyz

Column num = 2
first_char_freq = S:204
leading_whitespace_count = 0
max_num_words = 2
min_num_words = 2
num_chars = 11843
num_uppercase_chars = 1890
type = String
isCategorical = 1
num_lowercase_chars = 9008
last_char_freq = a:390
special_chars_count = 0
count = 945
quantile50 = Galen Calla
num_entirely_lowercase = 0
num_entirely_uppercase = 0
max_num_chars = 2
num_digit_chars = 0
trailing_whitespace_count = 0
distinct = 6
whitespace_only_count = 0
avg_chars = 12.532275
min = Addyson Aaliyah
num_words = 1890
Expand All @@ -634,20 +652,26 @@
vocab = ACDGJPSadefhilnorsuvy

Column num = 3
first_char_freq = T:204
leading_whitespace_count = 0
max_num_words = 1
min_num_words = 1
num_chars = 10452
num_uppercase_chars = 1300
type = String
isCategorical = 1
num_lowercase_chars = 9152
last_char_freq = n:391
special_chars_count = 0
count = 945
quantile50 = Talkspiration
num_entirely_lowercase = 0
num_entirely_uppercase = 0
max_num_chars = 1
num_digit_chars = 0
trailing_whitespace_count = 0
distinct = 5
whitespace_only_count = 0
avg_chars = 11.060317
min = MonsterWorq
num_words = 945
Expand Down Expand Up @@ -682,20 +706,26 @@
sum = 880984

Column num = 5
first_char_freq = S:339
leading_whitespace_count = 0
max_num_words = 3
min_num_words = 1
num_chars = 17603
num_uppercase_chars = 2226
type = String
isCategorical = 0
num_lowercase_chars = 14152
last_char_freq = r:534
special_chars_count = 57
count = 945
quantile50 = Physiotherapist
num_entirely_lowercase = 0
num_entirely_uppercase = 0
max_num_chars = 3
num_digit_chars = 0
trailing_whitespace_count = 0
distinct = 15
whitespace_only_count = 0
avg_chars = 18.627513
min = Client Solution Analyst
num_words = 2113
Expand Down
150 changes: 150 additions & 0 deletions src/core/algorithms/statistics/data_stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,148 @@ Statistic DataStats::CountIfInColumnForWords(Pred pred, size_t index) const {
return Statistic(res, &int_type, false);
}

// Counts the rows of string column `index` whose value is non-empty and consists solely of
// characters that std::isspace classifies as whitespace.
// Returns the already stored statistic when it has a value, and an empty Statistic when the
// column is not a string column.
Statistic DataStats::GetWhitespaceOnlyCount(size_t index) const {
    auto const& cached = all_stats_[index].whitespace_only_count;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& column = col_data_[index];
    if (column.GetTypeId() != +mo::TypeId::kString) return {};

    // The cast to unsigned char keeps std::isspace well-defined for negative char values.
    auto const is_space = [](char ch) {
        return std::isspace(static_cast<unsigned char>(ch)) != 0;
    };

    size_t whitespace_only_rows = 0;
    size_t const num_rows = column.GetNumRows();
    for (size_t row = 0; row != num_rows; ++row) {
        if (column.IsNullOrEmpty(row)) continue;

        auto const& value = mo::Type::GetValue<std::string>(column.GetValue(row));
        // std::all_of is vacuously true on an empty range, so empty values are skipped first.
        if (value.empty()) continue;

        if (std::all_of(value.begin(), value.end(), is_space)) {
            ++whitespace_only_rows;
        }
    }

    mo::IntType int_type;
    std::byte const* res = int_type.MakeValue(whitespace_only_rows);
    return Statistic(res, &int_type, false);
}

// Counts the rows of string column `index` whose first (pos == kFirst) or last (otherwise)
// character is classified as whitespace by std::isspace.
// Returns the already stored leading/trailing statistic when it has a value, and an empty
// Statistic when the column is not a string column.
Statistic DataStats::GetWhitespaceCount(size_t index, CharPosition pos) const {
    bool const check_front = (pos == CharPosition::kFirst);

    auto& cached = check_front ? all_stats_[index].leading_whitespace_count
                               : all_stats_[index].trailing_whitespace_count;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& column = col_data_[index];
    if (column.GetTypeId() != +mo::TypeId::kString) return {};

    size_t rows_with_whitespace = 0;
    size_t const num_rows = column.GetNumRows();
    for (size_t row = 0; row != num_rows; ++row) {
        if (column.IsNullOrEmpty(row)) continue;

        auto const& value = mo::Type::GetValue<std::string>(column.GetValue(row));
        if (value.empty()) continue;

        // Only the boundary character selected by `pos` is inspected.
        char const edge = check_front ? value.front() : value.back();
        if (std::isspace(static_cast<unsigned char>(edge))) {
            ++rows_with_whitespace;
        }
    }

    mo::IntType int_type;
    std::byte const* res = int_type.MakeValue(rows_with_whitespace);
    return Statistic(res, &int_type, false);
}

// Convenience wrapper: forwards to GetWhitespaceCount for the first character of each row.
Statistic DataStats::GetNumberOfRowsWithLeadingWhitespace(size_t index) const {
    constexpr auto kPosition = CharPosition::kFirst;
    return GetWhitespaceCount(index, kPosition);
}

// Convenience wrapper: forwards to GetWhitespaceCount for the last character of each row.
Statistic DataStats::GetNumberOfRowsWithTrailingWhitespace(size_t index) const {
    constexpr auto kPosition = CharPosition::kLast;
    return GetWhitespaceCount(index, kPosition);
}

// Counts the rows of string column `index` that contain at least one character from the fixed
// special-character set @ # $ % ^ & ! ? * _ + = ~ ' - ".
// Returns the already stored statistic when it has a value, and an empty Statistic when the
// column is not a string column.
Statistic DataStats::GetNumberOfRowsWithSpecialChars(size_t index) const {
    auto const& cached = all_stats_[index].special_chars_count;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& column = col_data_[index];
    if (column.GetTypeId() != +mo::TypeId::kString) return {};

    static constexpr std::string_view kSpecialChars = "@#$%^&!?*_+=~'-\"";

    // Compile-time membership table indexed by the unsigned value of a character.
    static constexpr std::array<bool, 256> kIsSpecial = []() constexpr {
        std::array<bool, 256> table = {false};
        for (size_t i = 0; i < kSpecialChars.size(); ++i) {
            table[static_cast<unsigned char>(kSpecialChars[i])] = true;
        }
        return table;
    }();

    auto const is_special = [](char ch) { return kIsSpecial[static_cast<unsigned char>(ch)]; };

    size_t rows_with_special = 0;
    size_t const num_rows = column.GetNumRows();
    for (size_t row = 0; row != num_rows; ++row) {
        if (column.IsNullOrEmpty(row)) continue;

        auto const& value = mo::Type::GetValue<std::string>(column.GetValue(row));
        if (std::find_if(value.begin(), value.end(), is_special) != value.end()) {
            ++rows_with_special;
        }
    }

    mo::IntType int_type;
    std::byte const* res = int_type.MakeValue(rows_with_special);
    return Statistic(res, &int_type, false);
}

// Returns the most frequent first (pos == kFirst) or last (otherwise) character among the
// non-null, non-empty values of string column `index`, formatted as "<char>:<count>".
// Ties on the count are resolved in favour of the greater char value.
// Returns the already stored statistic when it has a value, and an empty Statistic when the
// column is not a string column or contains no non-empty value to inspect.
Statistic DataStats::GetCharFrequency(size_t index, CharPosition pos) const {
    bool const take_first = (pos == CharPosition::kFirst);

    auto const& cached = take_first ? all_stats_[index].first_char_freq
                                    : all_stats_[index].last_char_freq;
    if (cached.HasValue()) return cached;

    mo::TypedColumnData const& col = col_data_[index];
    if (col.GetTypeId() != +mo::TypeId::kString) return {};

    std::unordered_map<char, size_t> freq_map;

    for (size_t i = 0; i < col.GetNumRows(); i++) {
        if (col.IsNullOrEmpty(i)) continue;

        auto const& str = mo::Type::GetValue<std::string>(col.GetValue(i));
        if (str.empty()) continue;

        char const c = take_first ? str.front() : str.back();
        freq_map[c]++;
    }

    // A column whose rows are all null/empty yields no candidates. This must be a real runtime
    // check: an assert vanishes under NDEBUG, after which std::max_element would return end()
    // and dereferencing it is undefined behaviour.
    if (freq_map.empty()) return {};

    // Every stored count is at least 1, so the selected maximum is always a valid result.
    auto const& [most_frequent, max_freq] = *std::max_element(
            freq_map.begin(), freq_map.end(), [](auto const& lhs, auto const& rhs) {
                return std::tie(lhs.second, lhs.first) < std::tie(rhs.second, rhs.first);
            });

    std::string result = std::string(1, most_frequent) + ":" + std::to_string(max_freq);
    mo::StringType string_type;
    std::byte const* res = string_type.MakeValue(result);
    return Statistic(res, &string_type, false);
}

// Convenience wrapper: forwards to GetCharFrequency for the first character of each row.
Statistic DataStats::GetFirstCharFrequency(size_t index) const {
    constexpr auto kPosition = CharPosition::kFirst;
    return GetCharFrequency(index, kPosition);
}

// Convenience wrapper: forwards to GetCharFrequency for the last character of each row.
Statistic DataStats::GetLastCharFrequency(size_t index) const {
    constexpr auto kPosition = CharPosition::kLast;
    return GetCharFrequency(index, kPosition);
}

unsigned long long DataStats::ExecuteInternal() {
if (all_stats_.empty()) {
// Table has 0 columns, nothing to do
Expand Down Expand Up @@ -906,6 +1048,14 @@ unsigned long long DataStats::ExecuteInternal() {
all_stats_[index].num_words = GetNumberOfWords(index);
all_stats_[index].num_entirely_uppercase = GetNumberOfEntirelyUppercaseWords(index);
all_stats_[index].num_entirely_lowercase = GetNumberOfEntirelyLowercaseWords(index);
all_stats_[index].whitespace_only_count = GetWhitespaceOnlyCount(index);
all_stats_[index].leading_whitespace_count =
GetNumberOfRowsWithLeadingWhitespace(index);
all_stats_[index].trailing_whitespace_count =
GetNumberOfRowsWithTrailingWhitespace(index);
all_stats_[index].special_chars_count = GetNumberOfRowsWithSpecialChars(index);
all_stats_[index].first_char_freq = GetFirstCharFrequency(index);
all_stats_[index].last_char_freq = GetLastCharFrequency(index);
}
// distinct for mixed type will be calculated here
all_stats_[index].is_categorical = IsCategorical(
Expand Down
19 changes: 18 additions & 1 deletion src/core/algorithms/statistics/data_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class DataStats : public Algorithm {

void ResetState() final;

enum class CharPosition { kFirst, kLast };

// Returns number of elements satisfying the predicate
template <class Pred, class Data>
size_t CountIf(Pred pred, Data const& data) const;
Expand Down Expand Up @@ -61,6 +63,10 @@ class DataStats : public Algorithm {
// Returns median value for numeric vector
static std::byte* MedianOfNumericVector(std::vector<std::byte const*> const& data,
model::INumericType const& type);
// Returns number of rows with whitespace on first or last position
Statistic GetWhitespaceCount(size_t index, CharPosition pos) const;
// Returns the most frequent character in a column on first or last position
Statistic GetCharFrequency(size_t index, CharPosition pos) const;

protected:
config::InputTable input_table_;
Expand Down Expand Up @@ -166,7 +172,18 @@ class DataStats : public Algorithm {
Statistic GetNumberOfEntirelyUppercaseWords(size_t index) const;
// Returns the amount of entirely lowercase words in a string column.
Statistic GetNumberOfEntirelyLowercaseWords(size_t index) const;

// Returns the number of non-empty rows that consist only of whitespace characters (as classified by std::isspace).
Statistic GetWhitespaceOnlyCount(size_t index) const;
// Returns the number of rows that have leading whitespace characters.
Statistic GetNumberOfRowsWithLeadingWhitespace(size_t index) const;
// Returns the number of rows that have trailing whitespace characters.
Statistic GetNumberOfRowsWithTrailingWhitespace(size_t index) const;
// Returns the number of rows that contain special characters.
Statistic GetNumberOfRowsWithSpecialChars(size_t index) const;
// Returns the most frequent first character.
Statistic GetFirstCharFrequency(size_t index) const;
// Returns the most frequent last character.
Statistic GetLastCharFrequency(size_t index) const;
ColumnStats const& GetAllStats(size_t index) const;
std::vector<ColumnStats> const& GetAllStats() const;
std::string ToString() const;
Expand Down
6 changes: 6 additions & 0 deletions src/core/algorithms/statistics/statistic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ std::unordered_map<std::string, std::string> ColumnStats::ToKeyValueMap() const
try_add_stat(num_words, "num_words");
try_add_stat(num_entirely_uppercase, "num_entirely_uppercase");
try_add_stat(num_entirely_lowercase, "num_entirely_lowercase");
try_add_stat(whitespace_only_count, "whitespace_only_count");
try_add_stat(leading_whitespace_count, "leading_whitespace_count");
try_add_stat(trailing_whitespace_count, "trailing_whitespace_count");
try_add_stat(special_chars_count, "special_chars_count");
try_add_stat(first_char_freq, "first_char_freq");
try_add_stat(last_char_freq, "last_char_freq");

return res;
}
Expand Down
4 changes: 3 additions & 1 deletion src/core/algorithms/statistics/statistic.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ struct ColumnStats {
num_zeros, num_negatives, sum_of_squares, geometric_mean, mean_ad, median, median_ad,
vocab, num_non_letter_chars, num_digit_chars, num_lowercase_chars, num_uppercase_chars,
num_chars, num_avg_chars, min_num_chars, max_num_chars, min_num_words, max_num_words,
num_words, num_entirely_uppercase, num_entirely_lowercase;
num_words, num_entirely_uppercase, num_entirely_lowercase, whitespace_only_count,
leading_whitespace_count, trailing_whitespace_count, special_chars_count,
first_char_freq, last_char_freq;

std::string ToString() const;
std::unordered_map<std::string, std::string> ToKeyValueMap() const;
Expand Down
20 changes: 19 additions & 1 deletion src/python_bindings/statistics/bind_statistics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,24 @@ void BindStatistics(pybind11::module_& main_module) {
"Returns the amount of entirely lowercase words in a column.", py::arg("index"))
.def("get_number_of_entirely_uppercase_words",
&DataStats::GetNumberOfEntirelyUppercaseWords,
"Returns the amount of entirely uppercase words in a column.", py::arg("index"));
"Returns the amount of entirely uppercase words in a column.", py::arg("index"))
.def("get_whitespace_only_count", &DataStats::GetWhitespaceOnlyCount,
"Returns the number of rows that consist only of whitespace characters (spaces "
"and tabs).",
py::arg("index"))
.def("get_leading_whitespace_count", &DataStats::GetNumberOfRowsWithLeadingWhitespace,
"Returns the number of rows that have leading whitespace characters.",
py::arg("index"))
.def("get_trailing_whitespace_count", &DataStats::GetNumberOfRowsWithTrailingWhitespace,
"Returns the number of rows that have trailing whitespace characters.",
py::arg("index"))
.def("get_special_chars_count", &DataStats::GetNumberOfRowsWithSpecialChars,
"Returns the number of rows that contain special characters.", py::arg("index"))
.def("get_first_char_frequency", &DataStats::GetFirstCharFrequency,
"Returns the most frequent first character and its count as a string in format.",
py::arg("index"))
.def("get_last_char_frequency", &DataStats::GetLastCharFrequency,
"Returns the most frequent last character and its count as a string in format.",
py::arg("index"));
}
} // namespace python_bindings
Loading
Loading