Conversation
c5c3d3d to
a978811
Compare
|
Since you've added new statistic methods to DataStats, they are now printed whenever |
a978811 to
df3b75f
Compare
|
|
||
| for (char c : str) { | ||
| if (special_chars.find(c) != std::string::npos) { | ||
| has_special = true; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| if (has_special) { | ||
| count++; | ||
| } |
There was a problem hiding this comment.
At first
| for (char c : str) { | |
| if (special_chars.find(c) != std::string::npos) { | |
| has_special = true; | |
| break; | |
| } | |
| } | |
| if (has_special) { | |
| count++; | |
| } | |
| for (char c : str) { | |
| if (special_chars.find(c) != std::string::npos) { | |
| count++; | |
| break; | |
| } | |
| } |
There was a problem hiding this comment.
Also, finding spec character in a string take O(N * M) time, N - string length, M - spec character length. You may store a 256 sized array (number of possible values of char) for checking if the current symbol of string is in this array.
| for (char c : str) { | |
| if (special_chars.find(c) != std::string::npos) { | |
| has_special = true; | |
| break; | |
| } | |
| } | |
| if (has_special) { | |
| count++; | |
| } | |
| static constexpr const std::array<bool, 256> map = {0}; | |
| for (char c : special_characters) { | |
| map[static_cast<unisgned char>(c)] = true; | |
| } | |
| for (char c : str) { | |
| if (map[static_cast<unisgned char>(c)]) { | |
| count++; | |
| break; | |
| } | |
| } |
There was a problem hiding this comment.
Unicode symbols won't affect the frequency of special characters since they start after 0x7F.
|
|
||
| mo::TypedColumnData const& col = col_data_[index]; | ||
| if (col.GetTypeId() != +mo::TypeId::kString) return {}; | ||
| std::string const special_chars = "@#$%^&!?*_+=~'-\""; |
There was a problem hiding this comment.
| std::string const special_chars = "@#$%^&!?*_+=~'-\""; | |
| static const auto constexpr special_chars = "@#$%^&!?*_+=~'-\""; |
| char most_frequent = '\0'; | ||
| size_t max_count = 0; | ||
|
|
||
| for (auto const& pair : freq_map) { |
There was a problem hiding this comment.
| for (auto const& pair : freq_map) { | |
| for (auto [c, freq] : freq_map) { |
| return Statistic(res, &int_type, false); | ||
| } | ||
|
|
||
| Statistic DataStats::GetFirstCharFrequency(size_t index) const { |
There was a problem hiding this comment.
GetFirstCharFrequency and GetLastCharFrequency are identical, better put a common part in a separate function
9dab0de to
742d809
Compare
|
|
||
| mo::TypedColumnData const& col = col_data_[index]; | ||
| if (col.GetTypeId() != +mo::TypeId::kString) return {}; | ||
| static constexpr std::string_view const special_chars = "@#$%^&!?*_+=~'-\""; |
There was a problem hiding this comment.
warning: invalid case style for static constant 'special_chars' [readability-identifier-naming]
| static constexpr std::string_view const special_chars = "@#$%^&!?*_+=~'-\""; | |
| static constexpr std::string_view const kSpecialChars = "@#$%^&!?*_+=~'-\""; |
src/core/algorithms/statistics/data_stats.cpp:959:
- for (char c : special_chars) {
+ for (char c : kSpecialChars) {| if (col.IsNullOrEmpty(i)) continue; | ||
|
|
||
| auto const& str = mo::Type::GetValue<std::string>(col.GetValue(i)); | ||
| bool has_special = false; |
There was a problem hiding this comment.
warning: unused variable 'has_special' [clang-diagnostic-unused-variable]
bool has_special = false;
^742d809 to
8ef7d19
Compare
|
|
||
| mo::TypedColumnData const& col = col_data_[index]; | ||
| if (col.GetTypeId() != +mo::TypeId::kString) return {}; | ||
| static constexpr std::string_view const special_chars = "@#$%^&!?*_+=~'-\""; |
There was a problem hiding this comment.
warning: invalid case style for static constant 'special_chars' [readability-identifier-naming]
| static constexpr std::string_view const special_chars = "@#$%^&!?*_+=~'-\""; | |
| static constexpr std::string_view const kSpecialChars = "@#$%^&!?*_+=~'-\""; |
src/core/algorithms/statistics/data_stats.cpp:958:
- for (char c : special_chars) {
+ for (char c : kSpecialChars) {| static std::array<bool, 256> map = {0}; | ||
| for (char c : special_chars) { | ||
| map[static_cast<unsigned char>(c)] = true; | ||
| } |
There was a problem hiding this comment.
Should better put it outside of loop, do not initialize on each iteration. Initialization also can be made constexpr via lambda function
| static std::array<bool, 256> map = {0}; | |
| for (char c : special_chars) { | |
| map[static_cast<unsigned char>(c)] = true; | |
| } | |
| static contexpr std::array<bool, 256> map = [&special_chars]() constexpr { | |
| std::array<bool, 256> map = {0}; | |
| for (char c : special_chars) { | |
| map[static_cast<unsigned char>(c)] = true; | |
| } | |
| return map; | |
| }(); |
8ef7d19 to
c5b7a1b
Compare
| for (auto const& [c, freq] : freq_map) { | ||
| if (freq > max_count) { | ||
| max_count = freq; | ||
| most_frequent = c; | ||
| } | ||
| } |
There was a problem hiding this comment.
| for (auto const& [c, freq] : freq_map) { | |
| if (freq > max_count) { | |
| max_count = freq; | |
| most_frequent = c; | |
| } | |
| } | |
| auto const& [c, max_freq] = *std::max_element(map.begin(), map.end(), [](const auto& lhs, const auto& rhs){ return lhs.second < rhs.second; }); |
Also consider empty container in order to not dereference end() iterator
There was a problem hiding this comment.
Also, we might want to find the least lexicographical character if frequencies are equal. It can be made with std::tie
| for (auto const& [c, freq] : freq_map) { | |
| if (freq > max_count) { | |
| max_count = freq; | |
| most_frequent = c; | |
| } | |
| } | |
| auto const& [c, max_freq] = *std::max_element(map.begin(), map.end(), [](const auto& lhs, const auto& rhs) { | |
| // frequencies in tuples are compared first, then lexicographic order of characters | |
| return std::tie(lhs.second, lhs.first) < std::tie(rhs.second, rhs.first); | |
| }; |
| for (char c : str) { | ||
| if (map[static_cast<unsigned char>(c)]) { | ||
| count++; | ||
| break; | ||
| } | ||
| } |
| for (char c : str) { | ||
| if (c != ' ' && c != '\t') { | ||
| only_space_or_tab = false; | ||
| break; | ||
| } | ||
| } |
There was a problem hiding this comment.
Why didn't you use std::isspace?
| if (col.IsNullOrEmpty(i)) continue; | ||
|
|
||
| auto const& str = mo::Type::GetValue<std::string>(col.GetValue(i)); | ||
| static std::array<bool, 256> map = []() constexpr { |
There was a problem hiding this comment.
constexpr, also put outside of loop
c5b7a1b to
03a3e42
Compare
| for (char c : str) { | ||
| if (!std::isspace(static_cast<unsigned char>(c))) { | ||
| only_space_or_tab = false; | ||
| break; | ||
| } | ||
| } |
There was a problem hiding this comment.
| for (char c : str) { | |
| if (!std::isspace(static_cast<unsigned char>(c))) { | |
| only_space_or_tab = false; | |
| break; | |
| } | |
| } | |
| if (str.empty() or std::none_of(...)) | |
| count++; |
There was a problem hiding this comment.
The test doesn't pass with non_of, I suggest leaving it as is
| Statistic DataStats::GetLeadingWhitespaceCount(size_t index) const { | ||
| if (all_stats_[index].leading_whitespace_count.HasValue()) | ||
| return all_stats_[index].leading_whitespace_count; | ||
|
|
||
| mo::TypedColumnData const& col = col_data_[index]; | ||
| if (col.GetTypeId() != +mo::TypeId::kString) return {}; | ||
|
|
||
| size_t count = 0; | ||
|
|
||
| for (size_t i = 0; i < col.GetNumRows(); i++) { | ||
| if (col.IsNullOrEmpty(i)) continue; | ||
|
|
||
| auto const& str = mo::Type::GetValue<std::string>(col.GetValue(i)); | ||
| if (!str.empty() && std::isspace(static_cast<unsigned char>(str[0]))) { | ||
| count++; | ||
| } | ||
| } | ||
|
|
||
| mo::IntType int_type; | ||
| std::byte const* res = int_type.MakeValue(count); | ||
| return Statistic(res, &int_type, false); | ||
| } | ||
|
|
||
| Statistic DataStats::GetTrailingWhitespaceCount(size_t index) const { | ||
| if (all_stats_[index].trailing_whitespace_count.HasValue()) | ||
| return all_stats_[index].trailing_whitespace_count; | ||
|
|
||
| mo::TypedColumnData const& col = col_data_[index]; | ||
| if (col.GetTypeId() != +mo::TypeId::kString) return {}; | ||
|
|
||
| size_t count = 0; | ||
|
|
||
| for (size_t i = 0; i < col.GetNumRows(); i++) { | ||
| if (col.IsNullOrEmpty(i)) continue; | ||
|
|
||
| auto const& str = mo::Type::GetValue<std::string>(col.GetValue(i)); | ||
| if (!str.empty() && std::isspace(static_cast<unsigned char>(str.back()))) { | ||
| count++; | ||
| } | ||
| } | ||
|
|
||
| mo::IntType int_type; | ||
| std::byte const* res = int_type.MakeValue(count); | ||
| return Statistic(res, &int_type, false); | ||
| } |
| static constexpr std::string_view const kSpecialChars = "@#$%^&!?*_+=~'-\""; | ||
| size_t count = 0; | ||
|
|
||
| std::array<bool, 256> map = []() constexpr { |
There was a problem hiding this comment.
| std::array<bool, 256> map = []() constexpr { | |
| static constexpr std::array<bool, 256> map = []() constexpr { |
03a3e42 to
b52df00
Compare
| static constexpr std::string_view const kSpecialChars = "@#$%^&!?*_+=~'-\""; | ||
| size_t count = 0; | ||
|
|
||
| static constexpr std::array<bool, 256> map = []() constexpr { |
There was a problem hiding this comment.
warning: invalid case style for static constant 'map' [readability-identifier-naming]
| static constexpr std::array<bool, 256> map = []() constexpr { | |
| static constexpr std::array<bool, 256> kMap = []() constexpr { |
src/core/algorithms/statistics/data_stats.cpp:960:
- [](char c) { return map[static_cast<unsigned char>(c)]; })) {
+ [](char c) { return kMap[static_cast<unsigned char>(c)]; })) {| // Returns the most frequent last character. | ||
| Statistic GetLastCharFrequency(size_t index) const; | ||
| enum class CharPosition { kFirst, kLast }; | ||
| enum class WhitespacePosition { Leading, Trailing }; |
There was a problem hiding this comment.
warning: invalid case style for enum constant 'Leading' [readability-identifier-naming]
| enum class WhitespacePosition { Leading, Trailing }; | |
| enum class WhitespacePosition { kLeading, Trailing }; |
| // Returns the most frequent last character. | ||
| Statistic GetLastCharFrequency(size_t index) const; | ||
| enum class CharPosition { kFirst, kLast }; | ||
| enum class WhitespacePosition { Leading, Trailing }; |
There was a problem hiding this comment.
warning: invalid case style for enum constant 'Trailing' [readability-identifier-naming]
| enum class WhitespacePosition { Leading, Trailing }; | |
| enum class WhitespacePosition { Leading, kTrailing }; |
b52df00 to
fe1f418
Compare
| static constexpr std::string_view const kSpecialChars = "@#$%^&!?*_+=~'-\""; | ||
| size_t count = 0; | ||
|
|
||
| static constexpr std::array<bool, 256> kmap = []() constexpr { |
There was a problem hiding this comment.
warning: invalid case style for static constant 'kmap' [readability-identifier-naming]
| static constexpr std::array<bool, 256> kmap = []() constexpr { | |
| static constexpr std::array<bool, 256> kMap = []() constexpr { |
src/core/algorithms/statistics/data_stats.cpp:960:
- [](char c) { return kmap[static_cast<unsigned char>(c)]; })) {
+ [](char c) { return kMap[static_cast<unsigned char>(c)]; })) {9237253 to
d848454
Compare
| enum class CharPosition { kFirst, kLast }; | ||
| enum class WhitespacePosition { kLeading, kTrailing }; |
There was a problem hiding this comment.
Seems like the are serving the same purpose. You might leave only the CharPosition enum and place it somewhere above, closer to DataStats() ctor, not in the middle of methods declaration
d848454 to
0e8419f
Compare
| return GetWhitespaceCount(index, CharPosition::kLast); | ||
| } | ||
|
|
||
| Statistic DataStats::GetSpecialCharsCount(size_t index) const { |
There was a problem hiding this comment.
The naming does not reflect the actual purpose of this function. Should better name it like GetNumberOfRowsWithSpecialChars
| Statistic DataStats::GetLeadingWhitespaceCount(size_t index) const { | ||
| return GetWhitespaceCount(index, CharPosition::kFirst); | ||
| } | ||
|
|
||
| Statistic DataStats::GetTrailingWhitespaceCount(size_t index) const { |
There was a problem hiding this comment.
The same with these functions. GetNumberOfRowsWithTrailingWhitespaces
New statistics:
whitespaceOnlyCount - the number of lines that consist only of whitespace characters
(space and tab)
firstCharFrequency / lastCharFrequency - the most common first/ last character in the column lines and the number of its occurrences
leadingWhitespaceCount / trailingWhitespaceCount - the number of lines with spaces at the beginning or at the end
specialCharsCount
Number of lines containing special characters