Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/check-codestyle.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,11 @@ jobs:
- name: Fail the check if clang-tidy reported issues
if: steps.review.outputs.total_comments > 0
run: exit 1

typos-check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Check spelling with typos
uses: crate-ci/typos@v1.43.5
2 changes: 1 addition & 1 deletion cmake/desbordante_configure_datasets.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ function(desbordante_fetch_datasets)
list(GET download_status 0 status_code)
if(NOT status_code EQUAL 0)
list(GET download_status 1 str_val)
message(NOTICE "Donwload log:\n${download_log}")
message(NOTICE "Download log:\n${download_log}")
message(FATAL_ERROR "Failed to download ${filename}: [${status_code}] ${str_val}.")
endif()

Expand Down
2 changes: 1 addition & 1 deletion cmake/desbordante_deps.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ if(DESBORDANTE_BUILD_TESTS)
# TODO(senichenkov): remove when googletest gets updated
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "21")
message(WARNING "Googletest has a bug recognized by Clang 21+. "
"Supressing character-conversion warning. "
"Suppressing character-conversion warning. "
"Consider using an older version of Clang.")
target_compile_options(gtest PRIVATE "-Wno-error=character-conversion")
endif()
Expand Down
4 changes: 2 additions & 2 deletions examples/advanced/comparison_pfd_vs_afd.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def get_pfds():
for fd in pfds - afds:
verifier_algo.execute(lhs_indices=fd.lhs_indices, rhs_indices=[fd.rhs_index])
fd_error = verifier_algo.get_error()
print(f"e({fd}) =", fd_error) # AFD error is signifcantly larger than PFD PerValue
print(f"e({fd}) =", fd_error) # AFD error is significantly larger than PFD PerValue

print('In case of PerValue error measure, violations on data from the single "glitchy"')
print('sensor device among many do not prevent dependecy from being found')
print('sensor device among many do not prevent dependency from being found')
2 changes: 1 addition & 1 deletion examples/advanced/comparison_ucc_and_aucc_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def print_table(filename: str) -> None:
print(f'\t{CYAN}{aucc.to_long_string()}{ENDC}')
print()

print('Let\'s run UCC mining algorihm:')
print('Let\'s run UCC mining algorithm:')
e_algo = desbordante.ucc.algorithms.Default()
e_algo.load_data(table=(TABLE, ',', True))
e_algo.execute()
Expand Down
2 changes: 1 addition & 1 deletion examples/basic/mining_list_od.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def print_named_ods(list_ods, data_frame):
print("Resulting dependencies for this table are:")
print_named_ods(ods, df)
print()
print("Depenency [weight] -> [shipping cost] means that ordering table by weight")
print("Dependency [weight] -> [shipping cost] means that ordering table by weight")
print("will also order table by shipping cost automatically. Let's order by weight: ")

df_sorted = df.sort_values("weight")
Expand Down
2 changes: 1 addition & 1 deletion examples/basic/verifying_mfd.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def mfd_coordinates():
"approximately the same place with a degree of accuracy "
"that is sufficient for us to consider them basically the same. "
f"For example, in {BLUE_CODE}Cluster 3{DEFAULT_COLOR_CODE}, "
"the appartments differ by 0.000229 in longitude and by 0.00004 "
"the apartments differ by 0.000229 in longitude and by 0.00004 "
"in latitude, with the 2 points being merely "
"around 0.01979 km (or 0.012298 miles) apart, "
"which is considered to be the same place with "
Expand Down
4 changes: 2 additions & 2 deletions examples/basic/verifying_nd/verifying_nd_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ def print_table(filename: str, row_numbers: list[int] = []) -> None:
print(f'Number of clusters: {len(highlights)}')
highlight_indices = []
for high in highlights:
highlight_indices += high.get_occurences_indices()
highlight_indices += high.get_occurrences_indices()

print(CYAN, end='')
print_table(EXPIRED_PASSPORT_TABLE, highlight_indices)
print(ENDC)
print(f'So, {highlights[0].lhs_value} has {highlights[0].occurences_number} documents')
print(f'So, {highlights[0].lhs_value} has {highlights[0].occurrences_number} documents')
print('One of them is expired and shouldn\'t appear in this table. Let\'s remove this line:')
print(CYAN, end='')
print_table(VALID_PASSPORTS_TABLE)
Expand Down
4 changes: 2 additions & 2 deletions examples/basic/verifying_nd/verifying_nd_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ def print_table(filename: str, row_numbers: list[int] = []) -> None:
print(f'Number of clusters: {len(highlights)}')
highlight_indices = []
for high in highlights:
highlight_indices += high.get_occurences_indices()
highlight_indices += high.get_occurrences_indices()

print(CYAN, end='')
print_table(MERGED_PEOPLE_TABLE, highlight_indices)
print(ENDC)
print(f'So, {highlights[0].lhs_value} has {highlights[0].occurences_number} documents. It\'s twice as much as needed.')
print(f'So, {highlights[0].lhs_value} has {highlights[0].occurrences_number} documents. It\'s twice as much as needed.')
print(f'Look at birth date. {highlights[0].lhs_value} has two different values.')
print(f'Maybe, we have two different {highlights[0].lhs_value}? Let\'s split them:')
print(CYAN, end='')
Expand Down
4 changes: 2 additions & 2 deletions examples/basic/verifying_nd/verifying_nd_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ def print_table(filename: str, row_numbers: list[int] = []) -> None:
print(f'Number of clusters: {len(highlights)}')
highlight_indices = []
for high in highlights:
highlight_indices += high.get_occurences_indices()
highlight_indices += high.get_occurrences_indices()

print(CYAN, end='')
print_table(EXPIRED_PASSPORT_TABLE, highlight_indices)
print(ENDC)
print(f'So, {highlights[0].lhs_value} has {highlights[0].occurences_number} documents')
print(f'So, {highlights[0].lhs_value} has {highlights[0].occurrences_number} documents')
print('One of them is expired and shouldn\'t appear in this table. Let\'s remove this line:')
print(CYAN, end='')
print_table(VALID_PASSPORTS_TABLE)
Expand Down
2 changes: 1 addition & 1 deletion examples/expert/data_cleaning_dc.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def main():
print("This is an advanced example explaining how to use Denial Constraint (DC) verification for data cleaning.\n"
"A basic example of using Denial Constraints is located in examples/basic/verifying_dc.py.\n")

print("DC verification is perfomed by the Rapidash algorithm:\n"
print("DC verification is performed by the Rapidash algorithm:\n"
"Zifan Liu, Shaleen Deep, Anna Fariha, Fotis Psallidas, Ashish Tiwari, and Avrilia\n"
"Floratou. 2023. Rapidash: Efficient Constraint Discovery via Rapid Verification.\n"
"URL: https://arxiv.org/abs/2309.12436\n")
Expand Down
2 changes: 1 addition & 1 deletion examples/notebooks/Functional_Dependencies_Mining.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@
"source": [
"# Discover functional dependencies\n",
"\n",
"Using Desbordante it's trivial to discover all `functional dependecies` in the dataset."
"Using Desbordante it's trivial to discover all `functional dependencies` in the dataset."
]
},
{
Expand Down
10 changes: 5 additions & 5 deletions examples/test_examples/snapshots/snap_test_examples_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@
1 - PerValue([DeviceId] -> Data) = 0.1714285714
e([DeviceId] -> Data) = 0.23076923076923078
In case of PerValue error measure, violations on data from the single "glitchy"
sensor device among many do not prevent dependecy from being found
sensor device among many do not prevent dependency from being found
'''

snapshots['test_example[advanced/comparison_ucc_and_aucc_1.py-None-comparison_ucc_and_aucc_1_output] comparison_ucc_and_aucc_1_output'] = '''\x1b[1m\x1b[36mThis example illustrates the difference between exact and approximate Unique
Expand Down Expand Up @@ -268,7 +268,7 @@
\t\x1b[1m\x1b[36m[Last_name Grade Salary]\x1b[0m
\t\x1b[1m\x1b[36m[Work_experience]\x1b[0m

Let's run UCC mining algorihm:
Let's run UCC mining algorithm:
Found UCCs:
\t\x1b[1m\x1b[36m[First_name Grade Salary]\x1b[0m
\t\x1b[1m\x1b[36m[Work_experience]\x1b[0m
Expand Down Expand Up @@ -2057,7 +2057,7 @@
['weight', 'days'] -> ['shipping cost']
['weight'] -> ['shipping cost']

Depenency [weight] -> [shipping cost] means that ordering table by weight
Dependency [weight] -> [shipping cost] means that ordering table by weight
will also order table by shipping cost automatically. Let's order by weight:

+----+----------+-----------------+--------+
Expand Down Expand Up @@ -4028,7 +4028,7 @@ class [10%] with 2 elements

Let's take a closer look at them.

Both google and geocoder provided coordinates for multiple addresses that didn't have matching coordinates across different sources, yet were close enough for us to assume that they point to approximately the same place with a degree of accuracy that is sufficient for us to consider them basically the same. For example, in \x1b[1;46mCluster 3\x1b[1;49m, the appartments differ by 0.000229 in longitude and by 0.00004 in latitude, with the 2 points being merely around 0.01979 km (or 0.012298 miles) apart, which is considered to be the same place with parameter δ = 0.001, but violates the MFD with parameter δ = 0.0001.
Both google and geocoder provided coordinates for multiple addresses that didn't have matching coordinates across different sources, yet were close enough for us to assume that they point to approximately the same place with a degree of accuracy that is sufficient for us to consider them basically the same. For example, in \x1b[1;46mCluster 3\x1b[1;49m, the apartments differ by 0.000229 in longitude and by 0.00004 in latitude, with the 2 points being merely around 0.01979 km (or 0.012298 miles) apart, which is considered to be the same place with parameter δ = 0.001, but violates the MFD with parameter δ = 0.0001.
--------------------------------------------------------------------------------
MFD discovery can even be performed on \x1b[1;42mstrings\x1b[1;49m using cosine distance.
Let's showcase this by checking addresses_names.csv and trying to verify a metric functional dependency in it.
Expand Down Expand Up @@ -4344,7 +4344,7 @@ class [10%] with 2 elements
snapshots['test_example[expert/data_cleaning_dc.py-None-data_cleaning_dc_output] data_cleaning_dc_output'] = '''This is an advanced example explaining how to use Denial Constraint (DC) verification for data cleaning.
A basic example of using Denial Constraints is located in examples/basic/verifying_dc.py.

DC verification is perfomed by the Rapidash algorithm:
DC verification is performed by the Rapidash algorithm:
Zifan Liu, Shaleen Deep, Anna Fariha, Fotis Psallidas, Ashish Tiwari, and Avrilia
Floratou. 2023. Rapidash: Efficient Constraint Discovery via Rapid Verification.
URL: https://arxiv.org/abs/2309.12436
Expand Down
27 changes: 27 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,30 @@ exclude = [
"src/tests/*",
"examples/*",
]

[tool.typos]
[tool.typos.files]
extend-exclude = [
"test_input_data/",
]
[tool.typos.default]
extend-ignore-re = [
"countr_zero",
"BA thesis\\.pdf",
"Peter J\\. Hass",
"Comput\\. J\\.",
'"ba"',

# forced options. In the future, when decentralized configurations are added, it will be necessary to remove them
"helo",
"abd",
]

[tool.typos.default.extend-words]
nd = "nd"

[tool.typos.type.jupyter]
extend-ignore-re = [
'\"id":\s*"[^"\n]*"',
'"outputs"\s*:\s*\[(\s*\{[^}]*\}(?:,\s*\{[^}]*\})*)?\s*\]'
]
4 changes: 2 additions & 2 deletions src/core/algorithms/algebraic_constraints/ac_algorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace algos {

/* Discovers Algebraic Constraints (AC). In theory AC consists of: 1) Set of value
* pairs (a_i, b_k), where a_i from column A and b_k from column B. 2) Pairing
* rule - bijection beetwen columns A and B. Algorithm was implemented with Trivial
* rule - bijection between columns A and B. Algorithm was implemented with Trivial
* pairing rule. Trivial pairing rule creates (a_i, b_i) pairs, both values are from
* the same row. 3) Binary operation. 4) Ranges - set of ranges/intervals that was
* constructed by grouping results of binary operation between a_i and b_k, boundary
Expand Down Expand Up @@ -88,7 +88,7 @@ class ACAlgorithm : public Algorithm {
}

size_t CalculateSampleSize(size_t k_bumps) const;
/* Returns ranges reconstucted with new weight for pair of columns */
/* Returns ranges reconstructed with new weight for pair of columns */
RangesCollection ReconstructRangesByColumns(size_t lhs_i, size_t rhs_i, double weight);

std::vector<RangesCollection> const& GetRangesCollections() const {
Expand Down
4 changes: 2 additions & 2 deletions src/core/algorithms/association_rules/apriori.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,10 @@ unsigned long long Apriori::FindFrequent() {
candidates_count += candidate_children.size();
}
auto const branching_degree = level_num_;
auto const min_treshold = candidates_count / branching_degree + 1;
auto const min_threshold = candidates_count / branching_degree + 1;

candidate_hash_tree_ = std::make_unique<CandidateHashTree>(
transactional_data_.get(), candidates_, branching_degree, min_treshold);
transactional_data_.get(), candidates_, branching_degree, min_threshold);
candidate_hash_tree_->PerformCounting();
candidate_hash_tree_->PruneNodes(minsup_);
AppendToTree();
Expand Down
8 changes: 4 additions & 4 deletions src/core/algorithms/association_rules/candidate_hash_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ void CandidateHashTree::AppendRow(LeafRow row, HashTreeNode& subtree_root) {
unsigned const max_level_number = row.candidate_node->items.size();
subtree_root.candidates.push_back(std::move(row));

/* If the number of candidates in a leaf node is more than min_thresold, a leaf node becomes
* an internal node and the tree expands. But if there is no more levels to expand (maximum
* level number equals to the cardinality of a candidates), min_threshold is ignored
* and a new candidates are just appended without trying to further grow the tree.*/
/* If the number of candidates in a leaf node is more than min_threshold, a leaf node
* becomes an internal node and the tree expands. But if there are no more levels to expand
* (maximum level number equals the cardinality of a candidate), min_threshold is
* ignored and new candidates are just appended without trying to further grow the tree. */
if (subtree_root.candidates.size() > min_threshold_ &&
subtree_root.level_number <= max_level_number) {
AddLevel(subtree_root);
Expand Down
2 changes: 1 addition & 1 deletion src/core/algorithms/cfd/fd_first_algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ void FDFirstAlgorithm::CheckForIncorrectInput() const {
if (tuples_number_ != 0 && columns_number_ == 0) {
throw config::ConfigurationError(
"[ERROR] Illegal columns_number and tuples_number values: tuples_number is " +
std::to_string(tuples_number_) + " while columnes_number is 0");
std::to_string(tuples_number_) + " while columns_number is 0");
}

if (columns_number_ != 0 && tuples_number_ != 0 && min_supp_ > tuples_number_) {
Expand Down
8 changes: 4 additions & 4 deletions src/core/algorithms/cfd/model/cfd_relation_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ size_t CFDRelationData::GetNumRows() const {
}

void CFDRelationData::AddNewItemsInFullTable(ItemDictionary& item_dictionary,
ColumnesValuesDict& columns_values_dict,
ColumnsValuesDict& columns_values_dict,
std::vector<ItemInfo>& items,
std::vector<std::string> const& string_row,
std::vector<int>& int_row,
Expand Down Expand Up @@ -52,7 +52,7 @@ std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStre
std::vector<Transaction> data_rows;
ItemDictionary item_dictionary;
std::vector<ItemInfo> items;
ColumnesValuesDict columns_values_dict;
ColumnsValuesDict columns_values_dict;
int unique_elems_number = 1;

unsigned num_columns = parser.GetNumberOfColumns();
Expand Down Expand Up @@ -83,7 +83,7 @@ std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStre
}

void CFDRelationData::AddNewItemsInPartialTable(ItemDictionary& item_dictionary,
ColumnesValuesDict& columns_values_dict,
ColumnsValuesDict& columns_values_dict,
std::vector<ItemInfo>& items,
std::vector<std::string> const& string_row,
std::vector<int> const& columns_numbers_list,
Expand Down Expand Up @@ -121,7 +121,7 @@ std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStre
std::vector<Transaction> data_rows;
ItemDictionary item_dictionary;
std::vector<ItemInfo> items;
ColumnesValuesDict columns_values_dict;
ColumnsValuesDict columns_values_dict;
int unique_elems_number = 1;
std::random_device rd; // only used once to initialise (seed) engine
std::mt19937 rng(rd()); // random-number engine used (Mersenne-Twister in this case)
Expand Down
6 changes: 3 additions & 3 deletions src/core/algorithms/cfd/model/cfd_relation_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace algos::cfd {
class CFDRelationData : public AbstractRelationData<CFDColumnData> {
private:
using ItemDictionary = boost::unordered_map<std::pair<int, std::string>, int, PairHash>;
using ColumnesValuesDict = std::unordered_map<AttributeIndex, std::vector<int>>;
using ColumnsValuesDict = std::unordered_map<AttributeIndex, std::vector<int>>;

// ItemInfo contains info about one elem in the table.
struct ItemInfo {
Expand All @@ -39,12 +39,12 @@ class CFDRelationData : public AbstractRelationData<CFDColumnData> {
boost::unordered_map<std::pair<int, std::string>, int, PairHash> item_dictionary_;
std::vector<ItemInfo> items_;

static void AddNewItemsInFullTable(ItemDictionary &, ColumnesValuesDict &,
static void AddNewItemsInFullTable(ItemDictionary &, ColumnsValuesDict &,
std::vector<ItemInfo> &, std::vector<std::string> const &,
std::vector<int> &, std::vector<Transaction> &, int &,
unsigned);

static void AddNewItemsInPartialTable(ItemDictionary &, ColumnesValuesDict &,
static void AddNewItemsInPartialTable(ItemDictionary &, ColumnsValuesDict &,
std::vector<ItemInfo> &, std::vector<std::string> const &,
std::vector<int> const &, std::vector<Transaction> &,
int &, int);
Expand Down
2 changes: 1 addition & 1 deletion src/core/algorithms/dc/FastADC/model/pli_shard.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ class PliShardBuilder {
return string_provider_->GetIndex(GetValue<std::string>(column, row));
else
static_assert(details::DependentFalse<T>::value,
"PliShardBuilder does not unsupport that type");
"PliShardBuilder does not support that type");
}

template <typename T>
Expand Down
2 changes: 1 addition & 1 deletion src/core/algorithms/dc/FastADC/model/predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class PredicateProvider;
* TODO: Java code uses LongBitSet, which is like boost::dynamic_bitset, but
* restricts number of bits in the clue to kPredicateBits. Need to investigate further whether
* the Java's algorithm could work with predicate space more than kPredicateBits.
* But for now we use kPredicateBits as maxumum amount of predicates
* But for now we use kPredicateBits as maximum amount of predicates
*/
constexpr auto kPredicateBits = 128;
using PredicateBitset = model::Bitset<kPredicateBits>;
Expand Down
2 changes: 1 addition & 1 deletion src/core/algorithms/dc/model/tuple.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ namespace algos::dc {
// Left operand - Tuple::kS.
// Right operand is a constant value thus it has no tuple.
// If the Predicate involves only one variable ColumnOperand, then Predicate.tuple_ is
// the same as the tuple_ of involved varialble ColumnOperand.
// the same as the tuple_ of involved variable ColumnOperand.
// In this case Predicate.tuple_ is set to Tuple::kS.
//
enum class Tuple { kS, kT, kMixed };
Expand Down
Loading
Loading