Desbordante · PodushkaPIR · Nov 19, 2025 · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025
diff --git a/.github/workflows/check-codestyle.yml b/.github/workflows/check-codestyle.yml
@@ -84,3 +84,11 @@ jobs:
       - name: Fail the check if clang-tidy reported issues
         if: steps.review.outputs.total_comments > 0
         run: exit 1
+
+  typos-check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Check spelling with typos
+        uses: crate-ci/typos@v1.43.5
diff --git a/cmake/desbordante_configure_datasets.cmake b/cmake/desbordante_configure_datasets.cmake
@@ -69,7 +69,7 @@ function(desbordante_fetch_datasets)
     list(GET download_status 0 status_code)
     if(NOT status_code EQUAL 0)
         list(GET download_status 1 str_val)
-        message(NOTICE "Donwload log:\n${download_log}")
+        message(NOTICE "Download log:\n${download_log}")
         message(FATAL_ERROR "Failed to download ${filename}: [${status_code}] ${str_val}.")
     endif()
 

diff --git a/cmake/desbordante_deps.cmake b/cmake/desbordante_deps.cmake
@@ -67,7 +67,7 @@ if(DESBORDANTE_BUILD_TESTS)
     # TODO(senichenkov): remove when googletest gets updated
     if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "21")
         message(WARNING "Googletest has a bug recognized by Clang 21+. "
-                "Supressing character-conversion warning. "
+                "Suppressing character-conversion warning. "
                 "Consider using an older version of Clang.")
         target_compile_options(gtest PRIVATE "-Wno-error=character-conversion")
     endif()

diff --git a/examples/advanced/comparison_pfd_vs_afd.py b/examples/advanced/comparison_pfd_vs_afd.py
@@ -38,7 +38,7 @@ def get_pfds():
 for fd in pfds - afds:
     verifier_algo.execute(lhs_indices=fd.lhs_indices, rhs_indices=[fd.rhs_index])
     fd_error = verifier_algo.get_error()
-    print(f"e({fd}) =", fd_error) # AFD error is signifcantly larger than PFD PerValue
+    print(f"e({fd}) =", fd_error) # AFD error is significantly larger than PFD PerValue
 
 print('In case of PerValue error measure, violations on data from the single "glitchy"')
-print('sensor device among many do not prevent dependecy from being found')
+print('sensor device among many do not prevent dependency from being found')
diff --git a/examples/advanced/comparison_ucc_and_aucc_2.py b/examples/advanced/comparison_ucc_and_aucc_2.py
@@ -51,7 +51,7 @@ def print_table(filename: str) -> None:
     print(f'\t{CYAN}{aucc.to_long_string()}{ENDC}')
 print()
 
-print('Let\'s run UCC mining algorihm:')
+print('Let\'s run UCC mining algorithm:')
 e_algo = desbordante.ucc.algorithms.Default()
 e_algo.load_data(table=(TABLE, ',', True))
 e_algo.execute()

diff --git a/examples/basic/mining_list_od.py b/examples/basic/mining_list_od.py
@@ -40,7 +40,7 @@ def print_named_ods(list_ods, data_frame):
     print("Resulting dependencies for this table are:")
     print_named_ods(ods, df)
     print()
-    print("Depenency [weight] -> [shipping cost] means that ordering table by weight")
+    print("Dependency [weight] -> [shipping cost] means that ordering table by weight")
     print("will also order table by shipping cost automatically. Let's order by weight: ")
 
     df_sorted = df.sort_values("weight")

diff --git a/examples/basic/verifying_mfd.py b/examples/basic/verifying_mfd.py
@@ -70,7 +70,7 @@ def mfd_coordinates():
           "approximately the same place with a degree of accuracy "
           "that is sufficient for us to consider them basically the same. "
           f"For example, in {BLUE_CODE}Cluster 3{DEFAULT_COLOR_CODE}, "
-          "the appartments differ by 0.000229 in longitude and by 0.00004 "
+          "the apartments differ by 0.000229 in longitude and by 0.00004 "
           "in latitude, with the 2 points being merely "
           "around 0.01979 km (or 0.012298 miles) apart, "
           "which is considered to be the same place with "

diff --git a/examples/basic/verifying_nd/verifying_nd_1.py b/examples/basic/verifying_nd/verifying_nd_1.py
@@ -63,12 +63,12 @@ def print_table(filename: str, row_numbers: list[int] = []) -> None:
 print(f'Number of clusters: {len(highlights)}')
 highlight_indices = []
 for high in highlights:
-    highlight_indices += high.get_occurences_indices()
+    highlight_indices += high.get_occurrences_indices()
 
 print(CYAN, end='')
 print_table(EXPIRED_PASSPORT_TABLE, highlight_indices)
 print(ENDC)
-print(f'So, {highlights[0].lhs_value} has {highlights[0].occurences_number} documents')
+print(f'So, {highlights[0].lhs_value} has {highlights[0].occurrences_number} documents')
 print('One of them is expired and shouldn\'t appear in this table. Let\'s remove this line:')
 print(CYAN, end='')
 print_table(VALID_PASSPORTS_TABLE)

diff --git a/examples/basic/verifying_nd/verifying_nd_2.py b/examples/basic/verifying_nd/verifying_nd_2.py
@@ -63,12 +63,12 @@ def print_table(filename: str, row_numbers: list[int] = []) -> None:
 print(f'Number of clusters: {len(highlights)}')
 highlight_indices = []
 for high in highlights:
-    highlight_indices += high.get_occurences_indices()
+    highlight_indices += high.get_occurrences_indices()
 
 print(CYAN, end='')
 print_table(MERGED_PEOPLE_TABLE, highlight_indices)
 print(ENDC)
-print(f'So, {highlights[0].lhs_value} has {highlights[0].occurences_number} documents. It\'s twice as much as needed.')
+print(f'So, {highlights[0].lhs_value} has {highlights[0].occurrences_number} documents. It\'s twice as much as needed.')
 print(f'Look at birth date. {highlights[0].lhs_value} has two different values.')
 print(f'Maybe, we have two different {highlights[0].lhs_value}? Let\'s split them:')
 print(CYAN, end='')

diff --git a/examples/basic/verifying_nd/verifying_nd_3.py b/examples/basic/verifying_nd/verifying_nd_3.py
@@ -65,12 +65,12 @@ def print_table(filename: str, row_numbers: list[int] = []) -> None:
 print(f'Number of clusters: {len(highlights)}')
 highlight_indices = []
 for high in highlights:
-    highlight_indices += high.get_occurences_indices()
+    highlight_indices += high.get_occurrences_indices()
 
 print(CYAN, end='')
 print_table(EXPIRED_PASSPORT_TABLE, highlight_indices)
 print(ENDC)
-print(f'So, {highlights[0].lhs_value} has {highlights[0].occurences_number} documents')
+print(f'So, {highlights[0].lhs_value} has {highlights[0].occurrences_number} documents')
 print('One of them is expired and shouldn\'t appear in this table. Let\'s remove this line:')
 print(CYAN, end='')
 print_table(VALID_PASSPORTS_TABLE)

diff --git a/examples/expert/data_cleaning_dc.py b/examples/expert/data_cleaning_dc.py
@@ -59,7 +59,7 @@ def main():
     print("This is an advanced example explaining how to use Denial Constraint (DC) verification for data cleaning.\n"
     "A basic example of using Denial Constraints is located in examples/basic/verifying_dc.py.\n")
 
-    print("DC verification is perfomed by the Rapidash algorithm:\n"
+    print("DC verification is performed by the Rapidash algorithm:\n"
     "Zifan Liu, Shaleen Deep, Anna Fariha, Fotis Psallidas, Ashish Tiwari, and Avrilia\n"
     "Floratou. 2023. Rapidash: Efficient Constraint Discovery via Rapid Verification.\n"
     "URL: https://arxiv.org/abs/2309.12436\n")

diff --git a/examples/notebooks/Functional_Dependencies_Mining.ipynb b/examples/notebooks/Functional_Dependencies_Mining.ipynb
@@ -468,7 +468,7 @@
       "source": [
-        "# Dsicover functional dependencies\n",
+        "# Discover functional dependencies\n",
         "\n",
-        "Using Desbordante it's trivial to discover all `functional dependecies` in the dataset."
+        "Using Desbordante it's trivial to discover all `functional dependencies` in the dataset."
       ]
     },
     {

diff --git a/examples/test_examples/snapshots/snap_test_examples_pytest.py b/examples/test_examples/snapshots/snap_test_examples_pytest.py
@@ -163,7 +163,7 @@
 1 - PerValue([DeviceId] -> Data) = 0.1714285714
 e([DeviceId] -> Data) = 0.23076923076923078
 In case of PerValue error measure, violations on data from the single "glitchy"
-sensor device among many do not prevent dependecy from being found
+sensor device among many do not prevent dependency from being found
 '''
 
 snapshots['test_example[advanced/comparison_ucc_and_aucc_1.py-None-comparison_ucc_and_aucc_1_output] comparison_ucc_and_aucc_1_output'] = '''\x1b[1m\x1b[36mThis example illustrates the difference between exact and approximate Unique
@@ -268,7 +268,7 @@
 \t\x1b[1m\x1b[36m[Last_name Grade Salary]\x1b[0m
 \t\x1b[1m\x1b[36m[Work_experience]\x1b[0m
 
-Let's run UCC mining algorihm:
+Let's run UCC mining algorithm:
 Found UCCs:
 \t\x1b[1m\x1b[36m[First_name Grade Salary]\x1b[0m
 \t\x1b[1m\x1b[36m[Work_experience]\x1b[0m
@@ -2057,7 +2057,7 @@
 ['weight', 'days'] -> ['shipping cost']
 ['weight'] -> ['shipping cost']
 
-Depenency [weight] -> [shipping cost] means that ordering table by weight
+Dependency [weight] -> [shipping cost] means that ordering table by weight
 will also order table by shipping cost automatically. Let's order by weight: 
 
 +----+----------+-----------------+--------+
@@ -4028,7 +4028,7 @@ class [10%] with 2 elements
 
 Let's take a closer look at them.
 
-Both google and geocoder provided coordinates for multiple addresses that didn't have matching coordinates across different sources, yet were close enough for us to assume that they point to approximately the same place with a degree of accuracy that is sufficient for us to consider them basically the same. For example, in \x1b[1;46mCluster 3\x1b[1;49m, the appartments differ by 0.000229 in longitude and by 0.00004 in latitude, with the 2 points being merely around 0.01979 km (or 0.012298 miles) apart, which is considered to be the same place with parameter δ = 0.001, but violates the MFD with parameter δ = 0.0001.
+Both google and geocoder provided coordinates for multiple addresses that didn't have matching coordinates across different sources, yet were close enough for us to assume that they point to approximately the same place with a degree of accuracy that is sufficient for us to consider them basically the same. For example, in \x1b[1;46mCluster 3\x1b[1;49m, the apartments differ by 0.000229 in longitude and by 0.00004 in latitude, with the 2 points being merely around 0.01979 km (or 0.012298 miles) apart, which is considered to be the same place with parameter δ = 0.001, but violates the MFD with parameter δ = 0.0001.
 --------------------------------------------------------------------------------
 MFD discovery can even be performed on \x1b[1;42mstrings\x1b[1;49m using cosine distance.
 Let's showcase this by checking addresses_names.csv and trying to verify a metric functional dependency in it.
@@ -4344,7 +4344,7 @@ class [10%] with 2 elements
 snapshots['test_example[expert/data_cleaning_dc.py-None-data_cleaning_dc_output] data_cleaning_dc_output'] = '''This is an advanced example explaining how to use Denial Constraint (DC) verification for data cleaning.
 A basic example of using Denial Constraints is located in examples/basic/verifying_dc.py.
 
-DC verification is perfomed by the Rapidash algorithm:
+DC verification is performed by the Rapidash algorithm:
 Zifan Liu, Shaleen Deep, Anna Fariha, Fotis Psallidas, Ashish Tiwari, and Avrilia
 Floratou. 2023. Rapidash: Efficient Constraint Discovery via Rapid Verification.
 URL: https://arxiv.org/abs/2309.12436

diff --git a/pyproject.toml b/pyproject.toml
@@ -49,3 +49,30 @@ exclude = [
     "src/tests/*",
     "examples/*",
 ]
+
+[tool.typos]
+[tool.typos.files]
+extend-exclude = [
+    "test_input_data/",
+]
+[tool.typos.default]
+extend-ignore-re = [
+    "countr_zero",
+    "BA thesis\\.pdf",
+    "Peter J\\. Hass",
+    "Comput\\. J\\.",
+    '"ba"',
+
+# Forced options: these entries must be removed once decentralized configurations are added.
+    "helo",
+    "abd",
+]
+
+[tool.typos.default.extend-words]
+nd = "nd"
+
+[tool.typos.type.jupyter]
+extend-ignore-re = [
+    '\"id":\s*"[^"\n]*"',
+    '"outputs"\s*:\s*\[(\s*\{[^}]*\}(?:,\s*\{[^}]*\})*)?\s*\]'
+]
diff --git a/src/core/algorithms/algebraic_constraints/ac_algorithm.h b/src/core/algorithms/algebraic_constraints/ac_algorithm.h
@@ -22,7 +22,7 @@ namespace algos {
 
 /* Discovers Algebraic Constraints (AC). In theory AC consists of: 1) Set of value
  * pairs (a_i, b_k), where a_i from column A and b_k from column B. 2) Pairing
- * rule - bijection beetwen columns A and B. Algorithm was implemented with Trivial
+ * rule - bijection between columns A and B. Algorithm was implemented with Trivial
  * pairing rule. Trivial pairing rule creates (a_i, b_i) pairs, both values are from
  * the same row.  3) Binary operation. 4) Ranges - set of ranges/intervals that was
  * constructed by grouping results of binary operation between a_i and b_k, boundary
@@ -88,7 +88,7 @@ class ACAlgorithm : public Algorithm {
     }
 
     size_t CalculateSampleSize(size_t k_bumps) const;
-    /* Returns ranges reconstucted with new weight for pair of columns */
+    /* Returns ranges reconstructed with new weight for pair of columns */
     RangesCollection ReconstructRangesByColumns(size_t lhs_i, size_t rhs_i, double weight);
 
     std::vector<RangesCollection> const& GetRangesCollections() const {

diff --git a/src/core/algorithms/association_rules/apriori.cpp b/src/core/algorithms/association_rules/apriori.cpp
@@ -132,10 +132,10 @@ unsigned long long Apriori::FindFrequent() {
             candidates_count += candidate_children.size();
         }
         auto const branching_degree = level_num_;
-        auto const min_treshold = candidates_count / branching_degree + 1;
+        auto const min_threshold = candidates_count / branching_degree + 1;
 
         candidate_hash_tree_ = std::make_unique<CandidateHashTree>(
-                transactional_data_.get(), candidates_, branching_degree, min_treshold);
+                transactional_data_.get(), candidates_, branching_degree, min_threshold);
         candidate_hash_tree_->PerformCounting();
         candidate_hash_tree_->PruneNodes(minsup_);
         AppendToTree();

diff --git a/src/core/algorithms/association_rules/candidate_hash_tree.cpp b/src/core/algorithms/association_rules/candidate_hash_tree.cpp
@@ -13,10 +13,10 @@ void CandidateHashTree::AppendRow(LeafRow row, HashTreeNode& subtree_root) {
         unsigned const max_level_number = row.candidate_node->items.size();
         subtree_root.candidates.push_back(std::move(row));
 
-        /* If the number of candidates in a leaf node is more than min_thresold, a leaf node becomes
-         * an internal node and the tree expands. But if there is no more levels to expand (maximum
-         * level number equals to the cardinality of a candidates), min_threshold is ignored
-         * and a new candidates are just appended without trying to further grow the tree.*/
+        /* If the number of candidates in a leaf node exceeds min_threshold, the leaf node
+         * becomes an internal node and the tree expands. But if there are no more levels to
+         * expand (maximum level number equals the cardinality of a candidate), min_threshold is
+         * ignored and new candidates are just appended without trying to further grow the tree.*/
         if (subtree_root.candidates.size() > min_threshold_ &&
             subtree_root.level_number <= max_level_number) {
             AddLevel(subtree_root);

diff --git a/src/core/algorithms/cfd/fd_first_algorithm.cpp b/src/core/algorithms/cfd/fd_first_algorithm.cpp
@@ -80,7 +80,7 @@ void FDFirstAlgorithm::CheckForIncorrectInput() const {
     if (tuples_number_ != 0 && columns_number_ == 0) {
         throw config::ConfigurationError(
                 "[ERROR] Illegal columns_number and tuples_number values: tuples_number is " +
-                std::to_string(tuples_number_) + " while columnes_number is 0");
+                std::to_string(tuples_number_) + " while columns_number is 0");
     }
 
     if (columns_number_ != 0 && tuples_number_ != 0 && min_supp_ > tuples_number_) {

diff --git a/src/core/algorithms/cfd/model/cfd_relation_data.cpp b/src/core/algorithms/cfd/model/cfd_relation_data.cpp
@@ -16,7 +16,7 @@ size_t CFDRelationData::GetNumRows() const {
 }
 
 void CFDRelationData::AddNewItemsInFullTable(ItemDictionary& item_dictionary,
-                                             ColumnesValuesDict& columns_values_dict,
+                                             ColumnsValuesDict& columns_values_dict,
                                              std::vector<ItemInfo>& items,
                                              std::vector<std::string> const& string_row,
                                              std::vector<int>& int_row,
@@ -52,7 +52,7 @@ std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStre
     std::vector<Transaction> data_rows;
     ItemDictionary item_dictionary;
     std::vector<ItemInfo> items;
-    ColumnesValuesDict columns_values_dict;
+    ColumnsValuesDict columns_values_dict;
     int unique_elems_number = 1;
 
     unsigned num_columns = parser.GetNumberOfColumns();
@@ -83,7 +83,7 @@ std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStre
 }
 
 void CFDRelationData::AddNewItemsInPartialTable(ItemDictionary& item_dictionary,
-                                                ColumnesValuesDict& columns_values_dict,
+                                                ColumnsValuesDict& columns_values_dict,
                                                 std::vector<ItemInfo>& items,
                                                 std::vector<std::string> const& string_row,
                                                 std::vector<int> const& columns_numbers_list,
@@ -121,7 +121,7 @@ std::unique_ptr<CFDRelationData> CFDRelationData::CreateFrom(model::IDatasetStre
     std::vector<Transaction> data_rows;
     ItemDictionary item_dictionary;
     std::vector<ItemInfo> items;
-    ColumnesValuesDict columns_values_dict;
+    ColumnsValuesDict columns_values_dict;
     int unique_elems_number = 1;
     std::random_device rd;   // only used once to initialise (seed) engine
     std::mt19937 rng(rd());  // random-number engine used (Mersenne-Twister in this case)

diff --git a/src/core/algorithms/cfd/model/cfd_relation_data.h b/src/core/algorithms/cfd/model/cfd_relation_data.h
@@ -19,7 +19,7 @@ namespace algos::cfd {
 class CFDRelationData : public AbstractRelationData<CFDColumnData> {
 private:
     using ItemDictionary = boost::unordered_map<std::pair<int, std::string>, int, PairHash>;
-    using ColumnesValuesDict = std::unordered_map<AttributeIndex, std::vector<int>>;
+    using ColumnsValuesDict = std::unordered_map<AttributeIndex, std::vector<int>>;
 
     // ItemInfo contains info about one elem in the table.
     struct ItemInfo {
@@ -39,12 +39,12 @@ class CFDRelationData : public AbstractRelationData<CFDColumnData> {
     boost::unordered_map<std::pair<int, std::string>, int, PairHash> item_dictionary_;
     std::vector<ItemInfo> items_;
 
-    static void AddNewItemsInFullTable(ItemDictionary &, ColumnesValuesDict &,
+    static void AddNewItemsInFullTable(ItemDictionary &, ColumnsValuesDict &,
                                        std::vector<ItemInfo> &, std::vector<std::string> const &,
                                        std::vector<int> &, std::vector<Transaction> &, int &,
                                        unsigned);
 
-    static void AddNewItemsInPartialTable(ItemDictionary &, ColumnesValuesDict &,
+    static void AddNewItemsInPartialTable(ItemDictionary &, ColumnsValuesDict &,
                                           std::vector<ItemInfo> &, std::vector<std::string> const &,
                                           std::vector<int> const &, std::vector<Transaction> &,
                                           int &, int);

diff --git a/src/core/algorithms/dc/FastADC/model/pli_shard.h b/src/core/algorithms/dc/FastADC/model/pli_shard.h
@@ -149,7 +149,7 @@ class PliShardBuilder {
             return string_provider_->GetIndex(GetValue<std::string>(column, row));
         else
             static_assert(details::DependentFalse<T>::value,
-                          "PliShardBuilder does not unsupport that type");
+                          "PliShardBuilder does not support that type");
     }
 
     template <typename T>

diff --git a/src/core/algorithms/dc/FastADC/model/predicate.h b/src/core/algorithms/dc/FastADC/model/predicate.h
@@ -27,7 +27,7 @@ class PredicateProvider;
  * TODO: Java code uses LongBitSet, which is like boost::dynamic_bitset, but
- * restructs number of bits in the clue to kPredicateBits. Need to investigate further whether
+ * restricts number of bits in the clue to kPredicateBits. Need to investigate further whether
  * the Java's algorithm could work with predicate space more than kPredicateBits.
- * But for now we use kPredicateBits as maxumum amount of predicates
+ * But for now we use kPredicateBits as maximum amount of predicates
  */
 constexpr auto kPredicateBits = 128;
 using PredicateBitset = model::Bitset<kPredicateBits>;

diff --git a/src/core/algorithms/dc/model/tuple.h b/src/core/algorithms/dc/model/tuple.h
@@ -18,7 +18,7 @@ namespace algos::dc {
 //  Left operand - Tuple::kS.
 //  Right operand is a constant value thus it has no tuple.
 //  If the Predicate involves only one variable ColumnOperand thus Predicate.tuple_ is
-//  the same as the tuple_ of involved varialble ColumnOperand.
+//  the same as the tuple_ of involved variable ColumnOperand.
 //  In this case Predicate.tuple_ is set to Tuple::kS.
 //
 enum class Tuple { kS, kT, kMixed };