Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 12 additions & 7 deletions harper-cli/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#![doc = include_str!("../README.md")]

use harper_core::spell::{Dictionary, FstDictionary, MutableDictionary, WordId};
use harper_core::spell::{CanonicalWordId, Dictionary, FstDictionary, MutableDictionary};
use hashbrown::HashMap;
use std::collections::BTreeMap;
use std::fs::File;
Expand Down Expand Up @@ -358,7 +358,7 @@ fn main() -> anyhow::Result<()> {
];

for word in words {
let meta = curated_dictionary.get_word_metadata_str(&word);
let meta = curated_dictionary.get_word_metadata_str_exact(&word);
let (flags, emojis) = meta.as_ref().map_or_else(
|| (String::new(), String::new()),
|md| {
Expand Down Expand Up @@ -854,7 +854,7 @@ fn main() -> anyhow::Result<()> {
let mut processed_words = HashMap::new();
let mut longest_word = 0;
for word in curated_dictionary.words_iter() {
if let Some(metadata) = curated_dictionary.get_word_metadata(word) {
if let Some(metadata) = curated_dictionary.get_word_metadata_exact(word) {
let orth = metadata.orth_info;
let bits = orth.bits() & case_bitmask.bits();

Expand Down Expand Up @@ -956,11 +956,16 @@ fn line_to_parts(line: &str) -> (String, String) {
fn print_word_derivations(word: &str, annot: &str, dictionary: &impl Dictionary) {
println!("{word}/{annot}");

let id = WordId::from_word_str(word);
let id = CanonicalWordId::from_word_str(word);

let children = dictionary
.words_iter()
.filter(|e| dictionary.get_word_metadata(e).unwrap().derived_from == Some(id));
let children = dictionary.words_iter().filter(|e| {
dictionary
.get_word_metadata_exact(e)
.unwrap()
.derived_from
.map(|derived_from| derived_from.canonical())
== Some(id)
});

println!(" - {word}");

Expand Down
2 changes: 1 addition & 1 deletion harper-comments/tests/language_support.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ create_test!(jsdoc.ts, 4);
create_test!(issue_96.lua, 0);
create_test!(merged_lines.ts, 1);
create_test!(javadoc_clean_simple.java, 0);
create_test!(javadoc_complex.java, 5);
create_test!(javadoc_complex.java, 4);
create_test!(issue_132.rs, 1);
create_test!(laravel_app.php, 2);
create_test!(ignore_shebang_1.sh, 0);
Expand Down
1 change: 1 addition & 0 deletions harper-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ harper-brill = { path = "../harper-brill", version = "1.0.0" }
harper-thesaurus = { path = "../harper-thesaurus", version = "1.4.1", optional = true }
bitflags = { version = "2.10.0", features = ["serde"] }
trie-rs = "0.4.2"
indexmap = "2.12.1"

[dev-dependencies]
criterion = { version = "0.8.1", default-features = false }
Expand Down
1 change: 1 addition & 0 deletions harper-core/dictionary.dict
Original file line number Diff line number Diff line change
Expand Up @@ -8252,6 +8252,7 @@ PowerPoint/ONgV
Powers/NOg
Powhatan/NOg
Poznan/Og
Pr/ # Praseodymium
Prada/g
Prado/Og
Praetorian/Ng
Expand Down
16 changes: 10 additions & 6 deletions harper-core/src/dict_word_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use strum_macros::{Display, EnumCount, EnumIter, EnumString, VariantArray};
use std::convert::TryFrom;

use crate::dict_word_metadata_orthography::OrthFlags;
use crate::spell::WordId;
use crate::spell::WordIdPair;
use crate::{Document, TokenKind, TokenStringExt};

/// This represents a "lexeme" or "headword" which is case-folded but affix-expanded.
Expand Down Expand Up @@ -45,7 +45,7 @@ pub struct DictWordMetadata {
#[serde(default = "default_false")]
pub common: bool,
#[serde(default = "default_none")]
pub derived_from: Option<WordId>,
pub derived_from: Option<WordIdPair>,
/// Generated by a chunker. Declares whether the word is a member of a nominal phrase. Using
/// this should be preferred over the similarly named `Pattern`.
///
Expand Down Expand Up @@ -1195,15 +1195,19 @@ impl Default for DialectFlags {

#[cfg(test)]
pub mod tests {
use std::borrow::Cow;
use std::sync::{Arc, LazyLock};

use crate::DictWordMetadata;
use crate::spell::{Dictionary, FstDictionary};

// Helper function to get metadata from the curated dictionary
pub fn md(word: &str) -> DictWordMetadata {
FstDictionary::curated()
.get_word_metadata_str(word)
pub fn md(word: &str) -> Cow<'_, DictWordMetadata> {
static CURATED_DICT: LazyLock<Arc<FstDictionary>> = LazyLock::new(FstDictionary::curated);

CURATED_DICT
.get_word_metadata_combined_str(word)
.unwrap_or_else(|| panic!("Word '{word}' not found in dictionary"))
.into_owned()
}

mod dialect {
Expand Down
2 changes: 1 addition & 1 deletion harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ impl Document {
if let TokenKind::Word(meta) = &mut token.kind {
let word_source = token.span.get_content(&self.source);
let mut found_meta = dictionary
.get_word_metadata(word_source)
.get_word_metadata_combined(word_source)
.map(|c| c.into_owned());

if let Some(inner) = &mut found_meta {
Expand Down
6 changes: 3 additions & 3 deletions harper-core/src/expr/mergeable_words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,13 @@ impl MergeableWords {
let mut compound = a_chars.clone();
compound.push(' ');
compound.extend_from_slice(&b_chars);
let meta_open = self.dict.get_word_metadata(&compound);
let meta_open = self.dict.get_word_metadata(&compound).first().copied();

// Then check if the closed compound exists in the dictionary
compound.remove(a_chars.len());
let meta_closed = self.dict.get_word_metadata(&compound);
let meta_closed = self.dict.get_word_metadata(&compound).first().copied();

if (self.predicate)(meta_closed.as_deref(), meta_open.as_deref()) {
if (self.predicate)(meta_closed, meta_open) {
return Some(compound);
}

Expand Down
4 changes: 2 additions & 2 deletions harper-core/src/linting/inflected_verb_after_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ impl<T: Dictionary> Linter for InflectedVerbAfterTo<T> {
}

let check_stem = |stem: &[char]| {
if let Some(metadata) = self.dictionary.get_word_metadata(stem)
if let Some(metadata) = self.dictionary.get_word_metadata_combined(stem)
&& metadata.is_verb()
&& !metadata.is_noun()
{
Expand Down Expand Up @@ -79,7 +79,7 @@ impl<T: Dictionary> Linter for InflectedVerbAfterTo<T> {
let ed_specific_heuristics = || {
if let Some(prev) = document.get_next_word_from_offset(pi, -1) {
let prev_chars = document.get_span_content(&prev.span);
if let Some(metadata) = self.dictionary.get_word_metadata(prev_chars) {
if let Some(metadata) = self.dictionary.get_word_metadata_combined(prev_chars) {
// adj: "able to" expects an infinitive verb
// verb: "have/had/has/having to" expect an infinitive verb
if metadata.is_adjective() || metadata.is_verb() {
Expand Down
4 changes: 2 additions & 2 deletions harper-core/src/linting/mass_nouns/mass_plurals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ where

fn is_mass_noun_in_dictionary(&self, chars: &[char]) -> bool {
self.dict
.get_word_metadata(chars)
.get_word_metadata_combined(chars)
.is_some_and(|wmd| wmd.is_mass_noun_only())
}

fn is_mass_noun_in_dictionary_str(&self, s: &str) -> bool {
self.dict
.get_word_metadata_str(s)
.get_word_metadata_combined_str(s)
.is_some_and(|wmd| wmd.is_mass_noun_only())
}
}
Expand Down
2 changes: 1 addition & 1 deletion harper-core/src/linting/more_adjective.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ where
}

fn add_valid_candidate(&self, candidates: &mut Vec<String>, candidate: String) -> bool {
if let Some(metadata) = self.dict.get_word_metadata_str(&candidate)
if let Some(metadata) = self.dict.get_word_metadata_str_exact(&candidate)
&& (metadata.is_comparative_adjective() || metadata.is_superlative_adjective())
{
candidates.push(candidate);
Expand Down
8 changes: 4 additions & 4 deletions harper-core/src/linting/one_of_the_singular.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,14 @@ impl<D: Dictionary + 'static> ExprLinter for OneOfTheSingular<D> {

if self
.dict
.get_word_metadata(&plural_s)
.get_word_metadata_combined(&plural_s)
.is_some_and(|m| m.is_plural_noun())
{
suggestions.push(Suggestion::replace_with_match_case(plural_s, singular));
}
if self
.dict
.get_word_metadata(&plural_es)
.get_word_metadata_combined(&plural_es)
.is_some_and(|m| m.is_plural_noun())
{
suggestions.push(Suggestion::replace_with_match_case(plural_es, singular));
Expand All @@ -117,7 +117,7 @@ impl<D: Dictionary + 'static> ExprLinter for OneOfTheSingular<D> {
plural_ies.extend(['i', 'e', 's']);
if self
.dict
.get_word_metadata(&plural_ies)
.get_word_metadata_combined(&plural_ies)
.is_some_and(|m| m.is_plural_noun())
{
suggestions.push(Suggestion::replace_with_match_case(plural_ies, singular));
Expand All @@ -130,7 +130,7 @@ impl<D: Dictionary + 'static> ExprLinter for OneOfTheSingular<D> {
plural_ves.extend(['v', 'e', 's']);
if self
.dict
.get_word_metadata(&plural_ves)
.get_word_metadata_combined(&plural_ves)
.is_some_and(|m| m.is_plural_noun())
{
suggestions.push(Suggestion::replace_with_match_case(plural_ves, singular));
Expand Down
89 changes: 68 additions & 21 deletions harper-core/src/linting/orthographic_consistency.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use itertools::Itertools;

use crate::linting::{LintKind, Suggestion};
use std::sync::Arc;

Expand Down Expand Up @@ -67,6 +69,11 @@ impl ExprLinter for OrthographicConsistency {

let chars = word.span.get_content(source);

if self.dict.contains_exact_word(chars) {
// Exit if the dictionary contains the exact word.
return None;
}

let cur_flags = OrthFlags::from_letters(chars);

if metadata.is_allcaps()
Expand All @@ -86,19 +93,19 @@ impl ExprLinter for OrthographicConsistency {
}

let canonical_flags = metadata.orth_info;
let flags_to_check = [
OrthFlags::LOWER_CAMEL,
OrthFlags::UPPER_CAMEL,
OrthFlags::APOSTROPHE,
OrthFlags::HYPHENATED,
];

if flags_to_check
.into_iter()
.filter(|flag| canonical_flags.contains(*flag) != cur_flags.contains(*flag))
.count()
== 1
&& let Some(canonical) = self.dict.get_correct_capitalization_of(chars)
let flags_to_check = OrthFlags::LOWER_CAMEL
| OrthFlags::UPPER_CAMEL
| OrthFlags::APOSTROPHE
| OrthFlags::HYPHENATED;

// Only continue when at least one of the flags selected by `flags_to_check` differs
// between `cur_flags` and `canonical_flags`.
if !((canonical_flags ^ cur_flags) & flags_to_check).is_empty()
&& let Ok(canonical) = self
.dict
.get_correct_capitalization_of(chars)
.into_iter()
.exactly_one()
&& alphabetic_differs(canonical, chars)
{
return Some(Lint {
Expand All @@ -115,7 +122,11 @@ impl ExprLinter for OrthographicConsistency {

if metadata.is_titlecase()
&& cur_flags.contains(OrthFlags::LOWERCASE)
&& let Some(canonical) = self.dict.get_correct_capitalization_of(chars)
&& let Ok(canonical) = self
.dict
.get_correct_capitalization_of(chars)
.into_iter()
.exactly_one()
&& alphabetic_differs(canonical, chars)
{
return Some(Lint {
Expand Down Expand Up @@ -144,7 +155,10 @@ fn alphabetic_differs(a: &[char], b: &[char]) -> bool {

#[cfg(test)]
mod tests {
use crate::linting::tests::{assert_no_lints, assert_suggestion_result};
use crate::linting::tests::{
assert_good_and_bad_suggestions, assert_lint_count, assert_no_lints,
assert_suggestion_result,
};

use super::OrthographicConsistency;

Expand All @@ -158,20 +172,29 @@ mod tests {
}

#[test]
fn ikea_should_be_all_caps() {
fn america_capitalized() {
assert_suggestion_result(
"Ikea operates a vast retail network.",
"The word america should be capitalized.",
OrthographicConsistency::default(),
"IKEA operates a vast retail network.",
"The word America should be capitalized.",
);
}

#[test]
fn lego_should_be_all_caps() {
fn harper_automattic_capitalized() {
assert_lint_count(
"So should harper and automattic.",
OrthographicConsistency::default(),
2,
);
}

#[test]
fn ikea_should_be_all_caps() {
assert_suggestion_result(
"Lego bricks encourage creativity.",
"Ikea operates a vast retail network.",
OrthographicConsistency::default(),
"LEGO bricks encourage creativity.",
"IKEA operates a vast retail network.",
);
}

Expand Down Expand Up @@ -403,4 +426,28 @@ mod tests {
OrthographicConsistency::default(),
);
}

/// Runs `OrthographicConsistency` over the lone word "MacOS", passing "macOS" to the
/// helper as the good suggestion and "MacOS" as the bad one.
#[test]
fn no_improper_suggestion_for_macos() {
    // NOTE(review): presumably the helper requires every "good" suggestion to be
    // acceptable and every "bad" one to be rejected; it is defined outside this
    // view, so confirm against its definition.
    let linter = OrthographicConsistency::default();
    let good = ["macOS"];
    let bad = ["MacOS"];

    assert_good_and_bad_suggestions("MacOS", linter, &good, &bad);
}

/// "Pr" and "PR" are both words in the curated dictionary, so neither spelling should
/// be linted.
#[test]
fn accept_case_variants() {
    // Regression guard: at the time this test was written, "Pr" would be linted for the
    // supposed reason of its canonical spelling being "PR", despite "Pr" itself being a
    // word in the curated dictionary.
    let text = "Pr PR";
    let linter = OrthographicConsistency::default();

    assert_no_lints(text, linter);
}

/// Lowercase "pr" isn't defined in the dictionary, so it should be linted exactly once.
#[test]
fn dont_accept_undefined_case_variants() {
    let linter = OrthographicConsistency::default();
    let expected_lints = 1;

    assert_lint_count("pr", linter, expected_lints);
}
}
4 changes: 2 additions & 2 deletions harper-core/src/linting/phrasal_verb_as_compound_noun.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,10 @@ impl Linter for PhrasalVerbAsCompoundNoun {
// So far we only have a small number of phrasal verbs in the dictionary.
let (verb_part_is_verb, phrasal_verb_is_verb) = (
self.dict
.get_word_metadata(verb_part)
.get_word_metadata_combined(verb_part)
.is_some_and(|md| md.verb.is_some()),
self.dict
.get_word_metadata_str(&phrasal_verb)
.get_word_metadata_combined_str(&phrasal_verb)
.is_some_and(|md| md.verb.is_some()),
);

Expand Down
4 changes: 2 additions & 2 deletions harper-core/src/linting/pronoun_verb_agreement.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ where
.iter()
.filter(|&w| {
self.dict
.get_word_metadata(w)
.get_word_metadata_exact(w)
.is_some_and(|md| md.is_verb_lemma())
})
.map(|w| w.to_vec())
Expand Down Expand Up @@ -161,7 +161,7 @@ where
.iter()
.filter(|&w| {
self.dict
.get_word_metadata(w)
.get_word_metadata_exact(w)
.is_some_and(|md| md.is_verb_third_person_singular_present_form())
})
.map(|w| w.to_vec())
Expand Down
Loading
Loading