diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict
index 59ac27dd6..3777a07fb 100644
--- a/harper-core/dictionary.dict
+++ b/harper-core/dictionary.dict
@@ -29676,7 +29676,7 @@ indignance/Ng
 indignant/JY
 indignation/~Ng
 indigo/~NgJ
-indirect/~JYNV
+indirect/~JYNV # noun senses: 1) type of cost in finance; 2) type of radiator
 indiscipline/N
 indiscreet/JY
 indiscretion/NS
@@ -33552,7 +33552,7 @@ menstruation/Ng
 mensurable/J
 mensuration/Ng
 menswear/Nmg
-mental/~JYN
+mental/~JY # removed slang/Indian/rare noun senses
 mentalist/JNSg
 mentality/~NSg
 menthol/Ng
diff --git a/harper-core/src/linting/damages.rs b/harper-core/src/linting/damages.rs
new file mode 100644
index 000000000..db0aa9b67
--- /dev/null
+++ b/harper-core/src/linting/damages.rs
@@ -0,0 +1,327 @@
+use crate::{
+    CharStringExt, Lint, Token,
+    expr::{Expr, SequenceExpr},
+    linting::{ExprLinter, LintKind, Suggestion, expr_linter::Sentence},
+};
+
+/// Words that signal a legal-compensation context when they appear anywhere around the match.
+/// Plural "damages" is left alone when one of them is present.
+static KEYWORDS: &[&str] = &[
+    "case",
+    "cases",
+    "claim",
+    "claims",
+    "judgment",
+    "judgments",
+    "liabilities",
+    "liability",
+    "liable",
+    "settlement",
+    "settlements",
+    "warranty",
+];
+
+/// Flags plural "damages" where singular "damage" is meant, i.e. outside the legal sense of
+/// monetary compensation.
+pub struct Damages {
+    expr: Box<dyn Expr>,
+}
+
+impl Default for Damages {
+    fn default() -> Self {
+        Self {
+            expr: Box::new(SequenceExpr::word_set(&["damages", "damage"])),
+        }
+    }
+}
+
+impl ExprLinter for Damages {
+    type Unit = Sentence;
+
+    fn expr(&self) -> &dyn Expr {
+        self.expr.as_ref()
+    }
+
+    /// Decides whether a matched "damage"/"damages" token should be flagged.
+    ///
+    /// `ctx` supplies the tokens before and after the match; without it nothing is flagged.
+    /// Returns `None` when the match is singular, reads as a verb, follows a form of "pay"
+    /// closely enough, or shares its context with one of the `KEYWORDS`.
+    fn match_to_lint_with_context(
+        &self,
+        toks: &[Token],
+        src: &[char],
+        ctx: Option<(&[Token], &[Token])>,
+    ) -> Option<Lint> {
+        let (pretoks, postoks) = ctx?;
+        let damage_idx = 0;
+        let damage_tok = &toks[damage_idx];
+        let damage_span = damage_tok.span;
+        let damage_chars = damage_span.get_content(src);
+
+        // The expression also matches singular "damage", which is correct both as a noun and
+        // as a verb lemma, so it is never flagged.
+        if damage_chars.eq_ignore_ascii_case_chars(&['d', 'a', 'm', 'a', 'g', 'e']) {
+            return None;
+        }
+
+        // If the word after "damages" is a noun or object pronoun, it's the object and "damages" is a verb.
+        let next_word_tok = match postoks {
+            [sp, w, ..] if sp.kind.is_whitespace() && w.kind.is_word() => Some(w),
+            _ => None,
+        };
+
+        if next_word_tok.is_some_and(|nwt| nwt.kind.is_object_pronoun() || nwt.kind.is_noun()) {
+            return None;
+        }
+
+        // The word before "damages" may help us narrow down whether it's a noun or verb.
+        // A slice pattern is used instead of indexing with `len() - 2`, which underflows (and
+        // panics in debug builds) when fewer than two tokens precede "damages".
+        let prev_word_tok = match pretoks {
+            [.., w, sp] if sp.kind.is_whitespace() && w.kind.is_word() => Some(w),
+            _ => None,
+        };
+
+        #[derive(PartialEq)]
+        enum CanPrecede {
+            Unknown,
+            NeitherNounNorVerb,
+            Noun,
+            Verb,
+            EitherNounOrVerb,
+        }
+
+        // Try to disambiguate whether "damages" is a noun or verb.
+        let can_precede = prev_word_tok.map_or(CanPrecede::Unknown, |prev_word| {
+            let mut can: CanPrecede = CanPrecede::Unknown;
+
+            if (prev_word.kind.is_adjective()
+                || prev_word.kind.is_determiner()
+                || prev_word.kind.is_preposition())
+                && !prev_word
+                    .span
+                    .get_content(src)
+                    .eq_ignore_ascii_case_chars(&['t', 'o'])
+            {
+                can = CanPrecede::Noun;
+            }
+
+            if prev_word.kind.is_auxiliary_verb() {
+                can = if can == CanPrecede::Noun {
+                    CanPrecede::EitherNounOrVerb
+                } else {
+                    CanPrecede::Verb
+                };
+            }
+
+            can
+        });
+
+        if can_precede == CanPrecede::Verb {
+            return None;
+        }
+
+        // We now know "damages" isn't unambiguously a verb, but it could still be an ambiguous verb-noun.
+        // Or it could be a noun. Or it could still be unknown.
+
+        // Check if it's the object of the verb "to pay"
+        // NOTE(review): only a match starting fewer than `pretoks.len() / 2` window positions
+        // from the end of `pretoks` suppresses the lint - confirm this distance is intended.
+        let pay_det = SequenceExpr::word_set(&["paid", "pay", "paying", "pays"])
+            .then_optional(SequenceExpr::default().t_ws().then_determiner())
+            .t_ws();
+
+        if pretoks
+            .windows(2)
+            .enumerate()
+            .rev()
+            .take_while(|(i, _)| pay_det.run(*i, pretoks, src).is_none())
+            .count()
+            < pretoks.len() / 2
+        {
+            return None;
+        }
+
+        // Check all the tokens for words that are used in the legal compensation context
+        // TODO: this fails when "damages" is misused in a disclaimer:
+        // 1. "If you encounter any issues, errors, or damages resulting from the use of these templates,
+        //     the repository author assumes no responsibility or liability."
+        // 2. "The author will not be liable for any losses and/or damages in connection with the use of our website"
+        if pretoks.iter().any(|t| {
+            t.span
+                .get_content(src)
+                .eq_any_ignore_ascii_case_str(KEYWORDS)
+        }) || postoks.iter().any(|t| {
+            t.span
+                .get_content(src)
+                .eq_any_ignore_ascii_case_str(KEYWORDS)
+        }) {
+            return None;
+        }
+
+        Some(Lint {
+            span: damage_span,
+            lint_kind: LintKind::Usage,
+            suggestions: vec![Suggestion::replace_with_match_case(
+                damage_chars[..6].to_vec(),
+                damage_chars,
+            )],
+            message: "Singular `damage` is correct when not referring to a court case.".to_string(),
+            ..Default::default()
+        })
+    }
+
+    fn description(&self) -> &str {
+        "Checks for plural `damages` not in the context of a court case."
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Damages;
+    use crate::linting::tests::{assert_no_lints, assert_suggestion_result};
+
+    // Examples of the error from GitHub:
+
+    #[test]
+    fn fix_robust_against_damages_by_prev_preposition() {
+        assert_suggestion_result(
+            "Flow networks robust against damages are simple model networks described in a series of publications by Kaluza et al.",
+            Damages::default(),
+            "Flow networks robust against damage are simple model networks described in a series of publications by Kaluza et al.",
+        );
+    }
+
+    #[test]
+    fn fix_vehicle_damages_on_a_car_by_fall_through() {
+        assert_suggestion_result(
+            "POC to select vehicle damages on a car and mark the severity - sudheeshcm/vehicle-damage-selector.",
+            Damages::default(),
+            "POC to select vehicle damage on a car and mark the severity - sudheeshcm/vehicle-damage-selector.",
+        );
+    }
+
+    #[test]
+    fn fix_damages_on_mangoes() {
+        assert_suggestion_result(
+            "This is a web application that detects damages on mangoes using a TensorFlow model with Django as the frontend framework",
+            Damages::default(),
+            "This is a web application that detects damage on mangoes using a TensorFlow model with Django as the frontend framework",
+        );
+    }
+
+    #[test]
+    fn fix_types_of_damages_of_roads() {
+        assert_suggestion_result(
+            "Detecting different types of damages of roads like cracks and potholes for the given image/video of the road.",
+            Damages::default(),
+            "Detecting different types of damage of roads like cracks and potholes for the given image/video of the road.",
+        );
+    }
+
+    // Examples from GitHub where it seems to be used correctly in regard to financial compensation:
+
+    // Regression test: sentence-initial "damages" has no preceding tokens to inspect.
+    #[test]
+    fn ignore_sentence_initial_damages_with_legal_keyword() {
+        assert_no_lints(
+            "Damages, as claimed in the case, were awarded.",
+            Damages::default(),
+        );
+    }
+
+    // TODO: would the word "calculate" before "damages" be a good heuristic?
+    #[test]
+    fn ignore_damages_in_lost_chance_cases() {
+        assert_no_lints(
+            "Code used for calculating damages in lost chance cases.",
+            Damages::default(),
+        );
+    }
+
+    #[test]
+    fn ignore_claim_for_damages() {
+        assert_no_lints(
+            "Where the dispute involves a claim for damages in respect of a motor accident for cost of rental of a replacement vehicle",
+            Damages::default(),
+        );
+    }
+
+    #[test]
+    fn ignore_pay_damages() {
+        assert_no_lints(
+            "Under this section, the Commercial Contributor would have to
+            defend claims against the other Contributors related to those
+            performance claims and warranties, and if a court requires any other
+            Contributor to pay any damages as a result, the Commercial Contributor
+            must pay those damages.",
+            Damages::default(),
+        );
+    }
+
+    // Examples from GitHub where it's not an error but a verb:
+
+    #[test]
+    fn ignore_damages_them() {
+        assert_no_lints(
+            "Profiles pb's and damages them when their runtime goes over a set value - sirhamsteralot/HaE-PBLimiter.",
+            Damages::default(),
+        );
+    }
+
+    #[test]
+    fn ignore_damages_firefox() {
+        assert_no_lints(
+            "Opening Wayland-native terminal damages Firefox",
+            Damages::default(),
+        );
+    }
+
+    #[test]
+    fn ignore_damages_underlaying_windows() {
+        assert_no_lints(
+            "Open File Requester damages underlaying windows when moved",
+            Damages::default(),
+        );
+    }
+
+    // Examples from GitHub that are too hard to call - maybe they are talking about financial compensation?
+
+    #[test]
+    #[ignore = "too close to call for now"]
+    fn ignore_estimate_the_damages_and_the_damages_result() {
+        assert_no_lints(
+            "The goal is to estimate the damages of each link in the Graph object using the Damages result (estimating the damages for each segment of a Network).",
+            Damages::default(),
+        );
+    }
+
+    // https://github.com/dpasmat/cartel-damages-inference
+    #[test]
+    #[ignore = "too close to call for now"]
+    fn ignore_damages_inference() {
+        assert_no_lints(
+            "This repository contains code to conduct statistical inference in cartel damages estimation. It will be updated to include a Stata .do file which approximates the standard error of total damages from a fixed effects panel data model, using the delta method.",
+            Damages::default(),
+        );
+    }
+
+    #[test]
+    #[ignore = "too close to call for now"]
+    fn ignore_received_errors() {
+        assert_no_lints(
+            "Financial damages caused by received errors $$$$.",
+            Damages::default(),
+        );
+    }
+
+    #[test]
+    #[ignore = "too close to call for now"]
+    fn ignore_asset_level_damages() {
+        assert_no_lints(
+            "It would be useful to be able to see asset-level damages after running FDA 2.0.",
+            Damages::default(),
+        );
+    }
+}
diff --git a/harper-core/src/linting/lint_group.rs b/harper-core/src/linting/lint_group.rs
index 3abe208df..aca532f54 100644
--- a/harper-core/src/linting/lint_group.rs
+++ b/harper-core/src/linting/lint_group.rs
@@ -49,6 +49,7 @@ use super::correct_number_suffix::CorrectNumberSuffix;
 use super::criteria_phenomena::CriteriaPhenomena;
 use super::cure_for::CureFor;
 use super::currency_placement::CurrencyPlacement;
+use super::damages::Damages;
 use super::despite_of::DespiteOf;
 use super::didnt::Didnt;
 use super::discourse_markers::DiscourseMarkers;
@@ -630,6 +631,11 @@ impl LintGroup {
         );
         out.config.set_rule_enabled("DisjointPrefixes", true);
 
+        // add_chunk_expr_linter doesn't support the `Sentence` `Unit` and there is not yet any
+        // `add_sentence_expr_linter`
+        out.add("Damages", Damages::default());
+        out.config.set_rule_enabled("Damages", true);
+
         out.add_chunk_expr_linter("TransposedSpace", TransposedSpace::new(dictionary.clone()));
         out.config.set_rule_enabled("TransposedSpace", true);
 
diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs
index 60daa00ce..83a2546b5 100644
--- a/harper-core/src/linting/mod.rs
+++ b/harper-core/src/linting/mod.rs
@@ -42,6 +42,7 @@ mod correct_number_suffix;
 mod criteria_phenomena;
 mod cure_for;
 mod currency_placement;
+mod damages;
 mod dashes;
 mod despite_of;
 mod determiner_without_noun;
@@ -302,7 +303,7 @@ pub mod tests {
     pub fn assert_lint_count(text: &str, mut linter: impl Linter, count: usize) {
         let test = Document::new_markdown_default_curated(text);
         let lints = linter.lint(&test);
-        dbg!(&lints);
+        // dbg!(&lints);
         if lints.len() != count {
             panic!(
                 "Expected \"{text}\" to create {count} lints, but it created {}.",
@@ -493,13 +494,13 @@ pub mod tests {
         let lints = linter.lint(&test);
 
         // Just check the first lint for now
-        if let Some(lint) = lints.first() {
-            if lint.message != expected_message {
-                panic!(
-                    "Expected lint message \"{expected_message}\", but got \"{}\"",
-                    lint.message
-                );
-            }
+        if let Some(lint) = lints.first()
+            && lint.message != expected_message
+        {
+            panic!(
+                "Expected lint message \"{expected_message}\", but got \"{}\"",
+                lint.message
+            );
         }
     }
 
@@ -520,8 +521,8 @@ pub mod tests {
                 if let Some(sug) = lint.suggestions.get(n) {
                     sug.apply(lint.span, &mut text_chars);
 
-                    let transformed_str: String = text_chars.iter().collect();
-                    dbg!(transformed_str);
+                    // let transformed_str: String = text_chars.iter().collect();
+                    // dbg!(transformed_str);
                 } else {
                     break;
                 }