In hunspell, handle apostrophes and ignore words with non-English letters

maxbrunsfeld · maxbrunsfeld · commit 2fa7057a9460 · 2016-01-07T13:19:12.000-08:00
diff --git a/spec/spellchecker-spec.coffee b/spec/spellchecker-spec.coffee
@@ -35,6 +35,17 @@ describe "SpellChecker", ->
         {start: 20, end: 25},
       ]
 
+    it "doesn't consider words to be misspelled just because they have non-english characters", ->
+      expect(SpellChecker.checkSpelling("cliché")).toEqual []
+      expect(SpellChecker.checkSpelling("águila")).toEqual []
+      expect(SpellChecker.checkSpelling("águila cliché")).toEqual []
+
+    it "handles words with apostrophes", ->
+      string = "doesn't isn't aint hasn't"
+      expect(SpellChecker.checkSpelling(string)).toEqual [
+        {start: string.indexOf("aint"), end: string.indexOf("aint") + 4}
+      ]
+
     it "handles invalid inputs", ->
       expect(SpellChecker.checkSpelling("")).toEqual []
       expect(-> SpellChecker.checkSpelling()).toThrow("Bad argument")
diff --git a/src/spellchecker_hunspell.cc b/src/spellchecker_hunspell.cc
@@ -63,28 +63,49 @@ std::vector<MisspelledRange> HunspellSpellchecker::CheckSpelling(const uint16_t
 
   std::vector<char> utf8_buffer(256);
 
-  size_t word_start = 0;
-  bool within_word = false;
-  for (size_t i = 0; i < utf16_length; i++) {
+  enum {
+    unknown,
+    in_separator,
+    in_word,
+  } state = in_separator;
+
+  for (size_t word_start = 0, i = 0; i < utf16_length; i++) {
     uint16_t c = utf16_text[i];
-    bool is_word_character = iswalpha(c);
-    if (within_word) {
-      if (!is_word_character) {
-        within_word = false;
-
-        bool converted = TranscodeUTF16ToUTF8(transcoder, (char *)utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start);
-        if (converted) {
-          if (hunspell->spell(utf8_buffer.data()) == 0) {
-            MisspelledRange range;
-            range.start = word_start;
-            range.end = i;
-            result.push_back(range);
+
+    switch (state) {
+      case unknown:
+        if (iswpunct(c) || iswspace(c)) {
+          state = in_separator;
+        }
+        break;
+
+      case in_separator:
+        if (iswalpha(c)) {
+          word_start = i;
+          state = in_word;
+        } else if (!iswpunct(c) && !iswspace(c)) {
+          state = unknown;
+        }
+        break;
+
+      case in_word:
+        if (c == '\'' && iswalpha(utf16_text[i + 1])) {
+          i++;
+        } else if (c == 0 || iswpunct(c) || iswspace(c)) {
+          state = in_separator;
+          bool converted = TranscodeUTF16ToUTF8(transcoder, (char *)utf8_buffer.data(), utf8_buffer.size(), utf16_text + word_start, i - word_start);
+          if (converted) {
+            if (hunspell->spell(utf8_buffer.data()) == 0) {
+              MisspelledRange range;
+              range.start = word_start;
+              range.end = i;
+              result.push_back(range);
+            }
           }
+        } else if (!iswalpha(c)) {
+          state = unknown;
         }
-      }
-    } else if (is_word_character) {
-      word_start = i;
-      within_word = true;
+        break;
     }
   }