cursorless-dev
diff --git a/‎cursorless-talon/src/modifiers/scopes.py
Lines changed: 12 additions & 11 deletions b/‎cursorless-talon/src/modifiers/scopes.py
Lines changed: 12 additions & 11 deletions
diff --git a/‎packages/common/src/types/command/PartialTargetDescriptor.types.ts
Lines changed: 6 additions & 5 deletions b/‎packages/common/src/types/command/PartialTargetDescriptor.types.ts
Lines changed: 6 additions & 5 deletions
diff --git a/‎packages/cursorless-engine/package.json
Lines changed: 3 additions & 1 deletion b/‎packages/cursorless-engine/package.json
Lines changed: 3 additions & 1 deletion
diff --git a/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/ScopeHandlerFactoryImpl.ts
Lines changed: 5 additions & 2 deletions b/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/ScopeHandlerFactoryImpl.ts
Lines changed: 5 additions & 2 deletions
diff --git a/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/SentenceScopeHandler/SentenceScopeHandler.ts
Lines changed: 47 additions & 0 deletions b/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/SentenceScopeHandler/SentenceScopeHandler.ts
Lines changed: 47 additions & 0 deletions
diff --git a/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/SentenceScopeHandler/SentenceSegmenter.ts
Lines changed: 61 additions & 0 deletions b/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/SentenceScopeHandler/SentenceSegmenter.ts
Lines changed: 61 additions & 0 deletions
diff --git a/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler.ts renamed to ‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler/WordScopeHandler.ts
Lines changed: 4 additions & 4 deletions b/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler.ts renamed to ‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler/WordScopeHandler.ts
Lines changed: 4 additions & 4 deletions
diff --git a/‎packages/cursorless-engine/src/scopeHandlers/WordScopeHandler/WordTokenizer.ts renamed to ‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler/WordTokenizer.ts
Lines changed: 2 additions & 2 deletions b/‎packages/cursorless-engine/src/scopeHandlers/WordScopeHandler/WordTokenizer.ts renamed to ‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler/WordTokenizer.ts
Lines changed: 2 additions & 2 deletions
diff --git a/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/index.ts
Lines changed: 4 additions & 2 deletions b/‎packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/index.ts
Lines changed: 4 additions & 2 deletions
diff --git a/‎packages/cursorless-engine/src/test/fixtures/sentenceSegmeter.fixture.ts
Lines changed: 59 additions & 0 deletions b/‎packages/cursorless-engine/src/test/fixtures/sentenceSegmeter.fixture.ts
Lines changed: 59 additions & 0 deletions
@@ -57,26 +57,27 @@
     "tags": "xmlBothTags",
     "start tag": "xmlStartTag",
     "end tag": "xmlEndTag",
+    # LaTeX
+    "part": "part",
+    "chapter": "chapter",
+    "subsection": "subSection",
+    "subsubsection": "subSubSection",
+    "paragraph": "namedParagraph",
+    "subparagraph": "subParagraph",
+    "environment": "environment",
     # Text-based scope types
     "char": "character",
     "word": "word",
+    "token": "token",
     "identifier": "identifier",
+    "line": "line",
+    "sentence": "sentence",
     "block": "paragraph",
-    "cell": "notebookCell",
     "file": "document",
-    "line": "line",
     "paint": "nonWhitespaceSequence",
     "short paint": "boundedNonWhitespaceSequence",
     "link": "url",
-    "token": "token",
-    # LaTeX
-    "part": "part",
-    "chapter": "chapter",
-    "subsection": "subSection",
-    "subsubsection": "subSubSection",
-    "paragraph": "namedParagraph",
-    "subparagraph": "subParagraph",
-    "environment": "environment",
+    "cell": "notebookCell",
 }
 
 
 
@@ -120,17 +120,18 @@ export type SimpleScopeTypeType =
   | "subParagraph"
   | "environment"
   // Text based scopes
+  | "character"
+  | "word"
   | "token"
+  | "identifier"
   | "line"
-  | "notebookCell"
+  | "sentence"
   | "paragraph"
   | "document"
-  | "character"
-  | "word"
-  | "identifier"
   | "nonWhitespaceSequence"
   | "boundedNonWhitespaceSequence"
-  | "url";
+  | "url"
+  | "notebookCell";
 
 export interface SimpleScopeType {
   type: SimpleScopeTypeType;
 
@@ -18,13 +18,15 @@
     "itertools": "^1.7.1",
     "lodash": "^4.17.21",
     "node-html-parser": "^5.3.3",
-    "zod": "3.21.4"
+    "zod": "3.21.4",
+    "sbd": "^1.0.19"
   },
   "devDependencies": {
     "@types/js-yaml": "^4.0.2",
     "@types/lodash": "4.14.181",
     "@types/mocha": "^8.0.4",
     "@types/sinon": "^10.0.2",
+    "@types/sbd": "^1.0.3",
     "js-yaml": "^4.1.0",
     "mocha": "^8.1.3",
     "sinon": "^11.1.1"
 
@@ -9,6 +9,7 @@ import {
   OneOfScopeHandler,
   ParagraphScopeHandler,
   ScopeHandlerFactory,
+  SentenceScopeHandler,
   TokenScopeHandler,
   UrlScopeHandler,
   WordScopeHandler,
@@ -53,12 +54,14 @@ export class ScopeHandlerFactoryImpl implements ScopeHandlerFactory {
         return new IdentifierScopeHandler(this, scopeType, languageId);
       case "line":
         return new LineScopeHandler(scopeType, languageId);
+      case "sentence":
+        return new SentenceScopeHandler(this, scopeType, languageId);
+      case "paragraph":
+        return new ParagraphScopeHandler(scopeType, languageId);
       case "document":
         return new DocumentScopeHandler(scopeType, languageId);
       case "oneOf":
         return OneOfScopeHandler.create(this, scopeType, languageId);
-      case "paragraph":
-        return new ParagraphScopeHandler(scopeType, languageId);
       case "nonWhitespaceSequence":
         return new NonWhitespaceSequenceScopeHandler(
           this,
 
@@ -0,0 +1,47 @@
+import { Direction, Range } from "@cursorless/common";
+import { imap } from "itertools";
+import { NestedScopeHandler } from "..";
+import { TokenTarget } from "../../../targets";
+import type { TargetScope } from "../scope.types";
+import SentenceSegmenter from "./SentenceSegmenter";
+import { MatchedText } from "../../../../util/regex";
+
+/**
+ * Scope handler for the `sentence` scope type.
+ *
+ * Declares `paragraph` as its iteration scope type and, for the search scope
+ * it is handed, produces one scope per sentence that the
+ * {@link SentenceSegmenter} reports for that scope's text.
+ */
+export default class SentenceScopeHandler extends NestedScopeHandler {
+  public readonly scopeType = { type: "sentence" } as const;
+  public readonly iterationScopeType = { type: "paragraph" } as const;
+  private segmenter = new SentenceSegmenter();
+
+  /**
+   * Produces the sentence scopes found in the text covered by `domain`.
+   *
+   * When `direction` is `"forward"` the scopes are produced lazily in
+   * document order; otherwise they are all materialised and returned in
+   * reverse order. Each scope uses the sentence's own range as both its
+   * domain and its content range, and exposes a single `TokenTarget`.
+   */
+  protected generateScopesInSearchScope(
+    direction: Direction,
+    { editor, domain }: TargetScope,
+  ): Iterable<TargetScope> {
+    const { document } = editor;
+    // Segment indices are relative to the domain text, so shift them by the
+    // document offset at which the domain begins.
+    const baseOffset = document.offsetAt(domain.start);
+
+    const toScope = ({ index, text }: MatchedText): TargetScope => {
+      const startOffset = baseOffset + index;
+      const contentRange = new Range(
+        document.positionAt(startOffset),
+        document.positionAt(startOffset + text.length),
+      );
+
+      return {
+        editor,
+        domain: contentRange,
+        getTargets: (isReversed) => [
+          new TokenTarget({ editor, contentRange, isReversed }),
+        ],
+      };
+    };
+
+    const segments = this.segmenter.segment(document.getText(domain));
+
+    if (direction === "forward") {
+      return imap(segments, toScope);
+    }
+
+    return Array.from(segments, toScope).reverse();
+  }
+}
@@ -0,0 +1,61 @@
+import * as sbd from "sbd";
+import { MatchedText, matchRegex, testRegex } from "../../../../util/regex";
+
+// A sentence starts at its first Unicode letter (\p{L}); text before it is skipped
+const leadingOffsetRegex = /\p{L}/u;
+// A line with no letters is invalid and breaks sentences; the capture group makes split() keep it
+const invalidLineRegex = /(\n[^\p{L}]*\n)/gu;
+
+const options: sbd.Options = {
+  ["newline_boundaries"]: false, // per sbd docs: do not force a split at newlines
+  ["preserve_whitespace"]: true, // per sbd docs: keep literal whitespace in the output
+};
+
+/**
+ * Splits text into sentence segments.
+ *
+ * Sentences come from `sbd`; each one is then broken further wherever it
+ * contains a line without any letters.
+ */
+export default class SentenceSegmenter {
+  /**
+   * Lazily yields the sentence segments of `text`.
+   *
+   * @param text the text to segment
+   * @returns segments whose `index` is the running character offset of the
+   * segment, accumulated from the lengths of all preceding pieces (this
+   * presumably relies on sbd returning the input text unmodified — confirm)
+   */
+  *segment(text: string): Iterable<MatchedText> {
+    let offset = 0;
+
+    for (const sentence of sbd.sentences(text, options)) {
+      // The regex has a capture group, so the letterless lines themselves
+      // are part of the split result and contribute to the offset.
+      for (const piece of sentence.split(invalidLineRegex)) {
+        const start = offset;
+        offset += piece.length;
+
+        if (isInvalidLine(piece)) {
+          continue;
+        }
+
+        const segment = createSegment(piece, start);
+        if (segment != null) {
+          yield segment;
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Builds the segment for one piece of text, trimmed so that it starts at its
+ * first letter and ends before any trailing whitespace.
+ *
+ * @param sentence candidate text
+ * @param index offset of the first character of `sentence`, as tracked by
+ * the caller
+ * @returns the trimmed text together with its adjusted offset, or
+ * `undefined` when `sentence` contains no letter
+ */
+function createSegment(
+  sentence: string,
+  index: number,
+): MatchedText | undefined {
+  const firstLetter = matchRegex(leadingOffsetRegex, sentence);
+
+  if (firstLetter == null) {
+    return undefined;
+  }
+
+  // Drop everything before the first letter and advance the offset to match.
+  const skipped = firstLetter.index!;
+
+  return {
+    text: sentence.slice(skipped).trimEnd(),
+    index: index + skipped,
+  };
+}
+
+/**
+ * Reports whether `text` matches the letterless-line pattern
+ * (`invalidLineRegex`).
+ */
+function isInvalidLine(text: string): boolean {
+  const hasLetterlessLine = testRegex(invalidLineRegex, text);
+  return hasLetterlessLine;
+}
@@ -1,9 +1,9 @@
 import { Range, TextEditor } from "@cursorless/common";
-import { NestedScopeHandler } from ".";
-import WordTokenizer from "../../../scopeHandlers/WordScopeHandler/WordTokenizer";
+import { NestedScopeHandler } from "..";
+import WordTokenizer from "./WordTokenizer";
 import { Direction } from "@cursorless/common";
-import { SubTokenWordTarget } from "../../targets";
-import type { TargetScope } from "./scope.types";
+import { SubTokenWordTarget } from "../../../targets";
+import type { TargetScope } from "../scope.types";
 
 export default class WordScopeHandler extends NestedScopeHandler {
   public readonly scopeType = { type: "word" } as const;
 
@@ -1,5 +1,5 @@
-import { getMatcher } from "../../tokenizer";
-import { matchText } from "../../util/regex";
+import { getMatcher } from "../../../../tokenizer";
+import { matchText } from "../../../../util/regex";
 
 const CAMEL_REGEX = /\p{Lu}?\p{Ll}+|\p{Lu}+(?!\p{Ll})|\p{N}+/gu;
 
 
@@ -6,8 +6,8 @@ export * from "./IdentifierScopeHandler";
 export { default as IdentifierScopeHandler } from "./IdentifierScopeHandler";
 export * from "./CharacterScopeHandler";
 export { default as CharacterScopeHandler } from "./CharacterScopeHandler";
-export * from "./WordScopeHandler";
-export { default as WordScopeHandler } from "./WordScopeHandler";
+export * from "./WordScopeHandler/WordScopeHandler";
+export { default as WordScopeHandler } from "./WordScopeHandler/WordScopeHandler";
 export * from "./TokenScopeHandler";
 export { default as TokenScopeHandler } from "./TokenScopeHandler";
 export * from "./DocumentScopeHandler";
@@ -17,6 +17,8 @@ export * from "./OneOfScopeHandler";
 export { default as OneOfScopeHandler } from "./OneOfScopeHandler";
 export * from "./ParagraphScopeHandler";
 export { default as ParagraphScopeHandler } from "./ParagraphScopeHandler";
+export * from "./SentenceScopeHandler/SentenceScopeHandler";
+export { default as SentenceScopeHandler } from "./SentenceScopeHandler/SentenceScopeHandler";
 export * from "./RegexScopeHandler";
 export * from "./ScopeHandlerFactory";
 export * from "./ScopeHandlerFactoryImpl";
@@ -0,0 +1,59 @@
+interface Fixture {
+  input: string; // raw text of one test case
+  expectedOutput: string[]; // sentence texts expected for `input`, in order
+}
+
+export const sentenceSegmenterFixture: Fixture[] = [
+  {
+    input: "Foo foo. Bar? Baz! bongo", // ".", "?" and "!" each end a sentence
+    expectedOutput: ["Foo foo.", "Bar?", "Baz!", "bongo"],
+  },
+  {
+    input: "Hello, Mr. Anderson.", // the dot in "Mr." is not a sentence end
+    expectedOutput: ["Hello, Mr. Anderson."],
+  },
+  {
+    input: "Visit example.com now!", // dot inside a domain name
+    expectedOutput: ["Visit example.com now!"],
+  },
+  {
+    input: "Foo.bar", // dot with no following whitespace
+    expectedOutput: ["Foo.bar"],
+  },
+  {
+    input: " Foo ", // surrounding whitespace is dropped
+    expectedOutput: ["Foo"],
+  },
+  {
+    input: "1Foo", // leading digit is dropped
+    expectedOutput: ["Foo"],
+  },
+  {
+    input: "_foo", // leading underscore is dropped
+    expectedOutput: ["foo"],
+  },
+  {
+    input: " Foo \n\tbar ", // a single newline does not split
+    expectedOutput: ["Foo \n\tbar"],
+  },
+  {
+    input: "* Foo \nbar. *", // leading and trailing "*" are dropped
+    expectedOutput: ["Foo \nbar."],
+  },
+  {
+    input: "Foo \n*\nbar", // a line without letters splits
+    expectedOutput: ["Foo", "bar"],
+  },
+  {
+    input: "Foo\n\nbar", // an empty line splits
+    expectedOutput: ["Foo", "bar"],
+  },
+  {
+    input: "Foo bar...", // trailing ellipsis stays attached
+    expectedOutput: ["Foo bar..."],
+  },
+  {
+    input: "Å\nä\nö", // non-ASCII letters count as letters
+    expectedOutput: ["Å\nä\nö"],
+  },
+];