Skip to content

Commit 921f8dd

Browse files
Implemented sentence scope (#1595)
Fixes #19. While I was there, I also sorted the textual scopes. ## Checklist - [x] I have added [tests](https://www.cursorless.org/docs/contributing/test-case-recorder/) - [ ] I have updated the [docs](https://github.com/cursorless-dev/cursorless/tree/main/docs) and [cheatsheet](https://github.com/cursorless-dev/cursorless/tree/main/cursorless-talon/src/cheatsheet) - [ ] I have not broken the cheatsheet --------- Co-authored-by: Pokey Rule <[email protected]>
1 parent 69679de commit 921f8dd

File tree

20 files changed

+446
-28
lines changed

20 files changed

+446
-28
lines changed

cursorless-talon/src/modifiers/scopes.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -57,26 +57,27 @@
5757
"tags": "xmlBothTags",
5858
"start tag": "xmlStartTag",
5959
"end tag": "xmlEndTag",
60+
# LaTeX
61+
"part": "part",
62+
"chapter": "chapter",
63+
"subsection": "subSection",
64+
"subsubsection": "subSubSection",
65+
"paragraph": "namedParagraph",
66+
"subparagraph": "subParagraph",
67+
"environment": "environment",
6068
# Text-based scope types
6169
"char": "character",
6270
"word": "word",
71+
"token": "token",
6372
"identifier": "identifier",
73+
"line": "line",
74+
"sentence": "sentence",
6475
"block": "paragraph",
65-
"cell": "notebookCell",
6676
"file": "document",
67-
"line": "line",
6877
"paint": "nonWhitespaceSequence",
6978
"short paint": "boundedNonWhitespaceSequence",
7079
"link": "url",
71-
"token": "token",
72-
# LaTeX
73-
"part": "part",
74-
"chapter": "chapter",
75-
"subsection": "subSection",
76-
"subsubsection": "subSubSection",
77-
"paragraph": "namedParagraph",
78-
"subparagraph": "subParagraph",
79-
"environment": "environment",
80+
"cell": "notebookCell",
8081
}
8182

8283

packages/common/src/types/command/PartialTargetDescriptor.types.ts

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -120,17 +120,18 @@ export type SimpleScopeTypeType =
120120
| "subParagraph"
121121
| "environment"
122122
// Text based scopes
123+
| "character"
124+
| "word"
123125
| "token"
126+
| "identifier"
124127
| "line"
125-
| "notebookCell"
128+
| "sentence"
126129
| "paragraph"
127130
| "document"
128-
| "character"
129-
| "word"
130-
| "identifier"
131131
| "nonWhitespaceSequence"
132132
| "boundedNonWhitespaceSequence"
133-
| "url";
133+
| "url"
134+
| "notebookCell";
134135

135136
export interface SimpleScopeType {
136137
type: SimpleScopeTypeType;

packages/cursorless-engine/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,15 @@
1818
"itertools": "^1.7.1",
1919
"lodash": "^4.17.21",
2020
"node-html-parser": "^5.3.3",
21-
"zod": "3.21.4"
21+
"zod": "3.21.4",
22+
"sbd": "^1.0.19"
2223
},
2324
"devDependencies": {
2425
"@types/js-yaml": "^4.0.2",
2526
"@types/lodash": "4.14.181",
2627
"@types/mocha": "^8.0.4",
2728
"@types/sinon": "^10.0.2",
29+
"@types/sbd": "^1.0.3",
2830
"js-yaml": "^4.1.0",
2931
"mocha": "^8.1.3",
3032
"sinon": "^11.1.1"

packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/ScopeHandlerFactoryImpl.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import {
99
OneOfScopeHandler,
1010
ParagraphScopeHandler,
1111
ScopeHandlerFactory,
12+
SentenceScopeHandler,
1213
TokenScopeHandler,
1314
UrlScopeHandler,
1415
WordScopeHandler,
@@ -53,12 +54,14 @@ export class ScopeHandlerFactoryImpl implements ScopeHandlerFactory {
5354
return new IdentifierScopeHandler(this, scopeType, languageId);
5455
case "line":
5556
return new LineScopeHandler(scopeType, languageId);
57+
case "sentence":
58+
return new SentenceScopeHandler(this, scopeType, languageId);
59+
case "paragraph":
60+
return new ParagraphScopeHandler(scopeType, languageId);
5661
case "document":
5762
return new DocumentScopeHandler(scopeType, languageId);
5863
case "oneOf":
5964
return OneOfScopeHandler.create(this, scopeType, languageId);
60-
case "paragraph":
61-
return new ParagraphScopeHandler(scopeType, languageId);
6265
case "nonWhitespaceSequence":
6366
return new NonWhitespaceSequenceScopeHandler(
6467
this,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { Direction, Range } from "@cursorless/common";
2+
import { imap } from "itertools";
3+
import { NestedScopeHandler } from "..";
4+
import { TokenTarget } from "../../../targets";
5+
import type { TargetScope } from "../scope.types";
6+
import SentenceSegmenter from "./SentenceSegmenter";
7+
import { MatchedText } from "../../../../util/regex";
8+
9+
/**
 * Scope handler for the `"sentence"` scope type.
 *
 * The text of a search scope is split into sentences by a
 * {@link SentenceSegmenter}; each sentence becomes a scope whose domain and
 * content range are the sentence's range in the document.
 *
 * The iteration scope type is `"paragraph"`.
 * NOTE(review): presumably `NestedScopeHandler` calls
 * `generateScopesInSearchScope` once per paragraph scope — confirm against the
 * base class.
 */
export default class SentenceScopeHandler extends NestedScopeHandler {
  public readonly scopeType = { type: "sentence" } as const;
  public readonly iterationScopeType = { type: "paragraph" } as const;
  private readonly segmenter = new SentenceSegmenter();

  /**
   * Yields one scope per sentence found in the text of the search scope.
   *
   * @param direction `"forward"` yields sentences in document order (lazily);
   * any other direction yields them in reverse document order.
   * @param searchScope The scope whose `domain` text is segmented.
   * @returns The sentence scopes contained in the search scope's domain.
   */
  protected generateScopesInSearchScope(
    direction: Direction,
    { editor, domain }: TargetScope,
  ): Iterable<TargetScope> {
    // Sentence indices are relative to `text`, so remember where `text`
    // starts in the document in order to convert them back to positions.
    const offset = editor.document.offsetAt(domain.start);
    const text = editor.document.getText(domain);

    const sentenceToScope = (sentence: MatchedText): TargetScope => {
      const startOffset = offset + sentence.index;
      const contentRange = new Range(
        editor.document.positionAt(startOffset),
        editor.document.positionAt(startOffset + sentence.text.length),
      );
      return {
        editor,
        domain: contentRange,
        getTargets: (isReversed) => [
          new TokenTarget({
            editor,
            contentRange,
            isReversed,
          }),
        ],
      };
    };

    const sentences = this.segmenter.segment(text);

    // Going forward we can stay lazy; going backward every sentence has to be
    // materialised before the order can be reversed.
    return direction === "forward"
      ? imap(sentences, sentenceToScope)
      : Array.from(sentences, sentenceToScope).reverse();
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import * as sbd from "sbd";
2+
import { MatchedText, matchRegex, testRegex } from "../../../../util/regex";
3+
4+
// A sentence starts with a letter: everything before the first Unicode letter
// (`\p{L}`) of a segment is skipped.
const leadingOffsetRegex = /\p{L}/u;
// A line with no letters is invalid and breaks sentences. The capture group
// makes `String.prototype.split` keep the matched separator in its result, so
// the parts still add up to the original text and offsets can be tracked.
const invalidLineRegex = /(\n[^\p{L}]*\n)/gu;

// Options passed to `sbd.sentences`:
// - `newline_boundaries: false`: a newline alone does not end a sentence.
// - `preserve_whitespace: true`: keep the literal whitespace of the input so
//   that character offsets computed from the returned sentences stay valid.
const options: sbd.Options = {
  ["newline_boundaries"]: false,
  ["preserve_whitespace"]: true,
};
13+
14+
/**
 * Splits a text into sentences and reports where each sentence starts.
 */
export default class SentenceSegmenter {
  /**
   * Lazily yields the sentences of `text` in order of appearance.
   *
   * The text is first split by `sbd`. Each resulting sentence is then split
   * again on lines that contain no letters; those lines are dropped. Leading
   * non-letter characters and trailing whitespace are stripped from every
   * remaining part by `createSegment`.
   *
   * @param text The text to segment.
   * @returns Each sentence's text together with its index in `text`.
   */
  *segment(text: string): Iterable<MatchedText> {
    const sentences = sbd.sentences(text, options);
    // Running offset of the current part within `text`.
    // NOTE(review): this assumes the strings returned by `sbd.sentences`
    // concatenate back to `text` (which `preserve_whitespace` is presumably
    // there to guarantee) — confirm against the sbd documentation.
    let index = 0;

    for (const sentence of sentences) {
      // `invalidLineRegex` has a capture group, so the separators themselves
      // are kept in `parts` and still count towards `index`.
      const parts = sentence.split(invalidLineRegex);

      for (const part of parts) {
        const segment = isInvalidLine(part)
          ? undefined
          : createSegment(part, index);

        if (segment != null) {
          yield segment;
        }

        index += part.length;
      }
    }
  }
}
35+
36+
/**
 * Turns a raw sentence part into a segment, or rejects it.
 *
 * Everything before the first letter is dropped (and the index advanced
 * accordingly); trailing whitespace is trimmed from the text.
 *
 * @param sentence The raw text of the candidate sentence.
 * @param index The index of `sentence` within the segmented text.
 * @returns The trimmed sentence and the index of its first letter, or
 * `undefined` if `sentence` contains no letter at all.
 */
function createSegment(
  sentence: string,
  index: number,
): MatchedText | undefined {
  const leadingOffset = matchRegex(leadingOffsetRegex, sentence)?.index;

  // No match means no letter in this part, so it is not a sentence. Narrowing
  // `index` here as well avoids a non-null assertion.
  if (leadingOffset == null) {
    return undefined;
  }

  return {
    text: sentence.slice(leadingOffset).trimEnd(),
    index: index + leadingOffset,
  };
}
58+
59+
/**
 * Tells whether `text` matches `invalidLineRegex`, i.e. contains a
 * newline-delimited line without any letter.
 *
 * NOTE(review): `invalidLineRegex` carries the global flag, which makes
 * `RegExp.prototype.test` stateful through `lastIndex`. Presumably
 * `testRegex` resets `lastIndex` before testing — confirm in `util/regex`.
 *
 * @param text The text to check.
 * @returns `true` if `text` matches `invalidLineRegex`.
 */
function isInvalidLine(text: string): boolean {
  return testRegex(invalidLineRegex, text);
}

packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler.ts renamed to packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler/WordScopeHandler.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import { Range, TextEditor } from "@cursorless/common";
2-
import { NestedScopeHandler } from ".";
3-
import WordTokenizer from "../../../scopeHandlers/WordScopeHandler/WordTokenizer";
2+
import { NestedScopeHandler } from "..";
3+
import WordTokenizer from "./WordTokenizer";
44
import { Direction } from "@cursorless/common";
5-
import { SubTokenWordTarget } from "../../targets";
6-
import type { TargetScope } from "./scope.types";
5+
import { SubTokenWordTarget } from "../../../targets";
6+
import type { TargetScope } from "../scope.types";
77

88
export default class WordScopeHandler extends NestedScopeHandler {
99
public readonly scopeType = { type: "word" } as const;

packages/cursorless-engine/src/scopeHandlers/WordScopeHandler/WordTokenizer.ts renamed to packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/WordScopeHandler/WordTokenizer.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { getMatcher } from "../../tokenizer";
2-
import { matchText } from "../../util/regex";
1+
import { getMatcher } from "../../../../tokenizer";
2+
import { matchText } from "../../../../util/regex";
33

44
const CAMEL_REGEX = /\p{Lu}?\p{Ll}+|\p{Lu}+(?!\p{Ll})|\p{N}+/gu;
55

packages/cursorless-engine/src/processTargets/modifiers/scopeHandlers/index.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ export * from "./IdentifierScopeHandler";
66
export { default as IdentifierScopeHandler } from "./IdentifierScopeHandler";
77
export * from "./CharacterScopeHandler";
88
export { default as CharacterScopeHandler } from "./CharacterScopeHandler";
9-
export * from "./WordScopeHandler";
10-
export { default as WordScopeHandler } from "./WordScopeHandler";
9+
export * from "./WordScopeHandler/WordScopeHandler";
10+
export { default as WordScopeHandler } from "./WordScopeHandler/WordScopeHandler";
1111
export * from "./TokenScopeHandler";
1212
export { default as TokenScopeHandler } from "./TokenScopeHandler";
1313
export * from "./DocumentScopeHandler";
@@ -17,6 +17,8 @@ export * from "./OneOfScopeHandler";
1717
export { default as OneOfScopeHandler } from "./OneOfScopeHandler";
1818
export * from "./ParagraphScopeHandler";
1919
export { default as ParagraphScopeHandler } from "./ParagraphScopeHandler";
20+
export * from "./SentenceScopeHandler/SentenceScopeHandler";
21+
export { default as SentenceScopeHandler } from "./SentenceScopeHandler/SentenceScopeHandler";
2022
export * from "./RegexScopeHandler";
2123
export * from "./ScopeHandlerFactory";
2224
export * from "./ScopeHandlerFactoryImpl";
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/** One sentence segmentation test case. */
interface Fixture {
  /** The text handed to the segmenter. */
  input: string;
  /** The sentence texts expected from `input`, in order. */
  expectedOutput: string[];
}

/**
 * Test cases for the sentence segmenter: terminal punctuation, abbreviations,
 * dots inside words and URLs, leading non-letter characters, surrounding
 * whitespace, lines without letters, and non-ASCII letters.
 */
export const sentenceSegmenterFixture: Fixture[] = [
  {
    input: "Foo foo. Bar? Baz! bongo",
    expectedOutput: ["Foo foo.", "Bar?", "Baz!", "bongo"],
  },
  {
    input: "Hello, Mr. Anderson.",
    expectedOutput: ["Hello, Mr. Anderson."],
  },
  {
    input: "Visit example.com now!",
    expectedOutput: ["Visit example.com now!"],
  },
  {
    input: "Foo.bar",
    expectedOutput: ["Foo.bar"],
  },
  {
    input: " Foo ",
    expectedOutput: ["Foo"],
  },
  {
    input: "1Foo",
    expectedOutput: ["Foo"],
  },
  {
    input: "_foo",
    expectedOutput: ["foo"],
  },
  {
    input: " Foo \n\tbar ",
    expectedOutput: ["Foo \n\tbar"],
  },
  {
    input: "* Foo \nbar. *",
    expectedOutput: ["Foo \nbar."],
  },
  {
    input: "Foo \n*\nbar",
    expectedOutput: ["Foo", "bar"],
  },
  {
    input: "Foo\n\nbar",
    expectedOutput: ["Foo", "bar"],
  },
  {
    input: "Foo bar...",
    expectedOutput: ["Foo bar..."],
  },
  {
    input: "Å\nä\nö",
    expectedOutput: ["Å\nä\nö"],
  },
];

0 commit comments

Comments
 (0)