Skip to content

Commit 093effc

Browse files
authored
Leverage syntax tree for paragraph count (#899)
1 parent 2e358ad commit 093effc

File tree

10 files changed

+113
-102
lines changed

10 files changed

+113
-102
lines changed

CoreEditor/src/bridge/web/core.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import {
55
resetEditor,
66
clearEditor,
77
getEditorText,
8-
getReadableContent,
8+
getReadableContentPair,
99
insertText,
1010
replaceText,
1111
handleFocusLost,
@@ -22,7 +22,7 @@ export interface WebModuleCore extends WebModule {
2222
resetEditor({ text }: { text: string }): void;
2323
clearEditor(): void;
2424
getEditorText(): string;
25-
getReadableContent(): ReadableContentPair;
25+
getReadableContentPair(): ReadableContentPair;
2626
insertText({ text, from, to }: { text: string; from: CodeGen_Int; to: CodeGen_Int }): void;
2727
replaceText({ text, granularity }: { text: string; granularity: ReplaceGranularity }): void;
2828
handleFocusLost(): void;
@@ -43,8 +43,8 @@ export class WebModuleCoreImpl implements WebModuleCore {
4343
return getEditorText();
4444
}
4545

46-
getReadableContent(): ReadableContentPair {
47-
return getReadableContent();
46+
getReadableContentPair(): ReadableContentPair {
47+
return getReadableContentPair();
4848
}
4949

5050
insertText({ text, from, to }: { text: string; from: CodeGen_Int; to: CodeGen_Int }): void {

CoreEditor/src/core.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { setUp, setGutterHovered } from './styling/config';
1010
import { notifyBackgroundColor } from './styling/helper';
1111
import { loadTheme } from './styling/themes';
1212
import { adjustGutterPositions } from './modules/lines';
13-
import { extractComments } from './modules/lezer';
13+
import { getReadableContent } from './modules/lezer';
1414
import { getLineBreak, normalizeLineBreaks } from './modules/lineEndings';
1515
import { removeFrontMatter } from './modules/frontMatter';
1616
import { selectedMainText, scrollIntoView } from './modules/selection';
@@ -22,6 +22,7 @@ import { editorReadyListeners } from './api/methods';
2222
type ReadableContent = {
2323
sourceText: string;
2424
trimmedText: string;
25+
paragraphCount: CodeGen_Int;
2526
commentCount: CodeGen_Int;
2627
};
2728

@@ -168,15 +169,16 @@ export function getEditorText() {
168169
return lines.join(state.lineBreak);
169170
}
170171

171-
export function getReadableContent(): ReadableContentPair {
172+
export function getReadableContentPair(): ReadableContentPair {
172173
const getContent = (sourceText: string): ReadableContent => {
173-
// Remove front matter and extract comments
174+
// Remove front matter and parse the content to get paragraphs and comments
174175
const actualText = removeFrontMatter(sourceText);
175-
const { trimmedText, commentCount } = extractComments(actualText);
176+
const { trimmedText, paragraphCount, commentCount } = getReadableContent(actualText);
176177

177178
return {
178179
sourceText,
179180
trimmedText,
181+
paragraphCount: paragraphCount as CodeGen_Int,
180182
commentCount: commentCount as CodeGen_Int,
181183
};
182184
};

CoreEditor/src/modules/lezer/index.ts

Lines changed: 35 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -34,61 +34,59 @@ export function getNodesNamed(state: EditorState, nodeName: string) {
3434
return nodes;
3535
}
3636

37-
export function extractComments(source: string) {
38-
// Fail fast since we cannot find an open tag of comments
39-
if (!source.includes('<!--')) {
40-
return {
41-
trimmedText: source,
42-
commentCount: 0,
43-
};
44-
}
37+
export function getReadableContent(source: string) {
38+
const result = {
39+
trimmedText: `${source}`,
40+
paragraphCount: 0,
41+
commentCount: 0,
42+
};
4543

46-
// Parse the content as syntax tree, time-consuming for long content
44+
// Parse the content as syntax tree
4745
const tree = markdownParser.parse(source);
4846
const comments: { from: number; to: number }[] = [];
4947

5048
tree.iterate({
5149
from: 0, to: source.length,
5250
enter: node => {
53-
if (node.name !== 'Comment' && node.name !== 'CommentBlock') {
54-
return;
51+
// Get number of paragraphs
52+
if (node.name === 'Paragraph') {
53+
result.paragraphCount += 1;
5554
}
5655

57-
const offset = node.from;
58-
const html = source.slice(offset, node.to);
56+
// Get comment ranges
57+
if (node.name === 'Comment' || node.name === 'CommentBlock') {
58+
const offset = node.from;
59+
const html = source.slice(offset, node.to);
5960

60-
// A "CommentBlock" in Markdown can be something like this:
61-
// <!-- Hello --> World
62-
//
63-
// The Markdown parser won't extract the "comment" part,
64-
// here we need to parse it again using a html parser.
65-
htmlParser.parse(html).iterate({
66-
from: 0, to: html.length,
67-
enter: comment => {
68-
if (comment.name !== 'Comment') {
69-
return;
70-
}
61+
// A "CommentBlock" in Markdown can be something like this:
62+
// <!-- Hello --> World
63+
//
64+
// The Markdown parser won't extract the "comment" part,
65+
// here we need to parse it again using a html parser.
66+
htmlParser.parse(html).iterate({
67+
from: 0, to: html.length,
68+
enter: comment => {
69+
if (comment.name !== 'Comment') {
70+
return;
71+
}
7172

72-
// Text range with offset from the original Markdown source
73-
comments.push({
74-
from: comment.from + offset,
75-
to: comment.to + offset,
76-
});
77-
},
78-
});
73+
// Text range with offset from the original Markdown source
74+
comments.push({
75+
from: comment.from + offset,
76+
to: comment.to + offset,
77+
});
78+
},
79+
});
80+
}
7981
},
8082
});
8183

82-
const result = {
83-
trimmedText: `${source}`,
84-
commentCount: comments.length,
85-
};
86-
87-
// Enumerate reversely
84+
// Reversely remove all comments from the source text
8885
const sorted = comments.sort((lhs, rhs) => rhs.from - lhs.from);
8986
sorted.forEach(({ from, to }) => {
9087
result.trimmedText = replaceRange(result.trimmedText, from, takePossibleNewline(result.trimmedText, to), '');
9188
});
9289

90+
result.commentCount = comments.length;
9391
return result;
9492
}

CoreEditor/test/lezer.test.ts

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { describe, expect, test } from '@jest/globals';
22
import { EditorView } from '@codemirror/view';
33
import { syntaxTree } from '@codemirror/language';
4-
import { getNodesNamed, extractComments } from '../src/modules/lezer';
4+
import { getNodesNamed, getReadableContent } from '../src/modules/lezer';
55
import * as editor from './utils/editor';
66

77
describe('Lezer parser', () => {
@@ -101,49 +101,58 @@ describe('Lezer parser', () => {
101101
expect(nodes.length).toBe(3);
102102
});
103103

104-
test('test extracting comments', () => {
105-
expect(extractComments('Hello')).toStrictEqual({
104+
test('test getting readable content', () => {
105+
expect(getReadableContent('Hello')).toStrictEqual({
106106
trimmedText: 'Hello',
107+
paragraphCount: 1,
107108
commentCount: 0,
108109
});
109110

110-
expect(extractComments('<!-- Hello -->')).toStrictEqual({
111+
expect(getReadableContent('<!-- Hello -->')).toStrictEqual({
111112
trimmedText: '',
113+
paragraphCount: 0,
112114
commentCount: 1,
113115
});
114116

115-
expect(extractComments('<!-- Hello -->\nWorld')).toStrictEqual({
117+
expect(getReadableContent('<!-- Hello -->\nWorld')).toStrictEqual({
116118
trimmedText: 'World',
119+
paragraphCount: 1,
117120
commentCount: 1,
118121
});
119122

120-
expect(extractComments('<!-- Hello -->\n<!-- World -->')).toStrictEqual({
123+
expect(getReadableContent('<!-- Hello -->\n<!-- World -->')).toStrictEqual({
121124
trimmedText: '',
125+
paragraphCount: 0,
122126
commentCount: 2,
123127
});
124128

125-
expect(extractComments('<!-- Hello --> World -->')).toStrictEqual({
129+
expect(getReadableContent('<!-- Hello --> World -->')).toStrictEqual({
126130
trimmedText: ' World -->',
131+
paragraphCount: 0,
127132
commentCount: 1,
128133
});
129134

130-
expect(extractComments('Hello <!-- Hello \n\n World -->')).toStrictEqual({
135+
expect(getReadableContent('Hello <!-- Hello \n\n World -->')).toStrictEqual({
131136
trimmedText: 'Hello <!-- Hello \n\n World -->',
137+
paragraphCount: 2,
132138
commentCount: 0,
133139
});
134140

135-
expect(extractComments('<!-- Hello \n\n World -->')).toStrictEqual({
141+
expect(getReadableContent('<!-- Hello \n\n World -->')).toStrictEqual({
136142
trimmedText: '',
143+
paragraphCount: 0,
137144
commentCount: 1,
138145
});
139146

140-
expect(extractComments('Hello <!-- Hello \n.\n. World -->')).toStrictEqual({
147+
expect(getReadableContent('Hello <!-- Hello \n.\n. World -->')).toStrictEqual({
141148
trimmedText: 'Hello ',
149+
paragraphCount: 1,
142150
commentCount: 1,
143151
});
144152

145-
expect(extractComments('<!-- Hello -->World\n\nHello <!-- World -->')).toStrictEqual({
153+
expect(getReadableContent('<!-- Hello -->World\n\nHello <!-- World -->')).toStrictEqual({
146154
trimmedText: 'World\n\nHello ',
155+
paragraphCount: 1,
147156
commentCount: 2,
148157
});
149158
});

MarkEditKit/Sources/Bridge/Web/Generated/WebBridgeCore.swift

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ public final class WebBridgeCore {
4444
}
4545
}
4646

47-
public func getReadableContent() async throws -> ReadableContentPair {
47+
public func getReadableContentPair() async throws -> ReadableContentPair {
4848
return try await withCheckedThrowingContinuation { continuation in
49-
webView?.invoke(path: "webModules.core.getReadableContent") { result in
49+
webView?.invoke(path: "webModules.core.getReadableContentPair") { result in
5050
Task { @MainActor in
5151
continuation.resume(with: result)
5252
}
@@ -128,11 +128,13 @@ public struct ReadableContentPair: Codable {
128128
public struct ReadableContent: Codable {
129129
public var sourceText: String
130130
public var trimmedText: String
131+
public var paragraphCount: Int
131132
public var commentCount: Int
132133

133-
public init(sourceText: String, trimmedText: String, commentCount: Int) {
134+
public init(sourceText: String, trimmedText: String, paragraphCount: Int, commentCount: Int) {
134135
self.sourceText = sourceText
135136
self.trimmedText = trimmedText
137+
self.paragraphCount = paragraphCount
136138
self.commentCount = commentCount
137139
}
138140
}

MarkEditMac/Modules/Sources/Statistics/StatisticsController.swift

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ public final class StatisticsController: NSViewController {
6565

6666
// Natural language processing is time-consuming for large documents
6767
Task.detached(priority: .userInitiated) {
68-
let fullResult = self.content.fullText.tokenized
69-
let selectionResult = self.content.selection?.tokenized
68+
let fullResult = self.content.fullText.result
69+
let selectionResult = self.content.selection?.result
7070

7171
// Remove the spinner and show the result view on main thread
7272
DispatchQueue.main.async {
@@ -103,11 +103,18 @@ extension ReadableContentPair: @unchecked @retroactive Sendable {}
103103
// MARK: - Private
104104

105105
private extension ReadableContent {
106-
var tokenized: Tokenizer.Result {
107-
Tokenizer.tokenize(
108-
sourceText: sourceText,
109-
trimmedText: trimmedText,
110-
commentCount: commentCount
106+
var result: StatisticsResult {
107+
StatisticsResult(
108+
// Length of the full text
109+
characters: sourceText.count,
110+
// Result from the syntax tree
111+
paragraphs: paragraphCount,
112+
// Result from the syntax tree
113+
comments: commentCount,
114+
// Result from the NLP tokenizer
115+
words: Tokenizer.count(text: trimmedText, unit: .word),
116+
// Result from the NLP tokenizer
117+
sentences: Tokenizer.count(text: trimmedText, unit: .sentence)
111118
)
112119
}
113120
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
//
2+
// StatisticsResult.swift
3+
//
4+
// Created by cyan on 5/4/25.
5+
//
6+
7+
import Foundation
8+
9+
struct StatisticsResult {
10+
let characters: Int
11+
let paragraphs: Int
12+
let comments: Int
13+
let words: Int
14+
let sentences: Int
15+
}

MarkEditMac/Modules/Sources/Statistics/Utilities/Tokenizer.swift

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -11,29 +11,7 @@ import NaturalLanguage
1111
NLP based tokenizer to count words, sentences, etc.
1212
*/
1313
enum Tokenizer {
14-
struct Result {
15-
let characters: Int
16-
let words: Int
17-
let sentences: Int
18-
let paragraphs: Int
19-
let comments: Int
20-
}
21-
22-
static func tokenize(sourceText: String, trimmedText: String, commentCount: Int) -> Result {
23-
Result(
24-
characters: sourceText.count, // Always use full text for characters
25-
words: tokenize(text: trimmedText, unit: .word),
26-
sentences: tokenize(text: trimmedText, unit: .sentence),
27-
paragraphs: tokenize(text: trimmedText, unit: .paragraph),
28-
comments: commentCount
29-
)
30-
}
31-
}
32-
33-
// MARK: - Private
34-
35-
private extension Tokenizer {
36-
static func tokenize(text: String, unit: NLTokenUnit) -> Int {
14+
static func count(text: String, unit: NLTokenUnit) -> Int {
3715
let tokenizer = NLTokenizer(unit: unit)
3816
tokenizer.string = text
3917

0 commit comments

Comments
 (0)