Leverage syntax tree for paragraph count (#899)

cyanzhong · web-flow · commit 093effc300eb · 2025-05-06T10:15:38.000+08:00
diff --git a/CoreEditor/src/bridge/web/core.ts b/CoreEditor/src/bridge/web/core.ts
@@ -5,7 +5,7 @@ import {
   resetEditor,
   clearEditor,
   getEditorText,
-  getReadableContent,
+  getReadableContentPair,
   insertText,
   replaceText,
   handleFocusLost,
@@ -22,7 +22,7 @@ export interface WebModuleCore extends WebModule {
   resetEditor({ text }: { text: string }): void;
   clearEditor(): void;
   getEditorText(): string;
-  getReadableContent(): ReadableContentPair;
+  getReadableContentPair(): ReadableContentPair;
   insertText({ text, from, to }: { text: string; from: CodeGen_Int; to: CodeGen_Int }): void;
   replaceText({ text, granularity }: { text: string; granularity: ReplaceGranularity }): void;
   handleFocusLost(): void;
@@ -43,8 +43,8 @@ export class WebModuleCoreImpl implements WebModuleCore {
     return getEditorText();
   }
 
-  getReadableContent(): ReadableContentPair {
-    return getReadableContent();
+  getReadableContentPair(): ReadableContentPair {
+    return getReadableContentPair();
   }
 
   insertText({ text, from, to }: { text: string; from: CodeGen_Int; to: CodeGen_Int }): void {
diff --git a/CoreEditor/src/core.ts b/CoreEditor/src/core.ts
@@ -10,7 +10,7 @@ import { setUp, setGutterHovered } from './styling/config';
 import { notifyBackgroundColor } from './styling/helper';
 import { loadTheme } from './styling/themes';
 import { adjustGutterPositions } from './modules/lines';
-import { extractComments } from './modules/lezer';
+import { getReadableContent } from './modules/lezer';
 import { getLineBreak, normalizeLineBreaks } from './modules/lineEndings';
 import { removeFrontMatter } from './modules/frontMatter';
 import { selectedMainText, scrollIntoView } from './modules/selection';
@@ -22,6 +22,7 @@ import { editorReadyListeners } from './api/methods';
 type ReadableContent = {
   sourceText: string;
   trimmedText: string;
+  paragraphCount: CodeGen_Int;
   commentCount: CodeGen_Int;
 };
 
@@ -168,15 +169,16 @@ export function getEditorText() {
   return lines.join(state.lineBreak);
 }
 
-export function getReadableContent(): ReadableContentPair {
+export function getReadableContentPair(): ReadableContentPair {
   const getContent = (sourceText: string): ReadableContent => {
-    // Remove front matter and extract comments
+    // Remove front matter and parse the content to get paragraphs and comments
     const actualText = removeFrontMatter(sourceText);
-    const { trimmedText, commentCount } = extractComments(actualText);
+    const { trimmedText, paragraphCount, commentCount } = getReadableContent(actualText);
 
     return {
       sourceText,
       trimmedText,
+      paragraphCount: paragraphCount as CodeGen_Int,
       commentCount: commentCount as CodeGen_Int,
     };
   };
diff --git a/CoreEditor/src/modules/lezer/index.ts b/CoreEditor/src/modules/lezer/index.ts
@@ -34,61 +34,59 @@ export function getNodesNamed(state: EditorState, nodeName: string) {
   return nodes;
 }
 
-export function extractComments(source: string) {
-  // Fail fast since we cannot find an open tag of comments
-  if (!source.includes('<!--')) {
-    return {
-      trimmedText: source,
-      commentCount: 0,
-    };
-  }
+export function getReadableContent(source: string) {
+  const result = {
+    trimmedText: `${source}`,
+    paragraphCount: 0,
+    commentCount: 0,
+  };
 
-  // Parse the content as syntax tree, time-consuming for long content
+  // Parse the content as a syntax tree
   const tree = markdownParser.parse(source);
   const comments: { from: number; to: number }[] = [];
 
   tree.iterate({
     from: 0, to: source.length,
     enter: node => {
-      if (node.name !== 'Comment' && node.name !== 'CommentBlock') {
-        return;
+      // Get number of paragraphs
+      if (node.name === 'Paragraph') {
+        result.paragraphCount += 1;
       }
 
-      const offset = node.from;
-      const html = source.slice(offset, node.to);
+      // Get comment ranges
+      if (node.name === 'Comment' || node.name === 'CommentBlock') {
+        const offset = node.from;
+        const html = source.slice(offset, node.to);
 
-      // A "CommentBlock" in Markdown can be something like this:
-      //   <!-- Hello --> World
-      //
-      // The Markdown parser won't extract the "comment" part,
-      // here we need to parse it again using a html parser.
-      htmlParser.parse(html).iterate({
-        from: 0, to: html.length,
-        enter: comment => {
-          if (comment.name !== 'Comment') {
-            return;
-          }
+        // A "CommentBlock" in Markdown can be something like this:
+        //   <!-- Hello --> World
+        //
+        // The Markdown parser won't extract the "comment" part,
+        // so we need to parse it again using an HTML parser.
+        htmlParser.parse(html).iterate({
+          from: 0, to: html.length,
+          enter: comment => {
+            if (comment.name !== 'Comment') {
+              return;
+            }
 
-          // Text range with offset from the original Markdown source
-          comments.push({
-            from: comment.from + offset,
-            to: comment.to + offset,
-          });
-        },
-      });
+            // Text range with offset from the original Markdown source
+            comments.push({
+              from: comment.from + offset,
+              to: comment.to + offset,
+            });
+          },
+        });
+      }
     },
   });
 
-  const result = {
-    trimmedText: `${source}`,
-    commentCount: comments.length,
-  };
-
-  // Enumerate reversely
+  // Remove all comments from the source text, last to first, so earlier ranges stay valid
   const sorted = comments.sort((lhs, rhs) => rhs.from - lhs.from);
   sorted.forEach(({ from, to }) => {
     result.trimmedText = replaceRange(result.trimmedText, from, takePossibleNewline(result.trimmedText, to), '');
   });
 
+  result.commentCount = comments.length;
   return result;
 }
diff --git a/CoreEditor/test/lezer.test.ts b/CoreEditor/test/lezer.test.ts
@@ -1,7 +1,7 @@
 import { describe, expect, test } from '@jest/globals';
 import { EditorView } from '@codemirror/view';
 import { syntaxTree } from '@codemirror/language';
-import { getNodesNamed, extractComments } from '../src/modules/lezer';
+import { getNodesNamed, getReadableContent } from '../src/modules/lezer';
 import * as editor from './utils/editor';
 
 describe('Lezer parser', () => {
@@ -101,49 +101,58 @@ describe('Lezer parser', () => {
     expect(nodes.length).toBe(3);
   });
 
-  test('test extracting comments', () => {
-    expect(extractComments('Hello')).toStrictEqual({
+  test('test getting readable content', () => {
+    expect(getReadableContent('Hello')).toStrictEqual({
       trimmedText: 'Hello',
+      paragraphCount: 1,
       commentCount: 0,
     });
 
-    expect(extractComments('<!-- Hello -->')).toStrictEqual({
+    expect(getReadableContent('<!-- Hello -->')).toStrictEqual({
       trimmedText: '',
+      paragraphCount: 0,
       commentCount: 1,
     });
 
-    expect(extractComments('<!-- Hello -->\nWorld')).toStrictEqual({
+    expect(getReadableContent('<!-- Hello -->\nWorld')).toStrictEqual({
       trimmedText: 'World',
+      paragraphCount: 1,
       commentCount: 1,
     });
 
-    expect(extractComments('<!-- Hello -->\n<!-- World -->')).toStrictEqual({
+    expect(getReadableContent('<!-- Hello -->\n<!-- World -->')).toStrictEqual({
       trimmedText: '',
+      paragraphCount: 0,
       commentCount: 2,
     });
 
-    expect(extractComments('<!-- Hello --> World -->')).toStrictEqual({
+    expect(getReadableContent('<!-- Hello --> World -->')).toStrictEqual({
       trimmedText: ' World -->',
+      paragraphCount: 0,
       commentCount: 1,
     });
 
-    expect(extractComments('Hello <!-- Hello \n\n World -->')).toStrictEqual({
+    expect(getReadableContent('Hello <!-- Hello \n\n World -->')).toStrictEqual({
       trimmedText: 'Hello <!-- Hello \n\n World -->',
+      paragraphCount: 2,
       commentCount: 0,
     });
 
-    expect(extractComments('<!-- Hello \n\n World -->')).toStrictEqual({
+    expect(getReadableContent('<!-- Hello \n\n World -->')).toStrictEqual({
       trimmedText: '',
+      paragraphCount: 0,
       commentCount: 1,
     });
 
-    expect(extractComments('Hello <!-- Hello \n.\n. World -->')).toStrictEqual({
+    expect(getReadableContent('Hello <!-- Hello \n.\n. World -->')).toStrictEqual({
       trimmedText: 'Hello ',
+      paragraphCount: 1,
       commentCount: 1,
     });
 
-    expect(extractComments('<!-- Hello -->World\n\nHello <!-- World -->')).toStrictEqual({
+    expect(getReadableContent('<!-- Hello -->World\n\nHello <!-- World -->')).toStrictEqual({
       trimmedText: 'World\n\nHello ',
+      paragraphCount: 1,
       commentCount: 2,
     });
   });
diff --git a/MarkEditKit/Sources/Bridge/Web/Generated/WebBridgeCore.swift b/MarkEditKit/Sources/Bridge/Web/Generated/WebBridgeCore.swift
@@ -44,9 +44,9 @@ public final class WebBridgeCore {
     }
   }
 
-  public func getReadableContent() async throws -> ReadableContentPair {
+  public func getReadableContentPair() async throws -> ReadableContentPair {
     return try await withCheckedThrowingContinuation { continuation in
-      webView?.invoke(path: "webModules.core.getReadableContent") { result in
+      webView?.invoke(path: "webModules.core.getReadableContentPair") { result in
         Task { @MainActor in
           continuation.resume(with: result)
         }
@@ -128,11 +128,13 @@ public struct ReadableContentPair: Codable {
 public struct ReadableContent: Codable {
   public var sourceText: String
   public var trimmedText: String
+  public var paragraphCount: Int
   public var commentCount: Int
 
-  public init(sourceText: String, trimmedText: String, commentCount: Int) {
+  public init(sourceText: String, trimmedText: String, paragraphCount: Int, commentCount: Int) {
     self.sourceText = sourceText
     self.trimmedText = trimmedText
+    self.paragraphCount = paragraphCount
     self.commentCount = commentCount
   }
 }
diff --git a/MarkEditMac/Modules/Sources/Statistics/StatisticsController.swift b/MarkEditMac/Modules/Sources/Statistics/StatisticsController.swift
@@ -65,8 +65,8 @@ public final class StatisticsController: NSViewController {
 
     // Natural language processing is time-consuming for large documents
     Task.detached(priority: .userInitiated) {
-      let fullResult = self.content.fullText.tokenized
-      let selectionResult = self.content.selection?.tokenized
+      let fullResult = self.content.fullText.result
+      let selectionResult = self.content.selection?.result
 
       // Remove the spinner and show the result view on main thread
       DispatchQueue.main.async {
@@ -103,11 +103,18 @@ extension ReadableContentPair: @unchecked @retroactive Sendable {}
 // MARK: - Private
 
 private extension ReadableContent {
-  var tokenized: Tokenizer.Result {
-    Tokenizer.tokenize(
-      sourceText: sourceText,
-      trimmedText: trimmedText,
-      commentCount: commentCount
+  var result: StatisticsResult {
+    StatisticsResult(
+      // Length of the full text
+      characters: sourceText.count,
+      // Result from the syntax tree
+      paragraphs: paragraphCount,
+      // Result from the syntax tree
+      comments: commentCount,
+      // Result from the NLP tokenizer
+      words: Tokenizer.count(text: trimmedText, unit: .word),
+      // Result from the NLP tokenizer
+      sentences: Tokenizer.count(text: trimmedText, unit: .sentence)
     )
   }
 }
diff --git a/MarkEditMac/Modules/Sources/Statistics/StatisticsResult.swift b/MarkEditMac/Modules/Sources/Statistics/StatisticsResult.swift
@@ -0,0 +1,15 @@
+//
+//  StatisticsResult.swift
+//
+//  Created by cyan on 5/4/25.
+//
+
+import Foundation
+
+struct StatisticsResult { // Plain value bundle of the five counters for one piece of text
+  let characters: Int // Character count (ReadableContent.result passes sourceText.count)
+  let paragraphs: Int // Paragraph count (ReadableContent.result passes paragraphCount from the syntax tree)
+  let comments: Int // Comment count (ReadableContent.result passes commentCount from the syntax tree)
+  let words: Int // Word count (ReadableContent.result passes Tokenizer.count with unit .word on trimmedText)
+  let sentences: Int // Sentence count (ReadableContent.result passes Tokenizer.count with unit .sentence on trimmedText)
+}
diff --git a/MarkEditMac/Modules/Sources/Statistics/Utilities/Tokenizer.swift b/MarkEditMac/Modules/Sources/Statistics/Utilities/Tokenizer.swift
@@ -11,29 +11,7 @@ import NaturalLanguage
  NLP based tokenizer to count words, sentences, etc.
  */
 enum Tokenizer {
-  struct Result {
-    let characters: Int
-    let words: Int
-    let sentences: Int
-    let paragraphs: Int
-    let comments: Int
-  }
-
-  static func tokenize(sourceText: String, trimmedText: String, commentCount: Int) -> Result {
-    Result(
-      characters: sourceText.count, // Always use full text for characters
-      words: tokenize(text: trimmedText, unit: .word),
-      sentences: tokenize(text: trimmedText, unit: .sentence),
-      paragraphs: tokenize(text: trimmedText, unit: .paragraph),
-      comments: commentCount
-    )
-  }
-}
-
-// MARK: - Private
-
-private extension Tokenizer {
-  static func tokenize(text: String, unit: NLTokenUnit) -> Int {
+  static func count(text: String, unit: NLTokenUnit) -> Int {
     let tokenizer = NLTokenizer(unit: unit)
     tokenizer.string = text
 
diff --git a/MarkEditMac/Modules/Sources/Statistics/Views/StatisticsView.swift b/MarkEditMac/Modules/Sources/Statistics/Views/StatisticsView.swift
diff --git a/MarkEditMac/Sources/Editor/Controllers/EditorViewController+Statistics.swift b/MarkEditMac/Sources/Editor/Controllers/EditorViewController+Statistics.swift

Original file line number	Diff line number	Diff line change
`@@ -44,9 +44,9 @@ public final class WebBridgeCore {`
`44`	`44`	`}`
`45`	`45`	`}`
`46`	`46`
`47`		`- public func getReadableContent() async throws -> ReadableContentPair {`
	`47`	`+ public func getReadableContentPair() async throws -> ReadableContentPair {`
`48`	`48`	`return try await withCheckedThrowingContinuation { continuation in`
`49`		`- webView?.invoke(path: "webModules.core.getReadableContent") { result in`
	`49`	`+ webView?.invoke(path: "webModules.core.getReadableContentPair") { result in`
`50`	`50`	`Task { @MainActor in`
`51`	`51`	`continuation.resume(with: result)`
`52`	`52`	`}`
`@@ -128,11 +128,13 @@ public struct ReadableContentPair: Codable {`
`128`	`128`	`public struct ReadableContent: Codable {`
`129`	`129`	`public var sourceText: String`
`130`	`130`	`public var trimmedText: String`
	`131`	`+ public var paragraphCount: Int`
`131`	`132`	`public var commentCount: Int`
`132`	`133`
`133`		`- public init(sourceText: String, trimmedText: String, commentCount: Int) {`
	`134`	`+ public init(sourceText: String, trimmedText: String, paragraphCount: Int, commentCount: Int) {`
`134`	`135`	`self.sourceText = sourceText`
`135`	`136`	`self.trimmedText = trimmedText`
	`137`	`+ self.paragraphCount = paragraphCount`
`136`	`138`	`self.commentCount = commentCount`
`137`	`139`	`}`
`138`	`140`	`}`