Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
241 changes: 241 additions & 0 deletions TeXmacs/plugins/html/progs/data/html.scm
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,220 @@
;; Html
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 按行分割文本
(define (html-string-split-lines s)
(let ((len (if (>= (string-length s) 1000) 1000 (string-length s))))
(let loop ((i 0)
(start 0)
(result '()))
(cond ((>= i len)
(reverse (cons (substring s start i) result)))
((char=? (string-ref s i) #\newline)
(loop (+ i 1)
(+ i 1)
(cons (substring s start i) result)))
(else (loop (+ i 1) start result))))))

;; 某个字符在文本中的含量
(define (charactor-from-string s ch)
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Spelling error in function name: "charactor" should be "character". This typo appears in the function name and should be corrected for consistency with standard English spelling.

Copilot uses AI. Check for mistakes.
(if (not (string-null? s))
(let* ((len (string-length s))
(limit (if (>= len 1000) 1000 len)))
(let loop ((ref 0)
(count 0))
(if (>= ref limit)
(/ count len)
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logic error in density calculation. The function counts characters in the first 1000 characters (limit) but divides by the total string length (len). This produces incorrect density values for strings longer than 1000 characters. The division should use 'limit' instead of 'len' to accurately represent the density of the sampled portion.

Suggested change
(/ count len)
(/ count limit)

Copilot uses AI. Check for mistakes.
(loop (+ ref 1)
(if (char=? (string-ref s ref) ch)
(+ count 1)
count)))))
#f))

;; 计算一个子串在文本中的含量,计算的是子串的字符数,而不是个数
(define (html-string-count-substring s sub)
(let ((sub-len (string-length sub)))
(if (zero? sub-len)
0
(let loop ((i 0)
(count 0))
(if (>= i (- (string-length s) sub-len -1))
count
(if (string=? (substring s i (+ i sub-len)) sub)
(loop (+ i sub-len) (+ count 1))
(loop (+ i 1) count)))))))

;; < 和 > 的含量
(define (html-angle-bracket-density s)
(if (string-null? s)
0
(let* ((len (string-length s))
(limit (if (>= len 1000) 1000 len))
(substr (substring s 0 limit)))
(/ (+ (charactor-from-string substr #\<)
(charactor-from-string substr #\>))
len))))
Comment on lines +69 to +71
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logic error in density calculation. The function sums the densities returned by charactor-from-string (which are already ratios count/len) and then divides by len again. This double division produces incorrect results. The correct approach is to sum the character counts and then divide once by the total length, or use the already-computed density values without further division.

Suggested change
(/ (+ (charactor-from-string substr #\<)
(charactor-from-string substr #\>))
len))))
(+ (charactor-from-string substr #\<)
(charactor-from-string substr #\>)))))

Copilot uses AI. Check for mistakes.

;; 完整的tag子串在文本中的字符含量
(define (html-tag-density s)
(if (string-null? s)
0
(let* ((len (string-length s))
(limit (if (>= len 1000) 1000 len))
(substr (substring s 0 limit))
(lc-substr (string-downcase substr)))
(let ((count (+ (html-string-count-substring lc-substr "<div")
(html-string-count-substring lc-substr "<span")
(html-string-count-substring lc-substr "<p")
(html-string-count-substring lc-substr "<a")
(html-string-count-substring lc-substr "<img")
(html-string-count-substring lc-substr "<ul")
(html-string-count-substring lc-substr "<ol")
(html-string-count-substring lc-substr "<li")
(html-string-count-substring lc-substr "<table")
(html-string-count-substring lc-substr "<tr")
(html-string-count-substring lc-substr "<td")
(html-string-count-substring lc-substr "<th")
(html-string-count-substring lc-substr "<h1")
(html-string-count-substring lc-substr "<h2")
(html-string-count-substring lc-substr "<h3")
(html-string-count-substring lc-substr "<h4")
(html-string-count-substring lc-substr "<h5")
(html-string-count-substring lc-substr "<h6")
(html-string-count-substring lc-substr "<form")
(html-string-count-substring lc-substr "<input")
(html-string-count-substring lc-substr "<button")
(html-string-count-substring lc-substr "<textarea")
(html-string-count-substring lc-substr "<select")
(html-string-count-substring lc-substr "<option")
(html-string-count-substring lc-substr "<style")
(html-string-count-substring lc-substr "<script")
(html-string-count-substring lc-substr "<meta")
(html-string-count-substring lc-substr "<link")
(html-string-count-substring lc-substr "</div")
(html-string-count-substring lc-substr "</ul")
(html-string-count-substring lc-substr "</ol")
(html-string-count-substring lc-substr "</table")
(html-string-count-substring lc-substr "</tr")
(html-string-count-substring lc-substr "</form")
(html-string-count-substring lc-substr "</style")
(html-string-count-substring lc-substr "</script"))))
(/ count len)))))
Comment on lines +81 to +117
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance concern: This function performs multiple linear scans of the same string, calling html-string-count-substring 34 times. Each call scans the entire substring. For better performance, consider combining these checks into a single pass through the string, using a state machine or regex pattern matching to identify all tag types in one scan.

Suggested change
(let ((count (+ (html-string-count-substring lc-substr "<div")
(html-string-count-substring lc-substr "<span")
(html-string-count-substring lc-substr "<p")
(html-string-count-substring lc-substr "<a")
(html-string-count-substring lc-substr "<img")
(html-string-count-substring lc-substr "<ul")
(html-string-count-substring lc-substr "<ol")
(html-string-count-substring lc-substr "<li")
(html-string-count-substring lc-substr "<table")
(html-string-count-substring lc-substr "<tr")
(html-string-count-substring lc-substr "<td")
(html-string-count-substring lc-substr "<th")
(html-string-count-substring lc-substr "<h1")
(html-string-count-substring lc-substr "<h2")
(html-string-count-substring lc-substr "<h3")
(html-string-count-substring lc-substr "<h4")
(html-string-count-substring lc-substr "<h5")
(html-string-count-substring lc-substr "<h6")
(html-string-count-substring lc-substr "<form")
(html-string-count-substring lc-substr "<input")
(html-string-count-substring lc-substr "<button")
(html-string-count-substring lc-substr "<textarea")
(html-string-count-substring lc-substr "<select")
(html-string-count-substring lc-substr "<option")
(html-string-count-substring lc-substr "<style")
(html-string-count-substring lc-substr "<script")
(html-string-count-substring lc-substr "<meta")
(html-string-count-substring lc-substr "<link")
(html-string-count-substring lc-substr "</div")
(html-string-count-substring lc-substr "</ul")
(html-string-count-substring lc-substr "</ol")
(html-string-count-substring lc-substr "</table")
(html-string-count-substring lc-substr "</tr")
(html-string-count-substring lc-substr "</form")
(html-string-count-substring lc-substr "</style")
(html-string-count-substring lc-substr "</script"))))
(/ count len)))))
(letrec* ((string-prefix-at?
(lambda (s prefix idx)
(let* ((s-len (string-length s))
(p-len (string-length prefix)))
(if (> (+ idx p-len) s-len)
#f
(let loop ((j 0))
(if (= j p-len)
#t
(if (char=? (string-ref s (+ idx j))
(string-ref prefix j))
(loop (+ j 1))
#f)))))))
(tags '("<div"
"<span"
"<p"
"<a"
"<img"
"<ul"
"<ol"
"<li"
"<table"
"<tr"
"<td"
"<th"
"<h1"
"<h2"
"<h3"
"<h4"
"<h5"
"<h6"
"<form"
"<input"
"<button"
"<textarea"
"<select"
"<option"
"<style"
"<script"
"<meta"
"<link"
"</div"
"</ul"
"</ol"
"</table"
"</tr"
"</form"
"</style"
"</script")))
(substr-len (string-length lc-substr)))
(let loop ((i 0) (count 0))
(if (>= i substr-len)
(/ count len)
(let ((new-count
(let tag-loop ((ts tags) (c count))
(if (null? ts)
c
(if (string-prefix-at? lc-substr (car ts) i)
(tag-loop (cdr ts) (+ c 1))
(tag-loop (cdr ts) c))))))
(loop (+ i 1) new-count)))))))

Copilot uses AI. Check for mistakes.

;; = 和 " 的含量
(define (html-attribute-density s)
(if (string-null? s)
0
(let* ((len (string-length s))
(limit (if (>= len 1000) 1000 len))
(substr (substring s 0 limit)))
(/ (+ (charactor-from-string substr #\=)
(charactor-from-string substr #\"))
len))))
Comment on lines +126 to +128
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Logic error in density calculation. The function sums the densities returned by charactor-from-string (which are already ratios count/len) and then divides by len again. This double division produces incorrect results. The correct approach is to sum the character counts and then divide once by the total length, or use the already-computed density values without further division.

Copilot uses AI. Check for mistakes.

;; 这一行文本是否包含html标签
(define (html-line-contains-features? line)
(let ((lc-line (string-downcase line)))
(or
(> (html-string-count-substring lc-line "<div") 0)
(> (html-string-count-substring lc-line "<span") 0)
(> (html-string-count-substring lc-line "<p") 0)
(> (html-string-count-substring lc-line "<a") 0)
(> (html-string-count-substring lc-line "<img") 0)
(> (html-string-count-substring lc-line "<ul") 0)
(> (html-string-count-substring lc-line "<ol") 0)
(> (html-string-count-substring lc-line "<li") 0)
(> (html-string-count-substring lc-line "<table") 0)
(> (html-string-count-substring lc-line "<tr") 0)
(> (html-string-count-substring lc-line "<td") 0)
(> (html-string-count-substring lc-line "<th") 0)
(> (html-string-count-substring lc-line "<h1") 0)
(> (html-string-count-substring lc-line "<h2") 0)
(> (html-string-count-substring lc-line "<h3") 0)
(> (html-string-count-substring lc-line "<h4") 0)
(> (html-string-count-substring lc-line "<h5") 0)
(> (html-string-count-substring lc-line "<h6") 0)
(> (html-string-count-substring lc-line "</div") 0)
(> (html-string-count-substring lc-line "</span") 0)
(> (html-string-count-substring lc-line "</p") 0)
(> (html-string-count-substring lc-line "</a") 0)
(> (html-string-count-substring lc-line "/>") 0)
(> (html-string-count-substring lc-line "<!doctype") 0)
(> (html-string-count-substring lc-line "<?xml") 0))))
Comment on lines +130 to +158
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance concern: This function performs multiple linear scans of the same string, calling html-string-count-substring 25 times. Each call scans the entire line. For better performance, consider combining these checks into a single pass through the line, or using a more efficient pattern matching approach.

Suggested change
;; 这一行文本是否包含html标签
(define (html-line-contains-features? line)
(let ((lc-line (string-downcase line)))
(or
(> (html-string-count-substring lc-line "<div") 0)
(> (html-string-count-substring lc-line "<span") 0)
(> (html-string-count-substring lc-line "<p") 0)
(> (html-string-count-substring lc-line "<a") 0)
(> (html-string-count-substring lc-line "<img") 0)
(> (html-string-count-substring lc-line "<ul") 0)
(> (html-string-count-substring lc-line "<ol") 0)
(> (html-string-count-substring lc-line "<li") 0)
(> (html-string-count-substring lc-line "<table") 0)
(> (html-string-count-substring lc-line "<tr") 0)
(> (html-string-count-substring lc-line "<td") 0)
(> (html-string-count-substring lc-line "<th") 0)
(> (html-string-count-substring lc-line "<h1") 0)
(> (html-string-count-substring lc-line "<h2") 0)
(> (html-string-count-substring lc-line "<h3") 0)
(> (html-string-count-substring lc-line "<h4") 0)
(> (html-string-count-substring lc-line "<h5") 0)
(> (html-string-count-substring lc-line "<h6") 0)
(> (html-string-count-substring lc-line "</div") 0)
(> (html-string-count-substring lc-line "</span") 0)
(> (html-string-count-substring lc-line "</p") 0)
(> (html-string-count-substring lc-line "</a") 0)
(> (html-string-count-substring lc-line "/>") 0)
(> (html-string-count-substring lc-line "<!doctype") 0)
(> (html-string-count-substring lc-line "<?xml") 0))))
;; Helper: check whether STR has PREFIX starting at position POS
(define (string-prefix-at? str prefix pos)
(let* ((len-str (string-length str))
(len-pre (string-length prefix))
(end (+ pos len-pre)))
(and (<= end len-str)
(string=? (substring str pos end) prefix))))
;; 这一行文本是否包含html标签
(define (html-line-contains-features? line)
(let* ((lc-line (string-downcase line))
(len (string-length lc-line)))
(let loop ((i 0))
(cond
((>= i len) #f)
(else
(let ((c (string-ref lc-line i)))
(cond
;; Check for patterns starting with '<'
((char=? c #\<)
(if (or (string-prefix-at? lc-line "<div" i)
(string-prefix-at? lc-line "<span" i)
(string-prefix-at? lc-line "<p" i)
(string-prefix-at? lc-line "<a" i)
(string-prefix-at? lc-line "<img" i)
(string-prefix-at? lc-line "<ul" i)
(string-prefix-at? lc-line "<ol" i)
(string-prefix-at? lc-line "<li" i)
(string-prefix-at? lc-line "<table" i)
(string-prefix-at? lc-line "<tr" i)
(string-prefix-at? lc-line "<td" i)
(string-prefix-at? lc-line "<th" i)
(string-prefix-at? lc-line "<h1" i)
(string-prefix-at? lc-line "<h2" i)
(string-prefix-at? lc-line "<h3" i)
(string-prefix-at? lc-line "<h4" i)
(string-prefix-at? lc-line "<h5" i)
(string-prefix-at? lc-line "<h6" i)
(string-prefix-at? lc-line "</div" i)
(string-prefix-at? lc-line "</span" i)
(string-prefix-at? lc-line "</p" i)
(string-prefix-at? lc-line "</a" i)
(string-prefix-at? lc-line "<!doctype" i)
(string-prefix-at? lc-line "<?xml" i))
#t
(loop (+ i 1))))
;; Preserve detection of "/>" anywhere in the line
((and (char=? c #\/)
(< (+ i 1) len)
(char=? (string-ref lc-line (+ i 1)) #\>))
#t)
(else
(loop (+ i 1)))))))))

Copilot uses AI. Check for mistakes.

;; 计算存在html特征的行的含量
(define (html-feature-line-density s)
(let ((lines (html-string-split-lines s)))
(if (null? lines)
0
(let loop ((remaining lines)
(count 0)
(total 0))
(if (null? remaining)
(if (> total 0) (/ count total) 0)
(let ((line (car remaining)))
(loop (cdr remaining)
(if (html-line-contains-features? line) (+ count 1) count)
(+ total 1))))))))

;; 计算div标签的平衡性
(define (html-structure-balanced? s)
(let* ((lc-s (string-downcase s))
(open-tags (html-string-count-substring lc-s "<div"))
(close-tags (html-string-count-substring lc-s "</div")))
;; div 的开标签与闭标签数量差小于2
(and (> open-tags 0) (> close-tags 0) (<= (abs (- open-tags close-tags)) 2))))

;; 短字符串的特殊检测
(define (determine-short-html-string s)
(let* ((len (string-length s)))
(cond
((or
(and (> (charactor-from-string s #\<) 0)
(> (charactor-from-string s #\>) 0)
(> (html-string-count-substring s "</") 0))
(> (html-string-count-substring (string-downcase s) "class=") 0)
(> (html-string-count-substring (string-downcase s) "id=") 0)
(> (html-string-count-substring (string-downcase s) "style=") 0)
(> (html-string-count-substring (string-downcase s) "href=") 0)
(> (html-string-count-substring (string-downcase s) "src=") 0))
#t)
((>= (html-angle-bracket-density s) 0.03) #t)
(else #f))))

(define (is-short-html-string? s)
(if (<= (string-length s) 100)
(determine-short-html-string s)
#f))

(define (is-html-string? s)
Copy link

Copilot AI Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inconsistent indentation. This line has a leading space before the opening parenthesis, while all other function definitions in the file start at column 1. Remove the leading space for consistency.

Suggested change
(define (is-html-string? s)
(define (is-html-string? s)

Copilot uses AI. Check for mistakes.
(let* ((angle-density (html-angle-bracket-density s))
(tag-density (html-tag-density s))
(attr-density (html-attribute-density s))
(feature-line-density (html-feature-line-density s))
(balanced? (html-structure-balanced? s)))
(cond
;; High confidence: clear HTML structure
;; < > 含量,标签含量,特征行含量
((and (>= angle-density 0.02)
(>= tag-density 0.01)
(>= feature-line-density 0.25))
#t)
;; Medium confidence: good angle bracket density with either tags or attributes
;;
((and (>= angle-density 0.015)
(or (>= tag-density 0.005)
(>= attr-density 0.01))
(>= feature-line-density 0.15))
#t)
;; Lower confidence: balanced structure with some HTML features
((and balanced?
(>= angle-density 0.01)
(>= feature-line-density 0.10))
#t)
;; Very high angle bracket density (likely HTML/XML)
((>= angle-density 0.03) #t)
(else #f))))

(define (html-recognizes-at? s pos)
(set! pos (format-skip-spaces s pos))
(cond ((format-test? s pos "<html") #t)
Expand All @@ -26,10 +240,37 @@
((format-test? s pos "<!doctype html") #t)
((format-test? s pos "<math") #t)
((format-test? s pos "<table") #t)
((format-test? s pos "<p>") #t)
((format-test? s pos "<div") #t)
((format-test? s pos "<span") #t)
((format-test? s pos "<a ") #t)
((format-test? s pos "<img") #t)
((format-test? s pos "<ul") #t)
((format-test? s pos "<ol") #t)
((format-test? s pos "<li") #t)
((format-test? s pos "<h1") #t)
((format-test? s pos "<h2") #t)
((format-test? s pos "<h3") #t)
((format-test? s pos "<h4") #t)
((format-test? s pos "<h5") #t)
((format-test? s pos "<h6") #t)
((format-test? s pos "<form") #t)
((format-test? s pos "<input") #t)
((format-test? s pos "<button") #t)
((format-test? s pos "<textarea") #t)
((format-test? s pos "<select") #t)
((format-test? s pos "<option") #t)
((format-test? s pos "<style") #t)
((format-test? s pos "<script") #t)
((format-test? s pos "<meta") #t)
((format-test? s pos "<link") #t)
((format-test? s pos "<!--") #t)
((format-test? s pos "<?xml ")
(html-recognizes-at? s (format-skip-line s pos)))
((format-test? s pos "<!doctype ")
(html-recognizes-at? s (format-skip-line s pos)))
((is-short-html-string? s) #t)
((is-html-string? s) #t)
(else #f)))

(define (html-recognizes? s)
Expand Down
Loading