
Commit efdc7dd

Merge pull request #162 from fmacpro/develop
Release 1.2.1: readability metrics, multilingual dictionaries, and enhanced summarization

2 parents: 654804d + 7af33fd

23 files changed (+629, −34 lines)

APIDOC.md

Lines changed: 3 additions & 1 deletion

@@ -27,7 +27,9 @@
 main article parser module export function
 
 **Kind**: global function
-**Returns**: <code>Object</code> - article parser results object
+**Returns**: <code>Object</code> - article parser results object. Includes `text.summary` and
+`text.sentences` when `options.enabled` contains `'summary'`. Also exposes
+`language` with ISO-639-1 and ISO-639-3 codes when detection succeeds. Includes `readability` with estimated reading time and basic text statistics when `options.enabled` contains 'readability'.
 
 | Param | Type | Description |
 | --- | --- | --- |

README.md

Lines changed: 27 additions & 1 deletion

@@ -1,6 +1,6 @@
 # Horseman Article Parser
 
-Horseman is a focused article scraping module for the open web. It loads pages (dynamic or AMP), detects the main story body, and returns clean, structured content ready for downstream use. Alongside text and title, it includes in-article links, metadata, sentiment, keywords/keyphrases, named entities, optional spelling suggestions, site icon, and Lighthouse signals. It also copes with live blogs, applies simple per-domain tweaks (headers/cookies/goto), and uses Puppeteer + stealth to reduce blocking.
+Horseman is a focused article scraping module for the open web. It loads pages (dynamic or AMP), detects the main story body, and returns clean, structured content ready for downstream use. Alongside text and title, it includes in-article links, metadata, sentiment, keywords/keyphrases, named entities, optional summaries, optional spelling suggestions, readability metrics and basic counts (characters, words, sentences, paragraphs), site icon, and Lighthouse signals. It also copes with live blogs, applies simple per-domain tweaks (headers/cookies/goto), and uses Puppeteer + stealth to reduce blocking. The parser now detects the article language and exposes ISO codes, with best-effort support for non-English content (features may fall back to English dictionaries when specific resources are missing).
 
 ## Table of Contents
 

@@ -51,6 +51,8 @@ const options = {
     "entities",
     "spelling",
     "keywords",
+    "summary",
+    "readability",
   ],
 };
 

@@ -72,10 +74,20 @@
   people: article.people,
   orgs: article.orgs,
   places: article.places,
+  language: article.language,
+  readability: {
+    readingTime: article.readability.readingTime,
+    characters: article.readability.characters,
+    words: article.readability.words,
+    sentences: article.readability.sentences,
+    paragraphs: article.readability.paragraphs,
+  },
   text: {
     raw: article.processed.text.raw,
     formatted: article.processed.text.formatted,
     html: article.processed.text.html,
+    summary: article.processed.text.summary,
+    sentences: article.processed.text.sentences,
   },
   spelling: article.spelling,
   meta: article.meta,

@@ -196,9 +208,15 @@ var options = {
     "entities",
     "spelling",
     "keywords",
+    "summary",
+    "readability",
   ],
 };
 ```
+Add "summary" to `options.enabled` to generate a short summary of the article text. The result
+includes `text.summary` and a `text.sentences` array containing the first five sentences.
+
+Add "readability" to `options.enabled` to evaluate readability, estimate reading time, and gather basic text statistics. The result is available as `article.readability` with `readingTime` (seconds), `characters`, `words`, `sentences`, and `paragraphs`.
 
 You may pass rules for returning an article's title & contents. This is useful in cases
 where the parser is unable to return the desired title or content, e.g.

@@ -317,6 +335,8 @@ const options = {
     "entities",
     "spelling",
     "keywords",
+    "summary",
+    "readability",
   ],
   // Optional: tweak spelling output/filters
   retextspell: {

@@ -366,6 +386,10 @@ contentDetection: {
 }
 ```
 
+### Language Detection
+
+Horseman automatically detects the article language and exposes ISO codes via `article.language` in the result. Downstream steps such as keyword extraction or spelling use these codes to select language-specific resources when available. Dictionaries for English, French, and Spanish are bundled; other languages fall back to English if a matching dictionary or NLP plugin is not found.
+
 ## Development
 
 Please feel free to fork the repo or open pull requests to the development branch. I've used [eslint](https://eslint.org/) for linting.

@@ -558,6 +582,8 @@ npm run docs
 - [retext-pos](https://github.com/retextjs/retext-pos): Plugin to add part-of-speech (POS) tags
 - [retext-keywords](https://ghub.io/retext-keywords): Keyword extraction with Retext
 - [retext-spell](https://ghub.io/retext-spell): Spelling checker for retext
+- [retext-language](https://ghub.io/retext-language): Language detection for retext
+- [franc](https://ghub.io/franc): Fast language detection from text
 - [sentiment](https://ghub.io/sentiment): AFINN-based sentiment analysis for Node.js
 - [jquery](https://ghub.io/jquery): JavaScript library for DOM operations
 - [jsdom](https://ghub.io/jsdom): A JavaScript implementation of many web standards
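The feature switches documented above can be reasoned about in isolation. A minimal sketch, assuming the `options.enabled` shape from the README; the `featureEnabled` helper is hypothetical, not part of the module:

```javascript
// Sketch: a trimmed version of the README options object with the
// new 'summary' and 'readability' switches enabled.
const options = {
  enabled: [
    'entities',
    'spelling',
    'keywords',
    'summary',
    'readability'
    // ...plus any other features listed in the README
  ]
}

// Hypothetical helper: decide whether a feature should run.
function featureEnabled (options, name) {
  return Array.isArray(options.enabled) && options.enabled.includes(name)
}

const summaryOn = featureEnabled(options, 'summary')
const readabilityOn = featureEnabled(options, 'readability')
```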

controllers/entityParser.js

Lines changed: 27 additions & 8 deletions

@@ -5,25 +5,28 @@ export function normalizeEntity (w) {
   if (typeof w !== 'string') return ''
   return w
     .replace(/[']/g, '')
-    .replace(/[^A-Za-z0-9]+/g, ' ')
+    .replace(/[^A-Za-z0-9-]+/g, ' ')
     .trim()
     .toLowerCase()
 }
 
 export default function entityParser (nlpInput, pluginHints = { first: [], last: [] }, timeLeft = () => Infinity) {
+  const doc = nlp(nlpInput)
   const entityToString = (e) => {
     if (Array.isArray(e?.terms) && e.terms.length) {
       const parts = []
-      for (const term of e.terms) {
+      for (let i = 0; i < e.terms.length; i++) {
+        const term = e.terms[i]
         let text = String(term.text || '').trim()
         if (!text) continue
         if (/^[']s$/i.test(text) && parts.length) {
           parts[parts.length - 1] += "'s"
         } else {
-          parts.push(text)
+          const isHyphen = typeof term.post === 'string' && term.post.trim() === '-' && i < e.terms.length - 1
+          parts.push(isHyphen ? text + '-' : text)
         }
       }
-      return parts.join(' ').trim()
+      return parts.join(' ').replace(/- /g, '-').trim()
     }
     if (typeof e?.text === 'string') return e.text.trim()
     return null

@@ -45,7 +48,23 @@
   }
 
   const result = {}
-  result.people = dedupeEntities(nlp(nlpInput).people().json().map(entityToString), true)
+  // use compromise's richer person parsing to split name parts
+  doc.people().parse()
+  result.people = dedupeEntities(
+    doc.people().json().map(p => {
+      const text = entityToString(p)
+      if (p.person && (p.person.honorific || p.person.firstName || p.person.middleName || p.person.lastName)) {
+        const parts = [p.person.honorific, p.person.firstName, p.person.middleName, p.person.lastName]
+          .filter(Boolean)
+          .map(capitalizeFirstLetter)
+        const joined = parts.join(' ')
+        // preserve hyphenated names using original text
+        return /-/.test(p.text) ? text : joined
+      }
+      return text
+    }),
+    true
+  )
   const seen = new Set(result.people.map(p => normalizeEntity(p)))
   if (pluginHints.first.length && pluginHints.last.length) {
     const haystack = normalizeEntity(nlpInput)

@@ -61,8 +80,8 @@
     }
   }
   result.people = dedupeEntities(result.people, true)
-  if (timeLeft() >= 1000) result.places = dedupeEntities(nlp(nlpInput).places().json().map(entityToString))
-  if (timeLeft() >= 900) result.orgs = dedupeEntities(nlp(nlpInput).organizations().json().map(entityToString))
-  if (timeLeft() >= 800) result.topics = dedupeEntities(nlp(nlpInput).topics().json().map(entityToString))
+  if (timeLeft() >= 1000) result.places = dedupeEntities(doc.places().json().map(entityToString))
+  if (timeLeft() >= 900) result.orgs = dedupeEntities(doc.organizations().json().map(entityToString))
+  if (timeLeft() >= 800) result.topics = dedupeEntities(doc.topics().json().map(entityToString))
   return result
 }
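The hyphen-aware term join introduced in this diff can be exercised on its own. This is a standalone re-implementation of that loop, applied to mock compromise-style term objects (`{ text, post }`), not the module itself:

```javascript
// Sketch: join term texts, marking a hyphen join when a term's trailing
// separator (post) is '-', then collapsing 'x- y' into 'x-y'.
function joinTerms (terms) {
  const parts = []
  for (let i = 0; i < terms.length; i++) {
    const text = String(terms[i].text || '').trim()
    if (!text) continue
    const post = terms[i].post
    // hyphen join only applies when another term follows
    const isHyphen = typeof post === 'string' && post.trim() === '-' && i < terms.length - 1
    parts.push(isHyphen ? text + '-' : text)
  }
  return parts.join(' ').replace(/- /g, '-').trim()
}

const name = joinTerms([
  { text: 'Jean', post: '-' },
  { text: 'Luc', post: ' ' },
  { text: 'Picard', post: '' }
])
// name === 'Jean-Luc Picard'
```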

controllers/keywordParser.js

Lines changed: 6 additions & 1 deletion

@@ -2,11 +2,16 @@ import { retext } from 'retext'
 import { toString as nlcstToString } from 'nlcst-to-string'
 import pos from 'retext-pos'
 import keywords from 'retext-keywords'
+import language from 'retext-language'
 import _ from 'lodash'
 import { capitalizeFirstLetter, stripPossessive } from '../helpers.js'
 
 export default async function keywordParser (html, options = { maximum: 10 }) {
-  const file = await retext().use(pos).use(keywords, options).process(html)
+  const { lang, ...rest } = options || {}
+  const processor = retext()
+  if (lang) processor.use(language, { language: lang })
+  processor.use(pos).use(keywords, rest)
+  const file = await processor.process(html)
 
   const keywordsArr = file.data.keywords.map(keyword => ({
     keyword: capitalizeFirstLetter(stripPossessive(nlcstToString(keyword.matches[0].node))),

controllers/language.js

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+import { retext } from 'retext'
+import retextLanguage from 'retext-language'
+import { franc } from 'franc'
+
+// Minimal ISO-639-3 to ISO-639-1 mapping for common languages
+const ISO3_TO_1 = {
+  afr: 'af', ara: 'ar', ben: 'bn', bul: 'bg', cat: 'ca', ces: 'cs', dan: 'da',
+  deu: 'de', ell: 'el', eng: 'en', est: 'et', eus: 'eu', fin: 'fi', fra: 'fr',
+  heb: 'he', hin: 'hi', hrv: 'hr', hun: 'hu', ind: 'id', ita: 'it', jpn: 'ja',
+  kor: 'ko', lit: 'lt', lav: 'lv', nld: 'nl', pol: 'pl', por: 'pt', ron: 'ro',
+  rus: 'ru', slk: 'sk', slv: 'sl', spa: 'es', srp: 'sr', swe: 'sv', tam: 'ta',
+  tel: 'te', tha: 'th', tur: 'tr', ukr: 'uk', urd: 'ur', vie: 'vi', zho: 'zh'
+}
+
+function iso3to1(code) {
+  return ISO3_TO_1[code] || null
+}
+
+/**
+ * Detect language of provided text.
+ * Returns ISO-639-1 and ISO-639-3 codes.
+ * Defaults to English if detection fails.
+ * @param {string} text raw text input
+ * @returns {{iso6391: string, iso6393: string}}
+ */
+export default async function detectLanguage(text) {
+  let iso6393 = 'eng'
+  if (typeof text === 'string' && text.trim()) {
+    try {
+      const file = await retext().use(retextLanguage).process(text)
+      if (file.data && file.data.language && file.data.language !== 'und') {
+        iso6393 = file.data.language
+      } else {
+        const f = franc(text)
+        if (f && f !== 'und') iso6393 = f
+      }
+    } catch {
+      try {
+        const f = franc(text)
+        if (f && f !== 'und') iso6393 = f
+      } catch {}
+    }
+  }
+  const iso6391 = iso3to1(iso6393) || 'en'
+  return { iso6391, iso6393 }
+}
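The final fallback step of `detectLanguage` is pure and easy to check standalone. A sketch with a subset of the ISO table above (the full mapping lives in controllers/language.js):

```javascript
// Subset of the ISO-639-3 → ISO-639-1 table for illustration.
const ISO3_TO_1 = { eng: 'en', fra: 'fr', spa: 'es', deu: 'de' }

function iso3to1 (code) {
  return ISO3_TO_1[code] || null
}

// Mirrors the last line of detectLanguage: any code without a
// two-letter mapping falls back to 'en'.
function toIso1 (iso6393) {
  return iso3to1(iso6393) || 'en'
}
```

This keeps `article.language.iso6391` always populated even for languages the table does not cover, matching the "defaults to English" behaviour documented in the JSDoc.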

controllers/readability.js

Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+/**
+ * Evaluate basic readability statistics and estimate reading time.
+ * Returns an estimated reading time in seconds (assuming ~200 wpm) and
+ * basic document statistics (characters, words, sentences, paragraphs).
+ *
+ * @param {string} text raw text input
+ * @returns {{readingTime: number, characters: number, words: number, sentences: number, paragraphs: number}}
+ */
+export default async function checkReadability (text) {
+  if (!text || typeof text !== 'string') return { readingTime: 0, characters: 0, words: 0, sentences: 0, paragraphs: 0 }
+  const trimmed = text.trim()
+  const characters = trimmed.length
+  const words = trimmed.split(/\s+/).filter(Boolean).length
+  const sentences = trimmed.split(/[.!?]+/).filter(s => s.trim().length > 0).length
+  const paragraphs = trimmed.split(/\r?\n+/).filter(p => p.trim().length > 0).length
+  const readingTime = Math.round((words / 200) * 60)
+  return { readingTime, characters, words, sentences, paragraphs }
+}
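Since the statistics above are computed with plain string operations, they can be re-implemented and run without the module. A synchronous sketch of the same logic:

```javascript
// Standalone re-implementation of the counting in controllers/readability.js.
function readabilityStats (text) {
  if (!text || typeof text !== 'string') {
    return { readingTime: 0, characters: 0, words: 0, sentences: 0, paragraphs: 0 }
  }
  const trimmed = text.trim()
  const characters = trimmed.length
  // words: runs of non-whitespace
  const words = trimmed.split(/\s+/).filter(Boolean).length
  // sentences: segments between terminal punctuation
  const sentences = trimmed.split(/[.!?]+/).filter(s => s.trim().length > 0).length
  // paragraphs: non-empty lines separated by newlines
  const paragraphs = trimmed.split(/\r?\n+/).filter(p => p.trim().length > 0).length
  // ~200 words per minute, expressed in seconds
  const readingTime = Math.round((words / 200) * 60)
  return { readingTime, characters, words, sentences, paragraphs }
}

const stats = readabilityStats('One two three. Four five!\nSix seven.')
// stats → { readingTime: 2, characters: 36, words: 7, sentences: 3, paragraphs: 2 }
```

Note these are heuristics: abbreviations like "e.g." inflate the sentence count, which is acceptable for a rough reading-time estimate.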

controllers/spellCheck.js

Lines changed: 2 additions & 2 deletions

@@ -11,8 +11,8 @@ export default async function spellCheck (text, options) {
   input = input.replace(/\b[\w-]+(?:\.[\w-]+)+(?:\/\S*)?/gi, ' ')
   // remove alphanumeric tokens like 123abc
   input = input.replace(/[0-9]{1,}[a-zA-Z]{1,}/gi, ' ')
-  // collapse whitespace
-  input = input.replace(/\s+/g, ' ').trim()
+  // collapse spaces but preserve line breaks for accurate line numbers
+  input = input.replace(/\r\n/g, '\n').replace(/[ \t]+/g, ' ')
 
   if (typeof options === 'undefined') {
     options = { dictionary }
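The whitespace change above is subtle: collapsing with `/\s+/` destroyed newlines, which shifted the line numbers the spell checker reports. A minimal sketch of the new behaviour in isolation:

```javascript
// Collapse runs of spaces/tabs but keep line breaks intact,
// so positions reported per line remain accurate.
function collapsePreservingNewlines (input) {
  return input.replace(/\r\n/g, '\n').replace(/[ \t]+/g, ' ')
}

const cleaned = collapsePreservingNewlines('foo   bar\r\nbaz\tqux')
// cleaned === 'foo bar\nbaz qux'
```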

controllers/summary.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
export function buildSummary (text) {
2+
if (!text || typeof text !== 'string') return { text: '', sentences: [] }
3+
const sentences = text.match(/[^.!?]+[.!?]/g) || [text]
4+
const top = sentences.slice(0, 5).map(s => s.trim())
5+
return { text: top.join(' ').trim(), sentences: top }
6+
}
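`buildSummary` is a pure function, so it can be copied out of the module and run directly. The same body, exercised on a six-sentence input:

```javascript
// Copy of controllers/summary.js so it runs standalone.
function buildSummary (text) {
  if (!text || typeof text !== 'string') return { text: '', sentences: [] }
  // split on terminal punctuation, keeping the punctuation with each sentence
  const sentences = text.match(/[^.!?]+[.!?]/g) || [text]
  // keep only the first five sentences, trimmed
  const top = sentences.slice(0, 5).map(s => s.trim())
  return { text: top.join(' ').trim(), sentences: top }
}

const summary = buildSummary('First. Second! Third? Fourth. Fifth. Sixth.')
// summary.sentences → ['First.', 'Second!', 'Third?', 'Fourth.', 'Fifth.']
```

Note the fallback `|| [text]`: input without terminal punctuation becomes a single-element sentence list rather than an empty summary.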

controllers/textProcessing.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ export function getRawText (html) {
1313
unorderedListItemPrefix: ''
1414
}
1515
let rawText = htmlToText(html, options)
16-
rawText = nlp(rawText).normalize().out('text')
16+
rawText = nlp(rawText).out('text')
1717
const containsUrlLike = (s) => {
1818
if (!s) return false
1919
const str = String(s)

controllers/titleDetector.js

Lines changed: 3 additions & 1 deletion

@@ -12,7 +12,9 @@ function normalizeTitle(title) {
   if (!title) return null
   let t = String(title).replace(/(\r\n|\n|\r)/gm, ' ').replace(/\s+/g, ' ').trim()
   // remove common site suffixes after delimiters
-  t = t.replace(/\s*[|\-:·»]\s*[^|\-:·»]{2,}\s*$/u, () => '')
+  t = t
+    .replace(/\s*[|:·»]\s*[^|:·»-]{2,}\s*$/u, '')
+    .replace(/\s+-\s+[^|:·»-]{2,}\s*$/u, '')
   return t.trim() || null
 }
 
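The point of splitting the old single regex into two is that hyphens now only count as a site-suffix delimiter when padded by spaces, so hyphenated words inside titles survive. A sketch of just that stripping step:

```javascript
// Strip trailing '| Site', ': Site', '· Site', '» Site' suffixes, then
// ' - Site' suffixes, without touching in-word hyphens.
function stripSiteSuffix (title) {
  return title
    .replace(/\s*[|:·»]\s*[^|:·»-]{2,}\s*$/u, '')
    .replace(/\s+-\s+[^|:·»-]{2,}\s*$/u, '')
    .trim()
}

const a = stripSiteSuffix('Big Story | Example News')
// a === 'Big Story'
const b = stripSiteSuffix('Well-Known Name Speaks')
// b is unchanged: the in-word hyphen is not space-padded
```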
