From 961f724ca667bb93cb518b51ce9ee803518db4ce Mon Sep 17 00:00:00 2001 From: Philipp Burckhardt Date: Sat, 22 Feb 2025 16:59:05 -0500 Subject: [PATCH 1/2] fix: ensure `nlp/sentencize` handles punctuation in quotation marks properly --- type: pre_commit_static_analysis_report description: Results of running static analysis checks when committing changes. report: - task: lint_filenames status: passed - task: lint_editorconfig status: passed - task: lint_markdown status: na - task: lint_package_json status: na - task: lint_repl_help status: na - task: lint_javascript_src status: passed - task: lint_javascript_cli status: na - task: lint_javascript_examples status: na - task: lint_javascript_tests status: passed - task: lint_javascript_benchmarks status: na - task: lint_python status: na - task: lint_r status: na - task: lint_c_src status: na - task: lint_c_examples status: na - task: lint_c_benchmarks status: na - task: lint_c_tests_fixtures status: na - task: lint_shell status: na - task: lint_typescript_declarations status: na - task: lint_typescript_tests status: na - task: lint_license_headers status: passed --- --- .../@stdlib/nlp/sentencize/lib/main.js | 37 +++++++++++++++--- .../@stdlib/nlp/sentencize/test/test.js | 38 +++++++++++++++++++ 2 files changed, 69 insertions(+), 6 deletions(-) diff --git a/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js b/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js index 8cf46b32640f..e03dba300ab7 100644 --- a/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js +++ b/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js @@ -27,11 +27,13 @@ var trim = require( '@stdlib/string/base/trim' ); // VARIABLES // +var RE_LOWERCASE = /^[a-z]+$/; var RE_CAPITALIZED = /^[A-Z][a-z]{0,4}$/; var RE_CAPITALIZED_PERIOD = /^([A-Z]\.)*[A-Z]$/; var RE_NUMBER = /^[0-9]$/; var RE_PREFIXES = /^[{[(<:;"'”`]/; var RE_SUFFIXES = /[})\]>:;"'”`]$/; +var RE_QUOTES = /^["'`]$/; // FUNCTIONS // @@ -51,10 +53,38 @@ var RE_SUFFIXES = /[})\]>:;"'”`]$/; * @returns {boolean} boolean indicating whether the token at a specified index is an end-of-sentence token */ function isEndOfSentence( tokens, i ) { + var nextToken; var token; var im1 = i - 1; var ip1 = i + 1; + token = tokens[ i ]; + + // Handle quoted text with punctuation... + if ( + RE_QUOTES.test( token ) && + i > 0 && + ( tokens[ im1 ] === '.' || tokens[ im1 ] === '!' || tokens[ im1 ] === '?' ) + ) { + // Look ahead to see if sentence continues: + ip1 = i + 1; + if ( ip1 < tokens.length ) { + // Skip spaces... + while ( ip1 < tokens.length && tokens[ ip1 ] === ' ' ) { + ip1 += 1; + } + // If next non-space token is lowercase or certain punctuation, sentence continues: + if ( ip1 < tokens.length ) { + nextToken = tokens[ ip1 ]; + if ( RE_LOWERCASE.test( nextToken ) ) { + return false; + } + } + } + return true; + } + + // Regular sentence ending punctuation... if ( token === '.' && !RE_CAPITALIZED.test( tokens[ im1 ] ) && // for other short abbreviations and bullet points @@ -73,12 +103,6 @@ function isEndOfSentence( tokens, i ) { ) { return true; } - if ( - RE_SUFFIXES.test( token ) && - ( tokens[ im1 ] === '.' || tokens[ im1 ] === '!' || tokens[ im1 ] === '?' ) - ) { - return true; - } return false; } @@ -112,6 +136,7 @@ function sentencize( str ) { var tokens; var out; var i; + if ( !isString( str ) ) { throw new TypeError( 'invalid argument. Must provide a string. Value: `' + str + '`.' ); } diff --git a/lib/node_modules/@stdlib/nlp/sentencize/test/test.js b/lib/node_modules/@stdlib/nlp/sentencize/test/test.js index 304534773b85..95e641e770c0 100644 --- a/lib/node_modules/@stdlib/nlp/sentencize/test/test.js +++ b/lib/node_modules/@stdlib/nlp/sentencize/test/test.js @@ -317,3 +317,41 @@ tape( 'the function returns an empty array if provided an empty string', functio t.equal( out.length, 0, 'array length is zero' ); t.end(); }); + +tape( 'the function correctly handles punctuation within quotation marks', function test( t ) { + var expected; + var actual; + var str; + + str = 'I said "Look out" right before he banged his head.'; + expected = [ 'I said "Look out" right before he banged his head.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'keeps sentence with simple quotes together' ); + + str = 'I said "Look out!" right before he banged his head.'; + expected = [ 'I said "Look out!" right before he banged his head.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'keeps sentence with exclamation in quotes together' ); + + str = 'He asked "What time is it?" before leaving.'; + expected = [ 'He asked "What time is it?" before leaving.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'keeps sentence with question mark in quotes together' ); + + str = '"Stop!" he yelled. "We need to think about this."'; + expected = [ '"Stop!" he yelled.', '"We need to think about this."' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'correctly splits multiple quoted sentences' ); + + str = 'She said "This is great!" and smiled.'; + expected = [ 'She said "This is great!" and smiled.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'keeps sentence with exclamation in middle quotes together' ); + + str = '"Is this correct?" he wondered. "I think so!" she replied.'; + expected = [ '"Is this correct?" he wondered.', '"I think so!" she replied.' ]; + actual = sentencize( str ); + t.deepEqual( actual, expected, 'correctly handles multiple quoted sentences with different punctuation' ); + + t.end(); +}); From 876302ffdff51339ad33aeb58926593d2f1a151c Mon Sep 17 00:00:00 2001 From: Philipp Burckhardt Date: Sat, 22 Feb 2025 17:07:13 -0500 Subject: [PATCH 2/2] chore: update comment --- type: pre_commit_static_analysis_report description: Results of running static analysis checks when committing changes. report: - task: lint_filenames status: passed - task: lint_editorconfig status: passed - task: lint_markdown status: na - task: lint_package_json status: na - task: lint_repl_help status: na - task: lint_javascript_src status: passed - task: lint_javascript_cli status: na - task: lint_javascript_examples status: na - task: lint_javascript_tests status: na - task: lint_javascript_benchmarks status: na - task: lint_python status: na - task: lint_r status: na - task: lint_c_src status: na - task: lint_c_examples status: na - task: lint_c_benchmarks status: na - task: lint_c_tests_fixtures status: na - task: lint_shell status: na - task: lint_typescript_declarations status: na - task: lint_typescript_tests status: na - task: lint_license_headers status: passed --- --- type: pre_push_report description: Results of running various checks prior to pushing changes. report: - task: run_javascript_examples status: na - task: run_c_examples status: na - task: run_cpp_examples status: na - task: run_javascript_readme_examples status: na - task: run_c_benchmarks status: na - task: run_cpp_benchmarks status: na - task: run_fortran_benchmarks status: na - task: run_javascript_benchmarks status: na - task: run_julia_benchmarks status: na - task: run_python_benchmarks status: na - task: run_r_benchmarks status: na - task: run_javascript_tests status: na --- --- lib/node_modules/@stdlib/nlp/sentencize/lib/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js b/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js index e03dba300ab7..e41feaf0130a 100644 --- a/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js +++ b/lib/node_modules/@stdlib/nlp/sentencize/lib/main.js @@ -73,7 +73,7 @@ function isEndOfSentence( tokens, i ) { while ( ip1 < tokens.length && tokens[ ip1 ] === ' ' ) { ip1 += 1; } - // If next non-space token is lowercase or certain punctuation, sentence continues: + // If next non-space token is lowercase, we assume the sentence continues: if ( ip1 < tokens.length ) { nextToken = tokens[ ip1 ]; if ( RE_LOWERCASE.test( nextToken ) ) {