From d181905784f1130ef601f1a80a7f5b8065a4404a Mon Sep 17 00:00:00 2001 From: dhruvil410 Date: Fri, 19 Mar 2021 15:57:36 +0530 Subject: [PATCH 1/2] fix #60 --- src/sentences/sentence_splitting.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentences/sentence_splitting.jl b/src/sentences/sentence_splitting.jl index 0f58147..4ff84d5 100644 --- a/src/sentences/sentence_splitting.jl +++ b/src/sentences/sentence_splitting.jl @@ -120,7 +120,8 @@ function postproc_splits(sentences::AbstractString) sentences = replace(sentences, r"(\bMs\.)\n" => s"\1 ") sentences = replace(sentences, r"(\bMrs\.)\n" => s"\1 ") - + # no sentence break in between two words with no punctuation + sentences=replace(sentences,r"([a-zA-Z0-9])\n([a-zA-Z0-9])"=>s"\1 \2") # possible TODO: filter excessively long / short sentences From 8e48cec8accdf608e0169829b38a44fba5eb86c0 Mon Sep 17 00:00:00 2001 From: dhruvil410 Date: Sat, 10 Apr 2021 18:03:01 +0530 Subject: [PATCH 2/2] made optional collapse_newlines and added test for that --- src/sentences/sentence_splitting.jl | 10 ++++++---- src/set_method_api.jl | 2 +- test/sentence_splitting.jl | 13 +++++++++++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/sentences/sentence_splitting.jl b/src/sentences/sentence_splitting.jl index 4ff84d5..c6489a3 100644 --- a/src/sentences/sentence_splitting.jl +++ b/src/sentences/sentence_splitting.jl @@ -1,7 +1,7 @@ -function rulebased_split_sentences(sentences) +function rulebased_split_sentences(sentences;collapse_newlines::Bool=false) sentences = replace(sentences, r"([?!.])\s" => Base.SubstitutionString("\\1\n")) - sentences = postproc_splits(sentences) + sentences = postproc_splits(sentences,collapse_newlines) split(sentences, "\n") end @@ -34,7 +34,7 @@ Which draws in part on heuristics included in Yoshimasa Tsuruoka's medss.pl script. """ -function postproc_splits(sentences::AbstractString) +function postproc_splits(sentences::AbstractString,collapse_newlines) # Before we do anything remove windows line-ends sentences = replace(sentences, "\r" => "") @@ -121,7 +121,9 @@ function postproc_splits(sentences::AbstractString) sentences = replace(sentences, r"(\bMrs\.)\n" => s"\1 ") # no sentence break in between two words with no punctuation - sentences=replace(sentences,r"([a-zA-Z0-9])\n([a-zA-Z0-9])"=>s"\1 \2") + if collapse_newlines==true + sentences=replace(sentences,r"([a-zA-Z0-9])\n([a-zA-Z0-9])"=>s"\1 \2") + end # possible TODO: filter excessively long / short sentences diff --git a/src/set_method_api.jl b/src/set_method_api.jl index d0d7b4f..8b9b26c 100644 --- a/src/set_method_api.jl +++ b/src/set_method_api.jl @@ -22,7 +22,7 @@ Calling this will trigger recompilation of any functions that use `split_sentenc Calling `set_sentence_splitter` will give method overwritten warnings. They are expected, be worried if they do not occur """ function set_sentence_splitter(fun) - @eval split_sentences(str::AbstractString) = $(fun)(str) + @eval split_sentences(str::AbstractString;collapse_newlines::Bool=false) = $(fun)(str;collapse_newlines) end diff --git a/test/sentence_splitting.jl b/test/sentence_splitting.jl index d3b860d..3063a41 100644 --- a/test/sentence_splitting.jl +++ b/test/sentence_splitting.jl @@ -87,3 +87,16 @@ end And sometimes sentences can start with non-capitalized words. i is a good variable name.""") end + +@testset "collapse_newlines" begin + @test length(rulebased_split_sentences(""" + In this article, we present a language-independent, unsupervised approach to sentence boundary + detection. It is based on the assumption that a large number of ambiguities in the determination + of sentence boundaries can be eliminated once abbreviations have been identified. Instead of + relying on orthographic clues, the proposed system is able to detect abbreviations with high + accuracy using three criteria that only require information about the candidate type itself and + are independent of context: Abbreviations can be defined as a very tight collocation consisting + of a truncated word and a final period, abbreviations are usually short, and abbreviations + sometimes contain internal periods.""",collapse_newlines=true))==3 +end +