Skip to content

Commit 6badafe

Browse files
committed
Squashed commit of the following:
commit 61cf5de: entering 2026. Added a (tricky, hidden) option --keep-spaces-inside-quotes. It offers limited support for keeping quoted values together: "One quote", or ' keep this alive '
1 parent a027041 commit 6badafe

File tree

10 files changed

+85
-19
lines changed

10 files changed

+85
-19
lines changed

.github/workflows/ucto.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ jobs:
3636
strategy:
3737
matrix:
3838
os: [ubuntu-latest, macos-latest]
39-
compiler: [g++-12, clang++ -std=c++17]
39+
compiler: [g++ -std=c++17, clang++ -std=c++17]
4040

4141
steps:
4242
- uses: actions/[email protected]

codemeta.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"@type": "SoftwareSourceCode",
99
"identifier": "ucto",
1010
"name": "ucto",
11-
"version": "0.35",
11+
"version": "0.36",
1212
"description": "Ucto tokenizes text files: it separates words from punctuation, and splits sentences. This is one of the first tasks for almost any Natural Language Processing application. Ucto offers several other basic preprocessing steps such as changing case that you can all use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation.",
1313
"license": "https://spdx.org/licenses/GPL-3.0-only",
1414
"url": "https://languagemachines.github.io/ucto",

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Process this file with autoconf to produce a configure script.
33

44
AC_PREREQ([2.69])
5-
AC_INIT([ucto],[0.35],[[email protected]]) #adapt version number in codemeta.json as well
5+
AC_INIT([ucto],[0.36],[[email protected]]) #adapt version number in codemeta.json as well
66
AM_INIT_AUTOMAKE([foreign])
77
AC_CONFIG_SRCDIR([configure.ac])
88
AC_CONFIG_MACRO_DIR([m4])

include/ucto/my_textcat.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2024
2+
Copyright (c) 2026
33
CLST - Radboud University
44
ILK - Tilburg University
55

include/ucto/setting.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2006 - 2024
2+
Copyright (c) 2006 - 2026
33
CLST - Radboud University
44
ILK - Tilburg University
55

include/ucto/tokenize.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2006 - 2024
2+
Copyright (c) 2006 - 2026
33
CLST - Radboud University
44
ILK - Tilburg University
55
@@ -45,6 +45,7 @@ namespace Tokenizer {
4545

4646
const std::string Version();
4747
const std::string VersionName();
48+
extern bool keep_quoted_spaces;
4849

4950
enum TokenRole {
5051
NOROLE = 0,
@@ -99,8 +100,8 @@ namespace Tokenizer {
99100
const UnicodeString&,
100101
const std::string& = "" );
101102
std::string lang_code; // ISO 639-3 language code
102-
std::string texttostring();
103-
std::string typetostring();
103+
std::string texttostring() const;
104+
std::string typetostring() const ;
104105
};
105106

106107
class TokenizerClass{
@@ -273,6 +274,10 @@ namespace Tokenizer {
273274
bool setUndLang( bool b ){ bool r = und_language; und_language = b; return r; };
274275
bool getUndLang(){ return und_language; };
275276

277+
// Switch the keep-spaces-inside-quotes mode on or off.
// Returns the value the flag had before this call.
// NOTE(review): presumably this is the namespace-level Tokenizer::keep_quoted_spaces
// declared 'extern' in this header, not per-object state -- confirm.
bool setKeepQuotedSpaces( bool b ){
  const bool previous = keep_quoted_spaces;
  keep_quoted_spaces = b;
  return previous;
};
279+
// Report whether the keep-spaces-inside-quotes mode is currently on.
bool getKeepQuotedSpaces() const {
  return keep_quoted_spaces;
};
280+
276281
const std::string& getInputClass( ) const { return inputclass; }
277282
const std::string setInputClass( const std::string& cls) {
278283
std::string res = inputclass;

src/my_textcat.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2024
2+
Copyright (c) 2026
33
CLST - Radboud University
44
55
This file is part of Ucto

src/setting.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2006 - 2024
2+
Copyright (c) 2006 - 2026
33
CLST - Radboud University
44
ILK - Tilburg University
55

src/tokenize.cxx

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2006 - 2024
2+
Copyright (c) 2006 - 2026
33
CLST - Radboud University
44
ILK - Tilburg University
55
@@ -76,6 +76,7 @@ namespace Tokenizer {
7676

7777
using namespace icu;
7878
using TiCC::operator<<;
79+
bool keep_quoted_spaces = false;
7980

8081
const UChar32 ZWJ = u'\u200D';
8182

@@ -154,21 +155,46 @@ namespace Tokenizer {
154155
const UnicodeString type_unknown = "UNKNOWN";
155156
const UnicodeString type_unanalyzed = "UNANALYZED";
156157

158+
// Return a copy of 'in' in which every 'Ž' is replaced by a plain space.
// The Token constructors call this when keep_quoted_spaces is set, to turn
// the 'Ž' placeholders written by replace_quoted_spaces() back into spaces.
// NOTE(review): a genuine 'Ž' that was already present in the text is
// converted to a space as well; the placeholder is not unique.
UnicodeString filter_ZCARON( const UnicodeString& in ){
  const UChar32 marker = U'Ž';
  const UChar32 blank = ' ';
  const int len = in.length();
  UnicodeString out;
  for ( int pos = 0; pos < len; ++pos ){
    // walk the string per UTF-16 code unit, exactly like indexing with []
    const UChar32 unit = in.charAt( pos );
    out += ( unit == marker ? blank : unit );
  }
  return out;
}
169+
157170
Token::Token( const UnicodeString& _type,
158171
const UnicodeString& _s,
159-
TokenRole _role, const string& _lang_code ):
160-
type(_type), us(_s), role(_role), lang_code(_lang_code) {
172+
TokenRole _role,
173+
const string& _lang_code ):
174+
type(_type), role(_role), lang_code(_lang_code) {
175+
if ( keep_quoted_spaces ){
176+
us = filter_ZCARON( _s );
177+
}
178+
else {
179+
us = _s;
180+
}
161181
}
162182

163183
Token::Token( const UnicodeString& _type,
164184
const UnicodeString& _s,
165185
const string& _lang_code ):
166-
type(_type), us(_s), role(NOROLE), lang_code(_lang_code) {
186+
type(_type), role(NOROLE), lang_code(_lang_code) {
187+
if ( keep_quoted_spaces ){
188+
us = filter_ZCARON( _s );
189+
}
190+
else {
191+
us = _s;
192+
}
167193
}
168194

169195

170-
std::string Token::texttostring() { return TiCC::UnicodeToUTF8(us); }
171-
std::string Token::typetostring() { return TiCC::UnicodeToUTF8(type); }
196+
std::string Token::texttostring() const { return TiCC::UnicodeToUTF8(us); }
197+
std::string Token::typetostring() const { return TiCC::UnicodeToUTF8(type); }
172198

173199
ostream& operator<< (std::ostream& os, const Token& t ){
174200
os << t.type << " : " << t.role << ": '" << t.us << "' (" << t.lang_code << ")";
@@ -2946,6 +2972,32 @@ namespace Tokenizer {
29462972
}
29472973
}
29482974

2975+
// Return a copy of 'in' in which every space that lies inside a quoted span
// is replaced by the placeholder 'Ž'.
//
// A quoted span starts at a '"' or '\'' seen while no quote is open and ends
// at the next occurrence of that SAME character. While a quote is open the
// other quote character is ordinary text, so  "it's fine"  and
// 'say "hi" now'  are each one span, and text after the closing quote is
// left alone.
//
// Limitations that are kept on purpose:
//  - an unterminated quote stays open until the end of the input
//  - an apostrophe met while no quote is open (as in: don't) opens a quote
UnicodeString replace_quoted_spaces( const UnicodeString& in ){
  const UChar32 no_quote = '\x0';
  UnicodeString result;
  UChar32 quote = no_quote; // the quote character that is currently open
  for ( int i=0; i < in.length(); ++i ){
    UChar32 c = in[i];
    if ( quote == no_quote ){
      if ( c == '"' || c == '\'' ){
        // opening quote
        quote = c;
      }
    }
    else if ( c == quote ){
      // the matching closing quote, reset
      quote = no_quote;
    }
    else if ( c == ' ' ){
      c = U'Ž'; // mark as Ž
    }
    result += c;
  }
  return result;
}
3000+
29493001
int TokenizerClass::internal_tokenize_line( const UnicodeString& originput,
29503002
const string& _lang ){
29513003
if ( originput.isBogus() ){ //only tokenize valid input
@@ -2971,6 +3023,9 @@ namespace Tokenizer {
29713023
<< originput << "] (language= " << lang << ")" << endl;
29723024
}
29733025
UnicodeString input = originput;
3026+
if ( keep_quoted_spaces ){
3027+
input = replace_quoted_spaces( input );
3028+
}
29743029
if ( doFilter ){
29753030
input = settings[lang]->filter.filter( input );
29763031
}
@@ -3290,7 +3345,7 @@ namespace Tokenizer {
32903345
else {
32913346
if ( tokDebug >= 4 ){
32923347
DBG << "\trecurse, match changes the type:"
3293-
<< assigned_type << " to " << type << endl;
3348+
<< assigned_type << " to " << type << endl;
32943349
}
32953350
TokenRole role = (space ? NOROLE : NOSPACE);
32963351
if ( paragraphsignal_next ){

src/ucto.cxx

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2006 - 2024
2+
Copyright (c) 2006 - 2026
33
CLST - Radboud University
44
ILK - Tilburg University
55
@@ -373,6 +373,11 @@ void runtime_opts::fill( TiCC::CL_Options& Opts ){
373373
pass_thru = Opts.extract( "passthru" );
374374
Opts.extract("normalize", norm_set_string );
375375
Opts.extract( "separators", separators );
376+
keep_quoted_spaces = Opts.extract( "keep-spaces-inside-quotes" );
377+
if ( keep_quoted_spaces && quotedetection ){
378+
throw TiCC::OptionError( "ucto: combining '--keep-spaces-inside-quotes' "
379+
"conflicts with '-Q'" );
380+
}
376381
if ( Opts.extract( 'x', docid ) ){
377382
throw TiCC::OptionError( "ucto: The option '-x ID' is removed. "
378383
"Please use '-X' and '--id=ID' instead" );
@@ -713,6 +718,7 @@ int main( int argc, char *argv[] ){
713718
"help,detectlanguages:,uselanguages:,"
714719
"textredundancy:,add-tokens:,split,"
715720
"allow-word-corrections,ignore-tag-hints,"
721+
"keep-spaces-inside-quotes,"
716722
"separators:");
717723
Opts.init(argc, argv );
718724
if ( Opts.extract( 'h' )
@@ -723,7 +729,7 @@ int main( int argc, char *argv[] ){
723729
if ( Opts.extract( 'V' ) ||
724730
Opts.extract( "version" ) ){
725731
cout << "Ucto - Unicode Tokenizer - version " << Version() << endl
726-
<< "(c) CLST 2015 - 2024, Centre for Language and Speech Technology, Radboud University Nijmegen" << endl
732+
<< "(c) CLST 2015 - 2026, Centre for Language and Speech Technology, Radboud University Nijmegen" << endl
727733
<< "(c) ILK 2009 - 2015, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl
728734
<< "Licensed under the GNU General Public License v3" << endl;
729735
cout << "based on [" << folia::VersionName() << "]" << endl;

0 commit comments

Comments
 (0)