Squashed commit of the following:

kosloot · kosloot · commit 6badafe90845 · 2025-12-11T13:32:31.000+01:00
commit 61cf5de Entering 2026: added a (tricky, hidden) option --keep-spaces-inside-quotes. It allows limited use of quotes to keep quoted values together, e.g. "One quote" or ' keep this alive '.
diff --git a/.github/workflows/ucto.yml b/.github/workflows/ucto.yml
@@ -36,7 +36,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest]
-        compiler: [g++-12, clang++ -std=c++17]
+        compiler: [g++ -std=c++17, clang++ -std=c++17]
 
     steps:
       - uses: actions/checkout@v4.1.1
diff --git a/codemeta.json b/codemeta.json
@@ -8,7 +8,7 @@
     "@type": "SoftwareSourceCode",
     "identifier": "ucto",
     "name": "ucto",
-    "version": "0.35",
+    "version": "0.36",
     "description": "Ucto tokenizes text files: it separates words from punctuation, and splits sentences. This is one of the first tasks for almost any Natural Language Processing application. Ucto offers several other basic preprocessing steps such as changing case that you can all use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation.",
     "license": "https://spdx.org/licenses/GPL-3.0-only",
 	"url": "https://languagemachines.github.io/ucto",
diff --git a/configure.ac b/configure.ac
@@ -2,7 +2,7 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.69])
-AC_INIT([ucto],[0.35],[lamasoftware@science.ru.nl]) #adapt version number in codemeta.json as well
+AC_INIT([ucto],[0.36],[lamasoftware@science.ru.nl]) #adapt version number in codemeta.json as well
 AM_INIT_AUTOMAKE([foreign])
 AC_CONFIG_SRCDIR([configure.ac])
 AC_CONFIG_MACRO_DIR([m4])
diff --git a/include/ucto/my_textcat.h b/include/ucto/my_textcat.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2024
+  Copyright (c) 2026
   CLST - Radboud University
   ILK  - Tilburg University
 
diff --git a/include/ucto/setting.h b/include/ucto/setting.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2006 - 2024
+  Copyright (c) 2006 - 2026
   CLST - Radboud University
   ILK  - Tilburg University
 
diff --git a/include/ucto/tokenize.h b/include/ucto/tokenize.h
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2006 - 2024
+  Copyright (c) 2006 - 2026
   CLST - Radboud University
   ILK  - Tilburg University
 
@@ -45,6 +45,7 @@ namespace Tokenizer {
 
   const std::string Version();
   const std::string VersionName();
+  extern bool keep_quoted_spaces;
 
   enum TokenRole {
     NOROLE                      = 0,
@@ -99,8 +100,8 @@ namespace Tokenizer {
 	   const UnicodeString&,
 	   const std::string& = "" );
     std::string lang_code;                // ISO 639-3 language code
-    std::string texttostring();
-    std::string typetostring();
+    std::string texttostring() const;
+    std::string typetostring() const ;
   };
 
   class TokenizerClass{
@@ -273,6 +274,10 @@ namespace Tokenizer {
     bool setUndLang( bool b ){ bool r = und_language; und_language = b; return r; };
     bool getUndLang(){ return und_language; };
 
+    bool setKeepQuotedSpaces( bool b ){ bool r = keep_quoted_spaces;
+      keep_quoted_spaces = b; return r; };
+    bool getKeepQuotedSpaces() const { return keep_quoted_spaces; };
+
     const std::string& getInputClass( ) const { return inputclass; }
     const std::string setInputClass( const std::string& cls) {
       std::string res = inputclass;
diff --git a/src/my_textcat.cxx b/src/my_textcat.cxx
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2024
+  Copyright (c) 2026
   CLST - Radboud University
 
   This file is part of Ucto
diff --git a/src/setting.cxx b/src/setting.cxx
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2006 - 2024
+  Copyright (c) 2006 - 2026
   CLST - Radboud University
   ILK  - Tilburg University
 
diff --git a/src/tokenize.cxx b/src/tokenize.cxx
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2006 - 2024
+  Copyright (c) 2006 - 2026
   CLST - Radboud University
   ILK  - Tilburg University
 
@@ -76,6 +76,7 @@ namespace Tokenizer {
 
   using namespace icu;
   using TiCC::operator<<;
+  bool keep_quoted_spaces = false;
 
   const UChar32 ZWJ = u'\u200D';
 
@@ -154,21 +155,46 @@ namespace Tokenizer {
   const UnicodeString type_unknown = "UNKNOWN";
   const UnicodeString type_unanalyzed = "UNANALYZED";
 
+  UnicodeString filter_ZCARON( const UnicodeString& in ){
+    UnicodeString result;
+    for ( int i=0; i < in.length(); ++i ){
+      UChar32 c = in[i];
+      if ( c == U'Ž' ){
+	c = ' ';
+      }
+      result += c;
+    }
+    return result;
+  }
+
   Token::Token( const UnicodeString& _type,
 		const UnicodeString& _s,
-		TokenRole _role, const string& _lang_code ):
-    type(_type), us(_s), role(_role), lang_code(_lang_code) {
+		TokenRole _role,
+		const string& _lang_code ):
+    type(_type), role(_role), lang_code(_lang_code) {
+    if ( keep_quoted_spaces ){
+      us = filter_ZCARON( _s );
+    }
+    else {
+      us = _s;
+    }
   }
 
   Token::Token( const UnicodeString& _type,
 		const UnicodeString& _s,
 		const string& _lang_code ):
-    type(_type), us(_s), role(NOROLE), lang_code(_lang_code) {
+    type(_type), role(NOROLE), lang_code(_lang_code) {
+    if ( keep_quoted_spaces ){
+      us = filter_ZCARON( _s );
+    }
+    else {
+      us = _s;
+    }
   }
 
 
-  std::string Token::texttostring() { return TiCC::UnicodeToUTF8(us); }
-  std::string Token::typetostring() { return TiCC::UnicodeToUTF8(type); }
+  std::string Token::texttostring() const { return TiCC::UnicodeToUTF8(us); }
+  std::string Token::typetostring() const { return TiCC::UnicodeToUTF8(type); }
 
   ostream& operator<< (std::ostream& os, const Token& t ){
     os << t.type << " : " << t.role  << ": '" << t.us << "' (" << t.lang_code << ")";
@@ -2946,6 +2972,32 @@ namespace Tokenizer {
     }
   }
 
+  UnicodeString replace_quoted_spaces( const UnicodeString& in ){
+    UnicodeString result;
+    UChar32 quote = '\x0';
+    for ( int i=0; i < in.length(); ++i ){
+      UChar32 c = in[i];
+      //      cerr << "bekijk: " << UnicodeString( c ) << endl;
+      if ( c == '"' || c == '\'' ){
+	// found quote
+	//	cerr << "found quote!" << endl;
+	if ( c == quote ){
+	  // so a second one, reset
+	  quote = '\x0';
+	  //	  cerr << "reset quote!" << endl;
+	}
+	else {
+	  quote = c;
+	}
+      }
+      else if ( c == ' ' && quote != '\x0' ){
+	c = U'Ž'; // mark as  Ž
+      }
+      result += c;
+    }
+    return result;
+  }
+
   int TokenizerClass::internal_tokenize_line( const UnicodeString& originput,
 					      const string& _lang ){
     if ( originput.isBogus() ){ //only tokenize valid input
@@ -2971,6 +3023,9 @@ namespace Tokenizer {
 	  << originput << "] (language= " << lang << ")" << endl;
     }
     UnicodeString input = originput;
+    if ( keep_quoted_spaces ){
+      input = replace_quoted_spaces( input );
+    }
     if ( doFilter ){
       input = settings[lang]->filter.filter( input );
     }
@@ -3290,7 +3345,7 @@ namespace Tokenizer {
 	    else {
 	      if ( tokDebug >= 4 ){
 		DBG << "\trecurse, match changes the type:"
-				<< assigned_type << " to " << type << endl;
+		    << assigned_type << " to " << type << endl;
 	      }
 	      TokenRole role = (space ? NOROLE : NOSPACE);
 	      if ( paragraphsignal_next ){
diff --git a/src/ucto.cxx b/src/ucto.cxx
@@ -1,5 +1,5 @@
 /*
-  Copyright (c) 2006 - 2024
+  Copyright (c) 2006 - 2026
   CLST - Radboud University
   ILK  - Tilburg University
 
@@ -373,6 +373,11 @@ void runtime_opts::fill( TiCC::CL_Options& Opts ){
   pass_thru = Opts.extract( "passthru" );
   Opts.extract("normalize", norm_set_string );
   Opts.extract( "separators", separators );
+  keep_quoted_spaces = Opts.extract( "keep-spaces-inside-quotes" );
+  if ( keep_quoted_spaces && quotedetection ){
+    throw TiCC::OptionError( "ucto: combining '--keep-spaces-inside-quotes' "
+			     "conflicts with '-Q'" );
+  }
   if ( Opts.extract( 'x', docid ) ){
     throw TiCC::OptionError( "ucto: The option '-x ID' is removed. "
 			     "Please use '-X' and '--id=ID' instead" );
@@ -713,6 +718,7 @@ int main( int argc, char *argv[] ){
 			   "help,detectlanguages:,uselanguages:,"
 			   "textredundancy:,add-tokens:,split,"
 			   "allow-word-corrections,ignore-tag-hints,"
+			   "keep-spaces-inside-quotes,"
 			   "separators:");
     Opts.init(argc, argv );
     if ( Opts.extract( 'h' )
@@ -723,7 +729,7 @@ int main( int argc, char *argv[] ){
     if ( Opts.extract( 'V' ) ||
 	 Opts.extract( "version" ) ){
       cout << "Ucto - Unicode Tokenizer - version " << Version() << endl
-	   << "(c) CLST 2015 - 2024, Centre for Language and Speech Technology, Radboud University Nijmegen" << endl
+	   << "(c) CLST 2015 - 2026, Centre for Language and Speech Technology, Radboud University Nijmegen" << endl
 	   << "(c) ILK 2009 - 2015, Induction of Linguistic Knowledge Research Group, Tilburg University" << endl
 	   << "Licensed under the GNU General Public License v3" << endl;
       cout << "based on [" << folia::VersionName() << "]" << endl;