cisocrgroup · finkf · Nov 22, 2016
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -77,6 +77,7 @@ add_subdirectory(INIConfig)
 add_subdirectory(LevDEA)
 add_subdirectory(LevenshteinWeights)
 add_subdirectory(MinDic)
+add_subdirectory(MinDicString)
 add_subdirectory(Pattern)
 add_subdirectory(Profiler)
 add_subdirectory(SimpleEnrich)
@@ -86,6 +87,7 @@ add_subdirectory(Vaam)
 add_subdirectory(Val)
 add_subdirectory(markup)
 add_subdirectory(tools)
+add_subdirectory(tools/vaamFilter)
 
 ###### TO DO: Move this (and all related headers and sources) to a separate folder ##########
 SET( OCRCORRECTION_SOURCES

diff --git a/MinDicString/CMakeLists.txt b/MinDicString/CMakeLists.txt
@@ -0,0 +1 @@
+# empty
diff --git a/MinDicString/MinDicString.h b/MinDicString/MinDicString.h
@@ -0,0 +1,112 @@
+#ifndef FSDICT_MINDICSTRING_H
+#define FSDICT_MINDICSTRING_H FSDICT_MINDICSTRING_H
+
+#include "../Global.h"
+#include "../MinDic/MinDic.h"
+#include "../Hash/Hash.h"
+#include <sys/stat.h>
+
+namespace csl {
+
+    /**
+     * @brief This class provides the storage and lookup of huge dictionaries whose entries are annotated
+     * with a string-value of arbitrary length.
+     *
+     * Currently the keys of the dictionaries are represented as wide strings, while the annotations are
+     * stored as utf-8 sequences. This makes the data structure smaller, and in some contexts also faster.
+     * But, now that the whole rest of the library has been changed to wide strings, this is definitely
+     * a bit out-dated.
+     *
+     * All annotation strings are kept in one contiguous byte buffer. The integer annotation that the
+     * underlying MinDic stores for a key is used as a byte offset into that buffer.
+     *
+     * NOTE(review): the class declares neither a destructor nor copy control although it holds the raw
+     * pointers annStrings_ and annHash_ -- confirm who is expected to release them.
+     *
+     * @todo Change to consistent use of wstring
+     *
+     * @author Uli Reffle
+     * @date 2006-2010
+     */
+    class MinDicString : public MinDic<> {
+    public:
+	typedef MinDic<> MinDic_t;
+
+	/**
+	 * @brief Create an empty MinDicString object (no annotation buffer, default key-value delimiter)
+	 */
+	inline MinDicString();
+
+	/**
+	 * @brief Create a MinDicString object and load a dic from the specified file
+	 */
+	inline MinDicString( char const* dicFile );
+
+	/**
+	 * @brief set the byte that separates key and annotation in the lines read by compileDic()
+	 */
+	inline void setKeyValueDelimiter( uchar c );
+
+	/**
+	 * @brief return the string value annotated to @c key, or 0 if @c key is not in the dictionary
+	 */
+	inline uchar const* lookup( wchar_t* key ) const;
+
+	/**
+	 * @brief return the string value annotated to the entry with the given perfect hash value
+	 */
+	inline uchar const* getAnnByPerfHashValue( size_t perfHashValue ) const;
+
+	/**
+	 * @brief return the string value at the specified offset in the annotation buffer
+	 *
+	 * The offset is expected to be smaller than the size of the annotation buffer;
+	 * this is only checked with an assert.
+	 */
+	inline uchar const* getAnnByOffset( size_t offset ) const;
+
+
+	/**
+	 * @brief open @c dicFile in binary mode and load the dictionary from it
+	 *        (throws exceptions::badFileHandle if the file can not be opened)
+	 */
+	inline void loadFromFile( char const* dicFile );
+	/**
+	 * @brief read header, MinDic and annotation buffer (in this order) from @c fi
+	 */
+	inline void loadFromStream( FILE* fi );
+
+	/**
+	 * @brief open @c dicFile in binary mode and write the dictionary to it
+	 *        (throws exceptions::badFileHandle if the file can not be opened)
+	 */
+	inline void writeToFile( char const* dicFile ) const;
+	/**
+	 * @brief write header, MinDic and annotation buffer (in this order) to @c fo
+	 */
+	inline void writeToStream( FILE* fo ) const;
+
+	/// @brief start the construction of a new dictionary (forwards to MinDic)
+	inline void initConstruction();
+	/// @brief finish the construction: fix the size of the annotation buffer and fill the header
+	inline void finishConstruction();
+	/**
+	 * @brief build the dictionary from the text file @c lexFile, which holds one entry per line:
+	 *        key, key-value delimiter, annotation
+	 */
+	inline void compileDic( const char* lexFile );
+	/// @brief print all entries to stdout, one "key#annotation" line per entry
+	inline void printDic() const;
+
+	/// @brief print the statistics of MinDic plus the size of the annotation strings
+	inline void doAnalysis() const;
+
+    private:
+	/// recursive worker of printDic(): prints all entries reachable from state @c pos
+	inline void printDic_rec( int pos, int depth, size_t perfHashValue ) const;
+
+
+	/// stored in the file header; loadFromStream() rejects files carrying a different value
+	static const bits64 magicNumber_ = 2343572;
+
+	/**
+	 * @brief the record at the very beginning of a dictionary file
+	 *
+	 * Objects of this class are read and written as raw memory with fread()/fwrite(),
+	 * so the order and the types of the data members define the file format.
+	 */
+	class Header {
+	public:
+	    Header() :
+		magicNumber_( 0 ),
+		sizeOfAnnStrings_( 0 ) {
+	    }
+
+	    bits64 getMagicNumber() const {
+		return magicNumber_;
+	    }
+	    size_t getSizeOfAnnStrings() const {
+		return sizeOfAnnStrings_;
+	    }
+
+	    /// copy magic number and annotation buffer size from @c mds
+	    void set( const MinDicString& mds ) {
+		magicNumber_ = mds.magicNumber_;
+		sizeOfAnnStrings_ = mds.sizeOfAnnStrings_;
+	    }
+	private:
+	    bits64 magicNumber_;
+	    bits64 sizeOfAnnStrings_;
+	}; // class Header
+
+	Header header_;
+
+	/// buffer holding all annotation strings; annotations are addressed by their offset in here
+	uchar* annStrings_;
+	/// size of annStrings_ in bytes
+	size_t sizeOfAnnStrings_;
+
+	/// only alive during construction: created in compileDic(), deleted in finishConstruction()
+	Hash< uchar >* annHash_;
+
+	/// separates key and annotation in the input of compileDic()
+	uchar keyValueDelimiter_;
+
+	mutable size_t count_; // is used for counting during printing
+    }; // class MinDicString
+
+
+} // namespace csl
+
+#include "./MinDicString.tcc"
+
+#endif
diff --git a/MinDicString/MinDicString.tcc b/MinDicString/MinDicString.tcc
@@ -0,0 +1,213 @@
+#ifndef FSDICT_MINDICSTRING_TCC
+#define FSDICT_MINDICSTRING_TCC FSDICT_MINDICSTRING_TCC
+
+namespace csl {
+
+    /**
+     * @brief Create an empty MinDicString: no annotation buffer, no construction hash,
+     * and the key-value delimiter taken from Global::keyValueDelimiter.
+     */
+    inline MinDicString::MinDicString() :
+	annStrings_( 0 ),
+	sizeOfAnnStrings_( 0 ),
+	annHash_( 0 ),
+	keyValueDelimiter_( Global::keyValueDelimiter ),
+	count_( 0 ) // used to be left uninitialized until the first printDic()
+    {
+    }
+
+    /**
+     * @brief Create a MinDicString and immediately load the binary dictionary @c dicFile.
+     *
+     * All members are brought into the same state as by the default constructor before
+     * loadFromFile() is called; whatever loadFromFile() throws leaves the constructor.
+     */
+    inline MinDicString::MinDicString( char const* dicFile ) :
+	annStrings_( 0 ),
+	sizeOfAnnStrings_( 0 ),
+	annHash_( 0 ),
+	keyValueDelimiter_( Global::keyValueDelimiter ),
+	count_( 0 ) // used to be left uninitialized until the first printDic()
+    {
+	loadFromFile( dicFile );
+    }
+
+    /**
+     * @brief Remember @c c as the byte at which compileDic() splits an input line
+     * into key and annotation.
+     */
+    inline void MinDicString::setKeyValueDelimiter( uchar c ) {
+	this->keyValueDelimiter_ = c;
+    }
+
+    /**
+     * @brief Look up @c key in the underlying MinDic.
+     * @return pointer to the annotation string of @c key inside the annotation buffer,
+     *         or 0 if the MinDic lookup fails
+     */
+    inline const uchar* MinDicString::lookup( wchar_t* key ) const {
+	// the int annotation stored by MinDic is the offset of the string annotation
+	int annotationOffset = 0;
+	const bool isKnown = MinDic_t::lookup( key, &annotationOffset );
+	return isKnown ? ( annStrings_ + annotationOffset ) : 0;
+    }
+
+    /**
+     * @brief Return the annotation string of the entry with the given perfect hash value.
+     *
+     * The annotation that MinDic stores for @c perfHashValue is passed on to
+     * getAnnByOffset() as offset into the annotation buffer.
+     */
+    inline const uchar* MinDicString::getAnnByPerfHashValue( size_t perfHashValue ) const {
+	const size_t offset = getAnnotation( perfHashValue );
+	return getAnnByOffset( offset );
+    }
+
+    /**
+     * @brief Return the string that starts at byte @c offset of the annotation buffer.
+     *
+     * @c offset has to be smaller than the size of the annotation buffer; this is
+     * checked with an assert only.
+     */
+    inline const uchar* MinDicString::getAnnByOffset( size_t offset ) const {
+	assert( offset < sizeOfAnnStrings_ );
+	return &annStrings_[offset];
+    }
+
+
+
+    /**
+     * @brief Open @c dicFile in binary mode and load the dictionary from it.
+     *
+     * @param dicFile path of a file written by writeToFile()/writeToStream()
+     * @throws exceptions::badFileHandle if the file can not be opened; everything
+     *         thrown by loadFromStream() is passed on after the file was closed
+     */
+    inline void MinDicString::loadFromFile( char const* dicFile ) {
+	FILE* fi = fopen( dicFile, "rb" );
+	if ( !fi ) {
+	    throw exceptions::badFileHandle( "fsdict::MinDicString: Couldn't open file '" +
+					     std::string( dicFile ) +
+					     "' for reading." );
+	}
+	try {
+	    loadFromStream( fi );
+	}
+	catch( ... ) {
+	    fclose( fi ); // do not leak the handle if loading fails
+	    throw;
+	}
+	fclose( fi );
+    }
+
+    /**
+     * @brief Read header, MinDic and annotation buffer (in this order) from @c fi.
+     *
+     * The annotations are read into a fresh buffer that replaces the current one only
+     * after it was read completely; the buffer held before is released then.
+     *
+     * @throws exceptions::badInput if header or annotations can not be read completely,
+     *         or if the annotation buffer can not be allocated
+     * @throws exceptions::badDictFile if the magic number of the header does not match
+     */
+    inline void MinDicString::loadFromStream( FILE* fi ) {
+	if( fread( &header_, sizeof( Header ), 1, fi ) != 1 ) {
+	    throw exceptions::badInput( "fsdict::MinDicString::loadFromStream: could not read Header" );
+	}
+
+	if ( header_.getMagicNumber() != magicNumber_ ) {
+	    throw exceptions::badDictFile( "MinDicString: Magic number comparison failed.\n" );
+	}
+
+	const size_t newSize = header_.getSizeOfAnnStrings();
+	MinDic_t::loadFromStream( fi );
+
+	uchar* newAnnStrings = (uchar*) malloc( newSize * sizeof( uchar ) );
+	// malloc( 0 ) may legally return 0, so a null pointer is an error for non-empty buffers only
+	if( !newAnnStrings && newSize > 0 ) {
+	    throw exceptions::badInput( "MinDicString::loadFromStream: could not allocate memory for annotations" );
+	}
+	if( newSize > 0 && fread( newAnnStrings, sizeof( uchar ), newSize, fi ) != newSize ) {
+	    free( newAnnStrings );
+	    throw exceptions::badInput( "fsdict::MinDicString::loadFromStream: could not read annotations" );
+	}
+
+	// only now give up the buffer of a dictionary that was loaded or compiled before
+	free( annStrings_ );
+	annStrings_ = newAnnStrings;
+	sizeOfAnnStrings_ = newSize;
+    }
+
+    /**
+     * @brief Open @c dicFile in binary mode and write the dictionary to it.
+     *
+     * @param dicFile path of the file to create or overwrite
+     * @throws exceptions::badFileHandle if the file can not be opened, or if closing it
+     *         reports an error (buffered data could not be written); everything thrown
+     *         by writeToStream() is passed on after the file was closed
+     */
+    inline void MinDicString::writeToFile( char const* dicFile ) const {
+	FILE* fo = fopen( dicFile, "wb" );
+	if ( !fo ) {
+	    throw exceptions::badFileHandle( "MinDicString: Couldn't open file '" +
+					     std::string( dicFile ) +
+					     "' for writing." );
+	}
+	try {
+	    writeToStream( fo );
+	}
+	catch( ... ) {
+	    fclose( fo ); // do not leak the handle if writing fails
+	    throw;
+	}
+	// errors of the final flush only show up in the result of fclose()
+	if( fclose( fo ) != 0 ) {
+	    throw exceptions::badFileHandle( "MinDicString: Couldn't finish writing file '" +
+					     std::string( dicFile ) +
+					     "'." );
+	}
+    }
+
+    /**
+     * @brief Write header, MinDic and annotation buffer (in this order) to @c fo.
+     *
+     * This is the layout that loadFromStream() reads.
+     *
+     * @throws exceptions::badFileHandle if header or annotations can not be written completely
+     */
+    inline void MinDicString::writeToStream( FILE* fo ) const {
+	if( fwrite( &header_, sizeof( Header ), 1, fo ) != 1 ) {
+	    throw exceptions::badFileHandle( std::string( "MinDicString::writeToStream: could not write Header" ) );
+	}
+
+	MinDic_t::writeToStream( fo );
+
+	// nothing to write (and possibly a null buffer) for an empty annotation buffer
+	if( sizeOfAnnStrings_ > 0 &&
+	    fwrite( annStrings_, sizeof( uchar ), sizeOfAnnStrings_, fo ) != sizeOfAnnStrings_ ) {
+	    throw exceptions::badFileHandle( std::string( "MinDicString::writeToStream: could not write annotations" ) );
+	}
+    }
+
+
+    /**
+     * @brief Start the construction of a new dictionary.
+     *
+     * Nothing but the construction of the underlying MinDic is started here.
+     */
+    inline void MinDicString::initConstruction() {
+	this->MinDic_t::initConstruction();
+    }
+
+    /**
+     * @brief Finish the construction of the dictionary.
+     *
+     * Finishes the underlying MinDic, takes the final size of the annotation buffer from
+     * the construction hash, shrinks the buffer to that size, disposes of the hash and
+     * updates the file header.
+     *
+     * If no construction hash exists (compileDic() was not used), the current size of the
+     * annotation buffer is kept.
+     */
+    inline void MinDicString::finishConstruction() {
+	MinDic_t::finishConstruction();
+
+	if( annHash_ ) {
+	    sizeOfAnnStrings_ = annHash_->getLengthOfKeyStrings();
+	}
+
+	if( sizeOfAnnStrings_ == 0 ) {
+	    // what realloc( p, 0 ) does differs between implementations, so release explicitly
+	    free( annStrings_ );
+	    annStrings_ = 0;
+	}
+	else {
+	    uchar* shrunk = (uchar*)realloc( annStrings_, sizeOfAnnStrings_ );
+	    // if realloc() fails the old, larger buffer is still valid: keep it instead of losing it
+	    if( shrunk ) annStrings_ = shrunk;
+	}
+
+	delete annHash_;
+	annHash_ = 0; // do not keep a dangling pointer around
+
+	header_.set( *this );
+    }
+
+    /**
+     * @brief Build the dictionary from the text file @c lexFile.
+     *
+     * Each line of the file has to consist of a key, the key-value delimiter and the
+     * annotation string. The key is converted to a wide string with mbstowcs(), so it is
+     * interpreted according to the current locale; the annotation is stored as it is.
+     * Every line is handed to MinDic's addToken() in the order of the file.
+     *
+     * @param lexFile path of the input file
+     * @throws exceptions::badFileHandle if @c lexFile can not be opened
+     * @throws exceptions::badInput if a line has no delimiter, an empty key, a key that
+     *         can not be converted, or is too long for a buffer of Global::lengthOfLongStr
+     *         bytes, or if reading the file fails
+     */
+    inline void MinDicString::compileDic( const char* lexFile ) {
+	initConstruction();
+
+	std::ifstream fileHandle( lexFile );
+	if( !fileHandle.good() ) {
+	    throw exceptions::badFileHandle( "Couldn't open file '" +
+					     std::string( lexFile ) +
+					     "' for reading." );
+	}
+
+	// Guess the number of keys from the file size (one key per 100 bytes), but use at
+	// least 1000. If stat() fails there is no usable file size and the minimum is taken.
+	size_t estimatedNrOfKeys = 1000;
+	struct stat f_stat;
+	if( stat( lexFile, &f_stat ) == 0 ) {
+	    const size_t guess = static_cast< size_t >( f_stat.st_size ) / 100;
+	    if( guess > estimatedNrOfKeys ) estimatedNrOfKeys = guess;
+	}
+
+	std::wcerr<<"Estimate about "<< estimatedNrOfKeys << " Keys."<< std::endl;
+
+	annHash_ = new Hash< uchar >( estimatedNrOfKeys, annStrings_, sizeOfAnnStrings_ );
+
+	try {
+	    uchar bytesIn[Global::lengthOfLongStr];
+	    wchar_t key[Global::lengthOfLongStr];
+
+	    while( fileHandle.getline(( char* ) bytesIn, Global::lengthOfLongStr ) )  {
+		/////////////////// PARSE THE INPUT STRING
+		uchar* delimiter = ( uchar* )strchr( ( char* )bytesIn, keyValueDelimiter_ );
+		if( !delimiter ) {
+		    throw exceptions::badInput( "fsdict::MinDicString::compileDic: No string annotation given." );
+		}
+		// cut the line in two: the key ends here, the annotation starts behind the delimiter
+		*delimiter = 0;
+		uchar* annotationStr = delimiter + 1;
+
+		if( mbstowcs( key, (const char*)bytesIn, Global::lengthOfLongStr ) == (size_t)-1 ) {
+		    throw exceptions::badInput( "fsdict::MinDicString::compileDic: Invalid utf-8 sequence" );
+		}
+
+		if( ! *key ) { // empty key
+		    throw exceptions::badInput( "fsdict::MinDicString::compileDic: wrong input format" );
+		}
+
+		size_t offset = annHash_->findOrInsert( annotationStr );
+
+		MinDic_t::addToken( key, offset );
+	    }
+
+	    // getline() ends the loop for three reasons: end of file, a read error, or a line
+	    // that does not fit into the buffer (it sets the failbit then). Only the first
+	    // one means that the whole file was processed.
+	    if( fileHandle.bad() ) {
+		throw exceptions::badInput( "MinDicString::compileDic: Error while reading the input file" );
+	    }
+	    if( !fileHandle.eof() ) {
+		throw exceptions::badInput( "fsdict::MinDicString::compileDic: Maximum length of input line violated (set by Global::lengthOfLongStr)" );
+	    }
+	}
+	catch( ... ) {
+	    // finishConstruction() will not be reached, so dispose of the hash here
+	    delete annHash_;
+	    annHash_ = 0;
+	    throw;
+	}
+	fileHandle.close();
+
+	finishConstruction();
+    }
+
+    /**
+     * @brief Print all entries of the dictionary.
+     *
+     * Resets the entry counter and starts the recursive traversal at the root state
+     * with depth 0 and perfect hash value 0.
+     */
+    void MinDicString::printDic() const {
+	this->count_ = 0;
+	this->printDic_rec( this->getRoot(), 0, 0 );
+    }
+
+    /**
+     * @brief Print all entries that can be reached from state @c pos.
+     *
+     * Every entry is written to stdout as "key#annotation"; after every 100000 entries
+     * the number of entries printed so far is written to stderr.
+     *
+     * @param pos state to continue from
+     * @param depth number of characters on the path that led to @c pos
+     * @param perfHashValue perfect hash value accumulated on the path that led to @c pos
+     * @throws exceptions::badDictFile if a transition listed in the suso-string can not be
+     *         walked, or if a path does not fit into a buffer of Global::lengthOfStr characters
+     */
+    void MinDicString::printDic_rec( int pos, int depth, size_t perfHashValue ) const {
+	// One buffer shared by all levels of the recursion: level 'depth' writes w[depth]
+	// (and the terminator behind it) only, so the prefix of the callers stays intact.
+	static wchar_t w[Global::lengthOfStr];
+
+	for( const wchar_t* transitions = getSusoString( pos ); *transitions; ++transitions ) {
+	    size_t newPerfHashValue = perfHashValue;
+	    const int newPos = walkPerfHash( pos, *transitions, &newPerfHashValue );
+	    if( !newPos ) {
+		throw exceptions::badDictFile( "suso-string seems to be corrupted." );
+	    }
+
+	    // w[depth] as well as the terminator at w[depth+1] have to be inside the buffer
+	    if( static_cast< size_t >( depth ) + 1 >= static_cast< size_t >( Global::lengthOfStr ) ) {
+		throw exceptions::badDictFile( "MinDicString::printDic: entry is longer than Global::lengthOfStr." );
+	    }
+	    w[depth] = *transitions;
+
+	    if( isFinal( newPos ) ) {
+		w[depth+1] = 0;
+		wprintf( L"%ls#%s\n", w, annStrings_ + getAnnotation( newPerfHashValue ) );
+
+		if( ( ++count_ % 100000 ) == 0 ) fprintf( stderr, "%d\n", (int)count_ );
+	    } // if isFinal
+
+	    printDic_rec( newPos, depth + 1, newPerfHashValue );
+	} // for all transitions
+    } // end of method
+
+    /**
+     * @brief Print the analysis of the underlying MinDic, followed by the size of the
+     * annotation strings (as recorded in the header) in MB.
+     */
+    void MinDicString::doAnalysis() const {
+	MinDic_t::doAnalysis();
+
+	const double annStringsInMB = (double)header_.getSizeOfAnnStrings() / 1048576;
+	printf( "**********\nMinDicString Analysis\n**********\nannotation strings: %.3f MB\n\n",
+		annStringsInMB );
+    }
+
+}
+
+#endif
diff --git a/MinDicString/extractMDS.cxx b/MinDicString/extractMDS.cxx
@@ -0,0 +1,28 @@
+#include <clocale>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include "../Alphabet/Alphabet.h"
+#include "./MinDicString.h"
+
+// MinDicString.h declares its contents in namespace csl
+using namespace csl;
+
+/**
+ * @brief Load a binary MinDicString dictionary and print it.
+ *
+ * Usage: extractMDS <binDic> [DOT]
+ * With "DOT" as second argument toDot() is called, otherwise all entries are
+ * printed with printDic().
+ *
+ * @return 0 on success, 1 on wrong usage or if an exception was caught
+ */
+int main( int argc, char** argv ) {
+    /* sets the default encoding for the program */
+    if( !setlocale(LC_CTYPE, "de_DE.UTF-8") ) {
+	std::cerr << "Warning: could not set locale de_DE.UTF-8" << std::endl;
+    }
+
+    if( argc < 2 ) {
+	std::cerr<<"Use like: "<<argv[0]<<" <binDic> [DOT]"<<std::endl;
+	return 1;
+    }
+    try {
+	MinDicString t;
+	t.loadFromFile( argv[1] );
+	if( argc == 3 && ! strcmp( argv[2], "DOT" ) ) {
+	    t.toDot();
+	}
+	else t.printDic();
+
+	return 0;
+    } catch ( std::exception const& ex ) {
+	// NOTE(review): assumes that the exceptions of the library derive from
+	// std::exception (they are used through what() only) -- confirm in Global.h
+	std::cerr << "Dictionary extraction failed: " << ex.what() << std::endl;
+	return 1;
+    }
+}