Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ add_subdirectory(INIConfig)
add_subdirectory(LevDEA)
add_subdirectory(LevenshteinWeights)
add_subdirectory(MinDic)
add_subdirectory(MinDicString)
add_subdirectory(Pattern)
add_subdirectory(Profiler)
add_subdirectory(SimpleEnrich)
Expand All @@ -86,6 +87,7 @@ add_subdirectory(Vaam)
add_subdirectory(Val)
add_subdirectory(markup)
add_subdirectory(tools)
add_subdirectory(tools/vaamFilter)

###### TO DO: Move this (and all related headers and sources) to a separate folder ##########
SET( OCRCORRECTION_SOURCES
Expand Down
1 change: 1 addition & 0 deletions MinDicString/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# empty
112 changes: 112 additions & 0 deletions MinDicString/MinDicString.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#ifndef FSDICT_MINDICSTRING_H
#define FSDICT_MINDICSTRING_H FSDICT_MINDICSTRING_H

#include "../Global.h"
#include "../MinDic/MinDic.h"
#include "../Hash/Hash.h"
#include <sys/stat.h>

namespace csl {

/**
* @brief This class provides the storage and lookup of huge dictionaries whose entries are annotated
* with a string-value of arbitrary length.
*
* Currently the keys of the dictionaries are represented as wide strings, while the annotations are
* stored as utf-8 sequences. This makes the data structure smaller, and in some contexts also faster.
* But, now that the whole rest of the library has been changed to wide strings, this is definitely
* a bit out-dated.
*
* The binary file format produced by writeToStream() is: the Header record, followed by the data
* written by MinDic<>::writeToStream(), followed by the raw annotation buffer.
*
* @todo Change to consistent use of wstring
*
* @author Uli Reffle
* @date 2006-2010
*/
class MinDicString : public MinDic<> {
public:
/// Shorthand for the base class.
typedef MinDic<> MinDic_t;

/**
* @brief Create an empty MinDicString: no annotation buffer, no annotation hash, and the
* key/value delimiter set to Global::keyValueDelimiter.
*/
inline MinDicString();

/**
* @brief Create a MinDicString object and load a dic from the specified file
*
* @param dicFile path of a binary dictionary, passed on to loadFromFile()
*/
inline MinDicString( char const* dicFile );

/**
* @brief Set the character that separates key and annotation in the lines read by compileDic()
*/
inline void setKeyValueDelimiter( uchar c );

/**
* @brief Look up @c key in the dictionary
*
* @return a pointer into the annotation buffer at the offset stored for @c key,
*         or 0 if the lookup in the underlying MinDic fails
*/
inline uchar const* lookup( wchar_t* key ) const;

/**
* @brief return the string value annotated to the entry with the given perfect hash value
*/
inline uchar const* getAnnByPerfHashValue( size_t perfHashValue ) const;

/**
* @brief return the string value at the specified offset in the annotation buffer
*
* The offset is checked against the buffer size with assert() only.
*/
inline uchar const* getAnnByOffset( size_t offset ) const;


/// Open @c dicFile for binary reading and pass it to loadFromStream(); throws exceptions::badFileHandle if it cannot be opened.
inline void loadFromFile( char const* dicFile );
/// Read Header, MinDic data and annotation buffer from @c fi; throws exceptions::badInput / exceptions::badDictFile on failure.
inline void loadFromStream( FILE* fi );

/// Open @c dicFile for binary writing and pass it to writeToStream(); throws exceptions::badFileHandle if it cannot be opened.
inline void writeToFile( char const* dicFile ) const;
/// Write Header, MinDic data and annotation buffer to @c fo (in this order).
inline void writeToStream( FILE* fo ) const;

/// Forwards to MinDic<>::initConstruction().
inline void initConstruction();
/// Finish the base-class construction, shrink the annotation buffer, delete the annotation hash and update the header.
inline void finishConstruction();
/// Build the dictionary from a text file with one "key<delimiter>annotation" entry per line.
inline void compileDic( const char* lexFile );
/// Print all entries to stdout, one "key#annotation" line per entry.
inline void printDic() const;

/// Run MinDic<>::doAnalysis() and additionally print the size of the annotation strings in MB.
inline void doAnalysis() const;

private:
/// Recursive worker of printDic(): walks all transitions leaving state @c pos.
inline void printDic_rec( int pos, int depth, size_t perfHashValue ) const;


/// Value stored in the file header; loadFromStream() rejects files whose header holds a different value.
static const bits64 magicNumber_ = 2343572;

/**
* @brief Record stored at the very beginning of the binary file.
*
* It is read and written as raw bytes (fread/fwrite of sizeof( Header )), so the order and
* the types of its data members define the on-disk layout.
*/
class Header {
public:
Header() :
magicNumber_( 0 ),
sizeOfAnnStrings_( 0 ) {
}

bits64 getMagicNumber() const {
return magicNumber_;
}
// NOTE(review): converts the bits64 field to size_t - presumably lossless on 64-bit targets; confirm for others.
size_t getSizeOfAnnStrings() const {
return sizeOfAnnStrings_;
}

/// Copy magic number and annotation buffer size from @c mds.
void set( const MinDicString& mds ) {
magicNumber_ = mds.magicNumber_;
sizeOfAnnStrings_ = mds.sizeOfAnnStrings_;
}
private:
bits64 magicNumber_;
bits64 sizeOfAnnStrings_;
}; // class Header

/// Header as read by loadFromStream() or as filled in by finishConstruction().
Header header_;

/// Buffer holding the annotation strings; it is allocated with malloc()/realloc().
// NOTE(review): no destructor is declared in this class, so it is not visible here where the buffer is released.
uchar* annStrings_;
/// Size of annStrings_ (used as a byte count in malloc/realloc/fread/fwrite).
size_t sizeOfAnnStrings_;

/// Hash over the annotation strings; created in compileDic() and deleted in finishConstruction().
Hash< uchar >* annHash_;

/// Separator between key and annotation in the input of compileDic().
uchar keyValueDelimiter_;

mutable size_t count_; // is used for counting during printing
}; // class MinDicString


} // namespace csl

#include "./MinDicString.tcc"

#endif
213 changes: 213 additions & 0 deletions MinDicString/MinDicString.tcc
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#ifndef FSDICT_MINDICSTRING_TCC
#define FSDICT_MINDICSTRING_TCC FSDICT_MINDICSTRING_TCC

namespace csl {

/**
 * @brief Create an empty MinDicString.
 *
 * No annotation buffer and no annotation hash exist yet; the key/value
 * delimiter defaults to Global::keyValueDelimiter. The print counter is
 * zero-initialized as well, so that no data member is left indeterminate.
 */
inline MinDicString::MinDicString() :
    annStrings_( 0 ),
    sizeOfAnnStrings_( 0 ),
    annHash_( 0 ),
    keyValueDelimiter_( Global::keyValueDelimiter ),
    count_( 0 )
{
}

/**
 * @brief Create a MinDicString and load a binary dictionary from @c dicFile.
 *
 * All data members (including the print counter) are initialized before the
 * file is read.
 *
 * @param dicFile path of the binary dictionary file
 * @throws whatever loadFromFile() throws if the file cannot be opened or read
 */
inline MinDicString::MinDicString( char const* dicFile ) :
    annStrings_( 0 ),
    sizeOfAnnStrings_( 0 ),
    annHash_( 0 ),
    keyValueDelimiter_( Global::keyValueDelimiter ),
    count_( 0 )
{
    loadFromFile( dicFile );
}

/**
 * @brief Choose the character that separates key and annotation in the
 *        input lines parsed by compileDic().
 *
 * @param c the new delimiter
 */
inline void MinDicString::setKeyValueDelimiter( uchar c ) {
    this->keyValueDelimiter_ = c;
}

/**
 * @brief Look up @c key and return its annotation string.
 *
 * @param key the word to search for
 * @return pointer into the annotation buffer at the offset stored for
 *         @c key, or 0 if the underlying MinDic does not find the key
 */
inline const uchar* MinDicString::lookup( wchar_t* key ) const {
    int annotationOffset = 0;

    // The base class reports the stored annotation (an offset into annStrings_) via the out-parameter.
    if( ! MinDic_t::lookup( key, &annotationOffset ) ) {
        return 0;
    }
    return annStrings_ + annotationOffset;
}

/**
 * @brief Return the annotation string of the entry with the given perfect hash value.
 *
 * The annotation stored for the entry is interpreted as an offset into the
 * annotation buffer and resolved by getAnnByOffset().
 */
inline const uchar* MinDicString::getAnnByPerfHashValue( size_t perfHashValue ) const {
    const size_t annotationOffset = getAnnotation( perfHashValue );
    return getAnnByOffset( annotationOffset );
}

/**
 * @brief Return the annotation string starting at @c offset in the annotation buffer.
 *
 * The offset is only validated by an assert(), i.e. not at all in NDEBUG builds.
 */
inline const uchar* MinDicString::getAnnByOffset( size_t offset ) const {
    assert( offset < sizeOfAnnStrings_ );
    return &annStrings_[offset];
}



/**
 * @brief Load a binary dictionary from the file @c dicFile.
 *
 * @param dicFile path of the binary dictionary file
 * @throws exceptions::badFileHandle if the file cannot be opened for reading
 * @throws whatever loadFromStream() throws; the file is closed before the
 *         exception leaves this method
 */
inline void MinDicString::loadFromFile( char const* dicFile ) {
    FILE* fi = fopen( dicFile, "rb" );
    if ( !fi ) {
        throw exceptions::badFileHandle( "fsdict::MinDicString: Couldn't open file '" +
                                         std::string( dicFile ) +
                                         "' for reading." );
    }

    // Do not leak the file handle if reading fails halfway through.
    try {
        loadFromStream( fi );
    }
    catch( ... ) {
        fclose( fi );
        throw;
    }
    fclose( fi );
}

inline void MinDicString::loadFromStream( FILE* fi ) {
size_t elementsRead = fread( &header_, sizeof( Header ), 1, fi );
if( elementsRead != 1 ) {
throw exceptions::badInput( "fsdict::MinDicString::loadFromStream: could not read Header" );
}


if ( ( header_.getMagicNumber() != magicNumber_ ) ) {
throw exceptions::badDictFile( "MinDicString: Magic number comparison failed.\n" );
}

sizeOfAnnStrings_ = header_.getSizeOfAnnStrings();
MinDic_t::loadFromStream( fi );
annStrings_ = (uchar*) malloc( sizeOfAnnStrings_ * sizeof( uchar ) );
elementsRead = fread( annStrings_, sizeof( uchar ), sizeOfAnnStrings_, fi );
if( elementsRead != sizeOfAnnStrings_ ) {
throw exceptions::badInput( "fsdict::MinDicString::loadFromStream: could not read annotations" );
}

}

/**
 * @brief Write the dictionary in binary form to the file @c dicFile.
 *
 * @param dicFile path of the file to create or overwrite
 * @throws exceptions::badFileHandle if the file cannot be opened for writing
 *         or cannot be closed properly (e.g. because flushing the buffered
 *         data failed)
 * @throws whatever writeToStream() throws; the file is closed before the
 *         exception leaves this method
 */
inline void MinDicString::writeToFile( char const* dicFile ) const {
    FILE* fo = fopen( dicFile, "wb" );
    if ( !fo ) {
        throw exceptions::badFileHandle( "MinDicString: Couldn't open file '" +
                                         std::string( dicFile ) +
                                         "' for writing." );
    }

    // Do not leak the file handle if writing fails halfway through.
    try {
        writeToStream( fo );
    }
    catch( ... ) {
        fclose( fo );
        throw;
    }

    // fclose() flushes the remaining buffered data; a failure here means the file is incomplete.
    if( fclose( fo ) != 0 ) {
        throw exceptions::badFileHandle( "MinDicString: Couldn't close file '" +
                                         std::string( dicFile ) +
                                         "' after writing." );
    }
}

inline void MinDicString::writeToStream( FILE* fo ) const {
fwrite( &header_, sizeof( Header ), 1, fo );
MinDic_t::writeToStream( fo );
fwrite( annStrings_, sizeof( uchar ), sizeOfAnnStrings_, fo );
}


inline void MinDicString::initConstruction() {
MinDic_t::initConstruction();

}

inline void MinDicString::finishConstruction() {
MinDic_t::finishConstruction();
sizeOfAnnStrings_ = annHash_->getLengthOfKeyStrings();
annStrings_ = (uchar*)realloc( annStrings_, sizeOfAnnStrings_ );
delete( annHash_ );
header_.set( *this );
}

inline void MinDicString::compileDic( const char* lexFile ) {
initConstruction();

std::ifstream fileHandle( lexFile );
if( !fileHandle.good() ) {
throw exceptions::badFileHandle( "Couldn't open file '" +
std::string( lexFile ) +
"' for reading." );
}


struct stat f_stat;
stat( lexFile, &f_stat );
size_t estimatedNrOfKeys = f_stat.st_size / 100;
if( estimatedNrOfKeys < 1000 ) estimatedNrOfKeys = 1000; // set a minimum of 1000

std::wcerr<<"Estimate about "<< estimatedNrOfKeys << " Keys."<< std::endl;

annHash_ = new Hash< uchar >( estimatedNrOfKeys, annStrings_, sizeOfAnnStrings_ );


uchar bytesIn[Global::lengthOfLongStr];
// set the last byte to 0. So we can recognize when an overlong string was read by getline().
bytesIn[Global::lengthOfLongStr - 1] = 0;

wchar_t key[Global::lengthOfLongStr];
uchar* annotationStr = 0;

while( fileHandle.getline(( char* ) bytesIn, Global::lengthOfLongStr ) ) {
if ( bytesIn[Global::lengthOfLongStr-1] != 0 ) {
throw exceptions::badInput( "fsdict::MinDicString::compileDic: Maximum length of input line violated (set by Global::lengthOfLongStr)" );
}

/////////////////// PARSE THE INPUT STRING
uchar *c;
c = ( uchar* )strchr( ( char* )bytesIn, keyValueDelimiter_ );

if( c ) {
*c = 0;
annotationStr = ( c + 1 );
}
else throw exceptions::badInput( "fsdict::MinDicString::compileDic: No string annotation given." );

if( mbstowcs( key, (const char*)bytesIn, Global::lengthOfLongStr ) == (size_t)-1 ) {
throw exceptions::badInput( "fsdict::MinDicString::compileDic: Invalid utf-8 sequence" );
}

if( ! ( c && *key ) ) {
throw exceptions::badInput( "fsdict::MinDicString::compileDic: wrong input format" );
}

size_t offset = annHash_->findOrInsert( annotationStr );

MinDic_t::addToken( key, offset );

}
fileHandle.close();

finishConstruction();
}

/**
 * @brief Print all entries of the dictionary to stdout.
 *
 * Resets the print counter and starts the recursive traversal at the root
 * state. Marked inline like all other definitions in this file, which is
 * included from the header.
 */
inline void MinDicString::printDic() const {
    count_ = 0;
    printDic_rec( getRoot(), 0, 0 );
}

/**
 * @brief Recursive worker of printDic().
 *
 * Follows every transition listed in the suso-string of state @c pos. For
 * each final state reached, the word collected so far and its annotation are
 * printed to stdout as "word#annotation"; every 100000 entries the running
 * count is reported on stderr.
 *
 * The word is collected in a function-local static buffer that is shared by
 * all levels of the recursion.
 *
 * @param pos            state whose outgoing transitions are followed
 * @param depth          number of characters already collected on the path to @c pos
 * @param perfHashValue  perfect hash value accumulated on the path to @c pos
 * @throws exceptions::badDictFile if a transition listed in the suso-string
 *         cannot be walked, or if a word does not fit into Global::lengthOfStr
 */
inline void MinDicString::printDic_rec( int pos, int depth, size_t perfHashValue ) const {
    static wchar_t w[Global::lengthOfStr];

    for( const wchar_t* transitions = getSusoString( pos ); *transitions; ++transitions ) {
        size_t newPerfHashValue = perfHashValue;
        const int newPos = walkPerfHash( pos, *transitions, &newPerfHashValue );
        if( ! newPos ) {
            throw exceptions::badDictFile( "suso-string seems to be corrupted." );
        }

        // w[depth] and w[depth+1] are written below: make sure both lie inside the buffer.
        if( depth + 1 >= static_cast< int >( Global::lengthOfStr ) ) {
            throw exceptions::badDictFile( "MinDicString::printDic_rec: entry is longer than Global::lengthOfStr." );
        }
        w[depth] = *transitions;

        if( isFinal( newPos ) ) {
            w[depth+1] = 0;
            wprintf( L"%ls#%s\n", w, annStrings_ + getAnnotation( newPerfHashValue ) );

            if( ( ++count_ % 100000 ) == 0 ) fprintf( stderr, "%d\n", (int)count_ );
        }
        printDic_rec( newPos, depth + 1, newPerfHashValue );
    }
}

/**
 * @brief Print statistics about the dictionary to stdout.
 *
 * Runs the analysis of the underlying MinDic and then reports the size of
 * the annotation strings (as recorded in the Header) in MB. Marked inline
 * like all other definitions in this file, which is included from the header.
 */
inline void MinDicString::doAnalysis() const {
    MinDic_t::doAnalysis();
    printf( "**********\nMinDicString Analysis\n**********\nannotation strings: %.3f MB\n\n",
            (double)header_.getSizeOfAnnStrings() / 1048576
        );
}

}

#endif
28 changes: 28 additions & 0 deletions MinDicString/extractMDS.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#include <clocale>
#include <cstring>
#include <exception>
#include <iostream>

#include "../Alphabet/Alphabet.h"
#include "./MinDicString.h"

using namespace fsdict;

int main( int argc, char** argv ) {
setlocale(LC_CTYPE, "de_DE.UTF-8"); /*Setzt das Default Encoding für das Programm */

if( argc < 2 ) {
std::cerr<<"Use like: "<<argv[0]<<" <binDic> [DOT]"<<std::endl;
exit(1);
}
try {
MinDicString t;
t.loadFromFile( argv[1] );
if( argc == 3 && ! strcmp( argv[2], "DOT" ) ) {
t.toDot();
}
else t.printDic();


return 0;
} catch ( exceptions::fsdictException ex ) {
std::cerr << "Dictionary extraction failed: " << ex.what() << std::endl;
return 1;
}
}
Loading