2424
2525// #include "stdafx.h"
2626#include " CTWLanguageModel.h"
27+ #include < cstddef>
2728#include < math.h> // not in use anymore? needed it for log
2829#include < cstring>
30+ #include " HashTable.h"
2931
3032using namespace Dasher ;
3133
@@ -73,7 +75,7 @@ inline int CCTWLanguageModel::MapIndex(int b, int f){
7375 return ((1 <<f)-1 + (b>>(NrPhases-f))); // (2^phase -1) + dec. value of most significant bits
7476}
7577
76- inline void CCTWLanguageModel::Scale (uint64 &a, uint64 &b)
78+ inline void CCTWLanguageModel::Scale (uint64_t &a, uint64_t &b)
7779{
7880 // Instead of using the full 16 bits for the probabilities, use only 9,
7981 // that's the only relevant information the other bits are noise <- depends on the value of MaxCount,
@@ -105,18 +107,18 @@ void CCTWLanguageModel::UpdatePath(int bit, int Update, int ValidDepth, int* & i
105107 // Update specifies yes (1) or no (0) (GetProbs). In the case 'no', the new Pws are calculated but the tree is not
106108 // altered in any way
107109
108- uint64 GammaZero; // (GammaZero / (GammaZero + GammaOne)) = Pw(0|x)
109- uint64 GammaOne; // (GammaOne / (GammaZero + GammaOne)) = Pw(1|x)
110+ uint64_t GammaZero; // (GammaZero / (GammaZero + GammaOne)) = Pw(0|x)
111+ uint64_t GammaOne; // (GammaOne / (GammaZero + GammaOne)) = Pw(1|x)
110112 unsigned short int CountZero; // Number of zeros seen so far in this node
111113 unsigned short int CountOne; // Number of ones seen so far in this node
112- uint64 PeBlockZero; // Local block probability of sequence (0,x)
113- uint64 PeBlockOne; // Local block probability of sequence (1,x)
114- uint64 PwCBlockZero; // Product of the weighted block probabilities of the childnodes of sequence (0,x)
115- uint64 PwCBlockOne; // Product of the weighted block probabilities of the childnodes of sequence (1,x)
116- uint64 PeCondZero; // Conditional local probability (0|x)
117- uint64 PeCondOne; // Conditional local probability (1|x)
118- uint64 PwCBlock; // Product of the weighted block probabilities of the childnodes of sequence (x)
119- uint64 PeBlock; // Local block probability of sequence (x)
114+ uint64_t PeBlockZero; // Local block probability of sequence (0,x)
115+ uint64_t PeBlockOne; // Local block probability of sequence (1,x)
116+ uint64_t PwCBlockZero; // Product of the weighted block probabilities of the childnodes of sequence (0,x)
117+ uint64_t PwCBlockOne; // Product of the weighted block probabilities of the childnodes of sequence (1,x)
118+ uint64_t PeCondZero; // Conditional local probability (0|x)
119+ uint64_t PeCondOne; // Conditional local probability (1|x)
120+ uint64_t PwCBlock; // Product of the weighted block probabilities of the childnodes of sequence (x)
121+ uint64_t PeBlock; // Local block probability of sequence (x)
120122
121123 // The deepest index can be a leaf, a failed node, or a not-placed node
122124 const int DeepestIndex = index[ValidDepth];
@@ -268,8 +270,7 @@ int CCTWLanguageModel::FindPath(CCTWContext & context, char NewChar, int phase,
268270 found = true ; // to avoid 'failed'
269271 index[i+1 ] = curindex; // tell calling function where to find the node, i+1 because index[0] = rootnode
270272 break ; // to escape loop and continue with next character
271- }
272- else // can't create a new node
273+ } else // can't create a new node
273274 {
274275 found = false ;
275276 index[i+1 ] = MaxNrNodes+1 ; // to indicate node could not be placed
@@ -357,9 +358,9 @@ void CCTWLanguageModel::GetProbs(Context context, std::vector<unsigned int> &Pro
357358 Interval[0 ] = Norm;
358359
359360 int ValidDepth = 0 ;
360- uint64 IntervalB = 0 ; // 'base' interval
361- uint64 IntervalZ = 0 ; // divided interval for the 0-branch
362- uint64 IntervalO = 0 ; // divided interval for the 1-branch
361+ uint64_t IntervalB = 0 ; // 'base' interval
362+ uint64_t IntervalZ = 0 ; // divided interval for the 0-branch
363+ uint64_t IntervalO = 0 ; // divided interval for the 1-branch
363364 unsigned int MinInterval = 0 ;
364365 unsigned short int Pw0 = 0 ;
365366 unsigned short int Pw1 = 0 ;
@@ -375,7 +376,7 @@ void CCTWLanguageModel::GetProbs(Context context, std::vector<unsigned int> &Pro
375376 IntervalB = Interval[(1 <<phase)+ steps - 1 ];
376377 self->UpdatePath (0 ,0 , ValidDepth, Index, Pw0, Pw1);
377378
378- IntervalZ = (IntervalB * Pw0)/(uint64 )(Pw0+Pw1); // flooring, influence of flooring P0 instead of P1 is negligible
379+ IntervalZ = (IntervalB * Pw0)/(uint64_t )(Pw0+Pw1); // flooring, influence of flooring P0 instead of P1 is negligible
379380 IntervalO = IntervalB - IntervalZ;
380381
381382 MinInterval = MinProb*1 <<(NrPhases-1 -phase); // leafs for each rootnode at the current phase, assuming a full alphabet!!
@@ -423,12 +424,6 @@ void CCTWLanguageModel::GetProbs(Context context, std::vector<unsigned int> &Pro
423424
424425bool CCTWLanguageModel::WriteToFile (std::string strFilename, std::string AlphabetName){
425426 SLMFileHeader GenericHeader;
426- // Magic number ("%DLF" in ASCII)
427- GenericHeader.szMagic [0 ] = ' %' ;
428- GenericHeader.szMagic [1 ] = ' D' ;
429- GenericHeader.szMagic [2 ] = ' L' ;
430- GenericHeader.szMagic [3 ] = ' F' ;
431-
432427 GenericHeader.iAlphabetSize = GetSize (); // Number of characters in the alphabet
433428 GenericHeader.iHeaderVersion = 1 ; // Version of the header
434429 GenericHeader.iLMID = 5 ; // ID of the language model, 5 for CTW
@@ -440,38 +435,32 @@ bool CCTWLanguageModel::WriteToFile(std::string strFilename, std::string Alphabe
440435 OutputFile = fopen (strFilename.c_str (), " wb" );
441436 if (OutputFile)
442437 {
443- char * buffer;
444- buffer = new char [AlphabetName.length ()+1 ];
445- strcpy (buffer, AlphabetName.c_str ());
446-
447438 // write header
448- fwrite (GenericHeader.szMagic , sizeof (GenericHeader.szMagic [0 ]), sizeof (GenericHeader.szMagic ), OutputFile );
449- fwrite (&GenericHeader.iHeaderVersion , 2 ,1 , OutputFile);
450- fwrite (&GenericHeader.iHeaderSize , 2 ,1 , OutputFile);
451- fwrite (&GenericHeader.iLMID , 2 ,1 , OutputFile);
452- fwrite (&GenericHeader.iLMVersion , 2 ,1 , OutputFile);
453- fwrite (&GenericHeader.iLMMinVersion , 2 ,1 , OutputFile);
454- fwrite (&GenericHeader.iAlphabetSize , 2 ,1 , OutputFile);
455- fwrite (buffer, 1 , AlphabetName.length (), OutputFile ); // UTF-8 encoded alphabet name (variable length struct)
456- delete[] buffer;
439+ fwrite (GenericHeader.szMagic , sizeof (GenericHeader.szMagic [0 ]), sizeof (GenericHeader.szMagic ) - 1 , OutputFile); // Do not print Null-Char
440+ fwrite (&GenericHeader.iHeaderVersion , sizeof (GenericHeader.iHeaderVersion ), 1 , OutputFile);
441+ fwrite (&GenericHeader.iHeaderSize , sizeof (GenericHeader.iHeaderSize ), 1 , OutputFile);
442+ fwrite (&GenericHeader.iLMID , sizeof (GenericHeader.iLMID ), 1 , OutputFile);
443+ fwrite (&GenericHeader.iLMVersion , sizeof (GenericHeader.iLMVersion ), 1 , OutputFile);
444+ fwrite (&GenericHeader.iLMMinVersion , sizeof (GenericHeader.iLMMinVersion ), 1 , OutputFile);
445+ fwrite (&GenericHeader.iAlphabetSize , sizeof (GenericHeader.iAlphabetSize ), 1 , OutputFile);
446+ fwrite (AlphabetName.c_str (), sizeof (AlphabetName[0 ]), AlphabetName.length (), OutputFile); // UTF-8 encoded alphabet name (variable length struct)
457447
458448 // CTW specific, not in SLMFileHeader
459- fwrite (&MaxNrNodes, 4 , 1 , OutputFile);
449+ fwrite (&MaxNrNodes, sizeof (MaxNrNodes), 1 , OutputFile);
460450
461451 for (int i=0 ;i<MaxNrNodes;i++)
462452 {
463- fwrite (&Tree[i].a , 1 , 1 , OutputFile);
464- fwrite (&Tree[i].b , 1 , 1 , OutputFile);
465- fwrite (&Tree[i].Symbol , 1 , 1 ,OutputFile);
466- fwrite (&Tree[i].NrTries , 1 ,1 ,OutputFile);
467- fwrite (&Tree[i].Pe , 2 ,1 ,OutputFile);
468- fwrite (&Tree[i].PwChild , 2 ,1 ,OutputFile);
453+ fwrite (&Tree[i].a , sizeof (CCTWNode::a), 1 , OutputFile);
454+ fwrite (&Tree[i].b , sizeof (CCTWNode::b), 1 , OutputFile);
455+ fwrite (&Tree[i].Symbol , sizeof (CCTWNode::Symbol), 1 ,OutputFile);
456+ fwrite (&Tree[i].NrTries , sizeof (CCTWNode::NrTries) ,1 ,OutputFile);
457+ fwrite (&Tree[i].Pe , sizeof (CCTWNode::Pe) ,1 ,OutputFile);
458+ fwrite (&Tree[i].PwChild , sizeof (CCTWNode::PwChild) ,1 ,OutputFile);
469459 }
470460 fclose (OutputFile);
471461 return true ;
472462 }
473- else
474- return false ;
463+ else return false ;
475464}
476465
477466bool CCTWLanguageModel::ReadFromFile (std::string strFilename, std::string AlphabetName){
@@ -485,38 +474,49 @@ bool CCTWLanguageModel::ReadFromFile(std::string strFilename, std::string Alphab
485474 The values to compare with should be parameters and not hardcoded. */
486475
487476 SLMFileHeader GenericHeader;
488- char * ReadAlphabetName;
477+ char * ReadAlphabetName;
478+
479+ size_t bytesRead;
489480
490- fread (&GenericHeader.szMagic , sizeof (GenericHeader.szMagic [0 ]), sizeof (GenericHeader.szMagic ), InputFile);
491- if (memcmp (GenericHeader.szMagic ," %DLF" ,4 ))
481+ bytesRead = fread (&GenericHeader.szMagic , sizeof (GenericHeader.szMagic [0 ]), sizeof (GenericHeader.szMagic ) - 1 , InputFile); // Magic string is written without null-char
482+ if (bytesRead < ( sizeof (GenericHeader. szMagic ) - 1 ) || memcmp (GenericHeader.szMagic ," %DLF" ,bytesRead ))
492483 { // magic strings not equal
493484 return false ;
494485 }
495- fread (&GenericHeader.iHeaderVersion ,2 ,1 , InputFile);
496- if (GenericHeader.iHeaderVersion != 1 )
486+
487+ bytesRead = fread (&GenericHeader.iHeaderVersion , sizeof (GenericHeader.iHeaderVersion ), 1 , InputFile);
488+ if (bytesRead < sizeof (GenericHeader.iHeaderVersion ) || GenericHeader.iHeaderVersion != 1 )
497489 { // unknown header version
498490 return false ;
499491 }
500- fread (&GenericHeader.iHeaderSize ,2 ,1 , InputFile);
501- fread (&GenericHeader.iLMID ,2 ,1 , InputFile);
502- if (GenericHeader.iLMID != 5 )
492+
493+ bytesRead = fread (&GenericHeader.iHeaderSize ,sizeof (GenericHeader.iHeaderSize ),1 , InputFile);
494+ if (bytesRead < sizeof (GenericHeader.iHeaderSize )) return false ; // Not enough bytes read
495+
496+ bytesRead = fread (&GenericHeader.iLMID ,sizeof (GenericHeader.iLMID ),1 , InputFile);
497+ if (bytesRead < sizeof (GenericHeader.iLMID ) || GenericHeader.iLMID != 5 )
503498 { // header indicates this is not a CTW model
504499 return false ;
505500 }
506- fread (&GenericHeader.iLMVersion ,2 ,1 , InputFile);
507- fread (&GenericHeader.iLMMinVersion ,2 ,1 , InputFile);
508- if (GenericHeader.iLMMinVersion > 1 )
501+
502+ bytesRead = fread (&GenericHeader.iLMVersion ,sizeof (GenericHeader.iLMVersion ),1 , InputFile);
503+ if (bytesRead < sizeof (GenericHeader.iLMVersion )) return false ; // Not enough bytes read
504+
505+ bytesRead = fread (&GenericHeader.iLMMinVersion ,sizeof (GenericHeader.iLMMinVersion ),1 , InputFile);
506+ if (bytesRead < sizeof (GenericHeader.iLMMinVersion ) || GenericHeader.iLMMinVersion > 1 )
509507 { // header indicates stored model newer than we can handle
510508 return false ;
511509 }
512- fread (&GenericHeader.iAlphabetSize ,2 ,1 , InputFile);
513- if (GenericHeader.iAlphabetSize != GetSize ())
510+
511+ bytesRead = fread (&GenericHeader.iAlphabetSize ,sizeof (GenericHeader.iAlphabetSize ),1 , InputFile);
512+ if (bytesRead < sizeof (GenericHeader.iAlphabetSize ) || GenericHeader.iAlphabetSize != GetSize ())
514513 { // header indicates stored model uses an alphabet of different size
515514 return false ;
516515 }
517516
518- ReadAlphabetName = new char [GenericHeader.iHeaderSize - sizeof (SLMFileHeader)+1 ];
519- fread (ReadAlphabetName,1 ,GenericHeader.iHeaderSize - sizeof (SLMFileHeader), InputFile);
517+ ReadAlphabetName = new char [GenericHeader.iHeaderSize - sizeof (SLMFileHeader) + 1 ];
518+ bytesRead = fread (ReadAlphabetName,sizeof (ReadAlphabetName[0 ]), GenericHeader.iHeaderSize - sizeof (SLMFileHeader), InputFile);
519+ if (bytesRead < GenericHeader.iHeaderSize - sizeof (SLMFileHeader)) return false ; // Not enough bytes read
520520 ReadAlphabetName[GenericHeader.iHeaderSize - sizeof (SLMFileHeader)] = ' \0 ' ; // write the terminating 0 and read it in as well
521521
522522 if (strcmp (ReadAlphabetName,AlphabetName.c_str ()))
@@ -525,21 +525,22 @@ bool CCTWLanguageModel::ReadFromFile(std::string strFilename, std::string Alphab
525525 return false ;
526526 }
527527 delete[] ReadAlphabetName;
528+
528529 int ReadNrNodes;
529- fread (&ReadNrNodes,4 , 1 , InputFile);
530- if (ReadNrNodes != MaxNrNodes)
530+ bytesRead = fread (&ReadNrNodes,sizeof (ReadNrNodes), 1 , InputFile);
531+ if (bytesRead < sizeof (ReadNrNodes) || ReadNrNodes != MaxNrNodes)
531532 { // header indicates different number of nodes in the hashtable
532533 return false ;
533534 }
534535
535536 for (int i=0 ;i<MaxNrNodes;i++)
536537 {
537- fread (&Tree[i].a ,1 ,1 ,InputFile);
538- fread (&Tree[i].b ,1 ,1 ,InputFile);
539- fread (&Tree[i].Symbol , 1 ,1 ,InputFile);
540- fread (&Tree[i].NrTries , 1 ,1 ,InputFile);
541- fread (&Tree[i].Pe , 2 ,1 ,InputFile);
542- fread (&Tree[i].PwChild , 2 ,1 ,InputFile);
538+ if ( fread (&Tree[i].a ,sizeof (CCTWNode::a) ,1 ,InputFile) < sizeof (CCTWNode::a)) return false ;
539+ if ( fread (&Tree[i].b ,sizeof (CCTWNode::b) ,1 ,InputFile) < sizeof (CCTWNode::b)) return false ;
540+ if ( fread (&Tree[i].Symbol , sizeof (CCTWNode::Symbol) ,1 ,InputFile) < sizeof (CCTWNode::Symbol)) return false ;
541+ if ( fread (&Tree[i].NrTries , sizeof (CCTWNode::NrTries) ,1 ,InputFile) < sizeof (CCTWNode::NrTries)) return false ;
542+ if ( fread (&Tree[i].Pe , sizeof (CCTWNode::Pe) ,1 ,InputFile) < sizeof (CCTWNode::Pe)) return false ;
543+ if ( fread (&Tree[i].PwChild , sizeof (CCTWNode::PwChild) ,1 ,InputFile) < sizeof (CCTWNode::PwChild)) return false ;
543544 }
544545 fclose (InputFile);
545546 return true ;
0 commit comments