Skip to content

Commit 0e159e5

Browse files
authored
Add Bliss::VocabTextLexiconParser for simple text-based lexica (#105)
1 parent 60b83f7 commit 0e159e5

File tree

5 files changed

+203
-24
lines changed

5 files changed

+203
-24
lines changed

src/Bliss/Lexicon.cc

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -137,27 +137,30 @@ Lexicon::~Lexicon() {
137137
}
138138

139139
void Lexicon::load(const std::string& filename) {
140-
Core::MD5 md5;
141-
if (md5.updateFromFile(filename))
140+
Core::MD5 md5;
141+
std::string strippedFilename = Core::FormatSet::stripQualifier(filename);
142+
if (md5.updateFromFile(strippedFilename)) {
142143
dependency_.setValue(md5);
143-
else
144-
warning("could not derive md5 sum from file '%s'", filename.c_str());
145-
LexiconParser parser(config, this);
146-
log("reading lexicon from file") << " \"" << filename << "\" ...";
147-
if (parser.parseFile(filename.c_str()) != 0)
144+
}
145+
else {
146+
warning("Could not derive md5 sum from file '%s'", strippedFilename.c_str());
147+
}
148+
149+
log("Reading lexicon from file") << " \"" << strippedFilename << "\" ...";
150+
if (!formats().read(filename, *this)) {
148151
error("Error while reading lexicon file.");
152+
}
149153
log("dependency value: ") << dependency_.value();
150154
}
151155

152156
LexiconRef Lexicon::create(const Configuration& c) {
153-
Lexicon* result = new Lexicon(c);
157+
auto result = Core::ref(new Lexicon(c));
154158
result->load(paramFilename(c));
155159
if (result->hasFatalErrors()) {
156-
delete result;
157160
return LexiconRef();
158161
}
159162
result->logStatistics();
160-
return LexiconRef(result);
163+
return result;
161164
}
162165

163166
Lemma* Lexicon::newLemma() {
@@ -847,3 +850,35 @@ Core::Ref<LemmaToEvaluationTokenTransducer> Lexicon::createLemmaToEvaluationToke
847850
Core::Ref<LemmaToEvaluationTokenTransducer> Lexicon::createLemmaToPreferredEvaluationTokenSequenceTransducer() const {
848851
return createLemmaToEvaluationTokenTransducer(false);
849852
}
853+
854+
template<>
855+
class Core::NameHelper<Lexicon> {
856+
public:
857+
operator std::string() const {
858+
return "Lexicon";
859+
}
860+
const char* c_str() const {
861+
return "Lexicon";
862+
}
863+
};
864+
865+
template<>
866+
class Core::NameHelper<Lexicon*> {
867+
public:
868+
operator std::string() const {
869+
return "Lexicon*";
870+
}
871+
const char* c_str() const {
872+
return "Lexicon*";
873+
}
874+
};
875+
876+
Core::FormatSet& Lexicon::formats() {
877+
if (!formats_) {
878+
formats_ = std::make_unique<Core::FormatSet>(Core::Configuration(Core::Application::us()->getConfiguration(), "lexicon-file-format-set"));
879+
formats_->registerFormat("xml", new XmlLexiconFormat(), true);
880+
formats_->registerFormat("vocab-text", new VocabTextLexiconFormat());
881+
formats_->registerFormat("vocab-txt", new VocabTextLexiconFormat());
882+
}
883+
return *formats_;
884+
}

src/Bliss/Lexicon.hh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,12 @@
2525
#include <Core/Component.hh>
2626
#include <Core/Dependency.hh>
2727
#include <Core/Extensions.hh>
28+
#include <Core/FormatSet.hh>
2829
#include <Core/Obstack.hh>
2930
#include <Core/Parameter.hh>
3031
#include <Core/ReferenceCounting.hh>
3132
#include <Core/StringUtilities.hh>
33+
#include <Core/Types.hh>
3234
#include "Phoneme.hh"
3335
#include "Symbol.hh"
3436

@@ -477,7 +479,7 @@ class LemmaToEvaluationTokenTransducer;
477479
*
478480
* A lemma may be assigned a symbolic name, which the system can
479481
* use to identify lemmas which have a special meaning to it.
480-
* E.g. the silence word is is identified by the symbolic name
482+
* E.g. the silence word is identified by the symbolic name
481483
* "silence". Such lemmas are called "special lemmas".
482484
*/
483485

@@ -607,7 +609,7 @@ public:
607609
void defineSpecialLemma(const std::string& name, Lemma* lemma);
608610

609611
/**
610-
* Load lexicon from XML file.
612+
* Load lexicon from XML or txt file.
611613
*/
612614
void load(const std::string& filename);
613615

@@ -883,6 +885,11 @@ public:
883885
* evaluation token sequences, the first is used.
884886
*/
885887
Core::Ref<LemmaToEvaluationTokenTransducer> createLemmaToPreferredEvaluationTokenSequenceTransducer() const;
888+
889+
private:
890+
std::unique_ptr<Core::FormatSet> formats_;
891+
892+
Core::FormatSet& formats();
886893
};
887894

888895
} // namespace Bliss

src/Bliss/LexiconParser.cc

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ void LexiconElement::addPhon(const WeightedPhonemeString& phon) {
257257
return;
258258
if (!product_->phonemeInventory()) {
259259
parser()->warning(
260-
"No phoneme inventory defined. Ingnoring pronunciation");
260+
"No phoneme inventory defined. Ignoring pronunciation");
261261
return;
262262
}
263263

@@ -358,7 +358,7 @@ const Core::ParameterString paramEncoding(
358358
"utf-8");
359359
} // namespace
360360

361-
void LexiconParser::loadWhitelist(const Core::Configuration& config, Core::StringHashSet& whitelist) {
361+
void XmlLexiconParser::loadWhitelist(const Core::Configuration& config, Core::StringHashSet& whitelist) {
362362
std::string filename = paramFile(config);
363363
if (!filename.empty()) {
364364
Core::CompressedInputStream* cis = new Core::CompressedInputStream(filename.c_str());
@@ -379,12 +379,88 @@ void LexiconParser::loadWhitelist(const Core::Configuration& config, Core::Strin
379379
}
380380
}
381381

382-
LexiconParser::LexiconParser(const Core::Configuration& c, Lexicon* _lexicon)
383-
: Precursor(c) {
382+
XmlLexiconParser::XmlLexiconParser(const Core::Configuration& c, Lexicon* _lexicon)
383+
: LexiconParser(),
384+
XmlSchemaParser(c) {
384385
lexicon_ = _lexicon;
385386

386387
// build schema
387388
LexiconElement* lexElement = new LexiconElement(this, LexiconElement::creationHandler(&Self::pseudoCreateLexicon), c);
388389
loadWhitelist(select("vocab"), lexElement->whitelist_);
389390
setRoot(collect(lexElement));
390391
}
392+
393+
// use base class parse function
394+
bool XmlLexiconParser::parseFile(const std::string& filename) {
395+
return parser()->Core::XmlSchemaParser::parseFile(filename.c_str()) == 0;
396+
}
397+
398+
VocabTextLexiconParser::VocabTextLexiconParser(Lexicon* _lexicon)
399+
: LexiconParser(),
400+
lexicon_(_lexicon) {
401+
phonemeInventory_ = Core::Ref(new PhonemeInventory());
402+
}
403+
404+
// parse txt file line by line to a Bliss::Lexicon
405+
// in the first step, the phonemes are created and the phoneme inventory is set
406+
// and afterwards the lemmata can be created from these phonemes
407+
bool VocabTextLexiconParser::parseFile(const std::string& filename) {
408+
// collect all labels from the file and add them as phonemes to the phoneme inventory
409+
std::ifstream file(filename);
410+
if (!file.is_open()) {
411+
return false;
412+
}
413+
std::string line;
414+
while (std::getline(file, line)) {
415+
if (line.empty())
416+
continue;
417+
createPhoneme(line);
418+
}
419+
420+
// set the phoneme inventory
421+
lexicon_->setPhonemeInventory(phonemeInventory_);
422+
// iterate over the phonemes in the inventory to create the lemmata in the lexicon
423+
createLemmata();
424+
return true;
425+
}
426+
427+
// helper function to handle one label and create a corresponding phoneme
428+
void VocabTextLexiconParser::createPhoneme(const std::string& line) {
429+
std::string symbol(line);
430+
stripWhitespace(symbol); // in case there are any unintentional whitespaces
431+
suppressTrailingBlank(symbol);
432+
433+
// check if phoneme was already added (if one label appears more than once)
434+
if (phonemeInventory_->phoneme(symbol)) {
435+
Core::Application::us()->error("Phoneme \"%s\" was already added to the inventory. It may be duplicated in the lexicon.", symbol.c_str());
436+
}
437+
438+
// create a new phoneme
439+
Phoneme* newPhoneme_ = phonemeInventory_->newPhoneme();
440+
// set symbol
441+
phonemeInventory_->assignSymbol(newPhoneme_, symbol);
442+
// set variation to none
443+
newPhoneme_->setContextDependent(false);
444+
}
445+
446+
// helper function to create the lemmata
447+
void VocabTextLexiconParser::createLemmata() {
448+
// iterate over the phonemes which were assigned to the inventory previously
449+
auto phonemes = phonemeInventory_->phonemes();
450+
for (auto it = phonemes.first; it != phonemes.second; ++it) {
451+
const Phoneme* phoneme = *it;
452+
std::string symbol = phoneme->symbol();
453+
454+
// make sure that lemma has not been added yet
455+
verify(!lexicon_->lemma(symbol));
456+
457+
// create a new lemma
458+
Lemma* newLemma_ = lexicon_->newLemma();
459+
// set orth
460+
lexicon_->setOrthographicForms(newLemma_, {symbol});
461+
// set phon
462+
Pronunciation* pron = lexicon_->getPronunciation(symbol);
463+
lexicon_->addPronunciation(newLemma_, pron);
464+
lexicon_->setDefaultLemmaName(newLemma_);
465+
}
466+
}

src/Bliss/LexiconParser.hh

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,15 @@ struct WeightedPhonemeString;
5151
class PronunciationElement;
5252
class LexiconElement;
5353
class LexiconParser;
54+
class TextLexiconParser;
55+
class XmlLexiconParser;
5456

5557
class LexiconElement : public Core::XmlBuilderElement<
5658
Lexicon,
5759
Core::XmlRegularElement,
5860
Core::CreateByContext> {
5961
friend class LexiconParser;
62+
friend class XmlLexiconParser;
6063
typedef Core::XmlBuilderElement<
6164
Lexicon,
6265
Core::XmlRegularElement,
@@ -96,17 +99,25 @@ public:
9699
virtual void characters(const char*, int) {};
97100
};
98101

102+
/*
103+
* Base lexicon parser class
104+
*/
105+
class LexiconParser {
106+
public:
107+
virtual ~LexiconParser() {}
108+
virtual bool parseFile(const std::string& filename) = 0;
109+
virtual Lexicon* lexicon() const = 0;
110+
};
111+
99112
/**
100113
* Parser for Bliss lexicon files.
101114
* This class implements parsing of the lexicon XML format
102115
* described in <a href="../../doc/Lexicon.pdf">Lexicon File
103116
* Format Reference</a>. It is normally not used directly but
104117
* through Lexicon.
105118
*/
106-
107-
class LexiconParser : public Core::XmlSchemaParser {
108-
typedef Core::XmlSchemaParser Precursor;
109-
typedef LexiconParser Self;
119+
class XmlLexiconParser : public virtual LexiconParser, public Core::XmlSchemaParser {
120+
typedef XmlLexiconParser Self;
110121

111122
private:
112123
Lexicon* lexicon_;
@@ -116,12 +127,55 @@ private:
116127
void loadWhitelist(const Core::Configuration&, Core::StringHashSet&);
117128

118129
public:
119-
LexiconParser(const Core::Configuration& c, Lexicon*);
120-
Lexicon* lexicon() const {
130+
XmlLexiconParser(const Core::Configuration& c, Lexicon*);
131+
bool parseFile(const std::string& filename) override;
132+
Lexicon* lexicon() const override {
121133
return lexicon_;
122134
}
123135
};
124136

137+
struct XmlLexiconFormat : public Core::FormatSet::Format<Lexicon> {
138+
bool read(const std::string& filename, Lexicon& lexicon) const override {
139+
XmlLexiconParser parser(Core::Application::us()->getConfiguration(), &lexicon);
140+
return parser.parseFile(filename);
141+
}
142+
143+
bool write(const std::string& filename, Lexicon const& lexicon) const override {
144+
return false;
145+
}
146+
};
147+
148+
/**
149+
* Parser for text lexicon files containing the vocab, so only the labels
150+
* This is meant for "lexicon-free" search
151+
* The .txt-file should contain one label per line
152+
*/
153+
class VocabTextLexiconParser : public LexiconParser {
154+
private:
155+
Core::Ref<Lexicon> lexicon_;
156+
Core::Ref<PhonemeInventory> phonemeInventory_;
157+
void createPhoneme(const std::string& line);
158+
void createLemmata();
159+
160+
public:
161+
VocabTextLexiconParser(Lexicon*);
162+
bool parseFile(const std::string& filename) override;
163+
Lexicon* lexicon() const override {
164+
return lexicon_.get();
165+
}
166+
};
167+
168+
struct VocabTextLexiconFormat : public Core::FormatSet::Format<Lexicon> {
169+
bool read(const std::string& filename, Lexicon& lexicon) const override {
170+
VocabTextLexiconParser parser(&lexicon);
171+
return parser.parseFile(filename);
172+
}
173+
174+
bool write(const std::string& filename, Lexicon const& lexicon) const override {
175+
return false;
176+
}
177+
};
178+
125179
} // namespace Bliss
126180

127181
#endif // _BLISS_LEXICONPARSER_HH

src/Core/FormatSet.hh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,19 @@ public:
8383
*/
8484
typedef StringHashMap<Ref<ReferenceCounted>> Formats;
8585

86+
/*
87+
* Returns the format specifier or an empty string if not found
88+
*/
89+
static std::string getQualifier(const std::string& filename);
90+
/*
91+
* Returns the filename without the format specifier or the full filename if not found
92+
*/
93+
static std::string stripQualifier(const std::string& filename);
94+
8695
private:
8796
Formats formats_;
8897

8998
private:
90-
static std::string getQualifier(const std::string& filename);
91-
static std::string stripQualifier(const std::string& filename);
9299
template<class T>
93100
void getTypeSpecificFormats(Ref<TypeSpecificFormats<T>>&);
94101
template<class T>

0 commit comments

Comments
 (0)