Skip to content

Commit 4672e07

Browse files
authored
Merge pull request #1445 from linas/empty-zzz
Revised ZZZ-connector handling.
2 parents 01b836d + 34fc152 commit 4672e07

File tree

7 files changed

+49
-20
lines changed

7 files changed

+49
-20
lines changed

data/en/4.0.dict

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@
5252
% This is mostly fixed, except that some uses of <noun-main-m>
5353
% remain, below.
5454

55+
% empty-connector is used for hard-coded handling of unpaired
56+
% quote marks in the C code.
57+
#define empty-connector ZZZ;
58+
5559
% Capitalization handling (no effect for now; these behave as empty words).
5660
<1stCAP>: ZZZ-;
5761
<nonCAP>: ZZZ-;

data/en/4.0.dict.m4

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ changecom(`%')
6161
% This is mostly fixed, except that some uses of <noun-main-m>
6262
% remain, below.
6363

64+
% empty-connector is used for hard-coded handling of unpaired
65+
% quote marks in the C code.
66+
#define empty-connector ZZZ;
67+
6468
% Capitalization handling (no effect for now; these behave as empty words).
6569
<1stCAP>: ZZZ-;
6670
<nonCAP>: ZZZ-;

link-grammar/dict-common/dict-common.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@
2222
#include "memory-pool.h" // Pool_desc
2323
#include "utilities.h" // locale_t
2424

25-
// EMPTY_CONNECTOR must be at least 6 or 7 chars long, to avoid
26-
// collisions with autogened connectors in the Atomese dict.
27-
#define EMPTY_CONNECTOR "ZZZZZZZZZ"
25+
26+
// Dict may have `#define empty-connector ZZZ` in it.
27+
#define EMPTY_CONNECTOR "empty-connector"
2828
#define UNLIMITED_CONNECTORS_WORD ("UNLIMITED-CONNECTORS")
2929
#define LIMITED_CONNECTORS_WORD ("LENGTH-LIMIT-")
3030
#define IS_GENERATION(dict) (dict->category != NULL)
@@ -123,10 +123,13 @@ struct Dictionary_s
123123
const char * version;
124124
const char * locale; /* Locale name */
125125
locale_t lctype; /* Locale argument for the *_l() functions */
126+
126127
int num_entries;
127128
float default_max_disjunct_cost;
128129
dfine_s dfine; /* Name-value definitions */
129130

131+
const char * zzz_connector;
132+
130133
bool use_unknown_word;
131134
bool unknown_word_defined;
132135
bool left_wall_defined;

link-grammar/dict-common/dict-locale.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,11 @@ bool dictionary_setup_defines(Dictionary dict)
441441

442442
dict->shuffle_linkages = false;
443443

444+
// Used for unattached quote marks, in the English dict only.
445+
dict->zzz_connector = linkgrammar_get_dict_define(dict, EMPTY_CONNECTOR);
446+
if (NULL != dict->zzz_connector)
447+
dict->zzz_connector = string_set_add(dict->zzz_connector, dict->string_set);
448+
444449
dictionary_setup_locale(dict);
445450

446451
if (!dictionary_setup_max_disjunct_cost(dict)) return false;

link-grammar/dict-common/dict-utils.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ static bool exp_has_connector(const Exp * e, int depth,
288288
}
289289

290290
/**
291-
* Find if an expression has a connector ZZZ- (that an empty-word has).
291+
* Check if an expression has a connector ZZZ- (that an empty-word has).
292292
* This is a costly way to find it. To reduce the overhead, the
293293
* exp_has_connector() "depth" argument limits the expression depth check,
294294
* supposing the ZZZ- connectors are not deep in the word expression.
@@ -297,9 +297,8 @@ static bool exp_has_connector(const Exp * e, int depth,
297297
**/
298298
bool is_exp_like_empty_word(Dictionary dict, Exp *exp)
299299
{
300-
const char *cs = string_set_lookup(EMPTY_CONNECTOR, dict->string_set);
301-
if (NULL == cs) return false;
302-
return exp_has_connector(exp, 2, cs, '-');
300+
if (NULL == dict->zzz_connector) return false;
301+
return exp_has_connector(exp, 2, dict->zzz_connector, '-');
303302
}
304303

305304
/* ======================================================== */

link-grammar/linkage/linkage.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -692,12 +692,15 @@ static void compute_chosen_words(Sentence sent, Linkage linkage,
692692
* to facilitate using diff on sentence batch runs. */
693693
if (test_enabled("removeZZZ"))
694694
{
695-
for (i=0; i<linkage->num_links; i++)
695+
if (sent->dict->zzz_connector)
696696
{
697-
Link *lnk = &(linkage->link_array[i]);
697+
for (i=0; i<linkage->num_links; i++)
698+
{
699+
Link *lnk = &(linkage->link_array[i]);
698700

699-
if (0 == strcmp("ZZZ", lnk->link_name))
700-
chosen_words[lnk->rw] = NULL;
701+
if (0 == strcmp(sent->dict->zzz_connector, lnk->link_name))
702+
chosen_words[lnk->rw] = NULL;
703+
}
701704
}
702705
}
703706

link-grammar/tokenize/lookup-exprs.c

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -161,30 +161,31 @@ GNUC_UNUSED static void print_x_node(X_node *x)
161161
* This function was mainly used to support using empty-words, a concept
162162
* that has been eliminated. However, it is still used to support linking of
163163
* quotes that don't get the QUc/QUd links.
164+
*
165+
* This function is called only if ZZZ is defined in the dictionary.
166+
* This is currently used only by the English dict, to allow quotes to
167+
* appear anywhere in the sentence.
164168
*/
165169
static void add_empty_word(Sentence sent, X_node *x)
166170
{
167-
Exp *zn, *an;
168-
const char *ZZZ = string_set_lookup(EMPTY_CONNECTOR, sent->dict->string_set);
169-
/* This function is called only if ZZZ is in the dictionary. */
170-
171171
/* The left-wall already has ZZZ-. The right-wall will not arrive here. */
172172
if (MT_WALL == x->word->morpheme_type) return;
173173

174174
/* Replace plain-word-exp by {ZZZ+} & (plain-word-exp) in each X_node. */
175175
for(; NULL != x; x = x->next)
176176
{
177-
/* Ignore stems for now, decreases a little the overhead for
178-
* stem-suffix languages. */
177+
/* Ignore stems for now; this decreases the overhead a little
178+
* for stem-suffix languages. */
179179
if (is_stem(x->string)) continue; /* Avoid an unneeded overhead. */
180180
//lgdebug(+0, "Processing '%s'\n", x->string);
181181

182182
/* zn points at {ZZZ+} */
183-
zn = make_connector_node(sent->dict, sent->Exp_pool, ZZZ, '+', false);
183+
Exp *zn = make_connector_node(sent->dict,
184+
sent->Exp_pool, sent->dict->zzz_connector, '+', false);
184185
zn = make_optional_node(sent->Exp_pool, zn);
185186

186187
/* an will be {ZZZ+} & (plain-word-exp) */
187-
an = make_and_node(sent->Exp_pool, zn, x->exp);
188+
Exp *an = make_and_node(sent->Exp_pool, zn, x->exp);
188189

189190
x->exp = an;
190191
}
@@ -257,7 +258,17 @@ static bool determine_word_expressions(Sentence sent, Gword *w,
257258
* supposing that the word has it in all of its dict entries
258259
* (in any case, currently there is only 1 entry for each such word).
259260
* Note that ZZZ_added starts at 0, as does wordpos, and that the
260-
* first sentence word (usually LEFT-WALL) doesn't need a check. */
261+
* first sentence word (usually LEFT-WALL) doesn't need a check.
262+
*
263+
* At this time, the empty-connector device is used only by the
264+
* English dict, to allow quotation marks to appear in random
265+
* locations in sentences. Rather than writing the English dict
266+
* so that *every word* has an optional {ZZZ-} & connector on it,
267+
* which would double the size of the dict, we instead add it here,
268+
* dynamically, on-the-fly, as needed. This whole thing feels
269+
* half-baked to me. It works, but it is a weird exception being
270+
* made for one language.
271+
*/
261272
if ((wordpos != *ZZZ_added) && is_exp_like_empty_word(dict, we->exp))
262273
{
263274
lgdebug(D_DWE, " (has ZZZ-)");

0 commit comments

Comments
 (0)