Skip to content
This repository was archived by the owner on Sep 13, 2022. It is now read-only.

Commit 8003af3

Browse files
committed
Merge pull request #12 from EducationalTestingService/feature/parse_tagged_sent
Added function to parse already tagged sentences and files.
2 parents 2388603 + 4b23eb8 commit 8003af3

File tree

9 files changed

+331
-28
lines changed

9 files changed

+331
-28
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@ dist
33
build
44
build.sh
55
python_zpar.egg-info
6-
zpar/__pycache__
6+
zpar/__pycache__
7+
*.pyc
8+
test_twice.py

README.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,11 @@ Here's a small example of how to use python-zpar:
107107
tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?", tokenize=False)
108108
print_(tagged_sent)
109109
110-
# get the dependency parses of the same two sentences
111-
dep_parsed_sent = depparser.dep_parse_sentence("I am going to the market.")
110+
# get the dependency parse of an already tagged sentence
111+
dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
112112
print_(dep_parsed_sent)
113113
114+
# get the dependency parse of an already tokenized sentence
114115
dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False)
115116
print_(dep_parsed_sent)
116117

examples/test_tagged.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.
2+
Are/VBP you/PRP going/VBG to/TO come/VB with/IN me/PRP ?/.

examples/zpar_example.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,11 @@
3131
tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?", tokenize=False)
3232
print_(tagged_sent)
3333

34-
# get the dependency parses of the same two sentences
35-
dep_parsed_sent = depparser.dep_parse_sentence("I am going to the market.")
34+
# get the dependency parse of an already tagged sentence
35+
dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
3636
print_(dep_parsed_sent)
3737

38+
# get the dependency parse of an already tokenized sentence
3839
dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False)
3940
print_(dep_parsed_sent)
4041

src/zpar.lib.cpp

Lines changed: 200 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ extern "C" int load_parser(void* vzps, const char *sFeaturePath) {
166166
return 0;
167167
}
168168

169+
170+
169171
// The function to load the dependency parser model
170172
extern "C" int load_depparser(void* vzps, const char *sFeaturePath) {
171173

@@ -282,19 +284,61 @@ extern "C" char* parse_sentence(void* vzps, const char *input_sentence, bool tok
282284
zps->output_buffer = new char[1];
283285
strcpy(zps->output_buffer, "");
284286
} else {
285-
// initialize the variable that will hold the tagged sentence
287+
// initialize the variables that will hold the tagged and parsed sentences
286288
CTwoStringVector tagged_sent[1];
287289
english::CCFGTree parsed_sent[1];
288290

289-
// get the tagger that was stored earlier
291+
// get the tagger and parser that were stored earlier
290292
CTagger *tagger = zps->tagger;
291293
CConParser *conparser = zps->conparser;
292294

293-
// tag the sentence
295+
// tag and parse the sentence
294296
tagger->tag(tokenized_sent, tagged_sent);
295297
conparser->parse(*tagged_sent, parsed_sent);
296298

297-
// now put the tagged_sent into a string stream
299+
// now put the parsed sentence into a string stream
300+
std::string parse = parsed_sent->str_unbinarized();
301+
int parselen = parse.length();
302+
zps->output_buffer = new char[parselen + 1];
303+
strcpy(zps->output_buffer, parse.c_str());
304+
}
305+
306+
return zps->output_buffer;
307+
}
308+
309+
extern "C" char* parse_tagged_sentence(void* vzps, const char *input_tagged_sentence, const char seperator='/')
310+
{
311+
312+
zparSession_t* zps = static_cast<zparSession_t *>(vzps);
313+
314+
// create a temporary string stream from the input char *
315+
CSentenceReader input_reader(std::string(input_tagged_sentence), false);
316+
317+
// read the tagged sentence into a CTwoStringVector
318+
CTwoStringVector tagged_sent[1];
319+
input_reader.readTaggedSentence(tagged_sent, false, seperator);
320+
321+
if (zps->output_buffer != NULL) {
322+
delete zps->output_buffer;
323+
zps->output_buffer = NULL;
324+
}
325+
326+
if(tagged_sent->size() >= MAX_SENTENCE_SIZE){
327+
// The ZPar code asserts that length < MAX_SENTENCE_SIZE...
328+
std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_tagged_sentence << std::endl;
329+
zps->output_buffer = new char[1];
330+
strcpy(zps->output_buffer, "");
331+
} else {
332+
// initialize the variable that will hold the parsed sentence
333+
english::CCFGTree parsed_sent[1];
334+
335+
// get the parser that was stored earlier
336+
CConParser *conparser = zps->conparser;
337+
338+
// parse the tagged sentence
339+
conparser->parse(*tagged_sent, parsed_sent);
340+
341+
// now put the parsed sentence into a string stream
298342
std::string parse = parsed_sent->str_unbinarized();
299343
int parselen = parse.length();
300344
zps->output_buffer = new char[parselen + 1];
@@ -321,10 +365,6 @@ extern "C" char* dep_parse_sentence(void* vzps, const char *input_sentence, bool
321365
input_reader.readSegmentedSentence(tokenized_sent);
322366
}
323367

324-
// initialize the variable that will hold the tagged sentence
325-
CTwoStringVector tagged_sent[1];
326-
CDependencyParse parsed_sent[1];
327-
328368
if (zps->output_buffer != NULL) {
329369
delete zps->output_buffer;
330370
zps->output_buffer = NULL;
@@ -336,11 +376,16 @@ extern "C" char* dep_parse_sentence(void* vzps, const char *input_sentence, bool
336376
zps->output_buffer = new char[1];
337377
strcpy(zps->output_buffer, "");
338378
} else {
339-
// get the tagger that was stored earlier
379+
380+
// initialize the variable that will hold the tagged and parsed sentences
381+
CTwoStringVector tagged_sent[1];
382+
CDependencyParse parsed_sent[1];
383+
384+
// get the tagger and parser that were stored earlier
340385
CTagger *tagger = zps->tagger;
341386
CDepParser *depparser = zps->depparser;
342387

343-
// tag the sentence
388+
// tag and parse the sentence
344389
tagger->tag(tokenized_sent, tagged_sent);
345390
depparser->parse(*tagged_sent, parsed_sent);
346391

@@ -354,6 +399,49 @@ extern "C" char* dep_parse_sentence(void* vzps, const char *input_sentence, bool
354399
return zps->output_buffer;
355400
}
356401

402+
// Function to dependency parse a sentence
403+
extern "C" char* dep_parse_tagged_sentence(void* vzps, const char *input_tagged_sentence, const char seperator='/')
404+
{
405+
zparSession_t* zps = static_cast<zparSession_t *>(vzps);
406+
407+
// create a temporary string stream from the input char *
408+
CSentenceReader input_reader(std::string(input_tagged_sentence), false);
409+
410+
// read the tagged sentence into a CTwoStringVector
411+
CTwoStringVector tagged_sent[1];
412+
input_reader.readTaggedSentence(tagged_sent, false, seperator);
413+
414+
if (zps->output_buffer != NULL) {
415+
delete zps->output_buffer;
416+
zps->output_buffer = NULL;
417+
}
418+
419+
if(tagged_sent->size() >= MAX_SENTENCE_SIZE){
420+
// The ZPar code asserts that length < MAX_SENTENCE_SIZE...
421+
std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_tagged_sentence << std::endl;
422+
zps->output_buffer = new char[1];
423+
strcpy(zps->output_buffer, "");
424+
} else {
425+
426+
// initialize the variable that will hold the parsed sentence
427+
CDependencyParse parsed_sent[1];
428+
429+
// get the parser that was stored earlier
430+
CDepParser *depparser = zps->depparser;
431+
432+
// parse the sentence
433+
depparser->parse(*tagged_sent, parsed_sent);
434+
435+
// now output the formatted dependency tree
436+
std::string deptree = format_dependency_tree(parsed_sent);
437+
int deptreelen = deptree.length();
438+
zps->output_buffer = new char[deptreelen + 1];
439+
strcpy(zps->output_buffer, deptree.c_str());
440+
}
441+
442+
return zps->output_buffer;
443+
}
444+
357445
// Function to tag all sentence in the given input file
358446
// and write tagged sentences to the given output file
359447
extern "C" void tag_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize)
@@ -366,17 +454,17 @@ extern "C" void tag_file(void* vzps, const char *sInputFile, const char *sOutput
366454
// initialize the input reader
367455
CSentenceReader input_reader(sInputFile);
368456

369-
// open the output file
370-
FILE *outfp = NULL;
371-
outfp = fopen(sOutputFile, "w");
372-
373457
// initialize the temporary sentence variables
374458
CStringVector tokenized_sent[1];
375459
CTwoStringVector tagged_sent[1];
376460

377461
// get the tagger and the parser that were stored earlier
378462
CTagger *tagger = zps->tagger;
379463

464+
// initialize the output file writer
465+
std::string outputFileName = std::string(sOutputFile);
466+
CSentenceWriter output_writer(outputFileName);
467+
380468
// read in and tokenize the given input file if asked
381469
bool readSomething;
382470
if (tokenize) {
@@ -396,8 +484,7 @@ extern "C" void tag_file(void* vzps, const char *sInputFile, const char *sOutput
396484
tagger->tag(tokenized_sent, tagged_sent);
397485

398486
// write the formatted sentence to the output file
399-
std::string tagvec = format_tagged_vector(tagged_sent);
400-
fprintf(outfp, "%s\n", tagvec.c_str());
487+
output_writer.writeSentence(tagged_sent, '/', true);
401488

402489
if (tokenize) {
403490
readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
@@ -409,7 +496,6 @@ extern "C" void tag_file(void* vzps, const char *sInputFile, const char *sOutput
409496

410497
// close the output file
411498
std::cerr << "Wrote output to " << sOutputFile << std::endl;
412-
fclose(outfp);
413499
}
414500

415501
// Function to constituency parse all sentence in the given input file
@@ -477,6 +563,51 @@ extern "C" void parse_file(void* vzps, const char *sInputFile, const char *sOutp
477563
fclose(outfp);
478564
}
479565

566+
extern "C" void parse_tagged_file(void* vzps, const char *sInputFile, const char *sOutputFile, const char seperator='/')
567+
{
568+
569+
zparSession_t* zps = static_cast<zparSession_t *>(vzps);
570+
571+
std::cerr << "Processing file " << sInputFile << std::endl;
572+
573+
// initialize the input reader
574+
CSentenceReader input_reader(sInputFile);
575+
576+
// open the output file
577+
FILE *outfp = NULL;
578+
outfp = fopen(sOutputFile, "w");
579+
580+
// initialize the temporary sentence variables
581+
CTwoStringVector tagged_sent[1];
582+
english::CCFGTree parsed_sent[1];
583+
584+
// get the parser that was stored earlier
585+
CConParser *conparser = zps->conparser;
586+
587+
// read in and tokenize the given input file if asked
588+
bool readSomething;
589+
readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator);
590+
591+
while ( readSomething )
592+
{
593+
std::string parse = "";
594+
if(tagged_sent->size() < MAX_SENTENCE_SIZE){
595+
conparser->parse(*tagged_sent, parsed_sent);
596+
parse = parsed_sent->str_unbinarized();
597+
} else {
598+
std::cerr << "Sentence too long. Writing empty string. Sentence: " << tagged_sent << std::endl;
599+
}
600+
601+
fprintf(outfp, "%s\n", parse.c_str());
602+
603+
readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator);
604+
}
605+
606+
// close the output file
607+
std::cerr << "Wrote output to " << sOutputFile << std::endl;
608+
fclose(outfp);
609+
}
610+
480611
// Function to dependency parse all sentence in the given input file
481612
// and write parsed sentences to the given output file
482613
extern "C" void dep_parse_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize)
@@ -542,6 +673,51 @@ extern "C" void dep_parse_file(void* vzps, const char *sInputFile, const char *s
542673
fclose(outfp);
543674
}
544675

676+
extern "C" void dep_parse_tagged_file(void* vzps, const char *sInputFile, const char *sOutputFile, const char seperator='/')
677+
{
678+
679+
zparSession_t* zps = static_cast<zparSession_t *>(vzps);
680+
681+
std::cerr << "Processing file " << sInputFile << std::endl;
682+
683+
// initialize the input reader
684+
CSentenceReader input_reader(sInputFile);
685+
686+
// open the output file
687+
FILE *outfp = NULL;
688+
outfp = fopen(sOutputFile, "w");
689+
690+
// initialize the temporary sentence variables
691+
CTwoStringVector tagged_sent[1];
692+
CDependencyParse parsed_sent[1];
693+
694+
// get the parser that was stored earlier
695+
CDepParser *depparser = zps->depparser;
696+
697+
// read in and tokenize the given input file if asked
698+
bool readSomething;
699+
readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator);
700+
701+
while ( readSomething )
702+
{
703+
std::string deptree = "";
704+
if(tagged_sent->size() < MAX_SENTENCE_SIZE){
705+
depparser->parse(*tagged_sent, parsed_sent);
706+
deptree = format_dependency_tree(parsed_sent);
707+
} else {
708+
std::cerr << "Sentence too long. Writing empty string. Sentence: " << tagged_sent << std::endl;
709+
}
710+
711+
fprintf(outfp, "%s\n", deptree.c_str());
712+
713+
readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator);
714+
}
715+
716+
// close the output file
717+
std::cerr << "Wrote output to " << sOutputFile << std::endl;
718+
fclose(outfp);
719+
}
720+
545721
// Function to unload all the models
546722
extern "C" void unload_models(void* vzps)
547723
{
@@ -554,12 +730,17 @@ extern "C" void unload_models(void* vzps)
554730
zps = NULL;
555731
}
556732

557-
// // A main function for testing
733+
// A main function for testing
558734
// extern "C" int main(int argc, char *argv[])
559735
// {
560736
// void* vzps = initialize();
561737
// load_tagger(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models");
562-
// std::cout << std::string(tag_sentence(vzps, "I said I am going to the market.", false));
738+
// load_parser(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models");
739+
// load_depparser(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models");
740+
// parse_tagged_file(vzps, "/Users/nmadnani/work/python-zpar/examples/test_tagged.txt", "/Users/nmadnani/work/python-zpar/examples/test_tagged.parse");
741+
// dep_parse_tagged_file(vzps, "/Users/nmadnani/work/python-zpar/examples/test_tagged.txt", "/Users/nmadnani/work/python-zpar/examples/test_tagged.dep");
742+
// std::cout << std::string(parse_tagged_sentence(vzps, "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")) << std::endl;
743+
// std::cout << std::string(dep_parse_tagged_sentence(vzps, "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")) << std::endl;
563744
// unload_models(vzps);
564745
// return 0;
565746
// }

tests/test_depparser.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,39 @@ def check_dep_parse_file(tokenize=False):
7171
def test_dep_parse_file():
7272
yield check_dep_parse_file, False
7373
yield check_dep_parse_file, True
74+
75+
76+
def test_dep_parse_tagged_sentence():
    # depparser is imported from the tests package at call time
    from tests import depparser

    # input whose tokens already carry word/TAG annotations
    tagged_input = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./."

    # expected result: one tab-separated row per token, newline-terminated
    expected = "I\tPRP\t1\tSUB\n'm\tVBP\t-1\tROOT\ngoing\tVBG\t1\tVC\nto\tTO\t2\tVMOD\nthe\tDT\t5\tNMOD\nmarket\tNN\t3\tPMOD\n.\t.\t1\tP\n"

    actual = depparser.dep_parse_tagged_sentence(tagged_input)
    assert_equal(actual, expected)
84+
85+
86+
def test_dep_parse_tagged_file():
    # depparser is imported from the tests package at call time
    from tests import depparser

    # the tagged input lives in the examples directory next to this package;
    # the parser output is written alongside it
    tagged_path = abspath(join(_my_dir, '..', 'examples', 'test_tagged.txt'))
    parsed_path = abspath(join(_my_dir, '..', 'examples', 'test_tagged.dep'))

    # expected file contents after stripping each line: the rows for each
    # sentence are followed by an empty line
    expected_lines = ['I\tPRP\t1\tSUB', 'am\tVBP\t-1\tROOT',
                      'going\tVBG\t1\tVC', 'to\tTO\t2\tVMOD',
                      'the\tDT\t5\tNMOD', 'market\tNN\t3\tPMOD',
                      '.\t.\t1\tP', '', 'Are\tVBP\t-1\tROOT',
                      'you\tPRP\t0\tSUB', 'going\tVBG\t0\tVMOD',
                      'to\tTO\t4\tVMOD', 'come\tVB\t2\tVMOD',
                      'with\tIN\t4\tVMOD', 'me\tPRP\t5\tPMOD',
                      '?\t.\t0\tP', '']

    # run the parser over the whole file
    depparser.dep_parse_tagged_file(tagged_path, parsed_path)

    # load what was written and compare it line by line
    with open(parsed_path, 'r') as parsed_file:
        actual_lines = [line.strip() for line in parsed_file]

    assert_equal(actual_lines, expected_lines)

0 commit comments

Comments
 (0)