1818 */
1919public class CoNLLUDocumentWriter {
2020
21+ private static final String LRB_PATTERN = "(?i)-LRB-" ;
22+ private static final String RRB_PATTERN = "(?i)-RRB-" ;
2123
22- private static final String LRB_PATTERN = "(?i)-LRB-" ;
23- private static final String RRB_PATTERN = "(?i)-RRB-" ;
2424
25+ public String printSemanticGraph (SemanticGraph basicSg ) {
26+ return printSemanticGraph (basicSg , null , true );
27+ }
2528
26- public String printSemanticGraph (SemanticGraph basicSg ) {
27- return printSemanticGraph (basicSg , null , true );
28- }
29+ public String printSemanticGraph (SemanticGraph basicSg , SemanticGraph enhancedSg ) {
30+ return printSemanticGraph (basicSg , enhancedSg , true );
31+ }
2932
30- public String printSemanticGraph (SemanticGraph basicSg , SemanticGraph enhancedSg ) {
31- return printSemanticGraph (basicSg , enhancedSg , true );
32- }
33+ public String printSemanticGraph (SemanticGraph basicSg , SemanticGraph enhancedSg , boolean unescapeParenthesis ) {
34+ StringBuilder sb = new StringBuilder ();
3335
34- public String printSemanticGraph (SemanticGraph basicSg , SemanticGraph enhancedSg , boolean unescapeParenthesis ) {
36+ /* Print comments. */
37+ for (String comment : basicSg .getComments ()) {
38+ sb .append (comment ).append (System .lineSeparator ());
39+ }
3540
41+ SemanticGraph tokenSg = enhancedSg != null ? enhancedSg : basicSg ;
3642
43+ for (IndexedWord token : tokenSg .vertexListSorted ()) {
44+ /* Check for multiword tokens. */
45+ if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
46+ printSpan (sb , token );
47+ } else if (token .containsKey (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ) && token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class )) {
48+ printMWT (sb , tokenSg , token );
49+ }
3750
38- StringBuilder sb = new StringBuilder ();
51+ /* Try to find main governor and additional dependencies. */
52+ IndexedWord gov = basicSg .containsVertex (token ) ? basicSg .getParent (token ) : null ;
53+ String govIdx = gov != null ? gov .toCopyIndex () : null ;
54+ GrammaticalRelation reln = gov != null ? basicSg .getEdge (gov , token ).getRelation () : null ;
55+
56+ HashMap <String , String > enhancedDependencies = new HashMap <>();
57+ if (enhancedSg != null ) {
58+ for (IndexedWord parent : enhancedSg .getParents (token )) {
59+ SemanticGraphEdge edge = enhancedSg .getEdge (parent , token );
60+ String relationString = edge .getRelation ().toString ();
61+ // for Joakim
62+ //if (edge.getWeight() == 1.0) {
63+ // relationString = relationString + ":ENH_CONTROL";
64+ //} else if (edge.getWeight() == 3.0) {
65+ // relationString = relationString + ":ENH_RELCL";
66+ //} else if (edge.getWeight() == 4.0) {
67+ // relationString = relationString + ":ENH_GAPPING";
68+ //} else if (edge.getWeight() == 5.0) {
69+ // relationString = relationString + ":ENH_CONJ_PROP";
70+ //}
71+ enhancedDependencies .put (parent .toCopyIndex (), relationString );
72+ }
73+ } else {
74+ // add enhanced ones stored with token
75+ HashMap <String , String > secondaryDeps = token .get (CoreAnnotations .CoNLLUSecondaryDepsAnnotation .class );
76+ if (secondaryDeps != null ) {
77+ enhancedDependencies .putAll (token .get (CoreAnnotations .CoNLLUSecondaryDepsAnnotation .class ));
78+ //add basic dependency
79+ if (gov != null ) {
80+ enhancedDependencies .put (govIdx , reln .toString ());
81+ }
82+ }
83+ }
3984
40- /* Print comments. */
41- for (String comment : basicSg .getComments ()) {
42- sb .append (comment ).append (System .lineSeparator ());
85+ String additionalDepsString = CoNLLUUtils .toExtraDepsString (enhancedDependencies );
86+ String word = token .word ();
87+ String featuresString = CoNLLUFeatures .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
88+ String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
89+ String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
90+ String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
91+ String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
92+ String relnName = reln == null ? "_" : reln .toString ();
93+
94+ // don't use after() directly; it returns a default of ""
95+ // TODO: does this handle SpaceAfter on other tokens or SpacesAfter?
96+ if (token .get (CoreAnnotations .AfterAnnotation .class ) != null && token .after ().equals ("" )) {
97+ IndexedWord nextVertex = tokenSg .getNodeByIndexSafe (token .index () + 1 );
98+ // the next word needs to exist and be part of the same MWT
99+ // and either this word is the start of the MWT
100+ // or this word is the middle of the same MWT as the next word
101+ // if that is true, we will skip the SpaceAfter annotation
102+ boolean inMWT = ((nextVertex != null && isMWTbutNotStart (nextVertex )) &&
103+ ((token .containsKey (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ) && token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class )) ||
104+ (isMWTbutNotStart (token ))));
105+ if (!inMWT ) {
106+ if (misc .equals ("_" )) {
107+ misc = "SpaceAfter=No" ;
108+ } else {
109+ misc = misc + "|SpaceAfter=No" ;
110+ }
43111 }
112+ }
44113
45- SemanticGraph tokenSg = enhancedSg != null ? enhancedSg : basicSg ;
46-
47- for (IndexedWord token : tokenSg .vertexListSorted ()) {
48- /* Check for multiword tokens. */
49- if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
50- printSpan (sb , token );
51- } else if (token .containsKey (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ) && token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class )) {
52- printMWT (sb , tokenSg , token );
53- }
54-
55- /* Try to find main governor and additional dependencies. */
56- IndexedWord gov = basicSg .containsVertex (token ) ? basicSg .getParent (token ) : null ;
57- String govIdx = gov != null ? gov .toCopyIndex () : null ;
58- GrammaticalRelation reln = gov != null ? basicSg .getEdge (gov , token ).getRelation () : null ;
59-
60- HashMap <String , String > enhancedDependencies = new HashMap <>();
61- if (enhancedSg != null ) {
62-
63- for (IndexedWord parent : enhancedSg .getParents (token )) {
64- SemanticGraphEdge edge = enhancedSg .getEdge (parent , token );
65- String relationString = edge .getRelation ().toString ();
66- // for Joakim
67- //if (edge.getWeight() == 1.0) {
68- // relationString = relationString + ":ENH_CONTROL";
69- //} else if (edge.getWeight() == 3.0) {
70- // relationString = relationString + ":ENH_RELCL";
71- //} else if (edge.getWeight() == 4.0) {
72- // relationString = relationString + ":ENH_GAPPING";
73- //} else if (edge.getWeight() == 5.0) {
74- // relationString = relationString + ":ENH_CONJ_PROP";
75- //}
76- enhancedDependencies .put (parent .toCopyIndex (), relationString );
77- }
78-
79- } else {
80-
81- // add enhanced ones stored with token
82- HashMap <String , String > secondaryDeps = token .get (CoreAnnotations .CoNLLUSecondaryDepsAnnotation .class );
83- if (secondaryDeps != null ) {
84- enhancedDependencies .putAll (token .get (CoreAnnotations .CoNLLUSecondaryDepsAnnotation .class ));
85- //add basic dependency
86- if (gov != null ) {
87- enhancedDependencies .put (govIdx , reln .toString ());
88- }
89- }
90- }
91-
92-
93- String additionalDepsString = CoNLLUUtils .toExtraDepsString (enhancedDependencies );
94- String word = token .word ();
95- String featuresString = CoNLLUFeatures .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
96- String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
97- String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
98- String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
99- String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
100- String relnName = reln == null ? "_" : reln .toString ();
101-
102- // don't use after() directly; it returns a default of ""
103- // TODO: does this handle SpaceAfter on other tokens or SpacesAfter?
104- if (token .get (CoreAnnotations .AfterAnnotation .class ) != null && token .after ().equals ("" )) {
105- IndexedWord nextVertex = tokenSg .getNodeByIndexSafe (token .index () + 1 );
106- // the next word needs to exist and be part of the same MWT
107- // and either this word is the start of the MWT
108- // or this word is the middle of the same MWT as the next word
109- // if that is true, we will skip the SpaceAfter annotation
110- boolean inMWT = ((nextVertex != null && isMWTbutNotStart (nextVertex )) &&
111- ((token .containsKey (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ) && token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class )) ||
112- (isMWTbutNotStart (token ))));
113- if (!inMWT ) {
114- if (misc .equals ("_" )) {
115- misc = "SpaceAfter=No" ;
116- } else {
117- misc = misc + "|SpaceAfter=No" ;
118- }
119- }
120- }
121-
122- /* Root. */
123- if (govIdx == null && basicSg .getRoots ().contains (token )) {
124- govIdx = "0" ;
125- relnName = GrammaticalRelation .ROOT .toString ();
126- } else if (govIdx == null ) {
127- govIdx = "_" ;
128- relnName = "_" ;
129- }
130-
131- if (enhancedSg != null && enhancedSg .getRoots ().contains (token )) {
132- if (enhancedDependencies .isEmpty ()) {
133- additionalDepsString = "0:root" ;
134- } else {
135- additionalDepsString = "0:root|" + additionalDepsString ;
136- }
137- }
138-
139- if (unescapeParenthesis ) {
140- word = word .replaceAll (LRB_PATTERN , "(" );
141- word = word .replaceAll (RRB_PATTERN , ")" );
142- lemma = lemma .replaceAll (LRB_PATTERN , "(" );
143- lemma = lemma .replaceAll (RRB_PATTERN , ")" );
144- }
145-
146- sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .toCopyIndex (), word ,
147- lemma , upos , pos , featuresString , govIdx , relnName , additionalDepsString , misc ));
114+ /* Root. */
115+ if (govIdx == null && basicSg .getRoots ().contains (token )) {
116+ govIdx = "0" ;
117+ relnName = GrammaticalRelation .ROOT .toString ();
118+ } else if (govIdx == null ) {
119+ govIdx = "_" ;
120+ relnName = "_" ;
121+ }
122+
123+ if (enhancedSg != null && enhancedSg .getRoots ().contains (token )) {
124+ if (enhancedDependencies .isEmpty ()) {
125+ additionalDepsString = "0:root" ;
126+ } else {
127+ additionalDepsString = "0:root|" + additionalDepsString ;
148128 }
149- sb . append ( System . lineSeparator ());
129+ }
150130
151- return sb .toString ();
131+ if (unescapeParenthesis ) {
132+ word = word .replaceAll (LRB_PATTERN , "(" );
133+ word = word .replaceAll (RRB_PATTERN , ")" );
134+ lemma = lemma .replaceAll (LRB_PATTERN , "(" );
135+ lemma = lemma .replaceAll (RRB_PATTERN , ")" );
136+ }
137+
138+ sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .toCopyIndex (), word ,
139+ lemma , upos , pos , featuresString , govIdx , relnName , additionalDepsString , misc ));
152140 }
141+ sb .append (System .lineSeparator ());
142+
143+ return sb .toString ();
144+ }
153145
154146 /**
155147 * Outputs just one token span (MWT)
156148 */
157149 public static void printSpan (StringBuilder sb , AbstractCoreLabel token ) {
158- IntPair tokenSpan = token .get (CoreAnnotations .CoNLLUTokenSpanAnnotation .class );
159- if (tokenSpan .getSource () == token .index ()) {
160- String range = String .format ("%d-%d" , tokenSpan .getSource (), tokenSpan .getTarget ());
161- sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .originalText ()));
162- }
150+ IntPair tokenSpan = token .get (CoreAnnotations .CoNLLUTokenSpanAnnotation .class );
151+ if (tokenSpan .getSource () == token .index ()) {
152+ String range = String .format ("%d-%d" , tokenSpan .getSource (), tokenSpan .getTarget ());
153+ sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .originalText ()));
154+ }
163155 }
164156
165157 /**
@@ -178,22 +170,22 @@ public static boolean isMWTbutNotStart(IndexedWord nextVertex) {
178170 }
179171
180172 public static void printMWT (StringBuilder sb , SemanticGraph graph , IndexedWord token ) {
181- int startIndex = token .index ();
182- int endIndex = startIndex ;
183- // advance endIndex until we reach the end of the sentence, the start of the next MWT,
184- // or a word which isn't part of any MWT
185- IndexedWord nextVertex ;
186- while ((nextVertex = graph .getNodeByIndexSafe (endIndex +1 )) != null ) {
187- if (!isMWTbutNotStart (nextVertex )) {
188- break ;
189- }
190- ++endIndex ;
173+ int startIndex = token .index ();
174+ int endIndex = startIndex ;
175+ // advance endIndex until we reach the end of the sentence, the start of the next MWT,
176+ // or a word which isn't part of any MWT
177+ IndexedWord nextVertex ;
178+ while ((nextVertex = graph .getNodeByIndexSafe (endIndex +1 )) != null ) {
179+ if (!isMWTbutNotStart (nextVertex )) {
180+ break ;
191181 }
192- if (startIndex == endIndex ) {
193- return ;
194- }
195- String range = String .format ("%d-%d" , startIndex , endIndex );
196- sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .get (CoreAnnotations .MWTTokenTextAnnotation .class )));
182+ ++endIndex ;
183+ }
184+ if (startIndex == endIndex ) {
185+ return ;
186+ }
187+ String range = String .format ("%d-%d" , startIndex , endIndex );
188+ sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .get (CoreAnnotations .MWTTokenTextAnnotation .class )));
197189 }
198190
199191 /**
@@ -205,42 +197,40 @@ public static void printMWT(StringBuilder sb, SemanticGraph graph, IndexedWord t
205197 */
206198
207199 public String printPOSAnnotations (CoreMap sentence , boolean fakeDeps ) {
208- StringBuilder sb = new StringBuilder ();
209-
210- int index = 0 ;
211- for (CoreLabel token : sentence .get (CoreAnnotations .TokensAnnotation .class )) {
212- /* Check for multiword tokens. */
213- if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
214- printSpan (sb , token );
215- }
200+ StringBuilder sb = new StringBuilder ();
216201
217- String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
218- String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
219- String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
220- String featuresString = CoNLLUFeatures .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
221- String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
222- final String head ;
223- final String rel ;
224- final String headrel ;
225- if (fakeDeps ) {
226- // deps count from 1, with 0 as the root.
227- // we will have the first word go to fake root
228- head = Integer .toString (index );
229- rel = (index == 0 ) ? "root" : "dep" ;
230- headrel = head + ":" + rel ;
231- } else {
232- head = "_" ;
233- rel = "_" ;
234- headrel = "_" ;
235- }
236- index ++;
237- sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .index (), token .word (),
238- lemma , upos , pos , featuresString , head , rel , headrel , misc ));
202+ int index = 0 ;
203+ for (CoreLabel token : sentence .get (CoreAnnotations .TokensAnnotation .class )) {
204+ /* Check for multiword tokens. */
205+ if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
206+ printSpan (sb , token );
239207 }
240- sb .append (System .lineSeparator ());
241-
242- return sb .toString ();
243208
209+ String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
210+ String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
211+ String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
212+ String featuresString = CoNLLUFeatures .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
213+ String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
214+ final String head ;
215+ final String rel ;
216+ final String headrel ;
217+ if (fakeDeps ) {
218+ // deps count from 1, with 0 as the root.
219+ // we will have the first word go to fake root
220+ head = Integer .toString (index );
221+ rel = (index == 0 ) ? "root" : "dep" ;
222+ headrel = head + ":" + rel ;
223+ } else {
224+ head = "_" ;
225+ rel = "_" ;
226+ headrel = "_" ;
227+ }
228+ index ++;
229+ sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .index (), token .word (),
230+ lemma , upos , pos , featuresString , head , rel , headrel , misc ));
244231 }
232+ sb .append (System .lineSeparator ());
245233
234+ return sb .toString ();
235+ }
246236}
0 commit comments