|
1 | 1 | package com.semmle.js.extractor;
|
2 | 2 |
|
3 |
| -import com.semmle.util.data.StringUtil; |
4 |
| -import com.semmle.util.exception.CatastrophicError; |
5 |
| -import com.semmle.util.exception.UserError; |
6 |
| -import com.semmle.util.locations.LineTable; |
7 |
| -import com.semmle.util.trap.TrapWriter; |
8 |
| -import com.semmle.util.trap.TrapWriter.Label; |
9 |
| -import com.semmle.util.trap.TrapWriter.Table; |
10 |
| - |
11 | 3 | import java.util.Collections;
|
12 | 4 |
|
13 |
| -import org.yaml.snakeyaml.composer.Composer; |
14 |
| -import org.yaml.snakeyaml.error.Mark; |
15 |
| -import org.yaml.snakeyaml.error.MarkedYAMLException; |
16 |
| -import org.yaml.snakeyaml.events.AliasEvent; |
17 |
| -import org.yaml.snakeyaml.events.Event; |
18 |
| -import org.yaml.snakeyaml.events.MappingStartEvent; |
19 |
| -import org.yaml.snakeyaml.events.NodeEvent; |
20 |
| -import org.yaml.snakeyaml.events.ScalarEvent; |
21 |
| -import org.yaml.snakeyaml.events.SequenceStartEvent; |
22 |
| -import org.yaml.snakeyaml.nodes.NodeId; |
23 |
| -import org.yaml.snakeyaml.parser.Parser; |
24 |
| -import org.yaml.snakeyaml.parser.ParserImpl; |
25 |
| -import org.yaml.snakeyaml.reader.ReaderException; |
26 |
| -import org.yaml.snakeyaml.reader.StreamReader; |
27 |
| -import org.yaml.snakeyaml.resolver.Resolver; |
| 5 | +import com.semmle.extractor.yaml.YamlPopulator; |
28 | 6 |
|
29 | 7 | /**
|
30 | 8 | * Extractor for populating YAML files.
|
31 | 9 | *
|
32 |
| - * <p>The extractor uses <a href="http://www.snakeyaml.org/">SnakeYAML</a> to parse YAML. |
| 10 | + * <p> |
| 11 | + * The extractor uses <a href="http://www.snakeyaml.org/">SnakeYAML</a> to parse |
| 12 | + * YAML. |
33 | 13 | */
|
34 | 14 | public class YAMLExtractor implements IExtractor {
|
35 |
| - /** The tables constituting the YAML dbscheme. */ |
36 |
| - private static enum YAMLTables implements Table { |
37 |
| - YAML(6), // yaml (id: @yaml_node, kind: int ref, parent: @yaml_node_parent ref, |
38 |
| - // idx: int ref, tag: string ref, tostring: string ref) |
39 |
| - YAML_ANCHORS(2), // yaml_anchors (node: @yaml_node ref, anchor: string ref) |
40 |
| - YAML_ALIASES(2), // yaml_aliases (alias: @yaml_alias_node ref, target: string ref) |
41 |
| - YAML_SCALARS( |
42 |
| - 3), // yaml_scalars (scalar: @yaml_scalar_node ref, style: int ref, value: string ref) |
43 |
| - YAML_ERRORS(2); // yaml_errors (id: @yaml_error, message: string ref) |
44 |
| - |
45 |
| - private final int arity; |
46 |
| - |
47 |
| - private YAMLTables(int arity) { |
48 |
| - this.arity = arity; |
49 |
| - } |
50 |
| - |
51 |
| - @Override |
52 |
| - public String getName() { |
53 |
| - return StringUtil.lc(name()); |
54 |
| - } |
55 |
| - |
56 |
| - @Override |
57 |
| - public int getArity() { |
58 |
| - return arity; |
59 |
| - } |
60 |
| - |
61 |
| - @Override |
62 |
| - public boolean validate(Object... values) { |
63 |
| - return true; |
64 |
| - } |
65 |
| - } |
66 |
| - |
67 |
| - /* |
68 |
| - * case @yaml_node.kind of |
69 |
| - * 0 = @yaml_scalar_node |
70 |
| - * | 1 = @yaml_mapping_node |
71 |
| - * | 2 = @yaml_sequence_node |
72 |
| - * | 3 = @yaml_alias_node |
73 |
| - */ |
74 |
| - private static enum NodeKind { |
75 |
| - SCALAR, |
76 |
| - MAPPING, |
77 |
| - SEQUENCE, |
78 |
| - ALIAS |
79 |
| - }; |
80 |
| - |
81 | 15 | private final boolean tolerateParseErrors;
|
82 | 16 |
|
83 |
| - private TextualExtractor textualExtractor; |
84 |
| - private LocationManager locationManager; |
85 |
| - private TrapWriter trapWriter; |
86 |
| - private LineTable lineTable; |
87 |
| - |
88 |
| - /** |
89 |
| - * The underlying SnakeYAML parser; we use the relatively low-level {@linkplain Parser} instead of |
90 |
| - * the more high-level {@linkplain Composer}, since our dbscheme represents YAML documents in AST |
91 |
| - * form, with aliases left unresolved. |
92 |
| - */ |
93 |
| - private Parser parser; |
94 |
| - |
95 |
| - /** The resolver used for resolving type tags. */ |
96 |
| - private Resolver resolver; |
97 |
| - |
98 | 17 | public YAMLExtractor(ExtractorConfig config) {
|
99 | 18 | this.tolerateParseErrors = config.isTolerateParseErrors();
|
100 | 19 | }
|
101 | 20 |
|
102 |
| - private LineTable getLineTable() { |
103 |
| - if (lineTable == null) { |
104 |
| - lineTable = new LineTable(this.textualExtractor.getSource()); |
105 |
| - } |
106 |
| - return lineTable; |
107 |
| - } |
108 |
| - |
109 | 21 | @Override
|
110 | 22 | public ParseResultInfo extract(TextualExtractor textualExtractor) {
|
111 |
| - this.textualExtractor = textualExtractor; |
112 |
| - locationManager = textualExtractor.getLocationManager(); |
113 |
| - trapWriter = textualExtractor.getTrapwriter(); |
114 |
| - |
115 |
| - Label fileLabel = locationManager.getFileLabel(); |
116 |
| - locationManager.setHasLocationTable("yaml_locations"); |
117 |
| - try { |
118 |
| - parser = new ParserImpl(new StreamReader(textualExtractor.getSource())); |
119 |
| - resolver = new Resolver(); |
120 |
| - int idx = 0; |
121 |
| - while (!atStreamEnd()) |
122 |
| - extractDocument(fileLabel, idx++, textualExtractor.getSource().codePoints().toArray()); |
123 |
| - } catch (MarkedYAMLException e) { |
124 |
| - int line = e.getProblemMark().getLine() + 1; |
125 |
| - int column = e.getProblemMark().getColumn() + 1; |
126 |
| - if (!this.tolerateParseErrors) |
127 |
| - throw new UserError(e.getProblem() + ": " + line + ":" + column); |
128 |
| - Label lbl = trapWriter.freshLabel(); |
129 |
| - trapWriter.addTuple(YAMLTables.YAML_ERRORS, lbl, e.getProblem()); |
130 |
| - locationManager.emitSnippetLocation(lbl, line, column, line, column); |
131 |
| - } catch (ReaderException e) { |
132 |
| - if (!this.tolerateParseErrors) throw new UserError(e.toString()); |
133 |
| - int c = e.getCodePoint(); |
134 |
| - String s = String.valueOf(Character.toChars(c)); |
135 |
| - trapWriter.addTuple( |
136 |
| - YAMLTables.YAML_ERRORS, |
137 |
| - trapWriter.freshLabel(), |
138 |
| - "Unexpected character " + s + "(" + c + ")"); |
139 |
| - // unfortunately, SnakeYAML does not provide structured location information for |
140 |
| - // ReaderExceptions |
141 |
| - } |
142 |
| - |
| 23 | + new YamlPopulator(textualExtractor.getExtractedFile(), textualExtractor.getSource(), |
| 24 | + textualExtractor.getTrapwriter(), |
| 25 | + this.tolerateParseErrors).extract(); |
143 | 26 | return new ParseResultInfo(0, 0, Collections.emptyList());
|
144 | 27 | }
|
145 |
| - |
146 |
| - /** Check whether the parser has encountered the end of the YAML input stream. */ |
147 |
| - private boolean atStreamEnd() { |
148 |
| - if (parser.checkEvent(Event.ID.StreamStart)) parser.getEvent(); |
149 |
| - return parser.checkEvent(Event.ID.StreamEnd); |
150 |
| - } |
151 |
| - |
152 |
| - /** Extract a complete YAML document; cf. {@link Composer#getNode}. */ |
153 |
| - private void extractDocument(Label parent, int idx, int[] codepoints) { |
154 |
| - // Drop the DOCUMENT-START event |
155 |
| - parser.getEvent(); |
156 |
| - extractNode(parent, idx, codepoints); |
157 |
| - // Drop the DOCUMENT-END event |
158 |
| - parser.getEvent(); |
159 |
| - } |
160 |
| - |
161 |
| - /** Extract a single YAML node; cf. {@link Composer#composeNode}. */ |
162 |
| - private void extractNode(Label parent, int idx, int[] codepoints) { |
163 |
| - Label label = trapWriter.freshLabel(); |
164 |
| - NodeKind kind; |
165 |
| - String tag = ""; |
166 |
| - Event start = parser.getEvent(), end = start; |
167 |
| - |
168 |
| - if (start.is(Event.ID.Alias)) { |
169 |
| - kind = NodeKind.ALIAS; |
170 |
| - trapWriter.addTuple(YAMLTables.YAML_ALIASES, label, ((AliasEvent) start).getAnchor()); |
171 |
| - } else { |
172 |
| - String anchor = start instanceof NodeEvent ? ((NodeEvent) start).getAnchor() : null; |
173 |
| - if (anchor != null) trapWriter.addTuple(YAMLTables.YAML_ANCHORS, label, anchor); |
174 |
| - |
175 |
| - if (start.is(Event.ID.Scalar)) { |
176 |
| - kind = NodeKind.SCALAR; |
177 |
| - ScalarEvent scalar = (ScalarEvent) start; |
178 |
| - tag = |
179 |
| - getTag( |
180 |
| - scalar.getTag(), |
181 |
| - NodeId.scalar, |
182 |
| - scalar.getValue(), |
183 |
| - scalar.getImplicit().canOmitTagInPlainScalar()); |
184 |
| - Character style = scalar.getStyle(); |
185 |
| - int styleCode = style == null ? 0 : (int) style; |
186 |
| - trapWriter.addTuple(YAMLTables.YAML_SCALARS, label, styleCode, scalar.getValue()); |
187 |
| - } else if (start.is(Event.ID.SequenceStart)) { |
188 |
| - kind = NodeKind.SEQUENCE; |
189 |
| - SequenceStartEvent sequenceStart = (SequenceStartEvent) start; |
190 |
| - tag = getTag(sequenceStart.getTag(), NodeId.sequence, null, sequenceStart.getImplicit()); |
191 |
| - |
192 |
| - int childIdx = 0; |
193 |
| - while (!parser.checkEvent(Event.ID.SequenceEnd)) extractNode(label, childIdx++, codepoints); |
194 |
| - |
195 |
| - end = parser.getEvent(); |
196 |
| - } else if (start.is(Event.ID.MappingStart)) { |
197 |
| - kind = NodeKind.MAPPING; |
198 |
| - MappingStartEvent mappingStart = (MappingStartEvent) start; |
199 |
| - tag = getTag(mappingStart.getTag(), NodeId.mapping, null, mappingStart.getImplicit()); |
200 |
| - |
201 |
| - int childIdx = 1; |
202 |
| - while (!parser.checkEvent(Event.ID.MappingEnd)) { |
203 |
| - extractNode(label, childIdx, codepoints); |
204 |
| - extractNode(label, -childIdx, codepoints); |
205 |
| - ++childIdx; |
206 |
| - } |
207 |
| - |
208 |
| - end = parser.getEvent(); |
209 |
| - } else { |
210 |
| - throw new CatastrophicError("Unexpected YAML parser event: " + start); |
211 |
| - } |
212 |
| - } |
213 |
| - |
214 |
| - trapWriter.addTuple( |
215 |
| - YAMLTables.YAML, |
216 |
| - label, |
217 |
| - kind.ordinal(), |
218 |
| - parent, |
219 |
| - idx, |
220 |
| - tag, |
221 |
| - mkToString(start.getStartMark(), end.getEndMark(), codepoints)); |
222 |
| - extractLocation(label, start.getStartMark(), end.getEndMark()); |
223 |
| - } |
224 |
| - |
225 |
| - /** Determine the type tag of a node. */ |
226 |
| - private String getTag(String explicitTag, NodeId kind, String value, boolean implicit) { |
227 |
| - if (explicitTag == null || "!".equals(explicitTag)) |
228 |
| - return resolver.resolve(kind, value, implicit).getValue(); |
229 |
| - return explicitTag; |
230 |
| - } |
231 |
| - |
232 |
| - private static boolean isNewLine(int codePoint) { |
233 |
| - switch (codePoint) { |
234 |
| - case '\n': |
235 |
| - case '\r': |
236 |
| - case '\u0085': |
237 |
| - case '\u2028': |
238 |
| - case '\u2029': |
239 |
| - return true; |
240 |
| - default: |
241 |
| - return false; |
242 |
| - } |
243 |
| - } |
244 |
| - |
245 |
| - /** |
246 |
| - * SnakeYAML doesn't directly expose the source text of nodes, but we also take the file contents |
247 |
| - * as an array of Unicode code points. The start and end marks each contain an index into the code |
248 |
| - * point stream (the end is exclusive), so we can reconstruct the snippet. For readability, we |
249 |
| - * stop at the first encountered newline. |
250 |
| - */ |
251 |
| - private static String mkToString(Mark startMark, Mark endMark, int[] codepoints) { |
252 |
| - StringBuilder b = new StringBuilder(); |
253 |
| - for (int i = startMark.getIndex(); i < endMark.getIndex() && !isNewLine(codepoints[i]); i++) |
254 |
| - b.appendCodePoint(codepoints[i]); |
255 |
| - return TextualExtractor.sanitiseToString(b.toString()); |
256 |
| - } |
257 |
| - |
258 |
| - /** Emit a source location for a YAML node. */ |
259 |
| - private void extractLocation(Label label, Mark startMark, Mark endMark) { |
260 |
| - int startLine, startColumn, endLine, endColumn; |
261 |
| - |
262 |
| - // SnakeYAML uses 0-based indexing for both lines and columns, so need to +1 |
263 |
| - startLine = startMark.getLine() + 1; |
264 |
| - startColumn = startMark.getColumn() + 1; |
265 |
| - |
266 |
| - // SnakeYAML's end positions are exclusive, so only need to +1 for the line |
267 |
| - endLine = endMark.getLine() + 1; |
268 |
| - endColumn = endMark.getColumn(); |
269 |
| - |
270 |
| - // Avoid emitting column zero for non-empty locations |
271 |
| - if (endColumn == 0 && !(startLine == endLine && startColumn == endColumn)) { |
272 |
| - String source = textualExtractor.getSource(); |
273 |
| - int offset = getLineTable().getOffsetFromPoint(endMark.getLine(), endMark.getColumn()) - 1; |
274 |
| - while (offset > 0 && isNewLine((int)source.charAt(offset))) { |
275 |
| - --offset; |
276 |
| - } |
277 |
| - com.semmle.util.locations.Position adjustedEndPos = getLineTable().getEndPositionFromOffset(offset); |
278 |
| - endLine = adjustedEndPos.getLine(); |
279 |
| - endColumn = adjustedEndPos.getColumn(); |
280 |
| - } |
281 |
| - |
282 |
| - locationManager.emitSnippetLocation(label, startLine, startColumn, endLine, endColumn); |
283 |
| - } |
284 | 28 | }
|
0 commit comments