Skip to content

Commit be512ba

Browse files
committed
Add regex filter option; improve GML handling
1 parent 24d87db commit be512ba

File tree

2 files changed

+83
-30
lines changed

2 files changed

+83
-30
lines changed

Cargo.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@ edition = "2018"
66
description = "Fast converter from XML to PostgreSQL dump format"
77
publish = false
88

9-
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
10-
119
[dependencies]
12-
quick-xml = "0.20"
10+
quick-xml = "0.22"
1311
yaml-rust = "0.4"
12+
regex = "1"

src/main.rs

Lines changed: 81 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
extern crate quick_xml;
2-
extern crate yaml_rust;
3-
41
use std::io::{Read, Write, stdout};
52
use std::fs::{File, OpenOptions};
63
use std::path::Path;
@@ -11,6 +8,7 @@ use quick_xml::Reader;
118
use quick_xml::events::Event;
129
use yaml_rust::YamlLoader;
1310
use yaml_rust::yaml::Yaml;
11+
use regex::Regex;
1412

1513
struct Table<'a> {
1614
path: String,
@@ -37,13 +35,19 @@ impl<'a> Table<'a> {
3735
fn write(&self, text: &str) {
3836
self.file.borrow_mut().write_all(&text.as_bytes()).expect("Write error encountered; exiting...");
3937
}
38+
fn clear_columns(&self) {
39+
for col in &self.columns {
40+
col.value.borrow_mut().clear();
41+
}
42+
}
4043
}
4144

4245
struct Column<'a> {
4346
name: String,
4447
path: String,
4548
value: RefCell<String>,
4649
attr: Option<&'a str>,
50+
filter: Option<Regex>,
4751
convert: Option<&'a str>,
4852
find: Option<&'a str>,
4953
replace: Option<&'a str>,
@@ -117,12 +121,32 @@ fn add_table<'a>(rowpath: &str, outfile: Option<&str>, filemode: &str, colspec:
117121
Some(add_table(&path, Some(&file), filemode, col["cols"].as_vec().expect("Subtable 'cols' entry is not an array")))
118122
}
119123
};
124+
let filter: Option<Regex> = match col["filt"].as_str() {
125+
Some(str) => Some(Regex::new(&str).expect("Invalid regex in 'filt' entry in configuration file")),
126+
None => None
127+
};
120128
let attr = col["attr"].as_str();
121129
let convert = col["conv"].as_str();
122130
let find = col["find"].as_str();
123131
let replace = col["repl"].as_str();
124132
let consol = col["cons"].as_str();
125-
let column = Column { name: name.to_string(), path, value: RefCell::new(String::new()), attr, convert, find, replace, consol, subtable };
133+
134+
if convert.is_some() && !vec!("xml-to-text", "gml-to-ewkb").contains(&convert.unwrap()) {
135+
panic!("Option 'convert' contains invalid value {}", convert.unwrap());
136+
}
137+
if filter.is_some() {
138+
if convert.is_some() {
139+
panic!("Option 'filt' and 'conv' cannot be used together on a single column");
140+
}
141+
if find.is_some() {
142+
eprintln!("Notice: when using a filter and find/replace on a single column, the filter is applied before replacements");
143+
}
144+
if consol.is_some() {
145+
eprintln!("Notice: when using a filter and consolidation on a single column, the filter is applied to each phase of consolidation separately");
146+
}
147+
}
148+
149+
let column = Column { name: name.to_string(), path, value: RefCell::new(String::new()), attr, filter, convert, find, replace, consol, subtable };
126150
table.columns.push(column);
127151
}
128152
table
@@ -148,7 +172,8 @@ fn main() -> std::io::Result<()> {
148172

149173
let mut path = String::new();
150174
let mut buf = Vec::new();
151-
let mut count = 0;
175+
let mut fullcount = 0;
176+
let mut filtercount = 0;
152177

153178
let rowpath = config["path"].as_str().expect("No valid 'path' entry in configuration file");
154179
let colspec = config["cols"].as_vec().expect("No valid 'cols' array in configuration file");
@@ -161,17 +186,20 @@ fn main() -> std::io::Result<()> {
161186
let mut tables: Vec<&Table> = Vec::new();
162187
let mut table = &maintable;
163188

189+
let mut filtered = false;
164190
let mut xmltotext = false;
165191
let mut text = String::new();
166192
let mut gmltoewkb = false;
167193
let mut gmlpos = false;
194+
let mut gmlint = false;
168195
let mut gmlgeom = Geometry::new();
169196
let start = Instant::now();
170197
loop {
171198
match reader.read_event(&mut buf) {
172199
Ok(Event::Start(ref e)) => {
173200
path.push('/');
174201
path.push_str(reader.decode(e.name()).unwrap());
202+
if filtered { continue; }
175203
if xmltotext {
176204
text.push_str(&format!("<{}>", &e.unescape_and_decode(&reader).unwrap()));
177205
continue;
@@ -211,8 +239,13 @@ fn main() -> std::io::Result<()> {
211239
"gml:Point" => gmlgeom.gtype = 1,
212240
"gml:LineString" => gmlgeom.gtype = 2,
213241
"gml:Polygon" => gmlgeom.gtype = 3,
242+
"gml:MultiPolygon" => (),
243+
"gml:polygonMember" => (),
214244
"gml:exterior" => (),
215-
"gml:interior" => (),
245+
"gml:interior" => {
246+
eprintln!("GML polygon interior ring not yet supported; ignored");
247+
gmlint = true;
248+
},
216249
"gml:LinearRing" => gmlgeom.rings.push(Vec::new()),
217250
"gml:posList" => gmlpos = true,
218251
_ => eprintln!("GML type {} not supported", tag)
@@ -221,7 +254,7 @@ fn main() -> std::io::Result<()> {
221254
continue;
222255
}
223256
else if path == table.path {
224-
count += 1;
257+
fullcount += 1;
225258
}
226259
else if path.len() > table.path.len() {
227260
for i in 0..table.columns.len() {
@@ -254,6 +287,12 @@ fn main() -> std::io::Result<()> {
254287
if table.columns[i].value.borrow().is_empty() {
255288
eprintln!("Column {} requested attribute {} not found", table.columns[i].name, request);
256289
}
290+
if let Some(re) = &table.columns[i].filter {
291+
if !re.is_match(&table.columns[i].value.borrow()) {
292+
filtered = true;
293+
table.clear_columns();
294+
}
295+
}
257296
}
258297

259298
// Set the appropriate convert flag for the following data in case the 'conv' option is present
@@ -269,12 +308,13 @@ fn main() -> std::io::Result<()> {
269308
}
270309
},
271310
Ok(Event::Text(ref e)) => {
311+
if filtered { continue; }
272312
if xmltotext {
273313
text.push_str(&e.unescape_and_decode(&reader).unwrap());
274314
continue;
275315
}
276316
else if gmltoewkb {
277-
if gmlpos {
317+
if gmlpos && !gmlint {
278318
let value = String::from(&e.unescape_and_decode(&reader).unwrap());
279319
for pos in value.split(' ') {
280320
gmlgeom.rings.last_mut().unwrap().push(pos.parse().unwrap());
@@ -301,35 +341,47 @@ fn main() -> std::io::Result<()> {
301341
}
302342
}
303343
table.columns[i].value.borrow_mut().push_str(&e.unescape_and_decode(&reader).unwrap().replace("\\", "\\\\"));
344+
if let Some(re) = &table.columns[i].filter {
345+
if !re.is_match(&table.columns[i].value.borrow()) {
346+
filtered = true;
347+
table.clear_columns();
348+
}
349+
}
304350
break;
305351
}
306352
}
307353
},
308354
Ok(Event::End(_)) => {
309355
if path == table.path {
310-
311-
// End tag of a subtable; write the first column value of the parent table as the first column of the subtable
312-
if !tables.is_empty() {
313-
table.write(&tables.last().unwrap().columns[0].value.borrow());
314-
table.write("\t");
356+
if filtered {
357+
filtered = false;
358+
filtercount += 1;
315359
}
360+
else {
316361

317-
// Now write out the other column values
318-
for i in 0..table.columns.len() {
319-
if table.columns[i].subtable.is_some() { continue; }
320-
if i > 0 { table.write("\t"); }
321-
if table.columns[i].value.borrow().is_empty() { table.write("\\N"); }
322-
else {
323-
if let (Some(s), Some(r)) = (table.columns[i].find, table.columns[i].replace) {
324-
let mut value = table.columns[i].value.borrow_mut();
325-
*value = value.replace(s, r);
362+
// End tag of a subtable; write the first column value of the parent table as the first column of the subtable
363+
if !tables.is_empty() {
364+
table.write(&tables.last().unwrap().columns[0].value.borrow());
365+
table.write("\t");
366+
}
367+
368+
// Now write out the other column values
369+
for i in 0..table.columns.len() {
370+
if table.columns[i].subtable.is_some() { continue; }
371+
if i > 0 { table.write("\t"); }
372+
if table.columns[i].value.borrow().is_empty() { table.write("\\N"); }
373+
else {
374+
if let (Some(s), Some(r)) = (table.columns[i].find, table.columns[i].replace) {
375+
let mut value = table.columns[i].value.borrow_mut();
376+
*value = value.replace(s, r);
377+
}
378+
table.write(&table.columns[i].value.borrow());
379+
table.columns[i].value.borrow_mut().clear();
326380
}
327-
table.write(&table.columns[i].value.borrow());
328-
table.columns[i].value.borrow_mut().clear();
329381
}
382+
table.write("\n");
383+
if !tables.is_empty() { table = tables.pop().unwrap(); }
330384
}
331-
table.write("\n");
332-
if !tables.is_empty() { table = tables.pop().unwrap(); }
333385
}
334386
let i = path.rfind('/').unwrap();
335387
let tag = path.split_off(i);
@@ -348,6 +400,8 @@ fn main() -> std::io::Result<()> {
348400
}
349401
}
350402
else if gmltoewkb {
403+
if gmlpos && (tag == "gml:posList") { gmlpos = false; }
404+
if gmlint && (tag == "gml:interior") { gmlint = false; }
351405
for i in 0..table.columns.len() {
352406
if path == table.columns[i].path {
353407
gmltoewkb = false;
@@ -364,6 +418,6 @@ fn main() -> std::io::Result<()> {
364418
}
365419
buf.clear();
366420
}
367-
eprintln!("{} rows processed in {} seconds", count, start.elapsed().as_secs());
421+
eprintln!("{} rows processed in {} seconds{}", fullcount-filtercount, start.elapsed().as_secs(), match filtercount { 0 => "".to_owned(), n => format!(" ({} filtered)", n) });
368422
Ok(())
369423
}

0 commit comments

Comments
 (0)