Skip to content

Commit 0017d00

Browse files
Read gzipped graphml files (#1315)
* Use flate2 crate to read gzipped graphml files * fix typo * run rustfmt * apply suggestion from clippy * add test for gzipped graphml * write separate function * add changelog * reformat * Revert "write separate function" This reverts commit 2dba252. * run with compression argument * update contribution * lint python * add stub * try avoid error in test in Windows * use Option for compression variable * correct text signature --------- Co-authored-by: Ivan Carvalho <[email protected]>
1 parent ff611f2 commit 0017d00

File tree

6 files changed

+139
-10
lines changed

6 files changed

+139
-10
lines changed

Cargo.lock

Lines changed: 35 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ serde = { version = "1.0", features = ["derive"] }
6060
serde_json = "1.0"
6161
smallvec = { version = "1.0", features = ["union"] }
6262
rustworkx-core = { path = "rustworkx-core", version = "=0.16.0" }
63+
flate2 = "1.0.35"
6364

6465
[dependencies.pyo3]
6566
version = "0.22.6"
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
features:
3+
- |
4+
Added the ability to read gzip-compressed GraphML files with the function :func:`~rustworkx.read_graphml`.
5+
The extensions ``.graphmlz`` and ``.gz`` are recognised automatically; for files with any other name, gzip decompression can be forced by passing the optional argument ``compression="gzip"``.

rustworkx/rustworkx.pyi

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,11 @@ def directed_random_bipartite_graph(
646646

647647
# Read Write
648648

649-
def read_graphml(path: str, /) -> list[PyGraph | PyDiGraph]: ...
649+
def read_graphml(
650+
path: str,
651+
/,
652+
compression: str | None = ...,
653+
) -> list[PyGraph | PyDiGraph]: ...
650654
def digraph_node_link_json(
651655
graph: PyDiGraph[_S, _T],
652656
/,

src/graphml.rs

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,15 @@
1313
#![allow(clippy::borrow_as_ptr)]
1414

1515
use std::convert::From;
16+
use std::ffi::OsStr;
17+
use std::fs::File;
18+
use std::io::{BufRead, BufReader};
1619
use std::iter::FromIterator;
1720
use std::num::{ParseFloatError, ParseIntError};
1821
use std::path::Path;
1922
use std::str::ParseBoolError;
2023

24+
use flate2::bufread::GzDecoder;
2125
use hashbrown::HashMap;
2226
use indexmap::IndexMap;
2327

@@ -524,19 +528,27 @@ impl GraphML {
524528

525529
Ok(())
526530
}
531+
/// Open file compressed with gzip, using the GzDecoder
532+
/// Returns a quick_xml Reader instance
533+
fn open_file_gzip<P: AsRef<Path>>(
534+
path: P,
535+
) -> Result<Reader<BufReader<GzDecoder<BufReader<File>>>>, quick_xml::Error> {
536+
let file = File::open(path)?;
537+
let reader = BufReader::new(file);
538+
let gzip_reader = BufReader::new(GzDecoder::new(reader));
539+
Ok(Reader::from_reader(gzip_reader))
540+
}
527541

528-
/// Parse a file written in GraphML format.
542+
/// Parse a file written in GraphML format from a BufReader
529543
///
530544
/// The implementation is based on a state machine in order to
531545
/// accept only valid GraphML syntax (e.g a `<data>` element should
532546
/// be nested inside a `<node>` element) where the internal state changes
533547
/// after handling each quick_xml event.
534-
fn from_file<P: AsRef<Path>>(path: P) -> Result<GraphML, Error> {
548+
fn read_graph_from_reader<R: BufRead>(mut reader: Reader<R>) -> Result<GraphML, Error> {
535549
let mut graphml = GraphML::default();
536550

537551
let mut buf = Vec::new();
538-
let mut reader = Reader::from_file(path)?;
539-
540552
let mut state = State::Start;
541553
let mut domain_of_last_key = Domain::Node;
542554
let mut last_data_key = String::new();
@@ -677,6 +689,23 @@ impl GraphML {
677689

678690
Ok(graphml)
679691
}
692+
693+
/// Read a graph from a file in the GraphML format
694+
/// If the the file extension is "graphmlz" or "gz", decompress it on the fly
695+
fn from_file<P: AsRef<Path>>(path: P, compression: &str) -> Result<GraphML, Error> {
696+
let extension = path.as_ref().extension().unwrap_or(OsStr::new(""));
697+
698+
let graph: Result<GraphML, Error> =
699+
if extension.eq("graphmlz") || extension.eq("gz") || compression.eq("gzip") {
700+
let reader = Self::open_file_gzip(path)?;
701+
Self::read_graph_from_reader(reader)
702+
} else {
703+
let reader = Reader::from_file(path)?;
704+
Self::read_graph_from_reader(reader)
705+
};
706+
707+
graph
708+
}
680709
}
681710

682711
/// Read a list of graphs from a file in GraphML format.
@@ -703,9 +732,13 @@ impl GraphML {
703732
/// :rtype: list[Union[PyGraph, PyDiGraph]]
704733
/// :raises RuntimeError: when an error is encountered while parsing the GraphML file.
705734
#[pyfunction]
706-
#[pyo3(text_signature = "(path, /)")]
707-
pub fn read_graphml(py: Python, path: &str) -> PyResult<Vec<PyObject>> {
708-
let graphml = GraphML::from_file(path)?;
735+
#[pyo3(signature=(path, compression=None),text_signature = "(path, /, compression=None)")]
736+
pub fn read_graphml(
737+
py: Python,
738+
path: &str,
739+
compression: Option<String>,
740+
) -> PyResult<Vec<PyObject>> {
741+
let graphml = GraphML::from_file(path, &compression.unwrap_or_default())?;
709742

710743
let mut out = Vec::new();
711744
for graph in graphml.graphs {

tests/test_graphml.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
import unittest
1414
import tempfile
15+
import gzip
16+
1517
import numpy
1618

1719
import rustworkx
@@ -55,8 +57,8 @@ def assertGraphMLRaises(self, graph_xml):
5557
with self.assertRaises(Exception):
5658
rustworkx.read_graphml(fd.name)
5759

58-
def test_simple(self):
59-
graph_xml = self.HEADER.format(
60+
def graphml_xml_example(self):
61+
return self.HEADER.format(
6062
"""
6163
<key id="d0" for="node" attr.name="color" attr.type="string">
6264
<default>yellow</default>
@@ -80,6 +82,8 @@ def test_simple(self):
8082
"""
8183
)
8284

85+
def test_simple(self):
86+
graph_xml = self.graphml_xml_example()
8387
with tempfile.NamedTemporaryFile("wt") as fd:
8488
fd.write(graph_xml)
8589
fd.flush()
@@ -96,6 +100,53 @@ def test_simple(self):
96100
]
97101
self.assertGraphEqual(graph, nodes, edges, directed=False)
98102

103+
def test_gzipped(self):
    """A gzip-compressed GraphML file named ``*.gz`` is read without extra arguments."""
    import os

    graph_xml = self.graphml_xml_example()

    # Write the compressed file inside a temporary directory so that it is
    # removed together with the directory when the block exits. Deriving a
    # sibling name from a NamedTemporaryFile leaves that sibling file behind.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "graph.gz")
        with gzip.open(path, "wt") as wf:
            wf.write(graph_xml)

        graphml = rustworkx.read_graphml(path)

    graph = graphml[0]
    nodes = [
        {"id": "n0", "color": "blue"},
        {"id": "n1", "color": "yellow"},
        {"id": "n2", "color": "green"},
    ]
    edges = [
        ("n0", "n1", {"fidelity": 0.98}),
        ("n0", "n2", {"fidelity": 0.95}),
    ]
    self.assertGraphEqual(graph, nodes, edges, directed=False)
125+
126+
def test_gzipped_force(self):
    """``compression="gzip"`` forces decompression of a file with an unrelated extension."""
    import os

    graph_xml = self.graphml_xml_example()

    # The ``.ext`` suffix is not one of the recognised gzip extensions, so the
    # file can only be read correctly through the explicit ``compression``
    # argument. The temporary directory removes the file on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "graph.ext")
        with gzip.open(path, "wt") as wf:
            wf.write(graph_xml)

        graphml = rustworkx.read_graphml(path, compression="gzip")

    graph = graphml[0]
    nodes = [
        {"id": "n0", "color": "blue"},
        {"id": "n1", "color": "yellow"},
        {"id": "n2", "color": "green"},
    ]
    edges = [
        ("n0", "n1", {"fidelity": 0.98}),
        ("n0", "n2", {"fidelity": 0.95}),
    ]
    self.assertGraphEqual(graph, nodes, edges, directed=False)
149+
99150
def test_multiple_graphs_in_single_file(self):
100151
graph_xml = self.HEADER.format(
101152
"""

0 commit comments

Comments
 (0)