Skip to content

Commit b2c0c7e

Browse files
authored
Merge pull request #94 from TileDB-Inc/threadstream
Pipeline iterator - parallelized version
2 parents ccb64ec + ce36eef commit b2c0c7e

File tree

11 files changed

+752
-111
lines changed

11 files changed

+752
-111
lines changed

README.rst

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,31 @@ PDAL and Python:
187187
).pipeline(clamped)
188188
print(pipeline.execute()) # 387 points
189189
190+
Executing Streamable Pipelines
191+
................................................................................
192+
Streamable pipelines (pipelines that consist exclusively of streamable PDAL
193+
stages) can be executed in streaming mode via ``Pipeline.iterator()``. This
194+
returns an iterator object that yields Numpy arrays of up to ``chunk_size`` size
195+
(default=10000) at a time.
196+
197+
.. code-block:: python
198+
199+
import pdal
200+
pipeline = pdal.Reader("test/data/autzen-utm.las") | pdal.Filter.range(limits="Intensity[80:120)")
201+
for array in pipeline.iterator(chunk_size=500):
202+
print(len(array))
203+
# or to concatenate all arrays into one
204+
# full_array = np.concatenate(list(pipeline))
205+
206+
``Pipeline.iterator()`` also takes an optional ``prefetch`` parameter (default=0)
207+
to allow prefetching up to this number of arrays in parallel and buffering
208+
them until they are yielded to the caller.
209+
210+
If you just want to execute a streamable pipeline in streaming mode and don't
211+
need to access the data points (typically when the pipeline has Writer stage(s)),
212+
you can use the ``Pipeline.execute_streaming(chunk_size)`` method instead. This
213+
is functionally equivalent to ``sum(map(len, pipeline.iterator(chunk_size)))``
214+
but more efficient as it avoids allocating and filling any arrays in memory.
190215

191216
Accessing Mesh Data
192217
................................................................................
@@ -236,17 +261,14 @@ USE-CASE : Take a LiDAR map, create a mesh from the ground points, split into ti
236261
.. code-block:: python
237262
238263
import pdal
239-
import json
240264
import psycopg2
241265
import io
242266
243-
pipe = [
244-
'.../python/test/data/1.2-with-color.las',
245-
{"type": "filters.splitter", "length": 1000},
246-
{"type": "filters.delaunay"}
247-
]
248-
249-
pl = pdal.Pipeline(json.dumps(pipe))
267+
pl = (
268+
pdal.Reader(".../python/test/data/1.2-with-color.las")
269+
| pdal.Filter.splitter(length=1000)
270+
| pdal.Filter.delaunay()
271+
)
250272
pl.execute()
251273
252274
conn = psycopg2.connect(%CONNECTION_STRING%)

pdal/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
set(EXTENSION_SRC PyArray.cpp PyPipeline.cpp)
1+
set(EXTENSION_SRC PyArray.cpp PyPipeline.cpp StreamableExecutor.cpp)
22

33
set(extension "libpdalpython")
44
add_cython_target(${extension} "libpdalpython.pyx" CXX PY3)

pdal/PyPipeline.cpp

Lines changed: 119 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -39,25 +39,100 @@
3939
#include <dlfcn.h>
4040
#endif
4141

42-
#include <Python.h>
43-
44-
#include <pdal/Stage.hpp>
45-
#include <pdal/pdal_features.hpp>
46-
4742
namespace pdal
4843
{
4944
namespace python
5045
{
5146

5247

53-
void readPipeline(PipelineExecutor* executor, std::string json)
48+
void CountPointTable::reset()
49+
{
50+
for (PointId idx = 0; idx < numPoints(); idx++)
51+
if (!skip(idx))
52+
m_count++;
53+
FixedPointTable::reset();
54+
}
55+
56+
57+
PipelineExecutor::PipelineExecutor(
58+
std::string const& json, std::vector<std::shared_ptr<Array>> arrays, int level)
59+
{
60+
if (level < 0 || level > 8)
61+
throw pdal_error("log level must be between 0 and 8!");
62+
63+
LogPtr log(Log::makeLog("pypipeline", &m_logStream));
64+
log->setLevel(static_cast<pdal::LogLevel>(level));
65+
m_manager.setLog(log);
66+
67+
std::stringstream strm;
68+
strm << json;
69+
m_manager.readPipeline(strm);
70+
71+
addArrayReaders(arrays);
72+
}
73+
74+
75+
point_count_t PipelineExecutor::execute()
76+
{
77+
point_count_t count = m_manager.execute();
78+
m_executed = true;
79+
return count;
80+
}
81+
82+
83+
point_count_t PipelineExecutor::executeStream(point_count_t streamLimit)
84+
{
85+
CountPointTable table(streamLimit);
86+
m_manager.executeStream(table);
87+
m_executed = true;
88+
return table.count();
89+
}
90+
91+
const PointViewSet& PipelineExecutor::views() const
92+
{
93+
if (!m_executed)
94+
throw pdal_error("Pipeline has not been executed!");
95+
96+
return m_manager.views();
97+
}
98+
99+
100+
std::string PipelineExecutor::getPipeline() const
54101
{
55-
std::stringstream strm(json);
56-
executor->getManager().readPipeline(strm);
102+
if (!m_executed)
103+
throw pdal_error("Pipeline has not been executed!");
104+
105+
std::stringstream strm;
106+
pdal::PipelineWriter::writePipeline(m_manager.getStage(), strm);
107+
return strm.str();
57108
}
58109

59110

60-
void addArrayReaders(PipelineExecutor* executor, std::vector<std::shared_ptr<Array>> arrays)
111+
std::string PipelineExecutor::getMetadata() const
112+
{
113+
if (!m_executed)
114+
throw pdal_error("Pipeline has not been executed!");
115+
116+
std::stringstream strm;
117+
MetadataNode root = m_manager.getMetadata().clone("metadata");
118+
pdal::Utils::toJSON(root, strm);
119+
return strm.str();
120+
}
121+
122+
123+
std::string PipelineExecutor::getSchema() const
124+
{
125+
if (!m_executed)
126+
throw pdal_error("Pipeline has not been executed!");
127+
128+
std::stringstream strm;
129+
MetadataNode root = pointTable().layout()->toMetadata().clone("schema");
130+
pdal::Utils::toJSON(root, strm);
131+
return strm.str();
132+
}
133+
134+
135+
void PipelineExecutor::addArrayReaders(std::vector<std::shared_ptr<Array>> arrays)
61136
{
62137
// Make the symbols in pdal_base global so that they're accessible
63138
// to PDAL plugins. Python dlopen's this extension with RTLD_LOCAL,
@@ -72,8 +147,7 @@ void addArrayReaders(PipelineExecutor* executor, std::vector<std::shared_ptr<Arr
72147
if (arrays.empty())
73148
return;
74149

75-
PipelineManager& manager = executor->getManager();
76-
std::vector<Stage *> roots = manager.roots();
150+
std::vector<Stage *> roots = m_manager.roots();
77151
if (roots.size() != 1)
78152
throw pdal_error("Filter pipeline must contain a single root stage.");
79153

@@ -88,7 +162,7 @@ void addArrayReaders(PipelineExecutor* executor, std::vector<std::shared_ptr<Arr
88162
MemoryViewReader::Order::ColumnMajor);
89163
options.add("shape", MemoryViewReader::Shape(array->shape()));
90164

91-
Stage& s = manager.makeReader("", "readers.memoryview", options);
165+
Stage& s = m_manager.makeReader("", "readers.memoryview", options);
92166
MemoryViewReader& r = dynamic_cast<MemoryViewReader &>(s);
93167
for (auto f : array->fields())
94168
r.pushField(f);
@@ -108,11 +182,11 @@ void addArrayReaders(PipelineExecutor* executor, std::vector<std::shared_ptr<Arr
108182
roots[0]->setInput(r);
109183
}
110184

111-
manager.validateStageOptions();
185+
m_manager.validateStageOptions();
112186
}
113187

114188

115-
inline PyObject* buildNumpyDescription(PointViewPtr view)
189+
PyObject* buildNumpyDescriptor(PointLayoutPtr layout)
116190
{
117191
// Build up a numpy dtype dictionary
118192
//
@@ -123,31 +197,42 @@ inline PyObject* buildNumpyDescription(PointViewPtr view)
123197
// 'Classification', 'ScanAngleRank', 'UserData',
124198
// 'PointSourceId', 'GpsTime', 'Red', 'Green', 'Blue']}
125199
//
126-
Dimension::IdList dims = view->dims();
200+
201+
// Ensure that the dimensions are sorted by offset
202+
// Is there a better way? Can they be sorted by offset already?
203+
auto sortByOffset = [layout](Dimension::Id id1, Dimension::Id id2) -> bool
204+
{
205+
return layout->dimOffset(id1) < layout->dimOffset(id2);
206+
};
207+
auto dims = layout->dims();
208+
std::sort(dims.begin(), dims.end(), sortByOffset);
209+
127210
PyObject* names = PyList_New(dims.size());
128211
PyObject* formats = PyList_New(dims.size());
129212
for (size_t i = 0; i < dims.size(); ++i)
130213
{
131214
Dimension::Id id = dims[i];
132-
std::string name = view->dimName(id);
133-
npy_intp stride = view->dimSize(id);
134-
135-
std::string kind;
136-
Dimension::BaseType b = Dimension::base(view->dimType(id));
137-
if (b == Dimension::BaseType::Unsigned)
138-
kind = "u";
139-
else if (b == Dimension::BaseType::Signed)
140-
kind = "i";
141-
else if (b == Dimension::BaseType::Floating)
142-
kind = "f";
143-
else
144-
throw pdal_error("Unable to map kind '" + kind +
145-
"' to PDAL dimension type");
146-
147-
std::stringstream oss;
148-
oss << kind << stride;
215+
auto name = layout->dimName(id);
149216
PyList_SetItem(names, i, PyUnicode_FromString(name.c_str()));
150-
PyList_SetItem(formats, i, PyUnicode_FromString(oss.str().c_str()));
217+
218+
std::stringstream format;
219+
switch (Dimension::base(layout->dimType(id)))
220+
{
221+
case Dimension::BaseType::Unsigned:
222+
format << 'u';
223+
break;
224+
case Dimension::BaseType::Signed:
225+
format << 'i';
226+
break;
227+
case Dimension::BaseType::Floating:
228+
format << 'f';
229+
break;
230+
default:
231+
throw pdal_error("Unable to map dimension '" + name + "' to Numpy");
232+
}
233+
format << layout->dimSize(id);
234+
PyList_SetItem(formats, i, PyUnicode_FromString(format.str().c_str()));
235+
151236
}
152237
PyObject* dtype_dict = PyDict_New();
153238
PyDict_SetItemString(dtype_dict, "names", names);
@@ -161,7 +246,7 @@ PyArrayObject* viewToNumpyArray(PointViewPtr view)
161246
if (_import_array() < 0)
162247
throw pdal_error("Could not import numpy.core.multiarray.");
163248

164-
PyObject* dtype_dict = buildNumpyDescription(view);
249+
PyObject* dtype_dict = buildNumpyDescriptor(view->layout());
165250
PyArray_Descr *dtype = nullptr;
166251
if (PyArray_DescrConverter(dtype_dict, &dtype) == NPY_FAIL)
167252
throw pdal_error("Unable to build numpy dtype");

pdal/PyPipeline.hpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,21 +34,58 @@
3434

3535
#pragma once
3636

37-
#include <pdal/PipelineExecutor.hpp>
38-
37+
#include <pdal/PipelineManager.hpp>
3938
#include <numpy/arrayobject.h>
4039

4140
namespace pdal
4241
{
4342
namespace python
4443
{
4544

46-
class Array;
47-
48-
void readPipeline(PipelineExecutor* executor, std::string json);
49-
void addArrayReaders(PipelineExecutor* executor, std::vector<std::shared_ptr<Array>> arrays);
45+
PyObject* buildNumpyDescriptor(PointLayoutPtr layout);
5046
PyArrayObject* viewToNumpyArray(PointViewPtr view);
5147
PyArrayObject* meshToNumpyArray(const TriangularMesh* mesh);
5248

49+
class Array;
50+
51+
class PDAL_DLL PipelineExecutor {
52+
public:
53+
PipelineExecutor(std::string const& json, std::vector<std::shared_ptr<Array>> arrays, int level);
54+
virtual ~PipelineExecutor() = default;
55+
56+
point_count_t execute();
57+
point_count_t executeStream(point_count_t streamLimit);
58+
59+
const PointViewSet& views() const;
60+
std::string getPipeline() const;
61+
std::string getMetadata() const;
62+
std::string getSchema() const;
63+
std::string getLog() const { return m_logStream.str(); }
64+
65+
protected:
66+
virtual ConstPointTableRef pointTable() const { return m_manager.pointTable(); }
67+
68+
pdal::PipelineManager m_manager;
69+
bool m_executed = false;
70+
71+
private:
72+
void addArrayReaders(std::vector<std::shared_ptr<Array>> arrays);
73+
74+
std::stringstream m_logStream;
75+
};
76+
77+
class CountPointTable : public FixedPointTable
78+
{
79+
public:
80+
CountPointTable(point_count_t capacity) : FixedPointTable(capacity), m_count(0) {}
81+
point_count_t count() const { return m_count; }
82+
83+
protected:
84+
virtual void reset();
85+
86+
private:
87+
point_count_t m_count;
88+
};
89+
5390
} // namespace python
5491
} // namespace pdal

0 commit comments

Comments
 (0)