diff --git a/.gitignore b/.gitignore index a9748b3e..ff06707b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,13 +7,23 @@ raja-stream kokkos-stream sycl-stream hip-stream +poplar-stream *.o *.bc *.sycl +*.gc *.tar *.gz .DS_Store Makefile + +vars.capnp +graph.json +execution.json +*.csv +arhive.a +*.gp +popops-stream diff --git a/Poplar.make b/Poplar.make new file mode 100644 index 00000000..760006ba --- /dev/null +++ b/Poplar.make @@ -0,0 +1,39 @@ +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). +Available compilers are: + GNU + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + +COMPILER_GNU = g++ +CXX = $(COMPILER_$(COMPILER)) + +FLAGS_ = -O3 -std=c++17 -Wall +FLAGS_GNU = -O3 -std=c++17 -Wall +CXXFLAGS=$(FLAGS_$(COMPILER)) + +PLATFORM = $(shell uname -s) +LIBS = -lpoplar -lpopops -lpoputil + + +.PHONY: all +all: poplar-stream popops-stream poplar-stream-vectorised + +.PHONY: clean +clean: + rm -f poplar-stream popops-stream poplar-stream-vectorised PoplarKernels.gc + +poplar-stream: main.cpp PoplarStream.cpp + $(CXX) $(CXXFLAGS) -DPOPLAR $^ $(EXTRA_FLAGS) $(LIBS) -o $@ + +poplar-stream-vectorised: main.cpp PoplarStream.cpp + $(CXX) $(CXXFLAGS) -DPOPLAR -DVECTORISED=true $^ $(EXTRA_FLAGS) $(LIBS) -o $@ + +popops-stream: main.cpp PopopsStream.cpp + $(CXX) $(CXXFLAGS) -DPOPLAR $^ $(EXTRA_FLAGS) $(LIBS) -o $@ + + diff --git a/PoplarKernels.cpp b/PoplarKernels.cpp new file mode 100644 index 00000000..46ff75e7 --- /dev/null +++ b/PoplarKernels.cpp @@ -0,0 +1,340 @@ +#include +#include +#include +using namespace poplar; + +#define UNROLL 8 + +template +class InitKernel : public Vertex +{ + +public: + Output> a, b, c; + unsigned size; + Input initA, initB, initC; + + bool compute() + { + for (auto i = 0u; i < size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + return true; + } +}; + +template class InitKernel; +template class InitKernel; +template class InitKernel; +template class InitKernel; + +template +class CopyKernel : public Vertex +{ + +public: + Input> a; + Output> c; + unsigned size; + + inline void doCopy(const V *__restrict src, V *__restrict dst, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + dst[i] = src[i]; + } + } + + bool compute() + { + doCopy(reinterpret_cast(&a[0]), reinterpret_cast(&c[0]), size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class CopyKernel; +template class CopyKernel; +template class CopyKernel; +template class CopyKernel; + +template +class MulKernel : public Vertex +{ + +public: + Input> c; + Output> b; + unsigned size; + float alpha; + + inline void doMul(const V *__restrict src, V *__restrict dst, const float alpha, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + dst[i] = alpha * src[i]; + } + } + + bool compute() + { + doMul(reinterpret_cast(&c[0]), reinterpret_cast(&b[0]), alpha, size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class MulKernel; +template class MulKernel; +template class MulKernel; + +template <> +class MulKernel : public Vertex +{ + +public: + Input> c; + Output> b; + unsigned size; + float alpha; + + inline void doMul(const half4 *__restrict src, half4 *__restrict dst, const float alpha, const unsigned size) + { + half _alpha = (half)alpha; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + dst[i] = _alpha * src[i]; + } + } + + bool compute() + { + doMul(reinterpret_cast(&c[0]), reinterpret_cast(&b[0]), alpha, size / 4); + return true; + } +}; + 
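// -----------------------------------------------------------------------------------
// Editor's sketch (not part of PoplarKernels.cpp): the vectorised kernels above all use
// the same trick -- reinterpret the scalar array as a wider vector type V (float2 or
// half4 on the IPU) and loop over size * sizeof(T) / sizeof(V) elements, so the compiler
// can emit wider 64-bit loads and stores. A host-side analogue is shown below, with a
// hand-rolled Vec2 standing in for the IPU's float2; all names here are illustrative
// assumptions, not code from the PR.
struct Vec2 { float x, y; };                        // stand-in for the IPU float2 type

inline void copyAsVectors(const float *src, float *dst, unsigned size)
{
    // size counts floats; each Vec2 carries two of them, so the trip count is halved
    const Vec2 *vsrc = reinterpret_cast<const Vec2 *>(src);
    Vec2 *vdst = reinterpret_cast<Vec2 *>(dst);
    for (auto i = 0u; i < size * sizeof(float) / sizeof(Vec2); i++)
        vdst[i] = vsrc[i];
}
// -----------------------------------------------------------------------------------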
+template +class AddKernel : public Vertex +{ + +public: + Input> b; + Input> a; + Output> c; + unsigned size; + + inline void doAdd(const V *__restrict a, const V *__restrict b, V *__restrict c, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + c[i] = a[i] + b[i]; + } + } + + bool compute() + { + doAdd(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), reinterpret_cast(&c[0]), size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class AddKernel; +template class AddKernel; +template class AddKernel; +template class AddKernel; + +template +class TriadKernel : public Vertex +{ + +public: + Input> b; + Input> c; + Output> a; + float alpha; + unsigned size; + + inline void doTriad(V *__restrict a, const V *__restrict b, const V *__restrict c, const float alpha, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + a[i] = b[i] + alpha * c[i]; + } + } + + bool compute() + { + doTriad(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), reinterpret_cast(&c[0]), alpha, size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class TriadKernel; +template class TriadKernel; +template <> +class TriadKernel : public Vertex +{ + +public: + Input> b; + Input> c; + Output> a; + float alpha; + unsigned size; + + inline void doTriad(float2 *__restrict a, const float2 *__restrict b, const float2 *__restrict c, const float alpha, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + a[i] = b[i] + alpha * c[i]; + } + } + + bool compute() + { + doTriad(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), reinterpret_cast(&c[0]), alpha, size / 2); + return true; + } +}; +template <> +class TriadKernel : public Vertex +{ + +public: + Input> b; + Input> c; + Output> a; + float alpha; + unsigned size; + + inline void doTriad(half4 *__restrict a, const half4 *__restrict b, const half4 *__restrict c, const float alpha, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + a[i] = b[i] + (half)alpha * c[i]; + } + } + + bool compute() + { + doTriad(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), reinterpret_cast(&c[0]), alpha, size / 4); + return true; + } +}; + +template +class DotProdKernel : public Vertex +{ + +public: + Input> a; + Input> b; + Output sum; + unsigned size; + + inline auto doDotProd(const V *__restrict a, const V *__restrict b, const unsigned size) -> float + { + float tmp = 0.f; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + tmp += a[i] * b[i]; + } + return tmp; + } + + bool compute() + { + *sum = doDotProd(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class DotProdKernel; +template class DotProdKernel; +template <> +class DotProdKernel : public Vertex +{ + +public: + Input> a; + Input> b; + Output sum; + unsigned size; + + inline auto doDotProd(const float2 *__restrict a, const float2 *__restrict b, const unsigned size) -> float + { + float2 tmp = {0.f, 0.f}; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + tmp += a[i] * b[i]; + } + return (float)tmp[0] + tmp[1]; + } + bool compute() + { + *sum = doDotProd(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), size / 2); + return true; + } +}; +template <> +class DotProdKernel : public Vertex +{ + +public: + Input> a; + Input> b; + Output sum; + unsigned size; + inline auto doDotProd(const half4 *__restrict a, const half4 *__restrict b, const unsigned size) -> float + { + half4 tmp = {0, 0, 0, 
0}; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + tmp += a[i] * b[i]; + } + return (float)tmp[0] + tmp[1] + tmp[2] + tmp[3]; + } + + bool compute() + { + *sum = doDotProd(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), size) / 4; + return true; + } +}; + +class ReduceSum : public Vertex +{ +public: + Input> partialSums; + Output sum; + unsigned size; + + inline auto doReduceSum(const float *__restrict partials, const unsigned size) -> float + { + float tmp = 0.f; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + tmp += partials[i]; + } + return tmp; + } + + bool compute() + { + *sum = doReduceSum(reinterpret_cast(&partialSums[0]), size); + return true; + } +}; diff --git a/PoplarStream.cpp b/PoplarStream.cpp new file mode 100644 index 00000000..3bdc0edf --- /dev/null +++ b/PoplarStream.cpp @@ -0,0 +1,645 @@ + +// Copyright (c) 2015-16 Thorben Louw, Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "PoplarStream.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace poplar; +using namespace poplar::program; + +enum Programs +{ + INIT_PROGRAM, + COPY_PROGRAM, + MUL_PROGRAM, + ADD_PROGRAM, + TRIAD_PROGRAM, + DOT_PROGRAM, + STREAM_BACK_TO_HOST_PROGRAM +}; + +#ifdef DEBUG +const OptionFlags POPLAR_ENGINE_OPTIONS{ + {"target.saveArchive", "archive.a"}, + {"debug.instrument", "true"}, + {"debug.instrumentCompute", "true"}, + {"debug.loweredVarDumpFile", "vars.capnp"}, + {"debug.instrumentControlFlow", "true"}, + {"debug.computeInstrumentationLevel", "tile"}}; +#else +const OptionFlags POPLAR_ENGINE_OPTIONS{ + {"debug.instrument", "false"}}; +#endif + + +// This is due to https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion, +// and is an ultra-hacky way to convert the initial array values to half without having a full half library +// on the host +uint16_t toHalf(float val) +{ + uint32_t x = static_cast(val); + return ((x >> 16) & 0x8000) | ((((x & 0x7f800000) - 0x38000000) >> 13) & 0x7c00) | ((x >> 13) & 0x03ff); +} + +void captureProfileInfo(Engine &engine) +{ + std::ofstream graphOfs; + graphOfs.open("graph.json", std::ofstream::out | std::ofstream::trunc); + + std::ofstream executionOfs; + executionOfs.open("execution.json", std::ofstream::out | std::ofstream::trunc); + + poplar::serializeToJSON(graphOfs, engine.getGraphProfile(), false); + poplar::serializeToJSON(executionOfs, engine.getExecutionProfile(), false); + + graphOfs.close(); + executionOfs.close(); +} + +std::optional getIpuDevice(const unsigned deviceType = 0) +{ + const auto validOptions = std::array{0, 1, 2, 4, 8, 16}; + const bool isValid = std::find(validOptions.begin(), validOptions.end(), deviceType) != validOptions.end(); + if (isValid) + { + if (deviceType == 0) + { // Use the CPUDevice + // Note that as of Poplar v1.1.11, this ony returns a useless device with 1 tile and 256Kb of memory! 
+ return std::optional(Device::createCPUDevice()); + } + else + { + // Target an IPUDevice + DeviceManager manager = DeviceManager::createDeviceManager(); + Device device; + for (auto &hwDevice : manager.getDevices(poplar::TargetType::IPU, deviceType)) + { + device = std::move(hwDevice); + if (device.attach()) + { + std::cout << "Attached to IPU " << device.getId() << std::endl; + return std::optional(std::move(device)); + } + } + } + } + return std::nullopt; +} + +void listDevices() +{ + + DeviceManager manager = DeviceManager::createDeviceManager(); + + std::cout << 0 << ": " + << "CPUDevice" << std::endl; + + // Attempt to attach to a single IPU: + Device device; + auto multiIpu = std::array{2, 4, 8, 16}; + for (auto i : multiIpu) + { + if (auto devices = manager.getDevices(poplar::TargetType::IPU, i); !devices.empty()) + { + std::cout << i << ": " + << "IPUDevice" << std::endl; + } + } +} + +class PoplarStreamUtil +{ +private: + const unsigned numTiles; + const unsigned numWorkersPerTile; + const unsigned arraySize; + + const unsigned totalTilesThatWillBeUsed; + const unsigned totalWorkersThatWillBeUsed; + + std::map tensors = {}; + std::vector programs = {}; + std::map dataStreams = {}; + + [[nodiscard]] unsigned numItemsForTileAndWorker(const unsigned tileNum, const unsigned workerNum) const + { + if (arraySize <= numTiles) + { // Just use 1 item per worker + return 1; + } + else + { // Balance as fairly as possible + auto extra = tileNum < arraySize % numTiles; + auto totalForTile = unsigned(arraySize / numTiles) + extra; + auto extraForThread = workerNum < totalForTile % numWorkersPerTile; + return unsigned(totalForTile / numWorkersPerTile) + extraForThread; + } + } + + [[nodiscard]] unsigned numItemsForTile(const unsigned tileNum) const + { + if (arraySize <= numTiles) + { // We'll just use one item per worker + return numWorkersUsedOnTile(tileNum); + } + else + { // Now we must balance as fairly as possible + auto extra = tileNum < arraySize % numTiles; + return unsigned(arraySize / numTiles) + extra; + } + } + + [[nodiscard]] unsigned numWorkersUsedOnTile(const unsigned tileNum) const + { + return std::min(numWorkersPerTile, totalWorkersThatWillBeUsed - tileNum * numWorkersPerTile); + } + + // This could be done faster with a Stream copy, but that creates a very large FIFO + // which limits the array size even further + Program initProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("init"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + cs, + "InitKernel" + templateStr, + { + {"a", tensors["a"].slice({idx}, {idx + numItems}).flatten()}, + {"b", tensors["b"].slice({idx}, {idx + numItems}).flatten()}, + {"c", tensors["c"].slice({idx}, {idx + numItems}).flatten()}, + {"initA", tensors["initA"]}, + {"initB", tensors["initB"]}, + {"initC", tensors["initC"]}, + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program copyProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("copy"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems 
= numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex(cs, + "CopyKernel" + templateStr, + {{"a", tensors["a"].slice({idx}, {idx + numItems}).flatten()}, + {"c", tensors["c"].slice({idx}, {idx + numItems}).flatten()}}); + graph.setInitialValue(v["size"], unsigned(numItems)); + + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program mulProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("mul"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + cs, + "MulKernel" + templateStr, + { + {"b", tensors["b"].slice({idx}, {idx + numItems})}, + {"c", tensors["c"].slice({idx}, {idx + numItems})}, + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setInitialValue(v["alpha"], float(startScalar)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program addProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("add"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + cs, + "AddKernel" + templateStr, + { + {"b", tensors["b"].slice({idx}, {idx + numItems})}, + {"a", tensors["a"].slice({idx}, {idx + numItems})}, + {"c", tensors["c"].slice({idx}, {idx + numItems})}, + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program triadProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("triad"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + cs, + "TriadKernel" + templateStr, + { + {"b", tensors["b"].slice({idx}, {idx + numItems})}, + {"a", tensors["a"].slice({idx}, {idx + numItems})}, + {"c", tensors["c"].slice({idx}, {idx + numItems})}, + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setInitialValue(v["alpha"], float(startScalar)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program dotProdProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet dotCs = graph.addComputeSet("dot"); + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + dotCs, + "DotProdKernel" + templateStr, + {{"b", tensors["b"].slice({idx}, {idx + numItems})}, + {"a", tensors["a"].slice({idx}, {idx + numItems})}, + {"sum", tensors["partialSumsPerWorker"][tile * numWorkersPerTile + + worker]} + + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + ComputeSet 
partialReduxCs = graph.addComputeSet("reductionPerTile"); + idx = 0u; + for (unsigned tile = 0; tile < totalTilesThatWillBeUsed; tile++) + { + auto workersUsedOnThisTile = numWorkersUsedOnTile(tile); + const auto v = graph.addVertex( + partialReduxCs, + "ReduceSum", + { + {"partialSums", tensors["partialSumsPerWorker"].slice( + {idx}, {idx + workersUsedOnThisTile})}, + {"sum", tensors["partialSumsPerTile"][tile]}, + }); + graph.setInitialValue(v["size"], unsigned(workersUsedOnThisTile)); + + graph.setCycleEstimate(v, workersUsedOnThisTile); + graph.setTileMapping(v, tile); + idx += workersUsedOnThisTile; + } + + ComputeSet finalReduxCs = graph.addComputeSet("finalReduction"); + const auto v = graph.addVertex( + finalReduxCs, + "ReduceSum", + {{"partialSums", tensors["partialSumsPerTile"]}, + {"sum", tensors["sum"]}}); + graph.setInitialValue(v["size"], unsigned(totalTilesThatWillBeUsed)); + + graph.setCycleEstimate(v, numTiles); + graph.setTileMapping(v, numTiles - 1); + + return Sequence( + Execute(dotCs), + Execute(partialReduxCs), + Execute(finalReduxCs)); + } + + void createAndLayOutTensors(poplar::Type type, Graph &graph) + { + tensors["initA"] = graph.addVariable(FLOAT, {}, "initA"); + tensors["initB"] = graph.addVariable(FLOAT, {}, "initB"); + tensors["initC"] = graph.addVariable(FLOAT, {}, "initC"); + + tensors["a"] = graph.addVariable(type, {arraySize}, "a"); + tensors["b"] = graph.addVariable(type, {arraySize}, "b"); + tensors["c"] = graph.addVariable(type, {arraySize}, "c"); + tensors["sum"] = graph.addVariable(FLOAT, {}, "sum"); + tensors["partialSumsPerWorker"] = graph.addVariable(FLOAT, {totalWorkersThatWillBeUsed}, + "partialSumsPerWorker"); + tensors["partialSumsPerTile"] = graph.addVariable(FLOAT, {totalTilesThatWillBeUsed}, "partialSumsPerTile"); + graph.createHostRead("sum", tensors["sum"]); + graph.createHostWrite("initA", tensors["initA"]); + graph.createHostWrite("initB", tensors["initB"]); + graph.createHostWrite("initC", tensors["initC"]); + + auto idx = 0u; + for (auto tile = 0u; tile < totalTilesThatWillBeUsed; tile++) + { + auto mapMem = [=]() -> unsigned { +#ifdef MEM_ON_NEXT_TILE + return (tile + 1) % totalTilesThatWillBeUsed; +#else + return tile; +#endif + }; + + if (auto numItems = numItemsForTile(tile); numItems > 0) + { + graph.setTileMapping(tensors["a"].slice(idx, idx + numItems), mapMem()); + graph.setTileMapping(tensors["b"].slice(idx, idx + numItems), mapMem()); + graph.setTileMapping(tensors["c"].slice(idx, idx + numItems), mapMem()); + idx += numItems; + } + + graph.setTileMapping(tensors["partialSumsPerWorker"].slice( + {tile * numWorkersPerTile}, {tile * numWorkersPerTile + numWorkersUsedOnTile(tile)}), + mapMem()); + graph.setTileMapping(tensors["partialSumsPerTile"].slice(tile, tile + 1), mapMem()); + } + graph.setTileMapping(tensors["sum"], numTiles - 1); + graph.setTileMapping(tensors["initA"], 0); + graph.setTileMapping(tensors["initB"], 0); + graph.setTileMapping(tensors["initC"], 0); + } + + void createDataStreams(poplar::Type type, Graph &graph) + { + // dataStreams["in_a"] = graph.addHostToDeviceFIFO("in_a", type, arraySize); + // dataStreams["in_b"] = graph.addHostToDeviceFIFO("in_b", type, arraySize); + // dataStreams["in_c"] = graph.addHostToDeviceFIFO("in_c", type, arraySize); + + dataStreams["out_a"] = graph.addDeviceToHostFIFO("out_a", type, arraySize); + dataStreams["out_b"] = graph.addDeviceToHostFIFO("out_b", type, arraySize); + dataStreams["out_c"] = graph.addDeviceToHostFIFO("out_c", type, arraySize); + } + + [[nodiscard]] 
const unsigned numWorkersNeeded(unsigned arraySize, unsigned numTiles, unsigned numWorkersPerTile) const + { + return std::min( + arraySize, + std::min( + numTiles * numWorkersPerTile, + unsigned(std::ceil(arraySize / (numWorkersPerTile * 1.0))) * 6)); + } + + [[nodiscard]] const unsigned numTilesNeeded(unsigned arraySize, unsigned numTiles, unsigned numWorkersPerTile) const + { + return std::min(numTiles, unsigned(std::ceil(arraySize / (numWorkersPerTile * 1.0)))); + } + +public: + PoplarStreamUtil(unsigned numTiles, unsigned numWorkersPerTile, unsigned arraySize) : numTiles(numTiles), + numWorkersPerTile(numWorkersPerTile), + arraySize(arraySize), + totalTilesThatWillBeUsed(numTilesNeeded(arraySize, numTiles, numWorkersPerTile)), + totalWorkersThatWillBeUsed(numWorkersNeeded(arraySize, numTiles, numWorkersPerTile)) + { + } + + std::unique_ptr prepareEngine(const Device &device, + const Graph &graph, + void *a, + void *b, + void *c) + { + assert(!programs.empty()); + auto engine = std::make_unique(graph, programs, POPLAR_ENGINE_OPTIONS); + + engine->connectStream("out_a", a); + engine->connectStream("out_b", b); + engine->connectStream("out_c", c); + + engine->load(device); + + return std::move(engine); + } + + void buildComputeGraph(poplar::Type type, Graph &graph) + { + // Set up data streams to copy data in and out of graph + + auto typeStr1 = type == FLOAT ? "" : "half4>"; +#else + auto typeStr2 = type == FLOAT ? "float>" : "half>"; +#endif + + auto typeStr = std::string(typeStr1) + std::string(typeStr2); + + createDataStreams(type, graph); + + createAndLayOutTensors(type, graph); + + auto InitProg = initProgram(typeStr, graph); + + auto CopyProg = copyProgram(typeStr, graph); + auto MulProg = mulProgram(typeStr, graph); + auto AddProg = addProgram(typeStr, graph); + auto TriadProg = triadProgram(typeStr, graph); + auto DotProg = dotProdProgram(typeStr, graph); + + auto StreamToHostProg = Sequence(Copy(tensors["a"], dataStreams["out_a"]), + Copy(tensors["b"], dataStreams["out_b"]), + Copy(tensors["c"], dataStreams["out_c"])); + + programs = {InitProg, CopyProg, MulProg, AddProg, TriadProg, DotProg, + StreamToHostProg}; + } +}; + +template +PoplarStream::PoplarStream(const unsigned int arraySize, const int device_num, const bool halfPrecision) : arraySize(arraySize), + halfPrecision(halfPrecision), + a(std::unique_ptr(new T[arraySize]())), + b(std::unique_ptr(new T[arraySize]())), + c(std::unique_ptr(new T[arraySize]())) +{ + + auto device = getIpuDevice(device_num); + + if (!device.has_value()) + { + throw std::runtime_error("Could not allocate IPU device"); + } + target = device->getTarget(); + + Graph graph(device.value()); + const auto numTiles = graph.getTarget().getNumTiles(); + const auto numWorkers = graph.getTarget().getNumWorkerContexts(); + const auto maxBytesPerTile = graph.getTarget().getBytesPerTile(); + const auto clockFrequency = graph.getTarget().getTileClockFrequency(); + + const auto maxArraySize = ((double)numTiles) * maxBytesPerTile / 1024.0 / 1024.0 / 3; + + std::cout << "Using IPU with " << numTiles << " tiles, each with " << numWorkers + << " workers and " << maxBytesPerTile / 1024 << "KB of memory per tile, and clock frequency " + << (int)clockFrequency / 1000 / 1000 << " MHz. 
Maximum array size will be slightly less than " + << std::fixed << std::setprecision(2) << std::floor(maxArraySize) << " MB" + << std::endl; + + auto util = PoplarStreamUtil(numTiles, numWorkers, arraySize); + + graph.addCodelets("PoplarKernels.cpp", CodeletFileType::Auto, "-O3"); + + if (sizeof(T) > sizeof(float)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + + // Check buffers fit on the device + size_t sizeT = (sizeof(T) == sizeof(float) && halfPrecision) ? sizeof(T) / 2 : sizeof(T); + unsigned long maxbuffer = ((unsigned long)numTiles) * maxBytesPerTile; + unsigned long totalmem = ((unsigned long)numTiles) * maxBytesPerTile; + if (maxbuffer < sizeT * ((unsigned long)arraySize)) + throw std::runtime_error("Device cannot allocate a buffer big enough"); + if (totalmem < 3L * sizeT * arraySize) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + util.buildComputeGraph(halfPrecision ? HALF : FLOAT, graph); + engine = util.prepareEngine(device.value(), graph, a.get(), b.get(), c.get()); +} + +template +PoplarStream::~PoplarStream() = default; + +template +void PoplarStream::copy() +{ + engine->run(COPY_PROGRAM); +} + +template +void PoplarStream::mul() +{ + engine->run(MUL_PROGRAM); +} + +template +void PoplarStream::add() +{ + engine->run(ADD_PROGRAM); +} + +template +void PoplarStream::triad() +{ + engine->run(TRIAD_PROGRAM); +} + +template +T PoplarStream::dot() +{ + engine->run(DOT_PROGRAM); + engine->readTensor("sum", &sum); + return sum; +} + +template +void PoplarStream::init_arrays(T initA, T initB, T initC) +{ + + if (halfPrecision) + { + const uint32_t fakeA = toHalf(initA); + const uint32_t fakeB = toHalf(initB); + const uint32_t fakeC = toHalf(initC); + + engine->writeTensor("initA", &fakeA); + engine->writeTensor("initB", &fakeB); + engine->writeTensor("initC", &fakeC); + } + else + { + engine->writeTensor("initA", &initA); + engine->writeTensor("initB", &initB); + engine->writeTensor("initC", &initC); + } + engine->run(INIT_PROGRAM); +} + + + +template<> void PoplarStream::copyArrays( const double *src, double *dst) { + std::memcpy(dst, src, arraySize * sizeof(double)); +} + +template<> void PoplarStream::copyArrays(const float *src, float *dst) { + copyDeviceHalfToFloat(target, src, dst, arraySize); +} + + + +template +void PoplarStream::read_arrays(std::vector &h_a, std::vector &h_b, std::vector &h_c) +{ + + engine->run(STREAM_BACK_TO_HOST_PROGRAM); + copyArrays(a.get(), h_a.data()); + copyArrays(b.get(), h_b.data()); + copyArrays(c.get(), h_c.data()); + + +#ifdef DEBUG + captureProfileInfo(*engine); + engine->printProfileSummary(std::cout, + OptionFlags{{"showExecutionSteps", "true"}}); +#endif +} + +template class PoplarStream; + +template class PoplarStream; // Not usable, but needs to exist for stream.cpp diff --git a/PoplarStream.h b/PoplarStream.h new file mode 100644 index 00000000..0db6805b --- /dev/null +++ b/PoplarStream.h @@ -0,0 +1,58 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Poplar" + + +using namespace poplar::program; + +template +class PoplarStream : public Stream { + +protected: + unsigned int arraySize; + const bool halfPrecision; + T sum = 0; + std::unique_ptr engine; + poplar::Target target; + 
std::unique_ptr a; + std::unique_ptr b; + std::unique_ptr c; + +public: + + PoplarStream(const unsigned int, const int, const bool halfPrecision); + + ~PoplarStream(); + + virtual void copy() override; + + virtual void add() override; + + virtual void mul() override; + + virtual void triad() override; + + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + + virtual void read_arrays(std::vector &a, std::vector &b, std::vector &c) override; + virtual void copyArrays(const T *src, T *dst); + +}; + diff --git a/PopopsStream.cpp b/PopopsStream.cpp new file mode 100644 index 00000000..cacf3144 --- /dev/null +++ b/PopopsStream.cpp @@ -0,0 +1,429 @@ + +// Copyright (c) 2015-16 Thorben Louw, Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "PoplarStream.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace poplar; +using namespace poplar::program; +using namespace popops; + +enum Programs +{ + INIT_PROGRAM, + COPY_PROGRAM, + MUL_PROGRAM, + ADD_PROGRAM, + TRIAD_PROGRAM, + DOT_PROGRAM, + STREAM_BACK_TO_HOST_PROGRAM +}; + +// const OptionFlags POPLAR_ENGINE_OPTIONS{ +// {"target.saveArchive", "archive.a"}, +// {"debug.instrument", "true"}, +// {"debug.instrumentCompute", "true"}, +// {"debug.loweredVarDumpFile", "vars.capnp"}, +// {"debug.instrumentControlFlow", "true"}, +// {"debug.computeInstrumentationLevel", "tile"} +// }; + +const OptionFlags POPLAR_ENGINE_OPTIONS{ + {"debug.instrument", "false"}}; + +// This is due to https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion, +// and is an ultra-hacky way to convert the initial array values to half without having a full half library +// on the host +uint16_t toHalf(float val) +{ + uint32_t x = static_cast(val); + return ((x >> 16) & 0x8000) | ((((x & 0x7f800000) - 0x38000000) >> 13) & 0x7c00) | ((x >> 13) & 0x03ff); +} + +void captureProfileInfo(Engine &engine) +{ + std::ofstream graphOfs; + graphOfs.open("graph.json", std::ofstream::out | std::ofstream::trunc); + + std::ofstream executionOfs; + executionOfs.open("execution.json", std::ofstream::out | std::ofstream::trunc); + + poplar::serializeToJSON(graphOfs, engine.getGraphProfile(), false); + poplar::serializeToJSON(executionOfs, engine.getExecutionProfile(), false); + + graphOfs.close(); + executionOfs.close(); +} + +std::optional getIpuDevice(const unsigned deviceType = 0) +{ + const auto validOptions = std::array{0, 1, 2, 4, 8, 16}; + const bool isValid = std::find(validOptions.begin(), validOptions.end(), deviceType) != validOptions.end(); + if (isValid) + { + if (deviceType == 0) + { // Use the CPUDevice + // Note that as of Poplar v1.1.11, this ony returns a useless device with 1 tile and 256Kb of memory! 
+ return std::optional(Device::createCPUDevice()); + } + else + { + // Target an IPUDevice + DeviceManager manager = DeviceManager::createDeviceManager(); + Device device; + for (auto &hwDevice : manager.getDevices(poplar::TargetType::IPU, deviceType)) + { + device = std::move(hwDevice); + if (device.attach()) + { + std::cout << "Attached to IPU " << device.getId() << std::endl; + return std::optional(std::move(device)); + } + } + } + } + return std::nullopt; +} + +void listDevices() +{ + + DeviceManager manager = DeviceManager::createDeviceManager(); + + std::cout << 0 << ": " + << "CPUDevice" << std::endl; + + // Attempt to attach to a single IPU: + Device device; + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 1); !devices.empty()) + { + std::cout << 1 << ": " + << "IPUDevice" << std::endl; + } + + // Attempt to attach to 2 IPUs: + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 2); !devices.empty()) + { + std::cout << 2 << ": " + << "2x IPUDevices" << std::endl; + } + + // Attempt to attach to 4 IPUs: + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 4); !devices.empty()) + { + std::cout << 4 << ": " + << "4x IPUDevices" << std::endl; + } + + // Attempt to attach to 8 IPUs: + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 8); !devices.empty()) + { + std::cout << 8 << ": " + << "8x IPUDevices" << std::endl; + } + + // Attempt to attach to 16 IPUs: + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 16); !devices.empty()) + { + std::cout << 16 << ": " + << "16x IPUDevices" << std::endl; + } +} + +class PoplarStreamUtil +{ +private: + std::map tensors = {}; + std::vector programs = {}; + std::map dataStreams = {}; + + // This could be done faster with a Stream copy, but that creates a very large FIFO + // which limits the array size even further + Program initProgram(Graph &graph, size_t arraySize) + { + Sequence(s); + tensors["a"] = poputil::duplicate(graph, tensors["initA"].reshape({1}).broadcast(arraySize, 0), s, "a"); + tensors["b"] = poputil::duplicate(graph, tensors["initB"].reshape({1}).broadcast(arraySize, 0), s, "b"); + tensors["c"] = poputil::duplicate(graph, tensors["initC"].reshape({1}).broadcast(arraySize, 0), s, "c"); + poputil::mapTensorLinearly(graph, tensors["a"]); + poputil::mapTensorLinearly(graph, tensors["b"]); + poputil::mapTensorLinearly(graph, tensors["c"]); + + // s.add(PrintTensor("a", tensors["a"])); + // s.add(PrintTensor("b", tensors["b"])); + // s.add(PrintTensor("c", tensors["c"])); + + return s; + } + + // c = a + Program copyProgram(Graph &graph) + { + auto s = Sequence(); + s.add(Copy(tensors["a"], tensors["c"])); + // s.add(PrintTensor("c (=a)", tensors["c"])); + return s; + } + + // b[i] = x * c[i]; + Program mulProgram(Graph &graph) + { + auto s = Sequence(); + s.add(Copy(popops::mul(graph, tensors["c"], tensors["alpha"], s, "Mul"), tensors["b"])); + // s.add(PrintTensor("b (=xc)", tensors["b"])); + return s; + } + + // c = a + b + Program addProgram(Graph &graph) + { + auto s = Sequence(); + s.add(Copy(popops::add(graph, tensors["a"], tensors["b"], s, "Add"), tensors["c"])); + // s.add(PrintTensor("c (=a+b)", tensors["c"])); + return s; + } + + // a = b + xc + Program triadProgram(Graph &graph) + { + auto s = Sequence(); + s.add(Copy(tensors["b"], tensors["a"])); + popops::scaledAddTo(graph, tensors["a"], tensors["c"], tensors["alpha"], s, "Triad", {{"optimizeForSpeed", "true"}}); + // s.add(PrintTensor("a (=b + xc)", tensors["a"])); + return s; + } + + // sum = reduce+(a * b) 
+ Program dotProdProgram(Graph &graph) + { + Sequence s; + + popops::reduceWithOutput(graph, + popops::mul(graph, tensors["a"], tensors["b"], s, "a*b"), tensors["sum"], {0}, + {popops::Operation::ADD}, + s, + "reduce+"); + // s.add(PrintTensor("reduce+(a * b)", tensors["sum"])); + + return s; + } + + void createAndLayOutTensors(poplar::Type type, Graph &graph) + { + tensors["initA"] = graph.addVariable(type, {}, "initA"); + tensors["initB"] = graph.addVariable(type, {}, "initB"); + tensors["initC"] = graph.addVariable(type, {}, "initC"); + + tensors["alpha"] = graph.addConstant(type, {}, startScalar, "alpha"); + tensors["sum"] = graph.addVariable(FLOAT, {}, "sum"); + + graph.createHostRead("sum", tensors["sum"]); + graph.createHostWrite("initA", tensors["initA"]); + graph.createHostWrite("initB", tensors["initB"]); + graph.createHostWrite("initC", tensors["initC"]); + + graph.setTileMapping(tensors["sum"], 4); + graph.setTileMapping(tensors["initA"], 0); + graph.setTileMapping(tensors["initB"], 1); + graph.setTileMapping(tensors["initC"], 2); + graph.setTileMapping(tensors["alpha"], 3); + } + + void createDataStreams(poplar::Type type, Graph &graph, size_t arraySize) + { + dataStreams["out_a"] = graph.addDeviceToHostFIFO("out_a", type, arraySize); + dataStreams["out_b"] = graph.addDeviceToHostFIFO("out_b", type, arraySize); + dataStreams["out_c"] = graph.addDeviceToHostFIFO("out_c", type, arraySize); + } + +public: + std::unique_ptr prepareEngine(const Device &device, + const Graph &graph, + void *a, + void *b, + void *c) + { + assert(!programs.empty()); + auto engine = std::make_unique(graph, programs, POPLAR_ENGINE_OPTIONS); + + engine->connectStream("out_a", a); + engine->connectStream("out_b", b); + engine->connectStream("out_c", c); + + engine->load(device); + + return std::move(engine); + } + + void buildComputeGraph(poplar::Type type, Graph &graph, size_t arraySize) + { + + createDataStreams(type, graph, arraySize); + + createAndLayOutTensors(type, graph); + + auto InitProg = initProgram(graph, arraySize); + + auto CopyProg = copyProgram(graph); + auto MulProg = mulProgram(graph); + auto AddProg = addProgram(graph); + auto TriadProg = triadProgram(graph); + auto DotProg = dotProdProgram(graph); + + auto StreamToHostProg = Sequence(Copy(tensors["a"], dataStreams["out_a"]), + Copy(tensors["b"], dataStreams["out_b"]), + Copy(tensors["c"], dataStreams["out_c"])); + + programs = {InitProg, CopyProg, MulProg, AddProg, TriadProg, DotProg, + StreamToHostProg}; + } +}; + +template +PoplarStream::PoplarStream(const unsigned int arraySize, const int device_num, const bool halfPrecision) : arraySize(arraySize), + halfPrecision(halfPrecision), + a(std::unique_ptr(new T[arraySize]())), + b(std::unique_ptr(new T[arraySize]())), + c(std::unique_ptr(new T[arraySize]())) +{ + + auto device = getIpuDevice(device_num); + + if (!device.has_value()) + { + throw std::runtime_error("Could not allocate IPU device"); + } + + Graph graph(device.value()); + const auto numTiles = graph.getTarget().getNumTiles(); + const auto numWorkers = graph.getTarget().getNumWorkerContexts(); + const auto maxBytesPerTile = graph.getTarget().getBytesPerTile(); + const auto clockFrequency = graph.getTarget().getTileClockFrequency(); + + const auto maxArraySize = ((double)numTiles) * maxBytesPerTile / 1024.0 / 1024.0 / 3; + + std::cout << "Using IPU with " << numTiles << " tiles, each with " << numWorkers + << " workers and " << maxBytesPerTile / 1024 << "KB of memory per tile, and clock frequency " + << (int)clockFrequency / 
1000 / 1000 << " MHz. Maximum array size will be slightly less than " + << std::fixed << std::setprecision(2) << std::floor(maxArraySize) << " MB" + << std::endl; + + auto util = PoplarStreamUtil(); + + graph.addCodelets("PoplarKernels.cpp"); + popops::addCodelets(graph); + + if (sizeof(T) > sizeof(float)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + + // Check buffers fit on the device + size_t sizeT = (sizeof(T) == sizeof(float) && halfPrecision) ? sizeof(T) / 2 : sizeof(T); + unsigned long maxbuffer = ((unsigned long)numTiles) * maxBytesPerTile; + unsigned long totalmem = ((unsigned long)numTiles) * maxBytesPerTile; + if (maxbuffer < sizeT * ((unsigned long)arraySize)) + throw std::runtime_error("Device cannot allocate a buffer big enough"); + if (totalmem < 3L * sizeT * arraySize) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + util.buildComputeGraph(halfPrecision ? HALF : FLOAT, graph, arraySize); + engine = util.prepareEngine(device.value(), graph, a.get(), b.get(), c.get()); +} + +template +PoplarStream::~PoplarStream() = default; + +template +void PoplarStream::copy() +{ + engine->run(COPY_PROGRAM); +} + +template +void PoplarStream::mul() +{ + engine->run(MUL_PROGRAM); +} + +template +void PoplarStream::add() +{ + engine->run(ADD_PROGRAM); +} + +template +void PoplarStream::triad() +{ + engine->run(TRIAD_PROGRAM); +} + +template +T PoplarStream::dot() +{ + engine->run(DOT_PROGRAM); + engine->readTensor("sum", &sum); + return sum; +} + +template +void PoplarStream::init_arrays(T initA, T initB, T initC) +{ + + if (halfPrecision) + { + const uint32_t fakeA = toHalf(initA); + const uint32_t fakeB = toHalf(initB); + const uint32_t fakeC = toHalf(initC); + + engine->writeTensor("initA", &fakeA); + engine->writeTensor("initB", &fakeB); + engine->writeTensor("initC", &fakeC); + } + else + { + engine->writeTensor("initA", &initA); + engine->writeTensor("initB", &initB); + engine->writeTensor("initC", &initC); + } + engine->run(INIT_PROGRAM); +} + +template +void PoplarStream::read_arrays(std::vector &h_a, std::vector &h_b, std::vector &h_c) +{ + + engine->run(STREAM_BACK_TO_HOST_PROGRAM); + + for (unsigned i = 0; i < arraySize; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } + + // captureProfileInfo(*engine); + // engine->printProfileSummary(std::cout, + // OptionFlags{{"showExecutionSteps", "true"}}); +} + +template class PoplarStream; + +template class PoplarStream; // Not usable, but needs to exist for stream.cpp diff --git a/README.md b/README.md index e3c98b72..cd9ddcef 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Currently implemented are: - Kokkos - RAJA - SYCL + - Graphcore's Poplar framework for the Graphcore IPU This code was previously called GPU-STREAM. @@ -82,6 +83,31 @@ For building with CUDA support, we use the following command. cmake .. -DCMAKE_INSTALL_PREFIX= -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off ``` + +Building and running for Poplar +------------------------------- +See the Developer Documentation at https://www.graphcore.ai/developer for how to install Poplar and set up +the environment variables for building. The latest version we used was v1.1.11. + +The IPU doesn't support doubles, so you must run with the --float option. +For a single IPU (with 1216 tiles), the largest array size we used was 16185000. 
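+That limit is roughly what the on-chip memory allows: 1216 tiles × 256 KB is about 311 MB in total,
+while three float arrays of 16185000 elements occupy about 3 × 64.7 MB ≈ 194 MB, leaving the
+remainder for code, stacks and the partial-sum tensors.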
+
+Run with
+```
+./poplar-stream --arraysize 16185000 --device 1 --float
+```
+
+You can run on multiple IPU targets (such as the 2 IPUs on a single C2 IPU Processor card) by selecting a device appropriately
+using the `--device` flag. The options are 2, 4, 8, or 16 for the respective number of IPUs.
+
+You can run the Poplar program on a CPU target, but as of v1.1.11 this only gives you 1 tile with 256 KB of
+memory. The largest array size we used was 21845.
+
+Run with
+```
+./poplar-stream --arraysize 21845 --device 0 --float
+```
+
 Results
 -------
diff --git a/main.cpp b/main.cpp
index f006f8ca..7e7fc596 100644
--- a/main.cpp
+++ b/main.cpp
@@ -37,6 +37,8 @@
 #include "SYCLStream.h"
 #elif defined(OMP)
 #include "OMPStream.h"
+#elif defined(POPLAR)
+#include "PoplarStream.h"
 #endif
 
 // Default size of 2^25
@@ -44,6 +46,7 @@ unsigned int ARRAY_SIZE = 33554432;
 unsigned int num_times = 100;
 unsigned int deviceIndex = 0;
 bool use_float = false;
+bool use_half = false;
 bool triad_only = false;
 bool output_as_csv = false;
 bool mibibytes = false;
@@ -53,10 +56,10 @@ template <typename T>
 void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);
 
 template <typename T>
-void run();
+void run(bool halfPrecision=false);
 
 template <typename T>
-void run_triad();
+void run_triad(bool halfPrecision=false);
 
 void parseArguments(int argc, char *argv[]);
 
@@ -76,15 +79,15 @@ int main(int argc, char *argv[])
   // TODO: Fix Kokkos to allow multiple template specializations
   if (triad_only)
   {
-    if (use_float)
-      run_triad<float>();
+    if (use_float || use_half)
+      run_triad<float>(use_half);
     else
       run_triad<double>();
   }
   else
   {
-    if (use_float)
-      run<float>();
+    if (use_float || use_half)
+      run<float>(use_half);
     else
       run<double>();
   }
@@ -92,17 +95,21 @@
 }
 
 template <typename T>
-void run()
+void run(bool halfPrecision)
 {
   std::streamsize ss = std::cout.precision();
+  size_t sizeT = (sizeof(T) == sizeof(float) && halfPrecision) ?
sizeof(T)/2 : sizeof(T); if (!output_as_csv) { std::cout << "Running kernels " << num_times << " times" << std::endl; - if (sizeof(T) == sizeof(float)) - std::cout << "Precision: float" << std::endl; - else + if (sizeof(T) == sizeof(float)) { + if (halfPrecision) + std::cout << "Precision: half (if implemented, else float)" << std::endl; + else + std::cout << "Precision: float" << std::endl; + } else std::cout << "Precision: double" << std::endl; @@ -110,19 +117,19 @@ void run() { // MiB = 2^20 std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeT*pow(2.0, -20.0) << " MiB" + << " (=" << ARRAY_SIZE*sizeT*pow(2.0, -30.0) << " GiB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeT*pow(2.0, -20.0) << " MiB" + << " (=" << 3.0*ARRAY_SIZE*sizeT*pow(2.0, -30.0) << " GiB)" << std::endl; } else { // MB = 10^6 std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeT*1.0E-6 << " MB" + << " (=" << ARRAY_SIZE*sizeT*1.0E-9 << " GB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeT*1.0E-6 << " MB" + << " (=" << 3.0*ARRAY_SIZE*sizeT*1.0E-9 << " GB)" << std::endl; } std::cout.precision(ss); @@ -174,6 +181,10 @@ void run() // Use the OpenMP implementation stream = new OMPStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); +#elif defined(POPLAR) + // Use the Graphcore Poplar implementation + stream = new PoplarStream(ARRAY_SIZE, deviceIndex, halfPrecision); + #endif stream->init_arrays(startA, startB, startC); @@ -249,14 +260,13 @@ void run() } - std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; size_t sizes[5] = { - 2 * sizeof(T) * ARRAY_SIZE, - 2 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE, - 2 * sizeof(T) * ARRAY_SIZE + 2 * sizeT * ARRAY_SIZE, + 2 * sizeT * ARRAY_SIZE, + 3 * sizeT * ARRAY_SIZE, + 3 * sizeT * ARRAY_SIZE, + 2 * sizeT * ARRAY_SIZE }; for (int i = 0; i < 5; i++) @@ -274,7 +284,7 @@ void run() << labels[i] << csv_separator << num_times << csv_separator << ARRAY_SIZE << csv_separator - << sizeof(T) << csv_separator + << sizeT << csv_separator << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator << *minmax.first << csv_separator << *minmax.second << csv_separator @@ -299,16 +309,21 @@ void run() } template -void run_triad() +void run_triad(const bool halfPrecision) { + size_t sizeT = (sizeof(T) == sizeof(float) && halfPrecision) ? 
sizeof(T)/2 : sizeof(T); if (!output_as_csv) { std::cout << "Running triad " << num_times << " times" << std::endl; std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; - if (sizeof(T) == sizeof(float)) - std::cout << "Precision: float" << std::endl; + if (sizeof(T) == sizeof(float)) { + if (halfPrecision) + std::cout << "Precision: half (if implemented, else float)" << std::endl; + else + std::cout << "Precision: float" << std::endl; + } else std::cout << "Precision: double" << std::endl; @@ -316,18 +331,18 @@ void run_triad() if (mibibytes) { std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB" - << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeT*pow(2.0, -10.0) << " KiB" + << " (=" << ARRAY_SIZE*sizeT*pow(2.0, -20.0) << " MiB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeT*pow(2.0, -10.0) << " KiB" + << " (=" << 3.0*ARRAY_SIZE*sizeT*pow(2.0, -20.0) << " MiB)" << std::endl; } else { std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB" - << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeT*1.0E-3 << " KB" + << " (=" << ARRAY_SIZE*sizeT*1.0E-6 << " MB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeT*1.0E-3 << " KB" + << " (=" << 3.0*ARRAY_SIZE*sizeT*1.0E-6 << " MB)" << std::endl; } std::cout.precision(ss); } @@ -371,6 +386,10 @@ void run_triad() // Use the OpenMP implementation stream = new OMPStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); +#elif defined(POPLAR) + // Use the Graphcore Poplar implementation + stream = new PoplarStream(ARRAY_SIZE, deviceIndex, halfPrecision); + #endif stream->init_arrays(startA, startB, startC); @@ -394,7 +413,7 @@ void run_triad() check_solution(num_times, a, b, c, sum); // Display timing results - double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times; + double total_bytes = 3 * sizeT * ARRAY_SIZE * num_times; double bandwidth = ((mibibytes) ? 
pow(2.0, -30.0) : 1.0E-9) * (total_bytes / runtime);
 
   if (output_as_csv)
@@ -411,7 +430,7 @@
       << "Triad" << csv_separator
       << num_times << csv_separator
       << ARRAY_SIZE << csv_separator
-      << sizeof(T) << csv_separator
+      << sizeT << csv_separator
       << bandwidth << csv_separator
       << runtime << std::endl;
@@ -541,6 +560,10 @@ void parseArguments(int argc, char *argv[])
     {
       use_float = true;
     }
+    else if (!std::string("--half").compare(argv[i]))
+    {
+      use_half = true;
+    }
     else if (!std::string("--triad-only").compare(argv[i]))
     {
       triad_only = true;
@@ -565,6 +588,7 @@
       std::cout << "  -s --arraysize  SIZE     Use SIZE elements in the array" << std::endl;
       std::cout << "  -n --numtimes   NUM      Run the test NUM times (NUM >= 2)" << std::endl;
       std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
+      std::cout << "      --half               Use half-precision (16-bit) floats on supported platforms" << std::endl;
       std::cout << "      --triad-only         Only run triad" << std::endl;
       std::cout << "      --csv                Output as csv table" << std::endl;
       std::cout << "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl;
diff --git a/package-up-report.sh b/package-up-report.sh
new file mode 100755
index 00000000..9bccf8e1
--- /dev/null
+++ b/package-up-report.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+tar czvf analysis-report.tar.gz graph.json execution.json vars.capnp
diff --git a/results/v3.4/graphcore-c2-ipu/ipu.txt b/results/v3.4/graphcore-c2-ipu/ipu.txt
new file mode 100644
index 00000000..03d8ac30
--- /dev/null
+++ b/results/v3.4/graphcore-c2-ipu/ipu.txt
@@ -0,0 +1,18 @@
+BabelStream
+Version: 3.4
+Implementation: Poplar
+Running kernels 100 times
+Precision: float
+Array size: 64.7 MB (=0.1 GB)
+Total size: 194.2 MB (=0.2 GB)
+Attached to IPU 12
+Using IPU with 1216 tiles, each with 6 workers and 256KB of memory per tile, and clock frequency 1600 MHz
+Validation failed on sum. Error 7.62939e-06
+Sum was 19.1933250427246 but should be 19.1933326721191
+Function    MBytes/sec   Min (sec)   Max         Average
+Copy        4193140.970  0.00003     0.00003     0.00003
+Mul         3586405.562  0.00004     0.00004     0.00004
+Add         4192914.661  0.00005     0.00005     0.00005
+Triad       3772580.708  0.00005     0.00005     0.00005
+Dot         1301842.970  0.00010     0.00010     0.00010
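
The "Validation failed on sum" line in ipu.txt is worth a note: main.cpp appears to check the dot-product sum against a fixed absolute tolerance (around 1e-8 in the BabelStream versions we have seen), which a single-precision reduction over 16.2 million elements cannot meet, even with the per-worker/per-tile hierarchy used by the kernels, so the 7.6e-06 discrepancy looks like ordinary float rounding rather than a kernel defect. The sketch below is an editor-added toy demonstration, not code from the PR: it accumulates a dot product of that length naively in float (which loses far more than the device's hierarchical reduction) against a double reference, and the gap lands well above 1e-8; the array contents are arbitrary stand-ins, not BabelStream's evolved values.
```
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  const std::size_t n = 16185000;          // --arraysize used for the IPU run above
  std::vector<float> a(n, 0.1f), b(n, 0.2f);

  float fsum = 0.0f;                       // single-precision accumulation
  double dsum = 0.0;                       // double-precision reference
  for (std::size_t i = 0; i < n; ++i)
  {
    fsum += a[i] * b[i];
    dsum += double(a[i]) * double(b[i]);
  }
  std::printf("float sum %.9g, double sum %.9g, |diff| %.3g\n",
              fsum, dsum, std::fabs(double(fsum) - dsum));
  return 0;
}
```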