diff --git a/.gitignore b/.gitignore index a9748b3e..ff06707b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,13 +7,23 @@ raja-stream kokkos-stream sycl-stream hip-stream +poplar-stream *.o *.bc *.sycl +*.gc *.tar *.gz .DS_Store Makefile + +vars.capnp +graph.json +execution.json +*.csv +arhive.a +*.gp +popops-stream diff --git a/Poplar.make b/Poplar.make new file mode 100644 index 00000000..760006ba --- /dev/null +++ b/Poplar.make @@ -0,0 +1,39 @@ +ifndef COMPILER +define compiler_help +Set COMPILER to change flags (defaulting to GNU). +Available compilers are: + GNU + +endef +$(info $(compiler_help)) +COMPILER=GNU +endif + +COMPILER_GNU = g++ +CXX = $(COMPILER_$(COMPILER)) + +FLAGS_ = -O3 -std=c++17 -Wall +FLAGS_GNU = -O3 -std=c++17 -Wall +CXXFLAGS=$(FLAGS_$(COMPILER)) + +PLATFORM = $(shell uname -s) +LIBS = -lpoplar -lpopops -lpoputil + + +.PHONY: all +all: poplar-stream popops-stream poplar-stream-vectorised + +.PHONY: clean +clean: + rm -f poplar-stream popops-stream poplar-stream-vectorised PoplarKernels.gc + +poplar-stream: main.cpp PoplarStream.cpp + $(CXX) $(CXXFLAGS) -DPOPLAR $^ $(EXTRA_FLAGS) $(LIBS) -o $@ + +poplar-stream-vectorised: main.cpp PoplarStream.cpp + $(CXX) $(CXXFLAGS) -DPOPLAR -DVECTORISED=true $^ $(EXTRA_FLAGS) $(LIBS) -o $@ + +popops-stream: main.cpp PopopsStream.cpp + $(CXX) $(CXXFLAGS) -DPOPLAR $^ $(EXTRA_FLAGS) $(LIBS) -o $@ + + diff --git a/PoplarKernels.cpp b/PoplarKernels.cpp new file mode 100644 index 00000000..46ff75e7 --- /dev/null +++ b/PoplarKernels.cpp @@ -0,0 +1,340 @@ +#include +#include +#include +using namespace poplar; + +#define UNROLL 8 + +template +class InitKernel : public Vertex +{ + +public: + Output> a, b, c; + unsigned size; + Input initA, initB, initC; + + bool compute() + { + for (auto i = 0u; i < size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } + return true; + } +}; + +template class InitKernel; +template class InitKernel; +template class InitKernel; +template class InitKernel; + +template +class CopyKernel : public Vertex +{ + +public: + Input> a; + Output> c; + unsigned size; + + inline void doCopy(const V *__restrict src, V *__restrict dst, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + dst[i] = src[i]; + } + } + + bool compute() + { + doCopy(reinterpret_cast(&a[0]), reinterpret_cast(&c[0]), size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class CopyKernel; +template class CopyKernel; +template class CopyKernel; +template class CopyKernel; + +template +class MulKernel : public Vertex +{ + +public: + Input> c; + Output> b; + unsigned size; + float alpha; + + inline void doMul(const V *__restrict src, V *__restrict dst, const float alpha, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + dst[i] = alpha * src[i]; + } + } + + bool compute() + { + doMul(reinterpret_cast(&c[0]), reinterpret_cast(&b[0]), alpha, size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class MulKernel; +template class MulKernel; +template class MulKernel; + +template <> +class MulKernel : public Vertex +{ + +public: + Input> c; + Output> b; + unsigned size; + float alpha; + + inline void doMul(const half4 *__restrict src, half4 *__restrict dst, const float alpha, const unsigned size) + { + half _alpha = (half)alpha; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + dst[i] = _alpha * src[i]; + } + } + + bool compute() + { + doMul(reinterpret_cast(&c[0]), reinterpret_cast(&b[0]), alpha, size / 4); + return true; + } +}; + 
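// -----------------------------------------------------------------------------------
// Editor's sketch (not part of PoplarKernels.cpp): the vectorised kernels above all use
// the same trick -- reinterpret the scalar array as a wider vector type V (float2 or
// half4 on the IPU) and loop over size * sizeof(T) / sizeof(V) elements, so the compiler
// can emit wider 64-bit loads and stores. A host-side analogue is shown below, with a
// hand-rolled Vec2 standing in for the IPU's float2; all names here are illustrative
// assumptions, not code from the PR.
struct Vec2 { float x, y; };                        // stand-in for the IPU float2 type

inline void copyAsVectors(const float *src, float *dst, unsigned size)
{
    // size counts floats; each Vec2 carries two of them, so the trip count is halved
    const Vec2 *vsrc = reinterpret_cast<const Vec2 *>(src);
    Vec2 *vdst = reinterpret_cast<Vec2 *>(dst);
    for (auto i = 0u; i < size * sizeof(float) / sizeof(Vec2); i++)
        vdst[i] = vsrc[i];
}
// -----------------------------------------------------------------------------------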
+template +class AddKernel : public Vertex +{ + +public: + Input> b; + Input> a; + Output> c; + unsigned size; + + inline void doAdd(const V *__restrict a, const V *__restrict b, V *__restrict c, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + c[i] = a[i] + b[i]; + } + } + + bool compute() + { + doAdd(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), reinterpret_cast(&c[0]), size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class AddKernel; +template class AddKernel; +template class AddKernel; +template class AddKernel; + +template +class TriadKernel : public Vertex +{ + +public: + Input> b; + Input> c; + Output> a; + float alpha; + unsigned size; + + inline void doTriad(V *__restrict a, const V *__restrict b, const V *__restrict c, const float alpha, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + a[i] = b[i] + alpha * c[i]; + } + } + + bool compute() + { + doTriad(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), reinterpret_cast(&c[0]), alpha, size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class TriadKernel; +template class TriadKernel; +template <> +class TriadKernel : public Vertex +{ + +public: + Input> b; + Input> c; + Output> a; + float alpha; + unsigned size; + + inline void doTriad(float2 *__restrict a, const float2 *__restrict b, const float2 *__restrict c, const float alpha, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + a[i] = b[i] + alpha * c[i]; + } + } + + bool compute() + { + doTriad(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), reinterpret_cast(&c[0]), alpha, size / 2); + return true; + } +}; +template <> +class TriadKernel : public Vertex +{ + +public: + Input> b; + Input> c; + Output> a; + float alpha; + unsigned size; + + inline void doTriad(half4 *__restrict a, const half4 *__restrict b, const half4 *__restrict c, const float alpha, const unsigned size) + { +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + a[i] = b[i] + (half)alpha * c[i]; + } + } + + bool compute() + { + doTriad(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), reinterpret_cast(&c[0]), alpha, size / 4); + return true; + } +}; + +template +class DotProdKernel : public Vertex +{ + +public: + Input> a; + Input> b; + Output sum; + unsigned size; + + inline auto doDotProd(const V *__restrict a, const V *__restrict b, const unsigned size) -> float + { + float tmp = 0.f; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + tmp += a[i] * b[i]; + } + return tmp; + } + + bool compute() + { + *sum = doDotProd(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), size * sizeof(T) / sizeof(V)); + return true; + } +}; + +template class DotProdKernel; +template class DotProdKernel; +template <> +class DotProdKernel : public Vertex +{ + +public: + Input> a; + Input> b; + Output sum; + unsigned size; + + inline auto doDotProd(const float2 *__restrict a, const float2 *__restrict b, const unsigned size) -> float + { + float2 tmp = {0.f, 0.f}; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + tmp += a[i] * b[i]; + } + return (float)tmp[0] + tmp[1]; + } + bool compute() + { + *sum = doDotProd(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), size / 2); + return true; + } +}; +template <> +class DotProdKernel : public Vertex +{ + +public: + Input> a; + Input> b; + Output sum; + unsigned size; + inline auto doDotProd(const half4 *__restrict a, const half4 *__restrict b, const unsigned size) -> float + { + half4 tmp = {0, 0, 0, 
0}; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + tmp += a[i] * b[i]; + } + return (float)tmp[0] + tmp[1] + tmp[2] + tmp[3]; + } + + bool compute() + { + *sum = doDotProd(reinterpret_cast(&a[0]), reinterpret_cast(&b[0]), size) / 4; + return true; + } +}; + +class ReduceSum : public Vertex +{ +public: + Input> partialSums; + Output sum; + unsigned size; + + inline auto doReduceSum(const float *__restrict partials, const unsigned size) -> float + { + float tmp = 0.f; +#pragma unroll UNROLL + for (auto i = 0u; i < size; i++) + { + tmp += partials[i]; + } + return tmp; + } + + bool compute() + { + *sum = doReduceSum(reinterpret_cast(&partialSums[0]), size); + return true; + } +}; diff --git a/PoplarStream.cpp b/PoplarStream.cpp new file mode 100644 index 00000000..3bdc0edf --- /dev/null +++ b/PoplarStream.cpp @@ -0,0 +1,645 @@ + +// Copyright (c) 2015-16 Thorben Louw, Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "PoplarStream.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace poplar; +using namespace poplar::program; + +enum Programs +{ + INIT_PROGRAM, + COPY_PROGRAM, + MUL_PROGRAM, + ADD_PROGRAM, + TRIAD_PROGRAM, + DOT_PROGRAM, + STREAM_BACK_TO_HOST_PROGRAM +}; + +#ifdef DEBUG +const OptionFlags POPLAR_ENGINE_OPTIONS{ + {"target.saveArchive", "archive.a"}, + {"debug.instrument", "true"}, + {"debug.instrumentCompute", "true"}, + {"debug.loweredVarDumpFile", "vars.capnp"}, + {"debug.instrumentControlFlow", "true"}, + {"debug.computeInstrumentationLevel", "tile"}}; +#else +const OptionFlags POPLAR_ENGINE_OPTIONS{ + {"debug.instrument", "false"}}; +#endif + + +// This is due to https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion, +// and is an ultra-hacky way to convert the initial array values to half without having a full half library +// on the host +uint16_t toHalf(float val) +{ + uint32_t x = static_cast(val); + return ((x >> 16) & 0x8000) | ((((x & 0x7f800000) - 0x38000000) >> 13) & 0x7c00) | ((x >> 13) & 0x03ff); +} + +void captureProfileInfo(Engine &engine) +{ + std::ofstream graphOfs; + graphOfs.open("graph.json", std::ofstream::out | std::ofstream::trunc); + + std::ofstream executionOfs; + executionOfs.open("execution.json", std::ofstream::out | std::ofstream::trunc); + + poplar::serializeToJSON(graphOfs, engine.getGraphProfile(), false); + poplar::serializeToJSON(executionOfs, engine.getExecutionProfile(), false); + + graphOfs.close(); + executionOfs.close(); +} + +std::optional getIpuDevice(const unsigned deviceType = 0) +{ + const auto validOptions = std::array{0, 1, 2, 4, 8, 16}; + const bool isValid = std::find(validOptions.begin(), validOptions.end(), deviceType) != validOptions.end(); + if (isValid) + { + if (deviceType == 0) + { // Use the CPUDevice + // Note that as of Poplar v1.1.11, this ony returns a useless device with 1 tile and 256Kb of memory! 
+ return std::optional(Device::createCPUDevice()); + } + else + { + // Target an IPUDevice + DeviceManager manager = DeviceManager::createDeviceManager(); + Device device; + for (auto &hwDevice : manager.getDevices(poplar::TargetType::IPU, deviceType)) + { + device = std::move(hwDevice); + if (device.attach()) + { + std::cout << "Attached to IPU " << device.getId() << std::endl; + return std::optional(std::move(device)); + } + } + } + } + return std::nullopt; +} + +void listDevices() +{ + + DeviceManager manager = DeviceManager::createDeviceManager(); + + std::cout << 0 << ": " + << "CPUDevice" << std::endl; + + // Attempt to attach to a single IPU: + Device device; + auto multiIpu = std::array{2, 4, 8, 16}; + for (auto i : multiIpu) + { + if (auto devices = manager.getDevices(poplar::TargetType::IPU, i); !devices.empty()) + { + std::cout << i << ": " + << "IPUDevice" << std::endl; + } + } +} + +class PoplarStreamUtil +{ +private: + const unsigned numTiles; + const unsigned numWorkersPerTile; + const unsigned arraySize; + + const unsigned totalTilesThatWillBeUsed; + const unsigned totalWorkersThatWillBeUsed; + + std::map tensors = {}; + std::vector programs = {}; + std::map dataStreams = {}; + + [[nodiscard]] unsigned numItemsForTileAndWorker(const unsigned tileNum, const unsigned workerNum) const + { + if (arraySize <= numTiles) + { // Just use 1 item per worker + return 1; + } + else + { // Balance as fairly as possible + auto extra = tileNum < arraySize % numTiles; + auto totalForTile = unsigned(arraySize / numTiles) + extra; + auto extraForThread = workerNum < totalForTile % numWorkersPerTile; + return unsigned(totalForTile / numWorkersPerTile) + extraForThread; + } + } + + [[nodiscard]] unsigned numItemsForTile(const unsigned tileNum) const + { + if (arraySize <= numTiles) + { // We'll just use one item per worker + return numWorkersUsedOnTile(tileNum); + } + else + { // Now we must balance as fairly as possible + auto extra = tileNum < arraySize % numTiles; + return unsigned(arraySize / numTiles) + extra; + } + } + + [[nodiscard]] unsigned numWorkersUsedOnTile(const unsigned tileNum) const + { + return std::min(numWorkersPerTile, totalWorkersThatWillBeUsed - tileNum * numWorkersPerTile); + } + + // This could be done faster with a Stream copy, but that creates a very large FIFO + // which limits the array size even further + Program initProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("init"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + cs, + "InitKernel" + templateStr, + { + {"a", tensors["a"].slice({idx}, {idx + numItems}).flatten()}, + {"b", tensors["b"].slice({idx}, {idx + numItems}).flatten()}, + {"c", tensors["c"].slice({idx}, {idx + numItems}).flatten()}, + {"initA", tensors["initA"]}, + {"initB", tensors["initB"]}, + {"initC", tensors["initC"]}, + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program copyProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("copy"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems 
= numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex(cs, + "CopyKernel" + templateStr, + {{"a", tensors["a"].slice({idx}, {idx + numItems}).flatten()}, + {"c", tensors["c"].slice({idx}, {idx + numItems}).flatten()}}); + graph.setInitialValue(v["size"], unsigned(numItems)); + + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program mulProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("mul"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + cs, + "MulKernel" + templateStr, + { + {"b", tensors["b"].slice({idx}, {idx + numItems})}, + {"c", tensors["c"].slice({idx}, {idx + numItems})}, + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setInitialValue(v["alpha"], float(startScalar)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program addProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("add"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + cs, + "AddKernel" + templateStr, + { + {"b", tensors["b"].slice({idx}, {idx + numItems})}, + {"a", tensors["a"].slice({idx}, {idx + numItems})}, + {"c", tensors["c"].slice({idx}, {idx + numItems})}, + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program triadProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet cs = graph.addComputeSet("triad"); + + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + cs, + "TriadKernel" + templateStr, + { + {"b", tensors["b"].slice({idx}, {idx + numItems})}, + {"a", tensors["a"].slice({idx}, {idx + numItems})}, + {"c", tensors["c"].slice({idx}, {idx + numItems})}, + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setInitialValue(v["alpha"], float(startScalar)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + return Execute(cs); + } + + Program dotProdProgram(const std::string &templateStr, Graph &graph) + { + ComputeSet dotCs = graph.addComputeSet("dot"); + unsigned idx = 0u; + for (unsigned w = 0; w < totalWorkersThatWillBeUsed; w++) + { + auto tile = w / numWorkersPerTile; + auto worker = w % numWorkersPerTile; + auto numItems = numItemsForTileAndWorker(tile, worker); + const auto v = graph.addVertex( + dotCs, + "DotProdKernel" + templateStr, + {{"b", tensors["b"].slice({idx}, {idx + numItems})}, + {"a", tensors["a"].slice({idx}, {idx + numItems})}, + {"sum", tensors["partialSumsPerWorker"][tile * numWorkersPerTile + + worker]} + + }); + graph.setInitialValue(v["size"], unsigned(numItems)); + graph.setCycleEstimate(v, numItems); + graph.setTileMapping(v, tile); + idx += numItems; + } + + ComputeSet 
partialReduxCs = graph.addComputeSet("reductionPerTile"); + idx = 0u; + for (unsigned tile = 0; tile < totalTilesThatWillBeUsed; tile++) + { + auto workersUsedOnThisTile = numWorkersUsedOnTile(tile); + const auto v = graph.addVertex( + partialReduxCs, + "ReduceSum", + { + {"partialSums", tensors["partialSumsPerWorker"].slice( + {idx}, {idx + workersUsedOnThisTile})}, + {"sum", tensors["partialSumsPerTile"][tile]}, + }); + graph.setInitialValue(v["size"], unsigned(workersUsedOnThisTile)); + + graph.setCycleEstimate(v, workersUsedOnThisTile); + graph.setTileMapping(v, tile); + idx += workersUsedOnThisTile; + } + + ComputeSet finalReduxCs = graph.addComputeSet("finalReduction"); + const auto v = graph.addVertex( + finalReduxCs, + "ReduceSum", + {{"partialSums", tensors["partialSumsPerTile"]}, + {"sum", tensors["sum"]}}); + graph.setInitialValue(v["size"], unsigned(totalTilesThatWillBeUsed)); + + graph.setCycleEstimate(v, numTiles); + graph.setTileMapping(v, numTiles - 1); + + return Sequence( + Execute(dotCs), + Execute(partialReduxCs), + Execute(finalReduxCs)); + } + + void createAndLayOutTensors(poplar::Type type, Graph &graph) + { + tensors["initA"] = graph.addVariable(FLOAT, {}, "initA"); + tensors["initB"] = graph.addVariable(FLOAT, {}, "initB"); + tensors["initC"] = graph.addVariable(FLOAT, {}, "initC"); + + tensors["a"] = graph.addVariable(type, {arraySize}, "a"); + tensors["b"] = graph.addVariable(type, {arraySize}, "b"); + tensors["c"] = graph.addVariable(type, {arraySize}, "c"); + tensors["sum"] = graph.addVariable(FLOAT, {}, "sum"); + tensors["partialSumsPerWorker"] = graph.addVariable(FLOAT, {totalWorkersThatWillBeUsed}, + "partialSumsPerWorker"); + tensors["partialSumsPerTile"] = graph.addVariable(FLOAT, {totalTilesThatWillBeUsed}, "partialSumsPerTile"); + graph.createHostRead("sum", tensors["sum"]); + graph.createHostWrite("initA", tensors["initA"]); + graph.createHostWrite("initB", tensors["initB"]); + graph.createHostWrite("initC", tensors["initC"]); + + auto idx = 0u; + for (auto tile = 0u; tile < totalTilesThatWillBeUsed; tile++) + { + auto mapMem = [=]() -> unsigned { +#ifdef MEM_ON_NEXT_TILE + return (tile + 1) % totalTilesThatWillBeUsed; +#else + return tile; +#endif + }; + + if (auto numItems = numItemsForTile(tile); numItems > 0) + { + graph.setTileMapping(tensors["a"].slice(idx, idx + numItems), mapMem()); + graph.setTileMapping(tensors["b"].slice(idx, idx + numItems), mapMem()); + graph.setTileMapping(tensors["c"].slice(idx, idx + numItems), mapMem()); + idx += numItems; + } + + graph.setTileMapping(tensors["partialSumsPerWorker"].slice( + {tile * numWorkersPerTile}, {tile * numWorkersPerTile + numWorkersUsedOnTile(tile)}), + mapMem()); + graph.setTileMapping(tensors["partialSumsPerTile"].slice(tile, tile + 1), mapMem()); + } + graph.setTileMapping(tensors["sum"], numTiles - 1); + graph.setTileMapping(tensors["initA"], 0); + graph.setTileMapping(tensors["initB"], 0); + graph.setTileMapping(tensors["initC"], 0); + } + + void createDataStreams(poplar::Type type, Graph &graph) + { + // dataStreams["in_a"] = graph.addHostToDeviceFIFO("in_a", type, arraySize); + // dataStreams["in_b"] = graph.addHostToDeviceFIFO("in_b", type, arraySize); + // dataStreams["in_c"] = graph.addHostToDeviceFIFO("in_c", type, arraySize); + + dataStreams["out_a"] = graph.addDeviceToHostFIFO("out_a", type, arraySize); + dataStreams["out_b"] = graph.addDeviceToHostFIFO("out_b", type, arraySize); + dataStreams["out_c"] = graph.addDeviceToHostFIFO("out_c", type, arraySize); + } + + [[nodiscard]] 
const unsigned numWorkersNeeded(unsigned arraySize, unsigned numTiles, unsigned numWorkersPerTile) const + { + return std::min( + arraySize, + std::min( + numTiles * numWorkersPerTile, + unsigned(std::ceil(arraySize / (numWorkersPerTile * 1.0))) * 6)); + } + + [[nodiscard]] const unsigned numTilesNeeded(unsigned arraySize, unsigned numTiles, unsigned numWorkersPerTile) const + { + return std::min(numTiles, unsigned(std::ceil(arraySize / (numWorkersPerTile * 1.0)))); + } + +public: + PoplarStreamUtil(unsigned numTiles, unsigned numWorkersPerTile, unsigned arraySize) : numTiles(numTiles), + numWorkersPerTile(numWorkersPerTile), + arraySize(arraySize), + totalTilesThatWillBeUsed(numTilesNeeded(arraySize, numTiles, numWorkersPerTile)), + totalWorkersThatWillBeUsed(numWorkersNeeded(arraySize, numTiles, numWorkersPerTile)) + { + } + + std::unique_ptr prepareEngine(const Device &device, + const Graph &graph, + void *a, + void *b, + void *c) + { + assert(!programs.empty()); + auto engine = std::make_unique(graph, programs, POPLAR_ENGINE_OPTIONS); + + engine->connectStream("out_a", a); + engine->connectStream("out_b", b); + engine->connectStream("out_c", c); + + engine->load(device); + + return std::move(engine); + } + + void buildComputeGraph(poplar::Type type, Graph &graph) + { + // Set up data streams to copy data in and out of graph + + auto typeStr1 = type == FLOAT ? "" : "half4>"; +#else + auto typeStr2 = type == FLOAT ? "float>" : "half>"; +#endif + + auto typeStr = std::string(typeStr1) + std::string(typeStr2); + + createDataStreams(type, graph); + + createAndLayOutTensors(type, graph); + + auto InitProg = initProgram(typeStr, graph); + + auto CopyProg = copyProgram(typeStr, graph); + auto MulProg = mulProgram(typeStr, graph); + auto AddProg = addProgram(typeStr, graph); + auto TriadProg = triadProgram(typeStr, graph); + auto DotProg = dotProdProgram(typeStr, graph); + + auto StreamToHostProg = Sequence(Copy(tensors["a"], dataStreams["out_a"]), + Copy(tensors["b"], dataStreams["out_b"]), + Copy(tensors["c"], dataStreams["out_c"])); + + programs = {InitProg, CopyProg, MulProg, AddProg, TriadProg, DotProg, + StreamToHostProg}; + } +}; + +template +PoplarStream::PoplarStream(const unsigned int arraySize, const int device_num, const bool halfPrecision) : arraySize(arraySize), + halfPrecision(halfPrecision), + a(std::unique_ptr(new T[arraySize]())), + b(std::unique_ptr(new T[arraySize]())), + c(std::unique_ptr(new T[arraySize]())) +{ + + auto device = getIpuDevice(device_num); + + if (!device.has_value()) + { + throw std::runtime_error("Could not allocate IPU device"); + } + target = device->getTarget(); + + Graph graph(device.value()); + const auto numTiles = graph.getTarget().getNumTiles(); + const auto numWorkers = graph.getTarget().getNumWorkerContexts(); + const auto maxBytesPerTile = graph.getTarget().getBytesPerTile(); + const auto clockFrequency = graph.getTarget().getTileClockFrequency(); + + const auto maxArraySize = ((double)numTiles) * maxBytesPerTile / 1024.0 / 1024.0 / 3; + + std::cout << "Using IPU with " << numTiles << " tiles, each with " << numWorkers + << " workers and " << maxBytesPerTile / 1024 << "KB of memory per tile, and clock frequency " + << (int)clockFrequency / 1000 / 1000 << " MHz. 
Maximum array size will be slightly less than " + << std::fixed << std::setprecision(2) << std::floor(maxArraySize) << " MB" + << std::endl; + + auto util = PoplarStreamUtil(numTiles, numWorkers, arraySize); + + graph.addCodelets("PoplarKernels.cpp", CodeletFileType::Auto, "-O3"); + + if (sizeof(T) > sizeof(float)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + + // Check buffers fit on the device + size_t sizeT = (sizeof(T) == sizeof(float) && halfPrecision) ? sizeof(T) / 2 : sizeof(T); + unsigned long maxbuffer = ((unsigned long)numTiles) * maxBytesPerTile; + unsigned long totalmem = ((unsigned long)numTiles) * maxBytesPerTile; + if (maxbuffer < sizeT * ((unsigned long)arraySize)) + throw std::runtime_error("Device cannot allocate a buffer big enough"); + if (totalmem < 3L * sizeT * arraySize) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + util.buildComputeGraph(halfPrecision ? HALF : FLOAT, graph); + engine = util.prepareEngine(device.value(), graph, a.get(), b.get(), c.get()); +} + +template +PoplarStream::~PoplarStream() = default; + +template +void PoplarStream::copy() +{ + engine->run(COPY_PROGRAM); +} + +template +void PoplarStream::mul() +{ + engine->run(MUL_PROGRAM); +} + +template +void PoplarStream::add() +{ + engine->run(ADD_PROGRAM); +} + +template +void PoplarStream::triad() +{ + engine->run(TRIAD_PROGRAM); +} + +template +T PoplarStream::dot() +{ + engine->run(DOT_PROGRAM); + engine->readTensor("sum", &sum); + return sum; +} + +template +void PoplarStream::init_arrays(T initA, T initB, T initC) +{ + + if (halfPrecision) + { + const uint32_t fakeA = toHalf(initA); + const uint32_t fakeB = toHalf(initB); + const uint32_t fakeC = toHalf(initC); + + engine->writeTensor("initA", &fakeA); + engine->writeTensor("initB", &fakeB); + engine->writeTensor("initC", &fakeC); + } + else + { + engine->writeTensor("initA", &initA); + engine->writeTensor("initB", &initB); + engine->writeTensor("initC", &initC); + } + engine->run(INIT_PROGRAM); +} + + + +template<> void PoplarStream::copyArrays( const double *src, double *dst) { + std::memcpy(dst, src, arraySize * sizeof(double)); +} + +template<> void PoplarStream::copyArrays(const float *src, float *dst) { + copyDeviceHalfToFloat(target, src, dst, arraySize); +} + + + +template +void PoplarStream::read_arrays(std::vector &h_a, std::vector &h_b, std::vector &h_c) +{ + + engine->run(STREAM_BACK_TO_HOST_PROGRAM); + copyArrays(a.get(), h_a.data()); + copyArrays(b.get(), h_b.data()); + copyArrays(c.get(), h_c.data()); + + +#ifdef DEBUG + captureProfileInfo(*engine); + engine->printProfileSummary(std::cout, + OptionFlags{{"showExecutionSteps", "true"}}); +#endif +} + +template class PoplarStream; + +template class PoplarStream; // Not usable, but needs to exist for stream.cpp diff --git a/PoplarStream.h b/PoplarStream.h new file mode 100644 index 00000000..0db6805b --- /dev/null +++ b/PoplarStream.h @@ -0,0 +1,58 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "Poplar" + + +using namespace poplar::program; + +template +class PoplarStream : public Stream { + +protected: + unsigned int arraySize; + const bool halfPrecision; + T sum = 0; + std::unique_ptr engine; + poplar::Target target; + 
std::unique_ptr a; + std::unique_ptr b; + std::unique_ptr c; + +public: + + PoplarStream(const unsigned int, const int, const bool halfPrecision); + + ~PoplarStream(); + + virtual void copy() override; + + virtual void add() override; + + virtual void mul() override; + + virtual void triad() override; + + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + + virtual void read_arrays(std::vector &a, std::vector &b, std::vector &c) override; + virtual void copyArrays(const T *src, T *dst); + +}; + diff --git a/PopopsStream.cpp b/PopopsStream.cpp new file mode 100644 index 00000000..cacf3144 --- /dev/null +++ b/PopopsStream.cpp @@ -0,0 +1,429 @@ + +// Copyright (c) 2015-16 Thorben Louw, Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "PoplarStream.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace poplar; +using namespace poplar::program; +using namespace popops; + +enum Programs +{ + INIT_PROGRAM, + COPY_PROGRAM, + MUL_PROGRAM, + ADD_PROGRAM, + TRIAD_PROGRAM, + DOT_PROGRAM, + STREAM_BACK_TO_HOST_PROGRAM +}; + +// const OptionFlags POPLAR_ENGINE_OPTIONS{ +// {"target.saveArchive", "archive.a"}, +// {"debug.instrument", "true"}, +// {"debug.instrumentCompute", "true"}, +// {"debug.loweredVarDumpFile", "vars.capnp"}, +// {"debug.instrumentControlFlow", "true"}, +// {"debug.computeInstrumentationLevel", "tile"} +// }; + +const OptionFlags POPLAR_ENGINE_OPTIONS{ + {"debug.instrument", "false"}}; + +// This is due to https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion, +// and is an ultra-hacky way to convert the initial array values to half without having a full half library +// on the host +uint16_t toHalf(float val) +{ + uint32_t x = static_cast(val); + return ((x >> 16) & 0x8000) | ((((x & 0x7f800000) - 0x38000000) >> 13) & 0x7c00) | ((x >> 13) & 0x03ff); +} + +void captureProfileInfo(Engine &engine) +{ + std::ofstream graphOfs; + graphOfs.open("graph.json", std::ofstream::out | std::ofstream::trunc); + + std::ofstream executionOfs; + executionOfs.open("execution.json", std::ofstream::out | std::ofstream::trunc); + + poplar::serializeToJSON(graphOfs, engine.getGraphProfile(), false); + poplar::serializeToJSON(executionOfs, engine.getExecutionProfile(), false); + + graphOfs.close(); + executionOfs.close(); +} + +std::optional getIpuDevice(const unsigned deviceType = 0) +{ + const auto validOptions = std::array{0, 1, 2, 4, 8, 16}; + const bool isValid = std::find(validOptions.begin(), validOptions.end(), deviceType) != validOptions.end(); + if (isValid) + { + if (deviceType == 0) + { // Use the CPUDevice + // Note that as of Poplar v1.1.11, this ony returns a useless device with 1 tile and 256Kb of memory! 
+ return std::optional(Device::createCPUDevice()); + } + else + { + // Target an IPUDevice + DeviceManager manager = DeviceManager::createDeviceManager(); + Device device; + for (auto &hwDevice : manager.getDevices(poplar::TargetType::IPU, deviceType)) + { + device = std::move(hwDevice); + if (device.attach()) + { + std::cout << "Attached to IPU " << device.getId() << std::endl; + return std::optional(std::move(device)); + } + } + } + } + return std::nullopt; +} + +void listDevices() +{ + + DeviceManager manager = DeviceManager::createDeviceManager(); + + std::cout << 0 << ": " + << "CPUDevice" << std::endl; + + // Attempt to attach to a single IPU: + Device device; + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 1); !devices.empty()) + { + std::cout << 1 << ": " + << "IPUDevice" << std::endl; + } + + // Attempt to attach to 2 IPUs: + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 2); !devices.empty()) + { + std::cout << 2 << ": " + << "2x IPUDevices" << std::endl; + } + + // Attempt to attach to 4 IPUs: + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 4); !devices.empty()) + { + std::cout << 4 << ": " + << "4x IPUDevices" << std::endl; + } + + // Attempt to attach to 8 IPUs: + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 8); !devices.empty()) + { + std::cout << 8 << ": " + << "8x IPUDevices" << std::endl; + } + + // Attempt to attach to 16 IPUs: + if (auto devices = manager.getDevices(poplar::TargetType::IPU, 16); !devices.empty()) + { + std::cout << 16 << ": " + << "16x IPUDevices" << std::endl; + } +} + +class PoplarStreamUtil +{ +private: + std::map tensors = {}; + std::vector programs = {}; + std::map dataStreams = {}; + + // This could be done faster with a Stream copy, but that creates a very large FIFO + // which limits the array size even further + Program initProgram(Graph &graph, size_t arraySize) + { + Sequence(s); + tensors["a"] = poputil::duplicate(graph, tensors["initA"].reshape({1}).broadcast(arraySize, 0), s, "a"); + tensors["b"] = poputil::duplicate(graph, tensors["initB"].reshape({1}).broadcast(arraySize, 0), s, "b"); + tensors["c"] = poputil::duplicate(graph, tensors["initC"].reshape({1}).broadcast(arraySize, 0), s, "c"); + poputil::mapTensorLinearly(graph, tensors["a"]); + poputil::mapTensorLinearly(graph, tensors["b"]); + poputil::mapTensorLinearly(graph, tensors["c"]); + + // s.add(PrintTensor("a", tensors["a"])); + // s.add(PrintTensor("b", tensors["b"])); + // s.add(PrintTensor("c", tensors["c"])); + + return s; + } + + // c = a + Program copyProgram(Graph &graph) + { + auto s = Sequence(); + s.add(Copy(tensors["a"], tensors["c"])); + // s.add(PrintTensor("c (=a)", tensors["c"])); + return s; + } + + // b[i] = x * c[i]; + Program mulProgram(Graph &graph) + { + auto s = Sequence(); + s.add(Copy(popops::mul(graph, tensors["c"], tensors["alpha"], s, "Mul"), tensors["b"])); + // s.add(PrintTensor("b (=xc)", tensors["b"])); + return s; + } + + // c = a + b + Program addProgram(Graph &graph) + { + auto s = Sequence(); + s.add(Copy(popops::add(graph, tensors["a"], tensors["b"], s, "Add"), tensors["c"])); + // s.add(PrintTensor("c (=a+b)", tensors["c"])); + return s; + } + + // a = b + xc + Program triadProgram(Graph &graph) + { + auto s = Sequence(); + s.add(Copy(tensors["b"], tensors["a"])); + popops::scaledAddTo(graph, tensors["a"], tensors["c"], tensors["alpha"], s, "Triad", {{"optimizeForSpeed", "true"}}); + // s.add(PrintTensor("a (=b + xc)", tensors["a"])); + return s; + } + + // sum = reduce+(a * b) 
+ Program dotProdProgram(Graph &graph) + { + Sequence s; + + popops::reduceWithOutput(graph, + popops::mul(graph, tensors["a"], tensors["b"], s, "a*b"), tensors["sum"], {0}, + {popops::Operation::ADD}, + s, + "reduce+"); + // s.add(PrintTensor("reduce+(a * b)", tensors["sum"])); + + return s; + } + + void createAndLayOutTensors(poplar::Type type, Graph &graph) + { + tensors["initA"] = graph.addVariable(type, {}, "initA"); + tensors["initB"] = graph.addVariable(type, {}, "initB"); + tensors["initC"] = graph.addVariable(type, {}, "initC"); + + tensors["alpha"] = graph.addConstant(type, {}, startScalar, "alpha"); + tensors["sum"] = graph.addVariable(FLOAT, {}, "sum"); + + graph.createHostRead("sum", tensors["sum"]); + graph.createHostWrite("initA", tensors["initA"]); + graph.createHostWrite("initB", tensors["initB"]); + graph.createHostWrite("initC", tensors["initC"]); + + graph.setTileMapping(tensors["sum"], 4); + graph.setTileMapping(tensors["initA"], 0); + graph.setTileMapping(tensors["initB"], 1); + graph.setTileMapping(tensors["initC"], 2); + graph.setTileMapping(tensors["alpha"], 3); + } + + void createDataStreams(poplar::Type type, Graph &graph, size_t arraySize) + { + dataStreams["out_a"] = graph.addDeviceToHostFIFO("out_a", type, arraySize); + dataStreams["out_b"] = graph.addDeviceToHostFIFO("out_b", type, arraySize); + dataStreams["out_c"] = graph.addDeviceToHostFIFO("out_c", type, arraySize); + } + +public: + std::unique_ptr prepareEngine(const Device &device, + const Graph &graph, + void *a, + void *b, + void *c) + { + assert(!programs.empty()); + auto engine = std::make_unique(graph, programs, POPLAR_ENGINE_OPTIONS); + + engine->connectStream("out_a", a); + engine->connectStream("out_b", b); + engine->connectStream("out_c", c); + + engine->load(device); + + return std::move(engine); + } + + void buildComputeGraph(poplar::Type type, Graph &graph, size_t arraySize) + { + + createDataStreams(type, graph, arraySize); + + createAndLayOutTensors(type, graph); + + auto InitProg = initProgram(graph, arraySize); + + auto CopyProg = copyProgram(graph); + auto MulProg = mulProgram(graph); + auto AddProg = addProgram(graph); + auto TriadProg = triadProgram(graph); + auto DotProg = dotProdProgram(graph); + + auto StreamToHostProg = Sequence(Copy(tensors["a"], dataStreams["out_a"]), + Copy(tensors["b"], dataStreams["out_b"]), + Copy(tensors["c"], dataStreams["out_c"])); + + programs = {InitProg, CopyProg, MulProg, AddProg, TriadProg, DotProg, + StreamToHostProg}; + } +}; + +template +PoplarStream::PoplarStream(const unsigned int arraySize, const int device_num, const bool halfPrecision) : arraySize(arraySize), + halfPrecision(halfPrecision), + a(std::unique_ptr(new T[arraySize]())), + b(std::unique_ptr(new T[arraySize]())), + c(std::unique_ptr(new T[arraySize]())) +{ + + auto device = getIpuDevice(device_num); + + if (!device.has_value()) + { + throw std::runtime_error("Could not allocate IPU device"); + } + + Graph graph(device.value()); + const auto numTiles = graph.getTarget().getNumTiles(); + const auto numWorkers = graph.getTarget().getNumWorkerContexts(); + const auto maxBytesPerTile = graph.getTarget().getBytesPerTile(); + const auto clockFrequency = graph.getTarget().getTileClockFrequency(); + + const auto maxArraySize = ((double)numTiles) * maxBytesPerTile / 1024.0 / 1024.0 / 3; + + std::cout << "Using IPU with " << numTiles << " tiles, each with " << numWorkers + << " workers and " << maxBytesPerTile / 1024 << "KB of memory per tile, and clock frequency " + << (int)clockFrequency / 
1000 / 1000 << " MHz. Maximum array size will be slightly less than " + << std::fixed << std::setprecision(2) << std::floor(maxArraySize) << " MB" + << std::endl; + + auto util = PoplarStreamUtil(); + + graph.addCodelets("PoplarKernels.cpp"); + popops::addCodelets(graph); + + if (sizeof(T) > sizeof(float)) + { + throw std::runtime_error("Device does not support double precision, please use --float"); + } + + // Check buffers fit on the device + size_t sizeT = (sizeof(T) == sizeof(float) && halfPrecision) ? sizeof(T) / 2 : sizeof(T); + unsigned long maxbuffer = ((unsigned long)numTiles) * maxBytesPerTile; + unsigned long totalmem = ((unsigned long)numTiles) * maxBytesPerTile; + if (maxbuffer < sizeT * ((unsigned long)arraySize)) + throw std::runtime_error("Device cannot allocate a buffer big enough"); + if (totalmem < 3L * sizeT * arraySize) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + util.buildComputeGraph(halfPrecision ? HALF : FLOAT, graph, arraySize); + engine = util.prepareEngine(device.value(), graph, a.get(), b.get(), c.get()); +} + +template +PoplarStream::~PoplarStream() = default; + +template +void PoplarStream::copy() +{ + engine->run(COPY_PROGRAM); +} + +template +void PoplarStream::mul() +{ + engine->run(MUL_PROGRAM); +} + +template +void PoplarStream::add() +{ + engine->run(ADD_PROGRAM); +} + +template +void PoplarStream::triad() +{ + engine->run(TRIAD_PROGRAM); +} + +template +T PoplarStream::dot() +{ + engine->run(DOT_PROGRAM); + engine->readTensor("sum", &sum); + return sum; +} + +template +void PoplarStream::init_arrays(T initA, T initB, T initC) +{ + + if (halfPrecision) + { + const uint32_t fakeA = toHalf(initA); + const uint32_t fakeB = toHalf(initB); + const uint32_t fakeC = toHalf(initC); + + engine->writeTensor("initA", &fakeA); + engine->writeTensor("initB", &fakeB); + engine->writeTensor("initC", &fakeC); + } + else + { + engine->writeTensor("initA", &initA); + engine->writeTensor("initB", &initB); + engine->writeTensor("initC", &initC); + } + engine->run(INIT_PROGRAM); +} + +template +void PoplarStream::read_arrays(std::vector &h_a, std::vector &h_b, std::vector &h_c) +{ + + engine->run(STREAM_BACK_TO_HOST_PROGRAM); + + for (unsigned i = 0; i < arraySize; i++) + { + h_a[i] = a[i]; + h_b[i] = b[i]; + h_c[i] = c[i]; + } + + // captureProfileInfo(*engine); + // engine->printProfileSummary(std::cout, + // OptionFlags{{"showExecutionSteps", "true"}}); +} + +template class PoplarStream; + +template class PoplarStream; // Not usable, but needs to exist for stream.cpp diff --git a/README.md b/README.md index e3c98b72..cd9ddcef 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Currently implemented are: - Kokkos - RAJA - SYCL + - Graphcore's Poplar framework for the Graphcore IPU This code was previously called GPU-STREAM. @@ -82,6 +83,31 @@ For building with CUDA support, we use the following command. cmake .. -DCMAKE_INSTALL_PREFIX= -DRAJA_PTR="RAJA_USE_RESTRICT_ALIGNED_PTR" -DRAJA_ENABLE_CUDA=1 -DRAJA_ENABLE_TESTS=Off ``` + +Building and running for Poplar +------------------------------- +See the Developer Documentation at https://www.graphcore.ai/developer for how to install Poplar and set up +the environment variables for building. The latest version we used was v1.1.11. + +The IPU doesn't support doubles, so you must run with the --float option. +For a single IPU (with 1216 tiles), the largest array size we used was 16185000. 
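+That limit is roughly what the on-chip memory allows: 1216 tiles × 256 KB is about 311 MB in total,
+while three float arrays of 16185000 elements occupy about 3 × 64.7 MB ≈ 194 MB, leaving the
+remainder for code, stacks and the partial-sum tensors.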
+
+Run with
+```
+./poplar-stream --arraysize 16185000 --device 1 --float
+```
+
+You can run on multiple IPU targets (such as the 2 IPUs on a single C2 IPU Processor card) by selecting a device appropriately
+using the `--device` flag. The options are 2, 4, 8, or 16 for the respective number of IPUs.
+
+You can run the Poplar program on a CPU target, but as of v1.1.11 this only gives you 1 tile with 256 KB of
+memory. The largest array size we used was 21845.
+
+Run with
+```
+./poplar-stream --arraysize 21845 --device 0 --float
+```
+
 Results
 -------
diff --git a/main.cpp b/main.cpp
index f006f8ca..7e7fc596 100644
--- a/main.cpp
+++ b/main.cpp
@@ -37,6 +37,8 @@
 #include "SYCLStream.h"
 #elif defined(OMP)
 #include "OMPStream.h"
+#elif defined(POPLAR)
+#include "PoplarStream.h"
 #endif
 
 // Default size of 2^25
@@ -44,6 +46,7 @@ unsigned int ARRAY_SIZE = 33554432;
 unsigned int num_times = 100;
 unsigned int deviceIndex = 0;
 bool use_float = false;
+bool use_half = false;
 bool triad_only = false;
 bool output_as_csv = false;
 bool mibibytes = false;
@@ -53,10 +56,10 @@ template <typename T>
 void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>& b, std::vector<T>& c, T& sum);
 
 template <typename T>
-void run();
+void run(bool halfPrecision=false);
 
 template <typename T>
-void run_triad();
+void run_triad(bool halfPrecision=false);
 
 void parseArguments(int argc, char *argv[]);
 
@@ -76,15 +79,15 @@ int main(int argc, char *argv[])
   // TODO: Fix Kokkos to allow multiple template specializations
   if (triad_only)
   {
-    if (use_float)
-      run_triad<float>();
+    if (use_float || use_half)
+      run_triad<float>(use_half);
     else
       run_triad<double>();
   }
   else
   {
-    if (use_float)
-      run<float>();
+    if (use_float || use_half)
+      run<float>(use_half);
     else
       run<double>();
   }
@@ -92,17 +95,21 @@
 }
 
 template <typename T>
-void run()
+void run(bool halfPrecision)
 {
   std::streamsize ss = std::cout.precision();
+  size_t sizeT = (sizeof(T) == sizeof(float) && halfPrecision) ?
sizeof(T)/2 : sizeof(T); if (!output_as_csv) { std::cout << "Running kernels " << num_times << " times" << std::endl; - if (sizeof(T) == sizeof(float)) - std::cout << "Precision: float" << std::endl; - else + if (sizeof(T) == sizeof(float)) { + if (halfPrecision) + std::cout << "Precision: half (if implemented, else float)" << std::endl; + else + std::cout << "Precision: float" << std::endl; + } else std::cout << "Precision: double" << std::endl; @@ -110,19 +117,19 @@ void run() { // MiB = 2^20 std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeT*pow(2.0, -20.0) << " MiB" + << " (=" << ARRAY_SIZE*sizeT*pow(2.0, -30.0) << " GiB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeT*pow(2.0, -20.0) << " MiB" + << " (=" << 3.0*ARRAY_SIZE*sizeT*pow(2.0, -30.0) << " GiB)" << std::endl; } else { // MB = 10^6 std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeT*1.0E-6 << " MB" + << " (=" << ARRAY_SIZE*sizeT*1.0E-9 << " GB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeT*1.0E-6 << " MB" + << " (=" << 3.0*ARRAY_SIZE*sizeT*1.0E-9 << " GB)" << std::endl; } std::cout.precision(ss); @@ -174,6 +181,10 @@ void run() // Use the OpenMP implementation stream = new OMPStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); +#elif defined(POPLAR) + // Use the Graphcore Poplar implementation + stream = new PoplarStream(ARRAY_SIZE, deviceIndex, halfPrecision); + #endif stream->init_arrays(startA, startB, startC); @@ -249,14 +260,13 @@ void run() } - std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; size_t sizes[5] = { - 2 * sizeof(T) * ARRAY_SIZE, - 2 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE, - 3 * sizeof(T) * ARRAY_SIZE, - 2 * sizeof(T) * ARRAY_SIZE + 2 * sizeT * ARRAY_SIZE, + 2 * sizeT * ARRAY_SIZE, + 3 * sizeT * ARRAY_SIZE, + 3 * sizeT * ARRAY_SIZE, + 2 * sizeT * ARRAY_SIZE }; for (int i = 0; i < 5; i++) @@ -274,7 +284,7 @@ void run() << labels[i] << csv_separator << num_times << csv_separator << ARRAY_SIZE << csv_separator - << sizeof(T) << csv_separator + << sizeT << csv_separator << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator << *minmax.first << csv_separator << *minmax.second << csv_separator @@ -299,16 +309,21 @@ void run() } template -void run_triad() +void run_triad(const bool halfPrecision) { + size_t sizeT = (sizeof(T) == sizeof(float) && halfPrecision) ? 
sizeof(T)/2 : sizeof(T); if (!output_as_csv) { std::cout << "Running triad " << num_times << " times" << std::endl; std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; - if (sizeof(T) == sizeof(float)) - std::cout << "Precision: float" << std::endl; + if (sizeof(T) == sizeof(float)) { + if (halfPrecision) + std::cout << "Precision: half (if implemented, else float)" << std::endl; + else + std::cout << "Precision: float" << std::endl; + } else std::cout << "Precision: double" << std::endl; @@ -316,18 +331,18 @@ void run_triad() if (mibibytes) { std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB" - << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -10.0) << " KiB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeT*pow(2.0, -10.0) << " KiB" + << " (=" << ARRAY_SIZE*sizeT*pow(2.0, -20.0) << " MiB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeT*pow(2.0, -10.0) << " KiB" + << " (=" << 3.0*ARRAY_SIZE*sizeT*pow(2.0, -20.0) << " MiB)" << std::endl; } else { std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB" - << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-3 << " KB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB)" << std::endl; + << "Array size: " << ARRAY_SIZE*sizeT*1.0E-3 << " KB" + << " (=" << ARRAY_SIZE*sizeT*1.0E-6 << " MB)" << std::endl; + std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeT*1.0E-3 << " KB" + << " (=" << 3.0*ARRAY_SIZE*sizeT*1.0E-6 << " MB)" << std::endl; } std::cout.precision(ss); } @@ -371,6 +386,10 @@ void run_triad() // Use the OpenMP implementation stream = new OMPStream(ARRAY_SIZE, a.data(), b.data(), c.data(), deviceIndex); +#elif defined(POPLAR) + // Use the Graphcore Poplar implementation + stream = new PoplarStream(ARRAY_SIZE, deviceIndex, halfPrecision); + #endif stream->init_arrays(startA, startB, startC); @@ -394,7 +413,7 @@ void run_triad() check_solution(num_times, a, b, c, sum); // Display timing results - double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times; + double total_bytes = 3 * sizeT * ARRAY_SIZE * num_times; double bandwidth = ((mibibytes) ? 
pow(2.0, -30.0) : 1.0E-9) * (total_bytes / runtime);
 
   if (output_as_csv)
@@ -411,7 +430,7 @@
       << "Triad" << csv_separator
       << num_times << csv_separator
       << ARRAY_SIZE << csv_separator
-      << sizeof(T) << csv_separator
+      << sizeT << csv_separator
       << bandwidth << csv_separator
       << runtime << std::endl;
@@ -541,6 +560,10 @@ void parseArguments(int argc, char *argv[])
     {
       use_float = true;
     }
+    else if (!std::string("--half").compare(argv[i]))
+    {
+      use_half = true;
+    }
     else if (!std::string("--triad-only").compare(argv[i]))
     {
       triad_only = true;
@@ -565,6 +588,7 @@
       std::cout << "  -s --arraysize  SIZE     Use SIZE elements in the array" << std::endl;
       std::cout << "  -n --numtimes   NUM      Run the test NUM times (NUM >= 2)" << std::endl;
       std::cout << "      --float              Use floats (rather than doubles)" << std::endl;
+      std::cout << "      --half               Use half-precision (16-bit) floats on supported platforms" << std::endl;
       std::cout << "      --triad-only         Only run triad" << std::endl;
       std::cout << "      --csv                Output as csv table" << std::endl;
       std::cout << "      --mibibytes          Use MiB=2^20 for bandwidth calculation (default MB=10^6)" << std::endl;
diff --git a/package-up-report.sh b/package-up-report.sh
new file mode 100755
index 00000000..9bccf8e1
--- /dev/null
+++ b/package-up-report.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+tar czvf analysis-report.tar.gz graph.json execution.json vars.capnp
diff --git a/results/v3.4/graphcore-c2-ipu/ipu.txt b/results/v3.4/graphcore-c2-ipu/ipu.txt
new file mode 100644
index 00000000..03d8ac30
--- /dev/null
+++ b/results/v3.4/graphcore-c2-ipu/ipu.txt
@@ -0,0 +1,18 @@
+BabelStream
+Version: 3.4
+Implementation: Poplar
+Running kernels 100 times
+Precision: float
+Array size: 64.7 MB (=0.1 GB)
+Total size: 194.2 MB (=0.2 GB)
+Attached to IPU 12
+Using IPU with 1216 tiles, each with 6 workers and 256KB of memory per tile, and clock frequency 1600 MHz
+Validation failed on sum. Error 7.62939e-06
+Sum was 19.1933250427246 but should be 19.1933326721191
+Function    MBytes/sec   Min (sec)   Max         Average
+Copy        4193140.970  0.00003     0.00003     0.00003
+Mul         3586405.562  0.00004     0.00004     0.00004
+Add         4192914.661  0.00005     0.00005     0.00005
+Triad       3772580.708  0.00005     0.00005     0.00005
+Dot         1301842.970  0.00010     0.00010     0.00010
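
The "Validation failed on sum" line in ipu.txt is worth a note: main.cpp appears to check the dot-product sum against a fixed absolute tolerance (around 1e-8 in the BabelStream versions we have seen), which a single-precision reduction over 16.2 million elements cannot meet, even with the per-worker/per-tile hierarchy used by the kernels, so the 7.6e-06 discrepancy looks like ordinary float rounding rather than a kernel defect. The sketch below is an editor-added toy demonstration, not code from the PR: it accumulates a dot product of that length naively in float (which loses far more than the device's hierarchical reduction) against a double reference, and the gap lands well above 1e-8; the array contents are arbitrary stand-ins, not BabelStream's evolved values.
```
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  const std::size_t n = 16185000;          // --arraysize used for the IPU run above
  std::vector<float> a(n, 0.1f), b(n, 0.2f);

  float fsum = 0.0f;                       // single-precision accumulation
  double dsum = 0.0;                       // double-precision reference
  for (std::size_t i = 0; i < n; ++i)
  {
    fsum += a[i] * b[i];
    dsum += double(a[i]) * double(b[i]);
  }
  std::printf("float sum %.9g, double sum %.9g, |diff| %.3g\n",
              fsum, dsum, std::fabs(double(fsum) - dsum));
  return 0;
}
```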